Date:      Tue, 29 Jun 2010 08:26:52 GMT
From:      Andre Oppermann <andre@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 180313 for review
Message-ID:  <201006290826.o5T8Qq7S071410@repoman.freebsd.org>

http://p4web.freebsd.org/@@180313?ac=10

Change 180313 by andre@andre_t61 on 2010/06/29 08:26:05

	Import new reassembly queue implementation from //depot/projects/tcp_reass/netinet/tcp_reass.c #56/56

Affected files ...

.. //depot/projects/tcp_new/netinet/tcp_reass.c#3 edit

Differences ...

==== //depot/projects/tcp_new/netinet/tcp_reass.c#3 (text+ko) ====

@@ -1,6 +1,6 @@
 /*-
- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
- *	The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2007-2009
+ *	Andre Oppermann, Internet Business Solutions AG.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -27,14 +27,85 @@
  * SUCH DAMAGE.
  *
  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
+ * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.353 2007/10/07 20:44:24 silby Exp $");
+__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.364 2009/08/01 19:26:27 rwatson Exp $");
+
+/*
+ * Operational overview of TCP reassembly:
+ *
+ * It is the purpose of tcp reassembly to store segments that are received
+ * out of order.  This happens when packets are lost along the way due to
+ * various reasons.  The most common one is traffic overload which causes
+ * routers to drop packets for brief moments.
+ *
+ * Upon arrival of the missing segment the whole chain of stored contiguous
+ * segments is moved into the socket buffer at once.  In case of multiple
+ * missing segments the remainder kept in the queue until the next missing
+ * segment arrives.
+ *
+ * When the TCP receiver is in reassembly state *all* arriving segments are
+ * passed through the reassembly queue until all missing segments have been
+ * received and all data is dequeued.
+ *
+ * Implementation details and choices:
+ *
+ * Instead of storing all segments on their own and tracking each with
+ * a queue structure we build blocks of contiguous segments merged
+ * together.  This way one missing segment needs only one queue structure
+ * tracking the whole block of following contiguous segments.  If a new
+ * segment matches the end of one block and the start of the next block
+ * the two are joined together.  If no match is found a new block is created.
+ *
+ * Instead of a classical list or tailqueue a red-black tree is used to
+ * cope with arbitrary complexity of the blocks and missing segments.  A
+ * queue has O(n) worst case behavior whereas the red-black tree is
+ * O(log n).  This prevents complexity attacks where a long chain of
+ * blocks would have to be traversed to find the right place for the new
+ * segment.  Especially with high bandwidth*delay product links and large
+ * socket buffers this is a valid concern.
+ *
+ * For merging segments into a block queue structure the operator can
+ * choose between time and space efficiency.  For time efficiency only the
+ * head or tail mbuf chain pointers are updated with the new segment's mbuf.
+ * This results in a minimal amount of data accesses and no memory allocations
+ * unless a new block is created.  For space efficiency mbufs and mbuf chains
+ * are compacted and possibly merged together.  It may even allocate a new
+ * and larger mbuf(-chain) to store the segment data with less overhead and
+ * less wasted space.  This makes it immune to mbuf exhaustion attacks
+ * where only tiny amounts of data are received per segment, possibly only
+ * one byte.  Usually network drivers allocate a full 2 KByte mbuf cluster
+ * per received packet, no matter how small the packet actually is, for
+ * efficiency reasons and because they can't easily know at DMA time how
+ * large the packet effectively is.
+ *
+ * To prevent resource exhaustion attacks a local and global limit governs
+ * the number of reassembly blocks.  The local limit prevents single connections
+ * from monopolizing the global limit.  When used in space efficient mode
+ * the total memory consumption of the reassembly queue can't be more than
+ * the receive socket buffer size.  To prevent lost connections from holding
+ * on to queued data for too long, a timeout flushes it all.
+ *
+ * The reassembly queue block structure is also used to track SACK information
+ * as the data receiver.  A doubly-linked list tracks the blocks in LIFO
+ * order of their arrival or update.
+ *
+ * Implemented / relevant RFCs:
+ *  RFC793: Transmission Control Protocol
+ *  RFC1122: section 4.2.2.20 and section 4.2.2.21
+ *  RFC2018: SACK, section 4 including all optional parts marked as "SHOULD"
+ *  RFC2883: D-SACK, section 4
+ *
+ * TODO:
+ * - D-SACK when only one SACK slot available?
+ * - Direct pointer to highest seqnum block in RB-tree?
+ * - Remove T/TCP gonk.
+ * - Lots of testing.
+ */
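For reference, the tcp_reass_block structure the new code operates on is declared
outside this file and does not appear in this diff.  A minimal sketch, reconstructed
from how its fields are used below (field order and any additional members are
assumptions):

	struct tcp_reass_block {
		RB_ENTRY(tcp_reass_block)	trb_rb;		/* ranged red-black tree linkage */
		LIST_ENTRY(tcp_reass_block)	trb_sack;	/* LIFO list for SACK reporting */
		tcp_seq				trb_seqs;	/* start sequence number */
		tcp_seq				trb_seqe;	/* end sequence number (exclusive) */
		struct mbuf			*trb_m;		/* head of mbuf chain holding the data */
		struct mbuf			*trb_mt;	/* tail mbuf of that chain */
	};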
 
 #include "opt_inet.h"
-#include "opt_inet6.h"
-#include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -50,236 +121,668 @@
 
 #include <net/if.h>
 #include <net/route.h>
+#include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
-#include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
-#include <netinet/ip6.h>
-#include <netinet6/in6_pcb.h>
-#include <netinet6/ip6_var.h>
-#include <netinet6/nd6.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
-#include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
-#ifdef TCPDEBUG
-#include <netinet/tcp_debug.h>
-#endif /* TCPDEBUG */
+
+uma_zone_t	tcp_reass_zone;
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
     "TCP Segment Reassembly Queue");
 
-static int tcp_reass_maxseg = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
-    &tcp_reass_maxseg, 0,
-    "Global maximum number of TCP Segments in Reassembly Queue");
+static int tcp_reass_enable = 1;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_RW,
+    &tcp_reass_enable, 0,
+    "Enable/disable use of TCP reassembly queue");
+
+static int tcp_reass_maxblocks = 32;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RW,
+    &tcp_reass_maxblocks, 0,
+    "Per connection limit of TCP segment blocks in reassembly queue");
+
+static int tcp_reass_globalmaxblocks = 65535;
+SYSCTL_PROC(_net_inet_tcp_reass, OID_AUTO, globalmaxblocks,
+    CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_TUN, &tcp_reass_zone, 0,
+    sysctl_zonelimit, "I",
+    "Global limit of TCP segment blocks in reassembly queue");
+
+static int tcp_reass_timeout = 0;
+SYSCTL_PROC(_net_inet_tcp_reass, OID_AUTO, timeout, CTLTYPE_INT|CTLFLAG_RW,
+    &tcp_reass_timeout, 0, sysctl_msec_to_ticks, "I",
+    "Reassembly queue flush timeout in milliseconds");
+
+static int tcp_reass_spacetime = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
+    &tcp_reass_spacetime, 0,
+    "Reassembly queue strategy of space vs. time efficiency");
+
+static struct tcp_reass_block *
+    tcp_reass_merge(struct tcp_reass_block *, struct tcp_reass_block *);
+
+/*
+ * Trim empty mbufs from the head of an mbuf chain.
+ */
+static struct mbuf *
+m_trimhead(struct mbuf *m)
+{
+
+	while (m != NULL && m->m_len == 0)
+		m = m_free(m);
+
+	return (m);
+}
+
+/*
+ * Initialize TCP reassembly zone on startup.
+ */
+void
+tcp_reass_init(void)
+{
+
+	TUNABLE_INT_FETCH("net.inet.tcp.reass.globalmaxblocks",
+	    &tcp_reass_globalmaxblocks);
+	tcp_reass_zone = uma_zcreate("tcpreass", sizeof(struct tcp_reass_block),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	uma_zone_set_max(tcp_reass_zone, tcp_reass_globalmaxblocks);
+	tcp_reass_timeout = 2 * TCPTV_MSL;
+}
+
+/*
+ * Compare function implementing the ranged lookup on the RB tree.
+ * NB: The tree must never have any overlapping elements.
+ */
+static __inline int
+tcp_reass_cmp(struct tcp_reass_block *a, struct tcp_reass_block *b)
+{
+
+	if (SEQ_LT(a->trb_seqe, b->trb_seqs))
+		return (-1);
+	else if (SEQ_GT(a->trb_seqs, b->trb_seqe))
+		return (1);
+	else
+		return (0);
+}
+
+RB_PROTOTYPE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
+RB_GENERATE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
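The comparison above treats any two overlapping (or abutting) blocks as equal, which
is what turns RB_FIND() into a ranged lookup: probing with a throwaway search block
returns whichever queued block the new segment touches.  A hypothetical helper, not
part of the change, illustrating the intended use:

	static struct tcp_reass_block *
	tcp_reass_find_overlap(struct tcpcb *tp, tcp_seq s, tcp_seq e)
	{
		struct tcp_reass_block trbs;

		/*
		 * With blocks [10,20) and [30,40) queued, a probe for
		 * [18,25) compares equal to [10,20) and returns it,
		 * while a probe for [22,28) finds nothing.
		 */
		trbs.trb_seqs = s;
		trbs.trb_seqe = e;
		return (RB_FIND(tcp_ra, &tp->rcv_reass, &trbs));
	}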
+
+/*
+ * Verify the integrity of the reassembly queue.
+ */
+#ifdef INVARIANTS
+static int
+tcp_reass_verify(struct tcpcb *tp, int prepost)
+{
+	int i = 0, size = 0, total = 0;
+	struct mbuf *m;
+	struct tcp_reass_block *trb, *trbn;
 
-int tcp_reass_qsize = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
-    &tcp_reass_qsize, 0,
-    "Global number of TCP Segments currently in Reassembly Queue");
+	RB_FOREACH_SAFE(trb, tcp_ra, &tp->rcv_reass, trbn) {
+		KASSERT(SEQ_LT(trb->trb_seqs, trb->trb_seqe),
+		    ("%s: trb_seqs >= trb_seqe", __func__));
+		KASSERT(SEQ_GT(trb->trb_seqs, tp->rcv_nxt) ||
+		    prepost, ("%s: rcv_nxt >= trb_seqs", __func__));
+		KASSERT(trb->trb_m != NULL,
+		    ("%s: trb_m == NULL", __func__));
+		KASSERT(trb->trb_mt != NULL,
+		    ("%s: trb_mt == NULL", __func__));
+		size = SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+		KASSERT(size == m_length(trb->trb_m, &m),
+		    ("%s: seq# size != actual mbuf size", __func__));
+		KASSERT(trb->trb_mt == m,
+		    ("%s: trb_mt is not last mbuf", __func__));
+		KASSERT(trbn == NULL || SEQ_LT(trb->trb_seqe, trbn->trb_seqs),
+		    ("%s: overlaps into next block", __func__));
+		total += size;
+		i++;
+	}
+	KASSERT(tp->rcv_reass_size == total,
+	    ("%s: rcv_reass_size not correct", __func__));
 
-static int tcp_reass_maxqlen = 48;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW,
-    &tcp_reass_maxqlen, 0,
-    "Maximum number of TCP Segments per individual Reassembly Queue");
+	LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) {
+		i--;
+	}
+	KASSERT(i == 0,
+	    ("%s: sack list incorrect", __func__));
 
-static int tcp_reass_overflows = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
-    &tcp_reass_overflows, 0,
-    "Global number of TCP Segment Reassembly Queue Overflows");
+	return (1);
+}
+#endif /* INVARIANTS */
 
-/* Initialize TCP reassembly queue */
+/*
+ * Remove a single block from the reassembly queue
+ * and free all of its mbufs, if any.
+ */
 static void
-tcp_reass_zone_change(void *tag)
+tcp_reass_free(struct tcpcb *tp, struct tcp_reass_block *trb)
 {
 
-	tcp_reass_maxseg = nmbclusters / 16;
-	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg);
+	trb = RB_REMOVE(tcp_ra, &tp->rcv_reass, trb);
+	KASSERT(trb != NULL, ("%s: RB_REMOVE failed", __func__));
+	LIST_REMOVE(trb, trb_sack);
+	m_freem(trb->trb_m);
+	tp->rcv_reass_size -= SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+	tp->rcv_reass_blocks--;
+	uma_zfree(tcp_reass_zone, trb);
 }
 
-uma_zone_t	tcp_reass_zone;
+/*
+ * Remove and free all blocks from the reassembly queue.
+ */
+void
+tcp_reass_flush(struct tcpcb *tp)
+{
+	struct tcp_reass_block *trb, *trbn;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	KASSERT(tcp_reass_verify(tp, 0),
+	    ("%s: reassembly queue inconsistent", __func__));
+
+	RB_FOREACH_SAFE(trb, tcp_ra, &tp->rcv_reass, trbn) {
+		tcp_reass_free(tp, trb);
+	}
+
+	KASSERT(tp->rcv_reass_size == 0,
+	    ("%s: rcv_reass_size not zero after flushing", __func__));
+}
 
-void
-tcp_reass_init(void)
+/*
+ * Move block to front of SACK list to report SACK blocks in LIFO order.
+ *  RFC2018: section 4
+ */
+static __inline void
+tcp_reass_sacktrack(struct tcpcb *tp, struct tcp_reass_block *trb)
 {
 
-	tcp_reass_maxseg = nmbclusters / 16;
-	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
-	    &tcp_reass_maxseg);
-	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg);
-	EVENTHANDLER_REGISTER(nmbclusters_change,
-	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+	if (LIST_FIRST(&tp->rcv_reass_sack) != trb) {
+		LIST_REMOVE(trb, trb_sack);
+		LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack);
+	}
 }
 
+/*
+ * Integrate the new segment into the reassembly queue.  When the segment
+ * matches RCV.NXT, append it to the socket buffer including all eligible
+ * data from the reassembly queue.
+ *
+ * NB: We must always consume the mbuf, either by appending it to
+ * the queue or by freeing it.
+ */
 int
-tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
+tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int len, int hlen)
 {
-	struct tseg_qent *q;
-	struct tseg_qent *p = NULL;
-	struct tseg_qent *nq;
-	struct tseg_qent *te = NULL;
+	int thflags = 0;
+	tcp_seq th_seq;
 	struct socket *so = tp->t_inpcb->inp_socket;
-	int flags;
+	struct tcp_reass_block *trb = NULL, *trbn;
+	struct tcp_reass_block trbs;
 
-	INP_LOCK_ASSERT(tp->t_inpcb);
+	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
-	 * XXX: tcp_reass() is rather inefficient with its data structures
-	 * and should be rewritten (see NetBSD for optimizations).  While
-	 * doing that it should move to its own file tcp_reass.c.
+	 * Check if it is really necessary to do all the work.
 	 */
+	if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass))
+		goto done;
 
 	/*
-	 * Call with th==NULL after become established to
+	 * Call with th==NULL after becoming established to
 	 * force pre-ESTABLISHED data up to user socket.
+	 * XXX: Was used for T/TCP of which code remains.
 	 */
-	if (th == NULL)
+	if (th == NULL) {
+		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
+		    RB_EMPTY(&tp->rcv_reass) ||
+		    ((trb = RB_MIN(tcp_ra, &tp->rcv_reass)) &&
+		     trb->trb_seqs != tp->rcv_nxt))
+			return (0);
+		trb = RB_MIN(tcp_ra, &tp->rcv_reass);
 		goto present;
+	}
+
+	KASSERT(th != NULL, ("%s: th is NULL", __func__));
+	KASSERT(m != NULL, ("%s: m is NULL", __func__));
+	KASSERT(len + hlen == m_length(m, NULL),
+	    ("%s: len + hlen != mbuf length", __func__));
+	KASSERT(hlen <= m_length(m, NULL),
+	    ("%s: hlen > mbuf length", __func__));
+
+	/*
+	 * Store TCP header information in local variables as
+	 * we may lose access to it after header dropping and
+	 * mbuf compacting.
+	 */
+	thflags = th->th_flags;
+	th_seq = th->th_seq;
+	th = NULL;		/* Prevent further use. */
+
+	KASSERT(SEQ_GEQ(th_seq, tp->rcv_nxt),
+	    ("%s: sequence number below rcv_nxt", __func__));
+	KASSERT(!(tp->rcv_nxt == th_seq) || !(RB_EMPTY(&tp->rcv_reass)),
+	    ("%s: got missing segment but queue is empty", __func__));
+	KASSERT(tcp_reass_verify(tp, 0),
+	    ("%s: reassembly queue already inconsistent", __func__));
 
 	/*
 	 * Limit the number of segments in the reassembly queue to prevent
 	 * holding on to too many segments (and thus running out of mbufs).
 	 * Make sure to let the missing segment through which caused this
-	 * queue.  Always keep one global queue entry spare to be able to
-	 * process the missing segment.
+	 * queue.
+	 *
+	 * Count the gross space used by the mbufs in the reassembly queue
+	 * and limit it to the free space in the socket buffer.  This way
+	 * the reassembly queue can never consume more mbuf space than was
+	 * allocated for the socket buffer anyway, and it reflects the actual
+	 * amount of kernel memory in use.  This effectively prevents mbuf
+	 * exhaustion due to pathological traffic (one byte segments with
+	 * a hole each time) on a single connection.
+	 *
+	 * Counting the gross mbuf space effectively sets the net data
+	 * limit lower than the socket buffer would allow.
+	 * To avoid underestimating the effective free space in the socket
+	 * buffer vs. actual real data (2k clusters typically carrying only
+	 * 1500 byte packets) a correction factor of 11/8 is applied.
 	 */
-	if (th->th_seq != tp->rcv_nxt &&
-	    (tcp_reass_qsize + 1 >= tcp_reass_maxseg ||
-	     tp->t_segqlen >= tcp_reass_maxqlen)) {
-		tcp_reass_overflows++;
-		tcpstat.tcps_rcvmemdrop++;
-		m_freem(m);
-		*tlenp = 0;
-		return (0);
+	if (th_seq != tp->rcv_nxt &&
+	    tp->rcv_reass_blocks > tcp_reass_maxblocks) {
+		//(sbspace(&so->so_rcv) / 8 * 11)
+		TCPSTAT_INC(tcps_reass_overflow);
+		TCPSTAT_INC(tcps_rcvmemdrop);
+		goto done;
 	}
 
 	/*
-	 * Allocate a new queue entry. If we can't, or hit the zone limit
-	 * just drop the pkt.
+	 * FIN handling is a bit tricky.
+	 * We cannot trust a FIN that goes into the reassembly queue.
+	 * It can be easily spoofed as it may be anywhere in the receive
+	 * window (see RST attack mitigation in tcp-secure).
+	 * For this reason (and complexity avoidance) we generally ignore
+	 * any FIN arriving at the reassembly queue, with one exception:
+	 * when it exactly matches rcv_nxt, together with any data in the
+	 * same segment, we can conclude it is genuine and proceed with
+	 * flushing any other data waiting in the reassembly queue.
+	 * A FIN is part of the sequence space and will get retransmitted
+	 * if it was genuine.
+	 * This approach is based on a discussion on TCPM mailing list.
 	 */
-	te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
-	if (te == NULL) {
-		tcpstat.tcps_rcvmemdrop++;
-		m_freem(m);
-		*tlenp = 0;
-		return (0);
+	if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) {
+		tcp_reass_flush(tp);
+		if (m->m_len == 0) {
+			tcp_timer_activate(tp, TT_REASS, 0);
+			return (thflags);
+		}
+	} else if (len == 0)
+		goto done;
+	else
+		thflags &= ~TH_FIN;
+
+	/* Statistics. */
+	if (tp->rcv_nxt != th_seq) {
+		TCPSTAT_INC(tcps_rcvoopack);
+		TCPSTAT_ADD(tcps_rcvoobyte, len);
 	}
-	tp->t_segqlen++;
-	tcp_reass_qsize++;
 
 	/*
-	 * Find a segment which begins after this one does.
+	 * Remove and free packet header and mtags.
+	 * Trim empty mbufs from head of chain.
+	 * Compact the mbuf chain.
 	 */
-	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
-		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
-			break;
-		p = q;
-	}
+	m_demote(m, 1);
+	m_adj(m, hlen);
+	m = m_trimhead(m);
+	if (tcp_reass_spacetime && m->m_next != NULL)
+		m = m_collapse(m, M_DONTWAIT, 1024);
+
+	KASSERT(m != NULL, ("%s: m is NULL after collapse", __func__));
+
+	/* Set up search structure. */
+	trbs.trb_seqs = th_seq;
+	trbs.trb_seqe = th_seq + len;
+	trbs.trb_m = m;
+	trbs.trb_mt = m_last(m);
 
 	/*
-	 * If there is a preceding segment, it may provide some of
-	 * our data already.  If so, drop the data from the incoming
-	 * segment.  If it provides all of our data, drop us.
+	 * Find a block that has at least partial overlap to either side.
+	 * If no block is found, either insert a new one or use stack-based
+	 * storage if the segment attaches directly at rcv_nxt.
+	 *  RFC793: section 3.9, page 69-76
+	 *  RFC2018: section 3
 	 */
-	if (p != NULL) {
-		int i;
-		/* conversion to int (in i) handles seq wraparound */
-		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
-		if (i > 0) {
-			if (i >= *tlenp) {
-				tcpstat.tcps_rcvduppack++;
-				tcpstat.tcps_rcvdupbyte += *tlenp;
-				m_freem(m);
-				uma_zfree(tcp_reass_zone, te);
-				tp->t_segqlen--;
-				tcp_reass_qsize--;
-				/*
-				 * Try to present any queued data
-				 * at the left window edge to the user.
-				 * This is needed after the 3-WHS
-				 * completes.
-				 */
-				goto present;	/* ??? */
+	if ((trb = RB_FIND(tcp_ra, &tp->rcv_reass, &trbs)) != NULL) {
+		/*
+		 * The new segment is a retransmit and within an already
+		 * received block and thus a full duplicate.
+		 *
+		 * Report it with D-SACK only if it is a subset of and
+		 * not equal to the already existing block.
+		 *  RFC2883: section 4, part (1) and (4)
+		 *  RFC2883: section 4.1, Reporting Full Duplicate Segments
+		 */
+		if (SEQ_GEQ(trbs.trb_seqs, trb->trb_seqs) &&
+		    SEQ_LEQ(trbs.trb_seqe, trb->trb_seqe)) {
+			tcp_reass_sacktrack(tp, trb);
+			if (SEQ_GT(trbs.trb_seqs, trb->trb_seqs) ||
+			    SEQ_LT(trbs.trb_seqe, trb->trb_seqe)) {
+				tp->rcv_reass_dsack.start = trbs.trb_seqs;
+				tp->rcv_reass_dsack.end = trbs.trb_seqe;
 			}
-			m_adj(m, i);
-			*tlenp -= i;
-			th->th_seq += i;
+			TCPSTAT_INC(tcps_rcvduppack);
+			TCPSTAT_ADD(tcps_rcvdupbyte, len);
+			goto done;
+		}
+
+		/*
+		 * Merge the new segment with the already existing block.
+		 */
+		tp->rcv_reass_size += SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe);
+		(void)tcp_reass_merge(trb, &trbs);
+		tcp_reass_sacktrack(tp, trb);
+
+		/*
+		 * Update the D-SACK information.
+		 *  RFC2883: section 4.2,  Reporting Partial Duplicate Segments
+		 */
+		if ((len = SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe)) > 0) {
+			tp->rcv_reass_size -= len;
+			tp->rcv_reass_dsack.start = trbs.trb_seqs;
+			tp->rcv_reass_dsack.end = trbs.trb_seqe;
+			TCPSTAT_INC(tcps_rcvpartduppack);
+			TCPSTAT_ADD(tcps_rcvpartdupbyte, len);
+		}
+
+		/*
+		 * Merge in previous/next block(s) if there is overlap.
+		 */
+		while ((trbn = RB_PREV(tcp_ra, &tp->rcv_reass, trb)) != NULL &&
+		    SEQ_LEQ(trb->trb_seqs, trbn->trb_seqe)) {
+			trbn = tcp_reass_merge(trb, trbn);
+			tcp_reass_free(tp, trbn);
+			TCPSTAT_INC(tcps_reass_merge);
+		}
+		while ((trbn = RB_NEXT(tcp_ra, &tp->rcv_reass, trb)) != NULL &&
+		    SEQ_GEQ(trb->trb_seqe, trbn->trb_seqs)) {
+			trbn = tcp_reass_merge(trb, trbn);
+			tcp_reass_free(tp, trbn);
+			TCPSTAT_INC(tcps_reass_merge);
+		}
+	} else if (tp->rcv_nxt == th_seq) {
+		/*
+		 * For segments attaching to RCV.NXT do not allocate
+		 * a new block structure to prevent failure under tight
+		 * memory conditions.  Instead use temporary stack based
+		 * storage.
+		 */
+		trb = &trbs;
+	} else if ((trb = (struct tcp_reass_block *)uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO))) != NULL) {
+		/* Insert new block as no eligible existing block for merging was found. */
+		trb->trb_seqs = trbs.trb_seqs;
+		trb->trb_seqe = trbs.trb_seqe;
+		trb->trb_m = trbs.trb_m;
+		trb->trb_mt = trbs.trb_mt;
+		trbn = RB_INSERT(tcp_ra, &tp->rcv_reass, trb);
+		KASSERT(trbn == NULL, ("%s: RB_INSERT failed", __func__));
+		LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack);
+		tp->rcv_reass_size += SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe);
+		tp->rcv_reass_blocks++;
+		if (tp->rcv_reass_blocks == 1) {
+			KASSERT(tcp_timer_active(tp, TT_REASS) == 0,
+			    ("%s: reassembly timer already active", __func__));
+			tcp_timer_activate(tp, TT_REASS, tcp_reass_timeout);
 		}
+		TCPSTAT_INC(tcps_reass_blocks);
+	} else {
+		/* Memory allocation failure. */
+		TCPSTAT_INC(tcps_rcvmemdrop);
+		goto done;
 	}
-	tcpstat.tcps_rcvoopack++;
-	tcpstat.tcps_rcvoobyte += *tlenp;
+	KASSERT(tcp_reass_verify(tp, 1),
+	    ("%s: reassembly queue went inconsistent", __func__));
+
+	/* Deliver data if we've got the missing segment. */
+	if (trb->trb_seqs == tp->rcv_nxt)
+		goto present;
+
+	return (0);
 
+present:
 	/*
-	 * While we overlap succeeding segments trim them or,
-	 * if they are completely covered, dequeue them.
+	 * Present data to user, advancing rcv_nxt through the
+	 * completed sequence space.
 	 */
-	while (q) {
-		int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
-		if (i <= 0)
-			break;
-		if (i < q->tqe_len) {
-			q->tqe_th->th_seq += i;
-			q->tqe_len -= i;
-			m_adj(q->tqe_m, i);
-			break;
-		}
+	KASSERT(trb != NULL,
+	    ("%s: queue empty at present", __func__));
+	KASSERT(trb->trb_seqs == tp->rcv_nxt,
+	    ("%s: first block does not match rcv_nxt", __func__));
+
+	TCPSTAT_INC(tcps_reass_missingseg);
 
-		nq = LIST_NEXT(q, tqe_q);
-		LIST_REMOVE(q, tqe_q);
-		m_freem(q->tqe_m);
-		uma_zfree(tcp_reass_zone, q);
-		tp->t_segqlen--;
-		tcp_reass_qsize--;
-		q = nq;
+	SOCKBUF_LOCK(&so->so_rcv);
+	/*
+	 * We can only ever dequeue one consecutive
+	 * block of data at most.
+	 */
+	if (!(so->so_rcv.sb_state & SBS_CANTRCVMORE)) {
+		sbappendstream_locked(&so->so_rcv, trb->trb_m);
+		tp->rcv_nxt += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+		trb->trb_m = NULL;
+		trb->trb_mt = NULL;
 	}
 
-	/* Insert the new segment queue entry into place. */
-	te->tqe_m = m;
-	te->tqe_th = th;
-	te->tqe_len = *tlenp;
+	if (trb == &trbs)
+		m_freem(trb->trb_m);	/* NB: trb_m may still be != NULL. */
+	else
+		tcp_reass_free(tp, trb);
+
+	/* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
+	sorwakeup_locked(so);
+
+	/*
+	 * Don't hold on to data in the reassembly queue for too long.
+	 * Kernel memory is limited and if the connection doesn't make
+	 * any progress in filling the holes we don't want to wait
+	 * forever to prevent memory exhaustion attacks.  The sender
+	 * can always retransmit again.
+	 *  RFC2018: section 8, Data Receiver Reneging
+	 *
+	 * Restart the reassembly queue flush timer after advancing
+	 * the sequence space and if the queue still isn't empty.
+	 */
+	if (tcp_reass_timeout && !RB_EMPTY(&tp->rcv_reass))
+		tcp_timer_activate(tp, TT_REASS, tcp_reass_timeout);
+	else if (tcp_timer_active(tp, TT_REASS))
+		tcp_timer_activate(tp, TT_REASS, 0);
+
+	ND6_HINT(tp);
+	return (thflags);
+
+done:
+	m_freem(m);
+	return (0);
+}
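Note the changed prototype: the old tcp_reass(tp, th, &tlen, m) becomes
tcp_reass(tp, th, m, len, hlen) and the function now strips the packet headers
itself.  The matching call site in tcp_do_segment() is not part of this diff;
assuming tlen and drop_hdrlen keep their usual meaning there, the adjustment would
presumably look like:

	/* Before: header length already handled by the caller. */
	thflags = tcp_reass(tp, th, &tlen, m);

	/* After: pass data and header length, tcp_reass() trims the header. */
	thflags = tcp_reass(tp, th, m, tlen, drop_hdrlen);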
+
+/*
+ * Trim a reassembly block.
+ * A positive value is from head, negative from tail.
+ */
+static void
+tcp_reass_trim(struct tcp_reass_block *trb, int i)
+{
+
+	m_adj(trb->trb_m, i);
 
-	if (p == NULL) {
-		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+	/*
+	 * Update the tail pointer or free empty mbufs
+	 * at the head of the chain.
+	 */
+	if (i < 0) {
+		trb->trb_mt = m_last(trb->trb_m);
+		trb->trb_seqe += i;	/* i is negative here */
 	} else {
-		LIST_INSERT_AFTER(p, te, tqe_q);
+		trb->trb_m = m_trimhead(trb->trb_m);
+		trb->trb_seqs += i;
+	}
+}
+
+/*
+ * Merge one or more consecutive blocks together.  The number of
+ * redundant bytes is reported as the difference between trbn->
+ * trb_seqs and trb_seqe.
+ *
+ * NB: trbn is always merged into trb!
+ */
+static struct tcp_reass_block *
+tcp_reass_merge(struct tcp_reass_block *trb, struct tcp_reass_block *trbn)
+{
+	int i;
+	tcp_seq s;
+
+	KASSERT(trb != NULL && trbn != NULL,
+	    ("%s: incomplete input", __func__));
+	KASSERT(SEQ_LT(trbn->trb_seqs, trb->trb_seqs) ||
+	    SEQ_GT(trbn->trb_seqe, trb->trb_seqe),
+	    ("%s: blocks not overlapping", __func__));
+
+	/*
+	 * Replace, prepend or append a block.
+	 */
+	if (SEQ_LEQ(trbn->trb_seqs, trb->trb_seqs) &&
+	    SEQ_GEQ(trbn->trb_seqe, trb->trb_seqe)) {
+
+		i = SEQ_DELTA(trb->trb_seqs, trbn->trb_seqe);
+		s = trb->trb_seqs;
+
+		m_freem(trb->trb_m);
+		trb->trb_seqs = trbn->trb_seqs;
+		trb->trb_seqe = trbn->trb_seqe;
+		trb->trb_m = trbn->trb_m;
+		trb->trb_mt = trbn->trb_mt;
+
+		TCPSTAT_INC(tcps_reass_replace);
+
+	} else if (SEQ_LT(trbn->trb_seqs, trb->trb_seqs)) {
+
+		if ((i = SEQ_DELTA(trb->trb_seqs, trbn->trb_seqe)) > 0)
+			tcp_reass_trim(trbn, -i);
+
+		s = trb->trb_seqs;
+		trb->trb_seqs = trbn->trb_seqs;
+		trbn->trb_mt->m_next = trb->trb_m;
+		trb->trb_m = trbn->trb_m;
+
+		if (tcp_reass_spacetime) {
+			trbn->trb_mt = m_collapse(trbn->trb_mt, M_DONTWAIT, 1024);
+			trb->trb_mt = m_last(trbn->trb_mt);
+		}
+
+	} else if (SEQ_GT(trbn->trb_seqe, trb->trb_seqe)) {
+
+		if ((i = SEQ_DELTA(trb->trb_seqe, trbn->trb_seqs)) > 0)
+			tcp_reass_trim(trbn, i);
+
+		s = trb->trb_seqe;
+		trb->trb_seqe = trbn->trb_seqe;
+		trb->trb_mt->m_next = trbn->trb_m;
+
+		if (tcp_reass_spacetime) {
+			trb->trb_mt = m_collapse(trb->trb_mt, M_DONTWAIT, 1024);
+			trb->trb_mt = m_last(trb->trb_mt);
+		} else
+			trb->trb_mt = trbn->trb_mt;
+
+	} else
+		return (NULL);
+
+	trbn->trb_seqs = s;
+	trbn->trb_seqe = trbn->trb_seqs + i;
+	trbn->trb_m = NULL;
+	trbn->trb_mt = NULL;
+	return (trbn);
+}
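SEQ_DELTA() is not defined in this diff.  From its use above, where both block
sizes and overlaps must come out positive, it is assumed here to be the positive
distance between two sequence numbers, roughly:

	#define	SEQ_DELTA(a, b)						\
		(SEQ_GT((b), (a)) ? (int)((b) - (a)) : (int)((a) - (b)))

Under that assumption the append case works out as follows: with trb covering
[100,200) and trbn covering [180,260), the 20 byte overlap is trimmed from trbn's
head, trbn's mbuf chain is linked after trb_mt, trb grows to [100,260), and trbn
is handed back carrying the 20 overlapping bytes for the caller's partial
duplicate accounting.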
+
+/*
+ * Put the sequence numbers of the reassembly queue blocks into the
+ * SACK option of an outgoing segment.  If a D-SACK block is available
+ * insert it in the first position followed by the regular SACK blocks.
+ *  RFC2018: section 3, Sack Option Format
+ *  RFC2018: section 4, Generating Sack Options: Data Receiver Behavior
+ *  RFC2883: section 4, Use of the SACK option for reporting a duplicate segment
+ */
+int
+tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks)
+{
+	int nsacks = 0;
+	tcp_seq sack_seq;
+	struct tcp_reass_block *trb, trbs;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	KASSERT(numsacks > 0,
+	    ("%s: zero sack blocks to add", __func__));
+	KASSERT(!LIST_EMPTY(&tp->rcv_reass_sack),
+	    ("%s: sack list empty", __func__));
+
+	/*
+	 * Create fake SACK block on the stack for D-SACK and prepend it.
+	 *  RFC2883: section 4, part (3)
+	 */
+	if (tp->rcv_reass_dsack.start != tp->rcv_reass_dsack.end) {
+		bzero(&trbs, sizeof(trbs));
+		trbs.trb_seqs = tp->rcv_reass_dsack.start;
+		trbs.trb_seqe = tp->rcv_reass_dsack.end;
+		LIST_INSERT_HEAD(&tp->rcv_reass_sack, &trbs, trb_sack);
+	}
+
+	/*
+	 * The most recent block must appear first.  Add the other
+	 * blocks in most recent created or updated order.
+	 *  RFC2018: section 3 and 4, part (4) and (5)
+	 */
+	LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) {
+		if (numsacks < 1)
+			break;
+		sack_seq = htonl(trb->trb_seqs);
+		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
+		optp += sizeof(sack_seq);
+		sack_seq = htonl(trb->trb_seqe);
+		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
+		optp += sizeof(sack_seq);
+		numsacks--;
+		nsacks++;
 	}
 
-present:
 	/*
-	 * Present data to user, advancing rcv_nxt through
-	 * completed sequence space.
+	 * Remove fake D-SACK block again and zero out the D-SACK
+	 * information.  It must be reported only once.
+	 *  RFC2883: section 4, part (2)
 	 */
-	if (!TCPS_HAVEESTABLISHED(tp->t_state))
-		return (0);
-	q = LIST_FIRST(&tp->t_segq);
-	if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
-		return (0);
-	SOCKBUF_LOCK(&so->so_rcv);
-	do {
-		tp->rcv_nxt += q->tqe_len;
-		flags = q->tqe_th->th_flags & TH_FIN;
-		nq = LIST_NEXT(q, tqe_q);
-		LIST_REMOVE(q, tqe_q);
-		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
-			m_freem(q->tqe_m);
-		else
-			sbappendstream_locked(&so->so_rcv, q->tqe_m);
-		uma_zfree(tcp_reass_zone, q);
-		tp->t_segqlen--;
-		tcp_reass_qsize--;
-		q = nq;
-	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
-	//ND6_HINT(tp);
-	sorwakeup_locked(so);
-	return (flags);
+	if (LIST_FIRST(&tp->rcv_reass_sack) == &trbs) {
+		LIST_REMOVE(&trbs, trb_sack);
+		tp->rcv_reass_dsack.start = 0;
+		tp->rcv_reass_dsack.end = 0;
+	}
+
+	return (nsacks);
+}
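Each block written above takes 2 * sizeof(tcp_seq) = 8 bytes, so a SACK option
carrying n blocks occupies 2 + 8 * n bytes of TCP option space.  A hypothetical
caller-side sketch, assuming a non-empty reassembly SACK list (the real consumer
would be the option builder in tcp_output(), which is not part of this diff):

	u_char opt[TCP_MAXOLEN];
	u_char *optp = opt;
	int nsacks;

	optp[0] = TCPOPT_SACK;				/* option kind */
	nsacks = tcp_reass_sack(tp, optp + 2, 4);	/* at most 4 blocks fit in 40 bytes */
	optp[1] = 2 + nsacks * 2 * sizeof(tcp_seq);	/* option length */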
+
+#ifdef DDB
+static void
+db_print_reassblocks(struct tcpcb *tp)
+{
+	struct tcp_reass_block *trb;
+
+	RB_FOREACH(trb, tcp_ra, &tp->rcv_reass) {
+		db_printf(" reass block 0x%08x - 0x%08x\n",
+		    trb->trb_seqs, trb->trb_seqe);
+	}
 }
+#endif


