Date:      Fri, 25 Jan 2008 23:39:17 GMT
From:      Andre Oppermann <andre@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 134109 for review
Message-ID:  <200801252339.m0PNdHjs088175@repoman.freebsd.org>

http://perforce.freebsd.org/chv.cgi?CH=134109

Change 134109 by andre@andre_flirtbox on 2008/01/25 23:39:08

	Change the SACK tracking based on discussion on the IETF TCPM mailing
	list.  Track and report reassembly queue blocks (== SACK blocks) in
	the reverse order of their arrival.  RFC 2018 specifies this as a
	SHOULD item in Section 4.  In the discussion some of the original
	authors of RFC 2018 provided more of the rationale and information
	about real-world behavior that explains why fully implementing this
	'SHOULD' is beneficial.  The rationale and benefits still have to be
	condensed into the file's comments.
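	
	The heart of the change is a move-to-front update on a second
	linked list hanging off the reassembly blocks.  Below is a
	standalone sketch of the pattern, with names mirroring the patch;
	trq_touch() is a hypothetical helper, the kernel code open-codes
	the same three lines at every update site:
	
	    #include <sys/queue.h>
	
	    struct trq;
	    LIST_HEAD(trq_shead, trq);
	
	    struct trq {
	            LIST_ENTRY(trq) trq_s;  /* reverse arrival order */
	    };
	
	    /*
	     * Move an already-linked block to the front when it is
	     * updated; newly created blocks are inserted at the head
	     * directly.  The list head thus always points at the most
	     * recently created or updated block, and a front-to-back
	     * walk yields reverse arrival order.
	     */
	    static void
	    trq_touch(struct trq_shead *head, struct trq *tqe)
	    {
	            if (LIST_FIRST(head) != tqe) {
	                    LIST_REMOVE(tqe, trq_s);
	                    LIST_INSERT_HEAD(head, tqe, trq_s);
	            }
	    }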
	
	A pointer was also provided to a paper on implementing SACK
	efficiently, using Linux as the example: "An Experimental
	Investigation of TCP Performance in High Bandwidth-Delay Product
	Paths", retrieved from
	http://www.hamilton.ie/publications/baruch_even_thesis.pdf
	
	The receive side design of the reassembly queue is already close to
	the optimum in computational efficiency.  SACK tracking is a natural
	fit: it consists only of an additional list that links the blocks in
	reverse order of arrival, plus an appropriate function to read that
	list and populate the outbound SACK options.
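	
	In outline, the option writer just walks that list from the front
	until the option space is exhausted (condensed from the tcp_reass.c
	hunk below; tqe, optp, sack_seq and the counters are as in the
	patch):
	
	    LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
	            if (numsacks < 1)
	                    break;
	            /* Left and right edge of the block, network byte order. */
	            sack_seq = htonl(tqe->trq_seq);
	            bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
	            optp += sizeof(sack_seq);
	            sack_seq = htonl(tqe->trq_seq + tqe->trq_len);
	            bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
	            optp += sizeof(sack_seq);
	            numsacks--;
	            nsacks++;
	    }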
	
	The new SACK code tracks as many blocks as segments fit into the
	reassembly queue, instead of the previous fixed limit of 6 blocks.
	Struct tcpcb shrinks by 44 bytes; struct trq grows by two pointers.
	
	Add many KASSERTs and INVARIANTS-only checks to the SACK-related
	parts.
	
	When the reassembly queue enable sysctl is disabled, bypass
	reassembly only if the queue is also empty; an already-populated
	queue is still processed.
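	
	The guard at the top of the reassembly function now reads (from
	the tcp_reass.c hunk below; the empty-queue test presumably keeps
	blocks queued before the sysctl was switched off flowing to the
	application):
	
	    if (!tcp_reass_enabled && TAILQ_EMPTY(&tp->t_trq)) {
	            *tlenp = 0;     /* nothing consumed into the queue */
	            m_freem(m);     /* drop the out-of-order segment */
	            return (0);
	    }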
	
	The new reassembly and SACK code was tested and performed flawlessly
	at 2% inbound packet loss while downloading the FreeBSD 7.0-RC1 ISO
	images and fetching 183 ports from the archivers and biology
	categories (make fetch) from their original distribution servers
	around the world.
	
	Inspection of packet traces with Wireshark shows correct generation
	and ordering of the SACK options in outbound ACKs.
	
	TODO: [done] Testing of latest changes.
	TODO: [done for SACK part] Discussion on tcpm.
	TODO: [upcoming] "ipfw tcptruncate" option to test reassembly code with
	      wildly cut segments (think chainsaw massacre).
	TODO: Condense rationale from the IETF TCPM discussion into the file
	      comments.
	TODO: KTR tracing of reassembly queue behavior.
	TODO: Use m_collapse() to keep down mbuf usage of blocks.
	TODO: ddb function to examine reassembly queue.

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp.h#2 edit
.. //depot/projects/tcp_reass/netinet/tcp_reass.c#15 edit
.. //depot/projects/tcp_reass/netinet/tcp_subr.c#4 edit
.. //depot/projects/tcp_reass/netinet/tcp_var.h#6 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp.h#2 (text+ko) ====

@@ -96,7 +96,6 @@
 #define	   TCPOLEN_SIGNATURE		18
 
 /* Miscellaneous constants */
-#define	MAX_SACK_BLKS	6	/* Max # SACK blocks stored at receiver side */
 #define	TCP_MAX_SACK	4	/* MAX # SACKs sent in any segment */
 
 

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#15 (text+ko) ====

@@ -67,12 +67,20 @@
  *		m_next		m_next		m_next
  *
  *
+ * The reassembly queue's block structure is also used to track SACK
+ * information as a data receiver.  A doubly-linked list is added
+ * that links the blocks in the reverse order of their arrival or
+ * update.  This makes us fully compliant with RFC 2018, Section 4,
+ * including all optional parts marked as "SHOULD".
+ *
+ * TODO:
  * A further improvement is to merge the content of mbufs together if the
 * preceding one has enough space to hold the data of the new one.  When
  * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
  * them in place.  Only when trimming from the tail it actually frees them.
  * Normally we don't get mbuf chains so this isn't too much of a concern
- * right now.  TODO.
+ * right now.  Use m_collapse() to compact the mbuf chains within the
+ * blocks.
  */
 
 #include "opt_inet.h"
@@ -186,7 +194,7 @@
 	}
 
 	/* Check if it is really necessary to do all the work. */
-	if (!tcp_reass_enabled) {
+	if (!tcp_reass_enabled && TAILQ_EMPTY(&tp->t_trq)) {
 		*tlenp = 0;
 		m_freem(m);
 		return (0);
@@ -206,13 +214,20 @@
 	    ("%s: got missing segment but queue is empty", __func__));
 
 #ifdef INVARIANTS
+	i = 0;
 	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
 		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
 		    ("%s: trq_seq < rcv_nxt", __func__));
 		KASSERT(tqen == NULL ||
 		    SEQ_LT(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
 		    ("%s: overlapping blocks", __func__));
+		i++;
+	}
+	LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
+		i--;
 	}
+	KASSERT(i == 0, ("%s: SEQ# ordered tailq and arrival ordered "
+	    "list are not equally long", __func__));
 #endif
 
 	/*
@@ -270,7 +285,10 @@
 		tcp_reass_mcnt += mcnt;
 		tqe->trq_ml->m_next = m;
 		tqe->trq_ml = m_last(m);
-		tp->t_trq_last = tqe;
+		if (LIST_FIRST(&tp->t_trq_sack) != tqe) {
+			LIST_REMOVE(tqe, trq_s);
+			LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s);
+		}
 		/* TCP statistics. */
 		tcpstat.tcps_rcvoopack++;
 		tcpstat.tcps_rcvoobyte += *tlenp;
@@ -337,7 +355,11 @@
 			tcpstat.tcps_rcvduppack++;
 			tcpstat.tcps_rcvdupbyte += *tlenp;
 			tcpstat.tcps_reass_covered++;
-			tp->t_trq_last = tqe;
+			/* XXXAO: What should be SACK-reported for a duplicate? */
+			if (LIST_FIRST(&tp->t_trq_sack) != tqe) {
+				LIST_REMOVE(tqe, trq_s);
+				LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s);
+			}
 			m_freem(m);
 			*tlenp = 0;
 			return (0);
@@ -360,7 +382,10 @@
 			if (tqen != NULL &&
 			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
 				tcp_reass_merge(tp, tqe, tqen);
-			tp->t_trq_last = tqe;
+			if (LIST_FIRST(&tp->t_trq_sack) != tqe) {
+				LIST_REMOVE(tqe, trq_s);
+				LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s);
+			}
 			tcpstat.tcps_reass_replace++;
 			return (0);
 		}
@@ -389,7 +414,10 @@
 			n = m_last(m);
 			n->m_next = tqe->trq_m;
 			tqe->trq_m = m;
-			tp->t_trq_last = tqe;
+			if (LIST_FIRST(&tp->t_trq_sack) != tqe) {
+				LIST_REMOVE(tqe, trq_s);
+				LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s);
+			}
 			tcpstat.tcps_reass_prepend++;
 			return (0);
 		}
@@ -416,7 +444,10 @@
 			if (tqen != NULL &&
 			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
 				tcp_reass_merge(tp, tqe, tqen);
-			tp->t_trq_last = tqe;
+			if (LIST_FIRST(&tp->t_trq_sack) != tqe) {
+				LIST_REMOVE(tqe, trq_s);
+				LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s);
+			}
 			tcpstat.tcps_reass_append++;
 			return (0);
 		}
@@ -439,7 +470,6 @@
 			*tlenp = 0;
 			return (0);
 		}
-		tp->t_trq_last = tqe;
 		tcpstat.tcps_reass_blocks++;
 	}
 	tcp_reass_qsize++;
@@ -458,7 +488,7 @@
 		TAILQ_INSERT_BEFORE(tqe, tqen, trq_q);
 	else {
 		KASSERT(TAILQ_EMPTY(&tp->t_trq),
-		    ("%s: queue not empty", __func__));
+		    ("%s: inserting first element but queue not empty", __func__));
 		TAILQ_INSERT_HEAD(&tp->t_trq, tqen, trq_q);
 		/*
 		 * Flush the reassembly queue after four times the
@@ -467,6 +497,7 @@
 		 */
 		tcp_timer_activate(tp, TT_REASS, tp->t_rxtcur * 4);
 	}
+	LIST_INSERT_HEAD(&tp->t_trq_sack, tqen, trq_s);
 
 	/* Missing segment? */
 	if (tp->rcv_nxt != th->th_seq)
@@ -502,8 +533,7 @@
 		tp->t_trqmcnt -= tqe->trq_mcnt;
 		tcp_reass_mcnt -= tqe->trq_mcnt;
 		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
-		if (tp->t_trq_last == tqe)
-			tp->t_trq_last = NULL;
+		LIST_REMOVE(tqe, trq_s);
 		if (tqe != &tqes)
 			uma_zfree(tcp_reass_zone, tqe);
 		tcp_reass_qsize--;
@@ -554,6 +584,7 @@
 		tcp_reass_mcnt -= tqe->trq_mcnt;
 		m_freem(tqen->trq_m);
 		TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
+		LIST_REMOVE(tqen, trq_s);
 		uma_zfree(tcp_reass_zone, tqen);
 		tcp_reass_qsize--;
 		/* And the one after that. */
@@ -585,6 +616,7 @@
 	tqe->trq_ml->m_next = tqen->trq_m;
 	tqe->trq_ml = tqen->trq_ml;
 	TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
+	LIST_REMOVE(tqen, trq_s);
 	uma_zfree(tcp_reass_zone, tqen);
 	tcp_reass_qsize--;
 	tcpstat.tcps_reass_merge++;
@@ -602,28 +634,19 @@
 	int nsacks = 0;
 
 	KASSERT(numsacks > 0,
-	    ("%s: ", __func__));
+	    ("%s: zero sack blocks to add", __func__));
 	KASSERT(!TAILQ_EMPTY(&tp->t_trq),
-	    ("%s: ", __func__));
+	    ("%s: reassembly queue empty", __func__));
+	KASSERT(!LIST_EMPTY(&tp->t_trq_sack),
+	    ("%s: sack list empty", __func__));
 
-	/* The most recent block must appear first.  RFC2018, Section 4. */
-	if (tp->t_trq_last != NULL) {
-		sack_seq = htonl(tp->t_trq_last->trq_seq);
-		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-		optp += sizeof(sack_seq);
-		sack_seq = htonl(tp->t_trq_last->trq_seq + tp->t_trq_last->trq_len);
-		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-		optp += sizeof(sack_seq);
-		numsacks--;
-		nsacks++;
-	}
-
-	/* Add the other less recent blocks in ascending order. */
-	TAILQ_FOREACH(tqe, &tp->t_trq, trq_q) {
+	/*
+	 * The most recent block must appear first.  RFC2018, Section 4.
+	 * Add the other blocks in most-recently created or updated order.
+	 */
+	LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
 		if (numsacks < 1)
 			break;
-		if (tp->t_trq_last == tqe)
-			continue;
 		sack_seq = htonl(tqe->trq_seq);
 		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 		optp += sizeof(sack_seq);
@@ -653,6 +676,7 @@
 		    ("%s: t_trqmcnt incorrect", __func__));
 		tp->t_trqmcnt -= tqe->trq_mcnt;
 		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
+		LIST_REMOVE(tqe, trq_s);
 		uma_zfree(tcp_reass_zone, tqe);
 		tcp_reass_qsize--;
 	}

==== //depot/projects/tcp_reass/netinet/tcp_subr.c#4 (text+ko) ====

@@ -611,8 +611,9 @@
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 	if (tcp_do_sack)
 		tp->t_flags |= TF_SACK_PERMIT;
-	TAILQ_INIT(&tp->snd_holes);		/* Covered by M_ZERO. */
-	TAILQ_INIT(&tp->t_trq);			/* Covered by M_ZERO. */
+	TAILQ_INIT(&tp->t_trq);
+	LIST_INIT(&tp->t_trq_sack);
+	TAILQ_INIT(&tp->snd_holes);
 	tp->t_inpcb = inp;	/* XXX */
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no

==== //depot/projects/tcp_reass/netinet/tcp_var.h#6 (text+ko) ====

@@ -42,7 +42,8 @@
 
 /* TCP reassembly queue segment block entry. */
 struct trq {
-	TAILQ_ENTRY(trq) trq_q;
+	TAILQ_ENTRY(trq) trq_q;		/* linked list in SEQ# order */
+	LIST_ENTRY(trq)	trq_s;		/* linked list in SACK order */
 	tcp_seq		trq_seq;	/* start of block */
 	int		trq_len;	/* length of block */
 	int		trq_mcnt;	/* gross mbuf size of block */
@@ -52,6 +53,7 @@
 	struct mbuf	*trq_ml;	/* last mbuf in chain of data */
 };
 TAILQ_HEAD(trq_head, trq);
+LIST_HEAD(trq_shead, trq);
 extern	struct uma_zone	*tcp_reass_zone;
 
 struct sackblk {
@@ -96,7 +98,7 @@
  */
 struct tcpcb {
 	struct	trq_head t_trq;		/* segment reassembly queue */
-	struct	trq *t_trq_last;	/* last addition to reassembly queue */
+	struct	trq_shead t_trq_sack;	/* reass blocks, most recent first */
 	int	t_trqmcnt;		/* segment reassembly queue gross usage */
 	int	t_dupacks;		/* consecutive dup acks recd */
 


