Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 19 May 2008 13:37:11 GMT
From:      Andre Oppermann <andre@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 141858 for review
Message-ID:  <200805191337.m4JDbBYL069779@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=141858

Change 141858 by andre@andre_flirtbox on 2008/05/19 13:37:00

	Checkpoint WIP before I travel back home from BSDCan.
	
	- Remove global count of gross mbuf space in all reassembly
	  queues.  Access is unlocked and unreliable.  It should be
	  reported via netstat on a per connection basis.
	- Put some common code blocks into their own functions.
	- Add switch to select between space and time optimization.
	  Space optimization collapses the stored mbuf chains to
	  prevent mostly empty mbufs.  This may involve allocating
	  new mbufs and copying data around.  Time optimization
	  doesn't do that and just inserts the mbuf chains into
	  the queue without touching the content.  This is appropriate
	  for high speed transfers when almost all packets are MTU
	  sized (and jumbo on top of that).
	- Process FIN on a packet without any data.  We could get
	  stuck otherwise when bogus data beyond that FIN was received.
	- Adjust gross to net space ratio in relation to the socket buffer
	  from 5/4 to 11/8 to better reflect the gross-to-net ratio of
	  ethernet MTU to mbuf cluster size.

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_reass.c#22 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#22 (text+ko) ====

@@ -127,20 +127,44 @@
     &tcp_reass_qsize, 0,
     "Global number of TCP Segment Blocks currently in Reassembly Queue");
 
-static int tcp_reass_mcnt = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, mbufbytes, CTLFLAG_RD,
-    &tcp_reass_mcnt, 0,
-    "Global gross memory size of all mbufs currently in Reassembly Queue");
-
 static int tcp_reass_qtimo = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
     &tcp_reass_qtimo, 0,
     "Reassembly Queue Timeout in multiples of the Retransmission Timeout");
 
+static int tcp_reass_spacetime = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
+    &tcp_reass_spacetime, 0,
+    "Reassembly Queue strategy of space vs. time efficiency");
+
 static void	tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
 
 uma_zone_t	tcp_reass_zone;
 
+/* Trim empty mbufs from head of chain. */
+static struct mbuf *
+m_trimhead(struct mbuf *m) {
+	struct mbuf *n;
+
+	while (m->m_len == 0) {
+		n = m;
+		m = m->m_next;
+		m_free(n);
+	}
+	return (m);
+}
+
+static u_int
+m_storagesize(m) {
+	u_int mcnt;
+
+	for (mcnt = 0, m; n; m = m->m_next)
+		mcnt += (m->m_flags & M_EXT) ?
+		    m->m_ext.ext_size + MSIZE : MSIZE;
+	return (mcnt);
+}
+
+
 /*
  * Adjust TCP reassembly zone limits when the nmbclusters zone changes.
  */
@@ -217,15 +241,6 @@
 		return (0);
 	}
 
-	/* XXX: should not happen, but does for some reason. */
-	/* XXX: May get FIN on otherwise empty segment. */
-	if (*tlenp == 0) {
-		m_freem(m);
-		return (0);
-	}
-
-	KASSERT(*tlenp > 0,
-	    ("%s: segment doesn't contain any data", __func__));
 	KASSERT(SEQ_LEQ(tp->rcv_nxt, th_seq),
 	    ("%s: sequence number below rcv_nxt", __func__));
 	KASSERT(!(tp->rcv_nxt == th_seq) || !(TAILQ_EMPTY(&tp->t_trq)),
@@ -264,12 +279,12 @@
 	 *
 	 * Counting the gross mbuf space effectively sets the net data
 	 * limit lower than the socket buffer would allow.
-	 * It underestimates the effective free space in the socket buffer
-	 * vs. actual real data with 2k clusters and 1500 byte packets.
-	 * This shouldn't be too much of a problem though.
+	 * Don't underestimate the effective free space in the socket
+	 * buffer vs. actual real data with 2k clusters and 1500 byte
+	 * packets by introducing a correction factor of 11/8.
 	 */
 	if (th_seq != tp->rcv_nxt &&
-	    tp->t_trqmcnt > (sbspace(&so->so_rcv) / 4 * 5)) {
+	    tp->t_trqmcnt > (sbspace(&so->so_rcv) / 8 * 11)) {
 		tcpstat.tcps_reass_overflow++;
 		tcpstat.tcps_rcvmemdrop++;
 		m_freem(m);
@@ -280,19 +295,11 @@
 	/* Get rid of packet header and mtags. */
 	m_demote(m, 1);
 
-#if 0
 	/* Trim empty mbufs from head of chain. */
-	while (m->m_len == 0) {
-		n = m;
-		m = m->m_next;
-		m_free(n);
-	}
-#endif
+	m = m_trimhead(m);
 
 	/* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */
-	for (mcnt = 0, n = m; n; n = n->m_next)
-		mcnt += (n->m_flags & M_EXT) ?
-		    n->m_ext.ext_size + MSIZE : MSIZE;
+	mcnt = m_storagesize(m);
 
 	/*
 	 * FIN handling is a bit tricky.
@@ -311,6 +318,10 @@
 	if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) {
 		tcp_reass_qfree(tp);
 		tqe = NULL;
+		if (m->m_len == 0) {
+			tcp_timer_activate(tp, TT_REASS, 0);
+			return (thflags);
+		}
 		goto insert;
 	} else
 		thflags &= ~TH_FIN;
@@ -323,9 +334,14 @@
 		tqe->trq_len += *tlenp;
 		tqe->trq_mcnt += mcnt;
 		tp->t_trqmcnt += mcnt;
-		tcp_reass_mcnt += mcnt;
 		tqe->trq_ml->m_next = m;
 		tqe->trq_ml = m_last(m);
+		if (tcp_reass_spacetime) {
+			tqe->trq_m =  m_collapse(tqe->trq_m, M_DONTWAIT, 1024);
+			tp->t_trqmcnt -= tqe->trq_mcnt;
+			tqe->trq_mcnt = m_storagespace(tqe->trq_m);
+			tqe->trq_mcnt += tp->t_trqmcnt;
+		}
 		if (LIST_FIRST(&tp->t_trq_sack) != tqe) {
 			LIST_REMOVE(tqe, trq_s);
 			LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s);
@@ -367,7 +383,6 @@
 			tqe->trq_len += *tlenp;
 			tqe->trq_mcnt += mcnt;
 			tp->t_trqmcnt += mcnt;
-			tcp_reass_mcnt += mcnt;
 			tqe->trq_seq = th_seq;
 			n = m_last(m);
 			n->m_next = tqe->trq_m;
@@ -415,7 +430,6 @@
 			tqe->trq_len = *tlenp;
 			tqe->trq_mcnt = mcnt;
 			tp->t_trqmcnt += mcnt;
-			tcp_reass_mcnt += mcnt;
 			tqe->trq_seq = th_seq;
 			tqe->trq_m = m;
 			tqe->trq_ml = m_last(m);
@@ -445,14 +459,11 @@
 				tcpstat.tcps_rcvpartduppack++;
 				tcpstat.tcps_rcvpartdupbyte += i;
 				/* Update accounting. */
-				for (mcnt = 0, n = m; n; n = n->m_next)
-					mcnt += (n->m_flags & M_EXT) ?
-					    n->m_ext.ext_size + MSIZE : MSIZE;
+				mcnt = m_storagespace(m);
 			}
 			tqe->trq_len += *tlenp;
 			tqe->trq_mcnt += mcnt;
 			tp->t_trqmcnt += mcnt;
-			tcp_reass_mcnt += mcnt;
 			tqe->trq_seq = th_seq;
 			n = m_last(m);
 			n->m_next = tqe->trq_m;
@@ -480,7 +491,6 @@
 			tqe->trq_len += *tlenp;
 			tqe->trq_mcnt += mcnt;
 			tp->t_trqmcnt += mcnt;
-			tcp_reass_mcnt += mcnt;
 			tqe->trq_ml->m_next = m;
 			tqe->trq_ml = m_last(m);
 			/* Check if segment bridges two blocks to merge. */
@@ -520,7 +530,6 @@
 	tqen->trq_len = *tlenp;
 	tqen->trq_mcnt = mcnt;
 	tp->t_trqmcnt += mcnt;
-	tcp_reass_mcnt += mcnt;
 	tqen->trq_m = m;
 	tqen->trq_ml = m_last(m);
 
@@ -574,7 +583,6 @@
 			sbappendstream_locked(&so->so_rcv, tqe->trq_m);
 		tp->rcv_nxt += tqe->trq_len;
 		tp->t_trqmcnt -= tqe->trq_mcnt;
-		tcp_reass_mcnt -= tqe->trq_mcnt;
 		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
 		LIST_REMOVE(tqe, trq_s);
 		if (tqe != &tqes)
@@ -621,7 +629,6 @@
 		tcpstat.tcps_rcvpartdupbyte += tqen->trq_len;
 		tcpstat.tcps_reass_covered++;
 		tp->t_trqmcnt -= tqe->trq_mcnt;
-		tcp_reass_mcnt -= tqe->trq_mcnt;
 		m_freem(tqen->trq_m);
 		TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
 		LIST_REMOVE(tqen, trq_s);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200805191337.m4JDbBYL069779>