Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 20 Jun 2011 18:08:34 +0000 (UTC)
From:      John Baldwin <jhb@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org
Subject:   svn commit: r223343 - stable/8/sys/netinet
Message-ID:  <201106201808.p5KI8Yej060347@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jhb
Date: Mon Jun 20 18:08:34 2011
New Revision: 223343
URL: http://svn.freebsd.org/changeset/base/223343

Log:
  MFC 221346,223049:
  Handle a rare edge case with nearly full TCP receive buffers.  If a TCP
  buffer fills up causing the remote sender to enter into persist mode, but
  there is still room available in the receive buffer when a window probe
  arrives (either due to window scaling, or due to the local application
  very slowing draining data from the receive buffer), then the single byte
  of data in the window probe is accepted.  However, this can cause rcv_nxt
  to be greater than rcv_adv.  This condition will only last until the next
  ACK packet is pushed out via tcp_output(), and since the previous ACK
  advertised a zero window, the ACK should be pushed out while the TCP
  pcb is write-locked.  To guarantee this, advance the advertised window
  (rcv_adv) even if we advertise a zero window.
  
  During the window while rcv_nxt is greather than rcv_adv, a few places
  would compute the remaining receive window via rcv_adv - rcv_nxt.
  However, this value was then (uint32_t)-1.  On a 64 bit machine this
  could expand to a positive 2^32 - 1 when cast to a long.  In particular,
  when calculating the receive window in tcp_output(), the result would be
  that the receive window was computed as 2^32 - 1 resulting in advertising
  a far larger window to the remote peer than actually existed.
  
  Fix various places that compute the remaining receive window to either
  assert that it is not negative (i.e. rcv_nxt <= rcv_adv), or treat the
  window as full if rcv_nxt is greather than rcv_adv.

Modified:
  stable/8/sys/netinet/tcp_input.c
  stable/8/sys/netinet/tcp_output.c
  stable/8/sys/netinet/tcp_timewait.c
Directory Properties:
  stable/8/sys/   (props changed)
  stable/8/sys/amd64/include/xen/   (props changed)
  stable/8/sys/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/contrib/dev/acpica/   (props changed)
  stable/8/sys/contrib/pf/   (props changed)

Modified: stable/8/sys/netinet/tcp_input.c
==============================================================================
--- stable/8/sys/netinet/tcp_input.c	Mon Jun 20 16:48:00 2011	(r223342)
+++ stable/8/sys/netinet/tcp_input.c	Mon Jun 20 18:08:34 2011	(r223343)
@@ -1792,6 +1792,9 @@ tcp_do_segment(struct mbuf *m, struct tc
 	win = sbspace(&so->so_rcv);
 	if (win < 0)
 		win = 0;
+	KASSERT(SEQ_GEQ(tp->rcv_adv, tp->rcv_nxt),
+	    ("tcp_input negative window: tp %p rcv_nxt %u rcv_adv %u", tp,
+	    tp->rcv_adv, tp->rcv_nxt));
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
 	/* Reset receive buffer auto scaling when not in bulk receive mode. */
@@ -2830,7 +2833,10 @@ dodata:							/* XXX */
 		 * buffer size.
 		 * XXX: Unused.
 		 */
-		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
+		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
+			len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
+		else
+			len = so->so_rcv.sb_hiwat;
 #endif
 	} else {
 		m_freem(m);

Modified: stable/8/sys/netinet/tcp_output.c
==============================================================================
--- stable/8/sys/netinet/tcp_output.c	Mon Jun 20 16:48:00 2011	(r223342)
+++ stable/8/sys/netinet/tcp_output.c	Mon Jun 20 18:08:34 2011	(r223343)
@@ -574,15 +574,21 @@ after_sack_rexmit:
 		 * taking into account that we are limited by
 		 * TCP_MAXWIN << tp->rcv_scale.
 		 */
-		long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
-			(tp->rcv_adv - tp->rcv_nxt);
+		long adv;
+		int oldwin;
+
+		adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
+		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
+			oldwin = (tp->rcv_adv - tp->rcv_nxt);
+			adv -= oldwin;
+		} else
+			oldwin = 0;
 
 		/* 
 		 * If the new window size ends up being the same as the old
 		 * size when it is scaled, then don't force a window update.
 		 */
-		if ((tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale ==
-		    (adv + tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale)
+		if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
 			goto dontupdate;
 		if (adv >= (long) (2 * tp->t_maxseg))
 			goto send;
@@ -996,7 +1002,8 @@ send:
 	if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
 	    recwin < (long)tp->t_maxseg)
 		recwin = 0;
-	if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
+	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
+	    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
 		recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
 	if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
 		recwin = (long)TCP_MAXWIN << tp->rcv_scale;
@@ -1304,7 +1311,7 @@ out:
 	 * then remember the size of the advertised window.
 	 * Any pending ACK has now been sent.
 	 */
-	if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
+	if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 	tp->last_ack_sent = tp->rcv_nxt;
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);

Modified: stable/8/sys/netinet/tcp_timewait.c
==============================================================================
--- stable/8/sys/netinet/tcp_timewait.c	Mon Jun 20 16:48:00 2011	(r223342)
+++ stable/8/sys/netinet/tcp_timewait.c	Mon Jun 20 18:08:34 2011	(r223343)
@@ -228,6 +228,9 @@ tcp_twstart(struct tcpcb *tp)
 	/*
 	 * Recover last window size sent.
 	 */
+	KASSERT(SEQ_GEQ(tp->rcv_adv, tp->rcv_nxt),
+	    ("tcp_twstart negative window: tp %p rcv_nxt %u rcv_adv %u", tp,
+	    tp->rcv_adv, tp->rcv_nxt));
 	tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;
 
 	/*



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201106201808.p5KI8Yej060347>