Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 12 Jun 2008 09:11:02 GMT
From:      Andre Oppermann <andre@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 143359 for review
Message-ID:  <200806120911.m5C9B2Qr003113@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=143359

Change 143359 by andre@andre_flirtbox on 2008/06/12 09:10:27

	Checkpoint TCP Input work.
	
	Add more annotations, comments and refine checks based on a close
	reading of RFCs and comparison with other implementations (*BSD,
	Linux, OpenSolaris).
	
	The TCP data path in tcp_do_options() is almost feature complete.
	Still incomplete is the handling of simultaneous open and urgent
	data.
	
	The window update processing function has been rewritten and extensively
	annotaded.  There is considerable disagreement among the implementations
	on the correct algorithm for window updates.  An extensive analysis is
	made and an improved algorithm is implemented and proposed to the IETF
	TCPM mailing list.
	
	Next major item on the TODO list is the timing, RTT and RTTVAR measurements
	and RTO calculation.

Affected files ...

.. //depot/projects/tcp_new/netinet/tcp_input.c#6 edit

Differences ...

==== //depot/projects/tcp_new/netinet/tcp_input.c#6 (text+ko) ====

@@ -157,7 +157,8 @@
 		     struct tcpopt *to, int acked, int tlen);
 static void	 tcp_do_urg(struct tcpcb *tp, struct tcphdr *th, int tlen);
 static void	 tcp_do_wu(struct tcpcb *tp, struct tcphdr *th,
-		     struct tcpopt *to, int tiwin, int tlen);
+		     struct tcpopt *to, int tiwin, int acked, int tlen,
+		     int sacked);
 static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 static void	 tcp_pulloutofband(struct socket *,
@@ -875,8 +876,9 @@
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen)
 {
-	int thflags, acked, ourfinisacked, nudgeoutput = 0;
+	int thflags, acked, sacked;
 	int rstreason, todrop, rwin;
+	int ourfinisacked, nudgeoutput = 0;		/* XXXAO: bool type? */
 	tcp_win tiwin;
 	struct tcpopt to;
 	char *s = NULL;
@@ -925,8 +927,9 @@
 
 	/*
 	 * Calculate amount of space in our receive window.
-	 * Receive window is the amount of space in rcv queue,
-	 * but not less than last advertised window.
+	 * Receive window is the amount of space in receive queue,
+	 * but not less than last advertised window.  It is strongly
+	 * discouraged to shrink the window.
 	 *  RFC793: section 3.7, page 42-44
 	 *  RFC1122: section 4.2.2.16
 	 */
@@ -944,10 +947,10 @@
 	 * into established state and initializations of the timers.
 	 */
 	case TCPS_SYN_RECEIVED:
-		tp->t_starttime = ticks;
-		tp->t_state = TCPS_ESTABLISHED;
+		tp->t_starttime = tcp_uptime();
+		TCPS_TRANS(tp, TCPS_ESTABLISHED);
+		soisconnected(so);
 
-		soisconnected(so);
 		tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);	/* XXX: not here */
 
 		tcpstat.tcps_connects++;
@@ -980,7 +983,7 @@
 		 *  RFC793: section 3.9, page 67-68, fourth and fifth check
 		 *
 		 * NB: We have to remove SYN from thflags to
-		 * prevent the later bogus SYN check from
+		 * prevent the later SYN check from bogously
 		 * triggering.
 		 */
 		if (!(thflags & TH_SYN)) {
@@ -1055,7 +1058,7 @@
 			 */
 			tcplog("Window Scaling Option unexpected, "
 			    "connection aborted");
-			tp->t_error = ENETRESET;	/* XXX: Correct error? */
+			tp->t_error = ENETRESET;	/* XXXAO: Correct error? */
 			tp = tcp_close(tp);
 			rstreason = BANDLIM_UNLIMITED;
 			goto dropwithreset;
@@ -1126,15 +1129,27 @@
 		tp->snd_wu_ack = th->th_ack;
 		th->th_seq++;		/* SYN is acked */
 
-		tp->t_starttime = ticks;
-		tp->t_state = TCPS_ESTABLISHED;
+		tp->t_starttime = tcp_uptime();
+		TCPS_TRANS(tp, TCPS_ESTABLISHED);
 #ifdef MAC
 		SOCK_LOCK(so);
 		mac_set_socket_peer_from_mbuf(m, so);
 		SOCK_UNLOCK(so);
 #endif
 		soisconnected(so);
-		tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);	/* XXX */
+
+		/*
+		 * If we sent data with our initial SYN and
+		 * it wasn't ACK'ed, then resend immediately.
+		 * Otherwise go into idle mode and wait for
+		 * either side to start transmitting data.
+		 */
+		if (tp->snd_una != snd_nxt) {
+			tcp_timer_activate(tp, TT_RXMIT, );  /* XXXAO */
+			nudgeoutput = 1;
+		} else {
+			tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);	/* XXXAO */
+		}
 
 		tcpstat.tcps_connects++;
 		break;
@@ -1198,18 +1213,22 @@
 		 *
 		 * Don't accept remote ts older than already seen,
 		 * or reflected ts newer than what we send last.
+		 *  RFC1323bis: section 4.2.1 and 4.2.3
+		 *  RFC4953: section 3.1.3
 		 *
 		 * We store the receive time as uptime with second
 		 * resolution.  This makes us independent from the
 		 * wrap-around after 2^32 / hz (24.8 days at 1ms hz).
 		 *
-		 *  RFC1323bis: section 4.2.1 and 4.2.3
+		 * XXXAO: Linux says PAWS is broken.  Analyze if true or not.
+		 * Retransmitted segments are not presented for further processing.
+		 * We do send an ACK though.
+		 * Linux 2.6.25, net/ipv4/tcp_input.c, tcp_disordered_ack()
 		 */
 		if (to.to_flags & TOF_TS) {
-			struct bintime bt;
+			time_t uptime = tcp_uptime();
 
-			getbinuptime(&bt);
-			if (bt.sec - tp->t_rcvtime < ((tcp_ts)0x0 - 1) / hz) {
+			if (uptime - tp->t_rcvtime < ((tcp_ts)0x0 - 1) / (hz * 2)) {
 				if (TSTMP_LT(to.to_tsval, tp->snd_tsecr) {
 					tcplog("Timestamp too old, "
 					    "sending challenge ack");
@@ -1220,7 +1239,8 @@
 					    "sending challenge ack");
 					goto dropafterack;
 				}
-			}
+			} else
+				to->to_flags &= ~TOF_TS;
 		}
 
 		/*
@@ -1263,21 +1283,50 @@
 		if (SEQ_LT(th->th_seq + tlen, tp->rcv_nxt) ||
 		    SEQ_GT(th->th_seq, tp->rcv_nxt + (rwin ? (rwin - 1) : rwin))) {
 			/*
-			 * The connection is idle and this
-			 * is a keepalive.
+			 * Test for exceptions to the in-window check.
 			 */
 			if (th->th_seq == tp->rcv_nxt - 1 &&
 			    th->th_ack == tp->snd_nxt &&
-			    tlen <= 1 && !(thflags & TH_URG)) {
+			    tlen <= 1 && !(thflags & TH_URG) &&
+			    !TCPS_HAVERCVDFIN(tp)) {
+				/*
+				 * The connection is idle and this
+				 * is a keepalive.  Force an ACK and
+				 * continue processing the segment.
+				 *
+				 * 4.2BSD style keepalives do not
+				 * include any data but are off-by-one.
+				 * Detect this case and move sequence
+				 * space forward.  Any FIN on this
+				 * segment must be bogus.
+				 */
 				if (tlen == 0) {
-					th->th_seq = tp->rcv_nxt;
+					th->th_seq++;
 					thflags &= ~TH_FIN;
 				}
 				tp->t_flags |= TF_ACKNOW;
 				tcps.tcps_rcv_keepalive++;
-			} else if ((thflags & TH_URG) && th->th_urg && tlen &&
-			    th->th_seq == tp->rcv_nxt) {
-				/* Continue. XXXAO: tighter check. */
+			} else if (th->th_seq == tp->rcv_nxt - 1 &&
+			    TCPS_HAVERCVDFIN(tp) && (thflags & TH_FIN)) {
+				/*
+				 * Continue with duplicate FIN.
+				 * When we received the orginal FIN
+				 * we integrated it into the sequence
+				 * space.  Thus the in-window check
+				 * fails for a retransmitted FIN.
+				 * Force an ACK and continue processing
+				 * the segment.  This may be just a
+				 * half-close and we continue sending
+				 * more data to the peer.
+				 */
+				tp->t_flags |= TF_ACKNOW;
+			} else if (th->th_seq == tp->rcv_nxt && tlen > 0 &&
+			    (thflags & TH_URG) && th->th_urg > 0) {
+				/*
+				 * We must accept urgent data even when
+				 * the window is closed.  Continue
+				 * processing the segment.
+				 */
 			} else {
 				tcplog("Data outside window, segment ignored, "
 				    "sending challenge ACK");
@@ -1299,7 +1348,8 @@
 		 *   first paragraph, last sentence
 		 *  tcpsecure: section 5.2, mitigation of blind data injection
 		 *  tcpsecure: section 5.2 changes this to just drop
-		 * XXXAO: why?
+		 * XXXAO: Issue raised on TCPM mailing list.  Sending challenge
+		 * ACK back is to be kept and the next draft will be adjusted.
 		 */
 		if (SEQ_GEQ(th->th_ack, tp->snd_nxt) ||
 		    SEQ_LT(th->th_ack, tp->snd_una - tp->snd_maxwnd)) {
@@ -1307,7 +1357,6 @@
 			    "segment ignored, sending challenge ACK");
 			tcpstat.tcps_rcvacktoomuch++;
 			goto dropafterack;
-			/* goto drop; */
 		}
 
 		/*
@@ -1315,14 +1364,13 @@
 		 * ACK for the segment in question.  This should normally
 		 * not happen and indicates either a timing problem at the
 		 * sender or we delay the ACK too much.
+		 * XXXAO: Informal.  Not part of any RFC.
 		 */
 		if (SEQ_GT(th->th_seq, tp->snd_lastack) &&
 		    SEQ_LT(th->th_seq, tp->rcv_nxt) {
 			tcplog("Received retransmit before we sent delayed ACK,"
 			    " no action");
 		}
-
-		/* XXXAO: stats? */
 		break;
 
 	/*
@@ -1338,12 +1386,15 @@
 	 * TCP-MD5 is done after the general acceptability checks
 	 * to run only on most likely valid segments through the
 	 * expensive MD5 hash computation.
-	 * In SYN_RECEIVED case syncache verified the signature
-	 * already.
 	 *  RFC2385: section 2.0, 3.0
-	 * XXXAO: Make work.
+	 *
+	 * NB: In SYN_RECEIVED case the syncache verified the signature
+	 * already.  We do it again here lacking a simple way to transport
+	 * this information.  The overhead is negligable.
+	 *
+	 * XXXAO: Make it work.
 	 */
-	if ((tp->t_flags & TF_SIGNATURE) && notalreadydone) {
+	if (tp->t_flags & TF_SIGNATURE) {
 		/* Copy signature and compare. */
 		tcp_signature_compute(m, sizeof(struct ip), len, optlen,
 		    (u_char *)(th + 1) + sigoff, IPSEC_DIR_INBOUND);
@@ -1368,6 +1419,7 @@
 	if (thflags & TH_RST) {
 		/*
 		 * Filter out what we determine NOT to be legitimate RST's.
+		 *  RFC793: section 3.4, page 36-37, Reset Generation
 		 */
 		switch (tp->t_state) {
 		case TCPS_SYN_SENT:
@@ -1380,6 +1432,7 @@
 				    "segment ignored");
 				goto drop;
 			}
+
 			/*
 			 * The ACK must be within what we sent but does
 			 * not have to ACK the SYN.
@@ -1408,16 +1461,29 @@
 				tcpstat.tcps_badrst++;
 				goto drop;
 			}
+
 			/*
 			 * Check if the sequence number is NOT acceptable to us.
+			 * In general an RST is only valid if it is within the
+			 * receive window.  A more restricted check is to allow
+			 * RST only on the edges of the window.  The fudge factor
+			 * of +/-1 byte is to allow for slightly broken implementations
+			 * that assume RST to be part of the sequence space or
+			 * have some other common off-by-one error.  Also a RST to
+			 * an old style 4.2BSD keepalive is rcv_nxt-1.  snd_last_ack
+			 * is used for the base plus rwin because the remote end
+			 * can't possibly have advance its snd_una without the
+			 * ACK from us.  When delayed ACKs are enabled such a
+			 * discrepancy can happen.  Most of the time tough rcv_nxt
+			 * and snd_last_ack are the same.
 			 *  RFC793: page 70, second check
 			 *  RFC4953: section 3.1.2 (discussion of various methods)
-			 * XXXAO: Three points: rcv_nxt, snd_last_ack, rcv_nxt+rwin?
-			 * XXXAO: Description of +-1 variance.
+			 *  tcp-secure: section 4.2
 			 */
 			if (tcp_secure_rst &&
 			    (SEQ_DELTA(th->th_seq, tp->rcv_nxt) > 1 ||
-			     SEQ_DELTA(th->th_seq, tp->snd_last_ack) > 1)) {
+			     SEQ_DELTA(th->th_seq, tp->snd_last_ack) > 1) ||
+			     SEQ_DELTA(th->th_seq, tp->snd_last_ack + rwin) > 1) {
 				tcplog("RST does not match (secure), segment ignored");
 				tcpstat.tcps_badrst++;
 				goto drop;
@@ -1430,7 +1496,7 @@
 			}
 			break;
 		}
-		tcplog("RST received, closing connection");
+		tcplog("valid RST received, closing connection");
 
 		/*
 		 * Unwind the connection according to its state.
@@ -1447,10 +1513,10 @@
 			 *  RFC1122: section 4.2.2.12 [RST cause]
 			 *  RFC1122: section 4.2.2.13
 			 */
-			so->so_error = ECONNRESET;
-			/* XXXAO: Macro encapsulating state transition changes? */
-			tp->t_state = TCPS_CLOSED;
 			tcpstat.tcps_drops++;
+			so->so_error = (tp->t_state == TCPS_SYN_SENT) ?
+			    ECONNREFUSED : ECONNRESET;
+			TCPS_TRANS(tp, TCPS_CLOSED);
 			tp = tcp_close(tp);
 			break;
 
@@ -1460,29 +1526,56 @@
 			 * The socket is already gone,
 			 * just clean up and be done.
 			 */
+			TCPS_TRANS(tp, TCPS_CLOSED);
 			tp = tcp_close(tp);
 			break;
 		}
+		/*
+		 * XXXAO: Extract the RST cause from the segment data
+		 * based on draft something.
+		 */
 		goto drop;
 	} else if (thflags & TH_SYN) {
 		/*
-		 * We may get a SYN in these cases:
-		 *  remote host went down and comes back up
-		 *  retransmitted SYN-ACK when our ACK was lost
-		 *  malicous SYN
-		 *  simultaneous open in SYN_SENT case is handled there
+		 * A SYN in an established connection has essentially
+		 * the same effect as a RST if it is within the window.
+		 * Otherwise only an ACK without further processing is
+		 * sent back.  Thus we have to be very careful too.
+		 *  RFC793: section 3.9, page 71, fourth check
+		 *
+		 * We may get a SYN in the following situations:
+		 *  o Remote host went down and comes back up,
+		 *    tries to re-establish the connection and
+		 *    hits the same window
+		 *  o Retransmitted SYN-ACK when our ACK was lost,
+		 *    and we have already moved to ESTABLISHED state
+		 *  o Malicious SYN attack to cause a connection abort
+		 *  o Simultaneous open, this is a special case of
+		 *    the SYN-SENT state and handled there.
 		 *
 		 * Instead of dropping the connection right away
 		 * we send a challenge ACK back.  If the connection
 		 * is dead or retried we get back a proper RST which
 		 * will be validated and close the connection then.
-		 * tcp-secure modified recommends behavior to protect
-		 * against "blind in the window" attacks.
-		 *  tcpsecure: section 4.2
+		 *  tcp-secure: section 4.2
 		 */
-		tcplog("SYN received, segment ignored, "
-		    "sending challenge ACK");
-		goto dropafterack;
+		if (!tcp_secure_rst &&
+		    SEQ_GEQ(th->th_seq, tp->rcv_nxt) &&
+		    SEQ_LT(th->th_seq, tp->rcv_nxt + rwin)) {
+			tcplog("SYN received in window (insecure), "
+			    "closing connection");
+			tcpstat.tcps_drops++;
+			so->so_error = ECONNRESET;
+			TCPS_TRANS(tp, TCPS_CLOSED);
+			tp = tcp_close(tp);
+			rstreason = BANDLIM_UNLIMITED;
+                        goto dropwithreset;
+		} else {
+			tcpstat.tcps_badsyn++;
+			tcplog("SYN received, segment ignored, "
+			    "sending challenge ACK");
+			goto dropafterack;
+		}
 	}
 
 	/*
@@ -1498,26 +1591,32 @@
 
 	/*
 	 * Trim segment on the left, i.e. it starts before rcv_nxt.
+	 * If the remote end has already shut down this connection
+	 * drop all data in this segment.
 	 *
 	 * XXXAO: we may have a zero window but have to process URG data.
 	 * Trim off all normal payload and leave only the urg data intact.
 	 */
-	todrop = tp->rcv_nxt - th->th_seq;
+	if (!TCPS_HAVERCVDFIN(tp->t_state))
+		todrop = tp->rcv_nxt - th->th_seq;
+	else
+		todrop = tlen;
 	if (todrop > 0) {
 		KASSERT(todrop <= tlen,
 		    ("%s: left todrop > tlen", __func__));
+
+		/*
+		 * If we drop all data from this segment,
+		 * send an ACK to resynchronize.
+		 * Keep on processing for ACK and options.
+		 *
+		 * NB: The problem pointed out by Stevens
+		 * doesn't apply here.  Segments always
+		 * attach to rcv_nxt and we never have to
+		 * remove the left edge and the FIN.
+		 *  Stevens Vol.2: section 28.8, page 959-960, figure 28.30
+		 */
 		if (todrop == tlen) {
-			/*
-			 * Any valid FIN must be to the left of the window.
-			 * At this point the FIN must be a duplicate or out
-			 * of sequence; drop it.
-			 * XXXAO: Fixup
-			 */
-			thflags &= ~TH_FIN;
-			/*
-			 * Send an ACK to resynchronize and drop any data.
-			 * But keep on processing for ACK and options.
-			 */
 			tp->t_flags |= TF_ACKNOW;
 			tcpstat.tcps_rcvduppack++;
 			tcpstat.tcps_rcvdupbyte += todrop;
@@ -1525,6 +1624,7 @@
 			tcpstat.tcps_rcvpartduppack++;
 			tcpstat.tcps_rcvpartdupbyte += todrop;
 		}
+
 		/*
 		 * Drop from front later together with
 		 * delayed header drop and adjust segment
@@ -1533,6 +1633,7 @@
 		drop_hdrlen += todrop;
 		th->th_seq += todrop;
 		tlen -= todrop;
+
 		/*
 		 * Adjust urgent pointer or blank it out
 		 * if it was within the dropped region.
@@ -1549,32 +1650,34 @@
 	 * Trim segment to the right, ie. it ends after window,
 	 * drop trailing data (and PUSH and FIN);
 	 * if nothing left, just ACK.
-	 * XXXAO: for window probe we may have to trim off the one byte.
 	 */
-	if (!TCPS_HAVERCVDFIN(tp->t_state))
-		todrop = th->th_seq + tlen, tp->rcv_nxt + rwin;
-	else
-		todrop = tlen;
+	todrop = th->th_seq + tlen, tp->rcv_nxt + rwin;
 	if (todrop > 0) {
-		KASSERT(todrop <= tlen, ("%s: right todrop > tlen", __func__));
-		if (todrop == tlen) {
-			/*
-			 * If window is closed we can only take segments at
-			 * the window edge, and have to drop data and PUSH
-			 * from incoming segments.  Continue processing, but
-			 * remember to ACK.  Otherwise, drop segment
-			 * and ACK.
-			 */
-			if (rwin == 0 && th->th_seq == tp->rcv_nxt) {
-				tp->t_flags |= TF_ACKNOW;
-				tcpstat.tcps_rcvwinprobe++;
-			}
+		KASSERT(todrop <= tlen,
+		    ("%s: right todrop > tlen", __func__));
+		KASSERT(!TCPS_HAVERCVDFIN(tp),
+		    ("%s: FIN received, todrop > 0", __func__));
+
+		/*
+		 * If out window is closed, this may be a window probe.
+		 * If so send an ACK and update statistics.
+		 */
+		if (todrop == tlen && rwin == 0 && th->th_seq == tp->rcv_nxt) {
+			tp->t_flags |= TF_ACKNOW;
+			tcpstat.tcps_rcvwinprobe++;
 		}
 		m_adj(m, -todrop);		/* Drop from tail. */
 		tlen -= todrop;
+
+		/*
+		 * A FIN must be at the right edge of the segment.
+		 * When we drop data because it exceeds the window,
+		 * FIN and PUSH become invalid.
+		 */
 		thflags &= ~(TH_PUSH|TH_FIN);
 		tcpstat.tcps_rcvpackafterwin++;
 		tcpstat.tcps_rcvbyteafterwin += todrop;
+
 		/* XXXAO: Urgent pointer? */
 		if ((thflags & TH_URG) && th->th_urp > todrop)
 			th->th_urp -= todrop;
@@ -1583,38 +1686,45 @@
 			th->th_urp = 0;
 		}
 	}
-
-	/*
-	 * Urgent pointer is invalid when we dont have any data.
-	 * XXXAO: Not really.  Urgent data is a hack.
-	 */
-	if ((thflags & TH_URG) && tlen == 0) {
-		thflags &= ~TH_URG;
-		th->th_urp = 0;
-	}
-
 	KASSERT(tlen >= 0,
 	    ("%s: tlen < 0", __func__));
 
 	/*
 	 * If new data is received on a connection after the
-	 * user process is gone, RST the other end.
-	 * We can't ACK the data as we won't be delivering it
-	 * to an application.  We can't just wait here until
-	 * the other side gives up as an connection could go
-	 * on forever.
+	 * socket is closed or the user process is gone, and
+	 * doesn't has a file descriptor reference anymore,
+	 * send an RST the other end.  This is an artifact
+	 * of how TCP's half-close works.  We sent a FIN but
+	 * that only tells the other end that we won't be
+	 * sending any more data, not that we can't receive it
+	 * either in this case.  We can't ACK the data as we
+	 * won't be delivering it to an application.  And we
+	 * can't just wait here and drop the data into a void
+	 * until the other side gives up as that could go on
+	 * forever.
+	 *  Stevens Vol.2: section 28.8, page 957, lines 687-696
+	 *
+	 * NB: Segments without any data but ack'ing our FIN are
+	 * handled later during ACK processing in the "ourfinisacked"
+	 * section.
 	 */
-	if (tp->t_state > TCPS_CLOSE_WAIT && tlen &&
+	if (TCPS_HAVECLOSED(tp->t_state) && tlen > 0 &&
 	    (so->so_state & SS_NOFDREF)) {
 		tcpstat.tcps_rcvafterclose++;
+		TCPS_TRANS(tp, TCPS_CLOSED);
 		tp = tcp_close(tp);
 		rstreason = BANDLIM_UNLIMITED;
 		goto dropwithreset;
 	}
 
 doack:
+	KASSERT((thflags & (TH_SYN|TH_RST|TH_ACK)) == TH_ACK,
+	    ("%s: no ACK flag or TH_SYN, TH_RST present", __func__));
+
 	/*
 	 * ACK, timing, option and congestion control processing.
+	 *
+	 * NB: From now on the tcpcb may be modified.
 	 */
 	acked = th->th_ack - tp->snd_una;
 	if (acked < 0)
@@ -1623,40 +1733,48 @@
 	tcpstat.tcps_rcvackbyte += acked;
 
 	/*
+	 * Update send SACK information and tell us how much more
+	 * data has left the network (relative to last SACK we got).
+	 */
+	if ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))
+		sacked = tcp_sack_doack(tp, &to, th->th_ack);
+	else
+		sacked = 0;
+
+	/*
 	 * Update send window information.
+	 *
+	 * NB: This function looks at the timestamp differences
+	 * (if available) and must run before the timestamp updates
+	 * are processed and integrated.
 	 */
-	nudgeoutput = tcp_do_wu(tp, th, &to, tiwin, tlen);
+	nudgeoutput = tcp_do_wu(tp, th, &to, tiwin, acked, tlen, sacked);
 
 	/*
 	 * Update and recompute connection timing information.
 	 * This includes RTT, SRTT, etc.
 	 */
-	tcp_do_time(tp, th, &to, acked, tlen);
-
-	/*
-	 * Update send SACK information.
-	 */
-	if ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))
-		tcp_sack_doack(tp, &to, th->th_ack);
+	tcp_do_time(tp, th, &to, acked, tlen, sacked);
 
 	/*
 	 * Update congestion control information.
 	 */
-	tcp_congest(tp, th, tiwin, acked, tlen, flags);
+	nudgeoutput = tcp_congest(tp, th, tiwin, acked, tlen, sacked);
 
 	/*
-	 * Drop acknowledged data from send socket buffer.
+	 * Drop acknowledged data from send socket buffer
+	 * and advance the unacknowledged pointer.
 	 *  RFC793: section 3.9, page 72, fifth check
 	 */
 	if (acked > 0)
-		int cantrcvmore;
-
 		SOCKBUF_LOCK(&so->so_snd);
 
 		KASSERT(SEQ_GT(th->th_ack, tp->snd_nxt),
-		    ("%s: ", __func__));
+		    ("%s: more acked than sent", __func__));
 		KASSERT(acked <= so->so_snd.sb_cc + 1,
 		    ("%s: more acked than in send buffer", __func__));
+		KASSERT(!TCPS_HAVERCVDFIN(tp),
+		    ("%s: FIN already processed but acked > 0", __func__));
 
 		/*
 		 * The ack to our FIN advances the sequence space
@@ -1676,16 +1794,10 @@
 
 		/*
 		 * Advance the unacknowledged pointer.
-		 *  RFC793: section 3.9, page 72, fifth check
 		 */
 		tp->snd_una = th->th_ack;
 
 		/*
-		 * Obtain sb_state before unlock for later use.
-		 */
-		cantrcvmore = so->so_rcv.sb_state & SBS_CANTRCVMORE;
-
-		/*
 		 * Wake up and inform any writers on the socket.
 		 *
 		 * NB: sowwakeup_locked() does an implicit unlock.
@@ -1709,28 +1821,47 @@
 			switch (tp->t_state) {
 			case TCPS_FIN_WAIT_1:
 				/*
-				 * If we can't receive any more data,
-				 * then closing user can proceed.
-				 * XXXAO: better description and reference
-				 * to discussion.
+				 * We sent FIN and now it has been ack'ed.
+				 *
+				 * Contrary to the specification we start
+				 * a timer if the connection was completely
+				 * closed (i.e. the socket was closed or the
+				 * process is gone).  This is an artifact
+				 * of the half-close system used by TCP.
+				 * If the other side doesn't closes or shuts
+				 * down and never sends a FIN we would wait
+				 * forever.  If the process did an explicit
+				 * half-close (using shutdown(s, SHUT_WR))
+				 * this does not apply as only our FIN was
+				 * sent and the process can still receive data.
+				 *  Stevens Vol.2: section 29.5, page 980,
+				 *  lines 972-975
+				 *
+				 * NB: We never transition directly into
+				 * TIME_WAIT state from here.  It is always
+				 * done in two steps via FIN_WAIT_2.  The FIN
+				 * handling later on will make the final
+				 * transition to TIME_WAIT if this segment
+				 * contains a FIN as well.
 				 *
-				 * NB: Starting the timer is contrary to the
-				 * specification, but if we don't get a FIN
-				 * we'll hang forever.
+				 * XXXAO: Make sure to disable other timers.
+				 * XXXAO: Unlocked access to receive sockbuf.
 				 */
-				if (cantrcvmore) {
+				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle ?
 						tcp_finwait2_timeout :
 						tcp_maxidle));
 				}
-				/* XXX: Make sure to disable other timers. */
-				tp->t_state = TCPS_FIN_WAIT_2;
+				TCPS_TRANS(tp, TCPS_FIN_WAIT_2);
 				break;
 
 			case TCPS_CLOSING:
 				/*
+				 * We sent FIN, now it has been ack'ed,
+				 * we received FIN and have to ACK it.
+				 *
 				 * Create a compressed TIME-WAIT state
 				 * with minimal information and discard
 				 * this tcpcb to save memory.
@@ -1738,19 +1869,20 @@
 				tcp_twstart(tp);
 				tp = NULL;
 				INP_INFO_WUNLOCK(&tcbinfo);
-				m_freem(m);
-				m = NULL;
-				goto done;
+				goto drop;
 				break;
 
 			case TCPS_LAST_ACK:
 				/*
+				 * We received FIN, ack'ed it, sent FIN
+				 * and now it is ack'ed as well.
+				 *
 				 * Our FIN is acknowledged.  The session
 				 * teardown is completed and this tcpcb
 				 * is discarded.
 				 *  RFC793: section 3.9, page 73
 				 */
-				tp->t_state = TCPS_CLOSED;
+				TCPS_TRANS(tp, TCPS_CLOSED);
 				tp = tcp_close(tp);
 				goto drop;
 				break;
@@ -1767,9 +1899,11 @@
 	}
 
 	/*
-	 * If the segment doesn't carry any data we are done.
+	 * If the segment doesn't carry any data, urgent data
+	 * or FIN we are done.
 	 */
-	if (tlen == 0 && !(th->th_flags & TH_FIN) && !TCPS_HAVERCVDFIN(tp->t_state)) {
+	if (tlen == 0 && (th->th_flags & TH_FIN) == 0 &&
+	    !TCPS_HAVERCVDFIN(tp->t_state)) {
 		INP_INFO_WUNLOCK(&tcpinfo);
 		m_freem(m);
 		m = NULL;
@@ -1790,13 +1924,21 @@
 	}
 
 	/*
-	 * Process the segment data, merging it into the TCP reassembly
-	 * queue, and arranging for acknowledgment of receipt if necessary.
-	 * If a FIN has already been received on this connection then
-	 * we just ignore the data.  In fact we should not get any.
+	 * Process the segment data and append it to the receive
+	 * socket buffer.  If the segment does not start with rcv_nxt
+	 * then merge it into the reassembly queue for later processing.
+	 * If a FIN has already been received and processed we just
+	 * ignore the data, it is bogus.
 	 *  RFC793: section 3.9, page 74, seventh action
+	 *
+	 * What we get here:
+	 *  segment with data (and FIN)
+	 *  segment after FIN received is truncated to zero
+	 *  segment with FIN only
+	 *  segment with urgent that got pulled and now is zero
 	 */
-	if (!TCPS_HAVERCVDFIN(tp->t_state)) {
+	if (!TCPS_HAVERCVDFIN(tp->t_state) &&
+	    (tlen > 0 || (tp->rcv_trq != NULL && th->th_flags & TH_FIN))) {
 		int newsize = 0;	/* Rcvbuf autoscaling. */
 
 		/*
@@ -1805,7 +1947,7 @@
 		m_adj(m, drop_hdrlen);
 		if (m->m_next != NULL) {
 			/*
-			 * XXX: m_adj doesn't drop mbufs from the front,
+			 * XXXAO: m_adj doesn't drop mbufs from the front,
 			 * it only empties and skips them.  Free empty
 			 * mbufs to optimize space usage.  Especially
 			 * important if we work with header splitting.
@@ -1831,8 +1973,6 @@
 		 * will go there as well.  Whenever a segment or a row of
 		 * segments can be reassembled and the left edge exactly
 		 * matches, it is returned together for further processing.
-		 * The FIN flag is queued as well and no segments beyond
-		 * its right edge are accepted.
 		 * The FIN flag is stored in the mbuf header as M_PROTO1.
 		 * Send a forced ACK for every segment we receive when
 		 * we are doing reassembly to immediately inform the sender
@@ -1843,7 +1983,7 @@
 		if (th->th_seq != tp->rcv_nxt || !TAILQ_EMPTY(&tp->rcv_trq)) {
 			m = tcp_reass(tp, m, tlen, thflags);
 
-			if (m && (m->m_flags & M_PROTO1))
+			if (m != NULL && (m->m_flags & M_PROTO1))
 				thflags |= TH_FIN;
 			else if (m == NULL)
 				thflags &= ~TH_FIN;
@@ -1884,10 +2024,13 @@
 		 * TODO: Only step up if the application is actually serving
 		 * the buffer to better manage the socket buffer resources.
 		 * TODO: Scope of one second may be too big.  Rather one RTT.
+		 *
+		 * XXXAO: When bulk data comes in from the reassembly queue
+		 * don't step up the socket buffer.
 		 */
-		if (m && tcp_do_autorcvbuf && (to.to_flags & TOF_TS) &&
+		if (m != NULL && tcp_do_autorcvbuf && (to.to_flags & TOF_TS) &&
 		    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
-			if (to.to_tsecr > tp->rfbuf_ts &&
+			if (TS_GT(to.to_tsecr, tp->rfbuf_ts) &&
 			    to.to_tsecr - tp->rfbuf_ts < hz) {
 				if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
 				    so->so_rcv.sb_hiwat < tcp_autorcvbuf_max) {
@@ -1915,13 +2058,20 @@
 		 */
 		if (m) {
 			SOCKBUF_LOCK(&so->so_rcv);
-
+			/*
+			 * As long as the socket is still open we can
+			 * put data into the receive buffer.  If the
+			 * socket was closed we dispose of this tcpcb
+			 * and send a RST to the other side to inform
+			 * it of our inability to accept further data.
+			 *  RFC793: ...
+			 */
 			if (!(so->so_rcv.sb_state & SBS_CANTRCVMORE)) {
-				sbappendstream_locked(&so->so_rcv, m);
-
 				/*
-				 * Mbuf chain was consumed.
+				 * Append data to receive socket buffer and
+				 * note that mbuf chain was consumed.
 				 */
+				sbappendstream_locked(&so->so_rcv, m);
 				m = NULL;
 
 				/*
@@ -1931,8 +2081,7 @@
 				tp->rcv_nxt += tlen;
 
 				/*
-				 * Receive buffer autosizing.
-				 * Adjust socket buffer size.
+				 * Adjust socket buffer size when autosizing.
 				 */
 				if (newsize) {
 				    if (!sbreserve_locked(&so->so_rcv,
@@ -1955,21 +2104,31 @@
 				 * The only option we have is to close
 				 * and send a RST.  We can't ack the data
 				 * when the process won't ever read it.
+				 *
+				 * XXXAO: Overlap with TCPS_HAVECLOSED check?
+				 * The situation is the same but different
+				 * test?  Protecting against race condition
+				 * between earlier check and this one.
 				 */
 				tcpstat.tcps_rcvafterclose++;
+				TCPS_TRANS(tp, TCPS_CLOSED);
 				tp = tcp_close(tp);
 				rstreason = BANDLIM_UNLIMITED;
 				goto dropwithreset;
 			}
+			/*
+			 * NB: Continue with segment flags.
+			 */
 		}
 	} else {
-		KASSERT(TAILQ_EMPTY(&tp->rcv_trq),
-		    ("%s: FIN state but reassembly queue not empty", __func__));
+		KASSERT(!TCPS_HAVERCVDFIN(tp) || TAILQ_EMPTY(&tp->rcv_trq),
+		    ("%s: FIN received but reassembly queue not empty", __func__));
 		KASSERT(tlen == 0,
-		    ("%s: data after FIN", __func__))
+		    ("%s: data unexpected", __func__))
 
 		/*
-		 * 
+		 * Dispose of segment.  The data was already dropped
+		 * and the segment headers are now useless.
 		 */
 		m_freem(m);
 		m = NULL;
@@ -2001,44 +2160,69 @@
 
 		switch (tp->t_state) {
 		case TCPS_ESTABLISHED:
-			tp->t_state = TCPS_CLOSE_WAIT;
+			/*
+			 * Normal half-close where the other end
+			 * shuts down their side of the connection.
+			 * Just move to CLOSE_WAIT.  We can still
+			 * send data.
+			 */
+			TCPS_TRANS(tp, TCPS_CLOSE_WAIT);
 			break;
 
 		case TCPS_FIN_WAIT_1:
 			/*
-			 * If still in FIN_WAIT_1 state FIN has not
-			 * been acked so enter the CLOSING state.
-			 * XXXAO: ?
+			 * We have closed our send stream.  The
+			 * other side did the same at the same
+			 * time.  A simultaneous close is taking
+			 * place.  Our two FINs have crossed each
+			 * other in the network.  All that is left
+			 * is our ACK for their FIN and their ACK
+			 * for our FIN.
+			 *
+			 * NB: No direct transition into TIME_WAIT
+			 * happens because the ACK processing earlier
+			 * always moves us via FIN_WAIT_2.
 			 */
-			if (tp->snd_una == tp->snd_nxt)
-				/*
-				 * go into TIME_WAIT
-				 * we always go through fin_wait_2
-				 * when the fin-ack came in.
-				 */
-			else
-				tp->t_state = TCPS_CLOSING;
+			TCPS_TRANS(tp, TCPS_CLOSING);
 			break;
 
 		case TCPS_FIN_WAIT_2:
 			/*
-			 * In FIN_WAIT_2 state enter the TIME_WAIT state.
-			 * tcp_twstart() discards this tcpcb and creates
-			 * a compressed state.
+			 * Our FIN is ack'ed and we got the FIN
+			 * from the other end as well.  We have
+			 * to ACK their FIN and are done.
+			 * tcp_twstart() discards this tcpcb and
+			 * creates a compressed state.
+			 *
+			 * NB: This connection may have been in
+			 * FIN_WAIT_1 earlier and the ACK processing
+			 * moved it into FIN_WAIT_2 when it received
+			 * the ACK for our FIN.
 			 */
 			tcp_twstart(tp);
 			INP_INFO_WUNLOCK(&tcbinfo);
 			goto done;
 
 		case TCPS_CLOSING:
+			/*
+			 * We already received a FIN for this
+			 * connection and this is a retransmit.
+			 * Send an ACK back and wait for their
+			 * ACK to our FIN.
+			 */
+			break;
+
 		case TCPS_CLOSE_WAIT:
-		case TCPS_LAST_ACK:	/* XXXAO: Why wasn't this here before? */
 			/*
-			 * We may get retransmitted FINs,
-			 * just ack them below.
+			 * We already received a FIN for this
+			 * connection and this is a retransmit.
+			 * The TCP connection is only half-closed
+			 * and the application may still send data.
+			 * Send back an ACK for their FIN.
 			 */
 			break;
 		
+		case TCPS_LAST_ACK:
 		default:
 			KASSERT(1 == 0,
 			    ("%s: FIN processing, illegal state", __func__));
@@ -2057,13 +2241,17 @@
 	INP_INFO_UNLOCK_ASSERT(&tcbinfo);
 	INP_LOCK_ASSERT(tp->t_inpcb);
 	KASSERT(m == NULL, ("%s: mbuf leak", __func__));
+

>>> TRUNCATED FOR MAIL (1000 lines) <<<



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200806120911.m5C9B2Qr003113>