Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 3 Feb 2008 10:48:37 GMT
From:      Andre Oppermann <andre@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 134712 for review
Message-ID:  <200802031048.m13AmbeD066413@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=134712

Change 134712 by andre@andre_flirtbox on 2008/02/03 10:48:06

	Push work in progress on TCP rewrite from my local branch into perforce.
	
	TCP input is completely rewritten and restructured.  This condenses it
	down quite a bit and makes it very readable and understandable again.
	
	Details:
	    - tcp_do_segment() rewritten and condensed
	    - all cruft removed
	    - all congestion control removed to reappear in modular tcp_congest.c
	    - window update handling fixed and moved to its own function
	    - connection timing handling moved to its own function (rtt, srtt)
	    - urgent data handling moved to its own function
	
	Please note that this is work in progress and neither complete nor functional
	at the moment.
	
	The other modified files will follow one by one.

Affected files ...

.. //depot/projects/tcp_new/netinet/tcp_input.c#2 edit

Differences ...

==== //depot/projects/tcp_new/netinet/tcp_input.c#2 (text+ko) ====

@@ -153,6 +153,8 @@
 static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
 static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
 		     struct socket *, struct tcpcb *, int, int);
+static void	 tcp_do_time(struct tcpcb *tp, struct tcphdr *th,
+		     struct tcpopt *to);
 static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 static void	 tcp_pulloutofband(struct socket *,
@@ -864,392 +866,87 @@
 	return;
 }
 
+#define	tcplog	(s = tcp_log_addrs(tcpcbtoinc(tp), th, NULL, NULL))
+
 static void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen)
 {
-	int thflags, acked, ourfinisacked, needoutput = 0;
-	int headlocked = 1;
-	int rstreason, todrop, win;
-	u_long tiwin;
+	int thflags, acked, ourfinisacked, nudgeoutput = 0;
+	int rstreason, todrop, rwin;
+	tcp_win tiwin;
 	struct tcpopt to;
+	char *s = NULL;
 
-#ifdef TCPDEBUG
-	/*
-	 * The size of tcp_saveipgen must be the size of the max ip header,
-	 * now IPv6.
-	 */
-	u_char tcp_saveipgen[IP6_HDR_LEN];
-	struct tcphdr tcp_savetcp;
-	short ostate = 0;
-#endif
-	thflags = th->th_flags;
-
 	INP_INFO_WLOCK_ASSERT(&tcbinfo);
 	INP_LOCK_ASSERT(tp->t_inpcb);
-	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
-	    __func__));
-	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
-	    __func__));
+	KASSERT(tp->t_state > TCPS_LISTEN,
+	    ("%s: TCPS_LISTEN", __func__));
+	KASSERT(tp->t_state != TCPS_TIME_WAIT,
+	    ("%s: TCPS_TIME_WAIT", __func__));
 
 	/*
-	 * Segment received on connection.
-	 * Reset idle time and keep-alive timer.
-	 * XXX: This should be done after segment
-	 * validation to ignore broken/spoofed segs.
+	 * Store the flags in a variable for easy manipulation.
 	 */
-	tp->t_rcvtime = ticks;
-	if (TCPS_HAVEESTABLISHED(tp->t_state))
-		tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
+	thflags = th->th_flags;
 
 	/*
 	 * Unscale the window into a 32-bit value.
-	 * For the SYN_SENT state the scale is zero.
+	 * 
+	 * NB: For the SYN_SENT state the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 
 	/*
-	 * Parse options on any incoming segment.
+	 * Parse options on any incoming segment (if present).
 	 */
-	tcp_dooptions(&to, (u_char *)(th + 1),
-	    (th->th_off << 2) - sizeof(struct tcphdr),
-	    (thflags & TH_SYN) ? TO_SYN : 0);
+	if ((th->th_off << 2) != sizeof(struct tcphdr))
+		tcp_dooptions(&to, (u_char *)(th + 1),
+		    (th->th_off << 2) - sizeof(struct tcphdr),
+		    (thflags & TH_SYN) ? TO_SYN : 0);
+	else
+		to.to_flags = 0;
 
 	/*
-	 * If echoed timestamp is later than the current time,
-	 * fall back to non RFC1323 RTT calculation.  Normalize
-	 * timestamp if syncookies were used when this connection
-	 * was established.
+	 * Normalize timestamp if syncookies were used when this
+	 * connection was established.
 	 */
-	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+	if (to.to_flags & TOF_TS)
 		to.to_tsecr -= tp->ts_offset;
-		if (TSTMP_GT(to.to_tsecr, ticks))
-			to.to_tsecr = 0;
-	}
 
 	/*
-	 * Process options only when we get SYN/ACK back. The SYN case
-	 * for incoming connections is handled in tcp_syncache.
-	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
-	 * or <SYN,ACK>) segment itself is never scaled.
-	 * XXX this is traditional behavior, may need to be cleaned up.
+	 * Calculate amount of space in receive window.
+	 * Receive window is amount of space in rcv queue,
+	 * but not less than advertised window.
 	 */
-	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
-		if ((to.to_flags & TOF_SCALE) &&
-		    (tp->t_flags & TF_REQ_SCALE)) {
-			tp->t_flags |= TF_RCVD_SCALE;
-			tp->snd_scale = to.to_wscale;
-		}
-		/*
-		 * Initial send window.  It will be updated with
-		 * the next incoming segment to the scaled value.
-		 */
-		tp->snd_wnd = th->th_win;
-		if (to.to_flags & TOF_TS) {
-			tp->t_flags |= TF_RCVD_TSTMP;
-			tp->ts_recent = to.to_tsval;
-			tp->ts_recent_age = ticks;
-		}
-		if (to.to_flags & TOF_MSS)
-			tcp_mss(tp, to.to_mss);
-		if ((tp->t_flags & TF_SACK_PERMIT) &&
-		    (to.to_flags & TOF_SACKPERM) == 0)
-			tp->t_flags &= ~TF_SACK_PERMIT;
-	}
+	rwin = sbspace(&so->so_rcv);
+	rwin = imax(rwin, (int)(tp->rcv_adv - tp->rcv_nxt));
 
 	/*
-	 * Header prediction: check for the two common cases
-	 * of a uni-directional data xfer.  If the packet has
-	 * no control flags, is in-sequence, the window didn't
-	 * change and we're not retransmitting, it's a
-	 * candidate.  If the length is zero and the ack moved
-	 * forward, we're the sender side of the xfer.  Just
-	 * free the data acked & wake any higher level process
-	 * that was blocked waiting for space.  If the length
-	 * is non-zero and the ack didn't move, we're the
-	 * receiver side.  If we're getting packets in-order
-	 * (the reassembly queue is empty), add the data to
-	 * the socket buffer and note that we need a delayed ack.
-	 * Make sure that the hidden state-flags are also off.
-	 * Since we check for TCPS_ESTABLISHED first, it can only
-	 * be TH_NEEDSYN.
-	 */
-	if (tp->t_state == TCPS_ESTABLISHED &&
-	    th->th_seq == tp->rcv_nxt &&
-	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
-	    tp->snd_nxt == tp->snd_max &&
-	    tiwin && tiwin == tp->snd_wnd && 
-	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
-	    LIST_EMPTY(&tp->t_segq) &&
-	    ((to.to_flags & TOF_TS) == 0 ||
-	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
-
-		/*
-		 * If last ACK falls within this segment's sequence numbers,
-		 * record the timestamp.
-		 * NOTE that the test is modified according to the latest
-		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
-		 */
-		if ((to.to_flags & TOF_TS) != 0 &&
-		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
-			tp->ts_recent_age = ticks;
-			tp->ts_recent = to.to_tsval;
-		}
-
-		if (tlen == 0) {
-			if (SEQ_GT(th->th_ack, tp->snd_una) &&
-			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
-			    tp->snd_cwnd >= tp->snd_wnd &&
-			    ((!tcp_do_newreno &&
-			      !(tp->t_flags & TF_SACK_PERMIT) &&
-			      tp->t_dupacks < tcprexmtthresh) ||
-			     ((tcp_do_newreno ||
-			       (tp->t_flags & TF_SACK_PERMIT)) &&
-			      !IN_FASTRECOVERY(tp) &&
-			      (to.to_flags & TOF_SACK) == 0 &&
-			      TAILQ_EMPTY(&tp->snd_holes)))) {
-				KASSERT(headlocked,
-				    ("%s: headlocked", __func__));
-				INP_INFO_WUNLOCK(&tcbinfo);
-				headlocked = 0;
-				/*
-				 * This is a pure ack for outstanding data.
-				 */
-				++tcpstat.tcps_predack;
-				/*
-				 * "bad retransmit" recovery.
-				 */
-				if (tp->t_rxtshift == 1 &&
-				    ticks < tp->t_badrxtwin) {
-					++tcpstat.tcps_sndrexmitbad;
-					tp->snd_cwnd = tp->snd_cwnd_prev;
-					tp->snd_ssthresh =
-					    tp->snd_ssthresh_prev;
-					tp->snd_recover = tp->snd_recover_prev;
-					if (tp->t_flags & TF_WASFRECOVERY)
-					    ENTER_FASTRECOVERY(tp);
-					tp->snd_nxt = tp->snd_max;
-					tp->t_badrxtwin = 0;
-				}
-
-				/*
-				 * Recalculate the transmit timer / rtt.
-				 *
-				 * Some boxes send broken timestamp replies
-				 * during the SYN+ACK phase, ignore
-				 * timestamps of 0 or we could calculate a
-				 * huge RTT and blow up the retransmit timer.
-				 */
-				if ((to.to_flags & TOF_TS) != 0 &&
-				    to.to_tsecr) {
-					if (!tp->t_rttlow ||
-					    tp->t_rttlow > ticks - to.to_tsecr)
-						tp->t_rttlow = ticks - to.to_tsecr;
-					tcp_xmit_timer(tp,
-					    ticks - to.to_tsecr + 1);
-				} else if (tp->t_rtttime &&
-				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
-					if (!tp->t_rttlow ||
-					    tp->t_rttlow > ticks - tp->t_rtttime)
-						tp->t_rttlow = ticks - tp->t_rtttime;
-					tcp_xmit_timer(tp,
-							ticks - tp->t_rtttime);
-				}
-				tcp_xmit_bandwidth_limit(tp, th->th_ack);
-				acked = th->th_ack - tp->snd_una;
-				tcpstat.tcps_rcvackpack++;
-				tcpstat.tcps_rcvackbyte += acked;
-				sbdrop(&so->so_snd, acked);
-				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
-				    SEQ_LEQ(th->th_ack, tp->snd_recover))
-					tp->snd_recover = th->th_ack - 1;
-				tp->snd_una = th->th_ack;
-				/*
-				 * Pull snd_wl2 up to prevent seq wrap relative
-				 * to th_ack.
-				 */
-				tp->snd_wl2 = th->th_ack;
-				tp->t_dupacks = 0;
-				m_freem(m);
-				ND6_HINT(tp); /* Some progress has been made. */
-
-				/*
-				 * If all outstanding data are acked, stop
-				 * retransmit timer, otherwise restart timer
-				 * using current (possibly backed-off) value.
-				 * If process is waiting for space,
-				 * wakeup/selwakeup/signal.  If data
-				 * are ready to send, let tcp_output
-				 * decide between more output or persist.
-				 */
-#ifdef TCPDEBUG
-				if (so->so_options & SO_DEBUG)
-					tcp_trace(TA_INPUT, ostate, tp,
-					    (void *)tcp_saveipgen,
-					    &tcp_savetcp, 0);
-#endif
-				if (tp->snd_una == tp->snd_max)
-					tcp_timer_activate(tp, TT_REXMT, 0);
-				else if (!tcp_timer_active(tp, TT_PERSIST))
-					tcp_timer_activate(tp, TT_REXMT,
-						      tp->t_rxtcur);
-				sowwakeup(so);
-				if (so->so_snd.sb_cc)
-					(void) tcp_output(tp);
-				goto check_delack;
-			}
-		} else if (th->th_ack == tp->snd_una &&
-		    tlen <= sbspace(&so->so_rcv)) {
-			int newsize = 0;	/* automatic sockbuf scaling */
-
-			KASSERT(headlocked, ("%s: headlocked", __func__));
-			INP_INFO_WUNLOCK(&tcbinfo);
-			headlocked = 0;
-			/*
-			 * This is a pure, in-sequence data packet
-			 * with nothing on the reassembly queue and
-			 * we have enough buffer space to take it.
-			 */
-			/* Clean receiver SACK report if present */
-			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
-				tcp_clean_sackreport(tp);
-			++tcpstat.tcps_preddat;
-			tp->rcv_nxt += tlen;
-			/*
-			 * Pull snd_wl1 up to prevent seq wrap relative to
-			 * th_seq.
-			 */
-			tp->snd_wl1 = th->th_seq;
-			/*
-			 * Pull rcv_up up to prevent seq wrap relative to
-			 * rcv_nxt.
-			 */
-			tp->rcv_up = tp->rcv_nxt;
-			tcpstat.tcps_rcvpack++;
-			tcpstat.tcps_rcvbyte += tlen;
-			ND6_HINT(tp);	/* Some progress has been made */
-#ifdef TCPDEBUG
-			if (so->so_options & SO_DEBUG)
-				tcp_trace(TA_INPUT, ostate, tp,
-				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
-#endif
-		/*
-		 * Automatic sizing of receive socket buffer.  Often the send
-		 * buffer size is not optimally adjusted to the actual network
-		 * conditions at hand (delay bandwidth product).  Setting the
-		 * buffer size too small limits throughput on links with high
-		 * bandwidth and high delay (eg. trans-continental/oceanic links).
-		 *
-		 * On the receive side the socket buffer memory is only rarely
-		 * used to any significant extent.  This allows us to be much
-		 * more aggressive in scaling the receive socket buffer.  For
-		 * the case that the buffer space is actually used to a large
-		 * extent and we run out of kernel memory we can simply drop
-		 * the new segments; TCP on the sender will just retransmit it
-		 * later.  Setting the buffer size too big may only consume too
-		 * much kernel memory if the application doesn't read() from
-		 * the socket or packet loss or reordering makes use of the
-		 * reassembly queue.
-		 *
-		 * The criteria to step up the receive buffer one notch are:
-		 *  1. the number of bytes received during the time it takes
-		 *     one timestamp to be reflected back to us (the RTT);
-		 *  2. received bytes per RTT is within seven eighth of the
-		 *     current socket buffer size;
-		 *  3. receive buffer size has not hit maximal automatic size;
-		 *
-		 * This algorithm does one step per RTT at most and only if
-		 * we receive a bulk stream w/o packet losses or reorderings.
-		 * Shrinking the buffer during idle times is not necessary as
-		 * it doesn't consume any memory when idle.
-		 *
-		 * TODO: Only step up if the application is actually serving
-		 * the buffer to better manage the socket buffer resources.
-		 */
-			if (tcp_do_autorcvbuf &&
-			    to.to_tsecr &&
-			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
-				if (to.to_tsecr > tp->rfbuf_ts &&
-				    to.to_tsecr - tp->rfbuf_ts < hz) {
-					if (tp->rfbuf_cnt >
-					    (so->so_rcv.sb_hiwat / 8 * 7) &&
-					    so->so_rcv.sb_hiwat <
-					    tcp_autorcvbuf_max) {
-						newsize =
-						    min(so->so_rcv.sb_hiwat +
-						    tcp_autorcvbuf_inc,
-						    tcp_autorcvbuf_max);
-					}
-					/* Start over with next RTT. */
-					tp->rfbuf_ts = 0;
-					tp->rfbuf_cnt = 0;
-				} else
-					tp->rfbuf_cnt += tlen;	/* add up */
-			}
-
-			/* Add data to socket buffer. */
-			SOCKBUF_LOCK(&so->so_rcv);
-			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
-				m_freem(m);
-			} else {
-				/*
-				 * Set new socket buffer size.
-				 * Give up when limit is reached.
-				 */
-				if (newsize)
-					if (!sbreserve_locked(&so->so_rcv,
-					    newsize, so, curthread))
-						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
-				m_adj(m, drop_hdrlen);	/* delayed header drop */
-				sbappendstream_locked(&so->so_rcv, m);
-			}
-			/* NB: sorwakeup_locked() does an implicit unlock. */
-			sorwakeup_locked(so);
-			if (DELAY_ACK(tp)) {
-				tp->t_flags |= TF_DELACK;
-			} else {
-				tp->t_flags |= TF_ACKNOW;
-				tcp_output(tp);
-			}
-			goto check_delack;
-		}
-	}
-
-	/*
-	 * Calculate amount of space in receive window,
-	 * and then do TCP input processing.
-	 * Receive window is amount of space in rcv queue,
-	 * but not less than advertised window.
+	 * Validation checks.  We may get any shit here.  Have to be careful.
 	 */
-	win = sbspace(&so->so_rcv);
-	if (win < 0)
-		win = 0;
-	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
-
-	/* Reset receive buffer auto scaling when not in bulk receive mode. */
-	tp->rfbuf_ts = 0;
-	tp->rfbuf_cnt = 0;
-
 	switch (tp->t_state) {
-
 	/*
 	 * If the state is SYN_RECEIVED:
-	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
+	 *	syncache handled all validation, socket, inpcb and tcpcb
+	 *	setup for us.  All that is left is the state transition
+	 *	into established state and initializations of the timers.
 	 */
 	case TCPS_SYN_RECEIVED:
-		if ((thflags & TH_ACK) &&
-		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
-		     SEQ_GT(th->th_ack, tp->snd_max))) {
-				rstreason = BANDLIM_RST_OPENPORT;
-				goto dropwithreset;
-		}
+		tp->t_starttime = ticks;
+		tp->t_state = TCPS_ESTABLISHED;
+
+		soisconnected(so);
+		tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);	/* XXX: not here */
+
+		tcpstat.tcps_connects++;
 		break;
 
 	/*
 	 * If the state is SYN_SENT:
+	 *	if seg contains a RST, then drop the connection.
+	 *	if seg does not contain SYN and ACK, then drop it.
 	 *	if seg contains an ACK, but not for our SYN, drop the input.
-	 *	if seg contains a RST, then drop the connection.
-	 *	if seg does not contain SYN, then drop it.
 	 * Otherwise this is an acceptable SYN segment
 	 *	initialize tp->rcv_nxt and tp->irs
 	 *	if seg contains ack then advance tp->snd_una
@@ -1258,771 +955,527 @@
 	 *	continue processing rest of data/controls, beginning with URG
 	 */
 	case TCPS_SYN_SENT:
-		if ((thflags & TH_ACK) &&
-		    (SEQ_LEQ(th->th_ack, tp->iss) ||
-		     SEQ_GT(th->th_ack, tp->snd_max))) {
-			rstreason = BANDLIM_UNLIMITED;
+		/*
+		 * RST is handled below.
+		 */
+		if (thflags & TH_RST)
+			break;
+
+		/*
+		 * SYN|ACK must be present.
+		 */
+		if (thflags & (TH_SYN|TH_ACK) != (TH_SYN|TH_ACK)) {
+			tcplog("Missing SYN|ACK, segment ignored");
+			goto drop;
+		}
+
+		/*
+		 * ACK must ack our ISN and any data we may
+		 * have sent with our SYN.
+		 */
+		if (SEQ_LEQ(th->th_ack, tp->snd_iss) ||
+		    SEQ_GEQ(th->th_ack, tp->snd_nxt) ||
+		    SEQ_LT(th->th_ack, tp->snd_una)) {
+			tcplog("Incorrect ACK, segment rejected");
+			/* XXXAO: Close connection? Or ignore. */
 			goto dropwithreset;
 		}
-		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST))
-			tp = tcp_drop(tp, ECONNREFUSED);
-		if (thflags & TH_RST)
-			goto drop;
-		if (!(thflags & TH_SYN))
-			goto drop;
+
+		/*
+		 * Option processing:
+		 *
+		 * If there wasn't a MSS option fall back to
+		 * default mss.
+		 */
+		if (!(tp->t_flags & TF_NOOPT) && (to.to_flags & TOF_MSS))
+			tcp_mss(tp, to.to_mss);
+		else if (tcp_do_path_mtu_discovery)
+			/* MTU of interface... */
+		else
+			tcp_mss(tp, tcp_mssdflt);
 
-		tp->irs = th->th_seq;
-		tcp_rcvseqinit(tp);
-		if (thflags & TH_ACK) {
-			tcpstat.tcps_connects++;
-			soisconnected(so);
-#ifdef MAC
-			SOCK_LOCK(so);
-			mac_socketpeer_set_from_mbuf(m, so);
-			SOCK_UNLOCK(so);
-#endif
-			/* Do window scaling on this connection? */
-			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
-				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
-				tp->rcv_scale = tp->request_r_scale;
-			}
-			tp->rcv_adv += tp->rcv_wnd;
-			tp->snd_una++;		/* SYN is acked */
+		/*
+		 * Do window scaling on this connection?
+		 *
+		 * NB: According to RFC1323 the window field
+		 * in a SYN (i.e., a <SYN> or <SYN,ACK>)
+		 * segment itself is never scaled.
+		 */
+		if ((tp->t_flags & TF_WINSCALE) &&
+		    (to.to_flags & TOF_SCALE)) {
+			tp->snd_scale = to.to_wscale;
+		} else if (tp->t_flags & TF_WINSCALE) {
+			tp->t_flags &= ~TF_WINSCALE;
+			tp->rcv_scale = 0;
+		} else if (to.to_flags & TOF_SCALE) {
 			/*
-			 * If there's data, delay ACK; if there's also a FIN
-			 * ACKNOW will be turned on later.
+			 * The remote end doesn't play right with us
+			 * and introduces options we haven't sent.
 			 */
-			if (DELAY_ACK(tp) && tlen != 0)
-				tcp_timer_activate(tp, TT_DELACK,
-				    tcp_delacktime);
-			else
-				tp->t_flags |= TF_ACKNOW;
+			tcplog("Window Scaling Option unexpected, "
+			    "connection aborted");
+			tp->t_error = ENETRESET;	/* XXX: Correct error? */
+			tp = tcp_close(tp);
+			rstreason = BANDLIM_UNLIMITED;
+			goto dropwithreset;
+		}
+
+		/*
+		 * Do timestamps on this connection?
+		 */
+		if ((tp->t_flags & TF_TIMESTAMP) &&
+		    !(to.to_flags & TOF_TS))
+			tp->t_flags &= ~TF_TIMESTAMP;
+		if (!(tp->t_flags & TF_TIMESTAMP) &&
+		    (to.to_flags & TOF_TS)) {
 			/*
-			 * Received <SYN,ACK> in SYN_SENT[*] state.
-			 * Transitions:
-			 *	SYN_SENT  --> ESTABLISHED
-			 *	SYN_SENT* --> FIN_WAIT_1
+			 * The remote end doesn't play right with us
+			 * and introduces options we haven't sent.
 			 */
-			tp->t_starttime = ticks;
-			if (tp->t_flags & TF_NEEDFIN) {
-				tp->t_state = TCPS_FIN_WAIT_1;
-				tp->t_flags &= ~TF_NEEDFIN;
-				thflags &= ~TH_SYN;
-			} else {
-				tp->t_state = TCPS_ESTABLISHED;
-				tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
-			}
-		} else {
+			tcplog("Timestamp unexpected, "
+			    "connection aborted");
+			tp->t_error = ENETRESET;
+			tp = tcp_close(tp);
+			rstreason = BANDLIM_UNLIMITED;
+			goto dropwithreset;
+		}
+
+		/*
+		 * Do SACK on this connection?
+		 */
+		if ((tp->t_flags & TF_SACK_PERMIT) &&
+		    !(to.to_flags & TOF_SACKPERM))
+			tp->t_flags &= ~TF_SACK_PERMIT;
+		if (!(tp->t_flags & TF_SACK_PERMIT) &&
+		    (to.to_flags & TOF_SACKPERM)) {
 			/*
-			 * Received initial SYN in SYN-SENT[*] state =>
-			 * simultaneous open.  If segment contains CC option
-			 * and there is a cached CC, apply TAO test.
-			 * If it succeeds, connection is * half-synchronized.
-			 * Otherwise, do 3-way handshake:
-			 *        SYN-SENT -> SYN-RECEIVED
-			 *        SYN-SENT* -> SYN-RECEIVED*
-			 * If there was no CC option, clear cached CC value.
+			 * The remote end doesn't play right with us
+			 * and introduces options we haven't sent.
 			 */
-			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
-			tcp_timer_activate(tp, TT_REXMT, 0);
-			tp->t_state = TCPS_SYN_RECEIVED;
+			tcplog("SACK Permitted unexpected, "
+			    "connection aborted");
+			tp->t_error = ENETRESET;
+			tp = tcp_close(tp);
+			rstreason = BANDLIM_UNLIMITED;
+			goto dropwithreset;
 		}
 
-		KASSERT(headlocked, ("%s: trimthenstep6: head not locked",
-		    __func__));
-		INP_LOCK_ASSERT(tp->t_inpcb);
-
 		/*
-		 * Advance th->th_seq to correspond to first data byte.
-		 * If data, trim to stay within window,
-		 * dropping FIN if necessary.
+		 * Initialize receive structure.
 		 */
-		th->th_seq++;
-		if (tlen > tp->rcv_wnd) {
-			todrop = tlen - tp->rcv_wnd;
-			m_adj(m, -todrop);
-			tlen = tp->rcv_wnd;
-			thflags &= ~TH_FIN;
-			tcpstat.tcps_rcvpackafterwin++;
-			tcpstat.tcps_rcvbyteafterwin += todrop;
-		}
-		tp->snd_wl1 = th->th_seq - 1;
+		tp->rcv_adv += rwin;	/* XXX */
+		tp->irs = th->th_seq;
 		tp->rcv_up = th->th_seq;
+		tcp_rcvseqinit(tp);
+
 		/*
-		 * Client side of transaction: already sent SYN and data.
-		 * If the remote host used T/TCP to validate the SYN,
-		 * our data will be ACK'd; if so, enter normal data segment
-		 * processing in the middle of step 5, ack processing.
-		 * Otherwise, goto step 6.
+		 * Process SYN and integrate sequence number.
 		 */
-		if (thflags & TH_ACK)
-			goto process_ACK;
+		tp->snd_una++;
+		tp->snd_wu_seq = th->th_seq;
+		tp->snd_wu_ack = th->th_ack;
+		th->th_seq++;		/* SYN is acked */
+		thflags &= ~TH_SYN;	/* SYN is processed */
 
-		goto step6;
+		tp->t_starttime = ticks;
+		tp->t_state = TCPS_ESTABLISHED;
+#ifdef MAC
+		SOCK_LOCK(so);
+		mac_set_socket_peer_from_mbuf(m, so);
+		SOCK_UNLOCK(so);
+#endif
+		soisconnected(so);
+		tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);	/* XXX */
 
+		tcpstat.tcps_connects++;
+		break;
 	/*
-	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
-	 *      do normal processing.
-	 *
-	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
+	 * All other states where a connection was established before.
 	 */
+	case TCPS_ESTABLISHED:
+	case TCPS_CLOSE_WAIT:
+	case TCPS_FIN_WAIT_1:
+	case TCPS_CLOSING:
 	case TCPS_LAST_ACK:
-	case TCPS_CLOSING:
-		break;  /* continue normal processing */
-	}
+	case TCPS_FIN_WAIT_2:
+		/*
+		 * SYN and RST are handled separately below.
+		 */
+		if (thflags & (TH_SYN|TH_RST))
+			break;
+
+		/*
+		 * Segments without ACK are invalid.
+		 */
+		if (!(thflags & TH_ACK)) {
+			tcplog("ACK missing, segment ignored");
+			goto drop;
+		}
+
+		/*
+		 * Don't accept ack'ing of older than previously ack'd data.
+		 * XXXAO: Careful with out-of-order data. Must check seq too.
+		 * reordering and bidirectional data transfer.
+		 * XXXAO: Is this check really useful?
+		 */
+		if (SEQ_LT(th->th_ack, tp->snd_una) &&
+		    SEQ_LT(th->th_seq, tp->rcv_nxt)) {
+			tcplog("Acking old data, segment ignored, "
+			    "sending challenge ACK");
+			goto dropafterack;
+		}
+
+		/*
+		 * Don't accept ack'ing of more than actually sent data.
+		 */
+		if (SEQ_GT(th->th_ack, tp->snd_max)) {
+			tcplog("Acking data not yet sent, segment ignored, "
+			    "sending challenge ACK");
+			tcpstat.tcps_rcvacktoomuch++;
+			goto dropafterack;
+		}
+
+		/*
+		 * Don't accept start of SEQ beyond receive window.
+		 * Allow for a window probe with one byte.
+		 * XXXAO: Window probe statistics.
+		 */
+		if (SEQ_GT(th->th_seq, tp->rcv_nxt + tp->rcv_win)) {
+			tcplog("Data beyond window, segment ignored, "
+			    "sending challenge ACK");
+			goto dropafterack;
+		}
+
+		/*
+		 * Don't accept too old retransmits.
+		 * XXXAO: Use largest window we've ever sent.
+		 * sb_hiwat is pretty much that.  We normally
+		 * don't shrink the receive socket buffer.
+		 */
+		if (SEQ_LT(th->th_seq,
+		    tp->rcv_nxt - so->so_rcv.sb_hiwat - tlen)) {
+			tcplog("Too old retransmit, segment ignored, "
+			    "sending challenge ACK");
+			goto dropafterack;
+		}
 
-	/*
-	 * States other than LISTEN or SYN_SENT.
-	 * First check the RST flag and sequence number since reset segments
-	 * are exempt from the timestamp and connection count tests.  This
-	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
-	 * below which allowed reset segments in half the sequence space
-	 * to fall though and be processed (which gives forged reset
-	 * segments with a random sequence number a 50 percent chance of
-	 * killing a connection).
-	 * Then check timestamp, if present.
-	 * Then check the connection count, if present.
-	 * Then check that at least some bytes of segment are within
-	 * receive window.  If segment begins before rcv_nxt,
-	 * drop leading data (and SYN); if nothing left, just ack.
-	 *
-	 *
-	 * If the RST bit is set, check the sequence number to see
-	 * if this is a valid reset segment.
-	 * RFC 793 page 37:
-	 *   In all states except SYN-SENT, all reset (RST) segments
-	 *   are validated by checking their SEQ-fields.  A reset is
-	 *   valid if its sequence number is in the window.
-	 * Note: this does not take into account delayed ACKs, so
-	 *   we should test against last_ack_sent instead of rcv_nxt.
-	 *   The sequence number in the reset segment is normally an
-	 *   echo of our outgoing acknowlegement numbers, but some hosts
-	 *   send a reset with the sequence number at the rightmost edge
-	 *   of our receive window, and we have to handle this case.
-	 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
-	 *   that brute force RST attacks are possible.  To combat this,
-	 *   we use a much stricter check while in the ESTABLISHED state,
-	 *   only accepting RSTs where the sequence number is equal to
-	 *   last_ack_sent.  In all other states (the states in which a
-	 *   RST is more likely), the more permissive check is used.
-	 * If we have multiple segments in flight, the intial reset
-	 * segment sequence numbers will be to the left of last_ack_sent,
-	 * but they will eventually catch up.
-	 * In any case, it never made sense to trim reset segments to
-	 * fit the receive window since RFC 1122 says:
-	 *   4.2.2.12  RST Segment: RFC-793 Section 3.4
-	 *
-	 *    A TCP SHOULD allow a received RST segment to include data.
-	 *
-	 *    DISCUSSION
-	 *         It has been suggested that a RST segment could contain
-	 *         ASCII text that encoded and explained the cause of the
-	 *         RST.  No standard has yet been established for such
-	 *         data.
-	 *
-	 * If the reset segment passes the sequence number test examine
-	 * the state:
-	 *    SYN_RECEIVED STATE:
-	 *	If passive open, return to LISTEN state.
-	 *	If active open, inform user that connection was refused.
-	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
-	 *	Inform user that connection was reset, and close tcb.
-	 *    CLOSING, LAST_ACK STATES:
-	 *	Close the tcb.
-	 *    TIME_WAIT STATE:
-	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
-	 *      RFC 1337.
-	 */
-	if (thflags & TH_RST) {
-		if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
-		    SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
-			switch (tp->t_state) {
+		/*
+		 * Don't accept missing TS when TS was negotiated and
+		 * vice versa.
+		 */
+		if ((tp->t_flags & TF_TIMESTAMP) && !(to.to_flags & TOF_TS)) {
+			tcplog("Timestamp missing, segment ignored");
+			goto drop;
+		}
+		if (!(tp->t_flags & TF_TIMESTAMP) && (to.to_flags & TOF_TS)) {
+			tcplog("Timestamp unexpected, segment ignored");
+			goto drop;
+		}
 
-			case TCPS_SYN_RECEIVED:
-				so->so_error = ECONNREFUSED;
-				goto close;
+		/*
+		 * Don't accept remote ts older than already seen, or
+		 * reflected ts newer than what we sent last.
+		 *
+		 * TODO-AO:
+		 * PAWS
+		 */
+		if ((to.to_flags & TOF_TS) &&
+		    ticks - tp->t_rcvtime < PAWS &&
+		    (!TSTMP_LT(to.to_tsval, tp->snd_tsecr)) ||
+		     TSTMP_GT(to.to_tsecr, tp->snd_tsval))) {
+			tcplog("Timestamp too old or new, segment ignored");
+			goto drop;
+		}
 
-			case TCPS_ESTABLISHED:
-				if (tcp_insecure_rst == 0 &&
-				    !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
-				    SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
-				    !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
-				    SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
-					tcpstat.tcps_badrst++;
-					goto drop;
-				}
-				/* FALLTHROUGH */
-			case TCPS_FIN_WAIT_1:
-			case TCPS_FIN_WAIT_2:
-			case TCPS_CLOSE_WAIT:
-				so->so_error = ECONNRESET;
-			close:
-				tp->t_state = TCPS_CLOSED;
-				tcpstat.tcps_drops++;
-				KASSERT(headlocked, ("%s: trimthenstep6: "
-				    "tcp_close: head not locked", __func__));
-				tp = tcp_close(tp);
-				break;
+		/*
+		 * We may receive a retransmit before we sent the delayed
+		 * ACK for the segment in question.  This should normally
+		 * not happen and indicates either a timing problem at the
+		 * sender or we delay the ACK too much.
+		 */
+		if (SEQ_GT(th->th_seq, tp->snd_lastack) &&
+		    SEQ_LT(th->th_seq, tp->rcv_nxt) {
+			tcplog("Received retransmit before we sent delayed ACK, no action");
+		}
 
-			case TCPS_CLOSING:
-			case TCPS_LAST_ACK:
-				KASSERT(headlocked, ("%s: trimthenstep6: "
-				    "tcp_close.2: head not locked", __func__));
-				tp = tcp_close(tp);
-				break;
-			}
+		/*
+		 * Don't accept SACK when it wasn't negotiated at
+		 * connection setup time.
+		 */
+		if ((to.to_flags & TOF_SACK) &&
+		    !(tp->t_flags & TF_SACK_PERM)) {
+			tcplog("SACK unexpected, segment ignored");
+			goto drop;
 		}
+
+		/* XXX: stats */
+		break;
+
+	/*
+	 * Sanity check.
+	 */
+	default:
+		KASSERT(1 == 0, ("%s: Invalid TCP FSM state", __func__));
 		goto drop;
 	}
 
+#ifdef TCP_SIGNATURE
 	/*
-	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
-	 * and it's less than ts_recent, drop it.
+	 * TCP-MD5 is done after the general acceptability checks
+	 * to run only on most likely valid segments through the
+	 * expensive MD5 hash computation.
+	 * In SYN_RECEIVED case syncache verified the signature
+	 * already.
+	 */
+	if ((tp->t_flags & TF_SIGNATURE) && notalreadydone) {
+		/* Copy signature and compare. */
+		tcp_signature_compute(m, sizeof(struct ip), len, optlen,
+		    (u_char *)(th + 1) + sigoff, IPSEC_DIR_INBOUND);
+		if (!bcomp(orig, computed)) {
+			tcplog("MD5 signature does not match, "
+			    "segment ignored");
+			goto drop;
+		}
+	}
+#endif
+	/*
+	 * Fast path for ACK-only segments.
 	 */
-	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
-	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
+	if (tlen == 0 && (thflags & (TH_ACK|TH_RST|TH_SYN)) == TH_ACK)
+		goto doack;
 
-		/* Check to see if ts_recent is over 24 days old.  */
-		if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
+	/*
+	 * Handle SYN and RST flags for existing connections.
+	 *
+	 * NB: The SYN_SENT case has removed the SYN bit from thflags
+	 * if the segment was accepted.
+	 */
+	if (thflags & TH_RST) {
+		/*
+		 * Any RST after TCPS_SYN_SENT must NOT carry the ACK flag.
+		 * RFC 793 page 65, section SEGMENT ARRIVES.
+		 */
+		if (tp->t_state > TCPS_SYN_SENT &&
+		    (thflags & TH_ACK)) {
+			tcplog("RST with ACK invalid, segment ignored");
+			tcpstat.tcps_badrst++;
+			goto drop;
+		}
+		/*
+		 * Check if the sequence number is NOT acceptable to us.
+		 */
+		if (tp->t_state == TCPS_SYN_SENT) {
 			/*
-			 * Invalidate ts_recent.  If this segment updates
-			 * ts_recent, the age will be reset later and ts_recent
-			 * will get a valid value.  If it does not, setting
-			 * ts_recent to zero will at least satisfy the
-			 * requirement that zero be placed in the timestamp
-			 * echo reply when ts_recent isn't valid.  The
-			 * age isn't reset until we get a valid ts_recent
-			 * because we don't want out-of-order segments to be
-			 * dropped when ts_recent is old.
+			 * In TCPS_SYN_SENT the RST MUST carry the ACK flag.
 			 */
-			tp->ts_recent = 0;
-		} else {
-			tcpstat.tcps_rcvduppack++;
-			tcpstat.tcps_rcvdupbyte += tlen;
-			tcpstat.tcps_pawsdrop++;
-			if (tlen)
-				goto dropafterack;
+			if (!(thflags & TH_ACK)) {

>>> TRUNCATED FOR MAIL (1000 lines) <<<



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200802031048.m13AmbeD066413>