Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 16 Aug 2002 19:33:42 -0700 (PDT)
From:      Matthew Dillon <dillon@apollo.backplane.com>
To:        freebsd-hackers@FreeBSD.ORG, freebsd-net@FreeBSD.ORG
Subject:   Commit schedule for bandwidth delay product pipeline limiting for TCP
Message-ID:  <200208170233.g7H2XgqG047569@apollo.backplane.com>
References:  <200207200103.g6K135Ap081155@apollo.backplane.com> <3D3AB5AF.F2F637C3@pipeline.ch> <200207211747.g6LHlKHv003686@apollo.backplane.com>

next in thread | previous in thread | raw e-mail | index | archive | help
    Well, I'm back from vacation.  I see nobody in the general group has
    commented much on my bandwidth delay product code.  A couple of people
    have corresponded with me in email and generally the response is 
    positive.

    Since this code must be enabled via a sysctl I feel it is safe to
    commit it to -current.  I also intend to MFC it to -stable prior
    to the freeze (MFC after: 1 week).  I believe that we can eventually
    enable the sysctl by default.

    I intend to commit this code on Saturday (tomorrow).  I've included the
    patch set below for those who need a reminder of what this is.  Generally
    speaking this code is very similar to, though not intended to duplicate,
    the algorithm described by the TCP Vegas paper.  I will also commit
    manual page updates to tcp(4) and tuning(7) to describe the effects
    of the sysctls.

						-Matt

Index: netinet/tcp_input.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.169
diff -u -r1.169 tcp_input.c
--- netinet/tcp_input.c	15 Aug 2002 18:51:26 -0000	1.169
+++ netinet/tcp_input.c	17 Aug 2002 02:24:01 -0000
@@ -1018,6 +1018,7 @@
 				else if (tp->t_rtttime &&
 					    SEQ_GT(th->th_ack, tp->t_rtseq))
 					tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+				tcp_xmit_bandwidth_limit(tp, th->th_ack);
 				acked = th->th_ack - tp->snd_una;
 				tcpstat.tcps_rcvackpack++;
 				tcpstat.tcps_rcvackbyte += acked;
@@ -1819,6 +1820,7 @@
 			tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
 		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+		tcp_xmit_bandwidth_limit(tp, th->th_ack);
 
 		/*
 		 * If all outstanding data is acked, stop retransmit
@@ -2445,6 +2447,8 @@
 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		if ((tp->t_rttvar += delta) <= 0)
 			tp->t_rttvar = 1;
+		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+		    tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt.
@@ -2453,6 +2457,7 @@
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	tp->t_rtttime = 0;
 	tp->t_rxtshift = 0;
@@ -2592,6 +2597,7 @@
 		if (rt->rt_rmx.rmx_locks & RTV_RTT)
 			tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
 		tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
+		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
 		tcpstat.tcps_usedrtt++;
 		if (rt->rt_rmx.rmx_rttvar) {
 			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
Index: netinet/tcp_output.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_output.c,v
retrieving revision 1.67
diff -u -r1.67 tcp_output.c
--- netinet/tcp_output.c	12 Aug 2002 03:22:46 -0000	1.67
+++ netinet/tcp_output.c	17 Aug 2002 02:24:01 -0000
@@ -168,6 +168,7 @@
 	sendalot = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	win = min(tp->snd_wnd, tp->snd_cwnd);
+	win = min(win, tp->snd_bwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
@@ -780,7 +781,7 @@
 			tp->snd_max = tp->snd_nxt;
 			/*
 			 * Time this transmission if not a retransmission and
-			 * not currently timing anything.
+			 * not currently timing anything. 
 			 */
 			if (tp->t_rtttime == 0) {
 				tp->t_rtttime = ticks;
Index: netinet/tcp_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.140
diff -u -r1.140 tcp_subr.c
--- netinet/tcp_subr.c	1 Aug 2002 03:54:43 -0000	1.140
+++ netinet/tcp_subr.c	17 Aug 2002 02:24:01 -0000
@@ -146,6 +146,32 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
     &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
 
+static int	tcp_inflight_enable = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
+    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
+
+static int	tcp_inflight_debug = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
+    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
+
+static int	tcp_inflight_min = 1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
+    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
+
+static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
+    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
+
+#if 0
+static int	tcp_inflight_attack = 20;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_attack, CTLFLAG_RW,
+    &tcp_inflight_attack, 0, "TCP inflight compensation attack rate (%)");
+
+static int	tcp_inflight_shift = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_shift, CTLFLAG_RW,
+    &tcp_inflight_shift, 0, "TCP inflight compensation shift (+/-100) ");
+#endif
+
 static void	tcp_cleartaocache(void);
 static struct inpcb *tcp_notify(struct inpcb *, int);
 
@@ -566,8 +592,10 @@
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
+	tp->t_bw_rtttime = ticks;
         /*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
@@ -1531,3 +1559,101 @@
 tcp_cleartaocache()
 {
 }
+
+/*
+ * This code attempts to calculate the bandwidth-delay product.
+ * The problem with calculating this product is that our manipulation
+ * of the congestion window modifies both the perceived bandwidth
+ * and the srtt.  It is possible to get a fairly stable maximal
+ * bandwidth by increasing the congestion window.  The bandwidth
+ * calculation will be fairly good even if bwnd is set very high.
+ * However, figuring out the minimal srtt is far more difficult
+ * because we do not want the TCP stream to suffer greatly and therefore
+ * cannot reduce the congestion window to something very small.
+ *
+ * What we do is first increase the congestion window to try to
+ * obtain a maximal (or at least a 'larger') bandwidth, then decrease
+ * the congestion window to try to obtain a minimal (or at least a 'smaller')
+ * rtt.  We also have to detect the case where BWND is too high and
+ * neither increasing nor decreasing it has the desired effect on the
+ * calculation.  By detecting this special case we can stabilize the
+ * algorithm and recalculate bwnd within a reasonable period of time.
+ */
+void
+tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
+{
+	u_long bw;
+	u_long bwnd;
+	int save_ticks;
+
+	/*
+	 * If inflight_enable is disabled in the middle of a tcp connection,
+	 * make sure snd_bwnd is effectively disabled.
+	 */
+	if (tcp_inflight_enable == 0) {
+		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+		tp->snd_bandwidth = 0;
+		return;
+	}
+
+	/*
+	 * Figure out the bandwidth.  Due to the tick granularity this
+	 * is a very rough number and it MUST be averaged over a fairly
+	 * long period of time.
+	 */
+	save_ticks = ticks;
+	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
+		return;
+
+	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / 
+	    (save_ticks - tp->t_bw_rtttime);
+	tp->t_bw_rtttime = save_ticks;
+	tp->t_bw_rtseq = ack_seq;
+	if (tp->t_bw_rtttime == 0)
+		return;
+	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
+
+	tp->snd_bandwidth = bw;
+
+	/*
+	 * Calculate the semi-static bandwidth delay product, plus two maximal
+	 * segments.  The additional slop puts us squarely in the sweet
+	 * spot and also handles the bandwidth run-up case.  Without the
+	 * slop we could be locking ourselves into a lower bandwidth.
+	 *
+	 * Situations Handled:
+	 *	(1) prevents over-queueing of packets on LANs, especially
+	 *	    high speed LANs, allowing larger TCP buffers to be
+	 *	    specified.
+	 *
+	 *	(2) able to handle increased network loads (bandwidth drops
+	 *	    so bwnd drops).
+	 *
+	 *	(3) Randomly changes the window size in order to force
+	 *	    bandwidth balancing between connections.
+	 */
+#define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
+	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg;
+
+	if (tcp_inflight_debug > 0) {
+		static int ltime;
+		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
+			ltime = ticks;
+			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
+			    tp,
+			    bw,
+			    tp->t_rttbest,
+			    tp->t_srtt,
+			    bwnd
+			);
+		}
+	}
+	if ((long)bwnd < tcp_inflight_min)
+		bwnd = tcp_inflight_min;
+	if (bwnd > tcp_inflight_max)
+		bwnd = tcp_inflight_max;
+	if ((long)bwnd < tp->t_maxseg * 2)
+		bwnd = tp->t_maxseg * 2;
+	tp->snd_bwnd = bwnd;
+}
+
Index: netinet/tcp_usrreq.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.79
diff -u -r1.79 tcp_usrreq.c
--- netinet/tcp_usrreq.c	29 Jul 2002 09:01:39 -0000	1.79
+++ netinet/tcp_usrreq.c	17 Aug 2002 02:24:01 -0000
@@ -875,6 +875,7 @@
 	tp->t_state = TCPS_SYN_SENT;
 	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
 	tp->iss = tcp_new_isn(tp);
+	tp->t_bw_rtseq = tp->iss;
 	tcp_sendseqinit(tp);
 
 	/*
@@ -961,6 +962,7 @@
 	tp->t_state = TCPS_SYN_SENT;
 	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
 	tp->iss = tcp_new_isn(tp);
+	tp->t_bw_rtseq = tp->iss;
 	tcp_sendseqinit(tp);
 
 	/*
Index: netinet/tcp_var.h
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_var.h,v
retrieving revision 1.82
diff -u -r1.82 tcp_var.h
--- netinet/tcp_var.h	19 Jul 2002 18:27:39 -0000	1.82
+++ netinet/tcp_var.h	21 Jul 2002 02:26:36 -0000
@@ -124,10 +124,12 @@
 
 	u_long	snd_wnd;		/* send window */
 	u_long	snd_cwnd;		/* congestion-controlled window */
+	u_long	snd_bwnd;		/* bandwidth-controlled window */
 	u_long	snd_ssthresh;		/* snd_cwnd size threshold for
 					 * for slow start exponential to
 					 * linear switch
 					 */
+	u_long	snd_bandwidth;		/* calculated bandwidth or 0 */
 	tcp_seq	snd_recover;		/* for use in fast recovery */
 
 	u_int	t_maxopd;		/* mss plus options */
@@ -137,6 +139,9 @@
 	int	t_rtttime;		/* round trip time */
 	tcp_seq	t_rtseq;		/* sequence number being timed */
 
+	int	t_bw_rtttime;		/* used for bandwidth calculation */
+	tcp_seq	t_bw_rtseq;		/* used for bandwidth calculation */
+
 	int	t_rxtcur;		/* current retransmit value (ticks) */
 	u_int	t_maxseg;		/* maximum segment size */
 	int	t_srtt;			/* smoothed round-trip time */
@@ -144,6 +149,7 @@
 
 	int	t_rxtshift;		/* log(2) of rexmt exp. backoff */
 	u_int	t_rttmin;		/* minimum rtt allowed */
+	u_int	t_rttbest;		/* best rtt we've seen */
 	u_long	t_rttupdated;		/* number of times rtt sampled */
 	u_long	max_sndwnd;		/* largest window peer has offered */
 
@@ -473,6 +479,7 @@
 struct tcpcb *
 	 tcp_timers(struct tcpcb *, int);
 void	 tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int);
+void	 tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
 void	 syncache_init(void);
 void	 syncache_unreach(struct in_conninfo *, struct tcphdr *);
 int	 syncache_expand(struct in_conninfo *, struct tcphdr *,

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-hackers" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200208170233.g7H2XgqG047569>