Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 20 Jul 2002 19:36:01 -0700 (PDT)
From:      Matthew Dillon <dillon@apollo.backplane.com>
To:        freebsd-hackers@FreeBSD.ORG, freebsd-net@FreeBSD.ORG
Subject:   Bandwidth delay product limiting for TCP - update.
Message-ID:  <200207210236.g6L2a11a000270@apollo.backplane.com>

next in thread | raw e-mail | index | archive | help
    Ok, I've done a bunch more work.  I've simplified the code enormously.

    Basically what it does now is calculate the window size based on the
    current bandwidth and the best RTT it's ever seen.  This will handle:

    (1) Limiting packet queueing to just the amount required to fill the
	pipe.

    (2) Reducing the window if the network load increases.

    (3) Partial ability to handle changes in latency which do not 
	affect the bandwidth carrying capacity of the network.

    #3 is the most difficult problem to deal with because calculating the
    window based on SRTT is a positive-feedback loop.  i.e. a larger window
    results in a larger SRTT which results in a larger window.  So I
    can't just use SRTT.  Instead I use ((SRTT + RTTBEST) / 2) for the RTT
    portion of the calculation.  This has the effect of allowing the 
    algorithm to compensate for the increased latencies but providing 
    negative bias that increases as the window increases, stabilizing the
    calculation of the window.

    I think this may be good enough to commit but I would really like as
    many people as possible to test it in real life situations (keeping
    in mind that it only affects the transmit side).  I believe I have
    solved just about all the problems that I had with previous versions.
    And the damn thing is actually clean(!).

						-Matt

Index: tcp_input.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.165
diff -u -r1.165 tcp_input.c
--- tcp_input.c	19 Jul 2002 18:27:39 -0000	1.165
+++ tcp_input.c	21 Jul 2002 02:32:57 -0000
@@ -1008,6 +1008,7 @@
 				else if (tp->t_rtttime &&
 					    SEQ_GT(th->th_ack, tp->t_rtseq))
 					tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+				tcp_xmit_bandwidth_limit(tp, th->th_ack);
 				acked = th->th_ack - tp->snd_una;
 				tcpstat.tcps_rcvackpack++;
 				tcpstat.tcps_rcvackbyte += acked;
@@ -1805,6 +1806,7 @@
 			tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
 		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+		tcp_xmit_bandwidth_limit(tp, th->th_ack);
 
 		/*
 		 * If all outstanding data is acked, stop retransmit
@@ -2431,6 +2433,8 @@
 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		if ((tp->t_rttvar += delta) <= 0)
 			tp->t_rttvar = 1;
+		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+		    tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt.
@@ -2439,6 +2443,7 @@
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	tp->t_rtttime = 0;
 	tp->t_rxtshift = 0;
@@ -2578,6 +2583,7 @@
 		if (rt->rt_rmx.rmx_locks & RTV_RTT)
 			tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
 		tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
+		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
 		tcpstat.tcps_usedrtt++;
 		if (rt->rt_rmx.rmx_rttvar) {
 			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
Index: tcp_output.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_output.c,v
retrieving revision 1.65
diff -u -r1.65 tcp_output.c
--- tcp_output.c	23 Jun 2002 21:25:36 -0000	1.65
+++ tcp_output.c	21 Jul 2002 02:32:57 -0000
@@ -164,6 +164,7 @@
 	sendalot = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	win = min(tp->snd_wnd, tp->snd_cwnd);
+	win = min(win, tp->snd_bwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
@@ -773,7 +774,7 @@
 			tp->snd_max = tp->snd_nxt;
 			/*
 			 * Time this transmission if not a retransmission and
-			 * not currently timing anything.
+			 * not currently timing anything. 
 			 */
 			if (tp->t_rtttime == 0) {
 				tp->t_rtttime = ticks;
Index: tcp_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.137
diff -u -r1.137 tcp_subr.c
--- tcp_subr.c	18 Jul 2002 19:06:12 -0000	1.137
+++ tcp_subr.c	21 Jul 2002 02:32:57 -0000
@@ -144,6 +144,32 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
     &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
 
+static int	tcp_inflight_enable = 0;	/* 0 switches snd_bwnd limiting off */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
+    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
+
+static int	tcp_inflight_debug = 1;	/* >0: max debug printfs per second */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
+    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
+
+static int	tcp_inflight_min = 1024;	/* floor applied to snd_bwnd */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
+    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
+
+static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; /* ceiling */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
+    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
+
+#if 0	/* not referenced by tcp_xmit_bandwidth_limit() */
+static int	tcp_inflight_attack = 20;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_attack, CTLFLAG_RW,
+    &tcp_inflight_attack, 0, "TCP inflight compensation attack rate (%)");
+
+static int	tcp_inflight_shift = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_shift, CTLFLAG_RW,
+    &tcp_inflight_shift, 0, "TCP inflight compensation shift (+/-100) ");
+#endif
+
 static void	tcp_cleartaocache(void);
 static struct inpcb *tcp_notify(struct inpcb *, int);
 
@@ -547,8 +573,10 @@
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
+	tp->t_bw_rtttime = ticks;
         /*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
@@ -1509,3 +1537,102 @@
 tcp_cleartaocache()
 {
 }
+
+/*
+ * tcp_xmit_bandwidth_limit() - limit the amount of in-flight data to
+ * roughly the bandwidth-delay product of the connection.
+ *
+ * tp is the connection being updated and ack_seq is the sequence number
+ * just acknowledged by the peer.  Each call that is at least one tick
+ * after the previous sample measures the bytes acked per second since
+ * that sample and folds it into tp->snd_bandwidth as a 15/16 decaying
+ * average.  The result is written to tp->snd_bwnd.
+ *
+ * The problem with calculating the product is that the window we pick
+ * modifies both the perceived bandwidth and the srtt: sizing the window
+ * from srtt alone is a positive-feedback loop, since a larger window
+ * produces a larger srtt which produces a larger window.  To damp this
+ * the delay term is the average of t_srtt and t_rttbest rather than
+ * t_srtt itself.
+ *
+ * When net.inet.tcp.inflight_enable is 0 the limit is switched off.
+ */
+void
+tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
+{
+	u_long bw;
+	u_long bwnd;
+	int save_ticks;
+	int n;			/* ticks elapsed since the last sample */
+
+	/*
+	 * If inflight_enable is disabled in the middle of a tcp connection,
+	 * make sure snd_bwnd is effectively disabled.
+	 */
+	if (tcp_inflight_enable == 0) {
+		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+		tp->snd_bandwidth = 0;
+		return;
+	}
+
+	/*
+	 * Figure out the bandwidth.  Due to the tick granularity this
+	 * is a very rough number and it MUST be averaged over a fairly
+	 * long period of time.
+	 */
+	save_ticks = ticks;
+	n = save_ticks - tp->t_bw_rtttime;
+	if ((u_int)n < 1)
+		return;
+
+	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / n;
+	tp->t_bw_rtttime = save_ticks;
+	tp->t_bw_rtseq = ack_seq;
+	if (tp->t_bw_rtttime == 0)	/* XXX tested after update above */
+		return;
+	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
+	tp->snd_bandwidth = bw;
+
+	/*
+	 * Calculate the semi-static bandwidth delay product, plus two maximal
+	 * segments.  The additional slop puts us squarely in the sweet
+	 * spot and also handles the bandwidth run-up case.  Without the
+	 * slop we could be locking ourselves into a lower bandwidth.
+	 *
+	 * Situations Handled:
+	 *	(1) prevents over-queueing of packets on LANs, especially
+	 *	    high speed LANs, allowing larger TCP buffers to be
+	 *	    specified.
+	 *
+	 *	(2) able to handle increased network loads (bandwidth drops
+	 *	    so bwnd drops).
+	 *
+	 *	(3) partially tracks latency changes that do not affect
+	 *	    bandwidth, by averaging srtt with the best rtt seen.
+	 */
+#define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
+	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg;
+#undef USERTT
+
+	if (tcp_inflight_debug > 0) {
+		static int ltime;	/* shared by all connections */
+		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
+			ltime = ticks;
+			printf("%p bw %lu rttbest %u srtt %d bwnd %lu\n",
+			    tp,
+			    bw,
+			    tp->t_rttbest,
+			    tp->t_srtt,
+			    bwnd
+			);
+		}
+	}
+	if ((long)bwnd < tcp_inflight_min)
+		bwnd = tcp_inflight_min;
+	if (bwnd > tcp_inflight_max)
+		bwnd = tcp_inflight_max;
+	if ((long)bwnd < tp->t_maxseg * 2)
+		bwnd = tp->t_maxseg * 2;
+	tp->snd_bwnd = bwnd;
+}
+
Index: tcp_usrreq.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.76
diff -u -r1.76 tcp_usrreq.c
--- tcp_usrreq.c	13 Jun 2002 23:14:58 -0000	1.76
+++ tcp_usrreq.c	21 Jul 2002 02:32:58 -0000
@@ -875,6 +875,7 @@
 	tp->t_state = TCPS_SYN_SENT;
 	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
 	tp->iss = tcp_new_isn(tp);
+	tp->t_bw_rtseq = tp->iss;
 	tcp_sendseqinit(tp);
 
 	/*
@@ -961,6 +962,7 @@
 	tp->t_state = TCPS_SYN_SENT;
 	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
 	tp->iss = tcp_new_isn(tp);
+	tp->t_bw_rtseq = tp->iss;
 	tcp_sendseqinit(tp);
 
 	/*
Index: tcp_var.h
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_var.h,v
retrieving revision 1.82
diff -u -r1.82 tcp_var.h
--- tcp_var.h	19 Jul 2002 18:27:39 -0000	1.82
+++ tcp_var.h	21 Jul 2002 02:32:58 -0000
@@ -124,10 +124,12 @@
 
 	u_long	snd_wnd;		/* send window */
 	u_long	snd_cwnd;		/* congestion-controlled window */
+	u_long	snd_bwnd;		/* bandwidth-controlled window */
 	u_long	snd_ssthresh;		/* snd_cwnd size threshold for
 					 * for slow start exponential to
 					 * linear switch
 					 */
+	u_long	snd_bandwidth;		/* calculated bandwidth or 0 */
 	tcp_seq	snd_recover;		/* for use in fast recovery */
 
 	u_int	t_maxopd;		/* mss plus options */
@@ -137,6 +139,9 @@
 	int	t_rtttime;		/* round trip time */
 	tcp_seq	t_rtseq;		/* sequence number being timed */
 
+	int	t_bw_rtttime;		/* used for bandwidth calculation */
+	tcp_seq	t_bw_rtseq;		/* used for bandwidth calculation */
+
 	int	t_rxtcur;		/* current retransmit value (ticks) */
 	u_int	t_maxseg;		/* maximum segment size */
 	int	t_srtt;			/* smoothed round-trip time */
@@ -144,6 +149,7 @@
 
 	int	t_rxtshift;		/* log(2) of rexmt exp. backoff */
 	u_int	t_rttmin;		/* minimum rtt allowed */
+	u_int	t_rttbest;		/* best rtt we've seen */
 	u_long	t_rttupdated;		/* number of times rtt sampled */
 	u_long	max_sndwnd;		/* largest window peer has offered */
 
@@ -473,6 +479,7 @@
 struct tcpcb *
 	 tcp_timers(struct tcpcb *, int);
 void	 tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int);
+void	 tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
 void	 syncache_init(void);
 void	 syncache_unreach(struct in_conninfo *, struct tcphdr *);
 int	 syncache_expand(struct in_conninfo *, struct tcphdr *,

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-hackers" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200207210236.g6L2a11a000270>