Date:      Sat, 28 May 2011 04:40:35 +0000 (UTC)
From:      Lawrence Stewart <lstewart@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org
Subject:   svn commit: r222401 - in stable/8/sys: conf netinet netinet/cc sys
Message-ID:  <201105280440.p4S4eZX6068131@svn.freebsd.org>

Author: lstewart
Date: Sat May 28 04:40:35 2011
New Revision: 222401
URL: http://svn.freebsd.org/changeset/base/222401

Log:
  MFC r215166,215377,215391,215392,215393,215395,216101,216103,216105,216107,
      216749,216760,217748,218167:
  
  - Add a KPI and supporting infrastructure to allow modular congestion control
    algorithms to be used in the net stack. Algorithms can maintain per-connection
    state if required, and connections maintain their own algorithm pointer, which
    allows different connections to concurrently use different algorithms. The
    TCP_CONGESTION socket option can be used with getsockopt()/setsockopt() to
    query or change, respectively, the congestion control algorithm in use
    from within an application at runtime.
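  
    As a rough userland sketch (assuming the TCP_CONGESTION and
    TCP_CA_NAME_MAX definitions exported via <netinet/tcp.h>; names and
    error handling are illustrative only), querying and switching the
    algorithm on a connected socket looks roughly like this:
  
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <stdio.h>
	#include <string.h>
  
	/* Print the current algorithm, then request "newreno" explicitly. */
	static int
	switch_cc(int s)
	{
		char cc[TCP_CA_NAME_MAX];
		socklen_t len = sizeof(cc);
  
		if (getsockopt(s, IPPROTO_TCP, TCP_CONGESTION, cc, &len) != 0)
			return (-1);
		printf("current algorithm: %s\n", cc);
  
		strlcpy(cc, "newreno", sizeof(cc));
		/* Fails if no registered algorithm matches the given name. */
		return (setsockopt(s, IPPROTO_TCP, TCP_CONGESTION, cc,
		    strlen(cc)));
	}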
  
  - Integrate the framework with the TCP stack in the least intrusive manner
    possible. Care was also taken to develop the framework in a way that should
    allow integration with other congestion-aware transport protocols (e.g. SCTP)
    in the future. The hope is that we will one day be able to share a single set
    of congestion control algorithm modules between all congestion-aware
    transport protocols.
  
  - Introduce a new congestion recovery (TF_CONGRECOVERY) state into the TCP stack
    and use it to decouple recovery from a congestion event from recovery from
    packet loss (TF_FASTRECOVERY) a la RFC 2581. ECN and delay-based congestion
    control protocols don't generally need to recover from packet loss, and
    therefore need a different way to note a congestion recovery episode within
    the stack.
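  
    The new state is expressed as flag macros over the tcpcb's t_flags. The
    tcp_var.h hunk does not appear in the (truncated) diff below; reconstructed
    from the call sites that are visible, the helpers presumably compose
    roughly as follows (sketch only):
  
	#define	IN_FASTRECOVERY(t_flags)	((t_flags) & TF_FASTRECOVERY)
	#define	ENTER_FASTRECOVERY(t_flags)	((t_flags) |= TF_FASTRECOVERY)
	#define	EXIT_FASTRECOVERY(t_flags)	((t_flags) &= ~TF_FASTRECOVERY)
  
	#define	IN_CONGRECOVERY(t_flags)	((t_flags) & TF_CONGRECOVERY)
	#define	ENTER_CONGRECOVERY(t_flags)	((t_flags) |= TF_CONGRECOVERY)
	#define	EXIT_CONGRECOVERY(t_flags)	((t_flags) &= ~TF_CONGRECOVERY)
  
	/* "In recovery" now means either kind; exiting clears both. */
	#define	IN_RECOVERY(t_flags)	\
	    ((t_flags) & (TF_CONGRECOVERY | TF_FASTRECOVERY))
	#define	EXIT_RECOVERY(t_flags)	\
	    ((t_flags) &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY))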
  
  - Remove the net.inet.tcp.newreno sysctl, which simplifies some portions of code
    and ensures the stack always uses the appropriate mechanisms for recovering
    from packet loss during a congestion recovery episode.
  
  - Extract the NewReno congestion control algorithm from the TCP stack and
    massage it into module form. NewReno is always built into the kernel and will
    remain the default algorithm for the foreseeable future. Implementations of
    additional algorithms will become available in the near future.
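  
    New algorithms follow the same module pattern NewReno now uses (see the
    cc_newreno.c hunk below). A hypothetical out-of-tree module reduces to
    roughly the following sketch (header list trimmed; hook bodies omitted):
  
	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/module.h>
	#include <sys/socket.h>
	#include <sys/socketvar.h>
  
	#include <netinet/cc.h>
	#include <netinet/tcp_var.h>
  
	#include <netinet/cc/cc_module.h>
  
	static void
	example_ack_received(struct cc_var *ccv, uint16_t type)
	{
		/* Open cwnd here in response to CC_ACK/CC_DUPACK signals. */
	}
  
	struct cc_algo example_cc_algo = {
		.name = "example",
		.ack_received = example_ack_received,
		/*
		 * Unset hooks (cong_signal, post_recovery, ...) are optional
		 * and simply skipped by the cc_* wrappers in tcp_input.c.
		 */
	};
  
	DECLARE_CC_MODULE(example, &example_cc_algo);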
  
  - Tweak the MFCed code to preserve the ABI of the 8-STABLE branch with respect
    to "struct tcpcb" by consuming some of the padding within the struct.
  
  - Bump __FreeBSD_version to 802504.
  
  In collaboration with:	David Hayes <dahayes at swin edu au> and
  				Grenville Armitage <garmitage at swin edu au>
  Sponsored by:	Cisco URP, FreeBSD Foundation
  Reviewed by:	rpaulo (r215166), bz (r215391,215395,216749,217748)
  Tested by:	David Hayes (r215166), trociny (r215377,215391,215392,215395)

Added:
  stable/8/sys/netinet/cc/
     - copied from r215166, head/sys/netinet/cc/
  stable/8/sys/netinet/cc.h
     - copied, changed from r215166, head/sys/netinet/cc.h
Modified:
  stable/8/sys/conf/files
  stable/8/sys/netinet/cc/cc.c
  stable/8/sys/netinet/cc/cc_newreno.c
  stable/8/sys/netinet/tcp_input.c
  stable/8/sys/netinet/tcp_output.c
  stable/8/sys/netinet/tcp_sack.c
  stable/8/sys/netinet/tcp_subr.c
  stable/8/sys/netinet/tcp_timer.c
  stable/8/sys/netinet/tcp_usrreq.c
  stable/8/sys/netinet/tcp_var.h
  stable/8/sys/sys/param.h
Directory Properties:
  stable/8/sys/   (props changed)
  stable/8/sys/amd64/include/xen/   (props changed)
  stable/8/sys/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/contrib/dev/acpica/   (props changed)
  stable/8/sys/contrib/pf/   (props changed)

Modified: stable/8/sys/conf/files
==============================================================================
--- stable/8/sys/conf/files	Sat May 28 04:10:44 2011	(r222400)
+++ stable/8/sys/conf/files	Sat May 28 04:40:35 2011	(r222401)
@@ -2568,6 +2568,8 @@ netinet/ip_mroute.c		optional mrouting i
 netinet/ip_options.c		optional inet
 netinet/ip_output.c		optional inet
 netinet/raw_ip.c		optional inet
+netinet/cc/cc.c			optional inet
+netinet/cc/cc_newreno.c		optional inet
 netinet/sctp_asconf.c		optional inet sctp
 netinet/sctp_auth.c		optional inet sctp
 netinet/sctp_bsd_addr.c		optional inet sctp

Copied and modified: stable/8/sys/netinet/cc.h (from r215166, head/sys/netinet/cc.h)
==============================================================================
--- head/sys/netinet/cc.h	Fri Nov 12 06:41:55 2010	(r215166, copy source)
+++ stable/8/sys/netinet/cc.h	Sat May 28 04:40:35 2011	(r222401)
@@ -58,11 +58,14 @@ extern STAILQ_HEAD(cc_head, cc_algo) cc_
 extern const int tcprexmtthresh;
 extern struct cc_algo newreno_cc_algo;
 
+/* Per-netstack bits. */
+VNET_DECLARE(struct cc_algo *, default_cc_ptr);
+#define	V_default_cc_ptr VNET(default_cc_ptr)
+
 /* Define the new net.inet.tcp.cc sysctl tree. */
 SYSCTL_DECL(_net_inet_tcp_cc);
 
 /* CC housekeeping functions. */
-void	cc_init(void);
 int	cc_register_algo(struct cc_algo *add_cc);
 int	cc_deregister_algo(struct cc_algo *remove_cc);
 
@@ -98,10 +101,12 @@ struct cc_var {
  * bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their own
  * congestion signal types.
  */
-#define	CC_ECN		0x000001/* ECN marked packet received. */
-#define	CC_RTO		0x000002/* RTO fired. */
-#define	CC_RTO_ERR	0x000004/* RTO fired in error. */
-#define	CC_NDUPACK	0x000008/* Threshold of dupack's reached. */
+#define	CC_ECN		0x00000001	/* ECN marked packet received. */
+#define	CC_RTO		0x00000002	/* RTO fired. */
+#define	CC_RTO_ERR	0x00000004	/* RTO fired in error. */
+#define	CC_NDUPACK	0x00000008	/* Threshold of dupack's reached. */
+
+#define	CC_SIGPRIVMASK	0xFF000000	/* Mask to check if sig is private. */
 
 /*
  * Structure to hold data and function pointers that together represent a
@@ -147,7 +152,7 @@ struct cc_algo {
 #define	CC_DATA(tp)	((tp)->ccv->cc_data)
 
 /* Macro to obtain the system default CC algo's struct ptr. */
-#define	CC_DEFAULT()	STAILQ_FIRST(&cc_list)
+#define	CC_DEFAULT()	V_default_cc_ptr
 
 extern struct rwlock cc_list_lock;
 #define	CC_LIST_LOCK_INIT()	rw_init(&cc_list_lock, "cc_list")
@@ -156,6 +161,6 @@ extern struct rwlock cc_list_lock;
 #define	CC_LIST_RUNLOCK()	rw_runlock(&cc_list_lock)
 #define	CC_LIST_WLOCK()		rw_wlock(&cc_list_lock)
 #define	CC_LIST_WUNLOCK()	rw_wunlock(&cc_list_lock)
-#define	CC_LIST_WLOCK_ASSERT()	rw_assert(&cc_list_lock, RA_WLOCKED)
+#define	CC_LIST_LOCK_ASSERT()	rw_assert(&cc_list_lock, RA_LOCKED)
 
 #endif /* _NETINET_CC_H_ */

Modified: stable/8/sys/netinet/cc/cc.c
==============================================================================
--- head/sys/netinet/cc/cc.c	Fri Nov 12 06:41:55 2010	(r215166)
+++ stable/8/sys/netinet/cc/cc.c	Sat May 28 04:40:35 2011	(r222401)
@@ -81,24 +81,7 @@ struct cc_head cc_list = STAILQ_HEAD_INI
 /* Protects the cc_list TAILQ. */
 struct rwlock cc_list_lock;
 
-/*
- * Set the default CC algorithm to new_default. The default is identified
- * by being the first element in the cc_list TAILQ.
- */
-static void
-cc_set_default(struct cc_algo *new_default)
-{
-	CC_LIST_WLOCK_ASSERT();
-
-	/*
-	 * Make the requested system default CC algorithm the first element in
-	 * the list if it isn't already.
-	 */
-	if (new_default != CC_DEFAULT()) {
-		STAILQ_REMOVE(&cc_list, new_default, cc_algo, entries);
-		STAILQ_INSERT_HEAD(&cc_list, new_default, entries);
-	}
-}
+VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo;
 
 /*
  * Sysctl handler to show and change the default CC algorithm.
@@ -106,14 +89,13 @@ cc_set_default(struct cc_algo *new_defau
 static int
 cc_default_algo(SYSCTL_HANDLER_ARGS)
 {
+	char default_cc[TCP_CA_NAME_MAX];
 	struct cc_algo *funcs;
 	int err, found;
 
 	err = found = 0;
 
 	if (req->newptr == NULL) {
-		char default_cc[TCP_CA_NAME_MAX];
-
 		/* Just print the current default. */
 		CC_LIST_RLOCK();
 		strlcpy(default_cc, CC_DEFAULT()->name, TCP_CA_NAME_MAX);
@@ -121,15 +103,15 @@ cc_default_algo(SYSCTL_HANDLER_ARGS)
 		err = sysctl_handle_string(oidp, default_cc, 1, req);
 	} else {
 		/* Find algo with specified name and set it to default. */
-		CC_LIST_WLOCK();
+		CC_LIST_RLOCK();
 		STAILQ_FOREACH(funcs, &cc_list, entries) {
 			if (strncmp((char *)req->newptr, funcs->name,
 			    TCP_CA_NAME_MAX) == 0) {
 				found = 1;
-				cc_set_default(funcs);
+				V_default_cc_ptr = funcs;
 			}
 		}
-		CC_LIST_WUNLOCK();
+		CC_LIST_RUNLOCK();
 
 		if (!found)
 			err = ESRCH;
@@ -146,20 +128,37 @@ cc_list_available(SYSCTL_HANDLER_ARGS)
 {
 	struct cc_algo *algo;
 	struct sbuf *s;
-	int err, first;
+	int err, first, nalgos;
 
-	err = 0;
+	err = nalgos = 0;
 	first = 1;
-	s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND);
+
+	CC_LIST_RLOCK();
+	STAILQ_FOREACH(algo, &cc_list, entries) {
+		nalgos++;
+	}
+	CC_LIST_RUNLOCK();
+
+	s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
 
 	if (s == NULL)
 		return (ENOMEM);
 
+	/*
+	 * It is theoretically possible for the CC list to have grown in size
+	 * since the call to sbuf_new() and therefore for the sbuf to be too
+	 * small. If this were to happen (incredibly unlikely), the sbuf will
+	 * reach an overflow condition, sbuf_printf() will return an error and
+	 * the sysctl will fail gracefully.
+	 */
 	CC_LIST_RLOCK();
 	STAILQ_FOREACH(algo, &cc_list, entries) {
 		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
-		if (err)
+		if (err) {
+			/* Sbuf overflow condition. */
+			err = EOVERFLOW;
 			break;
+		}
 		first = 0;
 	}
 	CC_LIST_RUNLOCK();
@@ -174,10 +173,32 @@ cc_list_available(SYSCTL_HANDLER_ARGS)
 }
 
 /*
+ * Reset the default CC algo to NewReno for any netstack which is using the algo
+ * that is about to go away as its default.
+ */
+static void
+cc_checkreset_default(struct cc_algo *remove_cc)
+{
+	VNET_ITERATOR_DECL(vnet_iter);
+
+	CC_LIST_LOCK_ASSERT();
+
+	VNET_LIST_RLOCK_NOSLEEP();
+	VNET_FOREACH(vnet_iter) {
+		CURVNET_SET(vnet_iter);
+		if (strncmp(CC_DEFAULT()->name, remove_cc->name,
+		    TCP_CA_NAME_MAX) == 0)
+			V_default_cc_ptr = &newreno_cc_algo;
+		CURVNET_RESTORE();
+	}
+	VNET_LIST_RUNLOCK_NOSLEEP();
+}
+
+/*
  * Initialise CC subsystem on system boot.
  */
-void
-cc_init()
+static void
+cc_init(void)
 {
 	CC_LIST_LOCK_INIT();
 	STAILQ_INIT(&cc_list);
@@ -190,8 +211,6 @@ int
 cc_deregister_algo(struct cc_algo *remove_cc)
 {
 	struct cc_algo *funcs, *tmpfuncs;
-	struct tcpcb *tp;
-	struct inpcb *inp;
 	int err;
 
 	err = ENOENT;
@@ -204,58 +223,22 @@ cc_deregister_algo(struct cc_algo *remov
 	CC_LIST_WLOCK();
 	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
 		if (funcs == remove_cc) {
-			/*
-			 * If we're removing the current system default,
-			 * reset the default to newreno.
-			 */
-			if (strncmp(CC_DEFAULT()->name, remove_cc->name,
-			    TCP_CA_NAME_MAX) == 0)
-				cc_set_default(&newreno_cc_algo);
-
+			cc_checkreset_default(remove_cc);
 			STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
 			err = 0;
 			break;
 		}
 	}
 	CC_LIST_WUNLOCK();
-	
-	if (!err) {
+
+	if (!err)
 		/*
-		 * Check all active control blocks and change any that are
-		 * using this algorithm back to newreno. If the algorithm that
-		 * was in use requires cleanup code to be run, call it.
-		 *
-		 * New connections already part way through being initialised
-		 * with the CC algo we're removing will not race with this code
-		 * because the INP_INFO_WLOCK is held during initialisation.
-		 * We therefore don't enter the loop below until the connection
-		 * list has stabilised.
+		 * XXXLAS:
+		 * - We may need to handle non-zero return values in future.
+		 * - If we add CC framework support for protocols other than
+		 *   TCP, we may want a more generic way to handle this step.
 		 */
-		INP_INFO_RLOCK(&V_tcbinfo);
-		LIST_FOREACH(inp, &V_tcb, inp_list) {
-			INP_WLOCK(inp);
-			/* Important to skip tcptw structs. */
-			if (!(inp->inp_flags & INP_TIMEWAIT) &&
-			    (tp = intotcpcb(inp)) != NULL) {
-				/*
-				 * By holding INP_WLOCK here, we are
-				 * assured that the connection is not
-				 * currently executing inside the CC
-				 * module's functions i.e. it is safe to
-				 * make the switch back to newreno.
-				 */
-				if (CC_ALGO(tp) == remove_cc) {
-					tmpfuncs = CC_ALGO(tp);
-					/* Newreno does not require any init. */
-					CC_ALGO(tp) = &newreno_cc_algo;
-					if (tmpfuncs->cb_destroy != NULL)
-						tmpfuncs->cb_destroy(tp->ccv);
-				}
-			}
-			INP_WUNLOCK(inp);
-		}
-		INP_INFO_RUNLOCK(&V_tcbinfo);
-	}
+		tcp_ccalgounload(remove_cc);
 
 	return (err);
 }
@@ -328,11 +311,13 @@ cc_modevent(module_t mod, int event_type
 	return (err);
 }
 
+SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
+
 /* Declare sysctl tree and populate it. */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
     "congestion control related settings");
 
-SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
+SYSCTL_VNET_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
     NULL, 0, cc_default_algo, "A", "default congestion control algorithm");
 
 SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,

Modified: stable/8/sys/netinet/cc/cc_newreno.c
==============================================================================
--- head/sys/netinet/cc/cc_newreno.c	Fri Nov 12 06:41:55 2010	(r215166)
+++ stable/8/sys/netinet/cc/cc_newreno.c	Sat May 28 04:40:35 2011	(r222401)
@@ -52,41 +52,35 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
+#include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
+#include <sys/systm.h>
 
-#include <net/if.h>
-#include <net/if_var.h>
+#include <net/vnet.h>
 
 #include <netinet/cc.h>
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 
 #include <netinet/cc/cc_module.h>
 
-void	newreno_ack_received(struct cc_var *ccv, uint16_t type);
-void	newreno_cong_signal(struct cc_var *ccv, uint32_t type);
-void	newreno_post_recovery(struct cc_var *ccv);
-void	newreno_after_idle(struct cc_var *ccv);
+static void	newreno_ack_received(struct cc_var *ccv, uint16_t type);
+static void	newreno_after_idle(struct cc_var *ccv);
+static void	newreno_cong_signal(struct cc_var *ccv, uint32_t type);
+static void	newreno_post_recovery(struct cc_var *ccv);
 
 struct cc_algo newreno_cc_algo = {
 	.name = "newreno",
 	.ack_received = newreno_ack_received,
+	.after_idle = newreno_after_idle,
 	.cong_signal = newreno_cong_signal,
 	.post_recovery = newreno_post_recovery,
-	.after_idle = newreno_after_idle
 };
 
-/*
- * Increase cwnd on receipt of a successful ACK:
- * if cwnd <= ssthresh, increases by 1 MSS per ACK
- * if cwnd > ssthresh, increase by ~1 MSS per RTT
- */
-void
+static void
 newreno_ack_received(struct cc_var *ccv, uint16_t type)
 {
 	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
@@ -153,14 +147,45 @@ newreno_ack_received(struct cc_var *ccv,
 	}
 }
 
+static void
+newreno_after_idle(struct cc_var *ccv)
+{
+	int rw;
+
+	/*
+	 * If we've been idle for more than one retransmit timeout the old
+	 * congestion window is no longer current and we have to reduce it to
+	 * the restart window before we can transmit again.
+	 *
+	 * The restart window is the initial window or the last CWND, whichever
+	 * is smaller.
+	 *
+	 * This is done to prevent us from flooding the path with a full CWND at
+	 * wirespeed, overloading router and switch buffers along the way.
+	 *
+	 * See RFC5681 Section 4.1. "Restarting Idle Connections".
+	 */
+	if (V_tcp_do_rfc3390)
+		rw = min(4 * CCV(ccv, t_maxseg),
+		    max(2 * CCV(ccv, t_maxseg), 4380));
+	else
+		rw = CCV(ccv, t_maxseg) * 2;
+
+	CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
+}
+
 /*
- * manage congestion signals
+ * Perform any necessary tasks before we enter congestion recovery.
  */
-void
+static void
 newreno_cong_signal(struct cc_var *ccv, uint32_t type)
 {
 	u_int win;
 
+	/* Catch algos which mistakenly leak private signal types. */
+	KASSERT((type & CC_SIGPRIVMASK) == 0,
+	    ("%s: congestion signal type 0x%08x is private\n", __func__, type));
+
 	win = max(CCV(ccv, snd_cwnd) / 2 / CCV(ccv, t_maxseg), 2) *
 	    CCV(ccv, t_maxseg);
 
@@ -183,11 +208,9 @@ newreno_cong_signal(struct cc_var *ccv, 
 }
 
 /*
- * decrease the cwnd in response to packet loss or a transmit timeout.
- * th can be null, in which case cwnd will be set according to reno instead
- * of new reno.
+ * Perform any necessary tasks before we exit congestion recovery.
  */
-void
+static void
 newreno_post_recovery(struct cc_var *ccv)
 {
 	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
@@ -209,23 +232,5 @@ newreno_post_recovery(struct cc_var *ccv
 	}
 }
 
-/*
- * if a connection has been idle for a while and more data is ready to be sent,
- * reset cwnd
- */
-void
-newreno_after_idle(struct cc_var *ccv)
-{
-	/*
-	 * We have been idle for "a while" and no acks are expected to clock out
-	 * any data we send -- slow start to get ack "clock" running again.
-	 */
-	if (V_tcp_do_rfc3390)
-		CCV(ccv, snd_cwnd) = min(4 * CCV(ccv, t_maxseg),
-		    max(2 * CCV(ccv, t_maxseg), 4380));
-	else
-		CCV(ccv, snd_cwnd) = CCV(ccv, t_maxseg) * 2;
-}
-
 
 DECLARE_CC_MODULE(newreno, &newreno_cc_algo);

Modified: stable/8/sys/netinet/tcp_input.c
==============================================================================
--- stable/8/sys/netinet/tcp_input.c	Sat May 28 04:10:44 2011	(r222400)
+++ stable/8/sys/netinet/tcp_input.c	Sat May 28 04:40:35 2011	(r222401)
@@ -1,6 +1,20 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  *	The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2007-2008,2010
+ *	Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart, James Healy and
+ * David Hayes, made possible in part by a grant from the Cisco University
+ * Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -61,6 +75,7 @@ __FBSDID("$FreeBSD$");
 
 #define TCPSTATES		/* for logging */
 
+#include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
@@ -75,7 +90,6 @@ __FBSDID("$FreeBSD$");
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
-#include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
@@ -96,7 +110,7 @@ __FBSDID("$FreeBSD$");
 
 #include <security/mac/mac_framework.h>
 
-static const int tcprexmtthresh = 3;
+const int tcprexmtthresh = 3;
 
 VNET_DEFINE(struct tcpstat, tcpstat);
 SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
@@ -132,19 +146,16 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO,
     "Enable RFC 3042 (Limited Transmit)");
 
 VNET_DEFINE(int, tcp_do_rfc3390) = 1;
-#define	V_tcp_do_rfc3390	VNET(tcp_do_rfc3390)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3390), 0,
     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
 
 VNET_DEFINE(int, tcp_do_rfc3465) = 1;
-#define	V_tcp_do_rfc3465	VNET(tcp_do_rfc3465)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3465), 0,
     "Enable RFC 3465 (Appropriate Byte Counting)");
 
 VNET_DEFINE(int, tcp_abc_l_var) = 2;
-#define	V_tcp_abc_l_var		VNET(tcp_abc_l_var)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
     &VNET_NAME(tcp_abc_l_var), 2,
     "Cap the max cwnd increment during slow-start to this number of segments");
@@ -203,8 +214,10 @@ static void	 tcp_pulloutofband(struct so
 		     struct tcphdr *, struct mbuf *, int);
 static void	 tcp_xmit_timer(struct tcpcb *, int);
 static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline
-		 tcp_congestion_exp(struct tcpcb *);
+static void inline	cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+			    uint16_t type);
+static void inline	cc_conn_init(struct tcpcb *tp);
+static void inline	cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 static void inline 	tcp_fields_to_host(struct tcphdr *);
 #ifdef TCP_SIGNATURE
 static void inline 	tcp_fields_to_net(struct tcphdr *);
@@ -226,20 +239,190 @@ kmod_tcpstat_inc(int statnum)
 	(*((u_long *)&V_tcpstat + statnum))++;
 }
 
+/*
+ * CC wrapper hook functions
+ */
 static void inline
-tcp_congestion_exp(struct tcpcb *tp)
+cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
 {
-	u_int win;
-	
-	win = min(tp->snd_wnd, tp->snd_cwnd) /
-	    2 / tp->t_maxseg;
-	if (win < 2)
-		win = 2;
-	tp->snd_ssthresh = win * tp->t_maxseg;
-	ENTER_FASTRECOVERY(tp);
-	tp->snd_recover = tp->snd_max;
-	if (tp->t_flags & TF_ECN_PERMIT)
-		tp->t_flags |= TF_ECN_SND_CWR;
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
+	if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd))
+		tp->ccv->flags |= CCF_CWND_LIMITED;
+	else
+		tp->ccv->flags &= ~CCF_CWND_LIMITED;
+
+	if (type == CC_ACK) {
+		if (tp->snd_cwnd > tp->snd_ssthresh) {
+			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
+			     V_tcp_abc_l_var * tp->t_maxseg);
+			if (tp->t_bytes_acked >= tp->snd_cwnd) {
+				tp->t_bytes_acked -= tp->snd_cwnd;
+				tp->ccv->flags |= CCF_ABC_SENTAWND;
+			}
+		} else {
+				tp->ccv->flags &= ~CCF_ABC_SENTAWND;
+				tp->t_bytes_acked = 0;
+		}
+	}
+
+	if (CC_ALGO(tp)->ack_received != NULL) {
+		/* XXXLAS: Find a way to live without this */
+		tp->ccv->curack = th->th_ack;
+		CC_ALGO(tp)->ack_received(tp->ccv, type);
+	}
+}
+
+static void inline
+cc_conn_init(struct tcpcb *tp)
+{
+	struct hc_metrics_lite metrics;
+	struct inpcb *inp = tp->t_inpcb;
+	int rtt;
+#ifdef INET6
+	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+#endif
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	tcp_hc_get(&inp->inp_inc, &metrics);
+
+	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+		tp->t_srtt = rtt;
+		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+		TCPSTAT_INC(tcps_usedrtt);
+		if (metrics.rmx_rttvar) {
+			tp->t_rttvar = metrics.rmx_rttvar;
+			TCPSTAT_INC(tcps_usedrttvar);
+		} else {
+			/* default variation is +- 1 rtt */
+			tp->t_rttvar =
+			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+		}
+		TCPT_RANGESET(tp->t_rxtcur,
+		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+		    tp->t_rttmin, TCPTV_REXMTMAX);
+	}
+	if (metrics.rmx_ssthresh) {
+		/*
+		 * There's some sort of gateway or interface
+		 * buffer limit on the path.  Use this to set
+		 * the slow start threshhold, but set the
+		 * threshold to no less than 2*mss.
+		 */
+		tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
+		TCPSTAT_INC(tcps_usedssthresh);
+	}
+
+	/*
+	 * Set the slow-start flight size depending on whether this
+	 * is a local network or not.
+	 *
+	 * Extend this so we cache the cwnd too and retrieve it here.
+	 * Make cwnd even bigger than RFC3390 suggests but only if we
+	 * have previous experience with the remote host. Be careful
+	 * not make cwnd bigger than remote receive window or our own
+	 * send socket buffer. Maybe put some additional upper bound
+	 * on the retrieved cwnd. Should do incremental updates to
+	 * hostcache when cwnd collapses so next connection doesn't
+	 * overloads the path again.
+	 *
+	 * XXXAO: Initializing the CWND from the hostcache is broken
+	 * and in its current form not RFC conformant.  It is disabled
+	 * until fixed or removed entirely.
+	 *
+	 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
+	 * We currently check only in syncache_socket for that.
+	 */
+/* #define TCP_METRICS_CWND */
+#ifdef TCP_METRICS_CWND
+	if (metrics.rmx_cwnd)
+		tp->snd_cwnd = max(tp->t_maxseg, min(metrics.rmx_cwnd / 2,
+		    min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+	else
+#endif
+	if (V_tcp_do_rfc3390)
+		tp->snd_cwnd = min(4 * tp->t_maxseg,
+		    max(2 * tp->t_maxseg, 4380));
+#ifdef INET6
+	else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
+		 (!isipv6 && in_localaddr(inp->inp_faddr)))
+#else
+	else if (in_localaddr(inp->inp_faddr))
+#endif
+		tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local;
+	else
+		tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz;
+
+	if (CC_ALGO(tp)->conn_init != NULL)
+		CC_ALGO(tp)->conn_init(tp->ccv);
+}
+
+void inline
+cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
+{
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	switch(type) {
+	case CC_NDUPACK:
+		if (!IN_FASTRECOVERY(tp->t_flags)) {
+			tp->snd_recover = tp->snd_max;
+			if (tp->t_flags & TF_ECN_PERMIT)
+				tp->t_flags |= TF_ECN_SND_CWR;
+		}
+		break;
+	case CC_ECN:
+		if (!IN_CONGRECOVERY(tp->t_flags)) {
+			TCPSTAT_INC(tcps_ecn_rcwnd);
+			tp->snd_recover = tp->snd_max;
+			if (tp->t_flags & TF_ECN_PERMIT)
+				tp->t_flags |= TF_ECN_SND_CWR;
+		}
+		break;
+	case CC_RTO:
+		tp->t_dupacks = 0;
+		tp->t_bytes_acked = 0;
+		EXIT_RECOVERY(tp->t_flags);
+		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
+		    tp->t_maxseg) * tp->t_maxseg;
+		tp->snd_cwnd = tp->t_maxseg;
+		break;
+	case CC_RTO_ERR:
+		TCPSTAT_INC(tcps_sndrexmitbad);
+		/* RTO was unnecessary, so reset everything. */
+		tp->snd_cwnd = tp->snd_cwnd_prev;
+		tp->snd_ssthresh = tp->snd_ssthresh_prev;
+		tp->snd_recover = tp->snd_recover_prev;
+		if (tp->t_flags & TF_WASFRECOVERY)
+			ENTER_FASTRECOVERY(tp->t_flags);
+		if (tp->t_flags & TF_WASCRECOVERY)
+			ENTER_CONGRECOVERY(tp->t_flags);
+		tp->snd_nxt = tp->snd_max;
+		tp->t_badrxtwin = 0;
+		break;
+	}
+
+	if (CC_ALGO(tp)->cong_signal != NULL) {
+		if (th != NULL)
+			tp->ccv->curack = th->th_ack;
+		CC_ALGO(tp)->cong_signal(tp->ccv, type);
+	}
+}
+
+static void inline
+cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
+{
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	/* XXXLAS: KASSERT that we're in recovery? */
+
+	if (CC_ALGO(tp)->post_recovery != NULL) {
+		tp->ccv->curack = th->th_ack;
+		CC_ALGO(tp)->post_recovery(tp->ccv);
+	}
+	/* XXXLAS: EXIT_RECOVERY ? */
+	tp->t_bytes_acked = 0;
 }
 
 static inline void
@@ -1252,14 +1435,9 @@ tcp_do_segment(struct mbuf *m, struct tc
 			TCPSTAT_INC(tcps_ecn_ect1);
 			break;
 		}
-		/*
-		 * Congestion experienced.
-		 * Ignore if we are already trying to recover.
-		 */
-		if ((thflags & TH_ECE) &&
-		    SEQ_LEQ(th->th_ack, tp->snd_recover)) {
-			TCPSTAT_INC(tcps_ecn_rcwnd);
-			tcp_congestion_exp(tp);
+		/* Congestion experienced. */
+		if (thflags & TH_ECE) {
+			cc_cong_signal(tp, th, CC_ECN);
 		}
 	}
 
@@ -1354,15 +1532,9 @@ tcp_do_segment(struct mbuf *m, struct tc
 		if (tlen == 0) {
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
-			    tp->snd_cwnd >= tp->snd_wnd &&
-			    ((!V_tcp_do_newreno &&
-			      !(tp->t_flags & TF_SACK_PERMIT) &&
-			      tp->t_dupacks < tcprexmtthresh) ||
-			     ((V_tcp_do_newreno ||
-			       (tp->t_flags & TF_SACK_PERMIT)) &&
-			      !IN_FASTRECOVERY(tp) &&
-			      (to.to_flags & TOF_SACK) == 0 &&
-			      TAILQ_EMPTY(&tp->snd_holes)))) {
+			    !IN_RECOVERY(tp->t_flags) &&
+			    (to.to_flags & TOF_SACK) == 0 &&
+			    TAILQ_EMPTY(&tp->snd_holes)) {
 				/*
 				 * This is a pure ack for outstanding data.
 				 */
@@ -1382,15 +1554,7 @@ tcp_do_segment(struct mbuf *m, struct tc
 				 */
 				if (tp->t_rxtshift == 1 &&
 				    (int)(ticks - tp->t_badrxtwin) < 0) {
-					TCPSTAT_INC(tcps_sndrexmitbad);
-					tp->snd_cwnd = tp->snd_cwnd_prev;
-					tp->snd_ssthresh =
-					    tp->snd_ssthresh_prev;
-					tp->snd_recover = tp->snd_recover_prev;
-					if (tp->t_flags & TF_WASFRECOVERY)
-					    ENTER_FASTRECOVERY(tp);
-					tp->snd_nxt = tp->snd_max;
-					tp->t_badrxtwin = 0;
+					cc_cong_signal(tp, th, CC_RTO_ERR);
 				}
 
 				/*
@@ -1417,13 +1581,22 @@ tcp_do_segment(struct mbuf *m, struct tc
 							ticks - tp->t_rtttime);
 				}
 				tcp_xmit_bandwidth_limit(tp, th->th_ack);
-				acked = th->th_ack - tp->snd_una;
+				acked = BYTES_THIS_ACK(tp, th);
 				TCPSTAT_INC(tcps_rcvackpack);
 				TCPSTAT_ADD(tcps_rcvackbyte, acked);
 				sbdrop(&so->so_snd, acked);
 				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
 				    SEQ_LEQ(th->th_ack, tp->snd_recover))
 					tp->snd_recover = th->th_ack - 1;
+				
+				/*
+				 * Let the congestion control algorithm update
+				 * congestion control related information. This
+				 * typically means increasing the congestion
+				 * window.
+				 */
+				cc_ack_received(tp, th, CC_ACK);
+
 				tp->snd_una = th->th_ack;
 				/*
 				 * Pull snd_wl2 up to prevent seq wrap relative
@@ -1684,6 +1857,7 @@ tcp_do_segment(struct mbuf *m, struct tc
 				thflags &= ~TH_SYN;
 			} else {
 				tp->t_state = TCPS_ESTABLISHED;
+				cc_conn_init(tp);
 				tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
 			}
 		} else {
@@ -2087,6 +2261,7 @@ tcp_do_segment(struct mbuf *m, struct tc
 			tp->t_flags &= ~TF_NEEDFIN;
 		} else {
 			tp->t_state = TCPS_ESTABLISHED;
+			cc_conn_init(tp);
 			tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
 		}
 		/*
@@ -2155,11 +2330,10 @@ tcp_do_segment(struct mbuf *m, struct tc
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
 				else if (++tp->t_dupacks > tcprexmtthresh ||
-				    ((V_tcp_do_newreno ||
-				      (tp->t_flags & TF_SACK_PERMIT)) &&
-				     IN_FASTRECOVERY(tp))) {
+				     IN_FASTRECOVERY(tp->t_flags)) {
+					cc_ack_received(tp, th, CC_DUPACK);
 					if ((tp->t_flags & TF_SACK_PERMIT) &&
-					    IN_FASTRECOVERY(tp)) {
+					    IN_FASTRECOVERY(tp->t_flags)) {
 						int awnd;
 						
 						/*
@@ -2190,19 +2364,20 @@ tcp_do_segment(struct mbuf *m, struct tc
 					 * recovery.
 					 */
 					if (tp->t_flags & TF_SACK_PERMIT) {
-						if (IN_FASTRECOVERY(tp)) {
+						if (IN_FASTRECOVERY(tp->t_flags)) {
 							tp->t_dupacks = 0;
 							break;
 						}
-					} else if (V_tcp_do_newreno ||
-					    V_tcp_do_ecn) {
+					} else {
 						if (SEQ_LEQ(th->th_ack,
 						    tp->snd_recover)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					}
-					tcp_congestion_exp(tp);
+					/* Congestion signal before ack. */
+					cc_cong_signal(tp, th, CC_NDUPACK);
+					cc_ack_received(tp, th, CC_DUPACK);
 					tcp_timer_activate(tp, TT_REXMT, 0);
 					tp->t_rtttime = 0;
 					if (tp->t_flags & TF_SACK_PERMIT) {
@@ -2226,6 +2401,7 @@ tcp_do_segment(struct mbuf *m, struct tc
 						tp->snd_nxt = onxt;
 					goto drop;
 				} else if (V_tcp_do_rfc3042) {
+					cc_ack_received(tp, th, CC_DUPACK);
 					u_long oldcwnd = tp->snd_cwnd;
 					tcp_seq oldsndmax = tp->snd_max;
 					u_int sent;
@@ -2267,37 +2443,14 @@ tcp_do_segment(struct mbuf *m, struct tc
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
-		if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
-			if (IN_FASTRECOVERY(tp)) {
-				if (SEQ_LT(th->th_ack, tp->snd_recover)) {
-					if (tp->t_flags & TF_SACK_PERMIT)
-						tcp_sack_partialack(tp, th);
-					else
-						tcp_newreno_partial_ack(tp, th);
-				} else {
-					/*
-					 * Out of fast recovery.
-					 * Window inflation should have left us
-					 * with approximately snd_ssthresh
-					 * outstanding data.
-					 * But in case we would be inclined to
-					 * send a burst, better to do it via
-					 * the slow start mechanism.
-					 */
-					if (SEQ_GT(th->th_ack +
-							tp->snd_ssthresh,
-						   tp->snd_max))
-						tp->snd_cwnd = tp->snd_max -
-								th->th_ack +
-								tp->t_maxseg;
-					else
-						tp->snd_cwnd = tp->snd_ssthresh;
-				}
-			}
-		} else {
-			if (tp->t_dupacks >= tcprexmtthresh &&
-			    tp->snd_cwnd > tp->snd_ssthresh)
-				tp->snd_cwnd = tp->snd_ssthresh;
+		if (IN_FASTRECOVERY(tp->t_flags)) {
+			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+				if (tp->t_flags & TF_SACK_PERMIT)
+					tcp_sack_partialack(tp, th);
+				else
+					tcp_newreno_partial_ack(tp, th);
+			} else
+				cc_post_recovery(tp, th);
 		}
 		tp->t_dupacks = 0;
 		/*
@@ -2328,7 +2481,7 @@ process_ACK:
 		    ("tcp_input: process_ACK ti_locked %d", ti_locked));
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
-		acked = th->th_ack - tp->snd_una;
+		acked = BYTES_THIS_ACK(tp, th);
 		TCPSTAT_INC(tcps_rcvackpack);
 		TCPSTAT_ADD(tcps_rcvackbyte, acked);
 
@@ -2339,16 +2492,8 @@ process_ACK:
 		 * original cwnd and ssthresh, and proceed to transmit where
 		 * we left off.
 		 */
-		if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) {
-			TCPSTAT_INC(tcps_sndrexmitbad);
-			tp->snd_cwnd = tp->snd_cwnd_prev;
-			tp->snd_ssthresh = tp->snd_ssthresh_prev;
-			tp->snd_recover = tp->snd_recover_prev;
-			if (tp->t_flags & TF_WASFRECOVERY)
-				ENTER_FASTRECOVERY(tp);
-			tp->snd_nxt = tp->snd_max;
-			tp->t_badrxtwin = 0;	/* XXX probably not required */
-		}
+		if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0)
+			cc_cong_signal(tp, th, CC_RTO_ERR);
 
 		/*
 		 * If we have a timestamp reply, update smoothed
@@ -2396,61 +2541,12 @@ process_ACK:
 			goto step6;
 
 		/*
-		 * When new data is acked, open the congestion window.
-		 * Method depends on which congestion control state we're
-		 * in (slow start or cong avoid) and if ABC (RFC 3465) is
-		 * enabled.
-		 *
-		 * slow start: cwnd <= ssthresh
-		 * cong avoid: cwnd > ssthresh
-		 *
-		 * slow start and ABC (RFC 3465):
-		 *   Grow cwnd exponentially by the amount of data
-		 *   ACKed capping the max increment per ACK to
-		 *   (abc_l_var * maxseg) bytes.
-		 *
-		 * slow start without ABC (RFC 2581):
-		 *   Grow cwnd exponentially by maxseg per ACK.
-		 *
-		 * cong avoid and ABC (RFC 3465):
-		 *   Grow cwnd linearly by maxseg per RTT for each
-		 *   cwnd worth of ACKed data.
-		 *
-		 * cong avoid without ABC (RFC 2581):
-		 *   Grow cwnd linearly by approximately maxseg per RTT using
-		 *   maxseg^2 / cwnd per ACK as the increment.
-		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
-		 *   avoid capping cwnd.
-		 */
-		if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
-		    !IN_FASTRECOVERY(tp)) {
-			u_int cw = tp->snd_cwnd;
-			u_int incr = tp->t_maxseg;
-			/* In congestion avoidance? */
-			if (cw > tp->snd_ssthresh) {
-				if (V_tcp_do_rfc3465) {
-					tp->t_bytes_acked += acked;
-					if (tp->t_bytes_acked >= tp->snd_cwnd)
-						tp->t_bytes_acked -= cw;
-					else
-						incr = 0;
-				}
-				else
-					incr = max((incr * incr / cw), 1);
-			/*
-			 * In slow-start with ABC enabled and no RTO in sight?
-			 * (Must not use abc_l_var > 1 if slow starting after an
-			 * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt ==
-			 * snd_max check is sufficient to handle this).
-			 */
-			} else if (V_tcp_do_rfc3465 &&
-			    tp->snd_nxt == tp->snd_max)
-				incr = min(acked,
-				    V_tcp_abc_l_var * tp->t_maxseg);
-			/* ABC is on by default, so (incr == 0) frequently. */
-			if (incr > 0)
-				tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
-		}
+		 * Let the congestion control algorithm update congestion
+		 * control related information. This typically means increasing
+		 * the congestion window.
+		 */
+		cc_ack_received(tp, th, CC_ACK);
+
 		SOCKBUF_LOCK(&so->so_snd);
 		if (acked > so->so_snd.sb_cc) {

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


