Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 4 Feb 2014 09:38:30 +0000
From:      "Eggert, Lars" <lars@netapp.com>
To:        "freebsd-net@freebsd.org" <freebsd-net@freebsd.org>
Cc:        "varis81@hotmail.com" <varis81@hotmail.com>
Subject:   Patches for RFC6937 and draft-ietf-tcpm-newcwv-00
Message-ID:  <259C9434-C6FE-42EA-823D-ECB024DBF3D7@netapp.com>

next in thread | raw e-mail | index | archive | help
--Apple-Mail=_5088A38A-DF43-450D-9E03-31C51BD176C7
Content-Type: multipart/mixed;
	boundary="Apple-Mail=_E3102E34-B41D-47A0-AB2F-5E03D718EBE4"


--Apple-Mail=_E3102E34-B41D-47A0-AB2F-5E03D718EBE4
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain;
	charset=us-ascii

Hi,

below are two patches that implement RFC6937 ("Proportional Rate =
Reduction for TCP") and draft-ietf-tcpm-newcwv-00 ("Updating TCP to =
support Rate-Limited Traffic"). They were done by Aris =
Angelogiannopoulos for his MS thesis, which is at =
https://eggert.org/students/angelogiannopoulos-thesis.pdf.

The patches should apply to -CURRENT as of Sep 17, 2013. (Sorry for the =
delay in sending them, we'd been trying to get some feedback from =
committers first, without luck.)

Please note that newcwv is still a work in progress in the IETF, and the =
patch has some limitations with regard to the "pipeACK Sampling Period" =
mentioned in the Internet-Draft. Aris describes in his thesis what =
exactly he implemented:

"The second implementation choice, is in regards with the measurement of =
pipeACK. This variable is the most important introduced by the method =
and is used to compute the phase that the sender currently lies in. In =
order to compute pipeACK the approach suggested by the Internet Draft =
(ID) is followed [ncwv]. During initialization, pipeACK is set to the =
maximum possible value. A helper variable prevHighACK is introduced that =
is initialized to the initial sequence number (iss). prevHighACK holds =
the value of the highest acknowledged byte so far. pipeACK is measured =
once per RTT meaning that when an ACK covering prevHighACK is received, =
pipeACK becomes the difference between the current ACK and prevHighACK. =
This is called a pipeACK sample.  A newer version of the draft suggests =
that multiple pipeACK samples can be used during the pipeACK sampling =
period."

Lars


--Apple-Mail=_E3102E34-B41D-47A0-AB2F-5E03D718EBE4
Content-Disposition: attachment;
	filename=prr.patch
Content-Type: application/octet-stream;
	name="prr.patch"
Content-Transfer-Encoding: 7bit

diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 75609fd..70c29a8 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -145,6 +145,18 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
     &VNET_NAME(drop_synfin), 0,
     "Drop TCP packets with SYN+FIN set");
 
+VNET_DEFINE(int, tcp_do_prr_conservative) = 0;
+#define V_tcp_do_prr_conservative VNET(tcp_do_prr_conservative)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, do_prr_conservative, CTLFLAG_RW,
+    &VNET_NAME(tcp_do_prr_conservative), 0,
+    "Do conservative PRR");
+
+VNET_DEFINE(int, tcp_do_prr) = 0;
+#define V_tcp_do_prr  VNET(tcp_do_prr)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_RW,
+    &VNET_NAME(tcp_do_prr), 0,
+    "Do the Proportional Rate Reduction Algorithm");
+
 VNET_DEFINE(int, tcp_do_rfc3042) = 1;
 #define	V_tcp_do_rfc3042	VNET(tcp_do_rfc3042)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
@@ -229,6 +241,7 @@ static void	 tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
 static void	 tcp_xmit_timer(struct tcpcb *, int);
 static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+static void  tcp_prr_partial_ack(struct tcpcb *, struct tcphdr *);
 static void inline 	tcp_fields_to_host(struct tcphdr *);
 #ifdef TCP_SIGNATURE
 static void inline 	tcp_fields_to_net(struct tcphdr *);
@@ -2460,7 +2473,50 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 				else if (++tp->t_dupacks > tcprexmtthresh ||
 				     IN_FASTRECOVERY(tp->t_flags)) {
 					cc_ack_received(tp, th, CC_DUPACK);
-					if ((tp->t_flags & TF_SACK_PERMIT) &&
+					if (V_tcp_do_prr &&
+					    IN_FASTRECOVERY(tp->t_flags) &&
+					    (tp->t_flags & TF_SACK_PERMIT)) {
+						long snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
+						/*
+						 *In a duplicate ACK del_data is only the
+						 *diff_in_sack. If no SACK is used del_data will be 0.
+						 *Pipe is the amount of data we estimate to be
+						 *in the network.
+						 */
+						del_data = tp->diff_in_sack;
+						pipe = (tp->snd_nxt - tp->snd_fack) +
+						    tp->sackhint.sack_bytes_rexmit;
+						tp->prr_delivered += del_data;
+						if (pipe > tp->snd_ssthresh)
+							snd_cnt = (tp->prr_delivered * tp->snd_ssthresh /
+							    tp->recover_fs) + 1 - tp->prr_out;
+						else {
+							if (V_tcp_do_prr_conservative)
+								limit = tp->prr_delivered - tp->prr_out;
+							else
+								if ((tp->prr_delivered - tp->prr_out) > del_data)
+									limit = tp->prr_delivered - tp->prr_out +
+									    tp->t_maxseg;
+								else
+									limit = del_data + tp->t_maxseg;
+							if ((tp->snd_ssthresh - pipe) < limit)
+								snd_cnt = tp->snd_ssthresh - pipe;
+							else
+								snd_cnt = limit;
+						}
+						snd_cnt = (snd_cnt / tp->t_maxseg);
+						if (snd_cnt < 0)
+							snd_cnt = 0;
+						/*
+						 * Allow snd_cnt more segments into the network
+						 * in response to this ACK.  snd_cwnd also has
+						 * to cover any SACK retransmissions, so it is
+						 * adjusted accordingly.
+						 */
+						tp->snd_cwnd = tp->snd_nxt - tp->sack_newdata +
+						    tp->sackhint.sack_bytes_rexmit + (snd_cnt*tp->t_maxseg);
+					}
+					else if ((tp->t_flags & TF_SACK_PERMIT) &&
 					    IN_FASTRECOVERY(tp->t_flags)) {
 						int awnd;
 
@@ -2495,12 +2551,18 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 					tcp_seq onxt = tp->snd_nxt;
 
 					/*
-					 * If we're doing sack, check to
-					 * see if we're already in sack
+					 * If we're doing sack or prr, check to
+					 * see if we're already in
 					 * recovery. If we're not doing sack,
 					 * check to see if we're in newreno
 					 * recovery.
 					 */
+					if (V_tcp_do_prr) {
+						if (IN_FASTRECOVERY(tp->t_flags)) {
+							tp->t_dupacks = 0;
+							break;
+						}
+					}
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						if (IN_FASTRECOVERY(tp->t_flags)) {
 							tp->t_dupacks = 0;
@@ -2518,6 +2580,15 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 					cc_ack_received(tp, th, CC_DUPACK);
 					tcp_timer_activate(tp, TT_REXMT, 0);
 					tp->t_rtttime = 0;
+					if (V_tcp_do_prr) {
+						/*
+						 * snd_ssthresh is already updated by cc_cong_signal.
+						 */
+						tp->prr_delivered = 0;
+						tp->prr_out = 0;
+						if(!(tp->recover_fs = tp->snd_nxt - tp->snd_una))
+							tp->recover_fs = 1;
+					}
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						TCPSTAT_INC(
 						    tcps_sack_recovery_episode);
@@ -2614,7 +2685,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		 */
 		if (IN_FASTRECOVERY(tp->t_flags)) {
 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
-				if (tp->t_flags & TF_SACK_PERMIT)
+				if (V_tcp_do_prr && (tp->t_flags & TF_SACK_PERMIT))
+					tcp_prr_partial_ack(tp, th);
+				else if (tp->t_flags & TF_SACK_PERMIT)
 					tcp_sack_partialack(tp, th);
 				else
 					tcp_newreno_partial_ack(tp, th);
@@ -3692,6 +3765,82 @@ tcp_mssopt(struct in_conninfo *inc)
 	return (mss);
 }
 
+/*
+ * Process a partial ACK received during fast recovery when Proportional
+ * Rate Reduction (RFC 6937) and SACK are in use.
+ *
+ * Updates prr_delivered with the data this ACK reports as delivered,
+ * derives the number of whole segments that may be sent in response,
+ * sets snd_cwnd to allow exactly that much, and calls tcp_output().
+ * The inpcb write lock must be held (asserted below).
+ */
+static void
+tcp_prr_partial_ack(struct tcpcb *tp, struct tcphdr *th)
+{
+	long snd_cnt, limit, del_data, pipe, surplus;
+	tcp_seq recover_fs;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	tcp_timer_activate(tp, TT_REXMT, 0);
+	tp->t_rtttime = 0;
+	/*
+	 * del_data is the cumulatively ACKed data plus the change in SACKed
+	 * data.  diff_in_sack carries a signed difference, so convert it
+	 * through int32_t; read as unsigned, a decrease would turn into a
+	 * huge increase wherever long is wider than 32 bits.
+	 */
+	del_data = (int32_t)tp->diff_in_sack;
+	if (SEQ_GEQ(th->th_ack, tp->snd_una))
+		del_data += BYTES_THIS_ACK(tp, th);
+	/* Estimate of the number of bytes currently in the network. */
+	pipe = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit;
+	tp->prr_delivered += del_data;
+	if (pipe > tp->snd_ssthresh) {
+		/*
+		 * Proportional Rate Reduction:
+		 * CEIL(prr_delivered * ssthresh / RecoverFS) - prr_out.
+		 * Guard against a zero RecoverFS and do the multiplication
+		 * in 64 bits so it cannot overflow a 32-bit u_long.
+		 */
+		recover_fs = (tp->recover_fs != 0) ? tp->recover_fs : 1;
+		snd_cnt = (long)(((uint64_t)tp->prr_delivered *
+		    tp->snd_ssthresh + recover_fs - 1) / recover_fs) -
+		    (long)tp->prr_out;
+	} else {
+		/*
+		 * pipe is at or below ssthresh: bound what is sent by the
+		 * data delivered but not yet sent.  The counters are
+		 * unsigned, so take their difference as a signed value.
+		 */
+		surplus = (int32_t)(tp->prr_delivered - tp->prr_out);
+		if (V_tcp_do_prr_conservative)
+			limit = surplus;
+		else if (surplus > del_data)
+			limit = surplus + tp->t_maxseg;
+		else
+			limit = del_data + tp->t_maxseg;
+		snd_cnt = (long)(tp->snd_ssthresh - pipe);
+		if (limit < snd_cnt)
+			snd_cnt = limit;
+	}
+	/*
+	 * Clamp before converting to whole segments: t_maxseg is unsigned,
+	 * so dividing a negative count by it could yield a large positive
+	 * value where long and u_int have the same width.
+	 */
+	if (snd_cnt < 0)
+		snd_cnt = 0;
+	snd_cnt /= tp->t_maxseg;
+	/*
+	 * Open snd_cwnd just far enough to cover the data already sent past
+	 * sack_newdata, any SACK retransmissions, and snd_cnt new segments.
+	 */
+	tp->snd_cwnd = tp->snd_nxt - tp->sack_newdata +
+	    tp->sackhint.sack_bytes_rexmit + (snd_cnt * tp->t_maxseg);
+	tp->t_flags |= TF_ACKNOW;
+	(void) tcp_output(tp);
+}
 
 /*
  * On a partial ack arrives, force the retransmission of the
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 00d5415..7b4936d 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -1194,6 +1194,8 @@ send:
 		    ((so->so_options & SO_DONTROUTE) ?  IP_ROUTETOIF : 0),
 		    NULL, NULL, tp->t_inpcb);
 
+		if (V_tcp_do_prr && IN_FASTRECOVERY(tp->t_flags))
+		    tp->prr_out += len;
 		if (error == EMSGSIZE && ro.ro_rt != NULL)
 			mtu = ro.ro_rt->rt_rmx.rmx_mtu;
 		RO_RTFREE(&ro);
@@ -1232,6 +1234,8 @@ send:
 	    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 	    tp->t_inpcb);
 
+	if (V_tcp_do_prr && IN_FASTRECOVERY(tp->t_flags))
+	    tp->prr_out += len;
 	if (error == EMSGSIZE && ro.ro_rt != NULL)
 		mtu = ro.ro_rt->rt_rmx.rmx_mtu;
 	RO_RTFREE(&ro);
@@ -1323,6 +1327,8 @@ timer:
 		 * XXX: It is a POLA question whether calling tcp_drop right
 		 * away would be the really correct behavior instead.
 		 */
+		if (V_tcp_do_prr && IN_FASTRECOVERY(tp->t_flags))
+			tp->prr_out -= len;
 		if (((tp->t_flags & TF_FORCEDATA) == 0 ||
 		    !tcp_timer_active(tp, TT_PERSIST)) &&
 		    ((flags & TH_SYN) == 0) &&
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 440bd64..800df2f 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -348,9 +348,10 @@ tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
 void
 tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 {
-	struct sackhole *cur, *temp;
+	struct sackhole *cur, *temp, *temp1;
 	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
 	int i, j, num_sack_blks;
+	tcp_seq old = 0, new = 0;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
@@ -382,13 +383,25 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 				sack_blocks[num_sack_blks++] = sack;
 		}
 	}
+	if (TAILQ_EMPTY(&tp->snd_holes))
+		/*
+		 * Empty scoreboard. Need to initialize snd_fack (it may be
+		 * uninitialized or have a bogus value). Scoreboard holes
+		 * (from the sack blocks received) are created later below
+		 * (in the logic that adds holes to the tail of the
+		 * scoreboard).
+		 */
+		tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
 	/*
 	 * Return if SND.UNA is not advanced and no valid SACK block is
-	 * received.
+	 * received. If there is no new valid SACK block, the scoreboard
+	 * remains the same, i.e. the difference is 0.
 	 */
-	if (num_sack_blks == 0)
+	if (num_sack_blks == 0){
+		if (V_tcp_do_prr)
+			tp->diff_in_sack = 0;
 		return;
-
+	}
 	/*
 	 * Sort the SACK blocks so we can update the scoreboard with just one
 	 * pass. The overhead of sorting upto 4+1 elements is less than
@@ -403,15 +416,14 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 			}
 		}
 	}
-	if (TAILQ_EMPTY(&tp->snd_holes))
-		/*
-		 * Empty scoreboard. Need to initialize snd_fack (it may be
-		 * uninitialized or have a bogus value). Scoreboard holes
-		 * (from the sack blocks received) are created later below
-		 * (in the logic that adds holes to the tail of the
-		 * scoreboard).
-		 */
-		tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
+	if (V_tcp_do_prr)
+	    if(!TAILQ_EMPTY(&tp->snd_holes))
+		    TAILQ_FOREACH(temp, &tp->snd_holes, scblink) {
+			    if ((temp1 = TAILQ_NEXT(temp, scblink)) != NULL)
+				    old += temp1->start - temp->end;
+			    else if (SEQ_GT(tp->snd_fack, temp->end))
+				    old += tp->snd_fack - temp->end;
+		    }
 	/*
 	 * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
 	 * SACK holes (snd_holes) are traversed from their tails with just
@@ -540,6 +552,19 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 		else
 			sblkp--;
 	}
+	/*
+	 * Calculate number of bytes in the scoreboard.
+	 */
+	if (V_tcp_do_prr)
+		if (!TAILQ_EMPTY(&tp->snd_holes))
+		    TAILQ_FOREACH(temp, &tp->snd_holes, scblink) {
+				if ((temp1 = TAILQ_NEXT(temp, scblink)) != NULL)
+					new += temp1->start - temp->end;
+				else if (SEQ_GT(tp->snd_fack, temp->end))
+					new += tp->snd_fack - temp->end;
+			}
+	/* Change in the scoreboard in # of bytes */
+	tp->diff_in_sack = new - old;
 }
 
 /*
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 5d37b50..089d8c6 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -801,6 +801,7 @@ tcp_newtcpcb(struct inpcb *inp)
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+	tp->diff_in_sack = 0;
 	tp->t_rcvtime = ticks;
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index aaaa4a4..fe1507e 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -161,6 +161,11 @@ struct tcpcb {
 	u_long	t_rttupdated;		/* number of times rtt sampled */
 	u_long	max_sndwnd;		/* largest window peer has offered */
 
+	tcp_seq	prr_delivered;		/* bytes delivered during PRR recovery */
+	tcp_seq	prr_out;		/* bytes sent during PRR recovery */
+	tcp_seq	recover_fs;		/* FlightSize at the start of PRR recovery */
+	int	diff_in_sack;		/* signed change in SACKed bytes in the scoreboard due to the current ACK */
+
 	int	t_softerror;		/* possible error not yet reported */
 /* out-of-band data */
 	char	t_oobflags;		/* have some */
@@ -174,6 +179,7 @@ struct tcpcb {
 	u_int32_t  ts_offset;		/* our timestamp offset */
 
 	tcp_seq	last_ack_sent;
+
 /* experimental */
 	u_long	snd_cwnd_prev;		/* cwnd prior to retransmit */
 	u_long	snd_ssthresh_prev;	/* ssthresh prior to retransmit */
@@ -627,8 +633,10 @@ VNET_DECLARE(int, tcp_abc_l_var);
 #define	V_tcp_abc_l_var		VNET(tcp_abc_l_var)
 
 VNET_DECLARE(int, tcp_do_sack);			/* SACK enabled/disabled */
+VNET_DECLARE(int, tcp_do_prr);			/* PRR enabled/disabled */
 VNET_DECLARE(int, tcp_sc_rst_sock_fail);	/* RST on sock alloc failure */
 #define	V_tcp_do_sack		VNET(tcp_do_sack)
+#define	V_tcp_do_prr		VNET(tcp_do_prr)
 #define	V_tcp_sc_rst_sock_fail	VNET(tcp_sc_rst_sock_fail)
 
 VNET_DECLARE(int, tcp_do_ecn);			/* TCP ECN enabled/disabled */

--Apple-Mail=_E3102E34-B41D-47A0-AB2F-5E03D718EBE4
Content-Disposition: attachment;
	filename=newcwv.patch
Content-Type: application/octet-stream;
	name="newcwv.patch"
Content-Transfer-Encoding: 7bit

diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 75609fd..0d11d9f 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -145,6 +145,12 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
     &VNET_NAME(drop_synfin), 0,
     "Drop TCP packets with SYN+FIN set");
 
+VNET_DEFINE(int, tcp_do_ncwv) = 0;
+#define V_tcp_do_ncwv  VNET(tcp_do_ncwv)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, do_ncwv, CTLFLAG_RW,
+    &VNET_NAME(tcp_do_ncwv), 0,
+    "Do New-CWV targeted to rate-limited applications");
+
 VNET_DEFINE(int, tcp_do_rfc3042) = 1;
 #define	V_tcp_do_rfc3042	VNET(tcp_do_rfc3042)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
@@ -228,6 +234,7 @@ static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 static void	 tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
 static void	 tcp_xmit_timer(struct tcpcb *, int);
+static void  ncwv_check_phase(struct tcpcb *, struct tcphdr *);
 static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
 static void inline 	tcp_fields_to_host(struct tcphdr *);
 #ifdef TCP_SIGNATURE
@@ -289,6 +296,7 @@ static void inline
 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
+	int use_cc_algo=1;
 
 	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
 	if (tp->snd_cwnd <= tp->snd_wnd)
@@ -310,7 +318,12 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
 		}
 	}
 
-	if (CC_ALGO(tp)->ack_received != NULL) {
+	if (V_tcp_do_ncwv) {
+		ncwv_check_phase(tp,th);
+		if (!IN_VALPHASE(tp->t_flags))
+			use_cc_algo = 0;
+	}
+	if (use_cc_algo && CC_ALGO(tp)->ack_received != NULL) {
 		/* XXXLAS: Find a way to live without this */
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->ack_received(tp->ccv, type);
@@ -384,6 +397,10 @@ cc_conn_init(struct tcpcb *tp)
 			tp->snd_cwnd = 4 * tp->t_maxseg;
 	}
 
+	if (V_tcp_do_ncwv) {
+		tp->max_ack_prev = tp->iss;
+		tp->IW = tp->snd_cwnd;
+	}
 	if (CC_ALGO(tp)->conn_init != NULL)
 		CC_ALGO(tp)->conn_init(tp->ccv);
 }
@@ -433,6 +450,28 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
 		break;
 	}
 
+	/*
+	 * New-CWV: on any signal other than CC_DUPACK or CC_RTO_ERR, leave
+	 * the validated phase, stop the non-validated period (NVP) timer and
+	 * record the flight size so it can be used when recovery ends.
+	 */
+	if (V_tcp_do_ncwv && type != CC_DUPACK && type != CC_RTO_ERR) {
+		tp->lossflightsize = tp->snd_max - tp->snd_una;
+		tcp_timer_activate(tp, TT_NVP, 0);
+		EXIT_VALPHASE(tp->t_flags);
+		/*
+		 * Reduce cwnd from the current pipeACK sample before that
+		 * sample is reset below.  If pipeack were reset first,
+		 * max(pipeack, lossflightsize) would be at least the value
+		 * cwnd itself starts from, the min() would always pick
+		 * cwnd / 2, and the pipeACK term would have no effect.
+		 */
+		if (type != CC_RTO)
+			tp->snd_cwnd = min(tp->snd_cwnd / 2,
+			    max(tp->pipeack, tp->lossflightsize));
+		/* Restart pipeACK measurement from the maximum value. */
+		tp->pipeack = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+	}
 	if (CC_ALGO(tp)->cong_signal != NULL) {
 		if (th != NULL)
 			tp->ccv->curack = th->th_ack;
@@ -451,6 +490,16 @@ cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->post_recovery(tp->ccv);
 	}
+	if (V_tcp_do_ncwv) {
+		/*
+		 * Recovery is ending: reset pipeACK and, when SACK is in
+		 * use, set cwnd = (FlightSize - R) / 2 as the draft says.
+		 */
+		tp->pipeack = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+		if (tp->t_flags & TF_SACK_PERMIT)
+			tp->snd_cwnd = (tp->lossflightsize -
+			    tp->sackhint.sack_bytes_rexmit) / 2;
+	}
 	/* XXXLAS: EXIT_RECOVERY ? */
 	tp->t_bytes_acked = 0;
 }
@@ -3692,6 +3741,42 @@ tcp_mssopt(struct in_conninfo *inc)
 	return (mss);
 }
 
+/*
+ * New-CWV phase check, called from cc_ack_received() when enabled.
+ *
+ * Takes a new pipeACK sample and re-arms the pipeACK timer when the ACK
+ * reaches max_ack_prev.  Then enters the validated phase (stopping the
+ * NVP timer) if pipeACK is at least half of cwnd; otherwise leaves it
+ * and starts the NVP timer unless it is already running.
+ */
+static void
+ncwv_check_phase(struct tcpcb *tp, struct tcphdr *th)
+{
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	/* Sequence numbers wrap, so compare with SEQ_GEQ rather than ">=". */
+	if (SEQ_GEQ(th->th_ack, tp->max_ack_prev)) {
+		tp->pipeack = tp->snd_max - tp->snd_una;
+		tp->max_ack_prev = th->th_ack;
+		/*
+		 * Timer deltas are in ticks and t_srtt is scaled by
+		 * TCP_RTT_SHIFT, so unscale it and use hz for the one-second
+		 * floor: the period is max(3 * RTT, 1 second).
+		 */
+		tcp_timer_activate(tp, TT_PACK,
+		    max(3 * (tp->t_srtt >> TCP_RTT_SHIFT), hz));
+	}
+	if (tp->pipeack >= (tp->snd_cwnd / 2)) {
+		ENTER_VALPHASE(tp->t_flags);
+		if (tcp_timer_active(tp, TT_NVP))
+			tcp_timer_activate(tp, TT_NVP, 0);
+	} else {
+		EXIT_VALPHASE(tp->t_flags);
+		/* The non-validated period lasts five minutes. */
+		if (!tcp_timer_active(tp, TT_NVP))
+			tcp_timer_activate(tp, TT_NVP, 300 * hz);
+	}
+}
 
 /*
  * On a partial ack arrives, force the retransmission of the
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 00d5415..fc37006 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -158,8 +158,10 @@ cc_after_idle(struct tcpcb *tp)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
-	if (CC_ALGO(tp)->after_idle != NULL)
+	if (!V_tcp_do_ncwv && CC_ALGO(tp)->after_idle != NULL)
 		CC_ALGO(tp)->after_idle(tp->ccv);
+	else if(V_tcp_do_ncwv)
+		tp->pipeack = 0;
 }
 
 /*
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 5d37b50..bd4151e 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -783,6 +783,8 @@ tcp_newtcpcb(struct inpcb *inp)
 	callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
+	callout_init(&tp->t_timers->tt_nvp, CALLOUT_MPSAFE);
+	callout_init(&tp->t_timers->tt_pack, CALLOUT_MPSAFE);
 
 	if (V_tcp_do_rfc1323)
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
@@ -802,6 +804,10 @@ tcp_newtcpcb(struct inpcb *inp)
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
+	if (V_tcp_do_ncwv) {
+		tp->lossflightsize = 0;
+		tp->pipeack = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+	}
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
@@ -931,6 +937,8 @@ tcp_discardcb(struct tcpcb *tp)
 	callout_stop(&tp->t_timers->tt_keep);
 	callout_stop(&tp->t_timers->tt_2msl);
 	callout_stop(&tp->t_timers->tt_delack);
+	callout_stop(&tp->t_timers->tt_nvp);
+	callout_stop(&tp->t_timers->tt_pack);
 
 	/*
 	 * If we got enough samples through the srtt filter,
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 7c27397..a4f211f 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -646,6 +646,54 @@ out:
 }
 
 void
+tcp_timer_nvp(void *xtp)
+{
+	struct tcpcb *tp = xtp;
+	struct inpcb *inp;
+	CURVNET_SET(tp->t_vnet);
+	INP_INFO_WLOCK(&V_tcbinfo);
+	inp = tp->t_inpcb;
+	if (inp == NULL) {
+		tcp_timer_race++;
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		CURVNET_RESTORE();
+		return;
+	}
+	INP_WLOCK(inp);
+	callout_deactivate(&tp->t_timers->tt_nvp);
+	/* NVP expired: set ssthresh from the old cwnd, then halve cwnd. */
+	tp->snd_ssthresh = max(tp->snd_ssthresh, 3 * tp->snd_cwnd / 4);
+	tp->snd_cwnd = max(tp->snd_cwnd / 2, tp->IW);
+	INP_WUNLOCK(inp);
+	INP_INFO_WUNLOCK(&V_tcbinfo);
+	CURVNET_RESTORE();
+}
+
+void
+tcp_timer_pack(void *xtp)
+{
+	struct tcpcb *tp = xtp;
+	struct inpcb *inp;
+	CURVNET_SET(tp->t_vnet);
+	INP_INFO_WLOCK(&V_tcbinfo);
+	inp = tp->t_inpcb;
+	if (inp == NULL) {
+		tcp_timer_race++;
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		CURVNET_RESTORE();
+		return;
+	}
+	INP_WLOCK(inp);
+	callout_deactivate(&tp->t_timers->tt_pack);
+	/* NOTE(review): ssthresh below sees the halved cwnd; confirm intent. */
+	tp->snd_cwnd = max(tp->snd_cwnd / 2, tp->IW);
+	tp->snd_ssthresh = max(tp->snd_ssthresh, 3 * tp->snd_cwnd / 4);
+	INP_WUNLOCK(inp);
+	INP_INFO_WUNLOCK(&V_tcbinfo);
+	CURVNET_RESTORE();
+}
+
+void
 tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
 {
 	struct callout *t_callout;
@@ -679,6 +727,14 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
 			t_callout = &tp->t_timers->tt_2msl;
 			f_callout = tcp_timer_2msl;
 			break;
+		case TT_NVP:
+			t_callout = &tp->t_timers->tt_nvp;
+			f_callout = tcp_timer_nvp;
+			break;
+		case TT_PACK:
+			t_callout = &tp->t_timers->tt_pack;
+			f_callout = tcp_timer_pack;
+			break;
 		default:
 			panic("bad timer_type");
 		}
@@ -710,6 +766,12 @@ tcp_timer_active(struct tcpcb *tp, int timer_type)
 		case TT_2MSL:
 			t_callout = &tp->t_timers->tt_2msl;
 			break;
+		case TT_NVP:
+			t_callout = &tp->t_timers->tt_nvp;
+			break;
+		case TT_PACK:
+			t_callout = &tp->t_timers->tt_pack;
+			break;
 		default:
 			panic("bad timer_type");
 		}
@@ -738,5 +800,10 @@ tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
 		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
 	if (callout_active(&timer->tt_2msl))
 		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
+	/* Convert the remaining time to ms the same way as the timers above. */
+	if (callout_active(&timer->tt_nvp))
+		xtimer->tt_nvp = (timer->tt_nvp.c_time - now) / SBT_1MS;
+	if (callout_active(&timer->tt_pack))
+		xtimer->tt_pack = (timer->tt_pack.c_time - now) / SBT_1MS;
 	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
 }
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
index 3115fb3..7d038e8 100644
--- a/sys/netinet/tcp_timer.h
+++ b/sys/netinet/tcp_timer.h
@@ -146,12 +146,16 @@ struct tcp_timer {
 	struct	callout tt_keep;	/* keepalive */
 	struct	callout tt_2msl;	/* 2*msl TIME_WAIT timer */
 	struct	callout tt_delack;	/* delayed ACK timer */
+	struct 	callout tt_nvp;     /* non validated timer period */
+	struct  callout tt_pack; /* timer for pipeack measurement */
 };
 #define TT_DELACK	0x01
 #define TT_REXMT	0x02
 #define TT_PERSIST	0x04
 #define TT_KEEP		0x08
 #define TT_2MSL		0x10
+#define TT_NVP		0x20
+#define TT_PACK		0x40
 
 #define	TP_KEEPINIT(tp)	((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
 #define	TP_KEEPIDLE(tp)	((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
@@ -183,6 +187,8 @@ void	tcp_timer_keep(void *xtp);
 void	tcp_timer_persist(void *xtp);
 void	tcp_timer_rexmt(void *xtp);
 void	tcp_timer_delack(void *xtp);
+void	tcp_timer_nvp(void *xtp);
+void	tcp_timer_pack(void *xtp);
 void	tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
 	struct xtcp_timer *xtimer);
 
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index aaaa4a4..c8c148f 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -137,7 +137,7 @@ struct tcpcb {
 					 * for slow start exponential to
 					 * linear switch
 					 */
-	u_long	snd_spare2;		/* unused */
+	u_long	IW;		/* initial cong window */
 	tcp_seq	snd_recover;		/* for use in NewReno Fast Recovery */
 
 	u_int	t_maxopd;		/* mss plus options */
@@ -147,8 +147,9 @@ struct tcpcb {
 	u_int	t_rtttime;		/* RTT measurement start time */
 	tcp_seq	t_rtseq;		/* sequence number being timed */
 
-	u_int	t_bw_spare1;		/* unused */
-	tcp_seq	t_bw_spare2;		/* unused */
+	u_long	lossflightsize;	/* flight size at the beginning of the current recovery event */
+	u_long	pipeack;		/* New-CWV pipeACK: amount of data acked per RTT */
+	tcp_seq	max_ack_prev;	/* th_ack of the last pipeACK sample (starts at iss) */
 
 	int	t_rxtcur;		/* current retransmit value (ticks) */
 	u_int	t_maxseg;		/* maximum segment size */
@@ -247,6 +248,11 @@ struct tcpcb {
 #define	TF_ECN_SND_ECE	0x10000000	/* ECN ECE in queue */
 #define	TF_CONGRECOVERY	0x20000000	/* congestion recovery mode */
 #define	TF_WASCRECOVERY	0x40000000	/* was in congestion recovery */
+#define TF_RLIMPHASE 0x80000000     /* ncwv phase */
+
+#define	IN_VALPHASE(t_flags)	(t_flags & TF_RLIMPHASE)
+#define	ENTER_VALPHASE(t_flags)	t_flags |= TF_RLIMPHASE
+#define	EXIT_VALPHASE(t_flags)	t_flags &= ~TF_RLIMPHASE
 
 #define	IN_FASTRECOVERY(t_flags)	(t_flags & TF_FASTRECOVERY)
 #define	ENTER_FASTRECOVERY(t_flags)	t_flags |= TF_FASTRECOVERY
@@ -561,6 +567,8 @@ struct xtcp_timer {
 	int tt_keep;	/* keepalive */
 	int tt_2msl;	/* 2*msl TIME_WAIT timer */
 	int tt_delack;	/* delayed ACK timer */
+	int tt_nvp;		/* non-Validation period timer */
+	int tt_pack;	/* pipeack sample timer */
 	int t_rcvtime;	/* Time since last packet received */
 };
 struct	xtcpcb {
@@ -627,8 +635,10 @@ VNET_DECLARE(int, tcp_abc_l_var);
 #define	V_tcp_abc_l_var		VNET(tcp_abc_l_var)
 
 VNET_DECLARE(int, tcp_do_sack);			/* SACK enabled/disabled */
+VNET_DECLARE(int, tcp_do_ncwv);			/* New-CWV enabled/disabled */
 VNET_DECLARE(int, tcp_sc_rst_sock_fail);	/* RST on sock alloc failure */
 #define	V_tcp_do_sack		VNET(tcp_do_sack)
+#define	V_tcp_do_ncwv		VNET(tcp_do_ncwv)
 #define	V_tcp_sc_rst_sock_fail	VNET(tcp_sc_rst_sock_fail)
 
 VNET_DECLARE(int, tcp_do_ecn);			/* TCP ECN enabled/disabled */

--Apple-Mail=_E3102E34-B41D-47A0-AB2F-5E03D718EBE4
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=us-ascii



--Apple-Mail=_E3102E34-B41D-47A0-AB2F-5E03D718EBE4--

--Apple-Mail=_5088A38A-DF43-450D-9E03-31C51BD176C7
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="signature.asc"
Content-Type: application/pgp-signature; name="signature.asc"
Content-Description: Message signed with OpenPGP using GPGMail

-----BEGIN PGP SIGNATURE-----

iQCVAwUBUvC1ENZcnpRveo1xAQKL0AQApMvaMxZPDSPEuEkTD2YRg8Q0YSYiSS7I
PT0PE/sOUZ8k9kx2K78APzb8uZ3rnBnvhi9sRskd1m0iWHwTROnbKbkxz6PVQSHQ
L7OCcUbAZkHGI/t3NTpxAPS5b8MZs81OUpjKFezJvnU3qvXObsN81Oh5u/eUZB2A
p259UbG6SaY=
=pyap
-----END PGP SIGNATURE-----

--Apple-Mail=_5088A38A-DF43-450D-9E03-31C51BD176C7--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?259C9434-C6FE-42EA-823D-ECB024DBF3D7>