Date: Wed, 1 Jul 2009 08:00:45 GMT From: Andre Oppermann <andre@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 165494 for review Message-ID: <200907010800.n6180jQT057236@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=165494 Change 165494 by andre@andre_t61 on 2009/07/01 08:00:14 Dump of WIP from my Laptop. Still much shuffling. Picture is refining a bit all the time like a progressive jpeg. Affected files ... .. //depot/projects/tcp_new/netinet/tcp_output.c#9 edit .. //depot/projects/tcp_new/netinet/tcp_syncache.c#4 edit .. //depot/projects/tcp_new/netinet/tcp_var.h#9 edit Differences ... ==== //depot/projects/tcp_new/netinet/tcp_output.c#9 (text+ko) ==== @@ -100,6 +100,20 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, &tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); +static int +tcp_send(struct tcpcb *tp, struct tcpopt *to, int len, int rwin, int flags); +static int +tcp_retransmit(struct tcpcb *tp, int *len); +static int +tcp_send_segments(struct tcpcb *tp, struct tcphdr *ths, struct tcpopt *opt, + int off, int *olen, int optlen); +static u_int +tcp_rcv_wnd(struct tcpcb *tp, struct socket *so); +static void +tcp_snd_pace(struct tcpcp *tp); +static void +tcp_options(struct tcpcb *tp, struct tcpopt *to, int flags); + /* * Tcp output routine: figure out what should be sent and send it. * @@ -157,30 +171,6 @@ flags = tcp_outflags[tp->t_state]; /* - * Determine our current receive window. - * This value is used for the window field in the TCP - * header and to determine whether we have to send a - * window update. - * - * NB: rwin is already downscaled. - */ - rwin = tcp_rcv_wnd(tp, so); - - /* - * We have been idle for "a while" and no acks are - * expected to clock out any data we send -- - * slow start to get ack "clock" running again. - * RFC2581: Restart window. - * - * XXXAO: Use a decaying algorithm. It's not useful - * to have cwnd to drop of a cliff. See RFC2861. 
- */ - if (tp->snd_nxt == tp->snd_una && - (ticks - tp->t_rcvtime) >= max(tp->t_rxtcur, tcp_min_idle)) { - tp->snd_cwnd = tcp_init_cwnd(tp); - } - - /* * Determine length of data that should be transmitted, if there * is some data to send, then transmit; otherwise, investigate further. * @@ -230,18 +220,74 @@ return (0); /* next token is pending */ } + /* + * Conservative approximation of data still travelling in the network. + */ inflight = duna - tp->snd_sacked; + /* + * Determine our current receive window. + * This value is used for the window field in the TCP + * header and to determine whether we have to send a + * window update. + * + * NB: rwin is already downscaled. + */ + rwin = tcp_rcv_wnd(tp, so); + + /* + * Act based on the phase we are in. + */ switch (tp->t_phase) { case TP_IDLE: - case TP_SLOWSTART: - case TP_CONGAVOID: + /* + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. + * RFC2581: Restart window. + * + * XXXAO: Use a decaying algorithm. It's not useful + * to have cwnd to drop of a cliff. See RFC2861. + */ + if (tp->snd_nxt == tp->snd_una && + (ticks - tp->t_rcvtime) >= max(tp->t_rxtcur, tcp_min_idle)) { + tp->snd_cwnd = tcp_init_cwnd(tp); + } + break; + case TP_SENDING: + break; case TP_LOSSRECOV: + case TP_REXMT: tcp_retransmit(tp, &len); if (len = 0) return (0); - case TP_LOSSREXMT: + break; case TP_PERSIST: + /* + * Persistent mode. + * Send out probe byte if there is data available. + * RFC793: section 3.7, page 42-44 + * RFC1122: section 4.2.2.17 + */ + if (swnd == 0 && dlen > 0 && (tp->t_flags & TF_FORCEDATA)) { + len = 1; + goto send; + } + if (swnd == 0 && duna > tp->snd_wnd) { + /* + * Window shrank + * after we sent into it. If window shrank to 0, + * cancel pending retransmit, pull snd_nxt back + * to (closed) window, and set the persist timer + * if it isn't already going. 
If the window didn't + * close completely, just wait for an ACK. + */ + tcp_timer_activate(tp, TT_REXMT, 0); + tp->t_rxtshift = 0; + if (!tcp_timer_active(tp, TT_PERSIST)) + tcp_setpersist(tp); + } + break; case TP_RETRY: case TP_URGENT: break; @@ -290,7 +336,7 @@ * a duplicate ACK (if the ack value didn't move forward). The * question whether the other implementations see it the same way. */ - if ((tp->t_flags & TF_DUPACK) && tp->snd_dupack > 0) { + if ((tp->t_flags & TF_DUPACK) && tp->rcv_trqlen > 0) { if (!(tp->t_flags & TF_SACK_PERMIT)) len = 0; goto send; @@ -318,26 +364,30 @@ * b) silly window syndrome: buffer almost full * * Quoting Nagle: - * <<The concept behind delayed ACKs is to bet, when receiving some data from the net, - * that the local application will send a reply very soon. So there's no need to - * send an ACK immediately; the ACK can be piggybacked on the next data going the - * other way. If that doesn't happen, after a 500ms delay, an ACK is sent anyway. - * The concept behind the Nagle algorithm is that if the sender is doing very tiny - * writes (like single bytes, from Telnet), there's no reason to have more than one - * packet outstanding on the connection. This prevents slow links from choking with - * huge numbers of outstanding tinygrams. - * Both are reasonable. But they interact badly in the case where an application does - * two or more small writes to a socket, then waits for a reply. (X-Windows is notorious - * for this.) When an application does that, the first write results in an immediate - * packet send. The second write is held up until the first is acknowledged. But because - * of the delayed ACK strategy, that acknowledgement is held up for 500ms. This adds - * 500ms of latency to the transaction, even on a LAN. - * The real problem is that 500ms unconditional delay. (Why 500ms? That was a reasonable - * response time for a time-sharing system of the 1980s.) 
As mentioned above, delaying - * an ACK is a bet that the local application will reply to the data just received. - * Some apps, like character echo in Telnet servers, do respond every time. Others, - * like X-Windows "clients" (really servers, but X is backwards about this), only reply - * some of the time.>> + * <<The concept behind delayed ACKs is to bet, when receiving some + * data from the net, that the local application will send a reply + * very soon. So there's no need to send an ACK immediately; + * the ACK can be piggybacked on the next data going the other way. + * If that doesn't happen, after a 500ms delay, an ACK is sent anyway. + * The concept behind the Nagle algorithm is that if the sender is + * doing very tiny writes (like single bytes, from Telnet), there's + * no reason to have more than one packet outstanding on the connection. + * This prevents slow links from choking with huge numbers of outstanding + * tinygrams. Both are reasonable. But they interact badly in the case + * where an application does two or more small writes to a socket, then + * waits for a reply. (X-Windows is notorious for this.) When an + * application does that, the first write results in an immediate + * packet send. The second write is held up until the first is + * acknowledged. But because of the delayed ACK strategy, that + * acknowledgement is held up for 500ms. This adds 500ms of latency + * to the transaction, even on a LAN. The real problem is that 500ms + * unconditional delay. (Why 500ms? That was a reasonable response + * time for a time-sharing system of the 1980s.) As mentioned above, + * delaying an ACK is a bet that the local application will reply to + * the data just received. Some apps, like character echo in Telnet + * servers, do respond every time. 
Others, like X-Windows "clients" + * (really servers, but X is backwards about this), only reply some + * of the time.>> * http://developers.slashdot.org/comments.pl?sid=174457&threshold=1&commentsort=0&mode=thread&cid=14515105 * * XXXAO: mss - options! @@ -372,31 +422,6 @@ } /* - * Persistent mode. - * Send out probe byte if there is data available. - * RFC793: section 3.7, page 42-44 - * RFC1122: section 4.2.2.17 - */ - if (swnd == 0 && dlen > 0 && (tp->t_flags & TF_FORCEDATA)) { - len = 1; - goto send; - } - if (swnd == 0 && duna > tp->snd_wnd) { - /* - * Window shrank - * after we sent into it. If window shrank to 0, - * cancel pending retransmit, pull snd_nxt back - * to (closed) window, and set the persist timer - * if it isn't already going. If the window didn't - * close completely, just wait for an ACK. - */ - tcp_timer_activate(tp, TT_REXMT, 0); - tp->t_rxtshift = 0; - if (!tcp_timer_active(tp, TT_PERSIST)) - tcp_setpersist(tp); - } - - /* * Send window update? * * The receive window informs the remote side about the @@ -457,7 +482,7 @@ return (tcp_send(tp, &to, flags)); } -int +static int tcp_send(struct tcpcb *tp, struct tcpopt *to, int len, int rwin, int flags) { @@ -470,6 +495,7 @@ * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. + * * NB: For now we don't send any data with SYN. This will have * to change if some reincarnation of T/TCP comes up again. */ @@ -492,25 +518,18 @@ else if (tp->t_flags & TF_DUPACK) th->th_win = (u_short)tp->rcv_advwin; else - th->th_win = (u_short)(rwin >> tp->rcv_scale); + th->th_win = (u_short)rwin; - SOCKBUF_LOCK(&so->so_snd); /* * Fill in fields. 
*/ - if (tp->snd_nxt == tp->snd_rxmit) { - th->th_seq = tp->snd_nxt; - off = tp->snd_nxt - tp->snd_una; - } else { - th->th_seq = tp->snd_rxmit; - off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc); - } - + th->th_seq = tp->snd_nxt; th->th_flags = flags; th->th_ack = tp->rcv_nxt; + SOCKBUF_LOCK(&so->so_snd); + off = tp->snd_nxt - tp->snd_una; error = tcp_send_segments(tp, &ths, opt, off, &len, optlen); - SOCKBUF_UNLOCK(&so->so_snd); /* @@ -735,7 +754,7 @@ static int tcp_retransmit(struct tcpcb *tp, int *len) { - struct tcphdr ths; + struct tcphdr ths, *th; /* * Retransmit over the SACK holes. @@ -744,6 +763,13 @@ * Retransmit only the stuff that was not SACK'ed. */ /* + * The moment we receive a duplicate ACK everything freezes. + * No more new data is sent except for those allowed by limited + * transmit. + * The fast recovery algorithms start their work by the third + * duplicate ACK. + */ + /* * We have the following mechanisms: * 1. Fast recovery: After we get three duplicate ACKs RFC2581 * 2. NewReno RFC3782 @@ -752,6 +778,28 @@ * 5. TCP congestion window validation RFC2861 */ + /* Limited transmit */ + if (tp->snd_dupack < 3) + *len = min(len, tp->snd_mss); /* one mss */ + else + *len = 0; + + if (tp->snd_dupack < 3) + return; + + /* + * XXXAO: Temporary. + */ + tp->snd_rxmit = tp->snd_una; + + /* + * Fill in headers. + */ + th->th_win = (u_short)rwin; + th->th_seq = tp->snd_rxmit; + th->th_flags = flags; + th->th_ack = tp->rcv_nxt; + /* * If resending a SYN or FIN, be sure NOT to use a new sequence number. */ @@ -761,21 +809,11 @@ th->th_seq == tp->snd_nxt) th->th_seq--; - /* - * The moment we receive a duplicate ACK everything freezes. - * No more new data is sent except for those allowed by limited - * transmit. - * The fast recovery algorithms start their work by the third - * duplicate ACK. 
- */ + SOCKBUF_LOCK(&so->so_snd); + off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc); error = tcp_send_segments(tp, &ths, opt, off, olen, optlen); + SOCKBUF_UNLOCK(&so->so_snd); - /* Limited transmit */ - if (tp->snd_dupack < 3) - *len = min(len, tp->snd_mss); /* one mss */ - else - *len = 0; - return; } @@ -879,6 +917,14 @@ ("%s: segment too big", __func__)); /* + * Do not send small fragments unless we empty the buffer + * or this is the only segment. + */ + if (slen < tp->snd_mss - optlen && *olen > 0 && + off + slen == so->so_snd.sb_cc) + break; + + /* * Allocate an mbuf sufficiently large to hold all * headers for this segment plus space for the link * headers to remove the need for prepends in the @@ -987,7 +1033,8 @@ * Set the PUSH bit to indicate that we have reached * the end of the send buffer. */ - if (slen > 0 && off + slen == so->so_snd.sb_cc) + if (slen > 0 && !(tp->t_flags & TF_MORETOCOME) && + off + slen == so->so_snd.sb_cc) th->th_flags |= TH_PUSH; KASSERT(off + slen <= so->so_snd.sb_cc, @@ -1178,7 +1225,7 @@ return; } -void +static void tcp_options(struct tcpcb *tp, struct tcpopt *to, int flags) { /* ==== //depot/projects/tcp_new/netinet/tcp_syncache.c#4 (text+ko) ==== @@ -768,6 +768,10 @@ goto abort; } } + + /* + * Initialize the TCP control block. 
+ */ tp = intotcpcb(inp); tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; ==== //depot/projects/tcp_new/netinet/tcp_var.h#9 (text+ko) ==== @@ -139,13 +139,12 @@ u_int t_phase; /* send phase we are currently in */ #define TP_IDLE 0 /* nothing to send */ -#define TP_SLOWSTART 1 /* slow start */ -#define TP_CONGAVOID 2 /* congestion avoidance */ -#define TP_LOSSRECOV 3 /* loss recovery */ -#define TP_LOSSREXMT 4 /* loss recovery failed, retransmit */ -#define TP_PERSIST 5 /* persistent mode */ -#define TP_RETRY 6 /* retry after ENOMEM or ENOBUF */ -#define TP_URGENT 7 /* urgent mode */ +#define TP_SENDING 1 /* sending data */ +#define TP_LOSSRECOV 2 /* loss recovery */ +#define TP_REXMT 3 /* loss recovery failed, retransmit */ +#define TP_PERSIST 4 /* persistent mode */ +#define TP_RETRY 5 /* retry after ENOMEM or ENOBUF */ +#define TP_URGENT 6 /* urgent mode */ int t_softerror; /* possible error not yet reported */ @@ -173,7 +172,7 @@ u_int snd_delackdelay; /* time to delay an ACK in ticks */ int snd_dupack; /* number of duplicate ACK's reveived */ - tcp_seq snd_fr_recover; /* fast retransmit recover */ + tcp_seq snd_recover; /* fast retransmit recover */ int snd_abcack; /* count the ack'ed data for ABC */ tcp_seq snd_rtseq; /* seq# of current RTT measurement */ @@ -200,6 +199,7 @@ uint8_t rcv_scale; /* window scaling for recv window */ struct trq_head rcv_trq; /* segment reassembly queue */ int rcv_trqlen; /* segment reassembly queue length in bytes */ + int rcv_dupack; /* duplicate acks we sent */ tcp_ts tsecr_recent; /* timestamp echo data */ u_long tsecr_age; /* when echo last updated */ @@ -247,7 +247,6 @@ tcp_win snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ tcp_win t_badrxtwin; /* window for retransmit recovery */ - int snd_limited; /* segments limited transmitted */ }; /*
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200907010800.n6180jQT057236>