From owner-p4-projects@FreeBSD.ORG Wed Jan 16 21:45:39 2008 Return-Path: Delivered-To: p4-projects@freebsd.org Received: by hub.freebsd.org (Postfix, from userid 32767) id 68CFF16A420; Wed, 16 Jan 2008 21:45:39 +0000 (UTC) Delivered-To: perforce@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 14D9C16A419 for ; Wed, 16 Jan 2008 21:45:39 +0000 (UTC) (envelope-from andre@freebsd.org) Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29]) by mx1.freebsd.org (Postfix) with ESMTP id 03DF313C4F6 for ; Wed, 16 Jan 2008 21:45:39 +0000 (UTC) (envelope-from andre@freebsd.org) Received: from repoman.freebsd.org (localhost [127.0.0.1]) by repoman.freebsd.org (8.14.1/8.14.1) with ESMTP id m0GLjcV8036758 for ; Wed, 16 Jan 2008 21:45:38 GMT (envelope-from andre@freebsd.org) Received: (from perforce@localhost) by repoman.freebsd.org (8.14.1/8.14.1/Submit) id m0GLjctZ036755 for perforce@freebsd.org; Wed, 16 Jan 2008 21:45:38 GMT (envelope-from andre@freebsd.org) Date: Wed, 16 Jan 2008 21:45:38 GMT Message-Id: <200801162145.m0GLjctZ036755@repoman.freebsd.org> X-Authentication-Warning: repoman.freebsd.org: perforce set sender to andre@freebsd.org using -f From: Andre Oppermann To: Perforce Change Reviews Cc: Subject: PERFORCE change 133432 for review X-BeenThere: p4-projects@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: p4 projects tree changes List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 16 Jan 2008 21:45:39 -0000 http://perforce.freebsd.org/chv.cgi?CH=133432 Change 133432 by andre@andre_flirtbox on 2008/01/16 21:45:36 Handle all edge cases. Add SEQ_DELTA() function to ease segment trimming. Affected files ... .. //depot/projects/tcp_reass/netinet/tcp_reass.c#7 edit .. //depot/projects/tcp_reass/netinet/tcp_seq.h#2 edit Differences ... ==== //depot/projects/tcp_reass/netinet/tcp_reass.c#7 (text+ko) ==== @@ -100,23 +100,25 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); -static int tcp_reass_maxseg = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, - &tcp_reass_maxseg, 0, +static int tcp_reass_maxblocks = 0; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN, + &tcp_reass_maxblocks, 0, "Global maximum number of TCP Segment Blocks in Reassembly Queue"); int tcp_reass_qsize = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD, &tcp_reass_qsize, 0, "Global number of TCP Segment Blocks currently in Reassembly Queue"); +static void tcp_reass_merge(struct tcpcb *, int *, struct trq *, struct trq *); + /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) { - tcp_reass_maxseg = nmbclusters / 16; - uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); + tcp_reass_maxblocks = nmbclusters / 16; + uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks); } uma_zone_t tcp_reass_zone; @@ -126,12 +128,12 @@ { /* XXX: nmbclusters may be zero. */ - tcp_reass_maxseg = nmbclusters / 16; - TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", - &tcp_reass_maxseg); + tcp_reass_maxblocks = nmbclusters / 16; + TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks", + &tcp_reass_maxblocks); tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); + uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -150,11 +152,21 @@ * Call with th==NULL after becoming established to * force pre-ESTABLISHED data up to user socket. */ - if (th == NULL) + if (th == NULL) { + if (!TCPS_HAVEESTABLISHED(tp->t_state) || + TAILQ_EMPTY(&tp->t_trq) || + ((tqe = TAILQ_FIRST(&tp->t_trq)) && + tqe->trq_seq != tp->rcv_nxt)) + return (0); goto present; + } - KASSERT(SEQ_LEQ(tp->rcv_nxt, th->th_seq), + KASSERT(*tlenp > 0, + ("%s: segment doesn't contain any data", __func__)); + KASSERT(SEQ_GT(tp->rcv_nxt, th->th_seq), ("%s: sequence number below rcv_nxt", __func__)); + KASSERT(!(tp->rcv_nxt == th->th_seq) || !(TAILQ_EMPTY(&tp->t_trq)), + ("%s: got missing segment but queue is empty", __func__)); /* * Limit the number of segments in the reassembly queue to prevent @@ -183,13 +195,22 @@ return (0); } + /* Get rid of packet header and mtags. */ + m_demote(m, 1); + +#if 0 + /* Trim empty mbufs from head of chain. */ + while (m->m_len == 0) { + n = m; + m = m->m_next; + m_free(n); + } +#endif + /* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */ for (mcnt = 0, n = m; n; n = n->m_next) mcnt += (n->m_flags & M_EXT) ? n->m_ext.ext_size : MSIZE; - /* Get rid of packet header and mtags. */ - m_demote(m, 1); - /* Check if this segment attaches to the end. */ tqe = TAILQ_LAST(&tp->t_trq, trq_head); if (tqe && tqe->trq_seq + tqe->trq_len == th->th_seq) { @@ -204,27 +225,33 @@ return (0); } + /* Check if beyond last block. */ + if (tqe && SEQ_LT(tqe->trq_seq + tqe->trq_len, th->th_seq)) + goto insert; + + /* Check if this is the first segment. */ + if (TAILQ_EMPTY(&tp->t_trq)) + goto insert; + /* Check if this is the missing segment. */ if (tp->rcv_nxt == th->th_seq) { tqe = TAILQ_FIRST(&tp->t_trq); - KASSERT(tqe != NULL, - ("%s: missing segment but nothing in queue", __func__)); - KASSERT(SEQ_GT(tqe->trq_seq, th->th_seq), + KASSERT(SEQ_LEQ(tqe->trq_seq, th->th_seq), ("%s: first block starts below missing segment", __func__)); - if (SEQ_LT(tqe->trq_seq, th->th_seq + *tlenp)) { + /* Check if segment prepends first block. */ + if (SEQ_LEQ(tqe->trq_seq, th->th_seq + *tlenp)) { /* Trim tail of segment. */ - if ((i = tqe->trq_seq - (th->th_seq + *tlenp))) { - m_adj(m, i); - *tlenp += i; /* NB: i is negative */ + if ((i = SEQ_DELTA(tqe->trq_seq, th->th_seq + *tlenp))) { + m_adj(m, -i); + *tlenp -= i; /* TCP statistics. */ tcpstat.tcps_rcvpartduppack++; - tcpstat.tcps_rcvpartdupbyte -= i; + tcpstat.tcps_rcvpartdupbyte += i; /* Update accounting. */ for (mcnt = 0, n = m; n; n = n->m_next) mcnt += (n->m_flags & M_EXT) ? n->m_ext.ext_size : MSIZE; } - /* Segment prepends first block. */ tqe->trq_len += *tlenp; tqe->trq_mcnt += mcnt; tp->t_trqmcnt += mcnt; @@ -234,7 +261,7 @@ tqe->trq_m = m; goto present; } - goto insert; + goto insert; /* No statistics, this segment is in line. */ } /* TCP statistics. */ @@ -242,13 +269,19 @@ tcpstat.tcps_rcvoobyte += *tlenp; /* See where it fits. */ - TAILQ_FOREACH(tqe, &tp->t_trq, trq_q) { + TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) { +#if 1 /* Segment is after this blocks coverage. */ if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th->th_seq)) continue; +#endif /* Segment is after the previous one but before this one. */ if (SEQ_GT(tqe->trq_seq, th->th_seq + *tlenp)) break; /* Insert as new block. */ + + KASSERT(SEQ_GT(tqe->trq_seq, th->th_seq + *tlenp), + ("%s: iterated past insert point", __func__)); + /* Segment is already fully covered. */ if (SEQ_LEQ(tqe->trq_seq, th->th_seq) && SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th->th_seq + *tlenp)) { @@ -258,54 +291,36 @@ *tlenp = 0; return (0); } - /* Segment appends to this block. */ - if (SEQ_LEQ(tqe->trq_seq + tqe->trq_len, th->th_seq)) { - /* Trim head of segment. */ - if ((i = tqe->trq_seq + tqe->trq_len - th->th_seq)) { - m_adj(m, i); - *tlenp -= i; - /* TCP Statistics. */ - tcpstat.tcps_rcvpartduppack++; - tcpstat.tcps_rcvpartdupbyte += i; - /* XXX dupes */ - } - tqe->trq_len += *tlenp; - tqe->trq_mcnt += mcnt; + + /* Segment covers and extends on both ends. */ + if (SEQ_GEQ(tqe->trq_seq, th->th_seq) && + SEQ_LT(tqe->trq_seq + tqe->trq_len, th->th_seq + *tlenp)) { + /* Replace block content. */ + tp->t_trqmcnt -= tqe->trq_mcnt; + m_freem(tqe->trq_m); + tqe->trq_len = *tlenp; + tqe->trq_mcnt = mcnt; tp->t_trqmcnt += mcnt; - tqe->trq_ml->m_next = m; + tqe->trq_seq = th->th_seq; + tqe->trq_m = m; tqe->trq_ml = m_last(m); - /* Check if segment bridges two blocks to merge. */ - if ((tqen = TAILQ_NEXT(tqe, trq_q)) && - SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq)) { - /* Trim head of next block. */ - /* XXXAO: Should trim tail of segment instead. */ - if ((i = tqe->trq_seq + tqe->trq_len - - tqen->trq_seq)) { - m_adj(tqen->trq_m, i); - tqen->trq_len -= i; - /* TCP statistics. */ - tcpstat.tcps_rcvpartduppack++; - tcpstat.tcps_rcvpartdupbyte += i; - } - tqe->trq_len += tqen->trq_len; - tqe->trq_mcnt += tqen->trq_mcnt; - tqe->trq_ml->m_next = tqen->trq_m; - tqe->trq_ml = tqen->trq_ml; - TAILQ_REMOVE(&tp->t_trq, tqen, trq_q); - uma_zfree(tcp_reass_zone, tqen); - tcp_reass_qsize--; - } + /* Check if segment bridges next block to merge. */ + if (tqen != NULL && + SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq)) + tcp_reass_merge(tp, tlenp, tqe, tqen); return (0); } - /* Segment prepends. */ - if (SEQ_GT(tqe->trq_seq, th->th_seq)) { + + /* Segment prepends to this block. */ + if (SEQ_GT(tqe->trq_seq, th->th_seq) && + SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th->th_seq + *tlenp)) { /* Trim tail of segment. */ - if ((i = tqe->trq_seq - (th->th_seq + *tlenp))) { - m_adj(m, i); - *tlenp += i; /* NB: i is negative */ + if ((i = SEQ_DELTA(tqe->trq_seq, th->th_seq + *tlenp))) { + m_adj(m, -i); + *tlenp -= i; /* TCP statistics. */ tcpstat.tcps_rcvpartduppack++; - tcpstat.tcps_rcvpartdupbyte -= i; + tcpstat.tcps_rcvpartdupbyte += i; /* Update accounting. */ for (mcnt = 0, n = m; n; n = n->m_next) mcnt += (n->m_flags & M_EXT) ? @@ -320,6 +335,29 @@ tqe->trq_m = m; return (0); } + + /* Segment appends to this block. */ + if (SEQ_LEQ(tqe->trq_seq + tqe->trq_len, th->th_seq) && + SEQ_LT(tqe->trq_seq + tqe->trq_len, th->th_seq + *tlenp)) { + /* Trim head of segment. */ + if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, th->th_seq))) { + m_adj(m, i); + *tlenp -= i; + /* TCP Statistics. */ + tcpstat.tcps_rcvpartduppack++; + tcpstat.tcps_rcvpartdupbyte += i; + } + tqe->trq_len += *tlenp; + tqe->trq_mcnt += mcnt; + tp->t_trqmcnt += mcnt; + tqe->trq_ml->m_next = m; + tqe->trq_ml = m_last(m); + /* Check if segment bridges two blocks to merge. */ + if (tqen != NULL && + SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq)) + tcp_reass_merge(tp, tlenp, tqe, tqen); + return (0); + } } insert: @@ -340,10 +378,15 @@ tqen->trq_ml = m_last(m); /* Where to insert. */ - if (tqe) + if (tqe != NULL && SEQ_LT(tqe->trq_seq + tqe->trq_len, th->th_seq)) + TAILQ_INSERT_TAIL(&tp->t_trq, tqen, trq_q); + else if (tqe != NULL) TAILQ_INSERT_BEFORE(tqe, tqen, trq_q); - else + else { + KASSERT(TAILQ_EMPTY(&tp->t_trq), + ("%s: queue not empty", __func__)); TAILQ_INSERT_HEAD(&tp->t_trq, tqen, trq_q); + } /* Missing segment? */ if (tp->rcv_nxt != th->th_seq) @@ -353,25 +396,21 @@ * Present data to user, advancing rcv_nxt through * completed sequence space. */ - if (!TCPS_HAVEESTABLISHED(tp->t_state)) - return (0); - tqe = TAILQ_FIRST(&tp->t_trq); - if (tqe == NULL || tqe->trq_seq != tp->rcv_nxt) - return (0); SOCKBUF_LOCK(&so->so_rcv); TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) { - /* We can never go more than one round. */ + KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt), + ("%s: trq_seq < rcv_nxt", __func__)); if (tqe->trq_seq != tp->rcv_nxt) break; #if 1 /* XXX: This is bogus if we had a FIN. */ - flags = tqe->trq_flags & TH_FIN; + flags = tqe->trq_flags; #endif - tp->rcv_nxt += tqe->trq_len; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(tqe->trq_m); else sbappendstream_locked(&so->so_rcv, tqe->trq_m); + tp->rcv_nxt += tqe->trq_len; tp->t_trqmcnt -= tqe->trq_mcnt; TAILQ_REMOVE(&tp->t_trq, tqe, trq_q); uma_zfree(tcp_reass_zone, tqe); @@ -387,11 +426,64 @@ #endif } +static void +tcp_reass_merge(struct tcpcb *tp, int *tlenp, struct trq *tqe, struct trq *tqen) +{ +#if 0 + struct mbuf *m; +#endif + int i; + + KASSERT(SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq), + ("%s: blocks do not overlap, nothing to merge", __func__)); + + /* Appended block may reach beyond next block. */ + while (SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq + tqen->trq_len)) { + tcpstat.tcps_rcvpartdupbyte += tqen->trq_len; /* Statistics */ + tp->t_trqmcnt -= tqe->trq_mcnt; + m_freem(tqen->trq_m); + TAILQ_REMOVE(&tp->t_trq, tqen, trq_q); + uma_zfree(tcp_reass_zone, tqen); + tcp_reass_qsize--; + /* And the one after that. */ + if ((tqen = TAILQ_NEXT(tqe, trq_q)) == NULL) + return; + } + + /* Trim head of next block. */ + if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))) { + m_adj(tqen->trq_m, i); + tqen->trq_len -= i; + tcpstat.tcps_rcvpartdupbyte += i; /* Statistics */ +#if 0 + /* Trim ... */ + while (tqen->trq_m->m_len == 0) { + m = tqen->trq_m; + tqen->trq_m = tqen->trq_m->m_next; + /* mcnt */ + m_free(m); + } +#endif + KASSERT(tqen->trq_m != NULL, + ("%s: no remaining mbufs in block", __func__)); + } + + /* Merge blocks together. */ + tqe->trq_len += tqen->trq_len; + tqe->trq_mcnt += tqen->trq_mcnt; + tqe->trq_ml->m_next = tqen->trq_m; + tqe->trq_ml = tqen->trq_ml; + TAILQ_REMOVE(&tp->t_trq, tqen, trq_q); + uma_zfree(tcp_reass_zone, tqen); + tcp_reass_qsize--; +} + /* * Free the reassembly queue on tcpcb free and on general memory shortage. */ void -tcp_reass_qfree(struct tcpcb *tp) { +tcp_reass_qfree(struct tcpcb *tp) +{ struct trq *tqe, *tqen; INP_LOCK_ASSERT(tp->t_inpcb); ==== //depot/projects/tcp_reass/netinet/tcp_seq.h#2 (text+ko) ==== @@ -44,6 +44,7 @@ #define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) #define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b)) +#define SEQ_DELTA(a,b) ((int)SEQ_MAX(a, b) - SEQ_MIN(a, b)) /* for modulo comparisons of timestamps */ #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)