From owner-p4-projects@FreeBSD.ORG Mon May 19 13:37:12 2008
Return-Path: 
Delivered-To: p4-projects@freebsd.org
Received: by hub.freebsd.org (Postfix, from userid 32767)
	id 6C4211065672; Mon, 19 May 2008 13:37:12 +0000 (UTC)
Delivered-To: perforce@freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34])
	by hub.freebsd.org (Postfix) with ESMTP id 2E4B21065671
	for ; Mon, 19 May 2008 13:37:12 +0000 (UTC)
	(envelope-from andre@freebsd.org)
Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29])
	by mx1.freebsd.org (Postfix) with ESMTP id 15D688FC14
	for ; Mon, 19 May 2008 13:37:12 +0000 (UTC)
	(envelope-from andre@freebsd.org)
Received: from repoman.freebsd.org (localhost [127.0.0.1])
	by repoman.freebsd.org (8.14.1/8.14.1) with ESMTP id m4JDbB8P069781
	for ; Mon, 19 May 2008 13:37:11 GMT
	(envelope-from andre@freebsd.org)
Received: (from perforce@localhost)
	by repoman.freebsd.org (8.14.1/8.14.1/Submit) id m4JDbBYL069779
	for perforce@freebsd.org; Mon, 19 May 2008 13:37:11 GMT
	(envelope-from andre@freebsd.org)
Date: Mon, 19 May 2008 13:37:11 GMT
Message-Id: <200805191337.m4JDbBYL069779@repoman.freebsd.org>
X-Authentication-Warning: repoman.freebsd.org: perforce set sender to
	andre@freebsd.org using -f
From: Andre Oppermann
To: Perforce Change Reviews
Cc: 
Subject: PERFORCE change 141858 for review
X-BeenThere: p4-projects@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: p4 projects tree changes
List-Unsubscribe: ,
List-Archive: 
List-Post: 
List-Help: 
List-Subscribe: ,
X-List-Received-Date: Mon, 19 May 2008 13:37:13 -0000

http://perforce.freebsd.org/chv.cgi?CH=141858

Change 141858 by andre@andre_flirtbox on 2008/05/19 13:37:00

	Checkpoint WIP before I travel back home from BSDCan.

	- Remove the global count of gross mbuf space in all reassembly
	  queues.  Access to it is unlocked and thus unreliable.  It
	  should be reported via netstat on a per-connection basis
	  instead.
	- Put some common code blocks into their own functions.
	- Add a switch to select between space and time optimization.
	  Space optimization collapses the stored mbuf chains to avoid
	  keeping mostly empty mbufs around; this may involve allocating
	  new mbufs and copying data.  Time optimization doesn't do that
	  and just inserts the mbuf chains into the queue without
	  touching their content.  The latter is appropriate for
	  high-speed transfers where almost all packets are MTU sized
	  (and jumbo on top of that).
	- Process a FIN on a packet without any data.  Otherwise we
	  could get stuck when bogus data beyond the FIN was received.
	- Adjust the gross-to-net space ratio relative to the socket
	  buffer from 5/4 to 11/8 to better reflect the gross-to-net
	  ratio of ethernet MTU to mbuf cluster size (2048/1500 is
	  roughly 11/8).

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_reass.c#22 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#22 (text+ko) ====

@@ -127,20 +127,44 @@
 	&tcp_reass_qsize, 0,
 	"Global number of TCP Segment Blocks currently in Reassembly Queue");
 
-static int tcp_reass_mcnt = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, mbufbytes, CTLFLAG_RD,
-	&tcp_reass_mcnt, 0,
-	"Global gross memory size of all mbufs currently in Reassembly Queue");
-
 static int tcp_reass_qtimo = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
 	&tcp_reass_qtimo, 0,
 	"Reassembly Queue Timeout in multiples of the Retransmission Timeout");
 
+static int tcp_reass_spacetime = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
+	&tcp_reass_spacetime, 0,
+	"Reassembly Queue strategy of space vs. time efficiency");
time efficiency"); + static void tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *); uma_zone_t tcp_reass_zone; +/* Trim empty mbufs from head of chain. */ +static struct mbuf * +m_trimhead(struct mbuf *m) { + struct mbuf *n; + + while (m->m_len == 0) { + n = m; + m = m->m_next; + m_free(n); + } + return (m); +} + +static u_int +m_storagesize(m) { + u_int mcnt; + + for (mcnt = 0, m; n; m = m->m_next) + mcnt += (m->m_flags & M_EXT) ? + m->m_ext.ext_size + MSIZE : MSIZE; + return (mcnt); +} + + /* * Adjust TCP reassembly zone limits when the nmbclusters zone changes. */ @@ -217,15 +241,6 @@ return (0); } - /* XXX: should not happen, but does for some reason. */ - /* XXX: May get FIN on otherwise empty segment. */ - if (*tlenp == 0) { - m_freem(m); - return (0); - } - - KASSERT(*tlenp > 0, - ("%s: segment doesn't contain any data", __func__)); KASSERT(SEQ_LEQ(tp->rcv_nxt, th_seq), ("%s: sequence number below rcv_nxt", __func__)); KASSERT(!(tp->rcv_nxt == th_seq) || !(TAILQ_EMPTY(&tp->t_trq)), @@ -264,12 +279,12 @@ * * Counting the gross mbuf space effectively sets the net data * limit lower than the socket buffer would allow. - * It underestimates the effective free space in the socket buffer - * vs. actual real data with 2k clusters and 1500 byte packets. - * This shouldn't be too much of a problem though. + * Don't underestimates the effective free space in the socket + * buffer vs. actual real data with 2k clusters and 1500 byte + * packets by introducing a correction factor of 11/8th. */ if (th_seq != tp->rcv_nxt && - tp->t_trqmcnt > (sbspace(&so->so_rcv) / 4 * 5)) { + tp->t_trqmcnt > (sbspace(&so->so_rcv) / 8 * 11)) { tcpstat.tcps_reass_overflow++; tcpstat.tcps_rcvmemdrop++; m_freem(m); @@ -280,19 +295,11 @@ /* Get rid of packet header and mtags. */ m_demote(m, 1); -#if 0 /* Trim empty mbufs from head of chain. */ - while (m->m_len == 0) { - n = m; - m = m->m_next; - m_free(n); - } -#endif + m = m_trimhead(m); /* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */ - for (mcnt = 0, n = m; n; n = n->m_next) - mcnt += (n->m_flags & M_EXT) ? - n->m_ext.ext_size + MSIZE : MSIZE; + mcnt = m_storagesize(m); /* * FIN handling is a bit tricky. @@ -311,6 +318,10 @@ if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) { tcp_reass_qfree(tp); tqe = NULL; + if (m->m_len == 0) { + tcp_timer_activate(tp, TT_REASS, 0); + return (thflags); + } goto insert; } else thflags &= ~TH_FIN; @@ -323,9 +334,14 @@ tqe->trq_len += *tlenp; tqe->trq_mcnt += mcnt; tp->t_trqmcnt += mcnt; - tcp_reass_mcnt += mcnt; tqe->trq_ml->m_next = m; tqe->trq_ml = m_last(m); + if (tcp_reass_spacetime) { + tqe->trq_m = m_collapse(tqe->trq_m, M_DONTWAIT, 1024); + tp->t_trqmcnt -= tqe->trq_mcnt; + tqe->trq_mcnt = m_storagespace(tqe->trq_m); + tqe->trq_mcnt += tp->t_trqmcnt; + } if (LIST_FIRST(&tp->t_trq_sack) != tqe) { LIST_REMOVE(tqe, trq_s); LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s); @@ -367,7 +383,6 @@ tqe->trq_len += *tlenp; tqe->trq_mcnt += mcnt; tp->t_trqmcnt += mcnt; - tcp_reass_mcnt += mcnt; tqe->trq_seq = th_seq; n = m_last(m); n->m_next = tqe->trq_m; @@ -415,7 +430,6 @@ tqe->trq_len = *tlenp; tqe->trq_mcnt = mcnt; tp->t_trqmcnt += mcnt; - tcp_reass_mcnt += mcnt; tqe->trq_seq = th_seq; tqe->trq_m = m; tqe->trq_ml = m_last(m); @@ -445,14 +459,11 @@ tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += i; /* Update accounting. */ - for (mcnt = 0, n = m; n; n = n->m_next) - mcnt += (n->m_flags & M_EXT) ? 
-			    n->m_ext.ext_size + MSIZE : MSIZE;
+		mcnt = m_storagesize(m);
 	}
 	tqe->trq_len += *tlenp;
 	tqe->trq_mcnt += mcnt;
 	tp->t_trqmcnt += mcnt;
-	tcp_reass_mcnt += mcnt;
 	tqe->trq_seq = th_seq;
 	n = m_last(m);
 	n->m_next = tqe->trq_m;
@@ -480,7 +492,6 @@
 	tqe->trq_len += *tlenp;
 	tqe->trq_mcnt += mcnt;
 	tp->t_trqmcnt += mcnt;
-	tcp_reass_mcnt += mcnt;
 	tqe->trq_ml->m_next = m;
 	tqe->trq_ml = m_last(m);
 	/* Check if segment bridges two blocks to merge. */
@@ -520,7 +531,6 @@
 	tqen->trq_len = *tlenp;
 	tqen->trq_mcnt = mcnt;
 	tp->t_trqmcnt += mcnt;
-	tcp_reass_mcnt += mcnt;
 	tqen->trq_m = m;
 	tqen->trq_ml = m_last(m);
 
@@ -574,7 +584,6 @@
 		sbappendstream_locked(&so->so_rcv, tqe->trq_m);
 		tp->rcv_nxt += tqe->trq_len;
 		tp->t_trqmcnt -= tqe->trq_mcnt;
-		tcp_reass_mcnt -= tqe->trq_mcnt;
 		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
 		LIST_REMOVE(tqe, trq_s);
 		if (tqe != &tqes)
@@ -621,7 +630,6 @@
 		tcpstat.tcps_rcvpartdupbyte += tqen->trq_len;
 		tcpstat.tcps_reass_covered++;
 		tp->t_trqmcnt -= tqe->trq_mcnt;
-		tcp_reass_mcnt -= tqe->trq_mcnt;
 		m_freem(tqen->trq_m);
 		TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
 		LIST_REMOVE(tqen, trq_s);
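
For reference, a standalone userland sketch of the gross-storage
accounting and the 11/8 rationale above.  This is an illustration
only: MSIZE and MCLBYTES are assumed typical values, and struct mbuf
is a reduced stand-in for the kernel's, not real kernel code.

	#include <stdio.h>

	#define MSIZE		256	/* assumed mbuf header size */
	#define MCLBYTES	2048	/* assumed mbuf cluster size */
	#define M_EXT		0x0001	/* data lives in external cluster */

	struct mbuf {				/* reduced stand-in */
		struct mbuf	*m_next;
		int		 m_len;		/* net payload bytes */
		int		 m_flags;
		unsigned int	 m_extsize;	/* stand-in for m_ext.ext_size */
	};

	/* Same walk as m_storagesize() above: cluster plus mbuf header. */
	static unsigned int
	m_storagesize(const struct mbuf *m)
	{
		unsigned int mcnt;

		for (mcnt = 0; m != NULL; m = m->m_next)
			mcnt += (m->m_flags & M_EXT) ?
			    m->m_extsize + MSIZE : MSIZE;
		return (mcnt);
	}

	int
	main(void)
	{
		/* One MTU-sized 1500 byte segment in a single 2k cluster. */
		struct mbuf seg = { NULL, 1500, M_EXT, MCLBYTES };

		printf("net %d, gross %u\n", seg.m_len, m_storagesize(&seg));
		/*
		 * Cluster vs. MTU: 2048/1500 ~= 1.37, about 11/8, which is
		 * why 11/8 tracks the gross overhead better than 5/4 did.
		 */
		return (0);
	}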