From owner-p4-projects@FreeBSD.ORG Mon Jan 14 23:09:48 2008
Date: Mon, 14 Jan 2008 23:09:47 GMT
Message-Id: <200801142309.m0EN9lF2069569@repoman.freebsd.org>
From: Andre Oppermann <andre@freebsd.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 133280 for review

http://perforce.freebsd.org/chv.cgi?CH=133280

Change 133280 by andre@andre_flirtbox on 2008/01/14 23:09:09

	Change reassembly queue size limiting from the number of mbufs to the
	gross mbuf space used by the queue.  This is more natural as it
	reflects the actual amount of kernel memory used.  It properly
	accounts for the mbuf storage size, be it a normal mbuf, a cluster,
	or a jumbo mbuf.

	The upper limit is the remaining space in the socket buffer.  This
	effectively prevents mbuf exhaustion due to pathological traffic
	(one-byte segments with a hole each time) on a single connection.
	The maximum possible mbuf usage is now equivalent to an unserviced,
	filled-up socket buffer.

	Accounting for the gross mbuf space underestimates the effective
	free space in the socket buffer vs. the actual data with 2k clusters
	and 1500-byte packets.  This shouldn't be too much of a problem
	though.  This edge case will be solved later when mbuf chain
	compacting is done.

	The reassembly-specific sysctls are now unused.

	Update all TCP statistics.  Add a new statistic for reassembly
	queue overflows.

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_reass.c#4 edit
.. //depot/projects/tcp_reass/netinet/tcp_usrreq.c#3 edit
.. //depot/projects/tcp_reass/netinet/tcp_var.h#3 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#4 (text+ko) ====

@@ -103,18 +103,20 @@
 static int tcp_reass_maxseg = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
     &tcp_reass_maxseg, 0,
-    "Global maximum number of TCP Segments in Reassembly Queue");
+    "Global maximum number of TCP Segment Blocks in Reassembly Queue");
 
 int tcp_reass_qsize = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
     &tcp_reass_qsize, 0,
-    "Global number of TCP Segments currently in Reassembly Queue");
+    "Global number of TCP Segment Blocks currently in Reassembly Queue");
 
+/* XXX: unused. */
 static int tcp_reass_maxqlen = 48;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW,
     &tcp_reass_maxqlen, 0,
     "Maximum number of TCP Segments per individual Reassembly Queue");
 
+/* XXX: unused, moved to tcpstat. */
 static int tcp_reass_overflows = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
     &tcp_reass_overflows, 0,
@@ -135,6 +137,7 @@
 tcp_reass_init(void)
 {
 
+	/* XXX: nmbclusters may be zero. */
 	tcp_reass_maxseg = nmbclusters / 16;
 	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
 	    &tcp_reass_maxseg);
@@ -151,7 +154,7 @@
 	struct trq *tqe, *tqen;
 	struct socket *so = tp->t_inpcb->inp_socket;
 	struct mbuf *n;
-	int i, flags = 0, segs = 0;
+	int i, flags = 0, mcnt = 0;
 
 	INP_LOCK_ASSERT(tp->t_inpcb);
 
@@ -162,31 +165,39 @@
 	if (th == NULL)
 		goto present;
 
+	KASSERT(SEQ_LEQ(tp->rcv_nxt, th->th_seq),
+	    ("%s: sequence number below rcv_nxt", __func__));
+
 	/*
 	 * Limit the number of segments in the reassembly queue to prevent
 	 * holding on to too many segments (and thus running out of mbufs).
 	 * Make sure to let the missing segment through which caused this
-	 * queue.  Always keep one global queue entry spare to be able to
-	 * process the missing segment.
+	 * queue.
+	 * Count the gross space used by the mbufs in the reassembly queue
+	 * and limit it to the free space in the socket buffer.  This way
+	 * the reassembly queue can never consume more mbuf space than the
+	 * socket buffer got allocated anyway and it reflects the actual
+	 * amount of kernel memory used.  This effectively prevents mbuf
+	 * exhaustion due to pathological traffic (one byte segments with
+	 * a hole each time) on a single connection.
+	 * Counting the gross mbuf space effectively sets the net data
+	 * limit lower than the socket buffer would allow.
+	 * It underestimates the effective free space in the socket buffer
+	 * vs. actual real data with 2k clusters and 1500 byte packets.
+	 * This shouldn't be too much of a problem though.
 	 */
 	if (th->th_seq != tp->rcv_nxt &&
-	    (tcp_reass_qsize + 1 >= tcp_reass_maxseg ||
-	    tp->t_trqlen >= tcp_reass_maxqlen)) {
-		tcp_reass_overflows++;
+	    tp->t_trqmcnt > sbspace(&so->so_rcv)) {
+		tcpstat.tcps_rcvreassoverflow++;
 		tcpstat.tcps_rcvmemdrop++;
 		m_freem(m);
 		*tlenp = 0;
 		return (0);
 	}
 
-	/* Accounting. */
-	tcpstat.tcps_rcvoopack++;
-	tcpstat.tcps_rcvoobyte += *tlenp;
 	/* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */
 	for (n = m; n; n = n->m_next)
-		segs++;
-	tp->t_trqlen += segs;
-	tcp_reass_qsize += segs;
+		mcnt += (n->m_flags & M_EXT) ? n->m_ext.ext_size : MSIZE;
 
 	/* Get rid of packet header and mtags. */
 	m_demote(m, 1);
@@ -195,9 +206,13 @@
 	tqe = TAILQ_LAST(&tp->t_trq, trq_head);
 	if (tqe && tqe->trq_seq + tqe->trq_len == th->th_seq) {
 		tqe->trq_len += *tlenp;
-		tqe->trq_segs += segs;
+		tqe->trq_mcnt += mcnt;
+		tp->t_trqmcnt += mcnt;
 		tqe->trq_ml->m_next = m;
 		tqe->trq_ml = m_last(m);
+		/* TCP statistics. */
+		tcpstat.tcps_rcvoopack++;
+		tcpstat.tcps_rcvoobyte += *tlenp;
 		return (0);
 	}
 
@@ -207,24 +222,24 @@
 	KASSERT(tqe != NULL,
 	    ("%s: missing segment but nothing in queue", __func__));
 	KASSERT(SEQ_GT(tqe->trq_seq, th->th_seq),
-	    ("%s: first block already contains missing segment", __func__));
+	    ("%s: first block starts below missing segment", __func__));
 	if (SEQ_LT(tqe->trq_seq, th->th_seq + *tlenp)) {
 		/* Trim tail of segment. */
 		if ((i = tqe->trq_seq - (th->th_seq + *tlenp))) {
 			m_adj(m, i);
-			*tlenp -= i;
-			/* tcpstat */
+			*tlenp += i;		/* NB: i is negative */
+			/* TCP statistics. */
+			tcpstat.tcps_rcvpartduppack++;
+			tcpstat.tcps_rcvpartdupbyte -= i;
 			/* Update accounting. */
-			if (segs > 1) {
-				for (n = m; n; n = n->m_next)
-					segs--;
-				tp->t_trqlen -= segs;
-				tcp_reass_qsize -= segs;
-			}
+			for (n = m; n; n = n->m_next)
+				mcnt += (n->m_flags & M_EXT) ?
+				    n->m_ext.ext_size : MSIZE;
 		}
 		/* Segment prepends first block. */
 		tqe->trq_len += *tlenp;
-		tqe->trq_segs += segs;
+		tqe->trq_mcnt += mcnt;
+		tp->t_trqmcnt += mcnt;
 		tqe->trq_seq = th->th_seq;
 		n = m_last(m);
 		n->m_next = tqe->trq_m;
@@ -234,6 +249,10 @@
 		goto insert;
 	}
 
+	/* TCP statistics. */
+	tcpstat.tcps_rcvoopack++;
+	tcpstat.tcps_rcvoobyte += *tlenp;
+
 	/* See where it fits. */
 	TAILQ_FOREACH(tqe, &tp->t_trq, trq_q) {
 		/* Segment is after this blocks coverage. */
@@ -248,8 +267,6 @@
 			tcpstat.tcps_rcvduppack++;
 			tcpstat.tcps_rcvdupbyte += *tlenp;
 			m_freem(m);
-			tp->t_trqlen -= segs;
-			tcp_reass_qsize -= segs;
 			*tlenp = 0;
 			return (0);
 		}
@@ -259,10 +276,14 @@
 			if ((i = tqe->trq_seq + tqe->trq_len - th->th_seq)) {
 				m_adj(m, i);
 				*tlenp -= i;
-				/* tcpstat */
+				/* TCP Statistics. */
+				tcpstat.tcps_rcvpartduppack++;
+				tcpstat.tcps_rcvpartdupbyte += i;
+				/* XXX dupes */
 			}
 			tqe->trq_len += *tlenp;
-			tqe->trq_segs += segs;
+			tqe->trq_mcnt += mcnt;
+			tp->t_trqmcnt += mcnt;
 			tqe->trq_ml->m_next = m;
 			tqe->trq_ml = m_last(m);
 			/* Check if segment bridges two blocks to merge. */
@@ -274,14 +295,17 @@
 				    tqen->trq_seq)) {
 					m_adj(tqen->trq_m, i);
 					tqen->trq_len -= i;
-					/* tcpstat */
+					/* TCP statistics. */
+					tcpstat.tcps_rcvpartduppack++;
+					tcpstat.tcps_rcvpartdupbyte += i;
 				}
 				tqe->trq_len += tqen->trq_len;
-				tqe->trq_segs += tqen->trq_segs;
+				tqe->trq_mcnt += tqen->trq_mcnt;
 				tqe->trq_ml->m_next = tqen->trq_m;
 				tqe->trq_ml = tqen->trq_ml;
 				TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
 				uma_zfree(tcp_reass_zone, tqen);
+				tcp_reass_qsize--;
 			}
 			return (0);
 		}
@@ -290,17 +314,18 @@
 		/* Trim tail of segment. */
 		if ((i = tqe->trq_seq - (th->th_seq + *tlenp))) {
 			m_adj(m, i);
-			*tlenp -= i;
+			*tlenp += i;		/* NB: i is negative */
+			/* TCP statistics. */
+			tcpstat.tcps_rcvpartduppack++;
+			tcpstat.tcps_rcvpartdupbyte -= i;
 			/* Update accounting. */
-			if (segs > 1) {
-				for (n = m; n; n = n->m_next)
-					segs--;
-				tp->t_trqlen -= segs;
-				tcp_reass_qsize -= segs;
-			}
+			for (n = m; n; n = n->m_next)
+				mcnt += (n->m_flags & M_EXT) ?
+				    n->m_ext.ext_size : MSIZE;
 		}
 		tqe->trq_len += *tlenp;
-		tqe->trq_segs += segs;
+		tqe->trq_mcnt += mcnt;
+		tp->t_trqmcnt += mcnt;
 		tqe->trq_seq = th->th_seq;
 		n = m_last(m);
 		n->m_next = tqe->trq_m;
@@ -318,9 +343,11 @@
 		*tlenp = 0;
 		return (0);
 	}
+	tcp_reass_qsize++;
 	tqen->trq_seq = th->th_seq;
 	tqen->trq_len = *tlenp;
-	tqen->trq_segs = segs;
+	tqen->trq_mcnt = mcnt;
+	tp->t_trqmcnt += mcnt;
 	tqen->trq_m = m;
 	tqen->trq_ml = m_last(m);
@@ -345,6 +372,7 @@
 		return (0);
 	SOCKBUF_LOCK(&so->so_rcv);
 	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
+		/* We can never go more than one round. */
 		if (tqe->trq_seq != tp->rcv_nxt)
 			break;
 #if 1
@@ -356,12 +384,10 @@
 			m_freem(tqe->trq_m);
 		else
 			sbappendstream_locked(&so->so_rcv, tqe->trq_m);
-		KASSERT(tp->t_trqlen >= tqe->trq_segs,
-		    ("%s: t_trqlen incorrect", __func__));
-		tp->t_trqlen -= tqe->trq_segs;
-		tcp_reass_qsize -= tqe->trq_segs;
+		tp->t_trqmcnt -= tqe->trq_mcnt;
 		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
 		uma_zfree(tcp_reass_zone, tqe);
+		tcp_reass_qsize--;
 	}
 	/* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
 	sorwakeup_locked(so);
@@ -384,11 +410,11 @@
 
 	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
 		m_freem(tqe->trq_m);
-		KASSERT(tp->t_trqlen >= tqe->trq_segs,
-		    ("%s: t_trqlen incorrect", __func__));
-		tp->t_trqlen -= tqe->trq_segs;
-		tcp_reass_qsize -= tqe->trq_segs;
+		KASSERT(tp->t_trqmcnt >= tqe->trq_mcnt,
+		    ("%s: t_trqmcnt incorrect", __func__));
+		tp->t_trqmcnt -= tqe->trq_mcnt;
 		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
 		uma_zfree(tcp_reass_zone, tqe);
+		tcp_reass_qsize--;
 	}
 }

==== //depot/projects/tcp_reass/netinet/tcp_usrreq.c#3 (text+ko) ====

@@ -1744,8 +1744,8 @@
 	indent += 2;
 
 	db_print_indent(indent);
-	db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n",
-	    TAILQ_FIRST(&tp->t_trq), tp->t_trqlen, tp->t_dupacks);
+	db_printf("t_trq first: %p t_trqmcnt: %d t_dupacks: %d\n",
+	    TAILQ_FIRST(&tp->t_trq), tp->t_trqmcnt, tp->t_dupacks);
 
 	db_print_indent(indent);
 	db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n",

==== //depot/projects/tcp_reass/netinet/tcp_var.h#3 (text+ko) ====

@@ -40,12 +40,12 @@
  */
 extern int tcp_do_rfc1323;
 
-/* TCP reassembly queue segment entry. */
+/* TCP reassembly queue segment block entry. */
 struct trq {
 	TAILQ_ENTRY(trq) trq_q;
-	tcp_seq trq_seq;		/* start of segment */
-	int	trq_len;		/* length of segment */
-	int	trq_segs;		/* number of mbufs */
+	tcp_seq trq_seq;		/* start of block */
+	int	trq_len;		/* length of block */
+	int	trq_mcnt;		/* gross mbuf size of block */
 	int	trq_flags;		/* flags for segment chain */
 #define	TRQ_FIN		0x01		/* FIN was on last segment */
 	struct mbuf *trq_m;		/* mbuf chain of data */
@@ -97,7 +97,7 @@
  */
 struct tcpcb {
 	struct trq_head t_trq;		/* segment reassembly queue */
-	int	t_trqlen;		/* segment reassembly queue length */
+	int	t_trqmcnt;		/* segment reassembly queue gross usage */
 	int	t_dupacks;		/* consecutive dup acks recd */
 
 	struct tcp_timer *t_timers;	/* All the TCP timers in one struct */
@@ -384,6 +384,7 @@
 	u_long	tcps_rcvpartdupbyte;	/* dup. bytes in part-dup. packets */
 	u_long	tcps_rcvoopack;		/* out-of-order packets received */
 	u_long	tcps_rcvoobyte;		/* out-of-order bytes received */
+	u_long	tcps_rcvreassoverflow;	/* reassembly queue overflows */
 	u_long	tcps_rcvpackafterwin;	/* packets with data after window */
 	u_long	tcps_rcvbyteafterwin;	/* bytes rcvd after window */
 	u_long	tcps_rcvafterclose;	/* packets rcvd after "close" */
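
To make the accounting rule described above easy to see outside the kernel,
here is a minimal standalone C sketch of it.  The struct toy_mbuf,
chain_gross_space() and the hard-coded MSIZE and socket-buffer numbers are
illustrative stand-ins, not kernel structures or KPIs; only the per-mbuf
"M_EXT ? external storage size : MSIZE" summation and the sbspace()-style
overflow check mirror the patch.

/*
 * Toy model of the gross mbuf space accounting used by the reassembly
 * queue change above.  Assumes an MSIZE of 256 bytes and a 2k cluster,
 * as in a common FreeBSD configuration.
 */
#include <stdio.h>

#define	MSIZE	256		/* storage cost of a plain mbuf */
#define	M_EXT	0x0001		/* mbuf carries external storage */

struct toy_mbuf {
	struct toy_mbuf	*m_next;	/* next mbuf in this chain */
	int		 m_flags;	/* M_EXT if a cluster is attached */
	unsigned	 ext_size;	/* size of that external storage */
};

/* Gross storage of a chain: what the patch accumulates into mcnt. */
static unsigned
chain_gross_space(const struct toy_mbuf *m)
{
	unsigned mcnt = 0;

	for (; m != NULL; m = m->m_next)
		mcnt += (m->m_flags & M_EXT) ? m->ext_size : MSIZE;
	return (mcnt);
}

int
main(void)
{
	/* A 1500-byte segment typically rides in one mbuf + 2k cluster. */
	struct toy_mbuf seg = { NULL, M_EXT, 2048 };
	unsigned t_trqmcnt = 0;		/* gross space already queued */
	unsigned sb_space = 65536;	/* free space left in so_rcv */
	unsigned mcnt = chain_gross_space(&seg);

	/*
	 * The patch's overflow check: an out-of-order segment (one that
	 * does not start at rcv_nxt) is dropped once the queue's gross
	 * mbuf space exceeds the remaining socket buffer space.
	 */
	if (t_trqmcnt > sb_space)
		printf("drop: reassembly queue overflow\n");
	else {
		t_trqmcnt += mcnt;
		printf("queued %u gross bytes, queue now holds %u\n",
		    mcnt, t_trqmcnt);
	}
	return (0);
}

Because the queue is charged for storage size rather than payload, a stream
of one-byte segments each followed by a hole is charged a full mbuf or
cluster per segment, so the queue hits the socket-buffer limit long before
the system runs out of mbufs; that is the exhaustion case the change
description refers to.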
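
The "*tlenp += i" lines marked "NB: i is negative" are easy to misread, so
here is the same arithmetic worked through with plain integers.  The variable
names follow the patch, but nothing below is kernel code; m_adj() is only
described in a comment.

/*
 * Worked example of the negative tail trim: the arriving segment covers
 * bytes 100..109 and the next queued block already starts at 105, so the
 * last five bytes are duplicates and get cut off the tail.
 */
#include <stdio.h>

int
main(void)
{
	int th_seq = 100;	/* start of the arriving segment */
	int tlen = 10;		/* its length, i.e. bytes 100..109 */
	int trq_seq = 105;	/* next queued block starts here */
	int i;

	/* Same expression as in the patch; the overlap makes i negative. */
	if ((i = trq_seq - (th_seq + tlen)) != 0) {
		/*
		 * In the kernel, m_adj(m, i) with i < 0 drops -i bytes
		 * from the tail of the mbuf chain.
		 */
		tlen += i;	/* NB: i is negative, so tlen shrinks */
		printf("trimmed %d duplicate bytes, %d bytes remain "
		    "(%d..%d)\n", -i, tlen, th_seq, th_seq + tlen - 1);
	}
	return (0);
}

The partial-duplicate statistics in the patch follow the same sign
convention: with i = -5, "tcps_rcvpartdupbyte -= i" adds the five trimmed
bytes to the duplicate-byte counter.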