From owner-p4-projects@FreeBSD.ORG  Sun Jul 19 15:44:43 2009
Return-Path: <owner-p4-projects@freebsd.org>
Delivered-To: p4-projects@freebsd.org
Received: by hub.freebsd.org (Postfix, from userid 32767)
	id 7FFFF1065676; Sun, 19 Jul 2009 15:44:42 +0000 (UTC)
Delivered-To: perforce@freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34])
	by hub.freebsd.org (Postfix) with ESMTP id E728F1065686
	for <perforce@freebsd.org>; Sun, 19 Jul 2009 15:44:41 +0000 (UTC)
	(envelope-from andre@freebsd.org)
Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29])
	by mx1.freebsd.org (Postfix) with ESMTP id CF3008FC39
	for <perforce@freebsd.org>; Sun, 19 Jul 2009 15:44:41 +0000 (UTC)
	(envelope-from andre@freebsd.org)
Received: from repoman.freebsd.org (localhost [127.0.0.1])
	by repoman.freebsd.org (8.14.3/8.14.3) with ESMTP id n6JFifK4065385
	for <perforce@freebsd.org>; Sun, 19 Jul 2009 15:44:41 GMT
	(envelope-from andre@freebsd.org)
Received: (from perforce@localhost)
	by repoman.freebsd.org (8.14.3/8.14.3/Submit) id n6JFifiw065383
	for perforce@freebsd.org; Sun, 19 Jul 2009 15:44:41 GMT
	(envelope-from andre@freebsd.org)
Date: Sun, 19 Jul 2009 15:44:41 GMT
Message-Id: <200907191544.n6JFifiw065383@repoman.freebsd.org>
X-Authentication-Warning: repoman.freebsd.org: perforce set sender to andre@freebsd.org using -f
From: Andre Oppermann <andre@freebsd.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Cc: 
Subject: PERFORCE change 166270 for review
X-BeenThere: p4-projects@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: p4 projects tree changes
X-List-Received-Date: Sun, 19 Jul 2009 15:44:48 -0000

http://perforce.freebsd.org/chv.cgi?CH=166270

Change 166270 by andre@andre_t61 on 2009/07/19 15:44:00

	Experiment with a rewritten TCP reassembly queue that uses a
	ranged red-black tree to store the received data blocks.
	Advantages are a simpler structure and O(log n) insertion and
	removal in all cases, compared to the O(n) worst case of the
	previous tail queue based implementation.
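The heart of the change is the ranged comparison function: two blocks
compare as equal whenever they touch or overlap in sequence space, so on
a tree that never holds two overlapping elements a single RB_FIND()
returns exactly the block a new segment can be coalesced with, in
O(log n).  A minimal userland sketch of the idea (illustrative names;
plain ints stand in for tcp_seq, so sequence number wraparound is
ignored here):

    #include <sys/tree.h>
    #include <stdio.h>

    struct range {
            RB_ENTRY(range) r_entry;
            int r_start;            /* first byte covered */
            int r_end;              /* one past the last byte covered */
    };

    /* Ranges compare as equal iff they touch or overlap. */
    static int
    range_cmp(struct range *a, struct range *b)
    {
            if (a->r_end < b->r_start)
                    return (-1);
            else if (a->r_start > b->r_end)
                    return (1);
            else
                    return (0);
    }

    RB_HEAD(range_tree, range);
    RB_GENERATE_STATIC(range_tree, range, r_entry, range_cmp);

    int
    main(void)
    {
            struct range_tree rt = RB_INITIALIZER(&rt);
            struct range a = { .r_start = 100, .r_end = 200 };
            struct range b = { .r_start = 500, .r_end = 600 };
            struct range key = { .r_start = 150, .r_end = 250 };
            struct range *hit;

            RB_INSERT(range_tree, &rt, &a);
            RB_INSERT(range_tree, &rt, &b);
            hit = RB_FIND(range_tree, &rt, &key);   /* overlaps 'a' */
            printf("hit [%d,%d)\n", hit->r_start, hit->r_end);
            return (0);
    }

Widening only the equality class keeps the structure an ordinary
red-black tree; the invariant that no two stored blocks overlap is what
makes the ranged lookup unambiguous.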
Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_reass.c#34 edit
.. //depot/projects/tcp_reass/netinet/tcp_var.h#18 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#34 (text+ko) ====

@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2007
+ * Copyright (c) 2007-2009
  *	Andre Oppermann, Internet Business Solutions AG. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -50,26 +50,11 @@
  * queue.
  *
  * Instead of storing all segments on their own we build blocks of consequtive
- * segments chained together. We use a tailq because a new segments has the
- * highest probability to fit the tail of the chain. If not, the second
- * highest probability is the beginning of the chain for being the missing
- * segment. Otherwise we cycle through each consequtive block until a match
- * is found. If a segment matches the end of one block and the start of the
+ * segments chained together. We use a red-black tree to cope with arbitrary
+ * arrival patterns. If a segment matches the end of one block and the start of the
  * next block the two blocks are joined together. If no match is found a
  * new block is created.
  *
- * This system is very efficient and can deal efficiently with long chains
- * and many holes.
- *
- * trq_tail ----------------------------------------------\
- * trq_head --> [block] ------> [block] ------> [block] <-/
- *               m_next          m_next          m_next
- *                 |               |               |
- *               m_next          m_next          m_next
- *                 |               |               |
- *               m_next          m_next          m_next
- *
- *
 * The reassembly queue block structure is also used to track SACK
 * information as a data receiver.  A double-linked list is added
 * that links the blocks in the reverse order of their arrival or updating.
@@ -127,16 +112,11 @@
     &tcp_reass_enabled, 0,
     "Enable/disable use of TCP Reassembly Queue");
 
-static int tcp_reass_maxblocks = 0;
+static int tcp_reass_maxblocks = 65535;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN,
     &tcp_reass_maxblocks, 0,
     "Global maximum number of TCP Segment Blocks in Reassembly Queue");
 
-int tcp_reass_qsize = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD,
-    &tcp_reass_qsize, 0,
-    "Global number of TCP Segment Blocks currently in Reassembly Queue");
-
 static int tcp_reass_qtimo = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
     &tcp_reass_qtimo, 0,
@@ -147,17 +127,8 @@
     &tcp_reass_spacetime, 0,
     "Reassembly Queue strategy of space vs. time efficiency");
 
-static void tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
-
-static __inline void
-sack_track(struct tcpcb *tp, struct trq *tqe)
-{
-
-	if (LIST_FIRST(&tp->t_trq_sack) != (tqe)) {
-		LIST_REMOVE((tqe), trq_s);
-		LIST_INSERT_HEAD(&tp->t_trq_sack, (tqe), trq_s);
-	}
-}
+static struct tcp_reass_block *
+	tcp_reass_merge(struct tcpcb *, struct tcp_reass_block *,
+	    struct tcp_reass_block *);
 
 /* Trim empty mbufs from head of chain. */
 static struct mbuf *
@@ -195,46 +166,116 @@
 	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
 }
 
+/*
+ * Initialize TCP reassembly zone on startup.
+ */
+void
+tcp_reass_init(void)
+{
+
+	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks",
+	    &tcp_reass_maxblocks);
+	tcp_reass_zone = uma_zcreate("tcpreass",
+	    sizeof(struct tcp_reass_block),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
+	EVENTHANDLER_REGISTER(nmbclusters_change,
+	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ * Compare function implementing the ranged lookup on the RB tree.
+ * NB: The tree must never have any overlapping elements.
+ */
+static __inline int
+tcp_reass_cmp(struct tcp_reass_block *a, struct tcp_reass_block *b)
+{
+
+	if (SEQ_LT(a->trb_seqe, b->trb_seqs))
+		return (-1);
+	else if (SEQ_GT(a->trb_seqs, b->trb_seqe))
+		return (1);
+	else
+		return (0);
+}
+
+RB_PROTOTYPE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
+RB_GENERATE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
+
 #ifdef INVARIANTS
 static int
 tcp_reass_verify(struct tcpcb *tp)
 {
-	struct trq *tqe, *tqen;
-	int i = 0;
+	int i = 0, size = 0, total = 0;
+	struct mbuf *m;
+	struct tcp_reass_block *trb, *trbn;
 
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
-		    ("%s: trq_seq < rcv_nxt", __func__));
-		KASSERT(tqen == NULL ||
-		    SEQ_LT(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-		    ("%s: overlapping blocks", __func__));
+	RB_FOREACH_SAFE(trb, tcp_ra, &tp->rcv_reass, trbn) {
+		KASSERT(SEQ_LT(trb->trb_seqs, trb->trb_seqe),
+		    ("%s: trb_seqs >= trb_seqe", __func__));
+		KASSERT(SEQ_GT(trb->trb_seqs, tp->rcv_nxt),
+		    ("%s: rcv_nxt >= trb_seqs", __func__));
+		KASSERT(trb->trb_m != NULL,
+		    ("%s: trb_m == NULL", __func__));
+		KASSERT(trb->trb_mt != NULL,
+		    ("%s: trb_mt == NULL", __func__));
+		size = SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+		KASSERT(size == m_length(trb->trb_m, &m),
+		    ("%s: seq# size != actual mbuf size", __func__));
+		KASSERT(trb->trb_mt == m,
+		    ("%s: trb_mt is not last mbuf", __func__));
+		KASSERT(trbn == NULL || SEQ_LT(trb->trb_seqe, trbn->trb_seqs),
+		    ("%s: overlaps into next block", __func__));
+		total += size;
 		i++;
 	}
-	LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
+	KASSERT(tp->rcv_reass_size == total,
+	    ("%s: total not correct", __func__));
+
+	LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) {
 		i--;
 	}
-	KASSERT(i == 0, ("%s: SEQ# ordered tailq and arrival ordered "
-	    "SACK list are not equally long", __func__));
-	return (0);
+	KASSERT(i == 0,
+	    ("%s: sack list incorrect", __func__));
+
+	return (1);
 }
 #endif
 
-/*
- * Initialize TCP reassembly zone on startup.
- */
+static void
+tcp_reass_free(struct tcpcb *tp, struct tcp_reass_block *trb)
+{
+
+	trb = RB_REMOVE(tcp_ra, &tp->rcv_reass, trb);
+	KASSERT(trb != NULL, ("%s: RB_REMOVE failed", __func__));
+	LIST_REMOVE(trb, trb_sack);
+	if (trb->trb_m != NULL)
+		m_freem(trb->trb_m);
+	tp->rcv_reass_size -= SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+	uma_zfree(tcp_reass_zone, trb);
+}
+
 void
-tcp_reass_init(void)
+tcp_reass_flush(struct tcpcb *tp)
+{
+	struct tcp_reass_block *trb, *trbn;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	KASSERT(tcp_reass_verify(tp),
+	    ("%s: reassembly queue inconsistent", __func__));
+
+	RB_FOREACH_SAFE(trb, tcp_ra, &tp->rcv_reass, trbn) {
+		tcp_reass_free(tp, trb);
+	}
+	KASSERT(tp->rcv_reass_size == 0,
+	    ("%s: rcv_reass_size not zero", __func__));
+}
+
+static __inline void
+tcp_reass_sacktrack(struct tcpcb *tp, struct tcp_reass_block *trb)
 {
-	/* XXX: nmbclusters may be zero. */
-	tcp_reass_maxblocks = nmbclusters / 16;
-	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks",
-	    &tcp_reass_maxblocks);
-	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
-	EVENTHANDLER_REGISTER(nmbclusters_change,
-	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+	if (LIST_FIRST(&tp->rcv_reass_sack) != trb) {
+		LIST_REMOVE(trb, trb_sack);
+		LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack);
+	}
 }
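The blocks double as SACK state: tcp_reass_sacktrack() keeps the SACK
list ordered by most recent arrival or update, since RFC 2018 asks the
data receiver to report the block containing the most recently received
segment first.  A small userland sketch of the same move-to-front
pattern with plain <sys/queue.h> (hypothetical names); note that an
element must already be linked before it may be moved, which is why the
insertion path further down uses LIST_INSERT_HEAD() directly rather
than this helper:

    #include <sys/queue.h>
    #include <stdio.h>

    struct blk {
            LIST_ENTRY(blk) b_link;
            int b_id;
    };

    LIST_HEAD(blk_head, blk);

    /* Keep the most recently updated block at the head of the list. */
    static void
    move_to_front(struct blk_head *head, struct blk *b)
    {
            if (LIST_FIRST(head) != b) {
                    LIST_REMOVE(b, b_link);
                    LIST_INSERT_HEAD(head, b, b_link);
            }
    }

    int
    main(void)
    {
            struct blk_head head = LIST_HEAD_INITIALIZER(head);
            struct blk b1 = { .b_id = 1 }, b2 = { .b_id = 2 }, *b;

            LIST_INSERT_HEAD(&head, &b1, b_link);
            LIST_INSERT_HEAD(&head, &b2, b_link);  /* order: 2, 1 */
            move_to_front(&head, &b1);             /* order: 1, 2 */
            LIST_FOREACH(b, &head, b_link)
                    printf("blk %d\n", b->b_id);
            return (0);
    }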
@@ -246,12 +287,11 @@
 int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 {
-	struct trq *tqe, *tqen;
+	int i, thflags = 0;
+	tcp_seq th_seq;
 	struct socket *so = tp->t_inpcb->inp_socket;
-	struct mbuf *n;
-	int i, thflags = 0, mcnt;
-	tcp_seq th_seq;
-	struct trq tqes;
+	struct tcp_reass_block *trb, *trbn;
+	struct tcp_reass_block trbs;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -262,13 +302,19 @@
 	 */
 	if (th == NULL) {
 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
-		    TAILQ_EMPTY(&tp->t_trq) ||
-		    ((tqe = TAILQ_FIRST(&tp->t_trq)) &&
-		     tqe->trq_seq != tp->rcv_nxt))
+		    RB_EMPTY(&tp->rcv_reass) ||
+		    ((trb = RB_MIN(tcp_ra, &tp->rcv_reass)) &&
+		     trb->trb_seqs != tp->rcv_nxt))
 			return (0);
 		goto present;
 	}
 
+	KASSERT(th != NULL, ("%s: th is NULL", __func__));
+	KASSERT(tlenp != NULL, ("%s: tlenp is NULL", __func__));
+	KASSERT(m != NULL, ("%s: m is NULL", __func__));
+	KASSERT(*tlenp == m_length(m, NULL),
+	    ("%s: tlen != mbuf length", __func__));
+
 	/*
 	 * Store TCP header information in local variables as
 	 * we may lose access to it after mbuf compacting.
@@ -278,15 +324,15 @@
 	th = NULL;		/* Prevent further use. */
 
 	/* Check if it is really neccessary to do all the work. */
-	if (!tcp_reass_enabled && TAILQ_EMPTY(&tp->t_trq)) {
+	if (!tcp_reass_enabled && RB_EMPTY(&tp->rcv_reass)) {
 		*tlenp = 0;
 		m_freem(m);
 		return (0);
 	}
 
 	KASSERT(SEQ_LEQ(tp->rcv_nxt, th_seq),
 	    ("%s: sequence number below rcv_nxt", __func__));
-	KASSERT(!(tp->rcv_nxt == th_seq) || !(TAILQ_EMPTY(&tp->t_trq)),
+	KASSERT(!(tp->rcv_nxt == th_seq) || !(RB_EMPTY(&tp->rcv_reass)),
 	    ("%s: got missing segment but queue is empty", __func__));
 	KASSERT(tcp_reass_verify(tp),
 	    ("%s: reassembly queue inconsistent", __func__));
@@ -311,6 +357,7 @@
 	 * buffer vs. actual real data with 2k clusters and 1500 byte
 	 * packets by introducing a correction factor of 11/8th.
 	 */
+	/*
 	if (th_seq != tp->rcv_nxt &&
 	    tp->t_trqmcnt > (sbspace(&so->so_rcv) / 8 * 11)) {
 		TCPSTAT_INC(tcps_reass_overflow);
 		TCPSTAT_INC(tcps_rcvmemdrop);
 		m_freem(m);
 		*tlenp = 0;
 		return (0);
 	}
-
-	/* Get rid of packet header and mtags. */
-	m_demote(m, 1);
-
-	/* Trim empty mbufs from head of chain. */
-	m = m_trimhead(m);
-
-	/* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */
-	mcnt = m_storagesize(m);
+	*/
 
 	/*
 	 * FIN handling is a bit tricky.
 	 * ...
 	 * This approach is based on a discussion on TCPM mailing list.
 	 */
 	if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) {
-		tcp_reass_qfree(tp);
-		tqe = NULL;
+		tcp_reass_flush(tp);
 		if (m->m_len == 0) {
 			tcp_timer_activate(tp, TT_REASS, 0);
 			return (thflags);
 		}
-		goto insert;
-	} else
+	} else if (*tlenp == 0) {
+		m_freem(m);
+		return (0);
+	} else
 		thflags &= ~TH_FIN;
 
-	/* Check if this is the first segment. */
-	if (TAILQ_EMPTY(&tp->t_trq))
-		goto insert;
+	/* Get rid of packet header and mtags. */
+	m_demote(m, 1);
+	/* Trim empty mbufs from head of chain. */
+	m = m_trimhead(m);
+	/* Compact mbuf chain. */
+	m = m_collapse(m, M_DONTWAIT, 1024);
 
-	/* Starting point for the following tests. */
-	tqe = TAILQ_LAST(&tp->t_trq, trq_head);
+	KASSERT(m != NULL, ("%s: m is NULL after collapse", __func__));
 
-	/* Check if this segment directly attaches to the end. */
-	if (tqe->trq_seq + tqe->trq_len == th_seq) {
-		tqe->trq_len += *tlenp;
-		tqe->trq_mcnt += mcnt;
-		tp->t_trqmcnt += mcnt;
-		tqe->trq_ml->m_next = m;
-		tqe->trq_ml = m_last(m);
-		if (tcp_reass_spacetime) {
-			tqe->trq_m = m_collapse(tqe->trq_m, M_DONTWAIT, 1024);
-			tp->t_trqmcnt -= tqe->trq_mcnt;
-			tqe->trq_mcnt = m_storagesize(tqe->trq_m);
-			tqe->trq_mcnt += tp->t_trqmcnt;
-		}
-		sack_track(tp, tqe);
-		/* TCP statistics. */
-		TCPSTAT_INC(tcps_rcvoopack);
-		TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
-		TCPSTAT_INC(tcps_reass_tail);
-		return (0);
-	}
+	/* Set up search structure. */
+	trbs.trb_seqs = th_seq;
+	trbs.trb_seqe = th_seq + *tlenp;
+	trbs.trb_m = m;
+	trbs.trb_mt = m_last(m);
 
-	/* Check if beyond last block. */
-	if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-		goto insert;
-
-	/* Check if this is the missing segment. */
-	if (tp->rcv_nxt == th_seq) {
-		tqe = TAILQ_FIRST(&tp->t_trq);
-		KASSERT(SEQ_GT(tqe->trq_seq, th_seq),
-		    ("%s: first block starts below missing segment", __func__));
-		/* Check if segment prepends first block. */
-		if (SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp)) {
-			/* Trim tail of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) {
-				m_adj(m, -i);
-				*tlenp -= i;
-				/* TCP statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-				/* Update accounting. */
-				mcnt = m_storagesize(m);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			n = m_last(m);
-			n->m_next = tqe->trq_m;
-			tqe->trq_m = m;
-			goto present;
-		}
-		goto insert;	/* No statistics, this segment is in line. */
-	}
-
-	/* TCP statistics. */
-	TCPSTAT_INC(tcps_rcvoopack);
-	TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
-
-	/* See where it fits. */
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		/* Segment is after this blocks coverage. */
-		if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-			continue;
-		/* Segment is after the previous one but before this one. */
-		if (SEQ_GT(tqe->trq_seq, th_seq + *tlenp))
-			break;	/* Insert as new block. */
-
-		/* Segment is already fully covered. */
-		if (SEQ_LEQ(tqe->trq_seq, th_seq) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			TCPSTAT_INC(tcps_rcvduppack);
-			TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
-			TCPSTAT_INC(tcps_reass_covered);
-			/*
-			 * XXXAO: What to SACK report when duplicate?
-			 * See RFC2883: D-SACK (Duplicate SACK)
-			 */
-			sack_track(tp, tqe);
+	/*
+	 * Return match that has at least partial overlap to either side or
+	 * insert a new reassembly block.
+	 */
+	if ((trb = RB_FIND(tcp_ra, &tp->rcv_reass, &trbs)) != NULL) {
+		/* Within an already known block. */
+		if (SEQ_GEQ(trbs.trb_seqs, trb->trb_seqs) &&
+		    SEQ_LEQ(trbs.trb_seqe, trb->trb_seqe)) {
+			tcp_reass_sacktrack(tp, trb);
 			m_freem(m);
 			*tlenp = 0;
 			return (0);
 		}
+		tp->rcv_reass_size += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
 
-		/* Segment covers and extends on both ends. */
-		if (SEQ_GT(tqe->trq_seq, th_seq) &&
-		    SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			/* Replace block content. */
-			tp->t_trqmcnt -= tqe->trq_mcnt;
-			m_freem(tqe->trq_m);
-			tqe->trq_len = *tlenp;
-			tqe->trq_mcnt = mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			tqe->trq_m = m;
-			tqe->trq_ml = m_last(m);
-			/* Check if segment bridges next block to merge. */
-			if (tqen != NULL &&
-			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
-				tcp_reass_merge(tp, tqe, tqen);
-			sack_track(tp, tqe);
-			TCPSTAT_INC(tcps_reass_replace);
-			return (0);
-		}
+		/* Extends the end, common case. */
+		if (SEQ_GT(trbs.trb_seqe, trb->trb_seqe)) {
+			(void)tcp_reass_merge(tp, trb, &trbs);
+			tcp_reass_sacktrack(tp, trb);
 
+			/* Merge in next blocks if there is overlap. */
+			while ((trbn = RB_NEXT(tcp_ra, &tp->rcv_reass,
+			    trb)) != NULL &&
+			    SEQ_GEQ(trb->trb_seqe, trbn->trb_seqs)) {
+				trbn = tcp_reass_merge(tp, trb, trbn);
+				tcp_reass_free(tp, trbn);
+			}
+		}
-		/* Segment prepends to this block. */
-		if (SEQ_GT(tqe->trq_seq, th_seq) &&
-		    SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			KASSERT(!(thflags & TH_FIN),
-			    ("%s: new segment with FIN can't prepend", __func__));
-			/* Trim tail of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) {
-				m_adj(m, -i);
-				*tlenp -= i;
-				/* TCP statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-				/* Update accounting. */
-				mcnt = m_storagesize(m);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			n = m_last(m);
-			n->m_next = tqe->trq_m;
-			tqe->trq_m = m;
-			sack_track(tp, tqe);
-			TCPSTAT_INC(tcps_reass_prepend);
-			return (0);
-		}
+		/* Extends the start. */
+		if (SEQ_LT(trbs.trb_seqs, trb->trb_seqs)) {
+			(void)tcp_reass_merge(tp, trb, &trbs);
+			tcp_reass_sacktrack(tp, trb);
 
-		/* Segment appends to this block. */
-		if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp) &&
-		    SEQ_LEQ(tqe->trq_seq, th_seq) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq)) {
-			/* Trim head of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, th_seq))) {
-				m_adj(m, i);
-				*tlenp -= i;
-				/* TCP Statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_ml->m_next = m;
-			tqe->trq_ml = m_last(m);
-			/* Check if segment bridges two blocks to merge. */
-			if (tqen != NULL &&
-			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
-				tcp_reass_merge(tp, tqe, tqen);
-			sack_track(tp, tqe);
-			TCPSTAT_INC(tcps_reass_append);
-			return (0);
+			/* Merge in previous blocks if there is overlap. */
+			while ((trbn = RB_PREV(tcp_ra, &tp->rcv_reass,
+			    trb)) != NULL &&
+			    SEQ_GEQ(trbn->trb_seqe, trb->trb_seqs)) {
+				trbn = tcp_reass_merge(tp, trb, trbn);
+				tcp_reass_free(tp, trbn);
+			}
 		}
+	} else if ((trb = uma_zalloc(tcp_reass_zone,
+	    (M_NOWAIT|M_ZERO))) != NULL) {
+		trb->trb_seqs = trbs.trb_seqs;
+		trb->trb_seqe = trbs.trb_seqe;
+		trb->trb_m = trbs.trb_m;
+		trb->trb_mt = trbs.trb_mt;
+		trbn = RB_INSERT(tcp_ra, &tp->rcv_reass, trb);
+		KASSERT(trbn == NULL, ("%s: RB_INSERT failed", __func__));
+		LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack);
+		tp->rcv_reass_size += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+	} else if (tp->rcv_nxt == th_seq) {
+		/* Zone exhausted, use the stack block for the missing segment. */
+		trbn = RB_INSERT(tcp_ra, &tp->rcv_reass, &trbs);
+		KASSERT(trbn == NULL, ("%s: RB_INSERT failed", __func__));
+	} else {
+		/* Zone exhausted, drop the out of order segment. */
+		TCPSTAT_INC(tcps_rcvmemdrop);
+		m_freem(m);
+		*tlenp = 0;
+		return (0);
 	}
 
+	if (tp->rcv_nxt == th_seq)
+		goto present;
+
-insert:
-	/* Prepare to insert into block queue. */
-	if (tp->rcv_nxt == th_seq) {
-		/*
-		 * Use temporary struct trq on the stack for missing
-		 * segment to prevent blocking of all reassembly queues
-		 * due to zone exhaustion.
-		 */
-		tqen = &tqes;
-	} else {
-		tqen = uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO));
-		if (tqen == NULL) {
-			TCPSTAT_INC(tcps_rcvmemdrop);
-			m_freem(m);
-			*tlenp = 0;
-			return (0);
-		}
-		TCPSTAT_INC(tcps_reass_blocks);
-	}
-	tcp_reass_qsize++;
-	if (tcp_reass_spacetime) {
-		m = m_collapse(m, M_DONTWAIT, 1024);
-		mcnt = m_storagesize(m);
-	}
-	tqen->trq_seq = th_seq;
-	tqen->trq_len = *tlenp;
-	tqen->trq_mcnt = mcnt;
-	tp->t_trqmcnt += mcnt;
-	tqen->trq_m = m;
-	tqen->trq_ml = m_last(m);
-
-	/* Where to insert. */
-	if (tqe != NULL && SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-		TAILQ_INSERT_AFTER(&tp->t_trq, tqe, tqen, trq_q);
-	else if (tqe != NULL)
-		TAILQ_INSERT_BEFORE(tqe, tqen, trq_q);
-	else {
-		KASSERT(TAILQ_EMPTY(&tp->t_trq),
-		    ("%s: first element queue not empty", __func__));
-		TAILQ_INSERT_HEAD(&tp->t_trq, tqen, trq_q);
-		/*
-		 * Flush the reassembly queue after x times the
-		 * current retransmit interval measured from the
-		 * arrival time of the first segment.
-		 */
-		if (tcp_reass_qtimo)
-			tcp_timer_activate(tp, TT_REASS,
-			    tp->t_rxtcur * tcp_reass_qtimo);
-	}
-	LIST_INSERT_HEAD(&tp->t_trq_sack, tqen, trq_s);
-
-	/* Missing segment? */
-	if (tp->rcv_nxt != th_seq)
-		return (0);
+	KASSERT(tcp_reass_verify(tp),
+	    ("%s: reassembly queue inconsistent", __func__));
+	return (0);
 
 present:
 	/*
 	 * Present data to user, advancing rcv_nxt through the
 	 * completed sequence space.
 	 */
-	KASSERT(!TAILQ_EMPTY(&tp->t_trq),
+	KASSERT(!RB_EMPTY(&tp->rcv_reass),
 	    ("%s: queue empty at present", __func__));
-	KASSERT((TAILQ_FIRST(&tp->t_trq))->trq_seq == tp->rcv_nxt,
+	KASSERT((RB_MIN(tcp_ra, &tp->rcv_reass))->trb_seqs == tp->rcv_nxt,
 	    ("%s: first block does not match rcv_nxt", __func__));
 	TCPSTAT_INC(tcps_reass_missingseg);
 
 	SOCKBUF_LOCK(&so->so_rcv);
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
-		    ("%s: trq_seq < rcv_nxt", __func__));
-		KASSERT(tqen == NULL ||
-		    SEQ_LEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-		    ("%s: block overlaps into next one", __func__));
-		if (tqe->trq_seq != tp->rcv_nxt)
-			break;
-		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
-			m_freem(tqe->trq_m);
-		else
-			sbappendstream_locked(&so->so_rcv, tqe->trq_m);
-		tp->rcv_nxt += tqe->trq_len;
-		tp->t_trqmcnt -= tqe->trq_mcnt;
-		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
-		LIST_REMOVE(tqe, trq_s);
-		if (tqe != &tqes)
-			uma_zfree(tcp_reass_zone, tqe);
-		V_tcp_reass_qsize--;
+	trb = RB_MIN(tcp_ra, &tp->rcv_reass);
+	if (!(so->so_rcv.sb_state & SBS_CANTRCVMORE)) {
+		sbappendstream_locked(&so->so_rcv, trb->trb_m);
+		tp->rcv_nxt += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+		trb->trb_m = NULL;
+		trb->trb_mt = NULL;
 	}
+	if (trb == &trbs) {
+		RB_REMOVE(tcp_ra, &tp->rcv_reass, trb);
+		if (trb->trb_m != NULL)
+			m_freem(trb->trb_m);
+	} else
+		tcp_reass_free(tp, trb);
+
 	/* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
 	sorwakeup_locked(so);
 
@@ -615,7 +504,7 @@
 	 * the sequence space and if queue is not empty. Otherwise
 	 * deactivate it.
 	 */
-	if (tcp_reass_qtimo && !TAILQ_EMPTY(&tp->t_trq))
+	if (tcp_reass_qtimo && !RB_EMPTY(&tp->rcv_reass))
 		tcp_timer_activate(tp, TT_REASS,
 		    tp->t_rxtcur * tcp_reass_qtimo);
 	else
@@ -627,57 +516,45 @@
 
 /*
  * Merge one or more consecutive blocks together.
+ * Always merge trbn into trb.
  */
-static void
-tcp_reass_merge(struct tcpcb *tp, struct trq *tqe, struct trq *tqen)
+static struct tcp_reass_block *
+tcp_reass_merge(struct tcpcb *tp, struct tcp_reass_block *trb,
+    struct tcp_reass_block *trbn)
 {
 	int i;
 
-	KASSERT(tqe != NULL && tqen != NULL,
+	KASSERT(trb != NULL && trbn != NULL,
 	    ("%s: incomplete input", __func__));
-	KASSERT(SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-	    ("%s: blocks do not overlap, nothing to merge", __func__));
 
-	/* Appended block may reach beyond next block. */
-	while (SEQ_GEQ(tqe->trq_seq + tqe->trq_len,
-	    tqen->trq_seq + tqen->trq_len)) {
-		/* TCP Statistics. */
-		TCPSTAT_ADD(tcps_rcvpartdupbyte, tqen->trq_len);
-		TCPSTAT_INC(tcps_reass_covered);
-		tp->t_trqmcnt -= tqe->trq_mcnt;
-		m_freem(tqen->trq_m);
-		TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
-		LIST_REMOVE(tqen, trq_s);
-		uma_zfree(tcp_reass_zone, tqen);
-		tcp_reass_qsize--;
-		/* And the one after that. */
-		if ((tqen = TAILQ_NEXT(tqe, trq_q)) == NULL)
-			return;
-	}
-
-	/* Trim head of next block. */
-	if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))) {
-		m_adj(tqen->trq_m, i);
-		tqen->trq_len -= i;
-		TCPSTAT_ADD(tcps_rcvpartdupbyte, i);	/* Statistics */
-		/* Dispose of empty mbufs. */
-		if (tcp_reass_spacetime) {
-			tqen->trq_m = m_trimhead(tqen->trq_m);
-			tqen->trq_mcnt = m_storagesize(tqen->trq_m);
-		}
-		KASSERT(tqen->trq_m != NULL,
-		    ("%s: no remaining mbufs in block", __func__));
-	}
-
-	/* Merge blocks together. */
-	tqe->trq_len += tqen->trq_len;
-	tqe->trq_mcnt += tqen->trq_mcnt;
-	tqe->trq_ml->m_next = tqen->trq_m;
-	tqe->trq_ml = tqen->trq_ml;
-	TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
-	LIST_REMOVE(tqen, trq_s);
-	uma_zfree(tcp_reass_zone, tqen);
-	tcp_reass_qsize--;
-	TCPSTAT_INC(tcps_reass_merge);
+	/* Append and prepend. */
+	if (SEQ_GEQ(trb->trb_seqe, trbn->trb_seqs)) {
+		if (SEQ_GEQ(trb->trb_seqe, trbn->trb_seqe))
+			return (trbn);	/* Fully covered. */
+		if ((i = SEQ_DELTA(trb->trb_seqe, trbn->trb_seqs)) > 0) {
+			m_adj(trbn->trb_m, i);
+			trbn->trb_m = m_trimhead(trbn->trb_m);
+		}
+		trb->trb_seqe = trbn->trb_seqe;
+		trb->trb_mt->m_next = trbn->trb_m;
+		trb->trb_mt = trbn->trb_mt;
+	} else if (SEQ_LEQ(trb->trb_seqs, trbn->trb_seqe)) {
+		if (SEQ_LEQ(trb->trb_seqs, trbn->trb_seqs))
+			return (trbn);	/* Fully covered. */
+		if ((i = SEQ_DELTA(trb->trb_seqs, trbn->trb_seqe)) > 0) {
+			m_adj(trb->trb_m, i);
+			trb->trb_m = m_trimhead(trb->trb_m);
+		}
+		trb->trb_seqs = trbn->trb_seqs;
+		trbn->trb_mt->m_next = trb->trb_m;
+		trb->trb_m = trbn->trb_m;
+	} else
+		return (NULL);	/* No overlap, nothing to merge. */
 
+	/* Report the number of duplicate bytes in the emptied block. */
+	trbn->trb_seqs = 0;
+	trbn->trb_seqe = i;
+	trbn->trb_m = NULL;
+	trbn->trb_mt = NULL;
+	return (trbn);
 }
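The append side of tcp_reass_merge() boils down to dropping the
duplicate prefix of the later block and splicing the remainder onto the
earlier one.  A userland sketch of the same sequence arithmetic, with a
flat buffer standing in for the mbuf chain (hypothetical names; the
kernel trims with m_adj() on the chain instead of copying):

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    struct blk {
            int start, end;         /* [start, end) in sequence space */
            char data[64];          /* stands in for an mbuf chain */
    };

    /* Merge b into a where a->end >= b->start (touching or overlapping). */
    static void
    merge_append(struct blk *a, const struct blk *b)
    {
            int dup = a->end - b->start;    /* duplicate byte count */

            assert(dup >= 0 && b->end > a->end);
            /* Skip the duplicate prefix of b, keep only the new bytes. */
            memcpy(a->data + (a->end - a->start), b->data + dup,
                b->end - b->start - dup);
            a->end = b->end;
    }

    int
    main(void)
    {
            struct blk a = { 100, 110, "0123456789" };
            struct blk b = { 105, 120, "56789abcdefghij" };

            merge_append(&a, &b);   /* 5 duplicate bytes trimmed */
            printf("[%d,%d) %.*s\n", a.start, a.end,
                a.end - a.start, a.data);
            return (0);
    }

The prepend side is the mirror image: the overlapping head of the
later-in-sequence block is trimmed and the earlier block's chain is
linked in front of it.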
@@ -691,10 +568,9 @@
 	tcp_seq sack_seq;
 	int nsacks = 0;
 
+	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(numsacks > 0,
 	    ("%s: zero sack blocks to add", __func__));
-	KASSERT(!TAILQ_EMPTY(&tp->t_trq),
-	    ("%s: reassembly queue empty", __func__));
 	KASSERT(!LIST_EMPTY(&tp->t_trq_sack),
 	    ("%s: sack list empty", __func__));
@@ -718,25 +594,15 @@
 	return (nsacks);
 }
 
-/*
- * Free the reassembly queue on tcpcb disposal or on general memory shortage.
- */
-void
-tcp_reass_qfree(struct tcpcb *tp)
+#ifdef DDB
+static void
+db_print_reassblocks(struct tcpcb *tp)
 {
-	struct trq *tqe, *tqen;
+	struct tcp_reass_block *trb;
 
-	INP_WLOCK_ASSERT(tp->t_inpcb);
-
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		m_freem(tqe->trq_m);
-		KASSERT(tp->t_trqmcnt >= tqe->trq_mcnt,
-		    ("%s: t_trqmcnt incorrect", __func__));
-		tp->t_trqmcnt -= tqe->trq_mcnt;
-		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
-		LIST_REMOVE(tqe, trq_s);
-		uma_zfree(tcp_reass_zone, tqe);
-		tcp_reass_qsize--;
+	RB_FOREACH(trb, tcp_ra, &tp->rcv_reass) {
+		db_printf("  reass block 0x%08x - 0x%08x\n",
+		    trb->trb_seqs, trb->trb_seqe);
 	}
-	tcp_timer_activate(tp, TT_REASS, 0);
 }
+#endif

==== //depot/projects/tcp_reass/netinet/tcp_var.h#18 (text+ko) ====

@@ -51,14 +51,13 @@
 #endif /* _KERNEL */
 
 /* TCP reassembly queue segment block entry. */
-struct trq {
-	TAILQ_ENTRY(trq) trq_q;		/* linked list in SEQ# order */
-	LIST_ENTRY(trq) trq_s;		/* linked list in SACK order */
-	tcp_seq	trq_seq;		/* start of block */
-	int	trq_len;		/* length of block */
-	int	trq_mcnt;		/* gross mbuf size of block */
-	struct mbuf *trq_m;		/* mbuf chain of data */
-	struct mbuf *trq_ml;		/* last mbuf in chain of data */
+struct tcp_reass_block {
+	RB_ENTRY(tcp_reass_block) trb_rb;	/* RB tree in SEQ# order */
+	LIST_ENTRY(tcp_reass_block) trb_sack;	/* linked list in SACK order */
+	tcp_seq	trb_seqs;		/* start of block */
+	tcp_seq	trb_seqe;		/* end of block */
+	struct mbuf *trb_m;		/* mbuf chain of data */
+	struct mbuf *trb_mt;		/* last mbuf in chain of data */
 };
 
 struct sackblk {
@@ -105,9 +104,9 @@
  * Organized for 16 byte cacheline efficiency.
  */
 struct tcpcb {
-	TAILQ_HEAD(trq_head, trq) t_trq; /* segment reassembly queue */
-	LIST_HEAD(trq_shead, trq) t_trq_sack; /* last additions to reass queue */
-	int	t_trqmcnt;		/* segment reassembly queue gross usage */
+	RB_HEAD(tcp_ra, tcp_reass_block) rcv_reass; /* segment reassembly queue */
+	LIST_HEAD(tcp_ra_shead, tcp_reass_block) rcv_reass_sack; /* last additions to reass queue */
+	int	rcv_reass_size;		/* data bytes in reassembly queue */
 	int	t_dupacks;		/* consecutive dup acks recd */
@@ -653,7 +652,7 @@
 int	 tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
 void	 tcp_reass_init(void);
 int	 tcp_reass_sack(struct tcpcb *, u_char *, int);
-void	 tcp_reass_qfree(struct tcpcb *);
+void	 tcp_reass_flush(struct tcpcb *);
 void	 tcp_input(struct mbuf *, int);
 u_long	 tcp_maxmtu(struct in_conninfo *, int *);
 u_long	 tcp_maxmtu6(struct in_conninfo *, int *);
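tcp_reass_sack() itself is untouched by this diff, but with the
arrival-ordered list the receiver side of SACK generation reduces to
walking that list head-first and emitting one start/end pair per block,
most recently updated block first.  A rough userland sketch
(hypothetical names; the real code writes the pairs into the TCP option
space in network byte order and honours the option length limit):

    #include <sys/queue.h>
    #include <stdint.h>
    #include <stdio.h>

    struct blk {
            LIST_ENTRY(blk) b_link;
            uint32_t b_seqs, b_seqe;
    };

    LIST_HEAD(blk_head, blk);

    /* Report up to maxblocks [start,end) pairs, most recent first. */
    static int
    fill_sack(struct blk_head *head, uint32_t sack[][2], int maxblocks)
    {
            struct blk *b;
            int n = 0;

            LIST_FOREACH(b, head, b_link) {
                    if (n == maxblocks)
                            break;
                    sack[n][0] = b->b_seqs;
                    sack[n][1] = b->b_seqe;
                    n++;
            }
            return (n);
    }

    int
    main(void)
    {
            struct blk_head head = LIST_HEAD_INITIALIZER(head);
            struct blk b1 = { .b_seqs = 1000, .b_seqe = 2000 };
            struct blk b2 = { .b_seqs = 3000, .b_seqe = 4000 };
            uint32_t sack[4][2];
            int i, n;

            LIST_INSERT_HEAD(&head, &b1, b_link);
            LIST_INSERT_HEAD(&head, &b2, b_link);  /* b2 arrived last */
            n = fill_sack(&head, sack, 4);
            for (i = 0; i < n; i++)
                    printf("SACK %u-%u\n", sack[i][0], sack[i][1]);
            return (0);
    }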