Date: Thu, 9 Apr 2015 22:13:28 +0000 (UTC) From: Gleb Smirnoff <glebius@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r281342 - head/sys/netinet Message-ID: <201504092213.t39MDS2K063576@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: glebius Date: Thu Apr 9 22:13:27 2015 New Revision: 281342 URL: https://svnweb.freebsd.org/changeset/base/281342 Log: Now that IP reassembly is no longer under single lock, book-keeping amount of allocations in V_nipq is racy. To fix that, we would simply stop doing book-keeping ourselves, and rely on UMA doing that. There could be a slight overcommit due to caches, but that isn't a big deal. o V_nipq and V_maxnipq go away. o net.inet.ip.fragpackets is now just SYSCTL_UMA_CUR() o net.inet.ip.maxfragpackets could have been just SYSCTL_UMA_MAX(), but historically it has special semantics about values of 0 and -1, so provide sysctl_maxfragpackets() to handle these special cases. o If zone limit lowers either due to net.inet.ip.maxfragpackets or due to kern.ipc.nmbclusters, then new function ipq_drain_tomax() goes over buckets and frees the oldest packets until we are in the limit. The code that (incorrectly) did that in ip_slowtimo() is removed. o ip_reass() doesn't check any limits and calls uma_zalloc(M_NOWAIT). If it fails, a new function ipq_reuse() is called. This function will find the oldest packet in the currently locked bucket, and if there is none, it will search in other buckets until success. Sponsored by: Nginx, Inc. 
Modified: head/sys/netinet/ip_input.c Modified: head/sys/netinet/ip_input.c ============================================================================== --- head/sys/netinet/ip_input.c Thu Apr 9 21:52:14 2015 (r281341) +++ head/sys/netinet/ip_input.c Thu Apr 9 22:13:27 2015 (r281342) @@ -172,12 +172,18 @@ struct ipqbucket { }; static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]); #define V_ipq VNET(ipq) +static VNET_DEFINE(int, noreass); +#define V_noreass VNET(noreass) + #define IPQ_LOCK(i) mtx_lock(&V_ipq[i].lock) +#define IPQ_TRYLOCK(i) mtx_trylock(&V_ipq[i].lock) #define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock) +#define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED) -static void maxnipq_update(void); +static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS); static void ipq_zone_change(void *); static void ip_drain_vnet(void); +static void ipq_drain_tomax(void); static void ipq_free(struct ipqhead *, struct ipq *); static inline void @@ -196,12 +202,11 @@ ipq_drop(struct ipqhead *head, struct ip ipq_free(head, fp); } -static VNET_DEFINE(int, maxnipq); /* Administrative limit on # reass queues. */ -static VNET_DEFINE(int, nipq); /* Total # of reass queues */ -#define V_maxnipq VNET(maxnipq) -#define V_nipq VNET(nipq) -SYSCTL_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET | CTLFLAG_RD, - &VNET_NAME(nipq), 0, +SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET | + CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_maxfragpackets, "I", + "Maximum number of IPv4 fragment reassembly queue entries"); +SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET, + &VNET_NAME(ipq_zone), "Current number of IPv4 fragment reassembly queue entries"); static VNET_DEFINE(int, maxfragsperpacket); @@ -346,13 +351,13 @@ ip_init(void) /* Initialize IP reassembly queue. 
*/ for (i = 0; i < IPREASS_NHASH; i++) { TAILQ_INIT(&V_ipq[i].head); - mtx_init(&V_ipq[i].lock, "IP reassembly", NULL, MTX_DEF); + mtx_init(&V_ipq[i].lock, "IP reassembly", NULL, + MTX_DEF | MTX_DUPOK); } - V_maxnipq = nmbclusters / 32; V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - maxnipq_update(); + uma_zone_set_max(V_ipq_zone, nmbclusters / 32); /* Initialize packet filter hooks. */ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; @@ -810,25 +815,27 @@ bad: * reasons. */ static void -maxnipq_update(void) +ipq_drain_tomax(void) { + int target; /* - * -1 for unlimited allocation. - */ - if (V_maxnipq < 0) - uma_zone_set_max(V_ipq_zone, 0); - /* - * Positive number for specific bound. - */ - if (V_maxnipq > 0) - uma_zone_set_max(V_ipq_zone, V_maxnipq); - /* - * Zero specifies no further fragment queue allocation. + * If we are over the maximum number of fragments, + * drain off enough to get down to the new limit, + * stripping off last elements on queues. Every + * run we strip the oldest element from each bucket. */ - if (V_maxnipq == 0) { - uma_zone_set_max(V_ipq_zone, 1); - ip_drain_vnet(); + target = uma_zone_get_max(V_ipq_zone); + while (uma_zone_get_cur(V_ipq_zone) > target) { + struct ipq *fp; + + for (int i = 0; i < IPREASS_NHASH; i++) { + IPQ_LOCK(i); + fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); + if (fp != NULL) + ipq_timeout(&V_ipq[i].head, fp); + IPQ_UNLOCK(i); + } } } @@ -836,70 +843,86 @@ static void ipq_zone_change(void *tag) { - if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { - V_maxnipq = nmbclusters / 32; - maxnipq_update(); - } + uma_zone_set_max(V_ipq_zone, nmbclusters / 32); + ipq_drain_tomax(); } +/* + * Change the limit on the UMA zone, or disable the fragment allocation + * at all. Since 0 and -1 are special values here, we need our own handler, + * instead of sysctl_handle_uma_zone_max(). 
+ */ static int -sysctl_maxnipq(SYSCTL_HANDLER_ARGS) +sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) { - int error, i; + int error, max; - i = V_maxnipq; - error = sysctl_handle_int(oidp, &i, 0, req); + if (V_noreass == 0) { + max = uma_zone_get_max(V_ipq_zone); + if (max == 0) + max = -1; + } else + max = 0; + error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); - - /* - * XXXRW: Might be a good idea to sanity check the argument and place - * an extreme upper bound. - */ - if (i < -1) + if (max > 0) { + /* + * XXXRW: Might be a good idea to sanity check the argument + * and place an extreme upper bound. + */ + max = uma_zone_set_max(V_ipq_zone, max); + ipq_drain_tomax(); + V_noreass = 0; + } else if (max == 0) { + V_noreass = 1; + ip_drain_vnet(); + } else if (max == -1) { + V_noreass = 0; + uma_zone_set_max(V_ipq_zone, 0); + } else return (EINVAL); - V_maxnipq = i; - maxnipq_update(); return (0); } -SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, - NULL, 0, sysctl_maxnipq, "I", - "Maximum number of IPv4 fragment reassembly queue entries"); - #define M_IP_FRAG M_PROTO9 /* - * Attempt to purge something from the reassembly queue to make - * room. - * - * Must be called without any IPQ locks held, as it will attempt - * to lock each in turn. - * - * 'skip_bucket' is the bucket with which to skip over, or -1 to - * not skip over anything. - * - * Returns the bucket being freed, or -1 for no action. + * Seek for old fragment queue header that can be reused. Try to + * reuse a header from currently locked hash bucket. 
*/ -static int -ip_reass_purge_element(int skip_bucket) +static struct ipq * +ipq_reuse(int start) { + struct ipq *fp; int i; - struct ipq *r; - for (i = 0; i < IPREASS_NHASH; i++) { - if (skip_bucket > -1 && i == skip_bucket) + IPQ_LOCK_ASSERT(start); + + for (i = start;; i++) { + if (i == IPREASS_NHASH) + i = 0; + if (i != start && IPQ_TRYLOCK(i) == 0) continue; - IPQ_LOCK(i); - r = TAILQ_LAST(&V_ipq[i].head, ipqhead); - if (r) { - ipq_timeout(&V_ipq[i].head, r); - IPQ_UNLOCK(i); - return (i); + fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); + if (fp) { + struct mbuf *m; + + IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); + while (fp->ipq_frags) { + m = fp->ipq_frags; + fp->ipq_frags = m->m_nextpkt; + m_freem(m); + } + TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list); + if (i != start) + IPQ_UNLOCK(i); + IPQ_LOCK_ASSERT(start); + return (fp); } - IPQ_UNLOCK(i); + if (i != start) + IPQ_UNLOCK(i); } - return (-1); } /* @@ -917,7 +940,7 @@ ip_reass(struct mbuf *m) { struct ip *ip; struct mbuf *p, *q, *nq, *t; - struct ipq *fp = NULL; + struct ipq *fp; struct ipqhead *head; int i, hlen, next; u_int8_t ecn, ecn0; @@ -925,10 +948,12 @@ ip_reass(struct mbuf *m) #ifdef RSS uint32_t rss_hash, rss_type; #endif - int do_purge = 0; - /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ - if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { + /* + * If no reassembling or maxfragsperpacket are 0, + * never accept fragments. + */ + if (V_noreass == 1 || V_maxfragsperpacket == 0) { IPSTAT_INC(ips_fragments); IPSTAT_INC(ips_fragdropped); m_freem(m); @@ -989,38 +1014,14 @@ ip_reass(struct mbuf *m) mac_ipq_match(m, fp) && #endif ip->ip_p == fp->ipq_p) - goto found; - - fp = NULL; - - /* - * Attempt to trim the number of allocated fragment queues if it - * exceeds the administrative limit. 
- */ - if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { - /* - * drop something from the tail of the current queue - * before proceeding further - */ - struct ipq *q = TAILQ_LAST(head, ipqhead); - if (q == NULL) { /* gak */ - /* - * Defer doing this until later; when the - * lock is no longer held. - */ - do_purge = 1; - } else - ipq_timeout(head, q); - } - -found: + break; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) - goto dropfrag; + fp = ipq_reuse(hash); #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); @@ -1030,7 +1031,6 @@ found: mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); - V_nipq++; fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; @@ -1196,7 +1196,6 @@ found: ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); - V_nipq--; uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); @@ -1206,19 +1205,6 @@ found: IPSTAT_INC(ips_reassembled); IPQ_UNLOCK(hash); - /* - * Do the delayed purge to keep fragment counts under - * the configured maximum. - * - * This is delayed so that it's not done with another IPQ bucket - * lock held. - * - * Note that we pass in the bucket to /skip/ over, not - * the bucket to /purge/. - */ - if (do_purge) - ip_reass_purge_element(hash); - #ifdef RSS /* * Query the RSS layer for the flowid / flowtype for the @@ -1281,7 +1267,6 @@ ipq_free(struct ipqhead *fhp, struct ipq } TAILQ_REMOVE(fhp, fp, ipq_list); uma_zfree(V_ipq_zone, fp); - V_nipq--; } /* @@ -1306,21 +1291,6 @@ ip_slowtimo(void) ipq_timeout(&V_ipq[i].head, fp); IPQ_UNLOCK(i); } - /* - * If we are over the maximum number of fragments - * (due to the limit being lowered), drain off - * enough to get down to the new limit. 
- */ - if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { - for (i = 0; i < IPREASS_NHASH; i++) { - IPQ_LOCK(i); - while (V_nipq > V_maxnipq && - !TAILQ_EMPTY(&V_ipq[i].head)) - ipq_drop(&V_ipq[i].head, - TAILQ_FIRST(&V_ipq[i].head)); - IPQ_UNLOCK(i); - } - } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP();
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201504092213.t39MDS2K063576>