From owner-svn-src-all@FreeBSD.ORG Tue Aug 18 20:28:59 2009 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 47CE2106568B; Tue, 18 Aug 2009 20:28:59 +0000 (UTC) (envelope-from kmacy@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 346B58FC15; Tue, 18 Aug 2009 20:28:58 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id n7IKSw9C027603; Tue, 18 Aug 2009 20:28:58 GMT (envelope-from kmacy@svn.freebsd.org) Received: (from kmacy@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id n7IKSwJ9027599; Tue, 18 Aug 2009 20:28:58 GMT (envelope-from kmacy@svn.freebsd.org) Message-Id: <200908182028.n7IKSwJ9027599@svn.freebsd.org> From: Kip Macy Date: Tue, 18 Aug 2009 20:28:58 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r196368 - in head/sys: net netinet X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 18 Aug 2009 20:28:59 -0000 Author: kmacy Date: Tue Aug 18 20:28:58 2009 New Revision: 196368 URL: http://svn.freebsd.org/changeset/base/196368 Log: - change the interface to flowtable_lookup so that we don't rely on the mbuf for obtaining the fib index - check that a cached flow corresponds to the same fib index as the packet for which we are doing the lookup - at interface detach time flush any flows referencing stale rtentrys associated with the interface that is going away (fixes reported panics) - reduce the time between cleans in case the cleaner is running at the time the eventhandler is called and the wakeup is missed less time will elapse before the eventhandler returns - separate per-vnet initialization from global initialization (pointed out by jeli@) Reviewed by: sam@ Approved by: re@ Modified: head/sys/net/flowtable.c head/sys/net/flowtable.h head/sys/netinet/ip_output.c Modified: head/sys/net/flowtable.c ============================================================================== --- head/sys/net/flowtable.c Tue Aug 18 20:25:02 2009 (r196367) +++ head/sys/net/flowtable.c Tue Aug 18 20:28:58 2009 (r196368) @@ -29,6 +29,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "opt_route.h" #include "opt_mpath.h" +#include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); @@ -36,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -66,6 +68,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include struct ipv4_tuple { uint16_t ip_sport; /* source port */ @@ -94,8 +97,9 @@ union ipv6_flow { struct flentry { volatile uint32_t f_fhash; /* hash flowing forward */ uint16_t f_flags; /* flow flags */ - uint8_t f_pad; /* alignment */ + uint8_t f_pad; uint8_t f_proto; /* protocol */ + uint32_t f_fibnum; /* fib index */ uint32_t f_uptime; /* uptime at last access */ struct flentry *f_next; /* pointer to collision entry */ volatile struct rtentry *f_rt; /* rtentry for flow */ @@ -173,6 +177,10 @@ static VNET_DEFINE(uma_zone_t, flow_ipv6 #define V_flow_ipv4_zone VNET(flow_ipv4_zone) #define V_flow_ipv6_zone VNET(flow_ipv6_zone) +static struct cv flowclean_cv; +static struct mtx flowclean_lock; +static uint32_t flowclean_cycles; + /* * TODO: * - Make flowtable stats per-cpu, aggregated at sysctl call time, @@ -288,10 +296,10 @@ SYSCTL_VNET_PROC(_net_inet_flowtable, OI #ifndef RADIX_MPATH static void -in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fib) +in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum) { - rtalloc_ign_fib(ro, 0, fib); + rtalloc_ign_fib(ro, 0, fibnum); } #endif @@ -425,7 +433,7 @@ static bitstr_t * flowtable_mask(struct flowtable *ft) { bitstr_t *mask; - + if (ft->ft_flags & FL_PCPU) mask = ft->ft_masks[curcpu]; else @@ -501,7 +509,7 @@ flowtable_set_hashkey(struct flentry *fl static int flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, - uint8_t proto, struct route *ro, uint16_t flags) + uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags) { struct flentry *fle, *fletail, *newfle, **flep; int depth; @@ -564,6 +572,7 @@ skip: fle->f_rt = ro->ro_rt; fle->f_lle = ro->ro_lle; fle->f_fhash = hash; + fle->f_fibnum = fibnum; fle->f_uptime = time_uptime; FL_ENTRY_UNLOCK(ft, hash); return (0); @@ -591,13 +600,13 @@ flowtable_key_equal(struct flentry *fle, } int -flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro) +flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro, uint32_t fibnum) { uint32_t key[9], hash; struct flentry *fle; uint16_t flags; uint8_t proto = 0; - int error = 0, fib = 0; + int error = 0; struct rtentry *rt; struct llentry *lle; @@ -640,6 +649,7 @@ keycheck: && fle->f_fhash == hash && flowtable_key_equal(fle, key) && (proto == fle->f_proto) + && (fibnum == fle->f_fibnum) && (rt->rt_flags & RTF_UP) && (rt->rt_ifp != NULL)) { V_flowtable_hits++; @@ -668,10 +678,8 @@ uncached: * of arpresolve with an rt_check variant that expected to * receive the route locked */ - if (m != NULL) - fib = M_GETFIB(m); - ft->ft_rtalloc(ro, hash, fib); + ft->ft_rtalloc(ro, hash, fibnum); if (ro->ro_rt == NULL) error = ENETUNREACH; else { @@ -692,7 +700,7 @@ uncached: ro->ro_rt = NULL; return (ENOENT); } - error = flowtable_insert(ft, hash, key, proto, + error = flowtable_insert(ft, hash, key, proto, fibnum, ro, flags); if (error) { @@ -791,35 +799,6 @@ flowtable_alloc(int nentry, int flags) return (ft); } -static void -flowtable_init(const void *unused __unused) -{ - - V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4), - NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET); - V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6), - NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET); - uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows); - uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows); - V_flowtable_ready = 1; -} - -VNET_SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, - flowtable_init, NULL); - -#ifdef VIMAGE -static void -flowtable_uninit(const void *unused __unused) -{ - - uma_zdestroy(V_flow_ipv4_zone); - uma_zdestroy(V_flow_ipv6_zone); -} - -VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, - flowtable_uninit, NULL); -#endif - /* * The rest of the code is devoted to garbage collection of expired entries. * It is a new additon made necessary by the switch to dynamically allocating @@ -973,12 +952,30 @@ flowtable_cleaner(void) } VNET_LIST_RUNLOCK(); + flowclean_cycles++; /* * The 20 second interval between cleaning checks * is arbitrary */ - pause("flowcleanwait", 20*hz); + mtx_lock(&flowclean_lock); + cv_broadcast(&flowclean_cv); + cv_timedwait(&flowclean_cv, &flowclean_lock, 10*hz); + mtx_unlock(&flowclean_lock); + } +} + +static void +flowtable_flush(void *unused __unused) +{ + uint64_t start; + + mtx_lock(&flowclean_lock); + start = flowclean_cycles; + while (start == flowclean_cycles) { + cv_broadcast(&flowclean_cv); + cv_wait(&flowclean_cv, &flowclean_lock); } + mtx_unlock(&flowclean_lock); } static struct kproc_desc flow_kp = { @@ -988,3 +985,159 @@ static struct kproc_desc flow_kp = { }; SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp); +static void +flowtable_init_vnet(const void *unused __unused) +{ + + V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4), + NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET); + V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6), + NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET); + uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows); + uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows); +} +VNET_SYSINIT(flowtable_init_vnet, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, + flowtable_init_vnet, NULL); + +static void +flowtable_init(const void *unused __unused) +{ + + cv_init(&flowclean_cv, "flowcleanwait"); + mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF); + EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL, + EVENTHANDLER_PRI_ANY); + V_flowtable_ready = 1; +} +SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, + flowtable_init, NULL); + + +#ifdef VIMAGE +static void +flowtable_uninit(const void *unused __unused) +{ + + uma_zdestroy(V_flow_ipv4_zone); + uma_zdestroy(V_flow_ipv6_zone); +} + +VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, + flowtable_uninit, NULL); +#endif + +#ifdef DDB +static bitstr_t * +flowtable_mask_pcpu(struct flowtable *ft, int cpuid) +{ + bitstr_t *mask; + + if (ft->ft_flags & FL_PCPU) + mask = ft->ft_masks[cpuid]; + else + mask = ft->ft_masks[0]; + + return (mask); +} + +static struct flentry ** +flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) +{ + struct flentry **fle; + int index = (hash % ft->ft_size); + + if (ft->ft_flags & FL_PCPU) { + fle = &ft->ft_table.pcpu[cpuid][index]; + } else { + fle = &ft->ft_table.global[index]; + } + + return (fle); +} + +static void +flow_show(struct flowtable *ft, struct flentry *fle) +{ + int idle_time; + int rt_valid; + + idle_time = (int)(time_uptime - fle->f_uptime); + rt_valid = fle->f_rt != NULL; + db_printf("hash=0x%08x idle_time=%03d rt=%p ifp=%p", + fle->f_fhash, idle_time, + fle->f_rt, rt_valid ? fle->f_rt->rt_ifp : NULL); + if (rt_valid && (fle->f_rt->rt_flags & RTF_UP)) + db_printf(" RTF_UP "); + if (fle->f_flags & FL_STALE) + db_printf(" FL_STALE "); + db_printf("\n"); +} + +static void +flowtable_show(struct flowtable *ft, int cpuid) +{ + int curbit = 0; + struct flentry *fle, **flehead; + bitstr_t *mask, *tmpmask; + + db_printf("cpu: %d\n", cpuid); + mask = flowtable_mask_pcpu(ft, cpuid); + tmpmask = ft->ft_tmpmask; + memcpy(tmpmask, mask, ft->ft_size/8); + /* + * XXX Note to self, bit_ffs operates at the byte level + * and thus adds gratuitous overhead + */ + bit_ffs(tmpmask, ft->ft_size, &curbit); + while (curbit != -1) { + if (curbit >= ft->ft_size || curbit < -1) { + db_printf("warning: bad curbit value %d \n", + curbit); + break; + } + + flehead = flowtable_entry_pcpu(ft, curbit, cpuid); + fle = *flehead; + + while (fle != NULL) { + flow_show(ft, fle); + fle = fle->f_next; + continue; + } + bit_clear(tmpmask, curbit); + bit_ffs(tmpmask, ft->ft_size, &curbit); + } +} + +static void +flowtable_show_vnet(void) +{ + struct flowtable *ft; + int i; + + ft = V_flow_list_head; + while (ft != NULL) { + if (ft->ft_flags & FL_PCPU) { + for (i = 0; i <= mp_maxid; i++) { + if (CPU_ABSENT(i)) + continue; + flowtable_show(ft, i); + } + } else { + flowtable_show(ft, 0); + } + ft = ft->ft_next; + } +} + +DB_SHOW_COMMAND(flowtables, db_show_flowtables) +{ + VNET_ITERATOR_DECL(vnet_iter); + + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + flowtable_show_vnet(); + CURVNET_RESTORE(); + } +} +#endif Modified: head/sys/net/flowtable.h ============================================================================== --- head/sys/net/flowtable.h Tue Aug 18 20:25:02 2009 (r196367) +++ head/sys/net/flowtable.h Tue Aug 18 20:28:58 2009 (r196368) @@ -49,7 +49,7 @@ struct flowtable *flowtable_alloc(int ne * */ int flowtable_lookup(struct flowtable *ft, struct mbuf *m, - struct route *ro); + struct route *ro, uint32_t fibnum); #endif /* _KERNEL */ #endif Modified: head/sys/netinet/ip_output.c ============================================================================== --- head/sys/netinet/ip_output.c Tue Aug 18 20:25:02 2009 (r196367) +++ head/sys/netinet/ip_output.c Tue Aug 18 20:28:58 2009 (r196368) @@ -157,7 +157,7 @@ ip_output(struct mbuf *m, struct mbuf *o * longer than that long for the stability of ro_rt. The * flow ID assignment must have happened before this point. */ - if (flowtable_lookup(V_ip_ft, m, ro) == 0) + if (flowtable_lookup(V_ip_ft, m, ro, M_GETFIB(m)) == 0) nortfree = 1; #endif }