From owner-svn-src-stable-8@FreeBSD.ORG Thu Apr 1 00:36:41 2010 Return-Path: Delivered-To: svn-src-stable-8@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 073DD1065670; Thu, 1 Apr 2010 00:36:41 +0000 (UTC) (envelope-from kmacy@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id E7F798FC0C; Thu, 1 Apr 2010 00:36:40 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id o310aeYo070173; Thu, 1 Apr 2010 00:36:40 GMT (envelope-from kmacy@svn.freebsd.org) Received: (from kmacy@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id o310ae4T070170; Thu, 1 Apr 2010 00:36:40 GMT (envelope-from kmacy@svn.freebsd.org) Message-Id: <201004010036.o310ae4T070170@svn.freebsd.org> From: Kip Macy Date: Thu, 1 Apr 2010 00:36:40 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org X-SVN-Group: stable-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r206024 - in stable/8/sys: net netinet X-BeenThere: svn-src-stable-8@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: SVN commit messages for only the 8-stable src tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 01 Apr 2010 00:36:41 -0000 Author: kmacy Date: Thu Apr 1 00:36:40 2010 New Revision: 206024 URL: http://svn.freebsd.org/changeset/base/206024 Log: MFC 205066, 205069, 205093, 205097, 205488: r205066: Log: - restructure flowtable to support ipv6 - add a name argument to flowtable_alloc for printing with ddb commands - extend ddb commands to print destination address or 4-tuples - don't parse ports in ulp header if FL_HASH_ALL is not passed - add kern_flowtable_insert to enable more generic use of flowtable (e.g. system calls for adding entries) - don't hash loopback addresses - cleanup whitespace - keep statistics per-cpu for per-cpu flowtables to avoid cache line contention - add sysctls to accumulate stats and report aggregate r205069: Log: fix stats reporting sysctl r205093: Log: re-update copyright to 2010 pointed out by danfe@ r205097: Log: flowtable_get_hashkey is only used by a DDB function - move under #ifdef DDB pointed out by jkim@ r205488: Log: - boot-time size the ipv4 flowtable and the maximum number of flows - increase flow cleaning frequency and decrease flow caching time when near the flow limit - stop allocating new flows when within 3% of maxflows don't start allocating again until below 12.5% Modified: stable/8/sys/net/flowtable.c stable/8/sys/net/flowtable.h stable/8/sys/net/if_llatbl.c stable/8/sys/net/if_llatbl.h stable/8/sys/netinet/ip_input.c stable/8/sys/netinet/ip_output.c Directory Properties: stable/8/sys/net/ (props changed) Modified: stable/8/sys/net/flowtable.c ============================================================================== --- stable/8/sys/net/flowtable.c Wed Mar 31 23:24:42 2010 (r206023) +++ stable/8/sys/net/flowtable.c Thu Apr 1 00:36:40 2010 (r206024) @@ -1,6 +1,6 @@ /************************************************************************** -Copyright (c) 2008-2009, BitGravity Inc. +Copyright (c) 2008-2010, BitGravity Inc. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,6 +30,8 @@ POSSIBILITY OF SUCH DAMAGE. #include "opt_route.h" #include "opt_mpath.h" #include "opt_ddb.h" +#include "opt_inet.h" +#include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); @@ -45,6 +47,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -63,6 +66,9 @@ __FBSDID("$FreeBSD$"); #include #include #include +#ifdef INET6 +#include +#endif #include #include #include @@ -140,31 +146,42 @@ union flentryp { struct flentry **pcpu[MAXCPU]; }; +struct flowtable_stats { + uint64_t ft_collisions; + uint64_t ft_allocated; + uint64_t ft_misses; + uint64_t ft_max_depth; + uint64_t ft_free_checks; + uint64_t ft_frees; + uint64_t ft_hits; + uint64_t ft_lookups; +} __aligned(CACHE_LINE_SIZE); + struct flowtable { + struct flowtable_stats ft_stats[MAXCPU]; int ft_size; int ft_lock_count; uint32_t ft_flags; - uint32_t ft_collisions; - uint32_t ft_allocated; - uint32_t ft_misses; - uint64_t ft_hits; - - uint32_t ft_udp_idle; - uint32_t ft_fin_wait_idle; - uint32_t ft_syn_idle; - uint32_t ft_tcp_idle; - + char *ft_name; fl_lock_t *ft_lock; fl_lock_t *ft_unlock; fl_rtalloc_t *ft_rtalloc; + /* + * XXX need to pad out + */ struct mtx *ft_locks; - - union flentryp ft_table; bitstr_t *ft_masks[MAXCPU]; bitstr_t *ft_tmpmask; struct flowtable *ft_next; -}; + + uint32_t ft_count __aligned(CACHE_LINE_SIZE); + uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE); + uint32_t ft_fin_wait_idle; + uint32_t ft_syn_idle; + uint32_t ft_tcp_idle; + boolean_t ft_full; +} __aligned(CACHE_LINE_SIZE); static struct proc *flowcleanerproc; static VNET_DEFINE(struct flowtable *, flow_list_head); @@ -177,16 +194,30 @@ static VNET_DEFINE(uma_zone_t, flow_ipv6 #define V_flow_ipv4_zone VNET(flow_ipv4_zone) #define V_flow_ipv6_zone VNET(flow_ipv6_zone) + static struct cv flowclean_cv; static struct mtx flowclean_lock; static uint32_t flowclean_cycles; +static uint32_t flowclean_freq; + +#ifdef FLOWTABLE_DEBUG +#define FLDPRINTF(ft, flags, fmt, ...) \ +do { \ + if ((ft)->ft_flags & (flags)) \ + printf((fmt), __VA_ARGS__); \ +} while (0); \ + +#else +#define FLDPRINTF(ft, flags, fmt, ...) + +#endif + /* * TODO: * - Make flowtable stats per-cpu, aggregated at sysctl call time, * to avoid extra cache evictions caused by incrementing a shared * counter - * - add IPv6 support to flow lookup * - add sysctls to resize && flush flow tables * - Add per flowtable sysctls for statistics and configuring timeouts * - add saturation counter to rtentry to support per-packet load-balancing @@ -200,29 +231,15 @@ static uint32_t flowclean_cycles; */ VNET_DEFINE(int, flowtable_enable) = 1; static VNET_DEFINE(int, flowtable_debug); -static VNET_DEFINE(int, flowtable_hits); -static VNET_DEFINE(int, flowtable_lookups); -static VNET_DEFINE(int, flowtable_misses); -static VNET_DEFINE(int, flowtable_frees); -static VNET_DEFINE(int, flowtable_free_checks); -static VNET_DEFINE(int, flowtable_max_depth); -static VNET_DEFINE(int, flowtable_collisions); static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE; static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE; static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE; static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE; -static VNET_DEFINE(int, flowtable_nmbflows) = 4096; +static VNET_DEFINE(int, flowtable_nmbflows); static VNET_DEFINE(int, flowtable_ready) = 0; #define V_flowtable_enable VNET(flowtable_enable) #define V_flowtable_debug VNET(flowtable_debug) -#define V_flowtable_hits VNET(flowtable_hits) -#define V_flowtable_lookups VNET(flowtable_lookups) -#define V_flowtable_misses VNET(flowtable_misses) -#define V_flowtable_frees VNET(flowtable_frees) -#define V_flowtable_free_checks VNET(flowtable_free_checks) -#define V_flowtable_max_depth VNET(flowtable_max_depth) -#define V_flowtable_collisions VNET(flowtable_collisions) #define V_flowtable_syn_expire VNET(flowtable_syn_expire) #define V_flowtable_udp_expire VNET(flowtable_udp_expire) #define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire) @@ -235,20 +252,6 @@ SYSCTL_VNET_INT(_net_inet_flowtable, OID &VNET_NAME(flowtable_debug), 0, "print debug info."); SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW, &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD, - &VNET_NAME(flowtable_hits), 0, "# flowtable hits."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD, - &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD, - &VNET_NAME(flowtable_misses), 0, "#flowtable misses."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD, - &VNET_NAME(flowtable_frees), 0, "#flows freed."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD, - &VNET_NAME(flowtable_free_checks), 0, "#flows free checks."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD, - &VNET_NAME(flowtable_max_depth), 0, "max collision list length."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD, - &VNET_NAME(flowtable_collisions), 0, "#flowtable collisions."); /* * XXX This does not end up updating timeouts at runtime @@ -298,6 +301,77 @@ SYSCTL_VNET_PROC(_net_inet_flowtable, OI CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU", "Maximum number of flows allowed"); + + +#define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field) + +static void +fs_print(struct sbuf *sb, struct flowtable_stats *fs) +{ + + FS_PRINT(sb, collisions); + FS_PRINT(sb, allocated); + FS_PRINT(sb, misses); + FS_PRINT(sb, max_depth); + FS_PRINT(sb, free_checks); + FS_PRINT(sb, frees); + FS_PRINT(sb, hits); + FS_PRINT(sb, lookups); +} + +static void +flowtable_show_stats(struct sbuf *sb, struct flowtable *ft) +{ + int i; + struct flowtable_stats fs, *pfs; + + if (ft->ft_flags & FL_PCPU) { + bzero(&fs, sizeof(fs)); + pfs = &fs; + for (i = 0; i <= mp_maxid; i++) { + if (CPU_ABSENT(i)) + continue; + pfs->ft_collisions += ft->ft_stats[i].ft_collisions; + pfs->ft_allocated += ft->ft_stats[i].ft_allocated; + pfs->ft_misses += ft->ft_stats[i].ft_misses; + pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks; + pfs->ft_frees += ft->ft_stats[i].ft_frees; + pfs->ft_hits += ft->ft_stats[i].ft_hits; + pfs->ft_lookups += ft->ft_stats[i].ft_lookups; + if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth) + pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth; + } + } else { + pfs = &ft->ft_stats[0]; + } + fs_print(sb, pfs); +} + +static int +sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS) +{ + struct flowtable *ft; + struct sbuf *sb; + int error; + + sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN); + + ft = V_flow_list_head; + while (ft != NULL) { + sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name); + flowtable_show_stats(sb, ft); + ft = ft->ft_next; + } + sbuf_finish(sb); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + + return (error); +} +SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics"); + + #ifndef RADIX_MPATH static void in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum) @@ -342,52 +416,122 @@ flowtable_pcpu_unlock(struct flowtable * #define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash)) #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash)) -#define FL_STALE (1<<8) -#define FL_IPV6 (1<<9) +#define FL_STALE (1<<8) +#define FL_IPV6 (1<<9) +#define FL_OVERWRITE (1<<10) -static uint32_t -ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro, - uint32_t *key, uint16_t *flags, uint8_t *protop) +void +flow_invalidate(struct flentry *fle) { - uint16_t sport = 0, dport = 0; - struct ip *ip = NULL; - uint8_t proto = 0; + + fle->f_flags |= FL_STALE; +} + +static __inline int +proto_to_flags(uint8_t proto) +{ + int flag; + + switch (proto) { + case IPPROTO_TCP: + flag = FL_TCP; + break; + case IPPROTO_SCTP: + flag = FL_SCTP; + break; + case IPPROTO_UDP: + flag = FL_UDP; + break; + default: + flag = 0; + break; + } + + return (flag); +} + +static __inline int +flags_to_proto(int flags) +{ + int proto, protoflags; + + protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP); + switch (protoflags) { + case FL_TCP: + proto = IPPROTO_TCP; + break; + case FL_SCTP: + proto = IPPROTO_SCTP; + break; + case FL_UDP: + proto = IPPROTO_UDP; + break; + default: + proto = 0; + break; + } + return (proto); +} + +#ifdef INET +#ifdef FLOWTABLE_DEBUG +static void +ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin, + struct sockaddr_in *dsin) +{ + char saddr[4*sizeof "123"], daddr[4*sizeof "123"]; + + if (flags & FL_HASH_ALL) { + inet_ntoa_r(ssin->sin_addr, saddr); + inet_ntoa_r(dsin->sin_addr, daddr); + printf("proto=%d %s:%d->%s:%d\n", + proto, saddr, ntohs(ssin->sin_port), daddr, + ntohs(dsin->sin_port)); + } else { + inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr); + printf("proto=%d %s\n", proto, daddr); + } + +} +#endif + +static int +ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, + struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags) +{ + struct ip *ip; + uint8_t proto; int iphlen; - uint32_t hash; - struct sockaddr_in *sin; struct tcphdr *th; struct udphdr *uh; struct sctphdr *sh; + uint16_t sport, dport; - if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0)) - return (0); + proto = sport = dport = 0; + ip = mtod(m, struct ip *); + dsin->sin_family = AF_INET; + dsin->sin_len = sizeof(*dsin); + dsin->sin_addr = ip->ip_dst; + ssin->sin_family = AF_INET; + ssin->sin_len = sizeof(*ssin); + ssin->sin_addr = ip->ip_src; - key[1] = key[0] = 0; - sin = (struct sockaddr_in *)&ro->ro_dst; - if (m != NULL) { - ip = mtod(m, struct ip *); - sin->sin_family = AF_INET; - sin->sin_len = sizeof(*sin); - sin->sin_addr = ip->ip_dst; - } else - *flags &= ~FL_HASH_PORTS; - - key[2] = sin->sin_addr.s_addr; - - if ((*flags & FL_HASH_PORTS) == 0) + proto = ip->ip_p; + if ((*flags & FL_HASH_ALL) == 0) { + FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ", + *flags); goto skipports; + } - proto = ip->ip_p; iphlen = ip->ip_hl << 2; /* XXX options? */ - key[1] = ip->ip_src.s_addr; - + switch (proto) { case IPPROTO_TCP: th = (struct tcphdr *)((caddr_t)ip + iphlen); - sport = ntohs(th->th_sport); - dport = ntohs(th->th_dport); - *flags |= th->th_flags; - if (*flags & TH_RST) + sport = th->th_sport; + dport = th->th_dport; + if ((*flags & FL_HASH_ALL) && + (th->th_flags & (TH_RST|TH_FIN))) *flags |= FL_STALE; break; case IPPROTO_UDP: @@ -401,38 +545,288 @@ ipv4_flow_lookup_hash_internal(struct mb dport = sh->dest_port; break; default: - if (*flags & FL_HASH_PORTS) - goto noop; + FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto); + return (ENOTSUP); /* no port - hence not a protocol we care about */ break; } - *protop = proto; - /* - * If this is a transmit route cache then - * hash all flows to a given destination to - * the same bucket - */ - if ((*flags & FL_HASH_PORTS) == 0) - proto = sport = dport = 0; +skipports: + *flags |= proto_to_flags(proto); + ssin->sin_port = sport; + dsin->sin_port = dport; + return (0); +} - ((uint16_t *)key)[0] = sport; - ((uint16_t *)key)[1] = dport; +static uint32_t +ipv4_flow_lookup_hash_internal( + struct sockaddr_in *ssin, struct sockaddr_in *dsin, + uint32_t *key, uint16_t flags) +{ + uint16_t sport, dport; + uint8_t proto; + int offset = 0; -skipports: - hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto); - if (m != NULL && (m->m_flags & M_FLOWID) == 0) { - m->m_flags |= M_FLOWID; - m->m_pkthdr.flowid = hash; + if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0)) + return (0); + proto = flags_to_proto(flags); + sport = dport = key[2] = key[1] = key[0] = 0; + if ((ssin != NULL) && (flags & FL_HASH_ALL)) { + key[1] = ssin->sin_addr.s_addr; + sport = ssin->sin_port; + } + if (dsin != NULL) { + key[2] = dsin->sin_addr.s_addr; + dport = dsin->sin_port; + } + if (flags & FL_HASH_ALL) { + ((uint16_t *)key)[0] = sport; + ((uint16_t *)key)[1] = dport; + } else + offset = V_flow_hashjitter + proto; + + return (jenkins_hashword(key, 3, offset)); +} + +static struct flentry * +flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m) +{ + struct sockaddr_storage ssa, dsa; + uint16_t flags; + struct sockaddr_in *dsin, *ssin; + + dsin = (struct sockaddr_in *)&dsa; + ssin = (struct sockaddr_in *)&ssa; + flags = ft->ft_flags; + if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0) + return (NULL); + + return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags)); +} + +void +flow_to_route(struct flentry *fle, struct route *ro) +{ + uint32_t *hashkey = NULL; + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&ro->ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; + sin->sin_addr.s_addr = hashkey[2]; + ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt); + ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle); +} +#endif /* INET */ + +#ifdef INET6 +/* + * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, + * then it sets p to point at the offset "len" in the mbuf. WARNING: the + * pointer might become stale after other pullups (but we never use it + * this way). + */ +#define PULLUP_TO(_len, p, T) \ +do { \ + int x = (_len) + sizeof(T); \ + if ((m)->m_len < x) { \ + goto receive_failed; \ + } \ + p = (mtod(m, char *) + (_len)); \ +} while (0) + +#define TCP(p) ((struct tcphdr *)(p)) +#define SCTP(p) ((struct sctphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) + +static int +ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, + struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags) +{ + struct ip6_hdr *ip6; + uint8_t proto; + int hlen; + uint16_t src_port, dst_port; + u_short offset; + void *ulp; + + offset = hlen = src_port = dst_port = 0; + ulp = NULL; + ip6 = mtod(m, struct ip6_hdr *); + hlen = sizeof(struct ip6_hdr); + proto = ip6->ip6_nxt; + + if ((*flags & FL_HASH_ALL) == 0) + goto skipports; + + while (ulp == NULL) { + switch (proto) { + case IPPROTO_ICMPV6: + case IPPROTO_OSPFIGP: + case IPPROTO_PIM: + case IPPROTO_CARP: + case IPPROTO_ESP: + case IPPROTO_NONE: + ulp = ip6; + break; + case IPPROTO_TCP: + PULLUP_TO(hlen, ulp, struct tcphdr); + dst_port = TCP(ulp)->th_dport; + src_port = TCP(ulp)->th_sport; + if ((*flags & FL_HASH_ALL) && + (TCP(ulp)->th_flags & (TH_RST|TH_FIN))) + *flags |= FL_STALE; + break; + case IPPROTO_SCTP: + PULLUP_TO(hlen, ulp, struct sctphdr); + src_port = SCTP(ulp)->src_port; + dst_port = SCTP(ulp)->dest_port; + break; + case IPPROTO_UDP: + PULLUP_TO(hlen, ulp, struct udphdr); + dst_port = UDP(ulp)->uh_dport; + src_port = UDP(ulp)->uh_sport; + break; + case IPPROTO_HOPOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + case IPPROTO_ROUTING: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_rthdr); + hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; + proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; + ulp = NULL; + break; + case IPPROTO_FRAGMENT: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_frag); + hlen += sizeof (struct ip6_frag); + proto = ((struct ip6_frag *)ulp)->ip6f_nxt; + offset = ((struct ip6_frag *)ulp)->ip6f_offlg & + IP6F_OFF_MASK; + ulp = NULL; + break; + case IPPROTO_DSTOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + case IPPROTO_AH: /* RFC 2402 */ + PULLUP_TO(hlen, ulp, struct ip6_ext); + hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; + proto = ((struct ip6_ext *)ulp)->ip6e_nxt; + ulp = NULL; + break; + default: + PULLUP_TO(hlen, ulp, struct ip6_ext); + break; + } + } + + if (src_port == 0) { + receive_failed: + return (ENOTSUP); } - return (hash); -noop: - *protop = proto; +skipports: + dsin6->sin6_family = AF_INET6; + dsin6->sin6_len = sizeof(*dsin6); + dsin6->sin6_port = dst_port; + memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr)); + + ssin6->sin6_family = AF_INET6; + ssin6->sin6_len = sizeof(*ssin6); + ssin6->sin6_port = src_port; + memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr)); + *flags |= proto_to_flags(proto); + return (0); } +#define zero_key(key) \ +do { \ + key[0] = 0; \ + key[1] = 0; \ + key[2] = 0; \ + key[3] = 0; \ + key[4] = 0; \ + key[5] = 0; \ + key[6] = 0; \ + key[7] = 0; \ + key[8] = 0; \ +} while (0) + +static uint32_t +ipv6_flow_lookup_hash_internal( + struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, + uint32_t *key, uint16_t flags) +{ + uint16_t sport, dport; + uint8_t proto; + int offset = 0; + + if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0)) + return (0); + + proto = flags_to_proto(flags); + zero_key(key); + sport = dport = 0; + if (dsin6 != NULL) { + memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr)); + dport = dsin6->sin6_port; + } + if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) { + memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr)); + sport = ssin6->sin6_port; + } + if (flags & FL_HASH_ALL) { + ((uint16_t *)key)[0] = sport; + ((uint16_t *)key)[1] = dport; + } else + offset = V_flow_hashjitter + proto; + + return (jenkins_hashword(key, 9, offset)); +} + +static struct flentry * +flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m) +{ + struct sockaddr_storage ssa, dsa; + struct sockaddr_in6 *dsin6, *ssin6; + uint16_t flags; + + dsin6 = (struct sockaddr_in6 *)&dsa; + ssin6 = (struct sockaddr_in6 *)&ssa; + flags = ft->ft_flags; + + if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0) + return (NULL); + + return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags)); +} + +void +flow_to_route_in6(struct flentry *fle, struct route_in6 *ro) +{ + uint32_t *hashkey = NULL; + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&ro->ro_dst; + + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; + memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr)); + ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt); + ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle); + +} +#endif /* INET6 */ + static bitstr_t * flowtable_mask(struct flowtable *ft) { @@ -511,22 +905,78 @@ flowtable_set_hashkey(struct flentry *fl hashkey[i] = key[i]; } +static struct flentry * +flow_alloc(struct flowtable *ft) +{ + struct flentry *newfle; + uma_zone_t zone; + + newfle = NULL; + zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone; + + newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO); + if (newfle != NULL) + atomic_add_int(&ft->ft_count, 1); + return (newfle); +} + +static void +flow_free(struct flentry *fle, struct flowtable *ft) +{ + uma_zone_t zone; + + zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone; + atomic_add_int(&ft->ft_count, -1); + uma_zfree(zone, fle); +} + +static int +flow_full(struct flowtable *ft) +{ + boolean_t full; + uint32_t count; + + full = ft->ft_full; + count = ft->ft_count; + + if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3)))) + ft->ft_full = FALSE; + else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5)))) + ft->ft_full = TRUE; + + if (full && !ft->ft_full) { + flowclean_freq = 4*hz; + if ((ft->ft_flags & FL_HASH_ALL) == 0) + ft->ft_udp_idle = ft->ft_fin_wait_idle = + ft->ft_syn_idle = ft->ft_tcp_idle = 5; + cv_broadcast(&flowclean_cv); + } else if (!full && ft->ft_full) { + flowclean_freq = 20*hz; + if ((ft->ft_flags & FL_HASH_ALL) == 0) + ft->ft_udp_idle = ft->ft_fin_wait_idle = + ft->ft_syn_idle = ft->ft_tcp_idle = 30; + } + + return (ft->ft_full); +} + static int flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, - uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags) + uint32_t fibnum, struct route *ro, uint16_t flags) { struct flentry *fle, *fletail, *newfle, **flep; + struct flowtable_stats *fs = &ft->ft_stats[curcpu]; int depth; - uma_zone_t flezone; bitstr_t *mask; + uint8_t proto; - flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone; - newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO); + newfle = flow_alloc(ft); if (newfle == NULL) return (ENOMEM); newfle->f_flags |= (flags & FL_IPV6); - + proto = flags_to_proto(flags); + FL_ENTRY_LOCK(ft, hash); mask = flowtable_mask(ft); flep = flowtable_entry(ft, hash); @@ -539,7 +989,7 @@ flowtable_insert(struct flowtable *ft, u } depth = 0; - V_flowtable_collisions++; + fs->ft_collisions++; /* * find end of list and make sure that we were not * preempted by another thread handling this flow @@ -551,8 +1001,10 @@ flowtable_insert(struct flowtable *ft, u * or we lost a race to insert */ FL_ENTRY_UNLOCK(ft, hash); - uma_zfree((newfle->f_flags & FL_IPV6) ? - V_flow_ipv6_zone : V_flow_ipv4_zone, newfle); + flow_free(newfle, ft); + + if (flags & FL_OVERWRITE) + goto skip; return (EEXIST); } /* @@ -565,8 +1017,8 @@ flowtable_insert(struct flowtable *ft, u fle = fle->f_next; } - if (depth > V_flowtable_max_depth) - V_flowtable_max_depth = depth; + if (depth > fs->ft_max_depth) + fs->ft_max_depth = depth; fletail->f_next = newfle; fle = newfle; skip: @@ -582,6 +1034,35 @@ skip: return (0); } +int +kern_flowtable_insert(struct flowtable *ft, + struct sockaddr_storage *ssa, struct sockaddr_storage *dsa, + struct route *ro, uint32_t fibnum, int flags) +{ + uint32_t key[9], hash; + + flags = (ft->ft_flags | flags | FL_OVERWRITE); + hash = 0; + +#ifdef INET + if (ssa->ss_family == AF_INET) + hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa, + (struct sockaddr_in *)dsa, key, flags); +#endif +#ifdef INET6 + if (ssa->ss_family == AF_INET6) + hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa, + (struct sockaddr_in6 *)dsa, key, flags); +#endif + if (ro->ro_rt == NULL || ro->ro_lle == NULL) + return (EINVAL); + + FLDPRINTF(ft, FL_DEBUG, + "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n", + key[0], key[1], key[2], hash, fibnum, flags); + return (flowtable_insert(ft, hash, key, fibnum, ro, flags)); +} + static int flowtable_key_equal(struct flentry *fle, uint32_t *key) { @@ -595,7 +1076,7 @@ flowtable_key_equal(struct flentry *fle, nwords = 3; hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; } - + for (i = 0; i < nwords; i++) if (hashkey[i] != key[i]) return (0); @@ -603,44 +1084,86 @@ flowtable_key_equal(struct flentry *fle, return (1); } -int -flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro, uint32_t fibnum) +struct flentry * +flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af) +{ + struct flentry *fle = NULL; + +#ifdef INET + if (af == AF_INET) + fle = flowtable_lookup_mbuf4(ft, m); +#endif +#ifdef INET6 + if (af == AF_INET6) + fle = flowtable_lookup_mbuf6(ft, m); +#endif + if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) { + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = fle->f_fhash; + } + return (fle); +} + +struct flentry * +flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa, + struct sockaddr_storage *dsa, uint32_t fibnum, int flags) { uint32_t key[9], hash; struct flentry *fle; - uint16_t flags; + struct flowtable_stats *fs = &ft->ft_stats[curcpu]; uint8_t proto = 0; int error = 0; struct rtentry *rt; struct llentry *lle; + struct route sro, *ro; + struct route_in6 sro6; - flags = ft->ft_flags; - ro->ro_rt = NULL; - ro->ro_lle = NULL; + sro.ro_rt = sro6.ro_rt = NULL; + sro.ro_lle = sro6.ro_lle = NULL; + ro = NULL; + hash = 0; + flags |= ft->ft_flags; + proto = flags_to_proto(flags); +#ifdef INET + if (ssa->ss_family == AF_INET) { + struct sockaddr_in *ssin, *dsin; + + ro = &sro; + memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in)); + dsin = (struct sockaddr_in *)dsa; + ssin = (struct sockaddr_in *)ssa; + if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) || + (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || + (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) + return (NULL); - /* - * The internal hash lookup is the only IPv4 specific bit - * remaining - * - * XXX BZ: to add IPv6 support just add a check for the - * address type in m and ro and an equivalent ipv6 lookup - * function - the rest of the code should automatically - * handle an ipv6 flow (note that m can be NULL in which - * case ro will be set) - */ - hash = ipv4_flow_lookup_hash_internal(m, ro, key, - &flags, &proto); + hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags); + } +#endif +#ifdef INET6 + if (ssa->ss_family == AF_INET6) { + struct sockaddr_in6 *ssin6, *dsin6; + + ro = (struct route *)&sro6; + memcpy(&sro6.ro_dst, dsa, + sizeof(struct sockaddr_in6)); + dsin6 = (struct sockaddr_in6 *)dsa; + ssin6 = (struct sockaddr_in6 *)ssa; + flags |= FL_IPV6; + hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags); + } +#endif /* * Ports are zero and this isn't a transmit cache * - thus not a protocol for which we need to keep * state - * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP + * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP */ - if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS))) - return (ENOENT); + if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL))) + return (NULL); - V_flowtable_lookups++; + fs->ft_lookups++; FL_ENTRY_LOCK(ft, hash); if ((fle = FL_ENTRY(ft, hash)) == NULL) { FL_ENTRY_UNLOCK(ft, hash); @@ -656,21 +1179,21 @@ keycheck: && (fibnum == fle->f_fibnum) && (rt->rt_flags & RTF_UP) && (rt->rt_ifp != NULL)) { - V_flowtable_hits++; + fs->ft_hits++; fle->f_uptime = time_uptime; fle->f_flags |= flags; - ro->ro_rt = rt; - ro->ro_lle = lle; *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***