Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 17 Feb 2014 11:50:56 +0000 (UTC)
From:      Gleb Smirnoff <glebius@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r262027 - in head/sys: conf net netinet netinet6
Message-ID:  <201402171150.s1HBou3Q018877@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: glebius
Date: Mon Feb 17 11:50:56 2014
New Revision: 262027
URL: http://svnweb.freebsd.org/changeset/base/262027

Log:
  o Remove at compile time the HASH_ALL code, which was never
    tested and is unfinished. However, I've tested my version,
    and it works okay. As before, it is unfinished: timeouts
    aren't driven by TCP session state. To enable the HASH_ALL
    mode, one needs the following in the kernel config:
  
  	options FLOWTABLE_HASH_ALL
  
  o Reduce the alignment on flentry to 64 bytes. Without
    the FLOWTABLE_HASH_ALL option, flows would consume
    half as much memory.
  o The API to ip_output()/ip6_output() got even thinner: a one-liner.
  o Remove unused unions. Simply use fle->f_key[].
  o Merge all IPv4 code into flowtable_lookup_ipv4(), and do the same
    for IPv6 in flowtable_lookup_ipv6(). Stop copying data to on-stack
    sockaddr structures; simply use key[] on the stack.
  o Move code from flowtable_lookup_common() that actually works
    on insertion into flowtable_insert().
  
  Sponsored by:	Netflix
  Sponsored by:	Nginx, Inc.

Modified:
  head/sys/conf/options
  head/sys/net/flowtable.c
  head/sys/net/flowtable.h
  head/sys/netinet/ip_output.c
  head/sys/netinet6/ip6_output.c

Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options	Mon Feb 17 11:44:58 2014	(r262026)
+++ head/sys/conf/options	Mon Feb 17 11:50:56 2014	(r262027)
@@ -440,6 +440,7 @@ TCP_SIGNATURE		opt_inet.h
 VLAN_ARRAY		opt_vlan.h
 XBONEHACK
 FLOWTABLE		opt_route.h
+FLOWTABLE_HASH_ALL	opt_route.h
 
 #
 # SCTP

Modified: head/sys/net/flowtable.c
==============================================================================
--- head/sys/net/flowtable.c	Mon Feb 17 11:44:58 2014	(r262026)
+++ head/sys/net/flowtable.c	Mon Feb 17 11:50:56 2014	(r262027)
@@ -73,91 +73,53 @@ __FBSDID("$FreeBSD$");
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
+#ifdef FLOWTABLE_HASH_ALL
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet/sctp.h>
+#endif
 
 #include <ddb/ddb.h>
 
-#ifdef INET
-struct ipv4_tuple {
-	uint16_t 	ip_sport;	/* source port */
-	uint16_t 	ip_dport;	/* destination port */
-	in_addr_t 	ip_saddr;	/* source address */
-	in_addr_t 	ip_daddr;	/* destination address */
-};
-
-union ipv4_flow {
-	struct ipv4_tuple ipf_ipt;
-	uint32_t 	ipf_key[3];
-};
+#ifdef	FLOWTABLE_HASH_ALL
+#define	KEY_PORTS	(sizeof(uint16_t) * 2)
+#define	KEY_ADDRS	2
+#else
+#define	KEY_PORTS	0
+#define	KEY_ADDRS	1
 #endif
 
-#ifdef INET6
-struct ipv6_tuple {
-	uint16_t 	ip_sport;	/* source port */
-	uint16_t 	ip_dport;	/* destination port */
-	struct in6_addr	ip_saddr;	/* source address */
-	struct in6_addr	ip_daddr;	/* destination address */
-};
-
-union ipv6_flow {
-	struct ipv6_tuple ipf_ipt;
-	uint32_t 	ipf_key[9];
-};
+#ifdef	INET6
+#define	KEY_ADDR_LEN	sizeof(struct in6_addr)
+#else
+#define	KEY_ADDR_LEN	sizeof(struct in_addr)
 #endif
 
+#define	KEYLEN	((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))
+
 struct flentry {
-	uint32_t		f_fhash;	/* hash flowing forward */
-	uint16_t		f_flags;	/* flow flags */
-	uint8_t			f_pad;
-	uint8_t			f_proto;	/* protocol */
-	uint32_t		f_fibnum;	/* fib index */
+	uint32_t		f_hash;		/* hash flowing forward */
+	uint32_t		f_key[KEYLEN];	/* address(es and ports) */
 	uint32_t		f_uptime;	/* uptime at last access */
+	uint16_t		f_fibnum;	/* fib index */
+#ifdef FLOWTABLE_HASH_ALL
+	uint8_t			f_proto;	/* protocol */
+	uint8_t			f_flags;	/* stale? */
+#define FL_STALE 		1
+#endif
 	SLIST_ENTRY(flentry)	f_next;		/* pointer to collision entry */
 	struct rtentry		*f_rt;		/* rtentry for flow */
 	struct llentry		*f_lle;		/* llentry for flow */
-	union {
-#ifdef INET
-		union ipv4_flow	v4;
-#endif
-#ifdef INET6
-		union ipv6_flow	v6;
-#endif
-	} f_flow;
-#define	f_flow4	f_flow.v4
-#define	f_flow6	f_flow.v6
 };
-#define	KEYLEN(flags)	((((flags) & FL_IPV6) ? 9 : 3) * 4)
-
-/* Make sure f_flow begins with key. */
-#ifdef INET
-CTASSERT(offsetof(struct flentry, f_flow) ==
-    offsetof(struct flentry, f_flow4.ipf_key));
-#endif
-#ifdef INET6
-CTASSERT(offsetof(struct flentry, f_flow) ==
-    offsetof(struct flentry, f_flow6.ipf_key));
-#endif
+#undef KEYLEN
 
 SLIST_HEAD(flist, flentry);
 /* Make sure we can use pcpu_zone_ptr for struct flist. */
 CTASSERT(sizeof(struct flist) == sizeof(void *));
 
-#define	SECS_PER_HOUR		3600
-#define	SECS_PER_DAY		(24*SECS_PER_HOUR)
-
-#define	SYN_IDLE		300
-#define	UDP_IDLE		300
-#define	FIN_WAIT_IDLE		600
-#define	TCP_IDLE		SECS_PER_DAY
-
 struct flowtable {
 	counter_u64_t	*ft_stat;
 	int 		ft_size;
-	uint32_t	ft_flags;
-	uint32_t	ft_max_depth;
-
 	/*
 	 * ft_table is a malloc(9)ed array of pointers.  Pointers point to
 	 * memory from UMA_ZONE_PCPU zone.
@@ -167,12 +129,6 @@ struct flowtable {
 	struct flist	**ft_table;
 	bitstr_t 	**ft_masks;
 	bitstr_t	*ft_tmpmask;
-
-	uint32_t	ft_udp_idle;
-	uint32_t	ft_fin_wait_idle;
-	uint32_t	ft_syn_idle;
-	uint32_t	ft_tcp_idle;
-	boolean_t	ft_full;
 };
 
 #define	FLOWSTAT_ADD(ft, name, v)	\
@@ -186,7 +142,6 @@ static struct cv 	flowclean_f_cv;
 static struct cv 	flowclean_c_cv;
 static struct mtx	flowclean_lock;
 static uint32_t		flowclean_cycles;
-static uint32_t		flowclean_freq;
 
 /*
  * TODO:
@@ -213,16 +168,7 @@ static VNET_DEFINE(struct flowtable, ip6
 static uma_zone_t flow_zone;
 
 static VNET_DEFINE(int, flowtable_enable) = 1;
-static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
-static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
-static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
-static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
-
 #define	V_flowtable_enable		VNET(flowtable_enable)
-#define	V_flowtable_syn_expire		VNET(flowtable_syn_expire)
-#define	V_flowtable_udp_expire		VNET(flowtable_udp_expire)
-#define	V_flowtable_fin_wait_expire	VNET(flowtable_fin_wait_expire)
-#define	V_flowtable_tcp_expire		VNET(flowtable_tcp_expire)
 
 static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
     "flowtable");
@@ -231,197 +177,96 @@ SYSCTL_VNET_INT(_net_flowtable, OID_AUTO
 SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
     &flow_zone, "Maximum number of flows allowed");
 
-/*
- * XXX This does not end up updating timeouts at runtime
- * and only reflects the value for the last table added :-/
- */
-SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
-    &VNET_NAME(flowtable_syn_expire), 0,
-    "seconds after which to remove syn allocated flow.");
-SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
-    &VNET_NAME(flowtable_udp_expire), 0,
-    "seconds after which to remove flow allocated to UDP.");
-SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
-    &VNET_NAME(flowtable_fin_wait_expire), 0,
-    "seconds after which to remove a flow in FIN_WAIT.");
-SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
-    &VNET_NAME(flowtable_tcp_expire), 0,
-    "seconds after which to remove flow allocated to a TCP connection.");
-
-#define FL_STALE 	(1<<8)
-
 static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");
 
-static struct flentry *flowtable_lookup_common(struct flowtable *,
-    struct sockaddr_storage *, struct sockaddr_storage *, struct mbuf *, int);
-
-static __inline int
-proto_to_flags(uint8_t proto)
-{
-	int flag;
-
-	switch (proto) {
-	case IPPROTO_TCP:
-		flag = FL_TCP;
-		break;
-	case IPPROTO_SCTP:
-		flag = FL_SCTP;
-		break;
-	case IPPROTO_UDP:
-		flag = FL_UDP;
-		break;
-	default:
-		flag = 0;
-		break;
-	}
-
-	return (flag);
-}
-
-static __inline int
-flags_to_proto(int flags)
-{
-	int proto, protoflags;
-
-	protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
-	switch (protoflags) {
-	case FL_TCP:
-		proto = IPPROTO_TCP;
-		break;
-	case FL_SCTP:
-		proto = IPPROTO_SCTP;
-		break;
-	case FL_UDP:
-		proto = IPPROTO_UDP;
-		break;
-	default:
-		proto = 0;
-		break;
-	}
-	return (proto);
-}
+static struct flentry *
+flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
 
 #ifdef INET
-static int
-ipv4_mbuf_demarshal(struct mbuf *m, struct sockaddr_in *ssin,
-    struct sockaddr_in *dsin, uint16_t *flags)
+static struct flentry *
+flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
 {
+	struct flentry *fle;
+	struct sockaddr_in *sin;
 	struct ip *ip;
-	uint8_t proto;
+	uint32_t fibnum;
+#ifdef FLOWTABLE_HASH_ALL
+	uint32_t key[3];
 	int iphlen;
-	struct tcphdr *th;
-	struct udphdr *uh;
-	struct sctphdr *sh;
 	uint16_t sport, dport;
+	uint8_t proto;
+#endif
 
-	proto = sport = dport = 0;
 	ip = mtod(m, struct ip *);
-	dsin->sin_family = AF_INET;
-	dsin->sin_len = sizeof(*dsin);
-	dsin->sin_addr = ip->ip_dst;
-	ssin->sin_family = AF_INET;
-	ssin->sin_len = sizeof(*ssin);
-	ssin->sin_addr = ip->ip_src;
 
-	proto = ip->ip_p;
-	if ((*flags & FL_HASH_ALL) == 0)
-		goto skipports;
+	if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
+	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
+		return (NULL);
+
+	fibnum = M_GETFIB(m);
 
-	iphlen = ip->ip_hl << 2; /* XXX options? */
+#ifdef FLOWTABLE_HASH_ALL
+	iphlen = ip->ip_hl << 2;
+	proto = ip->ip_p;
 
 	switch (proto) {
-	case IPPROTO_TCP:
-		th = (struct tcphdr *)((caddr_t)ip + iphlen);
+	case IPPROTO_TCP: {
+		struct tcphdr *th;
+
+		th = (struct tcphdr *)((char *)ip + iphlen);
 		sport = th->th_sport;
 		dport = th->th_dport;
-		if ((*flags & FL_HASH_ALL) &&
-		    (th->th_flags & (TH_RST|TH_FIN)))
-			*flags |= FL_STALE;
+		if (th->th_flags & (TH_RST|TH_FIN))
+			fibnum |= (FL_STALE << 24);
 		break;
-	case IPPROTO_UDP:
-		uh = (struct udphdr *)((caddr_t)ip + iphlen);
+	}
+	case IPPROTO_UDP: {
+		struct udphdr *uh;
+
+		uh = (struct udphdr *)((char *)ip + iphlen);
 		sport = uh->uh_sport;
 		dport = uh->uh_dport;
 		break;
-	case IPPROTO_SCTP:
-		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
+	}
+	case IPPROTO_SCTP: {
+		struct sctphdr *sh;
+
+		sh = (struct sctphdr *)((char *)ip + iphlen);
 		sport = sh->src_port;
 		dport = sh->dest_port;
+		/* XXXGL: handle stale? */
 		break;
+	}
 	default:
-		return (ENOTSUP);
-		/* no port - hence not a protocol we care about */
+		sport = dport = 0;
 		break;
-
 	}
 
-skipports:
-	*flags |= proto_to_flags(proto);
-	ssin->sin_port = sport;
-	dsin->sin_port = dport;
-	return (0);
-}
-
-static uint32_t
-ipv4_flow_lookup_hash(
-	struct sockaddr_in *ssin, struct sockaddr_in *dsin,
-	    uint32_t *key, uint16_t flags)
-{
-	uint16_t sport, dport;
-	uint8_t proto;
-	int offset = 0;
+	key[0] = ip->ip_dst.s_addr;
+	key[1] = ip->ip_src.s_addr;
+	key[2] = (dport << 16) | sport;
+	fibnum |= proto << 16;
 
-	proto = flags_to_proto(flags);
-	sport = dport = key[2] = key[1] = key[0] = 0;
-	if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
-		key[1] = ssin->sin_addr.s_addr;
-		sport = ssin->sin_port;
-	}
-	if (dsin != NULL) {
-		key[2] = dsin->sin_addr.s_addr;
-		dport = dsin->sin_port;
-	}
-	if (flags & FL_HASH_ALL) {
-		((uint16_t *)key)[0] = sport;
-		((uint16_t *)key)[1] = dport;
-	} else
-		offset = flow_hashjitter + proto;
+	fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
+	    fibnum);
 
-	return (jenkins_hash32(key, 3, offset));
-}
+#else	/* !FLOWTABLE_HASH_ALL */
 
-static struct flentry *
-flowtable_lookup_ipv4(struct mbuf *m)
-{
-	struct sockaddr_storage ssa, dsa;
-	uint16_t flags;
-	struct sockaddr_in *dsin, *ssin;
-
-	dsin = (struct sockaddr_in *)&dsa;
-	ssin = (struct sockaddr_in *)&ssa;
-	bzero(dsin, sizeof(*dsin));
-	bzero(ssin, sizeof(*ssin));
-	flags = V_ip4_ft.ft_flags;
-	if (ipv4_mbuf_demarshal(m, ssin, dsin, &flags) != 0)
-		return (NULL);
+	fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
+	    sizeof(struct in_addr), fibnum);
 
-	return (flowtable_lookup_common(&V_ip4_ft, &ssa, &dsa, m, flags));
-}
+#endif	/* FLOWTABLE_HASH_ALL */
 
-void
-flow_to_route(struct flentry *fle, struct route *ro)
-{
-	uint32_t *hashkey = NULL;
-	struct sockaddr_in *sin;
+	if (fle == NULL)
+		return (NULL);
 
 	sin = (struct sockaddr_in *)&ro->ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
-	hashkey = fle->f_flow4.ipf_key;
-	sin->sin_addr.s_addr = hashkey[2];
-	ro->ro_rt = fle->f_rt;
-	ro->ro_lle = fle->f_lle;
-	ro->ro_flags |= RT_NORTREF;
+	sin->sin_addr = ip->ip_dst;
+
+	return (fle);
 }
 #endif /* INET */
 
@@ -435,9 +280,8 @@ flow_to_route(struct flentry *fle, struc
 #define PULLUP_TO(_len, p, T)						\
 do {									\
 	int x = (_len) + sizeof(T);					\
-	if ((m)->m_len < x) {						\
-		goto receive_failed;					\
-	}								\
+	if ((m)->m_len < x)						\
+		return (NULL);						\
 	p = (mtod(m, char *) + (_len));					\
 } while (0)
 
@@ -445,26 +289,35 @@ do {									\
 #define	SCTP(p)		((struct sctphdr *)(p))
 #define	UDP(p)		((struct udphdr *)(p))
 
-static int
-ipv6_mbuf_demarshal(struct mbuf *m, struct sockaddr_in6 *ssin6,
-    struct sockaddr_in6 *dsin6, uint16_t *flags)
+static struct flentry *
+flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
 {
+	struct flentry *fle;
+	struct sockaddr_in6 *sin6;
 	struct ip6_hdr *ip6;
-	uint8_t proto;
+	uint32_t fibnum;
+#ifdef FLOWTABLE_HASH_ALL
+	uint32_t key[9];
+	void *ulp;
 	int hlen;
-	uint16_t src_port, dst_port;
+	uint16_t sport, dport;
 	u_short offset;
-	void *ulp;
+	uint8_t proto;
+#else
+	uint32_t key[4];
+#endif
 
-	offset = hlen = src_port = dst_port = 0;
-	ulp = NULL;
 	ip6 = mtod(m, struct ip6_hdr *);
-	hlen = sizeof(struct ip6_hdr);
-	proto = ip6->ip6_nxt;
+	if (in6_localaddr(&ip6->ip6_dst))
+		return (NULL);
 
-	if ((*flags & FL_HASH_ALL) == 0)
-		goto skipports;
+	fibnum = M_GETFIB(m);
 
+#ifdef	FLOWTABLE_HASH_ALL
+	hlen = sizeof(struct ip6_hdr);
+	proto = ip6->ip6_nxt;
+	offset = sport = dport = 0;
+	ulp = NULL;
 	while (ulp == NULL) {
 		switch (proto) {
 		case IPPROTO_ICMPV6:
@@ -477,21 +330,21 @@ ipv6_mbuf_demarshal(struct mbuf *m, stru
 			break;
 		case IPPROTO_TCP:
 			PULLUP_TO(hlen, ulp, struct tcphdr);
-			dst_port = TCP(ulp)->th_dport;
-			src_port = TCP(ulp)->th_sport;
-			if ((*flags & FL_HASH_ALL) &&
-			    (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
-				*flags |= FL_STALE;
+			dport = TCP(ulp)->th_dport;
+			sport = TCP(ulp)->th_sport;
+			if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
+				fibnum |= (FL_STALE << 24);
 			break;
 		case IPPROTO_SCTP:
 			PULLUP_TO(hlen, ulp, struct sctphdr);
-			src_port = SCTP(ulp)->src_port;
-			dst_port = SCTP(ulp)->dest_port;
+			dport = SCTP(ulp)->src_port;
+			sport = SCTP(ulp)->dest_port;
+			/* XXXGL: handle stale? */
 			break;
 		case IPPROTO_UDP:
 			PULLUP_TO(hlen, ulp, struct udphdr);
-			dst_port = UDP(ulp)->uh_dport;
-			src_port = UDP(ulp)->uh_sport;
+			dport = UDP(ulp)->uh_dport;
+			sport = UDP(ulp)->uh_sport;
 			break;
 		case IPPROTO_HOPOPTS:	/* RFC 2460 */
 			PULLUP_TO(hlen, ulp, struct ip6_hbh);
@@ -531,102 +384,28 @@ ipv6_mbuf_demarshal(struct mbuf *m, stru
 		}
 	}
 
-	if (src_port == 0) {
-	receive_failed:
-		return (ENOTSUP);
-	}
-
-skipports:
-	dsin6->sin6_family = AF_INET6;
-	dsin6->sin6_len = sizeof(*dsin6);
-	dsin6->sin6_port = dst_port;
-	memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
-
-	ssin6->sin6_family = AF_INET6;
-	ssin6->sin6_len = sizeof(*ssin6);
-	ssin6->sin6_port = src_port;
-	memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
-	*flags |= proto_to_flags(proto);
-
-	return (0);
-}
-
-#define zero_key(key) 		\
-do {				\
-	key[0] = 0;		\
-	key[1] = 0;		\
-	key[2] = 0;		\
-	key[3] = 0;		\
-	key[4] = 0;		\
-	key[5] = 0;		\
-	key[6] = 0;		\
-	key[7] = 0;		\
-	key[8] = 0;		\
-} while (0)
-
-static uint32_t
-ipv6_flow_lookup_hash(
-	struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
-	    uint32_t *key, uint16_t flags)
-{
-	uint16_t sport, dport;
-	uint8_t proto;
-	int offset = 0;
-
-	proto = flags_to_proto(flags);
-	zero_key(key);
-	sport = dport = 0;
-	if (dsin6 != NULL) {
-		memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
-		dport = dsin6->sin6_port;
-	}
-	if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
-		memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
-		sport = ssin6->sin6_port;
-	}
-	if (flags & FL_HASH_ALL) {
-		((uint16_t *)key)[0] = sport;
-		((uint16_t *)key)[1] = dport;
-	} else
-		offset = flow_hashjitter + proto;
-
-	return (jenkins_hash32(key, 9, offset));
-}
-
-static struct flentry *
-flowtable_lookup_ipv6(struct mbuf *m)
-{
-	struct sockaddr_storage ssa, dsa;
-	struct sockaddr_in6 *dsin6, *ssin6;
-	uint16_t flags;
-
-	dsin6 = (struct sockaddr_in6 *)&dsa;
-	ssin6 = (struct sockaddr_in6 *)&ssa;
-	bzero(dsin6, sizeof(*dsin6));
-	bzero(ssin6, sizeof(*ssin6));
-	flags = V_ip6_ft.ft_flags;
+	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
+	bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
+	key[8] = (dport << 16) | sport;
+	fibnum |= proto << 16;
+
+	fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
+	    fibnum);
+#else	/* !FLOWTABLE_HASH_ALL */
+	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
+	fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
+	    fibnum);
+#endif	/* FLOWTABLE_HASH_ALL */
 
-	if (ipv6_mbuf_demarshal(m, ssin6, dsin6, &flags) != 0)
+	if (fle == NULL)
 		return (NULL);
 
-	return (flowtable_lookup_common(&V_ip6_ft, &ssa, &dsa, m, flags));
-}
-
-void
-flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
-{
-	uint32_t *hashkey = NULL;
-	struct sockaddr_in6 *sin6;
-
 	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
-
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(*sin6);
-	hashkey = fle->f_flow6.ipf_key;
-	memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
-	ro->ro_rt = fle->f_rt;
-	ro->ro_lle = fle->f_lle;
-	ro->ro_flags |= RT_NORTREF;
+	bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));
+
+	return (fle);
 }
 #endif /* INET6 */
 
@@ -654,75 +433,57 @@ flowtable_list(struct flowtable *ft, uin
 }
 
 static int
-flow_stale(struct flowtable *ft, struct flentry *fle)
+flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
 {
-	time_t idle_time;
 
-	if ((fle->f_fhash == 0)
-	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
-		((fle->f_rt->rt_flags & (RTF_UP))
-		    != (RTF_UP)))
-	    || (fle->f_rt->rt_ifp == NULL)
-	    || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
+	if (((fle->f_rt->rt_flags & RTF_HOST) &&
+	    ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) ||
+	    (fle->f_rt->rt_ifp == NULL) ||
+	    !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
+	    (fle->f_lle->la_flags & LLE_VALID) == 0)
 		return (1);
 
-	idle_time = time_uptime - fle->f_uptime;
+	if (time_uptime - fle->f_uptime > maxidle)
+		return (1);
 
-	if ((fle->f_flags & FL_STALE) ||
-	    ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
-		&& (idle_time > ft->ft_udp_idle)) ||
-	    ((fle->f_flags & TH_FIN)
-		&& (idle_time > ft->ft_fin_wait_idle)) ||
-	    ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
-		&& (idle_time > ft->ft_syn_idle)) ||
-	    ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
-		&& (idle_time > ft->ft_tcp_idle)) ||
-	    ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
-		(fle->f_rt->rt_ifp == NULL)))
+#ifdef FLOWTABLE_HASH_ALL
+	if (fle->f_flags & FL_STALE)
 		return (1);
+#endif
 
 	return (0);
 }
 
 static int
-flow_full(struct flowtable *ft)
+flow_full(void)
 {
-	boolean_t full;
 	int count, max;
 
-	full = ft->ft_full;
 	count = uma_zone_get_cur(flow_zone);
 	max = uma_zone_get_max(flow_zone);
 
-	if (full && (count < (max - (max >> 3))))
-		ft->ft_full = FALSE;
-	else if (!full && (count > (max - (max >> 5))))
-		ft->ft_full = TRUE;
-
-	if (full && !ft->ft_full) {
-		flowclean_freq = 4*hz;
-		if ((ft->ft_flags & FL_HASH_ALL) == 0)
-			ft->ft_udp_idle = ft->ft_fin_wait_idle =
-			    ft->ft_syn_idle = ft->ft_tcp_idle = 5;
-		cv_broadcast(&flowclean_c_cv);
-	} else if (!full && ft->ft_full) {
-		flowclean_freq = 20*hz;
-		if ((ft->ft_flags & FL_HASH_ALL) == 0)
-			ft->ft_udp_idle = ft->ft_fin_wait_idle =
-			    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
-	}
-
-	return (ft->ft_full);
+	return (count > (max - (max >> 3)));
 }
 
 static int
-flow_matches(struct flentry *fle, uint32_t hash, uint32_t *key, uint8_t
-   proto, uint32_t fibnum)
+flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
 {
+#ifdef FLOWTABLE_HASH_ALL
+	uint8_t proto;
+
+	proto = (fibnum >> 16) & 0xff;
+	fibnum &= 0xffff;
+#endif
+
+	CRITICAL_ASSERT(curthread);
 
-	if (fle->f_fhash == hash &&
-	    bcmp(&fle->f_flow, key, KEYLEN(fle->f_flags)) == 0 &&
-	    proto == fle->f_proto && fibnum == fle->f_fibnum &&
+	/* Microoptimization for IPv4: don't use bcmp(). */
+	if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) ||
+	    (bcmp(fle->f_key, key, keylen) == 0)) &&
+	    fibnum == fle->f_fibnum &&
+#ifdef FLOWTABLE_HASH_ALL
+	    proto == fle->f_proto &&
+#endif
 	    (fle->f_rt->rt_flags & RTF_UP) &&
 	    fle->f_rt->rt_ifp != NULL &&
 	    (fle->f_lle->la_flags & LLE_VALID))
@@ -733,27 +494,131 @@ flow_matches(struct flentry *fle, uint32
 
 static struct flentry *
 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
-    uint32_t fibnum, struct route *ro, uint16_t flags)
+    int keylen, uint32_t fibnum0)
 {
+#ifdef INET6
+        struct route_in6 sro6;
+#endif
+#ifdef INET
+        struct route sro;
+#endif
+	struct route *ro = NULL;
+	struct rtentry *rt;
+	struct lltable *lt = NULL;
+	struct llentry *lle;
+	struct sockaddr_storage *l3addr;
+	struct ifnet *ifp;
 	struct flist *flist;
 	struct flentry *fle, *iter;
 	bitstr_t *mask;
-	int depth;
+	uint16_t fibnum = fibnum0;
+#ifdef FLOWTABLE_HASH_ALL
 	uint8_t proto;
 
+	proto = (fibnum0 >> 16) & 0xff;
+	fibnum = fibnum0 & 0xffff;
+#endif
+
+	/*
+	 * This bit of code ends up locking the
+	 * same route 3 times (just like ip_output + ether_output)
+	 * - at lookup
+	 * - in rt_check when called by arpresolve
+	 * - dropping the refcount for the rtentry
+	 *
+	 * This could be consolidated to one if we wrote a variant
+	 * of arpresolve with an rt_check variant that expected to
+	 * receive the route locked
+	 */
+#ifdef INET
+	if (ft == &V_ip4_ft) {
+		struct sockaddr_in *sin;
+
+		ro = &sro;
+		bzero(&sro.ro_dst, sizeof(sro.ro_dst));
+
+		sin = (struct sockaddr_in *)&sro.ro_dst;
+		sin->sin_family = AF_INET;
+		sin->sin_len = sizeof(*sin);
+		sin->sin_addr.s_addr = key[0];
+	}
+#endif
+#ifdef INET6
+	if (ft == &V_ip6_ft) {
+		struct sockaddr_in6 *sin6;
+
+		ro = (struct route *)&sro6;
+		sin6 = &sro6.ro_dst;
+
+		bzero(sin6, sizeof(*sin6));
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_len = sizeof(*sin6);
+		bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
+	}
+#endif
+
+	ro->ro_rt = NULL;
+#ifdef RADIX_MPATH
+	rtalloc_mpath_fib(ro, hash, fibnum);
+#else
+	rtalloc_ign_fib(ro, 0, fibnum);
+#endif
+	if (ro->ro_rt == NULL)
+		return (NULL);
+
+	rt = ro->ro_rt;
+	ifp = rt->rt_ifp;
+
+	if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
+		RTFREE(rt);
+		return (NULL);
+	}
+
+#ifdef INET
+	if (ft == &V_ip4_ft)
+		lt = LLTABLE(ifp);
+#endif
+#ifdef INET6
+	if (ft == &V_ip6_ft)
+		lt = LLTABLE6(ifp);
+#endif
+
+	if (rt->rt_flags & RTF_GATEWAY)
+		l3addr = (struct sockaddr_storage *)rt->rt_gateway;
+	else
+		l3addr = (struct sockaddr_storage *)&ro->ro_dst;
+	lle = llentry_alloc(ifp, lt, l3addr);
+
+	if (lle == NULL) {
+		RTFREE(rt);
+		return (NULL);
+	}
+
+	/* Don't insert the entry if the ARP hasn't yet finished resolving. */
+	if ((lle->la_flags & LLE_VALID) == 0) {
+		RTFREE(rt);
+		LLE_FREE(lle);
+		FLOWSTAT_INC(ft, ft_fail_lle_invalid);
+		return (NULL);
+	}
+
 	fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
-	if (fle == NULL)
+	if (fle == NULL) {
+		RTFREE(rt);
+		LLE_FREE(lle);
 		return (NULL);
+	}
 
-	proto = flags_to_proto(flags);
-	bcopy(key, &fle->f_flow, KEYLEN(flags));
-	fle->f_flags |= (flags & FL_IPV6);
-	fle->f_proto = proto;
-	fle->f_rt = ro->ro_rt;
-	fle->f_lle = ro->ro_lle;
-	fle->f_fhash = hash;
+	fle->f_hash = hash;
+	bcopy(key, &fle->f_key, keylen);
+	fle->f_rt = rt;
+	fle->f_lle = lle;
 	fle->f_fibnum = fibnum;
 	fle->f_uptime = time_uptime;
+#ifdef FLOWTABLE_HASH_ALL
+	fle->f_proto = proto;
+	fle->f_flags = fibnum0 >> 24;
+#endif
 
 	critical_enter();
 	mask = flowtable_mask(ft);
@@ -765,13 +630,13 @@ flowtable_insert(struct flowtable *ft, u
 		goto skip;
 	}
 
-	depth = 0;
 	/*
 	 * find end of list and make sure that we were not
 	 * preempted by another thread handling this flow
 	 */
 	SLIST_FOREACH(iter, flist, f_next) {
-		if (flow_matches(iter, hash, key, proto, fibnum)) {
+		KASSERT(iter->f_hash == hash, ("%s: wrong hash", __func__));
+		if (flow_matches(iter, key, keylen, fibnum)) {
 			/*
 			 * We probably migrated to an other CPU after
 			 * lookup in flowtable_lookup_common() failed.
@@ -779,18 +644,16 @@ flowtable_insert(struct flowtable *ft, u
 			 * entry.
 			 */
 			iter->f_uptime = time_uptime;
-			iter->f_flags |= flags;
+#ifdef FLOWTABLE_HASH_ALL
+			iter->f_flags |= fibnum >> 24;
+#endif
 			critical_exit();
 			FLOWSTAT_INC(ft, ft_collisions);
 			uma_zfree(flow_zone, fle);
 			return (iter);
 		}
-		depth++;
 	}
 
-	if (depth > ft->ft_max_depth)
-		ft->ft_max_depth = depth;
-
 	SLIST_INSERT_HEAD(flist, fle, f_next);
 skip:
 	critical_exit();
@@ -799,215 +662,75 @@ skip:
 	return (fle);
 }
 
-struct flentry *
-flowtable_lookup(sa_family_t sa, struct mbuf *m)
+int
+flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
 {
+	struct flentry *fle;
+
+	if (V_flowtable_enable == 0)
+		return (ENXIO);
 
 	switch (sa) {
 #ifdef INET
 	case AF_INET:
-		return (flowtable_lookup_ipv4(m));
+		fle = flowtable_lookup_ipv4(m, ro);
+		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
-		return (flowtable_lookup_ipv6(m));
+		fle = flowtable_lookup_ipv6(m, ro);
+		break;
 #endif
 	default:
 		panic("%s: sa %d", __func__, sa);
 	}
-}
 
-static struct flentry *
-flowtable_lookup_common(struct flowtable *ft, struct sockaddr_storage *ssa,
-    struct sockaddr_storage *dsa, struct mbuf *m, int flags)
-{
-	struct route_in6 sro6;
-	struct route sro, *ro;
-	struct flist *flist;
-	struct flentry *fle;
-	struct rtentry *rt;
-	struct llentry *lle;
-	struct sockaddr_storage *l3addr;
-	struct ifnet *ifp;
-	uint32_t key[9], hash, fibnum;
-	uint8_t proto;
-
-	if (V_flowtable_enable == 0)
-		return (NULL);
-
-	sro.ro_rt = sro6.ro_rt = NULL;
-	sro.ro_lle = sro6.ro_lle = NULL;
-	flags |= ft->ft_flags;
-	proto = flags_to_proto(flags);
-	fibnum = M_GETFIB(m);
-
-	switch (ssa->ss_family) {
-#ifdef INET
-	case AF_INET: {
-		struct sockaddr_in *ssin, *dsin;
-
-		KASSERT(dsa->ss_family == AF_INET,
-		    ("%s: dsa family %d\n", __func__, dsa->ss_family));
-
-		ro = &sro;
-		memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
-		/*
-		 * The harvested source and destination addresses
-		 * may contain port information if the packet is
-		 * from a transport protocol (e.g. TCP/UDP). The
-		 * port field must be cleared before performing
-		 * a route lookup.
-		 */
-		((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
-		dsin = (struct sockaddr_in *)dsa;
-		ssin = (struct sockaddr_in *)ssa;
-		if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
-		    (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
-		    (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
-			return (NULL);
+	if (fle == NULL)
+		return (EHOSTUNREACH);
 
-		hash = ipv4_flow_lookup_hash(ssin, dsin, key, flags);
-		break;
+	if (!(m->m_flags & M_FLOWID)) {
+		m->m_flags |= M_FLOWID;
+		m->m_pkthdr.flowid = fle->f_hash;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201402171150.s1HBou3Q018877>