Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 1 Jan 2009 22:11:44 +0000 (UTC)
From:      Robert Watson <rwatson@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r186686 - projects/pnet/sys/netinet
Message-ID:  <200901012211.n01MBist080219@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: rwatson
Date: Thu Jan  1 22:11:44 2009
New Revision: 186686
URL: http://svn.freebsd.org/changeset/base/186686

Log:
  Add IP SUBSET patch to pnet branch: the IP_SUBSET socket option allows
  identically bound UDP sockets to balance load between them using various
  strategies, including random assignment, flow-based assignment,
  CPU-based assignment, and kernel thread ID-based assignment.
  
  UDP applications, such as BIND, memcached, etc., can create multiple
  sockets, each with SO_REUSEPORT set, followed by specifying their index
  among a set of matching sockets all servicing the same port number.

Modified:
  projects/pnet/sys/netinet/in.h
  projects/pnet/sys/netinet/in_pcb.c
  projects/pnet/sys/netinet/in_pcb.h
  projects/pnet/sys/netinet/in_proto.c
  projects/pnet/sys/netinet/udp_usrreq.c
  projects/pnet/sys/netinet/udp_var.h

Modified: projects/pnet/sys/netinet/in.h
==============================================================================
--- projects/pnet/sys/netinet/in.h	Thu Jan  1 20:47:09 2009	(r186685)
+++ projects/pnet/sys/netinet/in.h	Thu Jan  1 22:11:44 2009	(r186686)
@@ -486,6 +486,21 @@ __END_DECLS
 #define	MCAST_BLOCK_SOURCE		84   /* block a source */
 #define	MCAST_UNBLOCK_SOURCE		85   /* unblock a source */
 
+/* Binding subsets. */
+#define	IP_SUBSET			86	/* get/set binding subset */
+
+struct ip_subset {
+	u_int	is_strategy;	/* one of IP_SUBSET_STRATEGY_* */
+	u_int	is_count;	/* size of the set (used as modulus) */
+	u_int	is_member;	/* this socket's index in the set */
+};
+
+#define	IP_SUBSET_STRATEGY_DISABLED	0
+#define	IP_SUBSET_STRATEGY_FLOW		1
+#define	IP_SUBSET_STRATEGY_RANDOM	2
+#define	IP_SUBSET_STRATEGY_THREADID	3
+#define	IP_SUBSET_STRATEGY_CPU		4
+
 /*
  * Defaults and limits for options
  */

Modified: projects/pnet/sys/netinet/in_pcb.c
==============================================================================
--- projects/pnet/sys/netinet/in_pcb.c	Thu Jan  1 20:47:09 2009	(r186685)
+++ projects/pnet/sys/netinet/in_pcb.c	Thu Jan  1 22:11:44 2009	(r186686)
@@ -204,6 +204,7 @@ in_pcballoc(struct socket *so, struct in
 	inp->inp_socket = so;
 	inp->inp_cred = crhold(so->so_cred);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
+	inp->inp_subset_strategy = IP_SUBSET_STRATEGY_DISABLED;
 #ifdef MAC
 	error = mac_inpcb_init(inp, M_NOWAIT);
 	if (error != 0)
@@ -1284,12 +1285,114 @@ in_pcblookup_local(struct inpcbinfo *pcb
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
 /*
+ * Implement various subsetting strategies: return 1 if the passed tuple is
+ * assigned to this inpcb under its configured strategy, 0 otherwise.  The
+ * caller must not pass an inpcb whose strategy is DISABLED (we panic).
+ */
+static int
+in_subset_match(struct inpcb *inp, struct in_addr faddr, u_short fport,
+    struct in_addr laddr, u_short lport, u_short ip_id, u_int32_t flowid)
+{
+
+	switch (inp->inp_subset_strategy) {
+	case IP_SUBSET_STRATEGY_FLOW:
+		/*
+		 * If the packet has a flow tag, use that, but otherwise,
+		 * calculate our own flow tag using the IP/port tuple.
+		 */
+		if (flowid != 0) {
+			if ((flowid % inp->inp_subset_count) ==
+			    inp->inp_subset_member)
+				return (1);
+		} else {
+			/*
+			 * XXXRW: This hash is not the hash that you are
+			 * looking for.
+			 */
+			if (((faddr.s_addr ^ laddr.s_addr ^ fport ^ lport) %
+			    inp->inp_subset_count) == inp->inp_subset_member)
+				return (1);
+		}
+		return (0);
+
+	case IP_SUBSET_STRATEGY_RANDOM:
+		/*
+		 * If there is a flow tag, use that and the IP ID as a source
+		 * of entropy.  Otherwise, calculate our own flow tag as
+		 * above and combine with the IP ID.
+		 *
+		 * XXXRW: This hash is also not the hash that you are looking
+		 * for.
+		 */
+		if (flowid != 0) {
+			if (((flowid ^ ip_id) % inp->inp_subset_count) ==
+			    inp->inp_subset_member)
+				return (1);
+		} else {
+			if (((faddr.s_addr ^ laddr.s_addr ^ fport ^ lport ^
+			    ip_id) % inp->inp_subset_count) ==
+			    inp->inp_subset_member)
+				return (1);
+		}
+		return (0);
+
+	case IP_SUBSET_STRATEGY_THREADID:
+		/*
+		 * Experiment: pick the socket to use based on the kernel
+		 * thread ID processing the packet.  This will be fixed for
+		 * particular RSS input queues, so will assign work to a
+		 * particular socket based on which input queue it came from.
+		 * This doesn't attempt to balance the work at all, simply
+		 * ensure that datagrams local to a particular CPU are
+		 * assigned to the same socket consistently.
+		 */
+		if ((curthread->td_tid % inp->inp_subset_count) ==
+		    inp->inp_subset_member)
+			return (1);
+		return (0);
+
+	case IP_SUBSET_STRATEGY_CPU:
+		/*
+		 * Experimental: packets from the same CPU will always get
+		 * assigned to the same socket.  Doesn't attempt to load
+		 * balance or maintain ordering, as source threads may not
+		 * always be on the same CPU.  However, may achieve a more
+		 * even or predictable balance than
+		 * IP_SUBSET_STRATEGY_THREADID.
+		 *
+		 * This might be quite a bit more interesting if sockets had
+		 * a formal affinity themselves, as then we could direct
+		 * datagrams to that explicitly.
+		 */
+		if ((curcpu % inp->inp_subset_count) ==
+		    inp->inp_subset_member)
+			return (1);
+		return (0);
+
+	/* case IP_SUBSET_STRATEGY_FILLSOCK: */
+		/*
+		 * In this theoretical mode, we attempt to fill sockets in
+		 * the order they are matched, and don't move onto the next
+		 * socket unless the previous one is filled.  This requires
+		 * us to peek up a layer and see if there is room for the
+		 * current datagram; this proves somewhat tricky as we need
+		 * to make sure we don't return ICMP when the last one proves
+		 * full, so we don't try to do that yet.
+		 */
+
+	default:
+		panic("in_subset_match: strategy %u",
+		    inp->inp_subset_strategy);
+	}
+}
+
+/*
  * Lookup PCB in hash list.
  */
 struct inpcb *
-in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
-    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
-    struct ifnet *ifp)
+in_pcblookup_hash_full(struct inpcbinfo *pcbinfo, struct in_addr faddr,
+    u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_short ip_id,
+    u_int32_t flowid, int wildcard, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
@@ -1309,20 +1412,25 @@ in_pcblookup_hash(struct inpcbinfo *pcbi
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
-		if (inp->inp_faddr.s_addr == faddr.s_addr &&
-		    inp->inp_laddr.s_addr == laddr.s_addr &&
-		    inp->inp_fport == fport &&
-		    inp->inp_lport == lport) {
-			/*
-			 * XXX We should be able to directly return
-			 * the inp here, without any checks.
-			 * Well unless both bound with SO_REUSEPORT?
-			 */
-			if (jailed(inp->inp_cred))
-				return (inp);
-			if (tmpinp == NULL)
-				tmpinp = inp;
-		}
+		if (inp->inp_faddr.s_addr != faddr.s_addr ||
+		    inp->inp_laddr.s_addr != laddr.s_addr ||
+		    inp->inp_fport != fport ||
+		    inp->inp_lport != lport)
+			continue;
+		if (inp->inp_subset_strategy != IP_SUBSET_STRATEGY_DISABLED
+		    && !in_subset_match(inp, faddr, fport, laddr, lport,
+		    ip_id, flowid))
+			continue;
+
+		/*
+		 * XXX We should be able to directly return
+		 * the inp here, without any checks.
+		 * Well unless both bound with SO_REUSEPORT?
+		 */
+		if (jailed(inp->inp_cred))
+			return (inp);
+		if (tmpinp == NULL)
+			tmpinp = inp;
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
@@ -1372,6 +1480,12 @@ in_pcblookup_hash(struct inpcbinfo *pcbi
 					continue;
 			}
 
+			if (inp->inp_subset_strategy !=
+			    IP_SUBSET_STRATEGY_DISABLED &&
+			    !in_subset_match(inp, faddr, fport, laddr, lport,
+			    ip_id, flowid))
+				continue;
+
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					return (inp);
@@ -1405,6 +1519,16 @@ in_pcblookup_hash(struct inpcbinfo *pcbi
 	return (NULL);
 }
 
+struct inpcb *
+in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
+    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
+    struct ifnet *ifp)
+{
+	/* Original interface: looks up with ip_id 0 and flowid 0. */
+	return (in_pcblookup_hash_full(pcbinfo, faddr, fport_arg, laddr,
+	    lport_arg, 0, 0, wildcard, ifp));
+}
+
 /*
  * Insert PCB onto various hash lists.
  */

Modified: projects/pnet/sys/netinet/in_pcb.h
==============================================================================
--- projects/pnet/sys/netinet/in_pcb.h	Thu Jan  1 20:47:09 2009	(r186685)
+++ projects/pnet/sys/netinet/in_pcb.h	Thu Jan  1 22:11:44 2009	(r186686)
@@ -199,6 +199,9 @@ struct inpcb {
 	} inp_depend6;
 	LIST_ENTRY(inpcb) inp_portlist;	/* (i/p) */
 	struct	inpcbport *inp_phd;	/* (i/p) head of this list */
+	u_int		inp_subset_strategy;
+	u_int		inp_subset_count;
+	u_int		inp_subset_member;
 #define inp_zero_size offsetof(struct inpcb, inp_gencnt)
 	inp_gen_t	inp_gencnt;	/* (c) generation count */
 	struct rwlock	inp_lock;
@@ -493,6 +496,11 @@ struct inpcb *
 struct inpcb *
 	in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *);
+struct inpcb *
+	in_pcblookup_hash_full(struct inpcbinfo *pcbinfo,
+	    struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
+	    u_int lport_arg, u_short ip_id, u_int32_t flowid, int wildcard,
+	    struct ifnet *ifp);
 void	in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
 	    int, struct inpcb *(*)(struct inpcb *, int));
 void	in_pcbref(struct inpcb *);

Modified: projects/pnet/sys/netinet/in_proto.c
==============================================================================
--- projects/pnet/sys/netinet/in_proto.c	Thu Jan  1 20:47:09 2009	(r186685)
+++ projects/pnet/sys/netinet/in_proto.c	Thu Jan  1 22:11:44 2009	(r186686)
@@ -124,7 +124,7 @@ struct protosw inetsw[] = {
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		udp_input,
 	.pr_ctlinput =		udp_ctlinput,
-	.pr_ctloutput =		ip_ctloutput,
+	.pr_ctloutput =		udp_ctloutput,
 	.pr_init =		udp_init,
 	.pr_usrreqs =		&udp_usrreqs
 },

Modified: projects/pnet/sys/netinet/udp_usrreq.c
==============================================================================
--- projects/pnet/sys/netinet/udp_usrreq.c	Thu Jan  1 20:47:09 2009	(r186685)
+++ projects/pnet/sys/netinet/udp_usrreq.c	Thu Jan  1 22:11:44 2009	(r186686)
@@ -526,8 +526,8 @@ udp_input(struct mbuf *m, int off)
 	/*
 	 * Locate pcb for datagram.
 	 */
-	inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport,
-	    ip->ip_dst, uh->uh_dport, 1, ifp);
+	inp = in_pcblookup_hash_full(&V_udbinfo, ip->ip_src, uh->uh_sport,
+	    ip->ip_dst, uh->uh_dport, ip->ip_id, m->m_pkthdr.flowid, 1, ifp);
 	if (inp == NULL) {
 		if (udp_log_in_vain) {
 			char buf[4*sizeof "123"];
@@ -621,6 +621,9 @@ udp_ctlinput(int cmd, struct sockaddr *s
 	 *
 	 * XXX: We never get this from ICMP, otherwise it makes an excellent
 	 * DoS attack on machines with many connections.
+	 *
+	 * XXXRW: With subsetting, we should deliver this to all matching
+	 * connections for the specific tuple.
 	 */
 	if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
@@ -644,6 +647,67 @@ udp_ctlinput(int cmd, struct sockaddr *s
 		    udp_notify);
 }
 
+int
+udp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	INIT_VNET_INET(so->so_vnet);
+	struct ip_subset is;
+	struct inpcb *inp;
+	int error;
+
+	inp = sotoinpcb(so);
+	KASSERT(inp != NULL, ("udp_ctloutput: inp == NULL"));
+	/* Handle IPPROTO_UDP-level options; pass all others to IP. */
+	if (sopt->sopt_level != IPPROTO_UDP)
+		return (ip_ctloutput(so, sopt));
+
+	switch (sopt->sopt_dir) {
+	case SOPT_GET:
+		switch (sopt->sopt_name) {
+		case IP_SUBSET:
+			bzero(&is, sizeof(is));
+			INP_RLOCK(inp);
+			is.is_strategy = inp->inp_subset_strategy;
+			is.is_count = inp->inp_subset_count;
+			is.is_member = inp->inp_subset_member;
+			INP_RUNLOCK(inp);
+			return (sooptcopyout(sopt, &is, sizeof(is)));
+		}
+		break;
+
+	case SOPT_SET:
+		switch (sopt->sopt_name) {
+		case IP_SUBSET:
+			error = sooptcopyin(sopt, &is, sizeof(is), sizeof(is));
+			if (error)
+				return (error);
+			/* Every enabled strategy computes "% is_count". */
+			switch (is.is_strategy) {
+			case IP_SUBSET_STRATEGY_DISABLED:
+				break;
+			case IP_SUBSET_STRATEGY_FLOW:
+			case IP_SUBSET_STRATEGY_RANDOM:
+			case IP_SUBSET_STRATEGY_THREADID:
+			case IP_SUBSET_STRATEGY_CPU:
+				if (is.is_count == 0 ||
+				    is.is_member >= is.is_count)
+					return (EINVAL);
+				break;
+			default:
+				return (EINVAL);
+			}
+			INP_WLOCK(inp);
+			inp->inp_subset_strategy = is.is_strategy;
+			inp->inp_subset_count = is.is_count;
+			inp->inp_subset_member = is.is_member;
+			INP_WUNLOCK(inp);
+			return (0);
+		}
+		break;
+	}
+	return (ENOPROTOOPT);	/* unrecognized option or direction */
+}
+
 static int
 udp_pcblist(SYSCTL_HANDLER_ARGS)
 {
@@ -758,6 +822,11 @@ udp_getcred(SYSCTL_HANDLER_ARGS)
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
+
+	/*
+	 * XXXRW: with IP subsetting, potentially more than one socket may
+	 * match, so we just return the cred for the first one.
+	 */
 	INP_INFO_RLOCK(&V_udbinfo);
 	inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 				addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);

Modified: projects/pnet/sys/netinet/udp_var.h
==============================================================================
--- projects/pnet/sys/netinet/udp_var.h	Thu Jan  1 20:47:09 2009	(r186685)
+++ projects/pnet/sys/netinet/udp_var.h	Thu Jan  1 22:11:44 2009	(r186686)
@@ -106,6 +106,7 @@ extern u_long			udp_recvspace;
 extern int			udp_log_in_vain;
 
 void		 udp_ctlinput(int, struct sockaddr *, void *);
+int		 udp_ctloutput(struct socket *so, struct sockopt *sopt);
 void		 udp_init(void);
 void		 udp_input(struct mbuf *, int);
 struct inpcb	*udp_notify(struct inpcb *inp, int errno);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200901012211.n01MBist080219>