Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 23 Aug 2018 16:54:22 +0000 (UTC)
From:      "Bjoern A. Zeeb" <bz@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r338257 - head/sys/netinet6
Message-ID:  <201808231654.w7NGsMbI026404@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: bz
Date: Thu Aug 23 16:54:22 2018
New Revision: 338257
URL: https://svnweb.freebsd.org/changeset/base/338257

Log:
  MFp4 bz_ipv6_fast:
  
  Migrate udp6_send() v4mapped code to udp6_output() saving us a re-lock and
  further simplifying the address-family handling code by eliminating
  AF_INET checks and almost all v4mapped handling right after the start
  as cases could actually not happen anymore.
  
  Rework output path locking similar to UDP4 allowing for better
  parallelism (see r222488, and later versions).
  
  Sponsored by: The FreeBSD Foundation (2012)
  Sponsored by: iXsystems (2012)
  Differential Revision:	https://reviews.freebsd.org/D3721

Modified:
  head/sys/netinet6/udp6_usrreq.c

Modified: head/sys/netinet6/udp6_usrreq.c
==============================================================================
--- head/sys/netinet6/udp6_usrreq.c	Thu Aug 23 16:52:52 2018	(r338256)
+++ head/sys/netinet6/udp6_usrreq.c	Thu Aug 23 16:54:22 2018	(r338257)
@@ -679,35 +679,38 @@ udp6_getcred(SYSCTL_HANDLER_ARGS)
 SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0,
     0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection");
 
+#define	UH_WLOCKED	2
+#define	UH_RLOCKED	1
+#define	UH_UNLOCKED	0
 static int
-udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
-    struct mbuf *control, struct thread *td)
+udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
+    struct sockaddr *addr6, struct mbuf *control, struct thread *td)
 {
-	u_int32_t ulen = m->m_pkthdr.len;
-	u_int32_t plen = sizeof(struct udphdr) + ulen;
+	struct inpcbinfo *pcbinfo;
+	struct inpcb *inp;
 	struct ip6_hdr *ip6;
 	struct udphdr *udp6;
 	struct in6_addr *laddr, *faddr, in6a;
-	struct sockaddr_in6 *sin6 = NULL;
-	int cscov_partial = 0;
-	int scope_ambiguous = 0;
-	u_short fport;
-	int error = 0;
-	uint8_t nxt;
-	uint16_t cscov = 0;
 	struct ip6_pktopts *optp, opt;
-	int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
-	int flags;
-	struct sockaddr_in6 tmp;
+	struct sockaddr_in6 *sin6, tmp;
+	struct epoch_tracker et;
+	int cscov_partial, error, flags, hlen, scope_ambiguous;
+	u_int32_t ulen, plen;
+	uint16_t cscov;
+	u_short fport;
+	uint8_t nxt, unlock_udbinfo;
 
-	INP_WLOCK_ASSERT(inp);
-	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
+	/* addr6 has been validated in udp6_send(). */
+	sin6 = (struct sockaddr_in6 *)addr6;
 
-	if (addr6) {
-		/* addr6 has been validated in udp6_send(). */
-		sin6 = (struct sockaddr_in6 *)addr6;
+	/*
+	 * In contrast to to IPv4 we do not validate the max. packet length
+	 * here due to IPv6 Jumbograms (RFC2675).
+	 */
 
-		/* protect *sin6 from overwrites */
+	scope_ambiguous = 0;
+	if (sin6) {
+		/* Protect *addr6 from overwrites. */
 		tmp = *sin6;
 		sin6 = &tmp;
 
@@ -721,22 +724,86 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct 
 		 */
 		if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
 			scope_ambiguous = 1;
-		if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
+		if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) {
+			if (control)
+				m_freem(control);
+			m_freem(m);
 			return (error);
+		}
 	}
 
+	inp = sotoinpcb(so);
+	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
+	INP_RLOCK(inp);
 	nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
 	    IPPROTO_UDP : IPPROTO_UDPLITE;
+
+#ifdef INET
+	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
+		int hasv4addr;
+
+		if (sin6 == NULL)
+			hasv4addr = (inp->inp_vflag & INP_IPV4);
+		else
+			hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
+			    ? 1 : 0;
+		if (hasv4addr) {
+			struct pr_usrreqs *pru;
+
+			/*
+			 * XXXRW: We release UDP-layer locks before calling
+			 * udp_send() in order to avoid recursion.  However,
+			 * this does mean there is a short window where inp's
+			 * fields are unstable.  Could this lead to a
+			 * potential race in which the factors causing us to
+			 * select the UDPv4 output routine are invalidated?
+			 */
+			INP_RUNLOCK(inp);
+			if (sin6)
+				in6_sin6_2_sin_in_sock((struct sockaddr *)sin6);
+			pru = inetsw[ip_protox[nxt]].pr_usrreqs;
+			/* addr will just be freed in sendit(). */
+			return ((*pru->pru_send)(so, flags_arg, m,
+			    (struct sockaddr *)sin6, control, td));
+		}
+	}
+#endif
+
 	if (control) {
 		if ((error = ip6_setpktopts(control, &opt,
-		    inp->in6p_outputopts, td->td_ucred, nxt)) != 0)
-			goto release;
+		    inp->in6p_outputopts, td->td_ucred, nxt)) != 0) {
+			INP_RUNLOCK(inp);
+			ip6_clearpktopts(&opt, -1);
+			if (control)
+				m_freem(control);
+			m_freem(m);
+			return (error);
+		}
 		optp = &opt;
 	} else
 		optp = inp->in6p_outputopts;
 
+	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
+	if (sin6 != NULL &&
+	    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) {
+		INP_RUNLOCK(inp);
+		/*
+		 * XXX there is a short window here which could lead to a race;
+		 * should we re-check that what got us here is still valid?
+		 */
+		INP_WLOCK(inp);
+		INP_HASH_WLOCK(pcbinfo);
+		unlock_udbinfo = UH_WLOCKED;
+	} else if (sin6 != NULL &&
+	    (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
+	    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
+	    inp->inp_lport == 0)) {
+		INP_HASH_RLOCK_ET(pcbinfo, et);
+		unlock_udbinfo = UH_RLOCKED;
+	} else
+		unlock_udbinfo = UH_UNLOCKED;
+
 	if (sin6) {
-		faddr = &sin6->sin6_addr;
 
 		/*
 		 * Since we saw no essential reason for calling in_pcbconnect,
@@ -755,85 +822,47 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct 
 			goto release;
 		}
 
-		fport = sin6->sin6_port; /* allow 0 port */
+		/*
+		 * Given we handle the v4mapped case in the INET block above
+		 * assert here that it must not happen anymore.
+		 */
+		KASSERT(!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr),
+		    ("%s: sin6(%p)->sin6_addr is v4mapped which we "
+		    "should have handled.", __func__, sin6));
 
-		if (IN6_IS_ADDR_V4MAPPED(faddr)) {
-			if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
-				/*
-				 * I believe we should explicitly discard the
-				 * packet when mapped addresses are disabled,
-				 * rather than send the packet as an IPv6 one.
-				 * If we chose the latter approach, the packet
-				 * might be sent out on the wire based on the
-				 * default route, the situation which we'd
-				 * probably want to avoid.
-				 * (20010421 jinmei@kame.net)
-				 */
-				error = EINVAL;
-				goto release;
-			}
-			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
-			    !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) {
-				/*
-				 * when remote addr is an IPv4-mapped address,
-				 * local addr should not be an IPv6 address,
-				 * since you cannot determine how to map IPv6
-				 * source address to IPv4.
-				 */
-				error = EINVAL;
-				goto release;
-			}
+		/* This only requires read-locking. */
+		error = in6_selectsrc_socket(sin6, optp, inp,
+		    td->td_ucred, scope_ambiguous, &in6a, NULL);
+		if (error)
+			goto release;
+		laddr = &in6a;
 
-			af = AF_INET;
-		}
+		if (inp->inp_lport == 0) {
 
-		if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
-			error = in6_selectsrc_socket(sin6, optp, inp,
-			    td->td_ucred, scope_ambiguous, &in6a, NULL);
-			if (error)
+			INP_WLOCK_ASSERT(inp);
+			error = in6_pcbsetport(laddr, inp, td->td_ucred);
+			if (error != 0) {
+				/* Undo an address bind that may have occurred. */
+				inp->in6p_laddr = in6addr_any;
 				goto release;
-			laddr = &in6a;
-		} else
-			laddr = &inp->in6p_laddr;	/* XXX */
-		if (laddr == NULL) {
-			if (error == 0)
-				error = EADDRNOTAVAIL;
-			goto release;
+			}
 		}
-		if (inp->inp_lport == 0 &&
-		    (error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) {
-			/* Undo an address bind that may have occurred. */
-			inp->in6p_laddr = in6addr_any;
-			goto release;
-		}
+		faddr = &sin6->sin6_addr;
+		fport = sin6->sin6_port; /* allow 0 port */
+
 	} else {
 		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 			error = ENOTCONN;
 			goto release;
 		}
-		if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) {
-			if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
-				/*
-				 * XXX: this case would happen when the
-				 * application sets the V6ONLY flag after
-				 * connecting the foreign address.
-				 * Such applications should be fixed,
-				 * so we bark here.
-				 */
-				log(LOG_INFO, "udp6_output: IPV6_V6ONLY "
-				    "option was set for a connected socket\n");
-				error = EINVAL;
-				goto release;
-			} else
-				af = AF_INET;
-		}
 		laddr = &inp->in6p_laddr;
 		faddr = &inp->in6p_faddr;
 		fport = inp->inp_fport;
 	}
 
-	if (af == AF_INET)
-		hlen = sizeof(struct ip);
+	ulen = m->m_pkthdr.len;
+	plen = sizeof(struct udphdr) + ulen;
+	hlen = sizeof(struct ip6_hdr);
 
 	/*
 	 * Calculate data length and get a mbuf
@@ -848,6 +877,7 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct 
 	/*
 	 * Stuff checksum and output datagram.
 	 */
+	cscov = cscov_partial = 0;
 	udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen);
 	udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
 	udp6->uh_dport = fport;
@@ -870,59 +900,59 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct 
 		udp6->uh_ulen = 0;
 	udp6->uh_sum = 0;
 
-	switch (af) {
-	case AF_INET6:
-		ip6 = mtod(m, struct ip6_hdr *);
-		ip6->ip6_flow	= inp->inp_flow & IPV6_FLOWINFO_MASK;
-		ip6->ip6_vfc	&= ~IPV6_VERSION_MASK;
-		ip6->ip6_vfc	|= IPV6_VERSION;
-		ip6->ip6_plen	= htons((u_short)plen);
-		ip6->ip6_nxt	= nxt;
-		ip6->ip6_hlim	= in6_selecthlim(inp, NULL);
-		ip6->ip6_src	= *laddr;
-		ip6->ip6_dst	= *faddr;
+	ip6 = mtod(m, struct ip6_hdr *);
+	ip6->ip6_flow	= inp->inp_flow & IPV6_FLOWINFO_MASK;
+	ip6->ip6_vfc	&= ~IPV6_VERSION_MASK;
+	ip6->ip6_vfc	|= IPV6_VERSION;
+	ip6->ip6_plen	= htons((u_short)plen);
+	ip6->ip6_nxt	= nxt;
+	ip6->ip6_hlim	= in6_selecthlim(inp, NULL);
+	ip6->ip6_src	= *laddr;
+	ip6->ip6_dst	= *faddr;
 
-		if (cscov_partial) {
-			if ((udp6->uh_sum = in6_cksum_partial(m, nxt,
-			    sizeof(struct ip6_hdr), plen, cscov)) == 0)
-				udp6->uh_sum = 0xffff;
-		} else {
-			udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0);
-			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
-			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
-		}
+#ifdef MAC
+	mac_inpcb_create_mbuf(inp, m);
+#endif
 
+	if (cscov_partial) {
+		if ((udp6->uh_sum = in6_cksum_partial(m, nxt,
+		    sizeof(struct ip6_hdr), plen, cscov)) == 0)
+			udp6->uh_sum = 0xffff;
+	} else {
+		udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0);
+		m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
+		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+	}
+
+	flags = 0;
 #ifdef	RSS
-		{
-			uint32_t hash_val, hash_type;
-			uint8_t pr;
+	{
+		uint32_t hash_val, hash_type;
+		uint8_t pr;
 
-			pr = inp->inp_socket->so_proto->pr_protocol;
-			/*
-			 * Calculate an appropriate RSS hash for UDP and
-			 * UDP Lite.
-			 *
-			 * The called function will take care of figuring out
-			 * whether a 2-tuple or 4-tuple hash is required based
-			 * on the currently configured scheme.
-			 *
-			 * Later later on connected socket values should be
-			 * cached in the inpcb and reused, rather than constantly
-			 * re-calculating it.
-			 *
-			 * UDP Lite is a different protocol number and will
-			 * likely end up being hashed as a 2-tuple until
-			 * RSS / NICs grow UDP Lite protocol awareness.
-			 */
-			if (rss_proto_software_hash_v6(faddr, laddr, fport,
-			    inp->inp_lport, pr, &hash_val, &hash_type) == 0) {
-				m->m_pkthdr.flowid = hash_val;
-				M_HASHTYPE_SET(m, hash_type);
-			}
+		pr = inp->inp_socket->so_proto->pr_protocol;
+		/*
+		 * Calculate an appropriate RSS hash for UDP and
+		 * UDP Lite.
+		 *
+		 * The called function will take care of figuring out
+		 * whether a 2-tuple or 4-tuple hash is required based
+		 * on the currently configured scheme.
+		 *
+		 * Later later on connected socket values should be
+		 * cached in the inpcb and reused, rather than constantly
+		 * re-calculating it.
+		 *
+		 * UDP Lite is a different protocol number and will
+		 * likely end up being hashed as a 2-tuple until
+		 * RSS / NICs grow UDP Lite protocol awareness.
+		 */
+		if (rss_proto_software_hash_v6(faddr, laddr, fport,
+		    inp->inp_lport, pr, &hash_val, &hash_type) == 0) {
+			m->m_pkthdr.flowid = hash_val;
+			M_HASHTYPE_SET(m, hash_type);
 		}
-#endif
-		flags = 0;
-#ifdef	RSS
+
 		/*
 		 * Don't override with the inp cached flowid.
 		 *
@@ -932,28 +962,43 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct 
 		flags |= IP_NODEFAULTFLOWID;
 #endif
 
-		if (nxt == IPPROTO_UDPLITE)
-			UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6);
-		else
-			UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
-		UDPSTAT_INC(udps_opackets);
-		error = ip6_output(m, optp, &inp->inp_route6, flags,
-		    inp->in6p_moptions, NULL, inp);
-		break;
-	case AF_INET:
-		error = EAFNOSUPPORT;
-		goto release;
+	UDPSTAT_INC(udps_opackets);
+	if (unlock_udbinfo == UH_WLOCKED)
+		INP_HASH_WUNLOCK(pcbinfo);
+	else if (unlock_udbinfo == UH_RLOCKED)
+		INP_HASH_RUNLOCK_ET(pcbinfo, et);
+	if (nxt == IPPROTO_UDPLITE)
+		UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6);
+	else
+		UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
+	error = ip6_output(m, optp, &inp->inp_route6, flags,
+	    inp->in6p_moptions, NULL, inp);
+	if (unlock_udbinfo == UH_WLOCKED)
+		INP_WUNLOCK(inp);
+	else
+		INP_RUNLOCK(inp);
+
+	if (control) {
+		ip6_clearpktopts(&opt, -1);
+		m_freem(control);
 	}
-	goto releaseopt;
+	return (error);
 
 release:
-	m_freem(m);
-
-releaseopt:
+	if (unlock_udbinfo == UH_WLOCKED) {
+		INP_HASH_WUNLOCK(pcbinfo);
+		INP_WUNLOCK(inp);
+	} else if (unlock_udbinfo == UH_RLOCKED) {
+		INP_HASH_RUNLOCK_ET(pcbinfo, et);
+		INP_RUNLOCK(inp);
+	} else
+		INP_RUNLOCK(inp);
 	if (control) {
 		ip6_clearpktopts(&opt, -1);
 		m_freem(control);
 	}
+	m_freem(m);
+
 	return (error);
 }
 
@@ -1257,15 +1302,8 @@ static int
 udp6_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *addr, struct mbuf *control, struct thread *td)
 {
-	struct inpcb *inp;
-	struct inpcbinfo *pcbinfo;
-	int error = 0;
+	int error;
 
-	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
-	inp = sotoinpcb(so);
-	KASSERT(inp != NULL, ("udp6_send: inp == NULL"));
-
-	INP_WLOCK(inp);
 	if (addr) {
 		if (addr->sa_len != sizeof(struct sockaddr_in6)) {
 			error = EINVAL;
@@ -1277,53 +1315,11 @@ udp6_send(struct socket *so, int flags, struct mbuf *m
 		}
 	}
 
-#ifdef INET
-	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
-		int hasv4addr;
-		struct sockaddr_in6 *sin6 = NULL;
+	return (udp6_output(so, flags, m, addr, control, td));
 
-		if (addr == NULL)
-			hasv4addr = (inp->inp_vflag & INP_IPV4);
-		else {
-			sin6 = (struct sockaddr_in6 *)addr;
-			hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
-			    ? 1 : 0;
-		}
-		if (hasv4addr) {
-			struct pr_usrreqs *pru;
-			uint8_t nxt;
-
-			nxt = (inp->inp_socket->so_proto->pr_protocol ==
-			    IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE;
-			/*
-			 * XXXRW: We release UDP-layer locks before calling
-			 * udp_send() in order to avoid recursion.  However,
-			 * this does mean there is a short window where inp's
-			 * fields are unstable.  Could this lead to a
-			 * potential race in which the factors causing us to
-			 * select the UDPv4 output routine are invalidated?
-			 */
-			INP_WUNLOCK(inp);
-			if (sin6)
-				in6_sin6_2_sin_in_sock(addr);
-			pru = inetsw[ip_protox[nxt]].pr_usrreqs;
-			/* addr will just be freed in sendit(). */
-			return ((*pru->pru_send)(so, flags, m, addr, control,
-			    td));
-		}
-	}
-#endif
-#ifdef MAC
-	mac_inpcb_create_mbuf(inp, m);
-#endif
-	INP_HASH_WLOCK(pcbinfo);
-	error = udp6_output(inp, m, addr, control, td);
-	INP_HASH_WUNLOCK(pcbinfo);
-	INP_WUNLOCK(inp);
-	return (error);
-
 bad:
-	INP_WUNLOCK(inp);
+	if (control)
+		m_freem(control);
 	m_freem(m);
 	return (error);
 }



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201808231654.w7NGsMbI026404>