Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 19 Oct 2014 21:07:36 +0000 (UTC)
From:      "Alexander V. Chernikov" <melifaro@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r273289 - in projects/routing/sys: net netinet
Message-ID:  <201410192107.s9JL7a1s062998@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: melifaro
Date: Sun Oct 19 21:07:35 2014
New Revision: 273289
URL: https://svnweb.freebsd.org/changeset/base/273289

Log:
  Switch IPv4 output path to use new routing api.
  
  The goal of the new API is to provide consumers with minimal
    needed information, but as fast as possible. So we provide
    full nexthop info copied into aligned on-cache structure
    instead of rte/ia pointers, their refcounts and locks.
    This does not provide solution for protecting from egress
    ifp destruction, but does not make it any worse.
  
  Current changes:
  
  nhops:
  Add fib4_lookup_prepend() function which stores either full
  L2+L3 prepend info (e.g. MAC header in case of plain IPv4) or
  L3 info with NH_FLAGS_L2_INCOMPLETE flag indicating that no valid L2
  info exists and we have to take "slow" path.
  
  ip_output:
  Currently ip[ 46]_output consumers use 'struct route' for
  the following purposes:
    1) double lookup avoidance(route caching)
    2) plain route caching
    3) get path MTU to be able to notify source.
  The first pattern is mostly used by various tunnels
   (gif, gre, stf). (Actually, gre is the only remaining,
   others were already converted. Their locking model did
   not scale well enough to benefit from such caching, so
   we have (temporarily) removed it without any performance
   loss).
  Plain route caching used by SCTP is simply wrong and should be removed.
    Temporarily break it for now just to be able to compile.
  Optimize path mtu reporting by providing it in new 'route_info' structure.
  
  Minimize games with @ia locking/refcounting for route lookup:
    add special nhop[46]_extended structure to store more route attributes.
    Pointer to given structure can be passed to fib4_lookup_prepend() to indicate
    we want this info (we actually need it for UDP and raw IP).
  
  ether_output:
  Provide light-weight ether_output2() call to deal with
  transmitting L2 frame (e.g. properly handle broadcast/simloop/bridge/
    other L2 hooks before actually transmitting frame by if_transmit()).
  Add a hack based on new RT_NHOP ro_flag to distinguish which version we
    should call. A better way is probably to add a new "if_output_frame" driver
    callback.
  
   Next steps:
  * Convert ip_fastfwd part
  * Implement auto-growing array for per-radix nexthops
  * Implement LLE tracking for nexthop calculations to be able to
    immediately provide all necessary info in single route lookup
    for gateway routes
  * Switch radix locking scheme to runtime/cfg lock
  * Implement multipath support for rtsock
  * Implement "tracked nexthops" for tunnels (e.g. _proper_
    nexthop caching)
  * Add IPv6 support for remaining parts (postponed not to
     interfere with user/ae/inet6 branch)
  * Consider adding "if_output_frame" driver call to
    ease logical frame pushing.

Modified:
  projects/routing/sys/net/if_ethersubr.c
  projects/routing/sys/net/if_gre.c
  projects/routing/sys/net/if_stf.c
  projects/routing/sys/net/route.h
  projects/routing/sys/net/rt_nhops.c
  projects/routing/sys/net/rt_nhops.h
  projects/routing/sys/netinet/if_ether.c
  projects/routing/sys/netinet/if_ether.h
  projects/routing/sys/netinet/in_gif.c
  projects/routing/sys/netinet/ip_input.c
  projects/routing/sys/netinet/ip_output.c
  projects/routing/sys/netinet/ip_var.h
  projects/routing/sys/netinet/sctp_os_bsd.h
  projects/routing/sys/netinet/tcp_output.c

Modified: projects/routing/sys/net/if_ethersubr.c
==============================================================================
--- projects/routing/sys/net/if_ethersubr.c	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/net/if_ethersubr.c	Sun Oct 19 21:07:35 2014	(r273289)
@@ -78,6 +78,7 @@
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif
+#include <net/rt_nhops.h>
 
 int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m);
 int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp,
@@ -119,6 +120,14 @@ static	int ether_resolvemulti(struct ifn
 static	void ether_reassign(struct ifnet *, struct vnet *, char *);
 #endif
 
+int ether_output_full(struct ifnet *ifp, struct mbuf *m,
+    const struct sockaddr *dst, struct route *ro);
+int ether_output2(struct ifnet *ifp, struct mbuf *m, struct nhop_data *nh,
+    int af);
+
+static int loopback_frame(struct ifnet *ifp, struct mbuf *m, int family,
+    int hlen);
+
 /* XXX: should be in an arp support file, not here */
 static MALLOC_DEFINE(M_ARPCOM, "arpcom", "802.* interface internals");
 
@@ -143,6 +152,17 @@ update_mbuf_csumflags(struct mbuf *src, 
 		dst->m_pkthdr.csum_data = 0xffff;
 }
 
+int
+ether_output(struct ifnet *ifp, struct mbuf *m,
+	const struct sockaddr *dst, struct route *ro)
+{
+	if (ro != NULL && (ro->ro_flags & RT_NHOP))
+		return (ether_output2(ifp, m, (struct nhop_data *)ro->ro_lle,
+		    (ro->ro_flags >> 8) & 0xFF));
+
+	return (ether_output_full(ifp, m, dst, ro));
+}
+
 /*
  * Ethernet output routine.
  * Encapsulate a packet of type family for the local net.
@@ -150,7 +170,7 @@ update_mbuf_csumflags(struct mbuf *src, 
  * packet leaves a multiple of 512 bytes of data in remainder.
  */
 int
-ether_output(struct ifnet *ifp, struct mbuf *m,
+ether_output_full(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	short type;
@@ -287,31 +307,11 @@ ether_output(struct ifnet *ifp, struct m
 	 */
 	if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy &&
 	    ((t = pf_find_mtag(m)) == NULL || !t->routed)) {
-		if (m->m_flags & M_BCAST) {
-			struct mbuf *n;
-
-			/*
-			 * Because if_simloop() modifies the packet, we need a
-			 * writable copy through m_dup() instead of a readonly
-			 * one as m_copy[m] would give us. The alternative would
-			 * be to modify if_simloop() to handle the readonly mbuf,
-			 * but performancewise it is mostly equivalent (trading
-			 * extra data copying vs. extra locking).
-			 *
-			 * XXX This is a local workaround.  A number of less
-			 * often used kernel parts suffer from the same bug.
-			 * See PR kern/105943 for a proposed general solution.
-			 */
-			if ((n = m_dup(m, M_NOWAIT)) != NULL) {
-				update_mbuf_csumflags(m, n);
-				(void)if_simloop(ifp, n, dst->sa_family, hlen);
-			} else
-				if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
-		} else if (bcmp(eh->ether_dhost, eh->ether_shost,
-				ETHER_ADDR_LEN) == 0) {
-			update_mbuf_csumflags(m, m);
-			(void) if_simloop(ifp, m, dst->sa_family, hlen);
-			return (0);	/* XXX */
+		if ((m->m_flags & M_BCAST) || (bcmp(eh->ether_dhost,
+		    eh->ether_shost, ETHER_ADDR_LEN) == 0)) {
+			/* Either broadcast or to-us L2 header */
+			if (loopback_frame(ifp, m, dst->sa_family, hlen) == 1)
+				return (0);
 		}
 	}
 
@@ -347,6 +347,112 @@ bad:			if (m != NULL)
 }
 
 /*
+ * We assume this function to be called for
+ * ip[6]_output(), with already pre-compiled L2 header.
+ *
+ * Function assumes all loopback routing is already done on L3,
+ * so the only reason to push packet (copy) to host is M_BCAST flag.
+ */
+int
+ether_output2(struct ifnet *ifp, struct mbuf *m, struct nhop_data *nh, int af)
+{
+	int error;
+
+#ifdef MAC
+	error = mac_ifnet_check_transmit(ifp, m);
+	if (error)
+		senderr(error);
+#endif
+
+	M_PROFILE(m);
+	if (ifp->if_flags & IFF_MONITOR)
+		senderr(ENETDOWN);
+	if (!((ifp->if_flags & IFF_UP) &&
+	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
+		senderr(ENETDOWN);
+
+	if ((ifp->if_flags & IFF_SIMPLEX) && (m->m_flags & M_BCAST)) {
+		/* We have to copy frame to-us */
+		if (loopback_frame(NH_LIFP(nh), m, af, nh->nh_count) != 0)
+			return (0);
+	}
+
+       /*
+	* Bridges require special output handling.
+	*/
+	if (ifp->if_bridge) {
+		BRIDGE_OUTPUT(ifp, m, error);
+		return (error);
+	}
+
+#if defined(INET) || defined(INET6)
+	if (ifp->if_carp) {
+		struct sockaddr_in dst;
+		memset(&dst, 0, sizeof(dst));
+		//dst.sin_addr = 
+	    	error = (*carp_output_p)(ifp, m,
+		    (const struct sockaddr *)&dst);
+		if (error != 0)
+			goto bad;
+	}
+#endif
+
+	/* Handle ng_ether(4) processing, if any */
+	if (IFP2AC(ifp)->ac_netgraph != NULL) {
+		KASSERT(ng_ether_output_p != NULL,
+		    ("ng_ether_output_p is NULL"));
+		if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
+bad:			if (m != NULL)
+				m_freem(m);
+			return (error);
+		}
+		if (m == NULL)
+			return (0);
+	}
+
+	/* Continue with link-layer output */
+	return (ether_output_frame(ifp, m));
+}
+
+static int
+loopback_frame(struct ifnet *ifp, struct mbuf *m, int family, int hlen)
+{
+	struct ether_header *eh;
+
+	if (m->m_flags & M_BCAST) {
+		struct mbuf *n;
+
+		/*
+		 * Because if_simloop() modifies the packet, we need a
+		 * writable copy through m_dup() instead of a readonly
+		 * one as m_copy[m] would give us. The alternative would
+		 * be to modify if_simloop() to handle the readonly mbuf,
+		 * but performancewise it is mostly equivalent (trading
+		 * extra data copying vs. extra locking).
+		 *
+		 * XXX This is a local workaround.  A number of less
+		 * often used kernel parts suffer from the same bug.
+		 * See PR kern/105943 for a proposed general solution.
+		 */
+		if ((n = m_dup(m, M_NOWAIT)) != NULL) {
+			update_mbuf_csumflags(m, n);
+			if_simloop(ifp, n, family, hlen);
+		} else
+			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
+	} else {
+		eh = mtod(m, struct ether_header *);
+		if (bcmp(eh->ether_dhost, eh->ether_shost,
+		    ETHER_ADDR_LEN) == 0) {
+			update_mbuf_csumflags(m, m);
+			if_simloop(ifp, m, family, hlen);
+			return (1);
+		}
+	}
+
+	return (0);
+}
+
+/*
  * Ethernet link layer output routine to send a raw frame to the device.
  *
  * This assumes that the 14 byte Ethernet header is present and contiguous

Modified: projects/routing/sys/net/if_gre.c
==============================================================================
--- projects/routing/sys/net/if_gre.c	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/net/if_gre.c	Sun Oct 19 21:07:35 2014	(r273289)
@@ -507,7 +507,7 @@ gre_output(struct ifnet *ifp, struct mbu
 	 * overwriting the ip_id again.  ip_id is already set to the
 	 * ip_id of the encapsulated packet.
 	 */
-	error = ip_output(m, NULL, &sc->route, IP_FORWARDING,
+	error = ip_output(m, NULL, NULL, IP_FORWARDING,
 	    (struct ip_moptions *)NULL, (struct inpcb *)NULL);
   end:
 	if (error)

Modified: projects/routing/sys/net/if_stf.c
==============================================================================
--- projects/routing/sys/net/if_stf.c	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/net/if_stf.c	Sun Oct 19 21:07:35 2014	(r273289)
@@ -558,7 +558,7 @@ stf_output(struct ifnet *ifp, struct mbu
 sendit:
 	M_SETFIB(m, sc->sc_fibnum);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
-	error = ip_output(m, NULL, cached_route, 0, NULL, NULL);
+	error = ip_output(m, NULL, NULL, 0, NULL, NULL);
 
 	if (cached_route != NULL)
 		mtx_unlock(&(sc)->sc_ro_mtx);

Modified: projects/routing/sys/net/route.h
==============================================================================
--- projects/routing/sys/net/route.h	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/net/route.h	Sun Oct 19 21:07:35 2014	(r273289)
@@ -59,6 +59,7 @@ struct route {
 
 #define	RT_CACHING_CONTEXT	0x1	/* XXX: not used anywhere */
 #define	RT_NORTREF		0x2	/* doesn't hold reference on ro_rt */
+#define	RT_NHOP			0x4
 
 struct rt_metrics {
 	u_long	rmx_locks;	/* Kernel must leave these values alone */

Modified: projects/routing/sys/net/rt_nhops.c
==============================================================================
--- projects/routing/sys/net/rt_nhops.c	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/net/rt_nhops.c	Sun Oct 19 21:07:35 2014	(r273289)
@@ -62,9 +62,13 @@
 #endif
 
 #include <netinet/in.h>
+#include <netinet/in_var.h>
 #include <netinet/ip_mroute.h>
 #include <netinet/ip6.h>
 
+#include <net/if_types.h>
+#include <netinet/if_ether.h>
+#include <net/ethernet.h>
 #include <net/rt_nhops.h>
 
 #include <vm/uma.h>
@@ -104,6 +108,18 @@ static struct rwlock fwd_lock;
 int fwd_attach_fib(struct fwd_module *fm, u_int fib);
 int fwd_destroy_fib(struct fwd_module *fm, u_int fib);
 #endif
+
+#ifdef INET
+static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+    struct nhop4_extended *pnh4);
+static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+    struct nhop4_basic *pnh4);
+#endif
+#ifdef INET
+static void fib6_rte_to_nh_basic(struct rtentry *rte, struct in6_addr dst,
+    struct nhop6_basic *pnh6);
+#endif
+
 MALLOC_DEFINE(M_RTFIB, "rtfib", "routing fwd");
 
 
@@ -132,14 +148,243 @@ MALLOC_DEFINE(M_RTFIB, "rtfib", "routing
 #define	NHOP_FLAGS_MASK	(RTF_REJECT|RTF_BLACKHOLE)
 //#define	NHOP_DIRECT	
 #define RNTORT(p)	((struct rtentry *)(p))
+
+
+/*
+ * Copies proper nexthop data based on @nh_src nexthop.
+ *
+ * For non-ECMP nexthop function simply copies @nh_src.
+ * For ECMP nexthops flowid is used to select proper
+ * nexthop.
+ *
+ */
+static inline void
+fib_choose_prepend(uint32_t fibnum, struct nhop_data *nh_src,
+    uint32_t flowid, struct nhop_data *nh, int af)
+{
+	struct nhop_multi *nh_multi;
+	int idx;
+
+	if ((nh_src->nh_flags & NH_FLAGS_RECURSE) != 0) {
+
+		/*
+		 * Recursive nexthop. Choose direct nexthop
+		 * based on flowid.
+		 */
+		nh_multi = (struct nhop_multi *)nh_src;
+		idx = nh_multi->nh_nhops[flowid % nh_multi->nh_count];
+#if 0
+		KASSERT((fibnum < rt_numfibs), ("fib4_lookup_prependĀ§: bad fibnum"));
+		rnh = rt_tables_get_rnh(fibnum, AF_INET);
+		//nh_src = &rnh->nhops[i];
+#endif
+	}
+
+	*nh = *nh_src; 
+	/* TODO: Do some light-weight refcounting on egress ifp's */
+}
+
+static inline void
+fib_free_nh(uint32_t fibnum, struct nhop_data *nh, int af)
+{
+
+	/* TODO: Do some light-weight refcounting on egress ifp's */
+}
+
 #ifdef INET
+void
+fib4_free_nh(uint32_t fibnum, struct nhop_data *nh)
+{
+
+	fib_free_nh(fibnum, nh, AF_INET);
+}
+
+void
+fib4_choose_prepend(uint32_t fibnum, struct nhop_data *nh_src,
+    uint32_t flowid, struct nhop_data *nh, struct nhop4_extended *nh_ext)
+{
+
+	fib_choose_prepend(fibnum, nh_src, flowid, nh, AF_INET);
+	if (nh_ext == NULL)
+		return;
+
+	nh_ext->nh_ifp = NH_LIFP(nh);
+	nh_ext->nh_mtu = nh->nh_mtu;
+	nh_ext->nh_flags = nh->nh_flags;
+#if 0
+	/* TODO: copy source/gw address from extended nexthop data */
+	nh_ext->nh_addr = ;
+	nh_ext->nh_src= ;
+#endif
+}
+
+/*
+ * Function performs lookup in IPv4 table fib @fibnum.
+ *
+ * In case of successful lookup @nh header is filled with
+ * appropriate interface info and full L2 header to prepend.
+ *
+ * If no valid ARP record is present, NH_FLAGS_L2_INCOMPLETE flag
+ * is set and gateway address is stored into nh->d.gw4
+ *
+ * If @nh_ext is not NULL, additional nexthop data is stored there.
+ *
+ * Returns 0 on success.
+ *
+ */
+int
+fib4_lookup_prepend(uint32_t fibnum, struct in_addr dst, struct mbuf *m,
+    struct nhop_data *nh, struct nhop4_extended *nh_ext)
+{
+	struct radix_node_head *rnh;
+	struct radix_node *rn;
+	struct sockaddr_in *gw_sa, sin;
+	struct ifnet *lifp;
+	struct in_addr gw;
+	struct ether_header *eh;
+	int error, flags;
+	//uint32_t flowid;
+	struct rtentry *rte;
+
+	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_prepend: bad fibnum"));
+	rnh = rt_tables_get_rnh(fibnum, AF_INET);
+	if (rnh == NULL)
+		return (EHOSTUNREACH);
+
+	/* Prepare lookup key */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_len = sizeof(struct sockaddr_in);
+	sin.sin_addr = dst;
+
+	RADIX_NODE_HEAD_RLOCK(rnh);
+	rn = rnh->rnh_matchaddr((void *)&sin, rnh);
+	rte = RNTORT(rn);
+	if (rn == NULL || ((rn->rn_flags & RNF_ROOT) != 0) ||
+	    RT_LINK_IS_UP(rte->rt_ifp) == 0) {
+		RADIX_NODE_HEAD_RUNLOCK(rnh);
+		return (EHOSTUNREACH);
+	}
+
+	/*
+	 * Currently we fill in @nh ourselves.
+	 * In near future rte will have nhop index to copy from.
+	 */
+
+	/* Calculate L3 info */
+	flags = 0;
+	nh->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+	if (rte->rt_flags & RTF_GATEWAY) {
+		gw_sa = (struct sockaddr_in *)rte->rt_gateway;
+		gw = gw_sa->sin_addr;
+	} else
+		gw = dst;
+	/* Set flags */
+	flags = rte->rt_flags & NHOP_FLAGS_MASK;
+	gw_sa = (struct sockaddr_in *)rt_key(rte);
+	if (gw_sa->sin_addr.s_addr == 0)
+		flags |= NHOP_DEFAULT;
+
+	/*
+	 * TODO: nh L2/L3 resolve.
+	 * Currently all we have is rte ifp.
+	 * Simply use it.
+	 */
+	lifp = rte->rt_ifp;
+	/* Save both logical and transmit interface indexes */
+	nh->lifp_idx = lifp->if_index;
+	nh->i.ifp_idx = nh->lifp_idx;
+
+	if (nh_ext != NULL) {
+		/* Fill in extended info */
+		fib4_rte_to_nh_extended(rte, dst, nh_ext);
+	}
+
+	RADIX_NODE_HEAD_RUNLOCK(rnh);
+
+	nh->nh_flags = flags;
+	/*
+	 * Try to lookup L2 info.
+	 * Do this using separate LLE locks.
+	 * TODO: move this under radix lock.
+	 */
+	if (lifp->if_type == IFT_ETHER) {
+		eh = (struct ether_header *)nh->d.data;
+
+		/*
+		 * Fill in ethernet header.
+		 * It should be already presented if we're
+		 * sending data via known gateway.
+		 */
+		error = arpresolve_fast(lifp, gw, m->m_flags, eh->ether_dhost);
+		if (error == 0) {
+			memcpy(&eh->ether_shost, IF_LLADDR(lifp), ETHER_ADDR_LEN);
+			eh->ether_type = htons(ETHERTYPE_IP);
+			nh->nh_count = ETHER_HDR_LEN;
+			return (0);
+		}
+	}
+
+	/* Notify caller that no L2 info is linked */
+	nh->nh_count = 0;
+	nh->nh_flags |= NH_FLAGS_L2_INCOMPLETE;
+	/* ..And save gateway address */
+	nh->d.gw4 = gw;
+	return (0);
+}
+
+static void
+fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+    struct nhop4_extended *pnh4)
+{
+	struct sockaddr_in *gw;
+	struct in_ifaddr *ia;
+
+	pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+	pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+	if (rte->rt_flags & RTF_GATEWAY) {
+		gw = (struct sockaddr_in *)rte->rt_gateway;
+		pnh4->nh_addr = gw->sin_addr;
+	} else
+		pnh4->nh_addr = dst;
+
+	ia = ifatoia(rte->rt_ifa);
+	pnh4->nh_src = IA_SIN(ia)->sin_addr;
+
+	/* Set flags */
+	pnh4->nh_flags = rte->rt_flags & NHOP_FLAGS_MASK;
+	gw = (struct sockaddr_in *)rt_key(rte);
+	if (gw->sin_addr.s_addr == 0)
+		pnh4->nh_flags |= NHOP_DEFAULT;
+}
+
+
+static void
+fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+    struct nhop4_basic *pnh4)
+{
+	struct sockaddr_in *gw;
+
+	pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+	pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+	if (rte->rt_flags & RTF_GATEWAY) {
+		gw = (struct sockaddr_in *)rte->rt_gateway;
+		pnh4->nh_addr = gw->sin_addr;
+	} else
+		pnh4->nh_addr = dst;
+	/* Set flags */
+	pnh4->nh_flags = rte->rt_flags & NHOP_FLAGS_MASK;
+	gw = (struct sockaddr_in *)rt_key(rte);
+	if (gw->sin_addr.s_addr == 0)
+		pnh4->nh_flags |= NHOP_DEFAULT;
+}
+
 int
 fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flowid,
     struct nhop4_basic *pnh4)
 {
 	struct radix_node_head *rnh;
 	struct radix_node *rn;
-	struct sockaddr_in *gw, sin;
+	struct sockaddr_in sin;
 	struct rtentry *rte;
 
 	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum"));
@@ -157,18 +402,7 @@ fib4_lookup_nh_basic(uint32_t fibnum, st
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
-			pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
-			pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
-			if (rte->rt_flags & RTF_GATEWAY) {
-				gw = (struct sockaddr_in *)rte->rt_gateway;
-				pnh4->nh_addr = gw->sin_addr;
-			} else
-				pnh4->nh_addr = dst;
-			/* Set flags */
-			pnh4->nh_flags = rte->rt_flags & NHOP_FLAGS_MASK;
-			gw = (struct sockaddr_in *)rt_key(rte);
-			if (gw->sin_addr.s_addr == 0)
-				pnh4->nh_flags |= NHOP_DEFAULT;
+			fib4_rte_to_nh_basic(rte, dst, pnh4);
 			RADIX_NODE_HEAD_RUNLOCK(rnh);
 
 			return (0);
@@ -181,13 +415,59 @@ fib4_lookup_nh_basic(uint32_t fibnum, st
 #endif
 
 #ifdef INET6
+void
+fib6_free_nh(uint32_t fibnum, struct nhop_data *nh)
+{
+
+	fib_free_nh(fibnum, nh, AF_INET6);
+}
+
+void
+fib6_choose_prepend(uint32_t fibnum, struct nhop_data *nh_src,
+    uint32_t flowid, struct nhop_data *nh, struct nhop6_extended *nh_ext)
+{
+
+	fib_choose_prepend(fibnum, nh_src, flowid, nh, AF_INET6);
+	if (nh_ext == NULL)
+		return;
+
+	nh_ext->nh_ifp = NH_LIFP(nh);
+	nh_ext->nh_mtu = nh->nh_mtu;
+	nh_ext->nh_flags = nh->nh_flags;
+/*
+	nh_ext->nh_addr = ;
+	nh_ext->nh_src= ;
+*/
+}
+
+
+static void
+fib6_rte_to_nh_basic(struct rtentry *rte, struct in6_addr dst,
+    struct nhop6_basic *pnh6)
+{
+	struct sockaddr_in6 *gw;
+
+	pnh6->nh_ifp = rte->rt_ifa->ifa_ifp;
+	pnh6->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+	if (rte->rt_flags & RTF_GATEWAY) {
+		gw = (struct sockaddr_in6 *)rte->rt_gateway;
+		pnh6->nh_addr = gw->sin6_addr;
+	} else
+		pnh6->nh_addr = dst;
+	/* Set flags */
+	pnh6->nh_flags = rte->rt_flags & NHOP_FLAGS_MASK;
+	gw = (struct sockaddr_in6 *)rt_key(rte);
+	if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
+		pnh6->nh_flags |= NHOP_DEFAULT;
+}
+
 int
 fib6_lookup_nh_basic(uint32_t fibnum, struct in6_addr dst, uint32_t flowid,
     struct nhop6_basic *pnh6)
 {
 	struct radix_node_head *rnh;
 	struct radix_node *rn;
-	struct sockaddr_in6 *gw, sin6;
+	struct sockaddr_in6 sin6;
 	struct rtentry *rte;
 
 	KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum"));
@@ -205,18 +485,7 @@ fib6_lookup_nh_basic(uint32_t fibnum, st
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
-			pnh6->nh_ifp = rte->rt_ifa->ifa_ifp;
-			pnh6->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
-			if (rte->rt_flags & RTF_GATEWAY) {
-				gw = (struct sockaddr_in6 *)rte->rt_gateway;
-				pnh6->nh_addr = gw->sin6_addr;
-			} else
-				pnh6->nh_addr = dst;
-			/* Set flags */
-			pnh6->nh_flags = rte->rt_flags & NHOP_FLAGS_MASK;
-			gw = (struct sockaddr_in6 *)rt_key(rte);
-			if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
-				pnh6->nh_flags |= NHOP_DEFAULT;
+			fib6_rte_to_nh_basic(rte, dst, pnh6);
 			RADIX_NODE_HEAD_RUNLOCK(rnh);
 			return (0);
 		}
@@ -228,8 +497,45 @@ fib6_lookup_nh_basic(uint32_t fibnum, st
 #endif
 
 
+#if 0
+typedef void nhop_change_cb_t(void *state);
 
 
+struct nhop_tracker {
+	TAILQ_ENTRY(nhop_tracker)	next;
+	nhop_change_cb_t	*f;
+	void		*state;
+	uint32_t	fibnum;
+	struct sockaddr_storage	ss;
+};
+
+struct nhop_tracker *
+nhop_alloc_tracked(uint32_t fibnum, struct sockaddr *sa, nhop_change_cb_t *f,
+    void *state)
+{
+	struct nhop_tracker *nt;
+
+	nt = malloc(sizeof(struct nhop_tracker), M_RTFIB, M_WAITOK | M_ZERO);
+
+	nt->f = f;
+	nt-state = state;
+	nt->fibnum = fibnum;
+	memcpy(&nt->ss, sa, sa->sa_len);
+
+	return (nt);
+}
+
+
+int
+nhop_bind(struct nhop_tracker *nt)
+{
+	NHOP_LOCK(nnh);
+
+	NHOP_UNLOCK(nnh);
+
+	return (0);
+}
+#endif
 
 
 

Modified: projects/routing/sys/net/rt_nhops.h
==============================================================================
--- projects/routing/sys/net/rt_nhops.h	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/net/rt_nhops.h	Sun Oct 19 21:07:35 2014	(r273289)
@@ -30,7 +30,6 @@
 #ifndef _NET_RT_NHOPS_H_
 #define	_NET_RT_NHOPS_H_
 
-#define	MAX_PREPEND_LEN		64	/* Max data that can be prepended */
 
 
 #define	NH_TYPE_DIRECT		1	/* Directly reachable, no data */
@@ -40,7 +39,7 @@
 #define	NH_TYPE_MUTATOR		5	/* NH+callback function  */
 #define	NH_TYPE_MULTIPATH	6	/* Multipath route */
 
-struct nhop_info {
+struct nhop_ctl_info {
 	uint64_t	refcnt;		/* Use references */
 	uint64_t	flags;		/* Options */
 
@@ -61,19 +60,49 @@ struct nhop_mutator_info {
 	char		data[];
 };
 
-/* Structure used for forwarding purposes */
+/* Structures used for forwarding purposes */
+#define	MAX_PREPEND_LEN		56	/* Max data that can be prepended */
+
+/* Non-recursive nexthop */
 struct nhop_data {
-	uint8_t		flags;	/* NH flags */
-	uint8_t		count;	/* Number of nexthops or data length */
-	uint16_t	mtu;
+	uint8_t		nh_flags;		/* NH flags */
+	uint8_t		nh_count;		/* Number of nexthops or data length */
+	uint16_t	nh_mtu;		/* given nhop MTU */
 	uint16_t	lifp_idx;	/* Logical interface index */
-	uint16_t	ifp_idx;	/* Transmit interface index */
 	union {
-		struct nhop_mpath_info mp[32];	/* Multipath info */
-		struct nhop_mutator_info mm;	/* mutator info */
-		char	data[MAX_PREPEND_LEN - 8];	/* data to prepend */
+		uint16_t	ifp_idx;	/* Transmit interface index */
+		uint16_t	nhop_idx;	/* L2 multipath nhop index */
+	} i;
+	union {
+		char	data[MAX_PREPEND_LEN];	/* data to prepend */
+#ifdef INET
+		struct in_addr	gw4;		/* IPv4 gw address */
+#endif
+#ifdef INET6
+		struct in6_addr	gw6;		/* IPv4 gw address */
+#endif
 	} d;
 };
+/* Internal flags */
+#define	NH_FLAGS_RECURSE	0x01	/* Nexthop structure is recursive */
+#define	NH_FLAGS_L2_NHOP	0x02	/* L2 interface has to be selected */
+#define	NH_FLAGS_L2_ME		0x04	/* dst L2 address is our address */
+#define	NH_FLAGS_L2_INCOMPLETE 	0x08	/* L2 header not prepended */
+
+#define	NH_LIFP(nh)	ifnet_byindex_locked((nh)->lifp_idx)
+#define	NH_TIFP(nh)	ifnet_byindex_locked((nh)->i.ifp_idx)
+
+/* L2/L3 recursive nexthop */
+struct nhop_multi {
+	uint8_t		nh_flags;	/* NH flags */
+	uint8_t		nh_count;	/* Number of nexthops or data length */
+	uint8_t		spare[2];
+	uint16_t	nh_nhops[30];	/* Nexthop indexes */
+};
+
+/* Control plane nexthop data */
+struct nhop_info {
+};
 
 /* Per-AF per-fib nhop table */
 struct nhops_descr {
@@ -105,6 +134,7 @@ struct nhop6_basic {
 	struct ifnet	*nh_ifp;	/* Logical egress interface */
 	uint16_t	nh_mtu;		/* nexthop mtu */
 	uint16_t	nh_flags;	/* nhop flags */
+	uint8_t		spare[4];
 	struct in6_addr	nh_addr;	/* GW/DST IPv4 address */
 };
 
@@ -115,11 +145,63 @@ struct nhop64_basic {
 	} u;
 };
 
+/* Extended nexthop info used for control protocols */
+struct nhop4_extended {
+	struct ifnet	*nh_ifp;	/* Logical egress interface */
+	uint16_t	nh_mtu;		/* nexthop mtu */
+	uint16_t	nh_flags;	/* nhop flags */
+	uint8_t		spare[4];
+	struct in_addr	nh_addr;	/* GW/DST IPv4 address */
+	struct in_addr	nh_src;		/* default source IPv4 address */
+	uint64_t	spare2[2];
+};
+
+struct nhop6_extended {
+	struct ifnet	*nh_ifp;	/* Logical egress interface */
+	uint16_t	nh_mtu;		/* nexthop mtu */
+	uint16_t	nh_flags;	/* nhop flags */
+	uint8_t		spare[4];
+	struct in6_addr	nh_addr;	/* GW/DST IPv6 address */
+	struct in6_addr	nh_src;		/* default source IPv6 address */
+	uint64_t	spare2[2];
+};
+
+struct nhop64_extended {
+	union {
+		struct nhop4_extended	nh4;
+		struct nhop6_extended	nh6;
+	} u;
+};
+
+struct route_info {
+	struct nhop_data	*ri_nh;		/* Desired nexthop to use */
+	struct nhop64_basic	*ri_nh_info;	/* Get selected route info */
+	uint16_t		ri_mtu;
+	uint16_t		spare[3];
+};
+
+struct route_compat {
+	struct nhop_data	*ro_nh;
+	void			*spare0;
+	void			*spare1;
+	int			ro_flags;
+};
+
 int fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flowid,
     struct nhop4_basic *pnh4);
 int fib6_lookup_nh_basic(uint32_t fibnum, struct in6_addr dst, uint32_t flowid,
     struct nhop6_basic *pnh6);
 
+void fib4_free_nh(uint32_t fibnum, struct nhop_data *nh);
+void fib4_choose_prepend(uint32_t fibnum, struct nhop_data *nh_src,
+    uint32_t flowid, struct nhop_data *nh, struct nhop4_extended *nh_ext);
+int fib4_lookup_prepend(uint32_t fibnum, struct in_addr dst, struct mbuf *m,
+    struct nhop_data *nh, struct nhop4_extended *nh_ext);
+
+void fib6_free_nh(uint32_t fibnum, struct nhop_data *nh);
+void fib6_choose_prepend(uint32_t fibnum, struct nhop_data *nh_src,
+    uint32_t flowid, struct nhop_data *nh, struct nhop6_extended *nh_ext);
+
 #define	NHOP_REJECT	RTF_REJECT
 #define	NHOP_BLACKHOLE	RTF_BLACKHOLE
 #define	NHOP_DEFAULT	0x80	/* Default route */

Modified: projects/routing/sys/netinet/if_ether.c
==============================================================================
--- projects/routing/sys/netinet/if_ether.c	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/netinet/if_ether.c	Sun Oct 19 21:07:35 2014	(r273289)
@@ -283,6 +283,72 @@ arprequest(struct ifnet *ifp, const stru
 }
 
 /*
+ *
+ * Saves lle address for @dst in @dst_addr.
+ * Returns 0 if address was found&valid.
+ */
+int
+arpresolve_fast(struct ifnet *ifp, struct in_addr dst, u_int mflags,
+    u_char *dst_addr)
+{
+	int do_arp, error;
+	struct llentry *la;
+	struct sockaddr_in sin;
+
+	if (mflags & M_BCAST) {
+		memcpy(dst_addr, ifp->if_broadcastaddr, ifp->if_addrlen);
+		return (0);
+	}
+	if (mflags & M_MCAST) {
+		ETHER_MAP_IP_MULTICAST(&dst, dst_addr);
+		return (0);
+	}
+
+	do_arp = 0;
+	error = EAGAIN;
+
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_addr = dst;
+	sin.sin_family = AF_INET;
+	sin.sin_len = sizeof(sin);
+
+	IF_AFDATA_RLOCK(ifp);
+	la = lla_lookup(LLTABLE(ifp), 0, (const struct sockaddr *)&sin);
+
+	/*
+	 * XXX: We need to convert all these checks to single one
+	 */
+	if (la != NULL && (la->la_flags & LLE_VALID) &&
+	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
+		bcopy(&la->ll_addr, dst_addr, ifp->if_addrlen);
+		/*
+		 * If entry has an expiry time and it is approaching,
+		 * see if we need to send an ARP request within this
+		 * arpt_down interval.
+		 */
+		if (!(la->la_flags & LLE_STATIC) &&
+		    time_uptime + la->la_preempt > la->la_expire) {
+			do_arp = 1;
+			la->la_preempt--;
+		}
+		error = 0;
+	}
+	if (la != NULL)
+		LLE_RUNLOCK(la);
+	IF_AFDATA_RUNLOCK(ifp);
+
+	/*
+	 * XXX: For compat reasons only.
+	 * We should delay the job to slowpath queue.
+	 */
+	if (do_arp != 0)
+		arprequest(ifp, NULL, &dst, NULL);
+
+	return (error);
+}
+
+
+/*
  * Resolve an IP address into an ethernet address.
  * On input:
  *    ifp is the interface we use

Modified: projects/routing/sys/netinet/if_ether.h
==============================================================================
--- projects/routing/sys/netinet/if_ether.h	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/netinet/if_ether.h	Sun Oct 19 21:07:35 2014	(r273289)
@@ -117,6 +117,8 @@ struct ifaddr;
 
 int	arpresolve(struct ifnet *ifp, struct rtentry *rt, struct mbuf *m,
 	    const struct sockaddr *dst, u_char *desten, struct llentry **lle);
+int	arpresolve_fast(struct ifnet *ifp, struct in_addr dst, u_int mflags,
+	    u_char *dst_addr);
 void	arprequest(struct ifnet *, const struct in_addr *,
 	    const struct in_addr *, u_char *);
 void	arp_ifinit(struct ifnet *, struct ifaddr *);

Modified: projects/routing/sys/netinet/in_gif.c
==============================================================================
--- projects/routing/sys/netinet/in_gif.c	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/netinet/in_gif.c	Sun Oct 19 21:07:35 2014	(r273289)
@@ -259,7 +259,7 @@ in_gif_output(struct ifnet *ifp, int fam
 	}
 
 	m->m_flags &= ~(M_BCAST|M_MCAST);
-	error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL);
+	error = ip_output(m, NULL, NULL, 0, NULL, NULL);
 
 	if (!(GIF2IFP(sc)->if_flags & IFF_LINK0) &&
 	    sc->gif_ro.ro_rt != NULL) {

Modified: projects/routing/sys/netinet/ip_input.c
==============================================================================
--- projects/routing/sys/netinet/ip_input.c	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/netinet/ip_input.c	Sun Oct 19 21:07:35 2014	(r273289)
@@ -80,6 +80,8 @@ __FBSDID("$FreeBSD$");
 #endif /* IPSEC */
 #include <netinet/in_rss.h>
 
+#include <net/rt_nhops.h>
+
 #include <sys/socketvar.h>
 
 #include <security/mac/mac_framework.h>
@@ -1471,6 +1473,7 @@ ip_forward(struct mbuf *m, int srcrt)
 	struct mbuf *mcopy;
 	struct in_addr dest;
 	struct route ro;
+	struct route_info ri;
 	int error, type = 0, code = 0, mtu = 0;
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
@@ -1591,13 +1594,12 @@ ip_forward(struct mbuf *m, int srcrt)
 	 * Try to cache the route MTU from ip_output so we can consider it for
 	 * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
 	 */
-	bzero(&ro, sizeof(ro));
+	bzero(&ri, sizeof(ri));
 
-	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
+	error = ip_output(m, NULL, &ri, IP_FORWARDING, NULL, NULL);
 
-	if (error == EMSGSIZE && ro.ro_rt)
-		mtu = ro.ro_rt->rt_mtu;
-	RO_RTFREE(&ro);
+	if (error == EMSGSIZE)
+		mtu = ri.ri_mtu;
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);

Modified: projects/routing/sys/netinet/ip_output.c
==============================================================================
--- projects/routing/sys/netinet/ip_output.c	Sun Oct 19 21:03:42 2014	(r273288)
+++ projects/routing/sys/netinet/ip_output.c	Sun Oct 19 21:07:35 2014	(r273289)
@@ -81,6 +81,8 @@ __FBSDID("$FreeBSD$");
 #include <netinet/sctp_crc32.h>
 #endif
 
+#include <net/rt_nhops.h>
+
 #ifdef IPSEC
 #include <netinet/ip_ipsec.h>
 #include <netipsec/ipsec.h>
@@ -98,8 +100,9 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_
 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
 #endif
 
-static void	ip_mloopback
-	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
+static void ip_mloopback (struct ifnet *, struct mbuf *, int);
+static inline int ip_sendmbuf(struct ifnet *ifp, struct mbuf *m,
+    struct nhop_data *nh, struct in_addr dst);
 
 
 extern int in_mcast_loop;
@@ -118,7 +121,7 @@ extern	struct protosw inetsw[];
  * inserted, so must have a NULL opt pointer.
  */
 int
-ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
+ip_output(struct mbuf *m, struct mbuf *opt, struct route_info *ri, int flags,
     struct ip_moptions *imo, struct inpcb *inp)
 {
 	struct ip *ip;
@@ -127,17 +130,16 @@ ip_output(struct mbuf *m, struct mbuf *o
 	int hlen = sizeof (struct ip);
 	int mtu;
 	int error = 0;
-	struct sockaddr_in *dst;
-	const struct sockaddr_in *gw;
+	struct in_addr dst, local_addr;
+	struct sockaddr_in gw_out, *tmp;
 	struct in_ifaddr *ia;
 	int isbroadcast;
 	uint16_t ip_len, ip_off;
-	struct route iproute;
-	struct rtentry *rte;	/* cache for ro->ro_rt */
+	struct nhop_data local_nh, *nh;
+	struct nhop4_extended nhe, *pnhe;
 	struct in_addr odst;
 	struct m_tag *fwd_tag = NULL;
 	uint32_t fibnum;
-	int have_ia_ref;
 	int needfiblookup;
 #ifdef IPSEC
 	int no_route_but_check_spd = 0;
@@ -155,14 +157,11 @@ ip_output(struct mbuf *m, struct mbuf *o
 		}

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201410192107.s9JL7a1s062998>