From owner-svn-src-stable-8@FreeBSD.ORG Fri Apr 2 05:02:50 2010 Return-Path: Delivered-To: svn-src-stable-8@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id D6166106566B; Fri, 2 Apr 2010 05:02:50 +0000 (UTC) (envelope-from qingli@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id C3CDF8FC1A; Fri, 2 Apr 2010 05:02:50 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id o3252oN5054756; Fri, 2 Apr 2010 05:02:50 GMT (envelope-from qingli@svn.freebsd.org) Received: (from qingli@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id o3252otY054748; Fri, 2 Apr 2010 05:02:50 GMT (envelope-from qingli@svn.freebsd.org) Message-Id: <201004020502.o3252otY054748@svn.freebsd.org> From: Qing Li Date: Fri, 2 Apr 2010 05:02:50 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org X-SVN-Group: stable-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r206067 - in stable/8/sys: net netinet X-BeenThere: svn-src-stable-8@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: SVN commit messages for only the 8-stable src tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 02 Apr 2010 05:02:51 -0000 Author: qingli Date: Fri Apr 2 05:02:50 2010 New Revision: 206067 URL: http://svn.freebsd.org/changeset/base/206067 Log: MFC 204902 One of the advantages of enabling ECMP (a.k.a RADIX_MPATH) is to allow for connection load balancing across interfaces. Currently the address alias handling method is colliding with the ECMP code. For example, when two interfaces are configured on the same prefix, only one prefix route is installed. 
So connection load balancing among the available interfaces is not possible. The other advantage of ECMP is for failover. The issue with the current code is that the interface link-state is not reflected in the route entry. For example, if there are two interfaces on the same prefix and the cable on one interface is unplugged, new and existing connections should switch over to the other interface. This is not done today and packets go into a black hole. Also, there is a small bug in the kernel where deleting ECMP routes from userland will always return an error even though the command is successfully executed. Modified: stable/8/sys/net/flowtable.c stable/8/sys/net/radix.c stable/8/sys/net/radix_mpath.c stable/8/sys/net/route.c stable/8/sys/net/route.h stable/8/sys/netinet/in.c stable/8/sys/netinet/ip_output.c Directory Properties: stable/8/sys/ (props changed) stable/8/sys/amd64/include/xen/ (props changed) stable/8/sys/cddl/contrib/opensolaris/ (props changed) stable/8/sys/contrib/dev/acpica/ (props changed) stable/8/sys/contrib/pf/ (props changed) stable/8/sys/dev/xen/xenpci/ (props changed) stable/8/sys/net/ (props changed) Modified: stable/8/sys/net/flowtable.c ============================================================================== --- stable/8/sys/net/flowtable.c Fri Apr 2 04:58:17 2010 (r206066) +++ stable/8/sys/net/flowtable.c Fri Apr 2 05:02:50 2010 (r206067) @@ -870,7 +870,8 @@ flow_stale(struct flowtable *ft, struct || ((fle->f_rt->rt_flags & RTF_HOST) && ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) - || (fle->f_rt->rt_ifp == NULL)) + || (fle->f_rt->rt_ifp == NULL) + || !RT_LINK_IS_UP(fle->f_rt->rt_ifp)) return (1); idle_time = time_uptime - fle->f_uptime; Modified: stable/8/sys/net/radix.c ============================================================================== --- stable/8/sys/net/radix.c Fri Apr 2 04:58:17 2010 (r206066) +++ stable/8/sys/net/radix.c Fri Apr 2 05:02:50 2010 (r206067) @@ -761,8 +761,10 @@ on2: if (m->rm_flags & 
RNF_NORMAL) { mmask = m->rm_leaf->rn_mask; if (tt->rn_flags & RNF_NORMAL) { +#if !defined(RADIX_MPATH) log(LOG_ERR, "Non-unique normal route, mask not entered\n"); +#endif return tt; } } else Modified: stable/8/sys/net/radix_mpath.c ============================================================================== --- stable/8/sys/net/radix_mpath.c Fri Apr 2 04:58:17 2010 (r206066) +++ stable/8/sys/net/radix_mpath.c Fri Apr 2 05:02:50 2010 (r206067) @@ -270,7 +270,8 @@ rtalloc_mpath_fib(struct route *ro, uint * XXX we don't attempt to lookup cached route again; what should * be done for sendto(3) case? */ - if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP)) + if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP) + && RT_LINK_IS_UP(ro->ro_rt->rt_ifp)) return; ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, 0, fibnum); Modified: stable/8/sys/net/route.c ============================================================================== --- stable/8/sys/net/route.c Fri Apr 2 04:58:17 2010 (r206066) +++ stable/8/sys/net/route.c Fri Apr 2 05:02:50 2010 (r206067) @@ -830,7 +830,13 @@ rt_getifa_fib(struct rt_addrinfo *info, int rtexpunge(struct rtentry *rt) { +#if !defined(RADIX_MPATH) struct radix_node *rn; +#else + struct rt_addrinfo info; + int fib; + struct rtentry *rt0; +#endif struct radix_node_head *rnh; struct ifaddr *ifa; int error = 0; @@ -843,14 +849,26 @@ rtexpunge(struct rtentry *rt) if (rnh == NULL) return (EAFNOSUPPORT); RADIX_NODE_HEAD_LOCK_ASSERT(rnh); -#if 0 - /* - * We cannot assume anything about the reference count - * because protocols call us in many situations; often - * before unwinding references to the table entry. 
- */ - KASSERT(rt->rt_refcnt <= 1, ("bogus refcnt %ld", rt->rt_refcnt)); -#endif + +#ifdef RADIX_MPATH + fib = rt->rt_fibnum; + bzero(&info, sizeof(info)); + info.rti_ifp = rt->rt_ifp; + info.rti_flags = RTF_RNH_LOCKED; + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr; + + RT_UNLOCK(rt); + error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib); + + if (error == 0 && rt0 != NULL) { + rt = rt0; + RT_LOCK(rt); + } else if (error != 0) { + RT_LOCK(rt); + return (error); + } +#else /* * Remove the item from the tree; it should be there, * but when callers invoke us blindly it may not (sigh). @@ -864,6 +882,7 @@ rtexpunge(struct rtentry *rt) ("unexpected flags 0x%x", rn->rn_flags)); KASSERT(rt == RNTORT(rn), ("lookup mismatch, rt %p rn %p", rt, rn)); +#endif /* RADIX_MPATH */ rt->rt_flags &= ~RTF_UP; @@ -886,7 +905,9 @@ rtexpunge(struct rtentry *rt) * linked to the routing table. */ V_rttrash++; +#if !defined(RADIX_MPATH) bad: +#endif return (error); } @@ -1044,6 +1065,7 @@ rtrequest1_fib(int req, struct rt_addrin */ if (error != ENOENT) goto bad; + error = 0; } #endif /* Modified: stable/8/sys/net/route.h ============================================================================== --- stable/8/sys/net/route.h Fri Apr 2 04:58:17 2010 (r206066) +++ stable/8/sys/net/route.h Fri Apr 2 05:02:50 2010 (r206067) @@ -319,6 +319,8 @@ struct rt_addrinfo { #ifdef _KERNEL +#define RT_LINK_IS_UP(ifp) ((ifp)->if_link_state == LINK_STATE_UP) + #define RT_LOCK_INIT(_rt) \ mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK) #define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx) Modified: stable/8/sys/netinet/in.c ============================================================================== --- stable/8/sys/netinet/in.c Fri Apr 2 04:58:17 2010 (r206066) +++ stable/8/sys/netinet/in.c Fri Apr 2 05:02:50 2010 (r206067) @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include "opt_carp.h" +#include "opt_mpath.h" #include #include @@ -1040,6 +1041,13 @@ 
in_addprefix(struct in_ifaddr *target, i * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { +#ifdef RADIX_MPATH + if (ia->ia_addr.sin_addr.s_addr == + target->ia_addr.sin_addr.s_addr) + return (EEXIST); + else + break; +#endif if (V_sameprefixcarponly && target->ia_ifp->if_type != IFT_CARP && ia->ia_ifp->if_type != IFT_CARP) { Modified: stable/8/sys/netinet/ip_output.c ============================================================================== --- stable/8/sys/netinet/ip_output.c Fri Apr 2 04:58:17 2010 (r206066) +++ stable/8/sys/netinet/ip_output.c Fri Apr 2 05:02:50 2010 (r206067) @@ -208,6 +208,8 @@ again: */ rte = ro->ro_rt; if (rte && ((rte->rt_flags & RTF_UP) == 0 || + rte->rt_ifp == NULL || + !RT_LINK_IS_UP(rte->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { if (!nortfree) @@ -279,7 +281,9 @@ again: #endif rte = ro->ro_rt; } - if (rte == NULL) { + if (rte == NULL || + rte->rt_ifp == NULL || + !RT_LINK_IS_UP(rte->rt_ifp)) { #ifdef IPSEC /* * There is no route for this packet, but it is