Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 17 Oct 2008 03:59:25 +0000 (UTC)
From:      Kip Macy <kmacy@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r183965 - in user/kmacy/HEAD_ECMP/sys: conf i386/conf net netinet
Message-ID:  <200810170359.m9H3xPks077916@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kmacy
Date: Fri Oct 17 03:59:25 2008
New Revision: 183965
URL: http://svn.freebsd.org/changeset/base/183965

Log:
  Add experimental flow tracking support to provide stateful ECMP

Added:
  user/kmacy/HEAD_ECMP/sys/i386/conf/ECMP_TEST
  user/kmacy/HEAD_ECMP/sys/net/flowtable.c   (contents, props changed)
Modified:
  user/kmacy/HEAD_ECMP/sys/conf/files
  user/kmacy/HEAD_ECMP/sys/net/radix_mpath.c
  user/kmacy/HEAD_ECMP/sys/net/radix_mpath.h
  user/kmacy/HEAD_ECMP/sys/net/route.c
  user/kmacy/HEAD_ECMP/sys/net/route.h
  user/kmacy/HEAD_ECMP/sys/netinet/ip_input.c

Modified: user/kmacy/HEAD_ECMP/sys/conf/files
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/conf/files	Fri Oct 17 03:17:10 2008	(r183964)
+++ user/kmacy/HEAD_ECMP/sys/conf/files	Fri Oct 17 03:59:25 2008	(r183965)
@@ -1824,6 +1824,7 @@ net/if_stf.c			optional stf
 net/if_tun.c			optional tun
 net/if_tap.c			optional tap
 net/if_vlan.c			optional vlan
+net/flowtable.c			optional inet
 net/mppcc.c			optional netgraph_mppc_compression
 net/mppcd.c			optional netgraph_mppc_compression
 net/netisr.c			standard

Added: user/kmacy/HEAD_ECMP/sys/i386/conf/ECMP_TEST
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/kmacy/HEAD_ECMP/sys/i386/conf/ECMP_TEST	Fri Oct 17 03:59:25 2008	(r183965)
@@ -0,0 +1,241 @@
+#
+# ECMP_TEST -- copy of the GENERIC kernel configuration file for FreeBSD/i386
+# with RADIX_MPATH enabled, used for testing ECMP flow tracking
+#
+# For more information on this file, please read the handbook section on
+# Kernel Configuration Files:
+#
+#    http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html
+#
+# The handbook is also available locally in /usr/share/doc/handbook
+# if you've installed the doc distribution, otherwise always see the
+# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the
+# latest information.
+#
+# An exhaustive list of options and more detailed explanations of the
+# device lines is also present in the ../../conf/NOTES and NOTES files.
+# If you are in doubt as to the purpose or necessity of a line, check first
+# in NOTES.
+#
+# $FreeBSD: user/kmacy/HEAD_ECMP/sys/i386/conf/GENERIC 183735 2008-10-09 21:25:01Z n_hibma $
+
+cpu		I486_CPU
+cpu		I586_CPU
+cpu		I686_CPU
+ident		GENERIC
+
+# To statically compile in device wiring instead of /boot/device.hints
+#hints		"GENERIC.hints"		# Default places to look for devices.
+
+makeoptions	DEBUG=-g		# Build kernel with gdb(1) debug symbols
+
+options 	SCHED_ULE		# ULE scheduler
+options 	PREEMPTION		# Enable kernel thread preemption
+options 	INET			# InterNETworking
+options 	INET6			# IPv6 communications protocols
+options 	SCTP			# Stream Control Transmission Protocol
+options 	FFS			# Berkeley Fast Filesystem
+options 	SOFTUPDATES		# Enable FFS soft updates support
+options 	UFS_ACL			# Support for access control lists
+options 	UFS_DIRHASH		# Improve performance on big directories
+options 	UFS_GJOURNAL		# Enable gjournal-based UFS journaling
+options 	MD_ROOT			# MD is a potential root device
+options 	NFSCLIENT		# Network Filesystem Client
+options 	NFSSERVER		# Network Filesystem Server
+options 	NFSLOCKD		# Network Lock Manager
+options 	NFS_ROOT		# NFS usable as /, requires NFSCLIENT
+options 	MSDOSFS			# MSDOS Filesystem
+options 	CD9660			# ISO 9660 Filesystem
+options 	PROCFS			# Process filesystem (requires PSEUDOFS)
+options 	PSEUDOFS		# Pseudo-filesystem framework
+options 	GEOM_PART_GPT		# GUID Partition Tables.
+options 	GEOM_LABEL		# Provides labelization
+options 	COMPAT_43TTY		# BSD 4.3 TTY compat [KEEP THIS!]
+options 	COMPAT_FREEBSD4		# Compatible with FreeBSD4
+options 	COMPAT_FREEBSD5		# Compatible with FreeBSD5
+options 	COMPAT_FREEBSD6		# Compatible with FreeBSD6
+options 	COMPAT_FREEBSD7		# Compatible with FreeBSD7
+options 	SCSI_DELAY=5000		# Delay (in ms) before probing SCSI
+options 	KTRACE			# ktrace(1) support
+options 	STACK			# stack(9) support
+options 	SYSVSHM			# SYSV-style shared memory
+options 	SYSVMSG			# SYSV-style message queues
+options 	SYSVSEM			# SYSV-style semaphores
+options 	_KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions
+options 	KBD_INSTALL_CDEV	# install a CDEV entry in /dev
+options 	STOP_NMI		# Stop CPUS using NMI instead of IPI
+options  	HWPMC_HOOKS		# Necessary kernel hooks for hwpmc(4)
+options 	AUDIT			# Security event auditing
+
+# Debugging for use in -current
+options 	KDB			# Enable kernel debugger support.
+options 	DDB			# Support DDB.
+options 	GDB			# Support remote GDB.
+options 	INVARIANTS		# Enable calls of extra sanity checking
+options 	INVARIANT_SUPPORT	# Extra sanity checks of internal structures, required by INVARIANTS
+options 	WITNESS			# Enable checks to detect deadlocks and cycles
+options 	WITNESS_SKIPSPIN	# Don't run witness on spinlocks for speed
+
+# To make an SMP kernel, the next two lines are needed
+options 	SMP			# Symmetric MultiProcessor Kernel
+device		apic			# I/O APIC
+
+# CPU frequency control
+device		cpufreq
+
+# Bus support.
+device		acpi
+device		eisa
+device		pci
+
+# Floppy drives
+device		fdc
+
+# ATA and ATAPI devices
+device		ata
+device		atadisk		# ATA disk drives
+device		ataraid		# ATA RAID drives
+device		atapicd		# ATAPI CDROM drives
+device		atapifd		# ATAPI floppy drives
+device		atapist		# ATAPI tape drives
+options 	ATA_STATIC_ID	# Static device numbering
+
+# SCSI Controllers
+device		ahb		# EISA AHA1742 family
+device		ahc		# AHA2940 and onboard AIC7xxx devices
+options 	AHC_REG_PRETTY_PRINT	# Print register bitfields in debug
+					# output.  Adds ~128k to driver.
+device		ahd		# AHA39320/29320 and onboard AIC79xx devices
+options 	AHD_REG_PRETTY_PRINT	# Print register bitfields in debug
+					# output.  Adds ~215k to driver.
+device		amd		# AMD 53C974 (Tekram DC-390(T))
+device		hptiop		# Highpoint RocketRaid 3xxx series
+device		isp		# Qlogic family
+#device 	ispfw		# Firmware for QLogic HBAs- normally a module
+device		mpt		# LSI-Logic MPT-Fusion
+#device		ncr		# NCR/Symbios Logic
+device		sym		# NCR/Symbios Logic (newer chipsets + those of `ncr')
+
+# SCSI peripherals
+device		scbus		# SCSI bus (required for SCSI)
+device		ch		# SCSI media changers
+device		da		# Direct Access (disks)
+device		sa		# Sequential Access (tape etc)
+device		cd		# CD
+device		pass		# Passthrough device (direct SCSI access)
+device		ses		# SCSI Environmental Services (and SAF-TE)
+
+# atkbdc0 controls both the keyboard and the PS/2 mouse
+device		atkbdc		# AT keyboard controller
+device		atkbd		# AT keyboard
+device		psm		# PS/2 mouse
+
+device		kbdmux		# keyboard multiplexer
+
+device		vga		# VGA video card driver
+
+device		splash		# Splash screen and screen saver support
+
+# syscons is the default console driver, resembling an SCO console
+device		sc
+
+device		agp		# support several AGP chipsets
+
+# Power management support (see NOTES for more options)
+#device		apm
+# Add suspend/resume support for the i8254.
+device		pmtimer
+
+# PCCARD (PCMCIA) support
+# PCMCIA and cardbus bridge support
+device		cbb		# cardbus (yenta) bridge
+device		pccard		# PC Card (16-bit) bus
+device		cardbus		# CardBus (32-bit) bus
+
+# Serial (COM) ports
+device		uart		# Generic UART driver
+
+# If you've got a "dumb" serial or parallel PCI card that is
+# supported by the puc(4) glue driver, uncomment the following
+# line to enable it (connects to sio, uart and/or ppc drivers):
+#device		puc
+
+# PCI Ethernet NICs.
+device		em		# Intel PRO/1000 Gigabit Ethernet Family
+device		igb		# Intel PRO/1000 PCIE Server Gigabit Family
+device		ixgb		# Intel PRO/10GbE Ethernet Card
+device		le		# AMD Am7900 LANCE and Am79C9xx PCnet
+
+
+# PCI Ethernet NICs that use the common MII bus controller code.
+# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs!
+device		miibus		# MII bus support
+device		ae		# Attansic/Atheros L2 FastEthernet
+device		age		# Attansic/Atheros L1 Gigabit Ethernet
+device		bce		# Broadcom BCM5706/BCM5708 Gigabit Ethernet
+device		bfe		# Broadcom BCM440x 10/100 Ethernet
+device		bge		# Broadcom BCM570xx Gigabit Ethernet
+device		et		# Agere ET1310 10/100/Gigabit Ethernet
+device		fxp		# Intel EtherExpress PRO/100B (82557, 82558)
+device		jme		# JMicron JMC250 Gigabit/JMC260 Fast Ethernet
+device		lge		# Level 1 LXT1001 gigabit Ethernet
+device		msk		# Marvell/SysKonnect Yukon II Gigabit Ethernet
+
+# Pseudo devices.
+device		loop		# Network loopback
+device		random		# Entropy device
+device		ether		# Ethernet support
+device		tun		# Packet tunnel.
+device		pty		# BSD-style compatibility pseudo ttys
+device		md		# Memory "disks"
+device		gif		# IPv6 and IPv4 tunneling
+device		faith		# IPv6-to-IPv4 relaying (translation)
+device		firmware	# firmware assist module
+
+# The `bpf' device enables the Berkeley Packet Filter.
+# Be aware of the administrative consequences of enabling this!
+# Note that 'bpf' is required for DHCP.
+device		bpf		# Berkeley packet filter
+
+# USB support
+device		uhci		# UHCI PCI->USB interface
+device		ohci		# OHCI PCI->USB interface
+device		ehci		# EHCI PCI->USB interface (USB 2.0)
+device		usb		# USB Bus (required)
+#device		udbp		# USB Double Bulk Pipe devices
+device		ugen		# Generic
+device		uhid		# "Human Interface Devices"
+device		ukbd		# Keyboard
+device		ulpt		# Printer
+device		umass		# Disks/Mass storage - Requires scbus and da
+device		ums		# Mouse
+device		urio		# Diamond Rio 500 MP3 player
+device		uscanner	# Scanners
+# USB Serial devices
+device		ucom		# Generic com ttys
+device		u3g		# USB-based 3G modems (Option, Huawei, Sierra)
+device		uark		# Technologies ARK3116 based serial adapters
+device		ubsa		# Belkin F5U103 and compatible serial adapters
+device		uftdi		# For FTDI usb serial adapters
+device		uipaq		# Some WinCE based devices
+device		uplcom		# Prolific PL-2303 serial adapters
+device		uslcom		# SI Labs CP2101/CP2102 serial adapters
+device		uvisor		# Visor and Palm devices
+device		uvscom		# USB serial support for DDI pocket's PHS
+# USB Ethernet, requires miibus
+device		aue		# ADMtek USB Ethernet
+device		axe		# ASIX Electronics USB Ethernet
+device		cdce		# Generic USB over Ethernet
+device		cue		# CATC USB Ethernet
+device		kue		# Kawasaki LSI USB Ethernet
+device		rue		# RealTek RTL8150 USB Ethernet
+device		udav		# Davicom DM9601E USB
+
+# FireWire support
+device		firewire	# FireWire bus code
+device		sbp		# SCSI over FireWire (Requires scbus and da)
+device		fwe		# Ethernet over FireWire (non-standard!)
+device		fwip		# IP over FireWire (RFC 2734,3146)
+device		dcons		# Dumb console driver
+device		dcons_crom	# Configuration ROM for dcons
+
+options		RADIX_MPATH

Added: user/kmacy/HEAD_ECMP/sys/net/flowtable.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/kmacy/HEAD_ECMP/sys/net/flowtable.c	Fri Oct 17 03:59:25 2008	(r183965)
@@ -0,0 +1,604 @@
+#include "opt_mpath.h"
+
+#include <sys/param.h>  
+#include <sys/types.h>
+#include <sys/limits.h>
+#include <sys/kernel.h>  
+#include <sys/bitstring.h>
+#include <sys/vimage.h>
+
+
+#include <sys/callout.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+
+#include <net/route.h> 
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet/sctp.h>
+
+/*
+ * Kernel stand-in for calloc(3): a zero-filled malloc() from M_DEVBUF using
+ * M_WAITOK.  Nothing in this file calls it by name; presumably it exists so
+ * that bit_alloc() from <sys/bitstring.h> can expand to calloc() -- TODO
+ * confirm against that header.
+ */
+#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
+	
+
+/*
+ * Taken from http://burtleburtle.net/bob/c/lookup3.c
+ */
+
+/* Rotate the 32-bit value x left by k bits; k must be in the range 1..31. */
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+/*
+-------------------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+
+This is reversible, so any information in (a,b,c) before mix() is
+still in (a,b,c) after mix().
+
+If four pairs of (a,b,c) inputs are run through mix(), or through
+mix() in reverse, there are at least 32 bits of the output that
+are sometimes the same for one pair and different for another pair.
+This was tested for:
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or 
+  all zero plus a counter that starts at zero.
+
+Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+satisfy this are
+    4  6  8 16 19  4
+    9 15  3 18 27 15
+   14  9  3  7 17  3
+Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
+for "differ" defined as + with a one-bit base and a two-bit delta.  I
+used http://burtleburtle.net/bob/hash/avalanche.html to choose 
+the operations, constants, and arrangements of the variables.
+
+This does not achieve avalanche.  There are input bits of (a,b,c)
+that fail to affect some output bits of (a,b,c), especially of a.  The
+most thoroughly mixed value is c, but it doesn't really even achieve
+avalanche in c.
+
+This allows some parallelism.  Read-after-writes are good at doubling
+the number of bits affected, so the goal of mixing pulls in the opposite
+direction as the goal of parallelism.  I did what I could.  Rotates
+seem to cost as much as shifts on every machine I could lay my hands
+on, and rotates are much kinder to the top and bottom bits, so I used
+rotates.
+-------------------------------------------------------------------------------
+*/
+/*
+ * Wrapped in do { } while (0) so that "mix(a,b,c);" expands to exactly one
+ * statement and stays safe inside an unbraced if/else.  a, b and c must be
+ * modifiable 32-bit lvalues; each one is evaluated several times.
+ */
+#define mix(a,b,c) \
+do { \
+  a -= c;  a ^= rot(c, 4);  c += b; \
+  b -= a;  b ^= rot(a, 6);  a += c; \
+  c -= b;  c ^= rot(b, 8);  b += a; \
+  a -= c;  a ^= rot(c,16);  c += b; \
+  b -= a;  b ^= rot(a,19);  a += c; \
+  c -= b;  c ^= rot(b, 4);  b += a; \
+} while (0)
+
+/*
+-------------------------------------------------------------------------------
+final -- final mixing of 3 32-bit values (a,b,c) into c
+
+Pairs of (a,b,c) values differing in only a few bits will usually
+produce values of c that look totally different.  This was tested for
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or 
+  all zero plus a counter that starts at zero.
+
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+  4  8 15 26 3 22 24
+ 10  8 15 26 3 22 24
+ 11  8 15 26 3 22 24
+-------------------------------------------------------------------------------
+*/
+/*
+ * Wrapped in do { } while (0) so that "final(a,b,c);" expands to exactly one
+ * statement.  a, b and c must be modifiable 32-bit lvalues; the result of
+ * interest is left in c.
+ */
+#define final(a,b,c) \
+do { \
+  c ^= b; c -= rot(b,14); \
+  a ^= c; a -= rot(c,11); \
+  b ^= a; b -= rot(a,25); \
+  c ^= b; c -= rot(b,16); \
+  a ^= c; a -= rot(c,4);  \
+  b ^= a; b -= rot(a,14); \
+  c ^= b; c -= rot(b,24); \
+} while (0)
+
+/*
+--------------------------------------------------------------------
+ This works on all machines.  To be useful, it requires
+ -- that the key be an array of uint32_t's, and
+ -- that the length be the number of uint32_t's in the key
+
+ The function hashword() is identical to hashlittle() on little-endian
+ machines, and identical to hashbig() on big-endian machines,
+ except that the length has to be measured in uint32_ts rather than in
+ bytes.  hashlittle() is more complicated than hashword() only because
+ hashlittle() has to dance around fitting the key bytes into registers.
+--------------------------------------------------------------------
+*/
+/*
+ * Hash an array of 32-bit words.
+ *
+ * k       - the key, an array of uint32_t values
+ * length  - number of uint32_t words in k
+ * initval - previous hash value, or any arbitrary seed
+ *
+ * Returns the 32-bit hash.  A zero-length key returns the initial state
+ * without any mixing.
+ */
+static uint32_t
+hashword(const uint32_t *k, size_t length, uint32_t initval)
+{
+	uint32_t a, b, c;
+
+	/* Seed all three lanes identically from the length and initval. */
+	a = b = c = 0xdeadbeef + (((uint32_t)length) << 2) + initval;
+
+	/* Consume the key three words at a time, leaving 1..3 for the tail. */
+	for (; length > 3; length -= 3, k += 3) {
+		a += k[0];
+		b += k[1];
+		c += k[2];
+		mix(a, b, c);
+	}
+
+	/* Nothing left to add: report the state as it stands. */
+	if (length == 0)
+		return (c);
+
+	/* Fold in the remaining 1..3 words, highest index first. */
+	if (length == 3)
+		c += k[2];
+	if (length >= 2)
+		b += k[1];
+	a += k[0];
+	final(a, b, c);
+
+	return (c);
+}
+
+
+/*
+ * IPv4 flow identifier: two addresses followed by two 16-bit ports, 12 bytes
+ * in total.  It is overlaid on the three 32-bit hash key words by
+ * union ip_flow, so the field order and sizes must not change.
+ */
+struct ip_tuple {
+	in_addr_t 	ip_saddr;	/* source address */
+	in_addr_t 	ip_daddr;	/* destination address */
+	uint16_t 	ip_sport;	/* source port */
+	uint16_t 	ip_dport;	/* destination port */
+};
+
+/*
+ * A flow key viewed either as a structured tuple or as the three 32-bit
+ * words that are fed to hashword() and compared in ipv4_flow_alloc().
+ */
+union ip_flow {
+	struct ip_tuple ipf_ipt;	/* structured view */
+	uint32_t 	ipf_key[3];	/* raw view: src, dst, ports */
+};
+
+/*
+ * One slot of the IPv4 flow table.  A slot whose fl_fhash is 0 is treated as
+ * free by ipv4_flow_alloc().  Slots that share an rtentry are chained through
+ * fl_hash_next/fl_hash_prev, which hold flow hashes (not pointers); 0 ends
+ * the chain.
+ */
+struct flentry_v4 {
+	uint32_t	fl_fhash;	/* hash flowing forward */
+	uint32_t	fl_ticks;	/* last time this flow was accessed */
+	uint16_t	fl_flags;	/* flow flags */
+	uint8_t		fl_pad;		/* not referenced in this file */
+	uint8_t		fl_proto;	/* protocol */
+	union ip_flow	fl_flow;	/* key: addresses and ports */
+	struct rtentry *fl_rt;		/* rtentry for flow */
+	uint32_t	fl_refcnt;	/* raised on a lookup hit, dropped by ipv4_flow_free() */
+	uint32_t	fl_hash_next;	/* needed for GC */
+	uint32_t	fl_hash_prev;	/* hash of the previous flow on the same rtentry */
+};
+
+/* Time spans expressed in ticks, derived from hz. */
+#define	TICKS_PER_MINUTE	(60*hz)
+#define	TICKS_PER_HOUR		(60*TICKS_PER_MINUTE)
+#define	TICKS_PER_DAY		(24*TICKS_PER_HOUR)
+
+
+/*
+ * Idle limits, in ticks, that ipv4_flow_check_stale() compares against the
+ * time since a flow was last touched.  Which limit applies depends on the
+ * TCP flags accumulated for the flow (no SYN/ACK/FIN at all selects UDP_IDLE).
+ */
+#define SYN_IDLE		(5*TICKS_PER_MINUTE)
+#define UDP_IDLE		(5*TICKS_PER_MINUTE)
+#define FIN_WAIT_IDLE		(10*TICKS_PER_MINUTE)
+#define TCP_IDLE		TICKS_PER_DAY
+
+
+/* Flow table state; the table, bitmap and locks are set up by flowtable_init(). */
+static struct flentry_v4 *ipv4_flow_table;	/* array of ipv4_flow_table_size slots */
+static int ipv4_flow_table_size;		/* number of slots */
+static bitstr_t *ipv4_flow_bitstring;		/* one bit per slot, set while in use */
+static int ipv4_flow_allocated;			/* count of slots currently in use */
+struct mtx *ipv4_flow_locks;			/* lock stripes, selected by hash */
+static int ipv4_flow_lock_count;		/* number of stripes (used as a mask, so a power of 2) */
+extern uint32_t hashjitter;			/* hash seed, defined outside this file */
+static uint32_t ipv4_flow_route_lookup_fail;	/* route lookups that returned nothing */
+static uint32_t	ipv4_flow_collisions;		/* lookups that hit a slot owned by another flow */
+struct callout ipv4_flow_callout;		/* re-armed by ipv4_flow_timeout() */
+/*
+ * Requested table size, read by flowtable_init().
+ * NOTE(review): nothing in this file ever assigns it, so it is 0 at init time.
+ */
+static int ipv4_flow_max_count;
+
+
+/*
+ * Slot and lock helpers, all keyed by a flow hash.
+ *
+ * FL_ENTRY_INDEX  - slot index for hash (hash modulo the table size)
+ * FL_ENTRY        - pointer to that slot
+ * FL_ENTRY_LOCK   - take the lock stripe covering hash
+ * FL_ENTRY_UNLOCK - release that stripe (this used to call mtx_lock(), so
+ *                   every "unlock" re-acquired the mutex instead)
+ *
+ * The stripe is chosen by masking, so ipv4_flow_lock_count must be a power
+ * of two.
+ */
+#define FL_ENTRY_INDEX(hash) ((hash) % ipv4_flow_table_size)
+#define FL_ENTRY(hash) (&ipv4_flow_table[FL_ENTRY_INDEX((hash))])
+#define FL_ENTRY_LOCK(hash) mtx_lock(&ipv4_flow_locks[(hash)&(ipv4_flow_lock_count - 1)])
+#define FL_ENTRY_UNLOCK(hash) mtx_unlock(&ipv4_flow_locks[(hash)&(ipv4_flow_lock_count - 1)])
+
+/*
+ * Flow should be reclaimed.  Stored in fl_flags next to the TCP header
+ * flags, which is why it sits above the low eight bits.
+ */
+#define FL_STALE (1<<8)
+
+/*
+ * Build the flow key for the IPv4 packet in m and hash it.
+ *
+ * m      - packet; the IP header must be contiguous at the start of the mbuf
+ * ro     - ro_dst is filled in with the packet's destination address
+ * key    - three words of output: source address, destination address, and
+ *          the two 16-bit ports packed into the third word
+ * flags  - output: TCP header flags (plus FL_STALE if RST is set) for TCP,
+ *          0 for everything else
+ * protop - output: IP protocol number
+ *
+ * Ports are taken from TCP, UDP and SCTP headers only.  They are left at
+ * zero for any other protocol, and also when the transport header does not
+ * lie within the first mbuf, so that nothing is read past m_len.
+ *
+ * NOTE(review): the IP fragment offset is not examined; a non-first
+ * fragment would have its payload bytes read as ports.
+ *
+ * Returns the hash of key, seeded with hashjitter + protocol.
+ */
+static uint32_t
+ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
+    uint32_t *key, uint16_t *flags, uint8_t *protop)
+{
+	uint16_t sport = 0, dport = 0;
+	struct ip *ip = mtod(m, struct ip *);
+	uint8_t proto = ip->ip_p;
+	int iphlen = ip->ip_hl << 2;
+	struct sockaddr_in *sin;
+	struct tcphdr *th;
+	struct udphdr *uh;
+	struct sctphdr *sh;
+
+	/* Always hand back defined flags, not only in the TCP case. */
+	*flags = 0;
+
+	key[0] = ip->ip_src.s_addr;
+	key[1] = ip->ip_dst.s_addr;
+
+	sin = (struct sockaddr_in *)&ro->ro_dst;
+	sin->sin_family = AF_INET;
+	sin->sin_len = sizeof(*sin);
+	sin->sin_addr = ip->ip_dst;
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		if (m->m_len < iphlen + (int)sizeof(*th))
+			break;
+		th = (struct tcphdr *)((caddr_t)ip + iphlen);
+		sport = th->th_sport;
+		dport = th->th_dport;
+		*flags = th->th_flags;
+		/* A reset ends the connection: let the flow be reclaimed. */
+		if (*flags & TH_RST)
+			*flags |= FL_STALE;
+		break;
+	case IPPROTO_UDP:
+		if (m->m_len < iphlen + (int)sizeof(*uh))
+			break;
+		uh = (struct udphdr *)((caddr_t)ip + iphlen);
+		sport = uh->uh_sport;
+		dport = uh->uh_dport;
+		break;
+	case IPPROTO_SCTP:
+		if (m->m_len < iphlen + (int)sizeof(*sh))
+			break;
+		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
+		sport = sh->src_port;
+		dport = sh->dest_port;
+		break;
+	default:
+		/* no port - hence not a protocol we care about */
+		break;
+	}
+	/* Pack both ports into the third key word, as in struct ip_tuple. */
+	((uint16_t *)key)[4] = sport;
+	((uint16_t *)key)[5] = dport;
+
+	*protop = proto;
+	return (hashword(key, 3, hashjitter + proto));
+}
+
+/*
+ * Compute the flow hash for the packet in m without touching the flow
+ * table.  The route, key, flags and protocol produced along the way are
+ * scratch values and are discarded.
+ */
+uint32_t
+ipv4_flow_lookup_hash(struct mbuf *m)
+{
+	struct route scratch_ro;
+	uint32_t flow_key[3];
+	uint32_t flow_hash;
+	uint16_t tcp_flags;
+	uint8_t ip_proto;
+
+	bzero(&scratch_ro, sizeof(scratch_ro));
+	flow_hash = ipv4_flow_lookup_hash_internal(m, &scratch_ro, flow_key,
+	    &tcp_flags, &ip_proto);
+
+	return (flow_hash);
+}
+
+/*
+ * Fill the slot selected by hash with a new flow and push it onto the head
+ * of rt's flow list.
+ *
+ * hash  - flow hash; selects the slot and is the value stored in the list
+ * key   - three key words, copied into the slot
+ * proto - IP protocol number
+ * rt    - route the flow is bound to; its rt_refcnt is raised by one
+ * flags - flags seen on the first packet; these are now recorded in the slot
+ *         (the argument used to be ignored, so the flags of the packet that
+ *         created the flow were lost)
+ *
+ * NOTE(review): 0 doubles as "free slot" and "end of list", so a flow whose
+ * hash is exactly 0 cannot be represented.
+ */
+static void
+ipv4_flow_insert(uint32_t hash, uint32_t *key, uint8_t proto,
+    struct rtentry *rt, uint16_t flags)
+{
+	struct flentry_v4 *fle, *fle2;
+	uint32_t *hashkey;
+
+	fle = FL_ENTRY(hash);
+	hashkey = fle->fl_flow.ipf_key;
+
+	hashkey[0] = key[0];
+	hashkey[1] = key[1];
+	hashkey[2] = key[2];
+
+	bit_set(ipv4_flow_bitstring, FL_ENTRY_INDEX(hash));
+
+	/* The new flow becomes the list head, so it never has a predecessor. */
+	fle->fl_hash_prev = 0;
+	if (rt->rt_flow_head == 0) {
+		fle->fl_hash_next = 0;
+	} else {
+		fle->fl_hash_next = rt->rt_flow_head;
+		fle2 = FL_ENTRY(rt->rt_flow_head);
+		fle2->fl_hash_prev = hash;
+	}
+	rt->rt_flow_head = hash;
+
+	fle->fl_proto = proto;
+	fle->fl_flags = flags;
+	fle->fl_rt = rt;
+	fle->fl_fhash = hash;
+	fle->fl_ticks = ticks;
+	rt->rt_refcnt++;
+	ipv4_flow_allocated++;
+}
+
+/*
+ * Find or create the flow-table entry for the IPv4 packet in m.
+ *
+ * m  - packet being forwarded
+ * ro - ro_dst is always filled in; ro_rt is set when the flow already has a
+ *      route cached or when a fresh multipath lookup succeeds
+ *
+ * Packets whose two ports are both zero are not tracked.  On a hit the
+ * entry's timestamp, flags and reference count are updated.  If the slot is
+ * held by a different flow the collision is only counted and ro_rt is left
+ * untouched.
+ *
+ * Returns the flow hash in every case.
+ *
+ * NOTE(review): the slot lock is dropped around the route lookup and the
+ * slot is not re-checked afterwards, so two CPUs can race to fill the same
+ * slot.  Also, fl_refcnt is raised on a hit but not on the insert path,
+ * while ipv4_flow_free() asserts that it is positive.
+ */
+uint32_t
+ipv4_flow_alloc(struct mbuf *m, struct route *ro)
+{
+	uint32_t key[3], hash, *hashkey;
+	struct flentry_v4 *fle;
+	uint16_t flags = 0;
+	uint8_t proto;
+
+	/*
+	 * Only handle IPv4 for now
+	 */
+	hash = ipv4_flow_lookup_hash_internal(m, ro, key, &flags, &proto);
+
+	/*
+	 * Ports are zero - thus not a protocol for which
+	 * we need to keep state.  The ports live in key[2]; key[] has only
+	 * three elements, so the former test of key[3] read past the array.
+	 */
+	if (key[2] == 0)
+		return (hash);
+
+	FL_ENTRY_LOCK(hash);
+	fle = FL_ENTRY(hash);
+
+	hashkey = fle->fl_flow.ipf_key;
+
+	if (fle->fl_fhash == 0) {
+		/* Free slot: look the route up without holding the slot lock. */
+		FL_ENTRY_UNLOCK(hash);
+		rtalloc_mpath_fib(ro, hash, M_GETFIB(m));
+		if (ro->ro_rt == NULL) {
+			/*
+			 * The slot lock is not held here; return directly
+			 * instead of falling through to the unlock below.
+			 */
+			ipv4_flow_route_lookup_fail++;
+			return (hash);
+		}
+		FL_ENTRY_LOCK(hash);
+		ipv4_flow_insert(hash, key, proto, ro->ro_rt, flags);
+		RT_UNLOCK(ro->ro_rt);
+	} else if (fle->fl_fhash == hash
+	    && key[0] == hashkey[0]
+	    && key[1] == hashkey[1]
+	    && key[2] == hashkey[2]
+	    && proto == fle->fl_proto) {
+		/* Existing flow: refresh it and hand out the cached route. */
+		fle->fl_ticks = ticks;
+		fle->fl_flags |= flags;
+		fle->fl_refcnt++;
+		ro->ro_rt = fle->fl_rt;
+	} else
+		ipv4_flow_collisions++;
+
+	FL_ENTRY_UNLOCK(hash);
+
+	return (hash);
+}
+
+/*
+ * Internal helper routine: unlink one flow from its rtentry's flow list and
+ * release its slot if nobody references it any more.
+ *
+ * hash      - the hash of the entry to free
+ * staleonly - when non-zero, leave the entry alone unless it is marked
+ *             FL_STALE
+ *
+ * If the entry is still referenced it is unlinked but kept; unless staleonly
+ * is set it is also marked FL_STALE so that the last ipv4_flow_free() will
+ * release it.
+ *
+ * Every caller in this file holds the lock of the rtentry that owns the
+ * list, which is what makes updating rt_flow_head here safe.
+ *
+ * Returns the hash of the next flow on the list, or 0 at the end of the
+ * list or when the slot no longer holds the flow named by hash.
+ */
+
+static uint32_t
+ipv4_flow_free_internal(uint32_t hash, int staleonly)
+{
+	struct flentry_v4 *fle, *fleprev, *flenext;
+	uint32_t hash_next;
+
+	fle = FL_ENTRY(hash);
+
+	/*
+	 * An empty or re-used slot has nothing to unlink for this hash, and
+	 * an empty slot has a NULL fl_rt that must not be dereferenced.
+	 */
+	if (fle->fl_fhash != hash)
+		return (0);
+
+	hash_next = fle->fl_hash_next;
+
+	if (staleonly && ((fle->fl_flags & FL_STALE) == 0))
+		return (hash_next);
+
+	if (fle->fl_hash_next) {
+		flenext = FL_ENTRY(fle->fl_hash_next);
+		flenext->fl_hash_prev = fle->fl_hash_prev;
+	}
+	if (fle->fl_hash_prev) {
+		fleprev = FL_ENTRY(fle->fl_hash_prev);
+		fleprev->fl_hash_next = fle->fl_hash_next;
+	} else if (fle->fl_rt->rt_flow_head == hash) {
+		/*
+		 * Removing the list head: advance the head as well, otherwise
+		 * the rtentry keeps pointing at a slot that is about to be
+		 * cleared or re-used.
+		 */
+		fle->fl_rt->rt_flow_head = fle->fl_hash_next;
+	}
+	fle->fl_hash_next = fle->fl_hash_prev = 0;
+
+	if (fle->fl_refcnt == 0) {
+		fle->fl_rt->rt_refcnt--;
+		ipv4_flow_allocated--;
+		bit_clear(ipv4_flow_bitstring, FL_ENTRY_INDEX(hash));
+		bzero(fle, sizeof(struct flentry_v4));
+	} else if (!staleonly)
+		fle->fl_flags |= FL_STALE;
+
+	return (hash_next);
+}
+
+/*
+ * Drops the reference taken on a flow by ipv4_flow_alloc() and, if the flow
+ * has been marked stale in the meantime and this was the last reference,
+ * unlinks and releases it.
+ *
+ * hash - the flow hash returned by ipv4_flow_alloc()
+ *
+ * Lock order used here: the rtentry lock (only on the stale path) is taken
+ * before the flow slot lock.
+ *
+ * NOTE(review): fl_refcnt and fl_flags are read to decide "stale" before the
+ * slot lock is taken, so the decision can be out of date by the time the
+ * count is decremented.
+ */
+void
+ipv4_flow_free(uint32_t hash)
+{
+	struct flentry_v4 *fle;
+	struct rtentry *rt;
+	int stale;
+
+	fle = FL_ENTRY(hash);
+	KASSERT(fle->fl_refcnt > 0,
+	    ("route referenced with flow refcount set to zero"));
+
+	/* Release the entry only if it is stale and we hold the last reference. */
+	stale = ((fle->fl_flags & FL_STALE) &&
+	    (fle->fl_refcnt == 1));
+
+	rt = fle->fl_rt;
+	if (stale)
+		RT_LOCK(rt);
+	
+	FL_ENTRY_LOCK(hash);
+	fle->fl_refcnt--;
+
+	if (stale) {
+		/* staleonly == 0: unlink unconditionally; refcnt is now 0, so the slot is cleared */
+ 		ipv4_flow_free_internal(hash, 0);
+		RTFREE_LOCKED(rt);
+	} 
+	FL_ENTRY_UNLOCK(hash);
+}
+
+/*
+ * Release every flow chained to rt, whether or not it is marked stale.
+ * The caller must hold the rtentry lock (asserted below).
+ */
+void
+ipv4_flow_free_all(struct rtentry *rt)
+{
+	uint32_t cur;
+
+	cur = rt->rt_flow_head;
+	RT_LOCK_ASSERT(rt);
+
+	/* Each call unlinks one flow and reports the hash of its successor. */
+	for (; cur != 0; cur = ipv4_flow_free_internal(cur, 0))
+		continue;
+}
+
+/*
+ * Tree-walk callback: release those flows chained to this rtentry that have
+ * been marked stale.  Takes the rtentry lock for the duration of the walk
+ * over its flow list.  The second argument is ignored.  Always returns 0.
+ */
+static int
+ipv4_flow_free_stale(struct radix_node *rn, void *unused)
+{
+	struct rtentry *rt = (struct rtentry *)rn;
+	uint32_t cur;
+
+	/* Only routes that have flows attached need any work. */
+	if (rt->rt_flow_head != 0) {
+		RT_LOCK(rt);
+		for (cur = rt->rt_flow_head; cur != 0; )
+			cur = ipv4_flow_free_internal(cur, 1);
+		RT_UNLOCK(rt);
+	}
+
+	return (0);
+}
+
+/* Capacity of the list of routing tables collected during one timer pass. */
+#define IPV4_FLOW_RNH_MAX	100
+struct radix_node_head *ipv4_flow_rnh_list[IPV4_FLOW_RNH_MAX];
+
+/*
+ * Decide whether a flow has been idle for too long and, if so, mark it
+ * FL_STALE and remember the routing table its route lives in.
+ *
+ * fle       - flow entry to examine
+ * rnh_list  - array of IPV4_FLOW_RNH_MAX slots collecting the distinct
+ *             routing table heads that own at least one stale flow
+ * rnh_count - in/out: number of slots of rnh_list in use
+ *
+ * Idle limits: UDP_IDLE when no SYN/ACK/FIN was ever seen, FIN_WAIT_IDLE
+ * once a FIN was seen, SYN_IDLE for SYN without ACK, TCP_IDLE once an ACK
+ * was seen.  The last rule used to require SYN and ACK together, which left
+ * flows first seen in mid-connection (ACK only) with no limit at all.
+ *
+ * If rnh_list is full the flow is still marked stale; its table is simply
+ * not queued on this pass.
+ */
+static void
+ipv4_flow_check_stale(struct flentry_v4 *fle,
+    struct radix_node_head **rnh_list, int *rnh_count)
+{
+	int count = *rnh_count;
+	uint32_t idle_ticks;
+	struct radix_node_head *rnh;
+	struct rtentry *rt;
+	int i, stale = 0, found = 0;
+
+	/* An empty slot has no route to look at. */
+	rt = fle->fl_rt;
+	if (rt == NULL)
+		return;
+
+	/* Unsigned subtraction gives the right answer across a tick wrap. */
+	idle_ticks = (uint32_t)ticks - fle->fl_ticks;
+
+	if ((fle->fl_flags & FL_STALE) ||
+	    ((fle->fl_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
+		&& (idle_ticks > UDP_IDLE)) ||
+	    ((fle->fl_flags & TH_FIN)
+		&& (idle_ticks > FIN_WAIT_IDLE)) ||
+	    ((fle->fl_flags & (TH_SYN|TH_ACK)) == TH_SYN
+		&& (idle_ticks > SYN_IDLE)) ||
+	    ((fle->fl_flags & TH_ACK)
+		&& (idle_ticks > TCP_IDLE)))
+		stale = 1;
+
+	if (stale == 0)
+		return;
+
+	fle->fl_flags |= FL_STALE;
+	rnh = V_rt_tables[rt->rt_fibnum][rt_key(rt)->sa_family];
+
+	for (i = 0; i < count; i++)
+		if (rnh_list[i] == rnh) {
+			found = 1;
+			break;
+		}
+	/* Never write past the end of the list. */
+	if (found == 0 && count < IPV4_FLOW_RNH_MAX) {
+		rnh_list[count] = rnh;
+		count++;
+		*rnh_count = count;
+	}
+}
+
+
+/*
+ * "Find next set": return the index of the first set bit in name that lies
+ * strictly after lastbit and below nbits, or -1 if there is none.  Pass -1
+ * as lastbit to start from bit 0.
+ *
+ * The previous version used the bit number as a byte offset into the bit
+ * string and looped on a value that bit_ffs() could leave unchanged.
+ */
+static __inline int
+bit_fns(bitstr_t *name, int nbits, int lastbit)
+{
+	int bit;
+
+	for (bit = lastbit + 1; bit < nbits; bit++)
+		if (bit_test(name, bit))
+			return (bit);
+
+	return (-1);
+}
+
+
+/* Slot index examined most recently by the timer; -1 restarts from slot 0. */
+static int ipv4_flow_last_index = -1;
+
+/*
+ * Periodic garbage collector.  Examines roughly a quarter of the allocated
+ * flows per run (rounded up, so that small tables are scanned too), marks
+ * idle ones stale, then walks every routing table that owns a stale flow to
+ * release them.  Re-arms itself to run again after hz ticks.
+ *
+ * arg - unused
+ */
+static void
+ipv4_flow_timeout(void *arg)
+{
+	int i, idx, rnh_count = 0;
+	struct radix_node_head *rnh;
+	struct flentry_v4 *fle;
+	uint32_t hash;
+
+	/*
+	 * scan 1/4th of the table once a second
+	 */
+	for (i = 0; i < ((ipv4_flow_allocated + 3) >> 2); i++) {
+		idx = bit_fns(ipv4_flow_bitstring, ipv4_flow_table_size,
+		    ipv4_flow_last_index);
+		if (idx == -1) {
+			/* Reached the end of the table: start over next time. */
+			ipv4_flow_last_index = -1;
+			break;
+		}
+		/* Advance the cursor so the next search continues from here. */
+		ipv4_flow_last_index = idx;
+
+		/*
+		 * Lock stripes are selected by flow hash, not by slot index,
+		 * so lock on the hash stored in the slot and then make sure
+		 * the slot still holds that flow.
+		 */
+		fle = &ipv4_flow_table[idx];
+		hash = fle->fl_fhash;
+		if (hash == 0)
+			continue;
+
+		FL_ENTRY_LOCK(hash);
+		if (fle->fl_fhash == hash)
+			ipv4_flow_check_stale(fle, ipv4_flow_rnh_list,
+			    &rnh_count);
+		FL_ENTRY_UNLOCK(hash);
+	}
+	for (i = 0; i < rnh_count; i++) {
+		rnh = ipv4_flow_rnh_list[i];
+		RADIX_NODE_HEAD_LOCK(rnh);
+		rnh->rnh_walktree(rnh, ipv4_flow_free_stale, NULL);
+		RADIX_NODE_HEAD_UNLOCK(rnh);
+	}
+
+	callout_reset(&ipv4_flow_callout, hz, ipv4_flow_timeout, NULL);
+}
+
+/*
+ * Table size used when ipv4_flow_max_count has not been set.  The value is
+ * an arbitrary placeholder -- NOTE(review): pick a real default or make it
+ * a tunable.
+ */
+#define IPV4_FLOW_DEFAULT_COUNT	4096
+
+/*
+ * Allocate the flow table, its in-use bitmap and the lock stripes.
+ *
+ * ipv4_flow_max_count is never assigned in this file, so without a fallback
+ * the table would have zero slots and FL_ENTRY_INDEX() would divide by zero
+ * on the first tracked packet.
+ *
+ * NOTE(review): ipv4_flow_callout is neither initialised nor armed here, so
+ * ipv4_flow_timeout() never runs unless something else starts it.
+ *
+ * unused - SYSINIT argument, ignored
+ */
+static void
+flowtable_init(void *unused)
+{
+	int i, nentry;
+
+	if (ipv4_flow_max_count <= 0)
+		ipv4_flow_max_count = IPV4_FLOW_DEFAULT_COUNT;
+	nentry = ipv4_flow_max_count;
+	/*
+	 * round mp_ncpus up to the next power of 2 and double
+	 * to determine the number of locks
+	 */
+	ipv4_flow_lock_count = (1 << fls(mp_ncpus)) << 1;
+
+	ipv4_flow_table_size = nentry;
+	ipv4_flow_table = malloc(nentry*sizeof(struct flentry_v4),
+	    M_RTABLE, M_WAITOK | M_ZERO);
+	ipv4_flow_bitstring = bit_alloc(nentry);
+	ipv4_flow_locks = malloc(ipv4_flow_lock_count*sizeof(struct mtx),
+	    M_RTABLE, M_WAITOK | M_ZERO);
+	for (i = 0; i < ipv4_flow_lock_count; i++)
+		mtx_init(&ipv4_flow_locks[i], "ipv4_flow", NULL, MTX_DEF);
+
+}
+SYSINIT(flowtable, SI_SUB_INIT_IF, SI_ORDER_ANY, flowtable_init, NULL);

Modified: user/kmacy/HEAD_ECMP/sys/net/radix_mpath.c
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/net/radix_mpath.c	Fri Oct 17 03:17:10 2008	(r183964)
+++ user/kmacy/HEAD_ECMP/sys/net/radix_mpath.c	Fri Oct 17 03:59:25 2008	(r183965)
@@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$");
 /*
  * give some jitter to hash, to avoid synchronization between routers
  */
-static u_int32_t hashjitter;
+uint32_t hashjitter;
 
 int
 rn_mpath_capable(struct radix_node_head *rnh)
@@ -298,7 +298,7 @@ rtalloc_mpath_fib(struct route *ro, u_in
 		return;
 	}
 	
-	rtfree(ro->ro_rt);
+	RTFREE(ro->ro_rt);
 	ro->ro_rt = (struct rtentry *)rn;
 	RT_LOCK(ro->ro_rt);
 	RT_ADDREF(ro->ro_rt);

Modified: user/kmacy/HEAD_ECMP/sys/net/radix_mpath.h
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/net/radix_mpath.h	Fri Oct 17 03:17:10 2008	(r183964)
+++ user/kmacy/HEAD_ECMP/sys/net/radix_mpath.h	Fri Oct 17 03:59:25 2008	(r183965)
@@ -58,6 +58,11 @@ int rt_mpath_deldup(struct rtentry *, st
 int	rn4_mpath_inithead(void **, int);
 int	rn6_mpath_inithead(void **, int);
 
+/* Flow table interface (net/flowtable.c). */
+/* Find or create the flow for the packet in m; may set ro->ro_rt.  Returns the flow hash. */
+uint32_t ipv4_flow_alloc(struct mbuf *m, struct route *ro);
+/* Drop the reference on the flow identified by hash. */
+void ipv4_flow_free(uint32_t hash);
+
+/* Compute the flow hash for the packet in m without touching the table. */
+uint32_t ipv4_flow_lookup_hash(struct mbuf *m);
+/* Release every flow attached to rt; the rtentry lock must be held. */
+void ipv4_flow_free_all(struct rtentry *rt);
 #endif
 
 #endif /* _NET_RADIX_MPATH_H_ */

Modified: user/kmacy/HEAD_ECMP/sys/net/route.c
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/net/route.c	Fri Oct 17 03:17:10 2008	(r183964)
+++ user/kmacy/HEAD_ECMP/sys/net/route.c	Fri Oct 17 03:59:25 2008	(r183965)
@@ -808,8 +808,10 @@ rtexpunge(struct rtentry *rt)
 		("unexpected flags 0x%x", rn->rn_flags));
 	KASSERT(rt == RNTORT(rn),
 		("lookup mismatch, rt %p rn %p", rt, rn));
-
 	rt->rt_flags &= ~RTF_UP;
+#ifdef RADIX_MPATH
+	ipv4_flow_free_all(rt);
+#endif
 
 	/*
 	 * Now search what's left of the subtree for any cloned
@@ -948,6 +950,9 @@ rtrequest1_fib(int req, struct rt_addrin
 			RT_LOCK(rt);
 			RT_ADDREF(rt);
 			rt->rt_flags &= ~RTF_UP;
+#ifdef RADIX_MPATH
+			ipv4_flow_free_all(rt);
+#endif
 			goto deldone;  /* done with the RTM_DELETE command */
 		}
 
@@ -966,7 +971,9 @@ normal_rtdel:
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
 		rt->rt_flags &= ~RTF_UP;
-
+#ifdef RADIX_MPATH
+		ipv4_flow_free_all(rt);
+#endif	
 		/*
 		 * Now search what's left of the subtree for any cloned
 		 * routes which might have been formed from this node.

Modified: user/kmacy/HEAD_ECMP/sys/net/route.h
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/net/route.h	Fri Oct 17 03:17:10 2008	(r183964)
+++ user/kmacy/HEAD_ECMP/sys/net/route.h	Fri Oct 17 03:59:25 2008	(r183965)
@@ -148,6 +148,9 @@ struct rtentry {
 #ifdef _KERNEL
 	/* XXX ugly, user apps use this definition but don't have a mtx def */
 	struct	mtx rt_mtx;		/* mutex for routing entry */
+#ifdef RADIX_MPATH
+	uint32_t 	rt_flow_head;
+#endif 
 #endif
 };
 

Modified: user/kmacy/HEAD_ECMP/sys/netinet/ip_input.c
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/netinet/ip_input.c	Fri Oct 17 03:17:10 2008	(r183964)
+++ user/kmacy/HEAD_ECMP/sys/netinet/ip_input.c	Fri Oct 17 03:59:25 2008	(r183965)
@@ -1286,7 +1286,7 @@ ip_forward(struct mbuf *m, int srcrt)
 	struct mbuf *mcopy;
 	struct in_addr dest;
 	struct route ro;
-	int error, type = 0, code = 0, mtu = 0;
+	int error, type = 0, code = 0, mtu = 0, cached = 0;
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
 		V_ipstat.ips_cantforward++;
@@ -1305,7 +1305,24 @@ ip_forward(struct mbuf *m, int srcrt)
 	}
 #endif
 
+	bzero(&ro, sizeof(ro));
+#ifdef RADIX_MPATH
+	/*
+	 * Ask the flow table for a cached route first; fall back to a fresh
+	 * multipath lookup when it has none.  "cached" records that ro.ro_rt
+	 * was handed out by the flow table.
+	 *
+	 * NOTE(review): "hash" has no declaration in the visible part of
+	 * this function; it needs a uint32_t local under RADIX_MPATH.
+	 */
+	hash = ipv4_flow_alloc(m, &ro);
+
+	if (ro.ro_rt == NULL)
+		rtalloc_mpath_fib(&ro, hash, M_GETFIB(m));
+	else
+		cached = 1;
+
+	/* ro is a struct, not a pointer: "ro->ro_rt" did not compile. */
+	if (ro.ro_rt != NULL)
+		ia = ifatoia(ro.ro_rt->rt_ifa);
+	else
+		ia = NULL;
+#else	
+	/*
+	 * I love how we go to all the trouble to look up the
+	 * route and then throw it away KMM
+	 */
 	ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
+#endif	
 	if (!srcrt && ia == NULL) {
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return;
@@ -1365,7 +1382,6 @@ ip_forward(struct mbuf *m, int srcrt)
 		struct sockaddr_in *sin;
 		struct rtentry *rt;
 
-		bzero(&ro, sizeof(ro));
 		sin = (struct sockaddr_in *)&ro.ro_dst;
 		sin->sin_family = AF_INET;
 		sin->sin_len = sizeof(*sin);
@@ -1390,7 +1406,7 @@ ip_forward(struct mbuf *m, int srcrt)
 				code = ICMP_REDIRECT_HOST;
 			}
 		}
-		if (rt)
+		if (rt && (cached == 0))
 			RTFREE(rt);
 	}
 
@@ -1398,13 +1414,15 @@ ip_forward(struct mbuf *m, int srcrt)
 	 * Try to cache the route MTU from ip_output so we can consider it for
 	 * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
 	 */
-	bzero(&ro, sizeof(ro));

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200810170359.m9H3xPks077916>