Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 05 Sep 2014 20:37:53 +0200
From:      Hans Petter Selasky <hps@selasky.org>
To:        FreeBSD Current <freebsd-current@freebsd.org>,  "freebsd-net@freebsd.org" <freebsd-net@freebsd.org>, Scott Long <scottl@FreeBSD.org>
Subject:   [RFC] Patch to improve TSO limitation formula in general
Message-ID:  <540A0301.9040701@selasky.org>

next in thread | raw e-mail | index | archive | help
This is a multi-part message in MIME format.
--------------020609020709040903060408
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit

Hi,

I've tested the attached patch successfully and would like some 
feedback from other FreeBSD network developers. The problem is that the 
current TSO limit only restricts the number of bytes that can be 
transferred in a TSO packet, not the number of mbufs.

The current workaround is a quick and dirty custom m_dup() in the 
TX path that re-allocates the mbuf chains into 4K ones to keep things 
simple. This whole hack can be avoided if the definition of the TSO 
limit is changed a bit, as shown here:


  /*
+ * Structure defining hardware TSO limits.
+ */
+struct if_tso_limit {
+       u_int raw_value[0];     /* access all fields as one */
+       u_char frag_count;      /* maximum number of fragments: 1..255 */
+       u_char frag_size_log2;  /* maximum fragment size: 2 ** (12..16) */
+       u_char hdr_size_log2;   /* maximum header size: 2 ** (2..8) */
+       u_char reserved;        /* zero */
+};


First, we need to know the maximum fragment count. A typical value is 32.
Second, we need to know the maximum fragment size. A typical value is 4K.
Last, we need to know the size of any headers that should be subtracted 
from the maximum. Since this code runs in the fast path, I would like to 
use "u_char" for all fields and allow copy-only access as a "u_int" as 
an optimization. This avoids kludges and messing with additional header 
files.

I would like to push this patch to -current after some more testing, and 
then to 10-stable, hopefully before the coming 10-release, because the 
current solution unfairly hurts the performance of Mellanox-based network 
adapters. For example, setting the current TSO limit to 32 KBytes, which 
is safe when all fragments are 2K, causes a severe degradation in 
performance, even though the hardware is fully capable of transmitting 
16 4K mbufs.

Comments and reviews are welcome!

--HPS

--------------020609020709040903060408
Content-Type: text/x-diff;
 name="tso.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="tso.diff"

=== sys/dev/oce/oce_if.c
==================================================================
--- sys/dev/oce/oce_if.c	(revision 270996)
+++ sys/dev/oce/oce_if.c	(local)
@@ -1731,7 +1731,9 @@
 	sc->ifp->if_baudrate = IF_Gbps(10);
 
 #if __FreeBSD_version >= 1000000
-	sc->ifp->if_hw_tsomax = OCE_MAX_TSO_SIZE;
+	sc->ifp->if_hw_tsomax.frag_count = 29;		/* 29 elements */
+	sc->ifp->if_hw_tsomax.frag_size_log2 = 12;	/* 4K */
+	sc->ifp->if_hw_tsomax.hdr_size_log2 = 5;	/* ETH+VLAN < 2**5 */
 #endif
 
 	ether_ifattach(sc->ifp, sc->macaddr.mac_addr);
=== sys/dev/oce/oce_if.h
==================================================================
--- sys/dev/oce/oce_if.h	(revision 270996)
+++ sys/dev/oce/oce_if.h	(local)
@@ -152,7 +152,6 @@
 #define OCE_MAX_TX_ELEMENTS		29
 #define OCE_MAX_TX_DESC			1024
 #define OCE_MAX_TX_SIZE			65535
-#define OCE_MAX_TSO_SIZE		(65535 - ETHER_HDR_LEN)
 #define OCE_MAX_RX_SIZE			4096
 #define OCE_MAX_RQ_POSTS		255
 #define OCE_DEFAULT_PROMISCUOUS		0
=== sys/dev/vmware/vmxnet3/if_vmx.c
==================================================================
--- sys/dev/vmware/vmxnet3/if_vmx.c	(revision 270996)
+++ sys/dev/vmware/vmxnet3/if_vmx.c	(local)
@@ -1722,7 +1722,9 @@
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_init = vmxnet3_init;
 	ifp->if_ioctl = vmxnet3_ioctl;
-	ifp->if_hw_tsomax = VMXNET3_TSO_MAXSIZE;
+	ifp->if_hw_tsomax.frag_count = VMXNET3_TX_MAXSEGS;
+	ifp->if_hw_tsomax.frag_size_log2 = VMXNET3_TX_MAXSEGSHIFT;
+	ifp->if_hw_tsomax.hdr_size_log2 = 5;	/* ETH+VLAN < 2**5 */
 
 #ifdef VMXNET3_LEGACY_TX
 	ifp->if_start = vmxnet3_start;
=== sys/dev/vmware/vmxnet3/if_vmxvar.h
==================================================================
--- sys/dev/vmware/vmxnet3/if_vmxvar.h	(revision 270996)
+++ sys/dev/vmware/vmxnet3/if_vmxvar.h	(local)
@@ -277,14 +277,13 @@
  */
 #define VMXNET3_TX_MAXSEGS		32
 #define VMXNET3_TX_MAXSIZE		(VMXNET3_TX_MAXSEGS * MCLBYTES)
-#define VMXNET3_TSO_MAXSIZE \
-    (VMXNET3_TX_MAXSIZE - sizeof(struct ether_vlan_header))
 
 /*
  * Maximum support Tx segments size. The length field in the
  * Tx descriptor is 14 bits.
  */
-#define VMXNET3_TX_MAXSEGSIZE		(1 << 14)
+#define VMXNET3_TX_MAXSEGSHIFT		14
+#define VMXNET3_TX_MAXSEGSIZE		(1 << VMXNET3_TX_MAXSEGSHIFT)
 
 /*
  * The maximum number of Rx segments we accept. When LRO is enabled,
=== sys/dev/xen/netfront/netfront.c
==================================================================
--- sys/dev/xen/netfront/netfront.c	(revision 270996)
+++ sys/dev/xen/netfront/netfront.c	(local)
@@ -134,7 +134,6 @@
  * to mirror the Linux MAX_SKB_FRAGS constant.
  */
 #define	MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2)
-#define	NF_TSO_MAXBURST ((IP_MAXPACKET / PAGE_SIZE) * MCLBYTES)
 
 #define RX_COPY_THRESHOLD 256
 
@@ -2102,7 +2101,9 @@
 	
     	ifp->if_hwassist = XN_CSUM_FEATURES;
     	ifp->if_capabilities = IFCAP_HWCSUM;
-	ifp->if_hw_tsomax = NF_TSO_MAXBURST;
+	ifp->if_hw_tsomax.frag_count = MAX_TX_REQ_FRAGS;
+	ifp->if_hw_tsomax.frag_size_log2 = PAGE_SHIFT;
+	ifp->if_hw_tsomax.hdr_size_log2 = 5;	/* ETH+VLAN < 2**5 */
 	
     	ether_ifattach(ifp, np->mac);
     	callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE);
=== sys/net/if.c
==================================================================
--- sys/net/if.c	(revision 270996)
+++ sys/net/if.c	(local)
@@ -445,6 +445,7 @@
 	ifp->if_index = idx;
 	ifp->if_type = type;
 	ifp->if_alloctype = type;
+	ifp->if_hw_tsomax = IF_TSO_LIMIT_DEFAULT();
 	if (if_com_alloc[type] != NULL) {
 		ifp->if_l2com = if_com_alloc[type](type, ifp);
 		if (ifp->if_l2com == NULL) {
@@ -657,16 +658,6 @@
 		TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
 		/* Reliably crash if used uninitialized. */
 		ifp->if_broadcastaddr = NULL;
-
-#if defined(INET) || defined(INET6)
-		/* Initialize to max value. */
-		if (ifp->if_hw_tsomax == 0)
-			ifp->if_hw_tsomax = min(IP_MAXPACKET, 32 * MCLBYTES -
-			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
-		KASSERT(ifp->if_hw_tsomax <= IP_MAXPACKET &&
-		    ifp->if_hw_tsomax >= IP_MAXPACKET / 8,
-		    ("%s: tsomax outside of range", __func__));
-#endif
 	}
 #ifdef VIMAGE
 	else {
=== sys/net/if_lagg.c
==================================================================
--- sys/net/if_lagg.c	(revision 270996)
+++ sys/net/if_lagg.c	(local)
@@ -445,11 +445,7 @@
 	struct lagg_port *lp;
 	int cap = ~0, ena = ~0;
 	u_long hwa = ~0UL;
-#if defined(INET) || defined(INET6)
-	u_int hw_tsomax = IP_MAXPACKET;	/* Initialize to the maximum value. */
-#else
-	u_int hw_tsomax = ~0;	/* if_hw_tsomax is only for INET/INET6, but.. */
-#endif
+	struct if_tso_limit hw_tsomax = IF_TSO_LIMIT_DEFAULT();
 
 	LAGG_WLOCK_ASSERT(sc);
 
@@ -458,10 +454,9 @@
 		cap &= lp->lp_ifp->if_capabilities;
 		ena &= lp->lp_ifp->if_capenable;
 		hwa &= lp->lp_ifp->if_hwassist;
-		/* Set to the minimum value of the lagg ports. */
-		if (lp->lp_ifp->if_hw_tsomax < hw_tsomax &&
-		    lp->lp_ifp->if_hw_tsomax > 0)
-			hw_tsomax = lp->lp_ifp->if_hw_tsomax;
+		/* Set to the common value of the lagg ports. */
+		hw_tsomax = IF_TSO_LIMIT_COMMON(&hw_tsomax,
+		    &lp->lp_ifp->if_hw_tsomax);
 	}
 	cap = (cap == ~0 ? 0 : cap);
 	ena = (ena == ~0 ? 0 : ena);
@@ -470,7 +465,7 @@
 	if (sc->sc_ifp->if_capabilities != cap ||
 	    sc->sc_ifp->if_capenable != ena ||
 	    sc->sc_ifp->if_hwassist != hwa ||
-	    sc->sc_ifp->if_hw_tsomax != hw_tsomax) {
+	    IF_TSO_LIMIT_CMP(&sc->sc_ifp->if_hw_tsomax, !=, &hw_tsomax)) {
 		sc->sc_ifp->if_capabilities = cap;
 		sc->sc_ifp->if_capenable = ena;
 		sc->sc_ifp->if_hwassist = hwa;
=== sys/net/if_var.h
==================================================================
--- sys/net/if_var.h	(revision 270996)
+++ sys/net/if_var.h	(local)
@@ -120,6 +120,36 @@
 typedef	uint64_t (*if_get_counter_t)(if_t, ifnet_counter);
 
 /*
+ * Structure defining hardware TSO limits.
+ */
+struct if_tso_limit {
+	u_int raw_value[0];	/* access all fields as one */
+	u_char frag_count;	/* maximum number of fragments: 1..255 */
+	u_char frag_size_log2;	/* maximum fragment size: 2 ** (12..16) */
+	u_char hdr_size_log2;	/* maximum header size: 2 ** (2..8) */
+	u_char reserved;	/* zero */
+};
+
+#define	IF_TSO_LIMIT_DEFAULT() ({		\
+struct if_tso_limit tso_temp = {		\
+  .frag_count = 128,				\
+  .frag_size_log2 = 16,				\
+  .hdr_size_log2 = 2,				\
+  .reserved = 0,				\
+}; tso_temp; })
+
+#define	IF_TSO_LIMIT_COMMON(a,b) ({				\
+struct if_tso_limit tso_temp = {				\
+  .frag_count = min((a)->frag_count, (b)->frag_count),		\
+  .frag_size_log2 = min((a)->frag_size_log2, (b)->frag_size_log2),\
+  .hdr_size_log2 = max((a)->hdr_size_log2, (b)->hdr_size_log2),	\
+  .reserved = 0,						\
+}; tso_temp; })
+
+#define	IF_TSO_LIMIT_CMP(a,op,b)		\
+  ((a)->raw_value[0] op (b)->raw_value[0])
+
+/*
  * Structure defining a network interface.
  *
  * Size ILP32:  592 (approx)
@@ -222,10 +252,8 @@
 	if_get_counter_t if_get_counter; /* get counter values */
 
 	/* Stuff that's only temporary and doesn't belong here. */
-	u_int	if_hw_tsomax;		/* tso burst length limit, the minimum
-					 * is (IP_MAXPACKET / 8).
-					 * XXXAO: Have to find a better place
-					 * for it eventually. */
+	struct if_tso_limit if_hw_tsomax;
+
 	/*
 	 * Old, racy and expensive statistics, should not be used in
 	 * new drivers.
=== sys/net/if_vlan.c
==================================================================
--- sys/net/if_vlan.c	(revision 270996)
+++ sys/net/if_vlan.c	(local)
@@ -1511,8 +1511,8 @@
 	 * propagate the hardware-assisted flag. TSO on VLANs
 	 * does not necessarily require hardware VLAN tagging.
 	 */
-	if (p->if_hw_tsomax > 0)
-		ifp->if_hw_tsomax = p->if_hw_tsomax;
+	ifp->if_hw_tsomax = IF_TSO_LIMIT_COMMON(&ifp->if_hw_tsomax,
+	    &p->if_hw_tsomax);
 	if (p->if_capabilities & IFCAP_VLAN_HWTSO)
 		ifp->if_capabilities |= p->if_capabilities & IFCAP_TSO;
 	if (p->if_capenable & IFCAP_VLAN_HWTSO) {
=== sys/netinet/tcp_output.c
==================================================================
--- sys/netinet/tcp_output.c	(revision 270996)
+++ sys/netinet/tcp_output.c	(local)
@@ -767,9 +767,70 @@
 		flags &= ~TH_FIN;
 
 		if (tso) {
+			struct if_tso_limit if_hw_tsomax;
+			struct mbuf *mb;
+			u_int rem_frags;
+			u_int moff;
+			int max_len;
+
+			/* copy TSO limit information */
+			if_hw_tsomax.raw_value[0] = tp->t_tsomax;
+
+			/* compute maximum TSO length */
+			max_len = (((u_int)if_hw_tsomax.frag_count) <<
+			    if_hw_tsomax.frag_size_log2) - hdrlen -
+			    (1 << if_hw_tsomax.hdr_size_log2);
+
+			/* clamp maximum length value */
+			if (max_len > IP_MAXPACKET)
+				max_len = IP_MAXPACKET;
+			else if (max_len < 0)
+				max_len = 0;
+
+			/* get smallest length */
+			if (len > (u_int)max_len) {
+				sendalot = 1;
+				len = (u_int)max_len;
+			}
+
+			/* get remaining fragments */
+			rem_frags = if_hw_tsomax.frag_count;
+
 			KASSERT(ipoptlen == 0,
 			    ("%s: TSO can't do IP options", __func__));
 
+			max_len = 0;
+			mb = sbsndptr(&so->so_snd, off, len, &moff);
+
+			/* now make sure the number of fragments fit too */
+			while (mb != NULL && (u_int)max_len < len) {
+				u_int cur_length;
+				u_int cur_frags;
+
+				/*
+				 * Get length of mbuf fragment and how
+				 * many hardware frags it would use:
+				 */
+				cur_length = (mb->m_len - moff);
+				cur_frags = (cur_length +
+				    (1 << if_hw_tsomax.frag_size_log2) - 1)
+				    >> if_hw_tsomax.frag_size_log2;
+
+				/* Handle special case: Zero Length Mbuf */
+				if (cur_frags == 0)
+					cur_frags = 1;
+
+				/* Check if fragment limit will be exceeded */
+				if (cur_frags >= rem_frags) {
+					max_len += min(cur_length, rem_frags << if_hw_tsomax.frag_size_log2);
+					break;
+				}
+				max_len += cur_length;
+				rem_frags -= cur_frags;
+				moff = 0;
+				mb = mb->m_next;
+			}
+
 			/*
 			 * Limit a burst to t_tsomax minus IP,
 			 * TCP and options length to keep ip->ip_len
@@ -776,8 +837,8 @@
 			 * from overflowing or exceeding the maximum
 			 * length allowed by the network interface.
 			 */
-			if (len > tp->t_tsomax - hdrlen) {
-				len = tp->t_tsomax - hdrlen;
+			if (len > (u_int)max_len) {
+				len = (u_int)max_len;
 				sendalot = 1;
 			}
 
=== sys/netinet/tcp_subr.c
==================================================================
--- sys/netinet/tcp_subr.c	(revision 270996)
+++ sys/netinet/tcp_subr.c	(local)
@@ -1818,7 +1818,7 @@
 			if (ifp->if_capenable & IFCAP_TSO4 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
-				cap->tsomax = ifp->if_hw_tsomax;
+				cap->tsomax = ifp->if_hw_tsomax.raw_value[0];
 			}
 		}
 		RTFREE(sro.ro_rt);
@@ -1857,7 +1857,7 @@
 			if (ifp->if_capenable & IFCAP_TSO6 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
-				cap->tsomax = ifp->if_hw_tsomax;
+				cap->tsomax = ifp->if_hw_tsomax.raw_value[0];
 			}
 		}
 		RTFREE(sro6.ro_rt);

--------------020609020709040903060408--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?540A0301.9040701>