Date:      Wed, 7 May 2014 04:00:05 +0000 (UTC)
From:      Navdeep Parhar <np@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-9@freebsd.org
Subject:   svn commit: r265481 - stable/9/sys/dev/cxgbe
Message-ID:  <201405070400.s47405jC020122@svn.freebsd.org>

Author: np
Date: Wed May  7 04:00:05 2014
New Revision: 265481
URL: http://svnweb.freebsd.org/changeset/base/265481

Log:
  MFC r255050, r255052.
  
  r255050:
  Implement support for rx buffer packing.  Enable it by default for T5
  cards.
  
  This is a T4 and T5 chip feature that lets the chip deliver multiple
  Ethernet frames in a single buffer.  It is more efficient within the
  chip and in the driver, and it reduces wasted space in rx buffers.
  
  - Always allocate rx buffers from the jumbop zone, no matter what the
    MTU is.  Do not use the normal cluster refcounting mechanism.
  - Reserve space for an mbuf and a refcount in the cluster itself and let
    the chip DMA multiple frames in the rest.
  - Use the embedded mbuf for the first frame and allocate mbufs on the
    fly for any additional frames delivered in the cluster.  Each of these
    mbufs has a reference on the underlying cluster.
  
  r255052:
  Fix the sysctl that reports whether buffer packing is enabled.
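
The r255050 log is terse about the cluster layout, so here is a minimal
sketch of it (illustrative only; MSIZE and MJUMPAGESIZE are the stock
FreeBSD mbuf buffer constants from sys/param.h, and the two helpers are
lifted from the t4_sge.c diff below):

	/*
	 * A packed rx cluster comes from the jumbop zone, is
	 * MJUMPAGESIZE bytes, and is aligned to its size:
	 *
	 * |<---------------------- MJUMPAGESIZE --------------------->|
	 * | embedded mbuf   ...   refcnt | frames DMA'd by the chip   |
	 * |<----------- MSIZE ---------->|<-- MJUMPAGESIZE - MSIZE -->|
	 *
	 * The first MSIZE bytes are reserved for an embedded mbuf at
	 * the front and a 4-byte refcount at the end of that area; the
	 * chip is told the buffer is only MJUMPAGESIZE - MSIZE bytes.
	 * Size alignment lets any address inside the cluster be mapped
	 * back to the embedded mbuf or the refcount with a mask:
	 */
	static inline struct mbuf *
	find_buf_mbuf(caddr_t buf)
	{
		uintptr_t ptr = (uintptr_t)buf;

		return ((struct mbuf *)(ptr & ~(MJUMPAGESIZE - 1)));
	}

	static inline u_int *
	find_buf_refcnt(caddr_t buf)
	{
		uintptr_t ptr = (uintptr_t)buf;

		return ((u_int *)((ptr & ~(MJUMPAGESIZE - 1)) + MSIZE -
		    sizeof(u_int)));
	}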

Modified:
  stable/9/sys/dev/cxgbe/adapter.h
  stable/9/sys/dev/cxgbe/t4_sge.c
Directory Properties:
  stable/9/sys/   (props changed)
  stable/9/sys/dev/   (props changed)

Modified: stable/9/sys/dev/cxgbe/adapter.h
==============================================================================
--- stable/9/sys/dev/cxgbe/adapter.h	Wed May  7 03:17:21 2014	(r265480)
+++ stable/9/sys/dev/cxgbe/adapter.h	Wed May  7 04:00:05 2014	(r265481)
@@ -128,9 +128,9 @@ enum {
 
 	RX_FL_ESIZE = EQ_ESIZE,	/* 8 64bit addresses */
 #if MJUMPAGESIZE != MCLBYTES
-	FL_BUF_SIZES = 4,	/* cluster, jumbop, jumbo9k, jumbo16k */
+	FL_BUF_SIZES_MAX = 5,	/* cluster, jumbop, jumbo9k, jumbo16k, extra */
 #else
-	FL_BUF_SIZES = 3,	/* cluster, jumbo9k, jumbo16k */
+	FL_BUF_SIZES_MAX = 4,	/* cluster, jumbo9k, jumbo16k, extra */
 #endif
 
 	CTRL_EQ_QSIZE = 128,
@@ -165,6 +165,7 @@ enum {
 	MASTER_PF	= (1 << 3),
 	ADAP_SYSCTL_CTX	= (1 << 4),
 	TOM_INIT_DONE	= (1 << 5),
+	BUF_PACKING_OK	= (1 << 6),
 
 	CXGBE_BUSY	= (1 << 9),
 
@@ -231,12 +232,11 @@ struct port_info {
 };
 
 struct fl_sdesc {
-	struct mbuf *m;
 	bus_dmamap_t map;
 	caddr_t cl;
-	uint8_t tag_idx;	/* the sc->fl_tag this map comes from */
+	uint8_t tag_idx;	/* the fl->tag entry this map comes from */
 #ifdef INVARIANTS
-	__be64 ba_tag;
+	__be64 ba_hwtag;
 #endif
 };
 
@@ -358,9 +358,22 @@ struct sge_eq {
 	uint32_t unstalled;	/* recovered from stall */
 };
 
+struct fl_buf_info {
+	u_int size;
+	int type;
+	int hwtag:4;	/* tag in low 4 bits of the pa. */
+	uma_zone_t zone;
+};
+#define FL_BUF_SIZES(sc)	(sc->sge.fl_buf_sizes)
+#define FL_BUF_SIZE(sc, x)	(sc->sge.fl_buf_info[x].size)
+#define FL_BUF_TYPE(sc, x)	(sc->sge.fl_buf_info[x].type)
+#define FL_BUF_HWTAG(sc, x)	(sc->sge.fl_buf_info[x].hwtag)
+#define FL_BUF_ZONE(sc, x)	(sc->sge.fl_buf_info[x].zone)
+
 enum {
 	FL_STARVING	= (1 << 0), /* on the adapter's list of starving fl's */
 	FL_DOOMED	= (1 << 1), /* about to be destroyed */
+	FL_BUF_PACKING	= (1 << 2), /* buffer packing enabled */
 };
 
 #define FL_RUNNING_LOW(fl)	(fl->cap - fl->needed <= fl->lowat)
@@ -369,7 +382,8 @@ enum {
 struct sge_fl {
 	bus_dma_tag_t desc_tag;
 	bus_dmamap_t desc_map;
-	bus_dma_tag_t tag[FL_BUF_SIZES];
+	bus_dma_tag_t tag[FL_BUF_SIZES_MAX]; /* only first FL_BUF_SIZES(sc) are
+						valid */
 	uint8_t tag_idx;
 	struct mtx fl_lock;
 	char lockname[16];
@@ -382,11 +396,13 @@ struct sge_fl {
 	uint16_t qsize;		/* size (# of entries) of the queue */
 	uint16_t cntxt_id;	/* SGE context id for the freelist */
 	uint32_t cidx;		/* consumer idx (buffer idx, NOT hw desc idx) */
+	uint32_t rx_offset;	/* offset in fl buf (when buffer packing) */
 	uint32_t pidx;		/* producer idx (buffer idx, NOT hw desc idx) */
 	uint32_t needed;	/* # of buffers needed to fill up fl. */
 	uint32_t lowat;		/* # of buffers <= this means fl needs help */
 	uint32_t pending;	/* # of bufs allocated since last doorbell */
-	unsigned int dmamap_failed;
+	u_int dmamap_failed;
+	struct mbuf *mstash[8];
 	TAILQ_ENTRY(sge_fl) link; /* All starving freelists */
 };
 
@@ -518,6 +534,9 @@ struct sge {
 	int eq_start;
 	struct sge_iq **iqmap;	/* iq->cntxt_id to iq mapping */
 	struct sge_eq **eqmap;	/* eq->cntxt_id to eq mapping */
+
+	u_int fl_buf_sizes __aligned(CACHE_LINE_SIZE);
+	struct fl_buf_info fl_buf_info[FL_BUF_SIZES_MAX];
 };
 
 struct rss_header;

Modified: stable/9/sys/dev/cxgbe/t4_sge.c
==============================================================================
--- stable/9/sys/dev/cxgbe/t4_sge.c	Wed May  7 03:17:21 2014	(r265480)
+++ stable/9/sys/dev/cxgbe/t4_sge.c	Wed May  7 04:00:05 2014	(r265481)
@@ -56,19 +56,6 @@ __FBSDID("$FreeBSD$");
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 
-struct fl_buf_info {
-	int size;
-	int type;
-	uma_zone_t zone;
-};
-
-/* Filled up by t4_sge_modload */
-static struct fl_buf_info fl_buf_info[FL_BUF_SIZES];
-
-#define FL_BUF_SIZE(x)	(fl_buf_info[x].size)
-#define FL_BUF_TYPE(x)	(fl_buf_info[x].type)
-#define FL_BUF_ZONE(x)	(fl_buf_info[x].zone)
-
 #ifdef T4_PKT_TIMESTAMP
 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
 #else
@@ -85,7 +72,8 @@ TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_
 /*
  * Pad ethernet payload up to this boundary.
  * -1: driver should figure out a good value.
- *  Any power of 2, from 32 to 4096 (both inclusive) is a valid value.
+ *  0: disable padding.
+ *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
  */
 static int fl_pad = -1;
 TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);
@@ -107,6 +95,33 @@ TUNABLE_INT("hw.cxgbe.spg_len", &spg_len
 static int cong_drop = 0;
 TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);
 
+/*
+ * Deliver multiple frames in the same free list buffer if they fit.
+ * -1: let the driver decide whether to enable buffer packing or not.
+ *  0: disable buffer packing.
+ *  1: enable buffer packing.
+ */
+static int buffer_packing = -1;
+TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing);
+
+/*
+ * Start next frame in a packed buffer at this boundary.
+ * -1: driver should figure out a good value.
+ * T4:
+ * ---
+ * if fl_pad != 0
+ * 	value specified here will be overridden by fl_pad.
+ * else
+ * 	power of 2 from 32 to 4096 (both inclusive) is a valid value here.
+ * T5:
+ * ---
+ * 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
+ */
+static int fl_pack = -1;
+static int t4_fl_pack;
+static int t5_fl_pack;
+TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
+
 /* Used to track coalesced tx work request */
 struct txpkts {
 	uint64_t *flitp;	/* ptr to flit where next pkt should start */
@@ -123,12 +138,15 @@ struct sgl {
 };
 
 static int service_iq(struct sge_iq *, int);
-static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t,
+static struct mbuf *get_fl_payload1(struct adapter *, struct sge_fl *, uint32_t,
+    int *);
+static struct mbuf *get_fl_payload2(struct adapter *, struct sge_fl *, uint32_t,
     int *);
 static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int,
     int);
-static inline void init_fl(struct sge_fl *, int, int, char *);
+static inline void init_fl(struct adapter *, struct sge_fl *, int, int, int,
+    char *);
 static inline void init_eq(struct sge_eq *, int, int, uint8_t, uint16_t,
     char *);
 static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
@@ -170,8 +188,8 @@ static inline void ring_fl_db(struct ada
 static int refill_fl(struct adapter *, struct sge_fl *, int);
 static void refill_sfl(void *);
 static int alloc_fl_sdesc(struct sge_fl *);
-static void free_fl_sdesc(struct sge_fl *);
-static void set_fl_tag_idx(struct sge_fl *, int);
+static void free_fl_sdesc(struct adapter *, struct sge_fl *);
+static void set_fl_tag_idx(struct adapter *, struct sge_fl *, int);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int);
@@ -198,27 +216,20 @@ static int handle_fw_msg(struct sge_iq *
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
 
 /*
- * Called on MOD_LOAD.  Fills up fl_buf_info[] and validates/calculates the SGE
- * tunables.
+ * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
  */
 void
 t4_sge_modload(void)
 {
-	int i;
-	int bufsize[FL_BUF_SIZES] = {
-		MCLBYTES,
-#if MJUMPAGESIZE != MCLBYTES
-		MJUMPAGESIZE,
-#endif
-		MJUM9BYTES,
-		MJUM16BYTES
-	};
+	int pad;
 
-	for (i = 0; i < FL_BUF_SIZES; i++) {
-		FL_BUF_SIZE(i) = bufsize[i];
-		FL_BUF_TYPE(i) = m_gettype(bufsize[i]);
-		FL_BUF_ZONE(i) = m_getzone(bufsize[i]);
-	}
+	/* set pad to a reasonable powerof2 between 16 and 4096 (inclusive) */
+#if defined(__i386__) || defined(__amd64__)
+	pad = max(cpu_clflush_line_size, 16);
+#else
+	pad = max(CACHE_LINE_SIZE, 16);
+#endif
+	pad = min(pad, 4096);
 
 	if (fl_pktshift < 0 || fl_pktshift > 7) {
 		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
@@ -226,23 +237,35 @@ t4_sge_modload(void)
 		fl_pktshift = 2;
 	}
 
-	if (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad)) {
-		int pad;
-
-#if defined(__i386__) || defined(__amd64__)
-		pad = max(cpu_clflush_line_size, 32);
-#else
-		pad = max(CACHE_LINE_SIZE, 32);
-#endif
-		pad = min(pad, 4096);
+	if (fl_pad != 0 &&
+	    (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad))) {
 
 		if (fl_pad != -1) {
 			printf("Invalid hw.cxgbe.fl_pad value (%d),"
-			    " using %d instead.\n", fl_pad, pad);
+			    " using %d instead.\n", fl_pad, max(pad, 32));
 		}
-		fl_pad = pad;
+		fl_pad = max(pad, 32);
 	}
 
+	/*
+	 * T4 has the same pad and pack boundary.  If a pad boundary is set,
+	 * pack boundary must be set to the same value.  Otherwise take the
+	 * specified value or auto-calculate something reasonable.
+	 */
+	if (fl_pad)
+		t4_fl_pack = fl_pad;
+	else if (fl_pack < 32 || fl_pack > 4096 || !powerof2(fl_pack))
+		t4_fl_pack = max(pad, 32);
+	else
+		t4_fl_pack = fl_pack;
+
+	/* T5's pack boundary is independent of the pad boundary. */
+	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
+	    !powerof2(fl_pack))
+	       t5_fl_pack = max(pad, 64);
+	else
+	       t5_fl_pack = fl_pack;
+
 	if (spg_len != 64 && spg_len != 128) {
 		int len;
 
@@ -289,17 +312,41 @@ t4_tweak_chip_settings(struct adapter *s
 	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
 	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
+	int sw_flbuf_sizes[] = {
+		MCLBYTES,
+#if MJUMPAGESIZE != MCLBYTES
+		MJUMPAGESIZE,
+#endif
+		MJUM9BYTES,
+		MJUM16BYTES,
+		MJUMPAGESIZE - MSIZE
+	};
 
 	KASSERT(sc->flags & MASTER_PF,
 	    ("%s: trying to change chip settings when not master.", __func__));
 
-	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE |
-	    V_INGPADBOUNDARY(M_INGPADBOUNDARY) | F_EGRSTATUSPAGESIZE;
+	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
-	    V_INGPADBOUNDARY(ilog2(fl_pad) - 5) |
 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
+	if (is_t4(sc) && (fl_pad || buffer_packing)) {
+		/* t4_fl_pack has the correct value even when fl_pad = 0 */
+		m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY);
+		v |= V_INGPADBOUNDARY(ilog2(t4_fl_pack) - 5);
+	} else if (is_t5(sc) && fl_pad) {
+		m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY);
+		v |= V_INGPADBOUNDARY(ilog2(fl_pad) - 5);
+	}
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
+	if (is_t5(sc) && buffer_packing) {
+		m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
+		if (t5_fl_pack == 16)
+			v = V_INGPACKBOUNDARY(0);
+		else
+			v = V_INGPACKBOUNDARY(ilog2(t5_fl_pack) - 5);
+		t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
+	}
+
 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
@@ -310,9 +357,9 @@ t4_tweak_chip_settings(struct adapter *s
 	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
 	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
-	for (i = 0; i < FL_BUF_SIZES; i++) {
+	for (i = 0; i < min(nitems(sw_flbuf_sizes), 16); i++) {
 		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
-		    FL_BUF_SIZE(i));
+		    sw_flbuf_sizes[i]);
 	}
 
 	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
@@ -373,21 +420,48 @@ int
 t4_read_chip_settings(struct adapter *sc)
 {
 	struct sge *s = &sc->sge;
-	int i, rc = 0;
+	int i, j, n, rc = 0;
 	uint32_t m, v, r;
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
+	uint32_t sge_flbuf_sizes[16], sw_flbuf_sizes[] = {
+		MCLBYTES,
+#if MJUMPAGESIZE != MCLBYTES
+		MJUMPAGESIZE,
+#endif
+		MJUM9BYTES,
+		MJUM16BYTES
+	};
 
-	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE |
-	    V_INGPADBOUNDARY(M_INGPADBOUNDARY) | F_EGRSTATUSPAGESIZE;
+	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
-	    V_INGPADBOUNDARY(ilog2(fl_pad) - 5) |
 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
+	if (is_t4(sc) && (fl_pad || buffer_packing)) {
+		m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY);
+		v |= V_INGPADBOUNDARY(ilog2(t4_fl_pack) - 5);
+	} else if (is_t5(sc) && fl_pad) {
+		m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY);
+		v |= V_INGPADBOUNDARY(ilog2(fl_pad) - 5);
+	}
 	r = t4_read_reg(sc, A_SGE_CONTROL);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
+	if (is_t5(sc) && buffer_packing) {
+		m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
+		if (t5_fl_pack == 16)
+			v = V_INGPACKBOUNDARY(0);
+		else
+			v = V_INGPACKBOUNDARY(ilog2(t5_fl_pack) - 5);
+		r = t4_read_reg(sc, A_SGE_CONTROL2);
+		if ((r & m) != v) {
+			device_printf(sc->dev,
+			    "invalid SGE_CONTROL2(0x%x)\n", r);
+			rc = EINVAL;
+		}
+	}
+
 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
@@ -402,14 +476,45 @@ t4_read_chip_settings(struct adapter *sc
 		rc = EINVAL;
 	}
 
-	for (i = 0; i < FL_BUF_SIZES; i++) {
-		v = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
-		if (v != FL_BUF_SIZE(i)) {
-			device_printf(sc->dev,
-			    "invalid SGE_FL_BUFFER_SIZE[%d](0x%x)\n", i, v);
-			rc = EINVAL;
+	/*
+	 * Make a list of SGE FL buffer sizes programmed in the chip and tally
+	 * it with the FL buffer sizes that we'd like to use.
+	 */
+	n = 0;
+	for (i = 0; i < nitems(sge_flbuf_sizes); i++) {
+		r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
+		sge_flbuf_sizes[i] = r;
+		if (r == MJUMPAGESIZE - MSIZE &&
+		    (sc->flags & BUF_PACKING_OK) == 0) {
+			sc->flags |= BUF_PACKING_OK;
+			FL_BUF_HWTAG(sc, n) = i;
+			FL_BUF_SIZE(sc, n) = MJUMPAGESIZE - MSIZE;
+			FL_BUF_TYPE(sc, n) = m_gettype(MJUMPAGESIZE);
+			FL_BUF_ZONE(sc, n) = m_getzone(MJUMPAGESIZE);
+			n++;
+		}
+	}
+	for (i = 0; i < nitems(sw_flbuf_sizes); i++) {
+		for (j = 0; j < nitems(sge_flbuf_sizes); j++) {
+			if (sw_flbuf_sizes[i] != sge_flbuf_sizes[j])
+				continue;
+			FL_BUF_HWTAG(sc, n) = j;
+			FL_BUF_SIZE(sc, n) = sw_flbuf_sizes[i];
+			FL_BUF_TYPE(sc, n) = m_gettype(sw_flbuf_sizes[i]);
+			FL_BUF_ZONE(sc, n) = m_getzone(sw_flbuf_sizes[i]);
+			n++;
+			break;
 		}
 	}
+	if (n == 0) {
+		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
+		rc = EINVAL;
+	} else if (n == 1 && (sc->flags & BUF_PACKING_OK)) {
+		device_printf(sc->dev,
+		    "no usable SGE FL buffer size when not packing buffers.\n");
+		rc = EINVAL;
+	}
+	FL_BUF_SIZES(sc) = n;
 
 	r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD);
 	s->counter_val[0] = G_THRESHOLD_0(r);
@@ -496,6 +601,17 @@ t4_create_dma_tag(struct adapter *sc)
 	return (rc);
 }
 
+static inline int
+enable_buffer_packing(struct adapter *sc)
+{
+
+	if (sc->flags & BUF_PACKING_OK &&
+	    ((is_t5(sc) && buffer_packing) ||	/* 1 or -1 both ok for T5 */
+	    (is_t4(sc) && buffer_packing == 1)))
+		return (1);
+	return (0);
+}
+
 void
 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *children)
@@ -512,6 +628,14 @@ t4_sge_sysctls(struct adapter *sc, struc
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
 	    NULL, cong_drop, "congestion drop setting");
+
+	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "buffer_packing", CTLFLAG_RD,
+	    NULL, enable_buffer_packing(sc),
+	    "pack multiple frames in one fl buffer");
+
+	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
+	    NULL, is_t5(sc) ? t5_fl_pack : t4_fl_pack,
+	    "payload pack boundary (bytes)");
 }
 
 int
@@ -703,7 +827,7 @@ t4_setup_port_queues(struct port_info *p
 	struct ifnet *ifp = pi->ifp;
 	struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
-	int bufsize;
+	int bufsize, pack;
 
 	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD,
 	    NULL, "rx queues");
@@ -725,6 +849,7 @@ t4_setup_port_queues(struct port_info *p
 	 * b) allocate queue iff it will take direct interrupts.
 	 */
 	bufsize = mtu_to_bufsize(ifp->if_mtu);
+	pack = enable_buffer_packing(sc);
 	for_each_rxq(pi, i, rxq) {
 
 		init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, pi->qsize_rxq,
@@ -732,7 +857,7 @@ t4_setup_port_queues(struct port_info *p
 
 		snprintf(name, sizeof(name), "%s rxq%d-fl",
 		    device_get_nameunit(pi->dev), i);
-		init_fl(&rxq->fl, pi->qsize_rxq / 8, bufsize, name);
+		init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, bufsize, pack, name);
 
 		if (sc->flags & INTR_DIRECT
 #ifdef TCP_OFFLOAD
@@ -749,6 +874,7 @@ t4_setup_port_queues(struct port_info *p
 
 #ifdef TCP_OFFLOAD
 	bufsize = mtu_to_bufsize_toe(sc, ifp->if_mtu);
+	pack = 0;	/* XXX: think about this some more */
 	for_each_ofld_rxq(pi, i, ofld_rxq) {
 
 		init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx,
@@ -756,7 +882,8 @@ t4_setup_port_queues(struct port_info *p
 
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
 		    device_get_nameunit(pi->dev), i);
-		init_fl(&ofld_rxq->fl, pi->qsize_rxq / 8, bufsize, name);
+		init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, bufsize, pack,
+		    name);
 
 		if (sc->flags & INTR_DIRECT ||
 		    (sc->intr_count > 1 && pi->nofldrxq > pi->nrxq)) {
@@ -1030,7 +1157,12 @@ service_iq(struct sge_iq *iq, int budget
 				    ("%s: data for an iq (%p) with no freelist",
 				    __func__, iq));
 
-				m0 = get_fl_payload(sc, fl, lq, &fl_bufs_used);
+				m0 = fl->flags & FL_BUF_PACKING ?
+				    get_fl_payload1(sc, fl, lq, &fl_bufs_used) :
+				    get_fl_payload2(sc, fl, lq, &fl_bufs_used);
+
+				if (__predict_false(m0 == NULL))
+					goto process_iql;
 #ifdef T4_PKT_TIMESTAMP
 				/*
 				 * 60 bit timestamp for the payload is
@@ -1106,6 +1238,7 @@ service_iq(struct sge_iq *iq, int budget
 			}
 		}
 
+process_iql:
 		if (STAILQ_EMPTY(&iql))
 			break;
 
@@ -1151,13 +1284,100 @@ service_iq(struct sge_iq *iq, int budget
 	return (0);
 }
 
+static int
+fill_mbuf_stash(struct sge_fl *fl)
+{
+	int i;
+
+	for (i = 0; i < nitems(fl->mstash); i++) {
+		if (fl->mstash[i] == NULL) {
+			struct mbuf *m;
+			if ((m = m_get(M_NOWAIT, MT_NOINIT)) == NULL)
+				return (ENOBUFS);
+			fl->mstash[i] = m;
+		}
+	}
+	return (0);
+}
+
 static struct mbuf *
-get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
+get_mbuf_from_stash(struct sge_fl *fl)
+{
+	int i;
+
+	for (i = 0; i < nitems(fl->mstash); i++) {
+		if (fl->mstash[i] != NULL) {
+			struct mbuf *m;
+
+			m = fl->mstash[i];
+			fl->mstash[i] = NULL;
+			return (m);
+		} else
+			fl->mstash[i] = m_get(M_NOWAIT, MT_NOINIT);
+	}
+
+	return (m_get(M_NOWAIT, MT_NOINIT));
+}
+
+static void
+return_mbuf_to_stash(struct sge_fl *fl, struct mbuf *m)
+{
+	int i;
+
+	if (m == NULL)
+		return;
+
+	for (i = 0; i < nitems(fl->mstash); i++) {
+		if (fl->mstash[i] == NULL) {
+			fl->mstash[i] = m;
+			return;
+		}
+	}
+	m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
+	m_free(m);
+}
+
+/* buf can be any address within the buffer */
+static inline u_int *
+find_buf_refcnt(caddr_t buf)
+{
+	uintptr_t ptr = (uintptr_t)buf;
+
+	return ((u_int *)((ptr & ~(MJUMPAGESIZE - 1)) + MSIZE - sizeof(u_int)));
+}
+
+static inline struct mbuf *
+find_buf_mbuf(caddr_t buf)
+{
+	uintptr_t ptr = (uintptr_t)buf;
+
+	return ((struct mbuf *)(ptr & ~(MJUMPAGESIZE - 1)));
+}
+
+static void
+rxb_free(void *arg1, void *arg2)
+{
+	uma_zone_t zone = arg1;
+	caddr_t cl = arg2;
+#ifdef INVARIANTS
+	u_int refcount;
+
+	refcount = *find_buf_refcnt(cl);
+	KASSERT(refcount == 0, ("%s: cl %p refcount is %u", __func__,
+	    cl - MSIZE, refcount));
+#endif
+	cl -= MSIZE;
+	uma_zfree(zone, cl);
+}
+
+static struct mbuf *
+get_fl_payload1(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
     int *fl_bufs_used)
 {
 	struct mbuf *m0, *m;
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	unsigned int nbuf, len;
+	int pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack;
 
 	/*
 	 * No assertion for the fl lock because we don't need it.  This routine
@@ -1168,29 +1388,194 @@ get_fl_payload(struct adapter *sc, struc
 	 * lock but this routine does not).
 	 */
 
+	KASSERT(fl->flags & FL_BUF_PACKING,
+	    ("%s: buffer packing disabled for fl %p", __func__, fl));
+
+	len = G_RSPD_LEN(len_newbuf);
+
+	if ((len_newbuf & F_RSPD_NEWBUF) == 0) {
+		KASSERT(fl->rx_offset > 0,
+		    ("%s: packed frame but driver at offset=0", __func__));
+
+		/* A packed frame is guaranteed to fit entirely in this buf. */
+		KASSERT(FL_BUF_SIZE(sc, sd->tag_idx) - fl->rx_offset >= len,
+		    ("%s: packing error.  bufsz=%u, offset=%u, len=%u",
+		    __func__, FL_BUF_SIZE(sc, sd->tag_idx), fl->rx_offset,
+		    len));
+
+		m0 = get_mbuf_from_stash(fl);
+		if (m0 == NULL ||
+		    m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) {
+			return_mbuf_to_stash(fl, m0);
+			return (NULL);
+		}
+
+		bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
+		    BUS_DMASYNC_POSTREAD);
+		if (len < RX_COPY_THRESHOLD) {
+#ifdef T4_PKT_TIMESTAMP
+			/* Leave room for a timestamp */
+			m0->m_data += 8;
+#endif
+			bcopy(sd->cl + fl->rx_offset, mtod(m0, caddr_t), len);
+			m0->m_pkthdr.len = len;
+			m0->m_len = len;
+		} else {
+			m0->m_pkthdr.len = len;
+			m0->m_len = len;
+			m_extaddref(m0, sd->cl + fl->rx_offset,
+			    roundup2(m0->m_len, fl_pad),
+			    find_buf_refcnt(sd->cl), rxb_free,
+			    FL_BUF_ZONE(sc, sd->tag_idx), sd->cl);
+		}
+		fl->rx_offset += len;
+		fl->rx_offset = roundup2(fl->rx_offset, fl_pad);
+		fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
+		if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
+			fl->rx_offset = 0;
+			(*fl_bufs_used) += 1;
+			if (__predict_false(++fl->cidx == fl->cap))
+				fl->cidx = 0;
+		}
+
+		return (m0);
+	}
+
+	KASSERT(len_newbuf & F_RSPD_NEWBUF,
+	    ("%s: only new buffer handled here", __func__));
+
+	nbuf = 0;
+
+	/*
+	 * Move to the start of the next buffer if we are still in the middle of
+	 * some buffer.  This is the case where there was some room left in the
+	 * previous buffer but not enough to fit this frame in its entirety.
+	 */
+	if (fl->rx_offset > 0) {
+		KASSERT(roundup2(len, fl_pad) > FL_BUF_SIZE(sc, sd->tag_idx) -
+		    fl->rx_offset, ("%s: frame (%u bytes) should have fit at "
+		    "cidx %u offset %u bufsize %u", __func__, len, fl->cidx,
+		    fl->rx_offset, FL_BUF_SIZE(sc, sd->tag_idx)));
+		nbuf++;
+		fl->rx_offset = 0;
+		sd++;
+		if (__predict_false(++fl->cidx == fl->cap)) {
+			sd = fl->sdesc;
+			fl->cidx = 0;
+		}
+	}
+
+	m0 = find_buf_mbuf(sd->cl);
+	if (m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR | M_NOFREE))
+		goto done;
+	bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD);
+	m0->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
+	m_extaddref(m0, sd->cl, roundup2(m0->m_len, fl_pad),
+	    find_buf_refcnt(sd->cl), rxb_free, FL_BUF_ZONE(sc, sd->tag_idx),
+	    sd->cl);
+	m0->m_pkthdr.len = len;
+
+	fl->rx_offset = roundup2(m0->m_len, fl_pad);
+	fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
+	if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
+		fl->rx_offset = 0;
+		nbuf++;
+		sd++;
+		if (__predict_false(++fl->cidx == fl->cap)) {
+			sd = fl->sdesc;
+			fl->cidx = 0;
+		}
+	}
+
+	m = m0;
+	len -= m->m_len;
+
+	while (len > 0) {
+		m->m_next = find_buf_mbuf(sd->cl);
+		m = m->m_next;
+
+		bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
+		    BUS_DMASYNC_POSTREAD);
+
+		/* m_init for !M_PKTHDR can't fail so don't bother */
+		m_init(m, NULL, 0, M_NOWAIT, MT_DATA, M_NOFREE);
+		m->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
+		m_extaddref(m, sd->cl, roundup2(m->m_len, fl_pad),
+		    find_buf_refcnt(sd->cl), rxb_free,
+		    FL_BUF_ZONE(sc, sd->tag_idx), sd->cl);
+
+		fl->rx_offset = roundup2(m->m_len, fl_pad);
+		fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
+		if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
+			fl->rx_offset = 0;
+			nbuf++;
+			sd++;
+			if (__predict_false(++fl->cidx == fl->cap)) {
+				sd = fl->sdesc;
+				fl->cidx = 0;
+			}
+		}
+
+		len -= m->m_len;
+	}
+done:
+	(*fl_bufs_used) += nbuf;
+	return (m0);
+}
+
+static struct mbuf *
+get_fl_payload2(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
+    int *fl_bufs_used)
+{
+	struct mbuf *m0, *m;
+	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
+	unsigned int nbuf, len;
+
+	/*
+	 * No assertion for the fl lock because we don't need it.  This routine
+	 * is called only from the rx interrupt handler and it only updates
+	 * fl->cidx.  (Contrast that with fl->pidx/fl->needed which could be
+	 * updated in the rx interrupt handler or the starvation helper routine.
+	 * That's why code that manipulates fl->pidx/fl->needed needs the fl
+	 * lock but this routine does not).
+	 */
+
+	KASSERT((fl->flags & FL_BUF_PACKING) == 0,
+	    ("%s: buffer packing enabled for fl %p", __func__, fl));
 	if (__predict_false((len_newbuf & F_RSPD_NEWBUF) == 0))
 		panic("%s: cannot handle packed frames", __func__);
 	len = G_RSPD_LEN(len_newbuf);
 
-	m0 = sd->m;
-	sd->m = NULL;	/* consumed */
+	/*
+	 * We never want to run out of mbufs in between a frame when a frame
+	 * spans multiple fl buffers.  If the fl's mbuf stash isn't full and
+	 * can't be filled up to the brim then fail early.
+	 */
+	if (len > FL_BUF_SIZE(sc, sd->tag_idx) && fill_mbuf_stash(fl) != 0)
+		return (NULL);
+
+	m0 = get_mbuf_from_stash(fl);
+	if (m0 == NULL ||
+	    m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) {
+		return_mbuf_to_stash(fl, m0);
+		return (NULL);
+	}
 
 	bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD);
-	m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR);
-#ifdef T4_PKT_TIMESTAMP
-	/* Leave room for a timestamp */
-	m0->m_data += 8;
-#endif
 
 	if (len < RX_COPY_THRESHOLD) {
+#ifdef T4_PKT_TIMESTAMP
+		/* Leave room for a timestamp */
+		m0->m_data += 8;
+#endif
 		/* copy data to mbuf, buffer will be recycled */
 		bcopy(sd->cl, mtod(m0, caddr_t), len);
 		m0->m_len = len;
 	} else {
 		bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map);
-		m_cljset(m0, sd->cl, FL_BUF_TYPE(sd->tag_idx));
+		m_cljset(m0, sd->cl, FL_BUF_TYPE(sc, sd->tag_idx));
 		sd->cl = NULL;	/* consumed */
-		m0->m_len = min(len, FL_BUF_SIZE(sd->tag_idx));
+		m0->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
 	}
 	m0->m_pkthdr.len = len;
 
@@ -1205,23 +1590,23 @@ get_fl_payload(struct adapter *sc, struc
 	nbuf = 1;	/* # of fl buffers used */
 
 	while (len > 0) {
-		m->m_next = sd->m;
-		sd->m = NULL;	/* consumed */
+		/* Can't fail, we checked earlier that the stash was full. */
+		m->m_next = get_mbuf_from_stash(fl);
 		m = m->m_next;
 
 		bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
 		    BUS_DMASYNC_POSTREAD);
 
+		/* m_init for !M_PKTHDR can't fail so don't bother */
 		m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
 		if (len <= MLEN) {
 			bcopy(sd->cl, mtod(m, caddr_t), len);
 			m->m_len = len;
 		} else {
-			bus_dmamap_unload(fl->tag[sd->tag_idx],
-			    sd->map);
-			m_cljset(m, sd->cl, FL_BUF_TYPE(sd->tag_idx));
+			bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map);
+			m_cljset(m, sd->cl, FL_BUF_TYPE(sc, sd->tag_idx));
 			sd->cl = NULL;	/* consumed */
-			m->m_len = min(len, FL_BUF_SIZE(sd->tag_idx));
+			m->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
 		}
 
 		sd++;
@@ -1586,6 +1971,7 @@ void
 t4_update_fl_bufsize(struct ifnet *ifp)
 {
 	struct port_info *pi = ifp->if_softc;
+	struct adapter *sc = pi->adapter;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
@@ -1598,7 +1984,7 @@ t4_update_fl_bufsize(struct ifnet *ifp)
 		fl = &rxq->fl;
 
 		FL_LOCK(fl);
-		set_fl_tag_idx(fl, bufsize);
+		set_fl_tag_idx(sc, fl, bufsize);
 		FL_UNLOCK(fl);
 	}
 #ifdef TCP_OFFLOAD
@@ -1607,7 +1993,7 @@ t4_update_fl_bufsize(struct ifnet *ifp)
 		fl = &ofld_rxq->fl;
 
 		FL_LOCK(fl);
-		set_fl_tag_idx(fl, bufsize);
+		set_fl_tag_idx(sc, fl, bufsize);
 		FL_UNLOCK(fl);
 	}
 #endif
@@ -1641,11 +2027,15 @@ init_iq(struct sge_iq *iq, struct adapte
 }
 
 static inline void
-init_fl(struct sge_fl *fl, int qsize, int bufsize, char *name)
+init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int bufsize, int pack,
+    char *name)
 {
+
 	fl->qsize = qsize;
 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
-	set_fl_tag_idx(fl, bufsize);
+	if (pack)
+		fl->flags |= FL_BUF_PACKING;
+	set_fl_tag_idx(sc, fl, bufsize);
 }
 
 static inline void
@@ -1774,7 +2164,7 @@ alloc_iq_fl(struct port_info *pi, struct
 	if (fl) {
 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
 
-		for (i = 0; i < FL_BUF_SIZES; i++) {
+		for (i = 0; i < FL_BUF_SIZES(sc); i++) {
 
 			/*
 			 * A freelist buffer must be 16 byte aligned as the SGE
@@ -1783,8 +2173,8 @@ alloc_iq_fl(struct port_info *pi, struct
 			 */
 			rc = bus_dma_tag_create(sc->dmat, 16, 0,
 			    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
-			    FL_BUF_SIZE(i), 1, FL_BUF_SIZE(i), BUS_DMA_ALLOCNOW,
-			    NULL, NULL, &fl->tag[i]);
+			    FL_BUF_SIZE(sc, i), 1, FL_BUF_SIZE(sc, i),
+			    BUS_DMA_ALLOCNOW, NULL, NULL, &fl->tag[i]);
 			if (rc != 0) {
 				device_printf(sc->dev,
 				    "failed to create fl DMA tag[%d]: %d\n",
@@ -1813,7 +2203,9 @@ alloc_iq_fl(struct port_info *pi, struct
 		c.iqns_to_fl0congen |=
 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
-			F_FW_IQ_CMD_FL0PADEN);
+			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
+			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
+			    0));
 		if (cong >= 0) {
 			c.iqns_to_fl0congen |=
 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
@@ -1934,12 +2326,21 @@ free_iq_fl(struct port_info *pi, struct 
 		    fl->desc);
 
 		if (fl->sdesc)
-			free_fl_sdesc(fl);
+			free_fl_sdesc(sc, fl);
+
+		for (i = 0; i < nitems(fl->mstash); i++) {
+			struct mbuf *m = fl->mstash[i];
+
+			if (m != NULL) {
+				m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
+				m_free(m);
+			}
+		}
 
 		if (mtx_initialized(&fl->fl_lock))
 			mtx_destroy(&fl->fl_lock);
 
-		for (i = 0; i < FL_BUF_SIZES; i++) {
+		for (i = 0; i < FL_BUF_SIZES(sc); i++) {
 			if (fl->tag[i])
 				bus_dma_tag_destroy(fl->tag[i]);
 		}
@@ -2100,6 +2501,10 @@ alloc_rxq(struct port_info *pi, struct s
 	    "SGE context id of the queue");
 	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
 	    &rxq->fl.cidx, 0, "consumer index");
+	if (rxq->fl.flags & FL_BUF_PACKING) {
+		SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "rx_offset",
+		    CTLFLAG_RD, &rxq->fl.rx_offset, 0, "packing rx offset");
+	}
 	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
 	    &rxq->fl.pidx, 0, "producer index");
 
@@ -2661,6 +3066,12 @@ refill_fl(struct adapter *sc, struct sge
 	int rc;
 
 	FL_LOCK_ASSERT_OWNED(fl);
+#ifdef INVARIANTS
+	if (fl->flags & FL_BUF_PACKING)
+		KASSERT(sd->tag_idx == 0,
+		    ("%s: expected tag 0 but found tag %d at pidx %u instead",
+		    __func__, sd->tag_idx, fl->pidx));
+#endif
 
 	if (nbufs > fl->needed)
 		nbufs = fl->needed;
@@ -2669,24 +3080,34 @@ refill_fl(struct adapter *sc, struct sge
 
 		if (sd->cl != NULL) {
 
-			/*
-			 * This happens when a frame small enough to fit

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
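
As a usage note: the two knobs this change introduces are boot-time
loader tunables, and the matching read-only sysctls added in
t4_sge_sysctls() (buffer_packing and fl_pack under the adapter's sysctl
tree) report the values actually in effect.  A hypothetical
/boot/loader.conf fragment with example values (the -1 defaults let the
driver choose; with -1, packing ends up enabled on T5 and disabled on
T4, per enable_buffer_packing() above):

	# Force rx buffer packing on for both T4 and T5 cards.
	hw.cxgbe.buffer_packing="1"

	# Start each packed frame on a 64 byte boundary.  64 is legal
	# for both chips: T4 accepts powers of 2 from 32 to 4096 (but
	# fl_pad, if set, overrides this), T5 accepts 16 or powers of 2
	# from 64 to 4096.
	hw.cxgbe.fl_pack="64"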


