Date:      Mon, 26 Jan 2015 14:47:32 +0300
From:      Andrew Rybchenko <Andrew.Rybchenko@oktetlabs.ru>
To:        net@FreeBSD.org
Subject:   [PATCH] sfxge improvements and bug fixes
Message-ID:  <54C62954.5030004@oktetlabs.ru>

This is a multi-part message in MIME format.
--------------050201000503010305030302
Content-Type: text/plain; charset=utf-8; format=flowed
Content-Transfer-Encoding: 7bit

Hello,

My colleagues and I have a number of patches for the sfxge driver, including:
  - performance optimizations
  - a fix to make TSO work correctly when a VLAN is used (IPv6 does not 
work yet; it will be fixed soon)
  - a tunable to control the maximum number of RSS channels
  - an improved software Tx queue with separate limits for TCP and 
non-TCP traffic and more granular overflow statistics
  - cleanups

Patches are attached.

The goal of this submission is to get feedback and review notes 
(including from a mentor).

Best regards,
Andrew.

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="01-64bit.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="01-64bit.patch"

sfxge: use 64-bit access on x86-64

Submitted by:   Artem V. Andreev <Artem.Andreev at oktetlabs.ru>
Sponsored by:   Solarflare Communications, Inc.
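
For illustration only (a simplified sketch, not part of the patch): on
x86-64 a naturally aligned quadword in DMA-coherent memory can be read
with a single 64-bit load instead of two 32-bit loads, which is what the
EFSYS_MEM_*/EFSYS_BAR_* changes below do under #if defined(__x86_64__).

/* Sketch only: simplified analogue of the 64-bit access idea. */
#include <stddef.h>
#include <stdint.h>

static inline uint64_t
read_qword(const volatile void *base, size_t offset)
{
#if defined(__x86_64__)
	/* One naturally aligned 64-bit load. */
	return (*(const volatile uint64_t *)
	    ((const volatile char *)base + offset));
#else
	/* Two 32-bit loads, combined assuming a little-endian layout. */
	const volatile uint32_t *p = (const volatile uint32_t *)
	    ((const volatile char *)base + offset);
	return ((uint64_t)p[0] | ((uint64_t)p[1] << 32));
#endif
}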

Index: sys/dev/sfxge/common/efsys.h
===================================================================
--- sys/dev/sfxge/common/efsys.h	(revision 277726)
+++ sys/dev/sfxge/common/efsys.h	(working copy)
@@ -51,7 +51,11 @@
 #include <machine/endian.h>
 
 #define	EFSYS_HAS_UINT64 1
+#if defined(__x86_64__)
+#define	EFSYS_USE_UINT64 1
+#else
 #define	EFSYS_USE_UINT64 0
+#endif
 #if _BYTE_ORDER == _BIG_ENDIAN
 #define	EFSYS_IS_BIG_ENDIAN 1
 #define	EFSYS_IS_LITTLE_ENDIAN 0
@@ -399,8 +403,28 @@
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
+#if defined(__x86_64__)
 #define	EFSYS_MEM_READQ(_esmp, _offset, _eqp)				\
 	do {								\
+		uint64_t *addr;						\
+									\
+		_NOTE(CONSTANTCONDITION)				\
+		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_qword_t)),	\
+		    ("not power of 2 aligned"));			\
+									\
+		addr = (void *)((_esmp)->esm_base + (_offset));		\
+									\
+		(_eqp)->eq_u64[0] = *addr;				\
+									\
+		EFSYS_PROBE3(mem_readq, unsigned int, (_offset),	\
+		    uint32_t, (_eqp)->eq_u32[1],			\
+		    uint32_t, (_eqp)->eq_u32[0]);			\
+									\
+	_NOTE(CONSTANTCONDITION)					\
+	} while (B_FALSE)
+#else
+#define	EFSYS_MEM_READQ(_esmp, _offset, _eqp)				\
+	do {								\
 		uint32_t *addr;						\
 									\
 		_NOTE(CONSTANTCONDITION)				\
@@ -418,9 +442,33 @@
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
+#endif
 
+#if defined(__x86_64__)
 #define	EFSYS_MEM_READO(_esmp, _offset, _eop)				\
 	do {								\
+		uint64_t *addr;						\
+									\
+		_NOTE(CONSTANTCONDITION)				\
+		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_oword_t)),	\
+		    ("not power of 2 aligned"));			\
+									\
+		addr = (void *)((_esmp)->esm_base + (_offset));		\
+									\
+		(_eop)->eo_u64[0] = *addr++;				\
+		(_eop)->eo_u64[1] = *addr;				\
+									\
+		EFSYS_PROBE5(mem_reado, unsigned int, (_offset),	\
+		    uint32_t, (_eop)->eo_u32[3],			\
+		    uint32_t, (_eop)->eo_u32[2],			\
+		    uint32_t, (_eop)->eo_u32[1],			\
+		    uint32_t, (_eop)->eo_u32[0]);			\
+									\
+	_NOTE(CONSTANTCONDITION)					\
+	} while (B_FALSE)
+#else
+#define	EFSYS_MEM_READO(_esmp, _offset, _eop)				\
+	do {								\
 		uint32_t *addr;						\
 									\
 		_NOTE(CONSTANTCONDITION)				\
@@ -442,6 +490,7 @@
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
+#endif
 
 #define	EFSYS_MEM_WRITED(_esmp, _offset, _edp)				\
 	do {								\
@@ -461,8 +510,29 @@
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
+#if defined(__x86_64__)
 #define	EFSYS_MEM_WRITEQ(_esmp, _offset, _eqp)				\
 	do {								\
+		uint64_t *addr;						\
+									\
+		_NOTE(CONSTANTCONDITION)				\
+		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_qword_t)),	\
+		    ("not power of 2 aligned"));			\
+									\
+		EFSYS_PROBE3(mem_writeq, unsigned int, (_offset),	\
+		    uint32_t, (_eqp)->eq_u32[1],			\
+		    uint32_t, (_eqp)->eq_u32[0]);			\
+									\
+		addr = (void *)((_esmp)->esm_base + (_offset));		\
+									\
+		*addr   = (_eqp)->eq_u64[0];				\
+									\
+	_NOTE(CONSTANTCONDITION)					\
+	} while (B_FALSE)
+
+#else
+#define	EFSYS_MEM_WRITEQ(_esmp, _offset, _eqp)				\
+	do {								\
 		uint32_t *addr;						\
 									\
 		_NOTE(CONSTANTCONDITION)				\
@@ -480,9 +550,33 @@
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
+#endif
 
+#if defined(__x86_64__)
 #define	EFSYS_MEM_WRITEO(_esmp, _offset, _eop)				\
 	do {								\
+		uint64_t *addr;						\
+									\
+		_NOTE(CONSTANTCONDITION)				\
+		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_oword_t)),	\
+		    ("not power of 2 aligned"));			\
+									\
+		EFSYS_PROBE5(mem_writeo, unsigned int, (_offset),	\
+		    uint32_t, (_eop)->eo_u32[3],			\
+		    uint32_t, (_eop)->eo_u32[2],			\
+		    uint32_t, (_eop)->eo_u32[1],			\
+		    uint32_t, (_eop)->eo_u32[0]);			\
+									\
+		addr = (void *)((_esmp)->esm_base + (_offset));		\
+									\
+		*addr++ = (_eop)->eo_u64[0];				\
+		*addr   = (_eop)->eo_u64[1];				\
+									\
+	_NOTE(CONSTANTCONDITION)					\
+	} while (B_FALSE)
+#else
+#define	EFSYS_MEM_WRITEO(_esmp, _offset, _eop)				\
+	do {								\
 		uint32_t *addr;						\
 									\
 		_NOTE(CONSTANTCONDITION)				\
@@ -504,6 +598,7 @@
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
+#endif
 
 #define	EFSYS_MEM_ADDR(_esmp)						\
 	((_esmp)->esm_addr)
@@ -540,6 +635,7 @@
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
+#if defined(__x86_64__)
 #define	EFSYS_BAR_READQ(_esbp, _offset, _eqp)				\
 	do {								\
 		_NOTE(CONSTANTCONDITION)				\
@@ -548,6 +644,53 @@
 									\
 		mtx_lock(&((_esbp)->esb_lock));				\
 									\
+		(_eqp)->eq_u64[0] = bus_space_read_8((_esbp)->esb_tag,	\
+		    (_esbp)->esb_handle, (_offset));			\
+									\
+		EFSYS_PROBE3(bar_readq, unsigned int, (_offset),	\
+		    uint32_t, (_eqp)->eq_u32[1],			\
+		    uint32_t, (_eqp)->eq_u32[0]);			\
+									\
+		mtx_unlock(&((_esbp)->esb_lock));			\
+	_NOTE(CONSTANTCONDITION)					\
+	} while (B_FALSE)
+
+#define	EFSYS_BAR_READO(_esbp, _offset, _eop, _lock)			\
+	do {								\
+		_NOTE(CONSTANTCONDITION)				\
+		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_oword_t)),	\
+		    ("not power of 2 aligned"));			\
+									\
+		_NOTE(CONSTANTCONDITION)				\
+		if (_lock)						\
+			mtx_lock(&((_esbp)->esb_lock));			\
+									\
+		(_eop)->eo_u64[0] = bus_space_read_8((_esbp)->esb_tag,	\
+		    (_esbp)->esb_handle, (_offset));			\
+		(_eop)->eo_u64[1] = bus_space_read_8((_esbp)->esb_tag,	\
+		    (_esbp)->esb_handle, (_offset+8));			\
+									\
+		EFSYS_PROBE5(bar_reado, unsigned int, (_offset),	\
+		    uint32_t, (_eop)->eo_u32[3],			\
+		    uint32_t, (_eop)->eo_u32[2],			\
+		    uint32_t, (_eop)->eo_u32[1],			\
+		    uint32_t, (_eop)->eo_u32[0]);			\
+									\
+		_NOTE(CONSTANTCONDITION)				\
+		if (_lock)						\
+			mtx_unlock(&((_esbp)->esb_lock));		\
+	_NOTE(CONSTANTCONDITION)					\
+	} while (B_FALSE)
+
+#else
+#define	EFSYS_BAR_READQ(_esbp, _offset, _eqp)				\
+	do {								\
+		_NOTE(CONSTANTCONDITION)				\
+		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_qword_t)),	\
+		    ("not power of 2 aligned"));			\
+									\
+		mtx_lock(&((_esbp)->esb_lock));				\
+									\
 		(_eqp)->eq_u32[0] = bus_space_read_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset));			\
 		(_eqp)->eq_u32[1] = bus_space_read_4((_esbp)->esb_tag,	\
@@ -591,6 +734,7 @@
 			mtx_unlock(&((_esbp)->esb_lock));		\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
+#endif
 
 #define	EFSYS_BAR_WRITED(_esbp, _offset, _edp, _lock)			\
 	do {								\
@@ -614,6 +758,7 @@
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
+#if defined(__x86_64__)
 #define	EFSYS_BAR_WRITEQ(_esbp, _offset, _eqp)				\
 	do {								\
 		_NOTE(CONSTANTCONDITION)				\
@@ -626,6 +771,25 @@
 		    uint32_t, (_eqp)->eq_u32[1],			\
 		    uint32_t, (_eqp)->eq_u32[0]);			\
 									\
+		bus_space_write_8((_esbp)->esb_tag, (_esbp)->esb_handle,\
+		    (_offset), (_eqp)->eq_u64[0]);			\
+									\
+		mtx_unlock(&((_esbp)->esb_lock));			\
+	_NOTE(CONSTANTCONDITION)					\
+	} while (B_FALSE)
+#else
+#define	EFSYS_BAR_WRITEQ(_esbp, _offset, _eqp)				\
+	do {								\
+		_NOTE(CONSTANTCONDITION)				\
+		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_qword_t)),	\
+		    ("not power of 2 aligned"));			\
+									\
+		mtx_lock(&((_esbp)->esb_lock));				\
+									\
+		EFSYS_PROBE3(bar_writeq, unsigned int, (_offset),	\
+		    uint32_t, (_eqp)->eq_u32[1],			\
+		    uint32_t, (_eqp)->eq_u32[0]);			\
+									\
 		bus_space_write_4((_esbp)->esb_tag, (_esbp)->esb_handle,\
 		    (_offset), (_eqp)->eq_u32[0]);			\
 		bus_space_write_4((_esbp)->esb_tag, (_esbp)->esb_handle,\
@@ -634,7 +798,9 @@
 		mtx_unlock(&((_esbp)->esb_lock));			\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
+#endif
 
+#if defined(__x86_64__)
 #define	EFSYS_BAR_WRITEO(_esbp, _offset, _eop, _lock)			\
 	do {								\
 		_NOTE(CONSTANTCONDITION)				\
@@ -651,6 +817,34 @@
 		    uint32_t, (_eop)->eo_u32[1],			\
 		    uint32_t, (_eop)->eo_u32[0]);			\
 									\
+		bus_space_write_8((_esbp)->esb_tag, (_esbp)->esb_handle,\
+		    (_offset), (_eop)->eo_u64[0]);			\
+		bus_space_write_8((_esbp)->esb_tag, (_esbp)->esb_handle,\
+		    (_offset+8), (_eop)->eo_u64[1]);			\
+									\
+		_NOTE(CONSTANTCONDITION)				\
+		if (_lock)						\
+			mtx_unlock(&((_esbp)->esb_lock));		\
+	_NOTE(CONSTANTCONDITION)					\
+	} while (B_FALSE)
+
+#else
+#define	EFSYS_BAR_WRITEO(_esbp, _offset, _eop, _lock)			\
+	do {								\
+		_NOTE(CONSTANTCONDITION)				\
+		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_oword_t)),	\
+		    ("not power of 2 aligned"));			\
+									\
+		_NOTE(CONSTANTCONDITION)				\
+		if (_lock)						\
+			mtx_lock(&((_esbp)->esb_lock));			\
+									\
+		EFSYS_PROBE5(bar_writeo, unsigned int, (_offset),	\
+		    uint32_t, (_eop)->eo_u32[3],			\
+		    uint32_t, (_eop)->eo_u32[2],			\
+		    uint32_t, (_eop)->eo_u32[1],			\
+		    uint32_t, (_eop)->eo_u32[0]);			\
+									\
 		bus_space_write_4((_esbp)->esb_tag, (_esbp)->esb_handle,\
 		    (_offset), (_eop)->eo_u32[0]);			\
 		bus_space_write_4((_esbp)->esb_tag, (_esbp)->esb_handle,\
@@ -665,6 +859,7 @@
 			mtx_unlock(&((_esbp)->esb_lock));		\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
+#endif
 
 /* SPIN */
 

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="02-sfxge_ev_qpoll.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="02-sfxge_ev_qpoll.patch"

sfxge: Change the sfxge_ev_qpoll() prototype to avoid EVQ pointer array access

This was the only place on the data path where the sc->evq array was accessed.

Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/sfxge_intr.c
===================================================================
--- sys/dev/sfxge/sfxge_intr.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_intr.c	(working copy)
@@ -110,9 +110,8 @@
 sfxge_intr_line(void *arg)
 {
 	struct sfxge_evq *evq = arg;
-	struct sfxge_softc *sc = evq->sc;
 
-	(void)sfxge_ev_qpoll(sc, 0);
+	(void)sfxge_ev_qpoll(evq);
 }
 
 static void
@@ -146,7 +145,7 @@
 		return;
 	}
 
-	(void)sfxge_ev_qpoll(sc, index);
+	(void)sfxge_ev_qpoll(evq);
 }
 
 static int
Index: sys/dev/sfxge/sfxge.h
===================================================================
--- sys/dev/sfxge/sfxge.h	(revision 277726)
+++ sys/dev/sfxge/sfxge.h	(working copy)
@@ -281,7 +281,7 @@
 extern void sfxge_ev_fini(struct sfxge_softc *sc);
 extern int sfxge_ev_start(struct sfxge_softc *sc);
 extern void sfxge_ev_stop(struct sfxge_softc *sc);
-extern int sfxge_ev_qpoll(struct sfxge_softc *sc, unsigned int index);
+extern int sfxge_ev_qpoll(struct sfxge_evq *evq);
 
 /*
  * From sfxge_intr.c.
Index: sys/dev/sfxge/sfxge_ev.c
===================================================================
--- sys/dev/sfxge/sfxge_ev.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_ev.c	(working copy)
@@ -569,13 +569,10 @@
 
 
 int
-sfxge_ev_qpoll(struct sfxge_softc *sc, unsigned int index)
+sfxge_ev_qpoll(struct sfxge_evq *evq)
 {
-	struct sfxge_evq *evq;
 	int rc;
 
-	evq = sc->evq[index];
-
 	mtx_lock(&evq->lock);
 
 	if (evq->init_state != SFXGE_EVQ_STARTING &&

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="03-txq_next.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="03-txq_next.patch"

sfxge: Move the txq->next pointer to the part writable on the completion path

In fact, the pointer is used only when more than one TXQ is processed in
one interrupt, and it is used (read-write) on the completion path only.
Moving it also makes the first part of the structure smaller, so that it
now fits into one 128-byte cache line; as a result, the TXQ structure
becomes 128 bytes smaller.

Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/sfxge_tx.h
===================================================================
--- sys/dev/sfxge/sfxge_tx.h	(revision 277726)
+++ sys/dev/sfxge/sfxge_tx.h	(working copy)
@@ -139,7 +139,6 @@
 	bus_dma_tag_t			packet_dma_tag;
 	efx_buffer_t			*pend_desc;
 	efx_txq_t			*common;
-	struct sfxge_txq		*next;
 
 	efsys_mem_t			*tsoh_buffer;
 
@@ -173,6 +172,7 @@
 	 */
 	unsigned int			pending __aligned(CACHE_LINE_SIZE);
 	unsigned int			completed;
+	struct sfxge_txq		*next;
 };
 
 extern int sfxge_tx_packet_add(struct sfxge_txq *, struct mbuf *);

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="04-expect_init_state.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="04-expect_init_state.patch"

sfxge: Expect required init_state on data path and in periodic calls

Sponsored by:   Solarflare Communications, Inc.
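
A note for reviewers (not part of the patch): on FreeBSD,
__predict_false(exp) expands to __builtin_expect((exp), 0) in
<sys/cdefs.h>, i.e. a hint that the condition is normally false, so the
compiler can keep the rarely taken not-started/error paths out of the hot
code. A minimal sketch of the intended usage:

#include <sys/cdefs.h>	/* __predict_false() */

static int
check_started(int init_state, int started_state)
{
	if (__predict_false(init_state != started_state))
		return (-1);	/* rare path: queue or port not started */
	return (0);		/* common path */
}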

Index: sys/dev/sfxge/sfxge_port.c
===================================================================
--- sys/dev/sfxge/sfxge_port.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_port.c	(working copy)
@@ -50,7 +50,7 @@
 
 	mtx_lock(&port->lock);
 
-	if (port->init_state != SFXGE_PORT_STARTED) {
+	if (__predict_false(port->init_state != SFXGE_PORT_STARTED)) {
 		rc = 0;
 		goto out;
 	}
@@ -181,7 +181,7 @@
 
 		port->wanted_fc = fcntl;
 
-		if (port->init_state != SFXGE_PORT_STARTED)
+		if (__predict_false(port->init_state != SFXGE_PORT_STARTED))
 			goto out;
 
 		error = efx_mac_fcntl_set(sc->enp, port->wanted_fc, B_TRUE);
@@ -208,7 +208,8 @@
 	port = &sc->port;
 
 	mtx_lock(&port->lock);
-	if (port->init_state == SFXGE_PORT_STARTED && SFXGE_LINK_UP(sc))
+	if (__predict_true(port->init_state == SFXGE_PORT_STARTED) &&
+	    SFXGE_LINK_UP(sc))
 		efx_mac_fcntl_get(sc->enp, &wanted_fc, &link_fc);
 	else
 		link_fc = 0;
@@ -264,7 +265,7 @@
 
 	mtx_lock(&port->lock);
 
-	if (port->init_state != SFXGE_PORT_STARTED)
+	if (__predict_false(port->init_state != SFXGE_PORT_STARTED))
 		goto done;
 
 	/* This may sleep waiting for MCDI completion */
@@ -331,7 +332,7 @@
 	 * lock is held in sleeping thread. Both problems are repeatable
 	 * on LAG with LACP proto bring up.
 	 */
-	if (port->init_state == SFXGE_PORT_STARTED)
+	if (__predict_true(port->init_state == SFXGE_PORT_STARTED))
 		rc = sfxge_mac_filter_set_locked(sc);
 	else
 		rc = 0;
@@ -455,7 +456,7 @@
 
 	mtx_lock(&port->lock);
 
-	if (port->init_state != SFXGE_PORT_STARTED) {
+	if (__predict_false(port->init_state != SFXGE_PORT_STARTED)) {
 		rc = 0;
 		goto out;
 	}
Index: sys/dev/sfxge/sfxge_rx.c
===================================================================
--- sys/dev/sfxge/sfxge_rx.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_rx.c	(working copy)
@@ -209,7 +209,7 @@
 
 	mtx_assert(&evq->lock, MA_OWNED);
 
-	if (rxq->init_state != SFXGE_RXQ_STARTED)
+	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
 		return;
 
 	rxfill = rxq->added - rxq->completed;
@@ -269,7 +269,7 @@
 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
 {
 
-	if (rxq->init_state != SFXGE_RXQ_STARTED)
+	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
 		return;
 
 	/* Make sure the queue is full */
@@ -760,7 +760,7 @@
 		rx_desc = &rxq->queue[id];
 		m = rx_desc->mbuf;
 
-		if (rxq->init_state != SFXGE_RXQ_STARTED)
+		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
 			goto discard;
 
 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
Index: sys/dev/sfxge/sfxge_ev.c
===================================================================
--- sys/dev/sfxge/sfxge_ev.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_ev.c	(working copy)
@@ -99,7 +99,7 @@
 	KASSERT(evq->index == rxq->index,
 	    ("evq->index != rxq->index"));
 
-	if (rxq->init_state != SFXGE_RXQ_STARTED)
+	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
 		goto done;
 
 	expected = rxq->pending++ & rxq->ptr_mask;
@@ -244,7 +244,7 @@
 	KASSERT(evq->index == txq->evq_index,
 	    ("evq->index != txq->evq_index"));
 
-	if (txq->init_state != SFXGE_TXQ_STARTED)
+	if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED))
 		goto done;
 
 	stop = (id + 1) & txq->ptr_mask;
@@ -415,7 +415,7 @@
 
 	sx_xlock(&sc->softc_lock);
 
-	if (sc->evq[0]->init_state != SFXGE_EVQ_STARTED)
+	if (__predict_false(sc->evq[0]->init_state != SFXGE_EVQ_STARTED))
 		goto out;
 
 	now = ticks;
@@ -578,8 +578,8 @@
 
 	mtx_lock(&evq->lock);
 
-	if (evq->init_state != SFXGE_EVQ_STARTING &&
-	    evq->init_state != SFXGE_EVQ_STARTED) {
+	if (__predict_false(evq->init_state != SFXGE_EVQ_STARTING &&
+			    evq->init_state != SFXGE_EVQ_STARTED)) {
 		rc = EINVAL;
 		goto fail;
 	}
Index: sys/dev/sfxge/sfxge_tx.c
===================================================================
--- sys/dev/sfxge/sfxge_tx.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_tx.c	(working copy)
@@ -285,7 +285,7 @@
 	if (mbuf->m_pkthdr.csum_flags & CSUM_TSO)
 		prefetch_read_many(mbuf->m_data);
 
-	if (txq->init_state != SFXGE_TXQ_STARTED) {
+	if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED)) {
 		rc = EINTR;
 		goto reject;
 	}
@@ -1084,7 +1084,7 @@
 
 	mtx_assert(&evq->lock, MA_OWNED);
 
-	if (txq->init_state != SFXGE_TXQ_STARTED)
+	if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED))
 		return;
 
 	mtx_lock(SFXGE_TXQ_LOCK(txq));
Index: sys/dev/sfxge/sfxge_intr.c
===================================================================
--- sys/dev/sfxge/sfxge_intr.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_intr.c	(working copy)
@@ -135,7 +135,7 @@
 	KASSERT(intr->type == EFX_INTR_MESSAGE,
 	    ("intr->type != EFX_INTR_MESSAGE"));
 
-	if (intr->state != SFXGE_INTR_STARTED)
+	if (__predict_false(intr->state != SFXGE_INTR_STARTED))
 		return;
 
 	(void)efx_intr_status_message(enp, index, &fatal);

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="05-bus_stream.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="05-bus_stream.patch"

sfxge: Use the bus_space_*_stream_* API for better portability

Submitted by:   Artem V. Andreev <Artem.Andreev at oktetlabs.ru>
Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/common/efsys.h
===================================================================
--- sys/dev/sfxge/common/efsys.h
+++ sys/dev/sfxge/common/efsys.h
@@ -95,6 +95,13 @@
 #define	ISP2(x)			(((x) & ((x) - 1)) == 0)
 #endif
 
+#if defined(__x86_64__)
+#if !defined(bus_space_read_stream_8)
+#define bus_space_read_stream_8(t, h, o)        bus_space_read_8((t), (h), (o))
+#define bus_space_write_stream_8(t, h, o, v)    bus_space_write_8((t), (h), (o), (v))
+#endif
+#endif
+
 #define	ENOTACTIVE EINVAL
 
 /* Memory type to use on FreeBSD */
@@ -624,7 +631,7 @@
 		if (_lock)						\
 			mtx_lock(&((_esbp)->esb_lock));			\
 									\
-		(_edp)->ed_u32[0] = bus_space_read_4((_esbp)->esb_tag,	\
+		(_edp)->ed_u32[0] = bus_space_read_stream_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset));			\
 									\
 		EFSYS_PROBE2(bar_readd, unsigned int, (_offset),	\
@@ -645,7 +652,7 @@
 									\
 		mtx_lock(&((_esbp)->esb_lock));				\
 									\
-		(_eqp)->eq_u64[0] = bus_space_read_8((_esbp)->esb_tag,	\
+		(_eqp)->eq_u64[0] = bus_space_read_stream_8((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset));			\
 									\
 		EFSYS_PROBE3(bar_readq, unsigned int, (_offset),	\
@@ -666,9 +673,9 @@
 		if (_lock)						\
 			mtx_lock(&((_esbp)->esb_lock));			\
 									\
-		(_eop)->eo_u64[0] = bus_space_read_8((_esbp)->esb_tag,	\
+		(_eop)->eo_u64[0] = bus_space_read_stream_8((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset));			\
-		(_eop)->eo_u64[1] = bus_space_read_8((_esbp)->esb_tag,	\
+		(_eop)->eo_u64[1] = bus_space_read_stream_8((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset+8));			\
 									\
 		EFSYS_PROBE5(bar_reado, unsigned int, (_offset),	\
@@ -692,9 +699,9 @@
 									\
 		mtx_lock(&((_esbp)->esb_lock));				\
 									\
-		(_eqp)->eq_u32[0] = bus_space_read_4((_esbp)->esb_tag,	\
+		(_eqp)->eq_u32[0] = bus_space_read_stream_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset));			\
-		(_eqp)->eq_u32[1] = bus_space_read_4((_esbp)->esb_tag,	\
+		(_eqp)->eq_u32[1] = bus_space_read_stream_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset+4));			\
 									\
 		EFSYS_PROBE3(bar_readq, unsigned int, (_offset),	\
@@ -715,13 +722,13 @@
 		if (_lock)						\
 			mtx_lock(&((_esbp)->esb_lock));			\
 									\
-		(_eop)->eo_u32[0] = bus_space_read_4((_esbp)->esb_tag,	\
+		(_eop)->eo_u32[0] = bus_space_read_stream_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset));			\
-		(_eop)->eo_u32[1] = bus_space_read_4((_esbp)->esb_tag,	\
+		(_eop)->eo_u32[1] = bus_space_read_stream_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset+4));			\
-		(_eop)->eo_u32[2] = bus_space_read_4((_esbp)->esb_tag,	\
+		(_eop)->eo_u32[2] = bus_space_read_stream_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset+8));			\
-		(_eop)->eo_u32[3] = bus_space_read_4((_esbp)->esb_tag,	\
+		(_eop)->eo_u32[3] = bus_space_read_stream_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset+12));			\
 									\
 		EFSYS_PROBE5(bar_reado, unsigned int, (_offset),	\

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="06-no_evq_stats_build.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="06-no_evq_stats_build.patch"

sfxge: Make it possible to build without EVQ statistics

Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/sfxge.h
===================================================================
--- sys/dev/sfxge/sfxge.h	(revision 277726)
+++ sys/dev/sfxge/sfxge.h	(working copy)
@@ -225,8 +225,10 @@
 
 	struct sfxge_evq		*evq[SFXGE_RX_SCALE_MAX];
 	unsigned int			ev_moderation;
+#if EFSYS_OPT_QSTATS
 	clock_t				ev_stats_update_time;
 	uint64_t			ev_stats[EV_NQSTATS];
+#endif
 
 	uma_zone_t			rxq_cache;
 	struct sfxge_rxq		*rxq[SFXGE_RX_SCALE_MAX];
Index: sys/dev/sfxge/sfxge_ev.c
===================================================================
--- sys/dev/sfxge/sfxge_ev.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_ev.c	(working copy)
@@ -406,6 +406,8 @@
 	return (B_FALSE);
 }
 
+#if EFSYS_OPT_QSTATS
+
 static void
 sfxge_ev_stat_update(struct sfxge_softc *sc)
 {
@@ -467,6 +469,8 @@
 	}
 }
 
+#endif /* EFSYS_OPT_QSTATS */
+
 static void
 sfxge_ev_qmoderate(struct sfxge_softc *sc, unsigned int idx, unsigned int us)
 {
@@ -630,8 +634,10 @@
 	evq->read_ptr = 0;
 	evq->exception = B_FALSE;
 
+#if EFSYS_OPT_QSTATS
 	/* Add event counts before discarding the common evq state */
 	efx_ev_qstats_update(evq->common, sc->ev_stats);
+#endif
 
 	efx_ev_qdestroy(evq->common);
 	efx_sram_buf_tbl_clear(sc->enp, evq->buf_base_id,
@@ -886,7 +892,9 @@
 			goto fail;
 	}
 
+#if EFSYS_OPT_QSTATS
 	sfxge_ev_stat_init(sc);
+#endif
 
 	return (0);
 
Index: sys/dev/sfxge/common/efx_tx.c
===================================================================
--- sys/dev/sfxge/common/efx_tx.c	(revision 277726)
+++ sys/dev/sfxge/common/efx_tx.c	(working copy)
@@ -358,6 +358,7 @@
 	return (rc);
 }
 
+#if EFSYS_OPT_QSTATS
 #if EFSYS_OPT_NAMES
 /* START MKCONFIG GENERATED EfxTransmitQueueStatNamesBlock 78ca9ab00287fffb */
 static const char 	__cs * __cs __efx_tx_qstat_name[] = {
@@ -378,6 +379,7 @@
 	return (__efx_tx_qstat_name[id]);
 }
 #endif	/* EFSYS_OPT_NAMES */
+#endif	/* EFSYS_OPT_QSTATS */
 
 #if EFSYS_OPT_QSTATS
 					void
Index: sys/dev/sfxge/common/efx_ev.c
===================================================================
--- sys/dev/sfxge/common/efx_ev.c	(revision 277726)
+++ sys/dev/sfxge/common/efx_ev.c	(working copy)
@@ -995,6 +995,7 @@
 	return (rc);
 }
 
+#if EFSYS_OPT_QSTATS
 #if EFSYS_OPT_NAMES
 /* START MKCONFIG GENERATED EfxEventQueueStatNamesBlock 67e9bdcd920059bd */
 static const char 	__cs * __cs __efx_ev_qstat_name[] = {
@@ -1052,6 +1053,7 @@
 	return (__efx_ev_qstat_name[id]);
 }
 #endif	/* EFSYS_OPT_NAMES */
+#endif	/* EFSYS_OPT_QSTATS */
 
 #if EFSYS_OPT_QSTATS
 					void

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="07-sfxge_evq.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="07-sfxge_evq.patch"

sfxge: Remove extra cache-line alignment and reorder sfxge_evq_t

Remove the alignment of the first member to a cache line, since it is a
no-op.
Use __aligned() for the whole structure instead, to make sure that the
structure size is cache-line aligned.
Remove the lock alignment to make the structure smaller and fit all members
used during event queue processing into one cache line (128 bytes) on
x86-64. The lock is also taken from a different context when event queue
statistics are retrieved via sysctl, but that is infrequent.
Reorder the members to avoid padding and to follow usage order during event
processing. As a result, all structure members used during event queue
processing now fit into exactly one cache line (128 bytes).

Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/sfxge.h
===================================================================
--- sys/dev/sfxge/sfxge.h	(revision 277726)
+++ sys/dev/sfxge/sfxge.h	(working copy)
@@ -103,27 +103,27 @@
 #define	SFXGE_EV_BATCH	16384
 
 struct sfxge_evq {
-	struct sfxge_softc	*sc  __aligned(CACHE_LINE_SIZE);
-	struct mtx		lock __aligned(CACHE_LINE_SIZE);
-
+	/* Structure members below are sorted by usage order */
+	struct sfxge_softc	*sc;
+	struct mtx		lock;
+	unsigned int		index;
 	enum sfxge_evq_state	init_state;
-	unsigned int		index;
-	unsigned int		entries;
 	efsys_mem_t		mem;
-	unsigned int		buf_base_id;
-
-	boolean_t		exception;
-
 	efx_evq_t		*common;
 	unsigned int		read_ptr;
+	boolean_t		exception;
 	unsigned int		rx_done;
 	unsigned int		tx_done;
 
 	/* Linked list of TX queues with completions to process */
 	struct sfxge_txq	*txq;
 	struct sfxge_txq	**txqs;
-};
 
+	/* Structure members not used on event processing path */
+	unsigned int		buf_base_id;
+	unsigned int		entries;
+} __aligned(CACHE_LINE_SIZE);
+
 #define	SFXGE_NDESCS	1024
 #define	SFXGE_MODERATION	30
 

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="08-tso_vs_vlan.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="08-tso_vs_vlan.patch"

sfxge: fix TSO code to cope with VLAN headers

With a VLAN header present, the protocol headers are not guaranteed to be
linear in the head mbuf or to end exactly at the first DMA segment, so copy
them out with m_copydata() and skip any leading header-only DMA segments.

Submitted by:   Artem V. Andreev <Artem.Andreev at oktetlabs.ru>
Sponsored by:   Solarflare Communications, Inc.
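
To illustrate the new payload-location logic (a plain C sketch with
simplified types, not the driver code):

#include <assert.h>
#include <stddef.h>

struct seg {
	size_t	len;	/* segment length */
	size_t	addr;	/* segment start address (illustrative) */
};

/*
 * Skip any leading DMA segments that contain only packet headers, then
 * report how much payload remains in the current segment and where it
 * starts.  This mirrors the loop added to the TSO path so that headers
 * spanning several segments (e.g. with a VLAN header) are handled.
 */
static const struct seg *
skip_headers(const struct seg *seg, size_t n_seg, size_t header_len,
    size_t *in_len, size_t *payload_addr)
{
	size_t skipped = 0;

	while (seg->len + skipped <= header_len) {
		skipped += seg->len;
		--n_seg;
		assert(n_seg > 0);	/* a TSO packet must carry payload */
		++seg;
	}
	*in_len = seg->len - (header_len - skipped);
	*payload_addr = seg->addr + (header_len - skipped);
	return (seg);
}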

Index: sys/dev/sfxge/sfxge_tx.c
===================================================================
--- sys/dev/sfxge/sfxge_tx.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_tx.c	(working copy)
@@ -855,9 +855,7 @@
 		tso->tcph_off = tso->nh_off + sizeof(struct ip6_hdr);
 	}
 
-	/* We assume all headers are linear in the head mbuf */
 	tso->header_len = tso->tcph_off + 4 * tso_tcph(tso)->th_off;
-	KASSERT(tso->header_len <= mbuf->m_len, ("packet headers fragmented"));
 	tso->full_packet_size = tso->header_len + mbuf->m_pkthdr.tso_segsz;
 
 	tso->seqnum = ntohl(tso_tcph(tso)->th_seq);
@@ -972,7 +970,7 @@
 	tsoh_th = (struct tcphdr *)(header + tso->tcph_off);
 
 	/* Copy and update the headers. */
-	memcpy(header, tso->mbuf->m_data, tso->header_len);
+	m_copydata(tso->mbuf, 0, tso->header_len, header);
 
 	tsoh_th->th_seq = htonl(tso->seqnum);
 	tso->seqnum += tso->mbuf->m_pkthdr.tso_segsz;
@@ -1018,20 +1016,18 @@
 {
 	struct sfxge_tso_state tso;
 	unsigned int id, next_id;
+	unsigned skipped = 0;
 
 	tso_start(&tso, mbuf);
 
-	/* Grab the first payload fragment. */
-	if (dma_seg->ds_len == tso.header_len) {
+	while (dma_seg->ds_len + skipped <= tso.header_len) {
+		skipped += dma_seg->ds_len;
 		--n_dma_seg;
 		KASSERT(n_dma_seg, ("no payload found in TSO packet"));
 		++dma_seg;
-		tso.in_len = dma_seg->ds_len;
-		tso.dma_addr = dma_seg->ds_addr;
-	} else {
-		tso.in_len = dma_seg->ds_len - tso.header_len;
-		tso.dma_addr = dma_seg->ds_addr + tso.header_len;
 	}
+	tso.in_len = dma_seg->ds_len - (tso.header_len - skipped);
+	tso.dma_addr = dma_seg->ds_addr + (tso.header_len - skipped);
 
 	id = txq->added & txq->ptr_mask;
 	if (__predict_false(tso_start_new_packet(txq, &tso, id)))

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="09-sfxge_tx_qcomplete.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="09-sfxge_tx_qcomplete.patch"

sfxge: Add evq argument to sfxge_tx_qcomplete()

This removes the need to look up the evq pointer by its index in the
software context.

Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/sfxge_tx.h
===================================================================
--- sys/dev/sfxge/sfxge_tx.h	(revision 277726)
+++ sys/dev/sfxge/sfxge_tx.h	(working copy)
@@ -175,13 +175,15 @@
 	unsigned int			completed;
 };
 
+struct sfxge_evq;
+
 extern int sfxge_tx_packet_add(struct sfxge_txq *, struct mbuf *);
 
 extern int sfxge_tx_init(struct sfxge_softc *sc);
 extern void sfxge_tx_fini(struct sfxge_softc *sc);
 extern int sfxge_tx_start(struct sfxge_softc *sc);
 extern void sfxge_tx_stop(struct sfxge_softc *sc);
-extern void sfxge_tx_qcomplete(struct sfxge_txq *txq);
+extern void sfxge_tx_qcomplete(struct sfxge_txq *txq, struct sfxge_evq *evq);
 extern void sfxge_tx_qflush_done(struct sfxge_txq *txq);
 #ifdef SFXGE_HAVE_MQ
 extern void sfxge_if_qflush(struct ifnet *ifp);
Index: sys/dev/sfxge/sfxge_ev.c
===================================================================
--- sys/dev/sfxge/sfxge_ev.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_ev.c	(working copy)
@@ -68,7 +68,7 @@
 			    ("txq->evq_index != index"));
 
 			if (txq->pending != txq->completed)
-				sfxge_tx_qcomplete(txq);
+				sfxge_tx_qcomplete(txq, evq);
 
 			txq = next;
 		} while (txq != NULL);
@@ -262,7 +262,7 @@
 	}
 
 	if (txq->pending - txq->completed >= SFXGE_TX_BATCH)
-		sfxge_tx_qcomplete(txq);
+		sfxge_tx_qcomplete(txq, evq);
 
 done:
 	return (evq->tx_done >= SFXGE_EV_BATCH);
Index: sys/dev/sfxge/sfxge_tx.c
===================================================================
--- sys/dev/sfxge/sfxge_tx.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_tx.c	(working copy)
@@ -105,15 +105,10 @@
 			      const bus_dma_segment_t *dma_seg, int n_dma_seg);
 
 void
-sfxge_tx_qcomplete(struct sfxge_txq *txq)
+sfxge_tx_qcomplete(struct sfxge_txq *txq, struct sfxge_evq *evq)
 {
-	struct sfxge_softc *sc;
-	struct sfxge_evq *evq;
 	unsigned int completed;
 
-	sc = txq->sc;
-	evq = sc->evq[txq->evq_index];
-
 	mtx_assert(&evq->lock, MA_OWNED);
 
 	completed = txq->completed;
@@ -1151,7 +1146,7 @@
 	txq->blocked = 0;
 	txq->pending = txq->added;
 
-	sfxge_tx_qcomplete(txq);
+	sfxge_tx_qcomplete(txq, evq);
 	KASSERT(txq->completed == txq->added,
 	    ("txq->completed != txq->added"));
 

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="10-no_extra_bzero.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="10-no_extra_bzero.patch"

sfxge: Do not bzero() DMA-allocated memory again

sfxge_dma_alloc() calls bus_dmamem_alloc() with the BUS_DMA_ZERO flag, so
the allocated memory is already zero-filled.

Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/sfxge_port.c
===================================================================
--- sys/dev/sfxge/sfxge_port.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_port.c	(working copy)
@@ -583,7 +583,6 @@
 					    M_SFXGE, M_WAITOK | M_ZERO);
 	if ((rc = sfxge_dma_alloc(sc, EFX_PHY_STATS_SIZE, phy_stats_buf)) != 0)
 		goto fail;
-	bzero(phy_stats_buf->esm_base, phy_stats_buf->esm_size);
 	sfxge_phy_stat_init(sc);
 
 	sysctl_ctx = device_get_sysctl_ctx(sc->dev);
@@ -605,7 +604,6 @@
 					    M_SFXGE, M_WAITOK | M_ZERO);
 	if ((rc = sfxge_dma_alloc(sc, EFX_MAC_STATS_SIZE, mac_stats_buf)) != 0)
 		goto fail2;
-	bzero(mac_stats_buf->esm_base, mac_stats_buf->esm_size);
 	sfxge_mac_stat_init(sc);
 
 	port->init_state = SFXGE_PORT_INITIALIZED;

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="11-no_esm_size.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="11-no_esm_size.patch"

sfxge: Remove unused esm_size member of the efsys_mem_t structure

esm_size is not even initialized properly when memory is allocated.

Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/common/efsys.h
===================================================================
--- sys/dev/sfxge/common/efsys.h	(revision 277726)
+++ sys/dev/sfxge/common/efsys.h	(working copy)
@@ -370,7 +370,6 @@
 	bus_dmamap_t		esm_map;
 	caddr_t			esm_base;
 	efsys_dma_addr_t	esm_addr;
-	size_t			esm_size;
 } efsys_mem_t;
 
 

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="12-free_vaddr.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="12-free_vaddr.patch"

sfxge: Pass the correct address to free allocated memory in the case of a load error

Most likely it was just a memory leak on the error handling path, since
the efsys_mem_t is typically zero-filled on allocation.

Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/sfxge_dma.c
===================================================================
--- sys/dev/sfxge/sfxge_dma.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_dma.c	(working copy)
@@ -157,7 +157,7 @@
 	if (bus_dmamap_load(esmp->esm_tag, esmp->esm_map, vaddr, len,
 	    sfxge_dma_cb, &esmp->esm_addr, 0) != 0) {
 		device_printf(sc->dev, "Couldn't load DMA mapping\n");
-		bus_dmamem_free(esmp->esm_tag, esmp->esm_base, esmp->esm_map);
+		bus_dmamem_free(esmp->esm_tag, vaddr, esmp->esm_map);
 		bus_dma_tag_destroy(esmp->esm_tag);
 		return (ENOMEM);
 	}

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="13-evq_moderation_init.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="13-evq_moderation_init.patch"

sfxge: Use SFXGE_MODERATION to initialize event moderation

Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/sfxge_ev.c
===================================================================
--- sys/dev/sfxge/sfxge_ev.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_ev.c	(working copy)
@@ -872,7 +872,7 @@
 	/* Set default interrupt moderation; add a sysctl to
 	 * read and change it.
 	 */
-	sc->ev_moderation = 30;
+	sc->ev_moderation = SFXGE_MODERATION;
 	SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 			OID_AUTO, "int_mod", CTLTYPE_UINT|CTLFLAG_RW,
 			sc, 0, sfxge_int_mod_handler, "IU",

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="14-max_rss_channels.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="14-max_rss_channels.patch"

sfxge: implement a parameter to restrict the number of RSS channels

Submitted by:   Artem V. Andreev <Artem.Andreev at oktetlabs.ru>
Sponsored by:   Solarflare Communications, Inc.
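
As a usage illustration (unit number 0 assumed; adjust to the actual
adapter), the loader tunable documented in the man page hunk below could
be set in /boot/loader.conf:

hw.sfxge.0.max_rss_channels="4"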

Index: sys/dev/sfxge/sfxge.h
===================================================================
--- sys/dev/sfxge/sfxge.h	(revision 277726)
+++ sys/dev/sfxge/sfxge.h	(working copy)
@@ -228,6 +228,7 @@
 	clock_t				ev_stats_update_time;
 	uint64_t			ev_stats[EV_NQSTATS];
 
+	unsigned int			max_rss_channels;
 	uma_zone_t			rxq_cache;
 	struct sfxge_rxq		*rxq[SFXGE_RX_SCALE_MAX];
 	unsigned int			rx_indir_table[SFXGE_RX_SCALE_MAX];
Index: sys/dev/sfxge/sfxge_intr.c
===================================================================
--- sys/dev/sfxge/sfxge_intr.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_intr.c	(working copy)
@@ -303,6 +303,9 @@
 	if (count > EFX_MAXRSS)
 		count = EFX_MAXRSS;
 
+	if (sc->max_rss_channels > 0 && count > sc->max_rss_channels)
+		count = sc->max_rss_channels;
+
 	rid = PCIR_BAR(4);
 	resp = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
 	if (resp == NULL)
Index: sys/dev/sfxge/sfxge.c
===================================================================
--- sys/dev/sfxge/sfxge.c	(revision 277726)
+++ sys/dev/sfxge/sfxge.c	(working copy)
@@ -397,11 +397,18 @@
 	device_t dev;
 	efx_nic_t *enp;
 	int error;
+	char rss_param_name[sizeof(SFXGE_PARAM(%d.max_rss_channels))];
 
 	dev = sc->dev;
 
 	sx_init(&sc->softc_lock, "sfxge_softc");
 
+	sc->max_rss_channels = 0;
+	snprintf(rss_param_name, sizeof(rss_param_name),
+		 SFXGE_PARAM(%d.max_rss_channels),
+		 (int)device_get_unit(dev));
+	TUNABLE_INT_FETCH(rss_param_name, &sc->max_rss_channels);
+
 	sc->stats_node = SYSCTL_ADD_NODE(
 		device_get_sysctl_ctx(dev),
 		SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
Index: share/man/man4/sfxge.4
===================================================================
--- share/man/man4/sfxge.4	(revision 277726)
+++ share/man/man4/sfxge.4	(working copy)
@@ -108,6 +108,10 @@
 .Va tx_early_drops
 counter is incremented and the local sender receives ENOBUFS.
 The value must be greater than or equal to 0.
+.It Va hw.sfxge.N.max_rss_channels
+The maximum number of allocated RSS channels for the Nth adapter.
+If set to 0 or unset, the number of channels is determined by the number
+of CPU cores.
 .El
 .Sh SUPPORT
 For general information and support,

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="15-stats_port_lock.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="15-stats_port_lock.patch"

sfxge: access statistics buffers under port lock

Allow access to the statistics data from contexts other than sysctl handlers.

Submitted by:   Boris Misenov <Boris.Misenov at oktetlabs.ru>
Sponsored by:   Solarflare Communications, Inc.

Index: sys/dev/sfxge/sfxge_port.c
===================================================================
--- sys/dev/sfxge/sfxge_port.c	(revision 277726)
+++ sys/dev/sfxge/sfxge_port.c	(working copy)
@@ -48,7 +48,7 @@
 	unsigned int count;
 	int rc;
 
-	mtx_lock(&port->lock);
+	mtx_assert(&port->lock, MA_OWNED);
 
 	if (port->init_state != SFXGE_PORT_STARTED) {
 		rc = 0;
@@ -82,7 +82,6 @@
 
 	rc = ETIMEDOUT;
 out:
-	mtx_unlock(&port->lock);
 	return (rc);
 }
 
@@ -93,12 +92,16 @@
 	unsigned int id = arg2;
 	int rc;
 
+	mtx_lock(&sc->port.lock);
 	if ((rc = sfxge_mac_stat_update(sc)) != 0)
-		return (rc);
+		goto out;
 
-	return (SYSCTL_OUT(req,
-			  (uint64_t *)sc->port.mac_stats.decode_buf + id,
-			  sizeof(uint64_t)));
+	rc = SYSCTL_OUT(req,
+			(uint64_t *)sc->port.mac_stats.decode_buf + id,
+			sizeof(uint64_t));
+out:
+	mtx_unlock(&sc->port.lock);
+	return (rc);
 }
 
 static void
@@ -453,7 +456,7 @@
 	unsigned int count;
 	int rc;
 
-	mtx_lock(&port->lock);
+	mtx_assert(&port->lock, MA_OWNED);
 
 	if (port->init_state != SFXGE_PORT_STARTED) {
 		rc = 0;
@@ -487,7 +490,6 @@
 
 	rc = ETIMEDOUT;
 out:
-	mtx_unlock(&port->lock);
 	return (rc);
 }
 
@@ -498,12 +500,16 @@
 	unsigned int id = arg2;
 	int rc;
 
+	mtx_lock(&sc->port.lock);
 	if ((rc = sfxge_phy_stat_update(sc)) != 0)
-		return (rc);
+		goto out;
 
-	return (SYSCTL_OUT(req,
-			  (uint32_t *)sc->port.phy_stats.decode_buf + id,
-			  sizeof(uint32_t)));
+	rc = SYSCTL_OUT(req,
+			(uint32_t *)sc->port.phy_stats.decode_buf + id,
+			sizeof(uint32_t));
+out:
+	mtx_unlock(&sc->port.lock);
+	return (rc);
 }
 
 static void

--------------050201000503010305030302
Content-Type: text/x-patch;
 name="17-non_tcp_get_list.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="17-non_tcp_get_list.patch"

sfxge: Separate software Tx queue limit for non-TCP traffic

Add a separate software Tx queue limit for non-TCP traffic to make the
total limit higher and avoid local drops of TCP packets caused by the
lack of backpressure.
There is no point in making the non-TCP limit high, since without
backpressure a UDP stream easily overflows any sensible limit.

Split the early drops statistics, since it is better to have a separate
counter for each drop reason to make them unambiguous.

Add a software Tx queue high watermark. This information is very useful
for understanding how big the queues grow under traffic load.

Sponsored by:   Solarflare Communications, Inc.
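
For illustration only (the sysctl names are taken from the patch, but the
exact OID prefix depends on the device unit and Tx queue index), the new
tunable and the per-queue counters added below could be examined like this:

# /boot/loader.conf
hw.sfxge.tx_dpl_get_non_tcp_max="2048"

# at run time
sysctl dev.sfxge.0.txq.0.dpl_get_non_tcp_count
sysctl dev.sfxge.0.txq.0.dpl_get_hiwat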

Index: share/man/man4/sfxge.4
===================================================================
--- share/man/man4/sfxge.4	(revision 277737)
+++ share/man/man4/sfxge.4	(working copy)
@@ -93,19 +93,27 @@
 .It Va hw.sfxge.tx_dpl_get_max
 The maximum length of the deferred packet
 .Dq get-list
-for queued transmit
-packets, used only if the transmit queue lock can be acquired.
+for queued transmit packets (TCP and non-TCP), used only if the transmit
+queue lock can be acquired.
 If a packet is dropped, the
-.Va tx_early_drops
+.Va tx_get_overflow
 counter is incremented and the local sender receives ENOBUFS.
 The value must be greater than 0.
+.It Va hw.sfxge.tx_dpl_get_non_tcp_max
+The maximum number of non-TCP packets in the deferred packet
+.Dq get-list
+, used only if the transmit queue lock can be acquired.
+If a packet is dropped, the
+.Va tx_get_non_tcp_overflow
+counter is incremented and the local sender receives ENOBUFS.
+The value must be greater than 0.
 .It Va hw.sfxge.tx_dpl_put_max
 The maximum length of the deferred packet
 .Dq put-list
 for queued transmit
 packets, used if the transmit queue lock cannot be acquired.
 If a packet is dropped, the
-.Va tx_early_drops
+.Va tx_put_overflow
 counter is incremented and the local sender receives ENOBUFS.
 The value must be greater than or equal to 0.
 .El
Index: sys/dev/sfxge/sfxge_tx.c
===================================================================
--- sys/dev/sfxge/sfxge_tx.c	(revision 277737)
+++ sys/dev/sfxge/sfxge_tx.c	(working copy)
@@ -85,14 +85,23 @@
 TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_MAX, &sfxge_tx_dpl_get_max);
 SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_max, CTLFLAG_RDTUN,
 	   &sfxge_tx_dpl_get_max, 0,
-	   "Maximum number of packets in deferred packet get-list");
+	   "Maximum number of any packets in deferred packet get-list");
 
+#define	SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX \
+	SFXGE_PARAM(tx_dpl_get_non_tcp_max)
+static int sfxge_tx_dpl_get_non_tcp_max =
+	SFXGE_TX_DPL_GET_NON_TCP_PKT_LIMIT_DEFAULT;
+TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX, &sfxge_tx_dpl_get_non_tcp_max);
+SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_non_tcp_max, CTLFLAG_RDTUN,
+	   &sfxge_tx_dpl_get_non_tcp_max, 0,
+	   "Maximum number of non-TCP packets in deferred packet get-list");
+
 #define	SFXGE_PARAM_TX_DPL_PUT_MAX	SFXGE_PARAM(tx_dpl_put_max)
 static int sfxge_tx_dpl_put_max = SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT;
 TUNABLE_INT(SFXGE_PARAM_TX_DPL_PUT_MAX, &sfxge_tx_dpl_put_max);
 SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_put_max, CTLFLAG_RDTUN,
 	   &sfxge_tx_dpl_put_max, 0,
-	   "Maximum number of packets in deferred packet put-list");
+	   "Maximum number of any packets in deferred packet put-list");
 
 #endif
 
@@ -152,6 +161,15 @@
 
 #ifdef SFXGE_HAVE_MQ
 
+static inline unsigned int
+sfxge_is_mbuf_non_tcp(struct mbuf *mbuf)
+{
+	/* Absence of TCP checksum flags does not mean that it is non-TCP
+	 * but it should be true if the user wants to achieve high throughput.
+	 */
+	return (!(mbuf->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)));
+}
+
 /*
  * Reorder the put list and append it to the get list.
  */
@@ -163,6 +181,7 @@
 	volatile uintptr_t *putp;
 	uintptr_t put;
 	unsigned int count;
+	unsigned int non_tcp_count;
 
 	mtx_assert(&txq->lock, MA_OWNED);
 
@@ -181,9 +200,11 @@
 	get_next = NULL;
 
 	count = 0;
+	non_tcp_count = 0;
 	do {
 		struct mbuf *put_next;
 
+		non_tcp_count += sfxge_is_mbuf_non_tcp(mbuf);
 		put_next = mbuf->m_nextpkt;
 		mbuf->m_nextpkt = get_next;
 		get_next = mbuf;
@@ -197,6 +218,7 @@
 	*stdp->std_getp = get_next;
 	stdp->std_getp = get_tailp;
 	stdp->std_get_count += count;
+	stdp->std_get_non_tcp_count += non_tcp_count;
 }
 
 #endif /* SFXGE_HAVE_MQ */
@@ -387,6 +409,7 @@
 	struct sfxge_tx_dpl *stdp;
 	struct mbuf *mbuf, *next;
 	unsigned int count;
+	unsigned int non_tcp_count;
 	unsigned int pushed;
 	int rc;
 
@@ -401,7 +424,11 @@
 
 	mbuf = stdp->std_get;
 	count = stdp->std_get_count;
+	non_tcp_count = stdp->std_get_non_tcp_count;
 
+	if (count > stdp->std_get_hiwat)
+		stdp->std_get_hiwat = count;
+
 	while (count != 0) {
 		KASSERT(mbuf != NULL, ("mbuf == NULL"));
 
@@ -415,6 +442,7 @@
 
 		rc = sfxge_tx_queue_mbuf(txq, mbuf);
 		--count;
+		non_tcp_count -= sfxge_is_mbuf_non_tcp(mbuf);
 		mbuf = next;
 		if (rc != 0)
 			continue;
@@ -431,12 +459,16 @@
 
 	if (count == 0) {
 		KASSERT(mbuf == NULL, ("mbuf != NULL"));
+		KASSERT(non_tcp_count == 0,
+			("inconsistent TCP/non-TCP detection"));
 		stdp->std_get = NULL;
 		stdp->std_get_count = 0;
+		stdp->std_get_non_tcp_count = 0;
 		stdp->std_getp = &stdp->std_get;
 	} else {
 		stdp->std_get = mbuf;
 		stdp->std_get_count = count;
+		stdp->std_get_non_tcp_count = non_tcp_count;
 	}
 
 	if (txq->added != pushed)
@@ -496,8 +528,18 @@
 
 		sfxge_tx_qdpl_swizzle(txq);
 
-		if (stdp->std_get_count >= stdp->std_get_max)
+		if (stdp->std_get_count >= stdp->std_get_max) {
+			txq->get_overflow++;
 			return (ENOBUFS);
+		}
+		if (sfxge_is_mbuf_non_tcp(mbuf)) {
+			if (stdp->std_get_non_tcp_count >=
+			    stdp->std_get_non_tcp_max) {
+				txq->get_non_tcp_overflow++;
+				return (ENOBUFS);
+			}
+			stdp->std_get_non_tcp_count++;
+		}
 
 		*(stdp->std_getp) = mbuf;
 		stdp->std_getp = &mbuf->m_nextpkt;
@@ -518,8 +560,10 @@
 				old_len = mp->m_pkthdr.csum_data;
 			} else
 				old_len = 0;
-			if (old_len >= stdp->std_put_max)
+			if (old_len >= stdp->std_put_max) {
+				atomic_add_long(&txq->put_overflow, 1);
 				return (ENOBUFS);
+			}
 			mbuf->m_pkthdr.csum_data = old_len + 1;
 			mbuf->m_nextpkt = (void *)old;
 		} while (atomic_cmpset_ptr(putp, old, new) == 0);
@@ -540,6 +584,7 @@
 
 	if (!SFXGE_LINK_UP(txq->sc)) {
 		rc = ENETDOWN;
+		atomic_add_long(&txq->netdown_drops, 1);
 		goto fail;
 	}
 
@@ -577,7 +622,6 @@
 
 fail:
 	m_freem(m);
-	atomic_add_long(&txq->early_drops, 1);
 	return (rc);
 }
 
@@ -596,6 +640,7 @@
 	}
 	stdp->std_get = NULL;
 	stdp->std_get_count = 0;
+	stdp->std_get_non_tcp_count = 0;
 	stdp->std_getp = &stdp->std_get;
 
 	mtx_unlock(&txq->lock);
@@ -1411,6 +1456,13 @@
 		rc = EINVAL;
 		goto fail_tx_dpl_get_max;
 	}
+	if (sfxge_tx_dpl_get_non_tcp_max <= 0) {
+		log(LOG_ERR, "%s=%d must be greater than 0",
+		    SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX,
+		    sfxge_tx_dpl_get_non_tcp_max);
+		rc = EINVAL;
+		goto fail_tx_dpl_get_max;
+	}
 	if (sfxge_tx_dpl_put_max < 0) {
 		log(LOG_ERR, "%s=%d must be greater or equal to 0",
 		    SFXGE_PARAM_TX_DPL_PUT_MAX, sfxge_tx_dpl_put_max);
@@ -1422,6 +1474,7 @@
 	stdp = &txq->dpl;
 	stdp->std_put_max = sfxge_tx_dpl_put_max;
 	stdp->std_get_max = sfxge_tx_dpl_get_max;
+	stdp->std_get_non_tcp_max = sfxge_tx_dpl_get_non_tcp_max;
 	stdp->std_getp = &stdp->std_get;
 
 	mtx_init(&txq->lock, "txq", NULL, MTX_DEF);
@@ -1430,6 +1483,14 @@
 			SYSCTL_CHILDREN(txq_node), OID_AUTO,
 			"dpl_get_count", CTLFLAG_RD | CTLFLAG_STATS,
 			&stdp->std_get_count, 0, "");
+	SYSCTL_ADD_UINT(device_get_sysctl_ctx(sc->dev),
+			SYSCTL_CHILDREN(txq_node), OID_AUTO,
+			"dpl_get_non_tcp_count", CTLFLAG_RD | CTLFLAG_STATS,
+			&stdp->std_get_non_tcp_count, 0, "");
+	SYSCTL_ADD_UINT(device_get_sysctl_ctx(sc->dev),
+			SYSCTL_CHILDREN(txq_node), OID_AUTO,
+			"dpl_get_hiwat", CTLFLAG_RD | CTLFLAG_STATS,
+			&stdp->std_get_hiwat, 0, "");
 #endif
 
 	txq->type = type;
@@ -1467,7 +1528,10 @@
 	SFXGE_TX_STAT(tso_long_headers, tso_long_headers),
 	SFXGE_TX_STAT(tx_collapses, collapses),
 	SFXGE_TX_STAT(tx_drops, drops),
-	SFXGE_TX_STAT(tx_early_drops, early_drops),
+	SFXGE_TX_STAT(tx_get_overflow, get_overflow),
+	SFXGE_TX_STAT(tx_get_non_tcp_overflow, get_non_tcp_overflow),
+	SFXGE_TX_STAT(tx_put_overflow, put_overflow),
+	SFXGE_TX_STAT(tx_netdown_drops, netdown_drops),
 };
 
 static int
Index: sys/dev/sfxge/sfxge_tx.h
===================================================================
--- sys/dev/sfxge/sfxge_tx.h	(revision 277737)
+++ sys/dev/sfxge/sfxge_tx.h	(working copy)
@@ -75,21 +75,29 @@
 	enum sfxge_tx_buf_flags	flags;
 };
 
-#define	SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT	1024
-#define	SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT	64
+#define	SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT		(64 * 1024)
+#define	SFXGE_TX_DPL_GET_NON_TCP_PKT_LIMIT_DEFAULT	1024
+#define	SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT		64
 
 /*
  * Deferred packet list.
  */
 struct sfxge_tx_dpl {
-	unsigned int		std_get_max;	/* Maximum number of packets
+	unsigned int	std_get_max;		/* Maximum number  of packets
 						 * in get list */
-	unsigned int		std_put_max;	/* Maximum number of packets
+	unsigned int	std_get_non_tcp_max;	/* Maximum number
+						 * of non-TCP packets
+						 * in get list */
+	unsigned int	std_put_max;		/* Maximum number of packets
 						 * in put list */
-	uintptr_t		std_put;	/* Head of put list. */
-	struct mbuf		*std_get;	/* Head of get list. */
-	struct mbuf		**std_getp;	/* Tail of get list. */
-	unsigned int		std_get_count;	/* Packets in get list. */
+	uintptr_t	std_put;		/* Head of put list. */
+	struct mbuf	*std_get;		/* Head of get list. */
+	struct mbuf	**std_getp;		/* Tail of get list. */
+	unsigned int	std_get_count;		/* Packets in get list. */
+	unsigned int	std_get_non_tcp_count;	/* Non-TCP packets
+						 * in get list */
+	unsigned int	std_get_hiwat;		/* Packets in get list
+						 * high watermark */
 };
 
 
@@ -166,7 +174,10 @@
 	unsigned long			tso_long_headers;
 	unsigned long			collapses;
 	unsigned long			drops;
-	unsigned long			early_drops;
+	unsigned long			get_overflow;
+	unsigned long			get_non_tcp_overflow;
+	unsigned long			put_overflow;
+	unsigned long			netdown_drops;
 
 	/* The following fields change more often, and are used mostly
 	 * on the completion path

--------------050201000503010305030302--


