Date:      Mon, 5 Dec 2011 12:06:53 +0000 (UTC)
From:      Luigi Rizzo <luigi@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r228276 - in head: sys/dev/ixgbe sys/dev/netmap sys/net tools/tools/netmap
Message-ID:  <201112051206.pB5C6rxH036343@svn.freebsd.org>

Author: luigi
Date: Mon Dec  5 12:06:53 2011
New Revision: 228276
URL: http://svn.freebsd.org/changeset/base/228276

Log:
  1. Fix the handling of link reset while in netmap mode.
     A link reset is now completely transparent to the netmap client:
     even if the NIC resets its own ring (e.g. restarting from 0),
     the client will not see any change in the current rx/tx positions,
     because the driver keeps track of the offset between the two
     (a standalone sketch of this index translation follows at the
     end of this log).
  
  2. Make the device-specific code more uniform across drivers.
     There were some inconsistencies in the implementation of the
     netmap support routines; drivers have now been aligned to a
     common code structure (a condensed sketch follows this list).
  
  3. Import netmap support for ixgbe. This is implemented as a very
     small patch for ixgbe.c (233 lines, 11 chunks, mostly comments:
     in total the patch has only 54 lines of new code), as most of
     the code is in an external file, sys/dev/netmap/ixgbe_netmap.h,
     following some initial comments from Jack Vogel about making
     changes less intrusive.
     (Note: I have emailed Jack multiple times asking whether he had
     comments on this structure of the code; I got no reply, so I
     assume he is fine with it.)
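
  As a condensed illustration of the common structure in point 2,
  here is a compilable sketch of the txsync shape shared by the
  drivers in this commit. Everything prefixed my_ is a hypothetical
  stand-in, not a driver symbol; locking, DMA syncs and the
  TDH-based reclaim step are omitted:

	#include <stdio.h>

	/* stand-in for the netmap kring fields every driver uses */
	struct my_kring {
		int nkr_num_slots;	/* ring size */
		int nr_hwcur;		/* first slot not yet given to the NIC */
		int nr_hwavail;		/* slots the kernel knows are free */
		int nkr_hwofs;		/* netmap index = NIC index + nkr_hwofs */
	};

	static int my_ring_reinit(struct my_kring *k) { (void)k; return 1; }

	static void my_write_tx_descriptor(int netmap_idx, int nic_idx)
	{
		printf("  descriptor: netmap slot %d -> NIC slot %d\n",
		    netmap_idx, nic_idx);
	}

	static void my_write_tdt(int nic_idx)
	{
		printf("  TDT <- %d\n", nic_idx);
	}

	/*
	 * Push the slots userspace filled up to "cur" (ring->cur),
	 * translating netmap indexes (j) to NIC indexes (l) the way
	 * the em/igb/lem/ixgbe routines below do.
	 */
	static int
	my_txsync(struct my_kring *kring, int cur)
	{
		int lim = kring->nkr_num_slots - 1;
		int j, l, n = 0;

		if (cur > lim)			/* bogus value from userspace */
			return my_ring_reinit(kring);
		j = kring->nr_hwcur;		/* netmap ring index */
		l = j - kring->nkr_hwofs;	/* NIC ring index */
		if (l < 0)
			l += lim + 1;
		while (j != cur) {
			my_write_tx_descriptor(j, l);
			j = (j == lim) ? 0 : j + 1;
			l = (l == lim) ? 0 : l + 1;
			n++;
		}
		if (n) {
			kring->nr_hwcur = cur;
			kring->nr_hwavail -= n;
			my_write_tdt(l);	/* expose new slots to the NIC */
		}
		/* when n == 0 or nr_hwavail < 1, the real routines read
		 * TDH here to credit completed transmissions back */
		return 0;
	}

	int
	main(void)
	{
		struct my_kring kr = { 8, 6, 5, 3 };	/* tiny ring, offset 3 */

		return my_txsync(&kr, 1);	/* sends netmap slots 6, 7, 0 */
	}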
  
  Support for other drivers (em, lem, re, igb) will come later.
  
  "ixgbe" is now the reference driver for netmap support. Both the
  external file (sys/dev/netmap/ixgbe_netmap.h) and the device-specific
  patches (in sys/dev/ixgbe/ixgbe.c) are heavily commented and should
  serve as a reference for other device drivers.
  
  Tested on i386 and amd64 with the pkt-gen program in tools/tools/netmap:
  the sender does 14.88 Mpps at 1050 MHz and 14.2 Mpps at 900 MHz
  on an i7-860 with 4 cores and an 82599 card. I have not yet tried
  more aggressive optimizations, such as adding 'prefetch' instructions
  in the time-critical parts of the code.
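
  The core of point 1 is a single modular translation between NIC
  and netmap ring indexes, with nkr_hwofs absorbing whatever offset
  a link reset introduced. A minimal standalone sketch follows; the
  struct and function names are illustrative stand-ins, not driver
  symbols (the real code uses kring->nkr_hwofs as in the hunks
  below, where the offset may also be negative):

	#include <stdio.h>

	/* illustrative stand-in for the netmap kring fields used here */
	struct ring_state {
		int num_slots;	/* ring size */
		int hwofs;	/* netmap slot = NIC slot + hwofs (mod size) */
	};

	/*
	 * Map a NIC ring index to the matching netmap slot. hwofs can
	 * be negative, so handle wraparound in both directions.
	 */
	static int
	nic_to_netmap(const struct ring_state *r, int nic_idx)
	{
		int si = nic_idx + r->hwofs;

		if (si < 0)
			si += r->num_slots;
		else if (si >= r->num_slots)
			si -= r->num_slots;
		return si;
	}

	int
	main(void)
	{
		/* after a link reset the NIC restarts from slot 0, but a
		 * non-zero offset keeps the client's view unchanged */
		struct ring_state r = { 256, 37 };

		printf("NIC slot 250 -> netmap slot %d\n",
		    nic_to_netmap(&r, 250));	/* prints 31 (wrapped) */
		return 0;
	}

  The same translation appears below in ixgbe_setup_transmit_ring()
  and, with the negative-offset checks, in the em/igb sync paths.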

Modified:
  head/sys/dev/ixgbe/ixgbe.c
  head/sys/dev/netmap/if_em_netmap.h
  head/sys/dev/netmap/if_igb_netmap.h
  head/sys/dev/netmap/if_lem_netmap.h
  head/sys/dev/netmap/if_re_netmap.h
  head/sys/dev/netmap/ixgbe_netmap.h
  head/sys/dev/netmap/netmap.c
  head/sys/dev/netmap/netmap_kern.h
  head/sys/net/netmap.h
  head/tools/tools/netmap/pkt-gen.c

Modified: head/sys/dev/ixgbe/ixgbe.c
==============================================================================
--- head/sys/dev/ixgbe/ixgbe.c	Mon Dec  5 10:34:52 2011	(r228275)
+++ head/sys/dev/ixgbe/ixgbe.c	Mon Dec  5 12:06:53 2011	(r228276)
@@ -313,6 +313,18 @@ static int atr_sample_rate = 20;
 static int fdir_pballoc = 1;
 #endif
 
+#ifdef DEV_NETMAP
+/*
+ * The #ifdef DEV_NETMAP / #endif blocks in this file are meant to
+ * be a reference on how to implement netmap support in a driver.
+ * Additional comments are in ixgbe_netmap.h.
+ *
+ * <dev/netmap/ixgbe_netmap.h> contains functions for netmap support
+ * that extend the standard driver.
+ */
+#include <dev/netmap/ixgbe_netmap.h>
+#endif /* DEV_NETMAP */
+
 /*********************************************************************
  *  Device identification routine
  *
@@ -578,6 +590,9 @@ ixgbe_attach(device_t dev)
 
 	ixgbe_add_hw_stats(adapter);
 
+#ifdef DEV_NETMAP
+	ixgbe_netmap_attach(adapter);
+#endif /* DEV_NETMAP */
 	INIT_DEBUGOUT("ixgbe_attach: end");
 	return (0);
 err_late:
@@ -652,6 +667,9 @@ ixgbe_detach(device_t dev)
 
 	ether_ifdetach(adapter->ifp);
 	callout_drain(&adapter->timer);
+#ifdef DEV_NETMAP
+	netmap_detach(adapter->ifp);
+#endif /* DEV_NETMAP */
 	ixgbe_free_pci_resources(adapter);
 	bus_generic_detach(dev);
 	if_free(adapter->ifp);
@@ -2813,9 +2831,20 @@ ixgbe_setup_transmit_ring(struct tx_ring
 	struct adapter *adapter = txr->adapter;
 	struct ixgbe_tx_buf *txbuf;
 	int i;
+#ifdef DEV_NETMAP
+	struct netmap_adapter *na = NA(adapter->ifp);
+	struct netmap_slot *slot;
+#endif /* DEV_NETMAP */
 
 	/* Clear the old ring contents */
 	IXGBE_TX_LOCK(txr);
+#ifdef DEV_NETMAP
+	/*
+	 * (under lock): if in netmap mode, do some consistency
+	 * checks and set slot to entry 0 of the netmap ring.
+	 */
+	slot = netmap_reset(na, NR_TX, txr->me, 0);
+#endif /* DEV_NETMAP */
 	bzero((void *)txr->tx_base,
 	      (sizeof(union ixgbe_adv_tx_desc)) * adapter->num_tx_desc);
 	/* Reset indices */
@@ -2832,6 +2861,26 @@ ixgbe_setup_transmit_ring(struct tx_ring
 			m_freem(txbuf->m_head);
 			txbuf->m_head = NULL;
 		}
+#ifdef DEV_NETMAP
+		/*
+		 * In netmap mode, set the map for the packet buffer.
+		 * NOTE: Some drivers (not this one) also need to set
+		 * the physical buffer address in the NIC ring.
+		 * Slots in the netmap ring (indexed by "si") are
+		 * kring->nkr_hwofs positions "ahead" wrt the
+		 * corresponding slot in the NIC ring. In some drivers
+		 * (not here) nkr_hwofs can be negative. When computing
+		 * si = i + kring->nkr_hwofs make sure to handle wraparounds.
+		 */
+		if (slot) {
+			int si = i + na->tx_rings[txr->me].nkr_hwofs;
+
+			if (si >= na->num_tx_desc)
+				si -= na->num_tx_desc;
+			netmap_load_map(txr->txtag, txbuf->map,
+			    NMB(slot + si), na->buff_size);
+		}
+#endif /* DEV_NETMAP */
 		/* Clear the EOP index */
 		txbuf->eop_index = -1;
         }
@@ -3310,6 +3359,29 @@ ixgbe_txeof(struct tx_ring *txr)
 
 	mtx_assert(&txr->tx_mtx, MA_OWNED);
 
+#ifdef DEV_NETMAP
+	if (ifp->if_capenable & IFCAP_NETMAP) {
+		struct netmap_adapter *na = NA(ifp);
+
+		/*
+		 * In netmap mode, all the work is done in the context
+		 * of the client thread. Interrupt handlers only wake up
+		 * clients, which may be sleeping on individual rings
+		 * or on a global resource for all rings.
+		 * When the driver has separate locks, we need to
+		 * release and re-acquire txlock to avoid deadlocks.
+		 * XXX see if we can find a better way.
+		 */
+		selwakeuppri(&na->tx_rings[txr->me].si, PI_NET);
+		IXGBE_TX_UNLOCK(txr);
+		IXGBE_CORE_LOCK(adapter);
+		selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET);
+		IXGBE_CORE_UNLOCK(adapter);
+		IXGBE_TX_LOCK(txr);
+		return FALSE;
+	}
+#endif /* DEV_NETMAP */
+
 	if (txr->tx_avail == adapter->num_tx_desc) {
 		txr->queue_status = IXGBE_QUEUE_IDLE;
 		return FALSE;
@@ -3698,6 +3770,10 @@ ixgbe_setup_receive_ring(struct rx_ring 
 	bus_dma_segment_t	pseg[1], hseg[1];
 	struct lro_ctrl		*lro = &rxr->lro;
 	int			rsize, nsegs, error = 0;
+#ifdef DEV_NETMAP
+	struct netmap_adapter *na = NA(rxr->adapter->ifp);
+	struct netmap_slot *slot;
+#endif /* DEV_NETMAP */
 
 	adapter = rxr->adapter;
 	ifp = adapter->ifp;
@@ -3705,6 +3781,10 @@ ixgbe_setup_receive_ring(struct rx_ring 
 
 	/* Clear the ring contents */
 	IXGBE_RX_LOCK(rxr);
+#ifdef DEV_NETMAP
+	/* same as in ixgbe_setup_transmit_ring() */
+	slot = netmap_reset(na, NR_RX, rxr->me, 0);
+#endif /* DEV_NETMAP */
 	rsize = roundup2(adapter->num_rx_desc *
 	    sizeof(union ixgbe_adv_rx_desc), DBA_ALIGN);
 	bzero((void *)rxr->rx_base, rsize);
@@ -3721,6 +3801,29 @@ ixgbe_setup_receive_ring(struct rx_ring 
 		struct mbuf	*mh, *mp;
 
 		rxbuf = &rxr->rx_buffers[j];
+#ifdef DEV_NETMAP
+		/*
+		 * In netmap mode, fill the map and set the buffer
+		 * address in the NIC ring, considering the offset
+		 * between the netmap and NIC rings (see comment in
+		 * ixgbe_setup_transmit_ring() ). No need to allocate
+		 * an mbuf, so end the block with a continue;
+		 */
+		if (slot) {
+			int sj = j + na->rx_rings[rxr->me].nkr_hwofs;
+			void *addr;
+
+			if (sj >= na->num_rx_desc)
+				sj -= na->num_rx_desc;
+			addr = NMB(slot + sj);
+			netmap_load_map(rxr->ptag,
+			    rxbuf->pmap, addr, na->buff_size);
+			/* Update descriptor */
+			rxr->rx_base[j].read.pkt_addr =
+			    htole64(vtophys(addr));
+			continue;
+		}
+#endif /* DEV_NETMAP */
 		/*
 		** Don't allocate mbufs if not
 		** doing header split, it's wasteful
@@ -3913,6 +4016,35 @@ ixgbe_initialize_receive_units(struct ad
 
 		/* Setup the HW Rx Head and Tail Descriptor Pointers */
 		IXGBE_WRITE_REG(hw, IXGBE_RDH(i), 0);
+#ifdef DEV_NETMAP
+		/*
+		 * In netmap mode, we must preserve the buffers made
+		 * available to userspace before the if_init()
+		 * (this is true by default on the TX side, because
+		 * init makes all buffers available to userspace).
+		 *
+		 * netmap_reset() and the device specific routines
+		 * (e.g. ixgbe_setup_receive_rings()) map these
+		 * buffers at the end of the NIC ring, so here we
+		 * must set the RDT (tail) register to make sure
+		 * they are not overwritten.
+		 *
+		 * In this driver the NIC ring starts at RDH = 0,
+		 * RDT points to the first 'busy' slot, so RDT = 0
+		 * means the whole ring is available, and
+		 * RDT = (num_rx_desc - X) means X slots are available.
+		 * Computations are done modulo the ring size.
+		 */
+		if (ifp->if_capenable & IFCAP_NETMAP) {
+			struct netmap_adapter *na = NA(adapter->ifp);
+			struct netmap_kring *kring = &na->rx_rings[i];
+			int t = na->num_rx_desc - kring->nr_hwavail;
+
+			if (t >= na->num_rx_desc)
+				t -= adapter->num_rx_desc;
+			IXGBE_WRITE_REG(hw, IXGBE_RDT(i), t);
+		} else
+#endif /* DEV_NETMAP */
 		IXGBE_WRITE_REG(hw, IXGBE_RDT(i), 0);
 	}
 
@@ -4148,6 +4280,22 @@ ixgbe_rxeof(struct ix_queue *que, int co
 
 	IXGBE_RX_LOCK(rxr);
 
+#ifdef DEV_NETMAP
+	if (ifp->if_capenable & IFCAP_NETMAP) {
+		/*
+		 * Same as the txeof routine, only wakeup clients
+		 * and make sure there are no deadlocks.
+		 */
+		struct netmap_adapter *na = NA(ifp);
+
+		selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET);
+		IXGBE_RX_UNLOCK(rxr);
+		IXGBE_CORE_LOCK(adapter);
+		selwakeuppri(&na->rx_rings[na->num_queues + 1].si, PI_NET);
+		IXGBE_CORE_UNLOCK(adapter);
+		return (FALSE);
+	}
+#endif /* DEV_NETMAP */
 	for (i = rxr->next_to_check; count != 0;) {
 		struct mbuf	*sendmp, *mh, *mp;
 		u32		rsc, ptype;

Modified: head/sys/dev/netmap/if_em_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_em_netmap.h	Mon Dec  5 10:34:52 2011	(r228275)
+++ head/sys/dev/netmap/if_em_netmap.h	Mon Dec  5 12:06:53 2011	(r228276)
@@ -9,7 +9,7 @@
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -25,9 +25,12 @@
 
 /*
  * $FreeBSD$
- * $Id: if_em_netmap.h 9662 2011-11-16 13:18:06Z luigi $
+ * $Id: if_em_netmap.h 9802 2011-12-02 18:42:37Z luigi $
  *
  * netmap changes for if_em.
+ *
+ * For structure and details on the individual functions please see
+ * ixgbe_netmap.h
  */
 
 #include <net/netmap.h>
@@ -58,12 +61,7 @@ em_netmap_attach(struct adapter *adapter
 	na.nm_rxsync = em_netmap_rxsync;
 	na.nm_lock = em_netmap_lock_wrapper;
 	na.nm_register = em_netmap_reg;
-	/*
-	 * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode
-	 * we allocate the buffers on the first register. So we must
-	 * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set.
-	 */
-	na.buff_size = MCLBYTES;
+	na.buff_size = NETMAP_BUF_SIZE;
 	netmap_attach(&na, adapter->num_queues);
 }
 
@@ -100,6 +98,7 @@ em_netmap_lock_wrapper(void *_a, int wha
 }
 
 
+// XXX do we need to block/unblock the tasks ?
 static void
 em_netmap_block_tasks(struct adapter *adapter)
 {
@@ -162,9 +161,6 @@ em_netmap_reg(struct ifnet *ifp, int ono
 	if (onoff) {
 		ifp->if_capenable |= IFCAP_NETMAP;
 
-		/* save if_transmit for later restore.
-		 * XXX also if_start and if_qflush ?
-		 */
 		na->if_transmit = ifp->if_transmit;
 		ifp->if_transmit = netmap_start;
 
@@ -179,15 +175,13 @@ fail:
 		ifp->if_transmit = na->if_transmit;
 		ifp->if_capenable &= ~IFCAP_NETMAP;
 		em_init_locked(adapter);	/* also enable intr */
-
 	}
 	em_netmap_unblock_tasks(adapter);
 	return (error);
 }
 
 /*
- * Reconcile hardware and user view of the transmit ring, see
- * ixgbe.c for details.
+ * Reconcile hardware and user view of the transmit ring.
  */
 static int
 em_netmap_txsync(void *a, u_int ring_nr, int do_lock)
@@ -197,13 +191,13 @@ em_netmap_txsync(void *a, u_int ring_nr,
 	struct netmap_adapter *na = NA(adapter->ifp);
 	struct netmap_kring *kring = &na->tx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
-	int j, k, n, lim = kring->nkr_num_slots - 1;
+	int j, k, l, n = 0, lim = kring->nkr_num_slots - 1;
 
 	/* generate an interrupt approximately every half ring */
 	int report_frequency = kring->nkr_num_slots >> 1;
 
 	k = ring->cur;
-	if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+	if (k > lim)
 		return netmap_ring_reinit(kring);
 
 	if (do_lock)
@@ -211,35 +205,20 @@ em_netmap_txsync(void *a, u_int ring_nr,
 	bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
 			BUS_DMASYNC_POSTREAD);
 
-	/* record completed transmissions TODO
-	 *
-	 * instead of using TDH, we could read the transmitted status bit.
+	/* check for new packets to send.
+	 * j indexes the netmap ring, l indexes the nic ring, and
+	 *	j = kring->nr_hwcur, l = E1000_TDT (not tracked),
+	 *	j == (l + kring->nkr_hwofs) % ring_size
 	 */
-	j = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
-	if (j >= kring->nkr_num_slots) { /* XXX can happen */
-		D("TDH wrap %d", j);
-		j -= kring->nkr_num_slots;
-	}
-	int delta = j - txr->next_to_clean;
-	if (delta) {
-		/* new transmissions were completed, increment
-		   ring->nr_hwavail. */
-		if (delta < 0)
-			delta += kring->nkr_num_slots;
-		txr->next_to_clean = j;
-		kring->nr_hwavail += delta;
-	}
-
-	/* update avail to what the hardware knows */
-	ring->avail = kring->nr_hwavail;
-
 	j = kring->nr_hwcur;
 	if (j != k) {	/* we have packets to send */
-		n = 0;
+		l = j - kring->nkr_hwofs;
+		if (l < 0)
+			l += lim + 1;
 		while (j != k) {
 			struct netmap_slot *slot = &ring->slot[j];
-			struct e1000_tx_desc *curr = &txr->tx_base[j];
-			struct em_buffer *txbuf = &txr->tx_buffers[j];
+			struct e1000_tx_desc *curr = &txr->tx_base[l];
+			struct em_buffer *txbuf = &txr->tx_buffers[l];
 			int flags = ((slot->flags & NS_REPORT) ||
 				j == 0 || j == report_frequency) ?
 					E1000_TXD_CMD_RS : 0;
@@ -254,42 +233,61 @@ em_netmap_txsync(void *a, u_int ring_nr,
 			slot->flags &= ~NS_REPORT;
 			curr->upper.data = 0;
 			curr->lower.data = 
-			    htole32(
-				adapter->txd_cmd |
-				(E1000_TXD_CMD_EOP | flags) |
-				slot->len);
+			    htole32(adapter->txd_cmd | len |
+				(E1000_TXD_CMD_EOP | flags) );
 			if (slot->flags & NS_BUF_CHANGED) {
 				curr->buffer_addr = htole64(vtophys(addr));
-				/* buffer has changed, unload and reload map */
+				/* buffer has changed, reload map */
 				netmap_reload_map(txr->txtag, txbuf->map,
-					addr, na->buff_size);
+				    addr, na->buff_size);
 				slot->flags &= ~NS_BUF_CHANGED;
 			}
 
 			bus_dmamap_sync(txr->txtag, txbuf->map,
 				BUS_DMASYNC_PREWRITE);
 			j = (j == lim) ? 0 : j + 1;
+			l = (l == lim) ? 0 : l + 1;
 			n++;
 		}
-		kring->nr_hwcur = ring->cur;
+		kring->nr_hwcur = k;
 
 		/* decrease avail by number of sent packets */
-		ring->avail -= n;
-		kring->nr_hwavail = ring->avail;
+		kring->nr_hwavail -= n;
 
 		bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
-			BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
-		E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me),
-			ring->cur);
+		E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), l);
 	}
+
+	if (n == 0 || kring->nr_hwavail < 1) {
+		int delta;
+
+		/* record completed transmissions using TDH. */
+		l = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+		if (l >= kring->nkr_num_slots) { /* XXX can happen */
+			D("TDH wrap %d", l);
+			l -= kring->nkr_num_slots;
+		}
+		delta = l - txr->next_to_clean;
+		if (delta) {
+			/* some completed, increment hwavail. */
+			if (delta < 0)
+				delta += kring->nkr_num_slots;
+			txr->next_to_clean = l;
+			kring->nr_hwavail += delta;
+		}
+	}
+	/* update avail to what the hardware knows */
+	ring->avail = kring->nr_hwavail;
+
 	if (do_lock)
 		EM_TX_UNLOCK(txr);
 	return 0;
 }
 
 /*
- * Reconcile kernel and user view of the receive ring, see ixgbe.c
+ * Reconcile kernel and user view of the receive ring.
  */
 static int
 em_netmap_rxsync(void *a, u_int ring_nr, int do_lock)
@@ -299,10 +297,10 @@ em_netmap_rxsync(void *a, u_int ring_nr,
 	struct netmap_adapter *na = NA(adapter->ifp);
 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
-	int j, k, n, lim = kring->nkr_num_slots - 1;
+	int j, k, l, n, lim = kring->nkr_num_slots - 1;
 
 	k = ring->cur;
-	if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+	if (k > lim)
 		return netmap_ring_reinit(kring);
  
 	if (do_lock)
@@ -311,36 +309,52 @@ em_netmap_rxsync(void *a, u_int ring_nr,
 	bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
 			BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
-	/* acknowledge all the received packets. */
-	j = rxr->next_to_check;
+	/* import newly received packets into the netmap ring.
+	 * j is an index in the netmap ring, l in the NIC ring, and
+	 *	j = (kring->nr_hwcur + kring->nr_hwavail) % ring_size
+	 *	l = rxr->next_to_check;
+	 * and
+	 *	j == (l + kring->nkr_hwofs) % ring_size
+	 */
+	l = rxr->next_to_check;
+	j = l + kring->nkr_hwofs;
+	/* here nkr_hwofs can be negative so must check for j < 0 */
+	if (j < 0)
+		j += lim + 1;
+	else if (j > lim)
+		j -= lim + 1;
 	for (n = 0; ; n++) {
-		struct e1000_rx_desc *curr = &rxr->rx_base[j];
+		struct e1000_rx_desc *curr = &rxr->rx_base[l];
 
 		if ((curr->status & E1000_RXD_STAT_DD) == 0)
 			break;
 		ring->slot[j].len = le16toh(curr->length);
-		bus_dmamap_sync(rxr->tag, rxr->rx_buffers[j].map,
+		bus_dmamap_sync(rxr->tag, rxr->rx_buffers[l].map,
 			BUS_DMASYNC_POSTREAD);
 		j = (j == lim) ? 0 : j + 1;
+		/* make sure next_to_refresh follows next_to_check */
+		rxr->next_to_refresh = l;	// XXX
+		l = (l == lim) ? 0 : l + 1;
 	}
 	if (n) {
-		rxr->next_to_check = j;
+		rxr->next_to_check = l;
 		kring->nr_hwavail += n;
 	}
 
-	/* skip past packets that userspace has already processed:
-	 * making them available for reception.
-	 * advance nr_hwcur and issue a bus_dmamap_sync on the
-	 * buffers so it is safe to write to them.
-	 * Also increase nr_hwavail
-         */
+	/* skip past packets that userspace has already processed */
 	j = kring->nr_hwcur;
 	if (j != k) { /* userspace has read some packets. */
 		n = 0;
+		l = j - kring->nkr_hwofs; /* NIC ring index */
+		/* here nkr_hwofs can be negative so check for l > lim */
+		if (l < 0)
+			l += lim + 1;
+		else if (l > lim)
+			l -= lim + 1;
 		while (j != k) {
 			struct netmap_slot *slot = &ring->slot[j];
-			struct e1000_rx_desc *curr = &rxr->rx_base[j];
-			struct em_buffer *rxbuf = &rxr->rx_buffers[j];
+			struct e1000_rx_desc *curr = &rxr->rx_base[l];
+			struct em_buffer *rxbuf = &rxr->rx_buffers[l];
 			void *addr = NMB(slot);
 
 			if (addr == netmap_buffer_base) { /* bad buf */
@@ -352,28 +366,29 @@ em_netmap_rxsync(void *a, u_int ring_nr,
 			curr->status = 0;
 			if (slot->flags & NS_BUF_CHANGED) {
 				curr->buffer_addr = htole64(vtophys(addr));
-				/* buffer has changed, unload and reload map */
+				/* buffer has changed, reload map */
 				netmap_reload_map(rxr->rxtag, rxbuf->map,
-					addr, na->buff_size);
+				    addr, na->buff_size);
 				slot->flags &= ~NS_BUF_CHANGED;
 			}
 
 			bus_dmamap_sync(rxr->rxtag, rxbuf->map,
-				BUS_DMASYNC_PREREAD);
+			    BUS_DMASYNC_PREREAD);
 
 			j = (j == lim) ? 0 : j + 1;
+			l = (l == lim) ? 0 : l + 1;
 			n++;
 		}
 		kring->nr_hwavail -= n;
-		kring->nr_hwcur = ring->cur;
+		kring->nr_hwcur = k;
 		bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
-			BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 		/*
 		 * IMPORTANT: we must leave one free slot in the ring,
-		 * so move j back by one unit
+		 * so move l back by one unit
 		 */
-		j = (j == 0) ? lim : j - 1;
-		E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), j);
+		l = (l == 0) ? lim : l - 1;
+		E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), l);
 	}
 	/* tell userspace that there are new packets */
 	ring->avail = kring->nr_hwavail ;

Modified: head/sys/dev/netmap/if_igb_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_igb_netmap.h	Mon Dec  5 10:34:52 2011	(r228275)
+++ head/sys/dev/netmap/if_igb_netmap.h	Mon Dec  5 12:06:53 2011	(r228276)
@@ -25,7 +25,7 @@
 
 /*
  * $FreeBSD$
- * $Id: if_igb_netmap.h 9662 2011-11-16 13:18:06Z luigi $
+ * $Id: if_igb_netmap.h 9802 2011-12-02 18:42:37Z luigi $
  *
  * netmap modifications for igb
  * contributed by Ahmed Kooli
@@ -58,12 +58,7 @@ igb_netmap_attach(struct adapter *adapte
 	na.nm_rxsync = igb_netmap_rxsync;
 	na.nm_lock = igb_netmap_lock_wrapper;
 	na.nm_register = igb_netmap_reg;
-	/*
-	 * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode
-	 * we allocate the buffers on the first register. So we must
-	 * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set.
-	 */
-	na.buff_size = MCLBYTES;
+	na.buff_size = NETMAP_BUF_SIZE;
 	netmap_attach(&na, adapter->num_queues);
 }	
 
@@ -111,7 +106,7 @@ igb_netmap_reg(struct ifnet *ifp, int on
 	struct netmap_adapter *na = NA(ifp);
 	int error = 0;
 
-	if (!na)
+	if (na == NULL)
 		return EINVAL;
 
 	igb_disable_intr(adapter);
@@ -144,21 +139,6 @@ fail:
 
 /*
  * Reconcile kernel and user view of the transmit ring.
- *
- * Userspace has filled tx slots up to cur (excluded).
- * The last unused slot previously known to the kernel was nr_hwcur,
- * and the last interrupt reported nr_hwavail slots available
- * (using the special value -1 to indicate idle transmit ring).
- * The function must first update avail to what the kernel
- * knows, subtract the newly used slots (cur - nr_hwcur)
- * from both avail and nr_hwavail, and set nr_hwcur = cur
- * issuing a dmamap_sync on all slots.
- *
- * Check parameters in the struct netmap_ring.
- * We don't use avail, only check for bogus values.
- * Make sure cur is valid, and same goes for buffer indexes and lengths.
- * To avoid races, read the values once, and never use those from
- * the ring afterwards.
  */
 static int
 igb_netmap_txsync(void *a, u_int ring_nr, int do_lock)
@@ -168,54 +148,40 @@ igb_netmap_txsync(void *a, u_int ring_nr
 	struct netmap_adapter *na = NA(adapter->ifp);
 	struct netmap_kring *kring = &na->tx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
-	int j, k, n, lim = kring->nkr_num_slots - 1;
+	int j, k, l, n = 0, lim = kring->nkr_num_slots - 1;
 
 	/* generate an interrupt approximately every half ring */
 	int report_frequency = kring->nkr_num_slots >> 1;
 
-	k = ring->cur;	/* ring is not protected by any lock */
-	if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+	k = ring->cur;
+	if (k > lim)
 		return netmap_ring_reinit(kring);
 
 	if (do_lock)
 		IGB_TX_LOCK(txr);
 	bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
-			BUS_DMASYNC_POSTREAD);
-
-	/* record completed transmissions. TODO
-	 *
-	 * Instead of reading from the TDH register, we could and try to check
-	 * the status bit of descriptor packets.
-	 */
-	j = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
-	if (j >= kring->nkr_num_slots) /* XXX can it happen ? */
-		j -= kring->nkr_num_slots;
-	int delta = j - txr->next_to_clean;
-	if (delta) {
-		/* new tx were completed */
-		if (delta < 0)
-			delta += kring->nkr_num_slots;
-		txr->next_to_clean = j;
-		kring->nr_hwavail += delta;
-	}
+	    BUS_DMASYNC_POSTREAD);
 
 	/* update avail to what the hardware knows */
 	ring->avail = kring->nr_hwavail;
 
-	j = kring->nr_hwcur;
+	j = kring->nr_hwcur; /* netmap ring index */
 	if (j != k) {	/* we have new packets to send */
 		u32 olinfo_status = 0;
-		n = 0;
+		int n = 0;
 
+		l = j - kring->nkr_hwofs; /* NIC ring index */
+		if (l < 0)
+			l += lim + 1;
 		/* 82575 needs the queue index added */
 		if (adapter->hw.mac.type == e1000_82575)
 			olinfo_status |= txr->me << 4;
 
 		while (j != k) {
 			struct netmap_slot *slot = &ring->slot[j];
-			struct igb_tx_buffer *txbuf = &txr->tx_buffers[j];
+			struct igb_tx_buffer *txbuf = &txr->tx_buffers[l];
 			union e1000_adv_tx_desc *curr =
-				(union e1000_adv_tx_desc *)&txr->tx_base[j];
+			    (union e1000_adv_tx_desc *)&txr->tx_base[l];
 			void *addr = NMB(slot);
 			int flags = ((slot->flags & NS_REPORT) ||
 				j == 0 || j == report_frequency) ?
@@ -229,6 +195,7 @@ igb_netmap_txsync(void *a, u_int ring_nr
 			}
 
 			slot->flags &= ~NS_REPORT;
+			// XXX do we need to set the address ?
 			curr->read.buffer_addr = htole64(vtophys(addr));
 			curr->read.olinfo_status =
 			    htole32(olinfo_status |
@@ -239,7 +206,7 @@ igb_netmap_txsync(void *a, u_int ring_nr
 				    E1000_ADVTXD_DCMD_DEXT |
 				    E1000_ADVTXD_DCMD_EOP | flags);
 			if (slot->flags & NS_BUF_CHANGED) {
-				/* buffer has changed, unload and reload map */
+				/* buffer has changed, reload map */
 				netmap_reload_map(txr->txtag, txbuf->map,
 					addr, na->buff_size);
 				slot->flags &= ~NS_BUF_CHANGED;
@@ -248,22 +215,40 @@ igb_netmap_txsync(void *a, u_int ring_nr
 			bus_dmamap_sync(txr->txtag, txbuf->map,
 				BUS_DMASYNC_PREWRITE);
 			j = (j == lim) ? 0 : j + 1;
+			l = (l == lim) ? 0 : l + 1;
 			n++;
 		}
 		kring->nr_hwcur = k;
 
 		/* decrease avail by number of sent packets */
-		ring->avail -= n;
-		kring->nr_hwavail = ring->avail;
+		kring->nr_hwavail -= n;
+		ring->avail = kring->nr_hwavail;
 
-		/* Set the watchdog */
+		/* Set the watchdog XXX ? */
 		txr->queue_status = IGB_QUEUE_WORKING;
 		txr->watchdog_time = ticks;
 
 		bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
-			BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
-		E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), k);
+		E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), l);
+	}
+	if (n == 0 || kring->nr_hwavail < 1) {
+		int delta;
+
+		/* record completed transmission using TDH */
+		l = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+		if (l >= kring->nkr_num_slots) /* XXX can it happen ? */
+			l -= kring->nkr_num_slots;
+		delta = l - txr->next_to_clean;
+		if (delta) {
+			/* new tx were completed */
+			if (delta < 0)
+				delta += kring->nkr_num_slots;
+			txr->next_to_clean = l;
+			kring->nr_hwavail += delta;
+			ring->avail = kring->nr_hwavail;
+		}
 	}
 	if (do_lock)
 		IGB_TX_UNLOCK(txr);
@@ -273,15 +258,6 @@ igb_netmap_txsync(void *a, u_int ring_nr
 
 /*
  * Reconcile kernel and user view of the receive ring.
- *
- * Userspace has read rx slots up to cur (excluded).
- * The last unread slot previously known to the kernel was nr_hwcur,
- * and the last interrupt reported nr_hwavail slots available.
- * We must subtract the newly consumed slots (cur - nr_hwcur)
- * from nr_hwavail, clearing the descriptors for the next
- * read, tell the hardware that they are available,
- * and set nr_hwcur = cur and avail = nr_hwavail.
- * issuing a dmamap_sync on all slots.
  */
 static int
 igb_netmap_rxsync(void *a, u_int ring_nr, int do_lock)
@@ -291,10 +267,10 @@ igb_netmap_rxsync(void *a, u_int ring_nr
 	struct netmap_adapter *na = NA(adapter->ifp);
 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
-	int j, k, n, lim = kring->nkr_num_slots - 1;
+	int j, k, l, n, lim = kring->nkr_num_slots - 1;
 
-	k = ring->cur;	/* ring is not protected by any lock */
-	if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+	k = ring->cur;
+	if (k > lim)
 		return netmap_ring_reinit(kring);
 
 	if (do_lock)
@@ -304,9 +280,12 @@ igb_netmap_rxsync(void *a, u_int ring_nr
 	bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
-	j = rxr->next_to_check;
+	l = rxr->next_to_check;
+	j = l + kring->nkr_hwofs;
+	if (j > lim)
+		j -= lim + 1;
 	for (n = 0; ; n++) {
-		union e1000_adv_rx_desc *curr = &rxr->rx_base[j];
+		union e1000_adv_rx_desc *curr = &rxr->rx_base[l];
 		uint32_t staterr = le32toh(curr->wb.upper.status_error);
 
 		if ((staterr & E1000_RXD_STAT_DD) == 0)
@@ -314,15 +293,13 @@ igb_netmap_rxsync(void *a, u_int ring_nr
 		ring->slot[j].len = le16toh(curr->wb.upper.length);
 		
 		bus_dmamap_sync(rxr->ptag,
-			rxr->rx_buffers[j].pmap, BUS_DMASYNC_POSTREAD);
+			rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD);
 		j = (j == lim) ? 0 : j + 1;
+		l = (l == lim) ? 0 : l + 1;
 	}
 	if (n) {
-		rxr->next_to_check = j;
+		rxr->next_to_check = l;
 		kring->nr_hwavail += n;
-		if (kring->nr_hwavail >= lim - 10) {
-			ND("rx ring %d almost full %d", ring_nr, kring->nr_hwavail);
-		}
 	}
 
 	/* skip past packets that userspace has already processed,
@@ -332,12 +309,15 @@ igb_netmap_rxsync(void *a, u_int ring_nr
 	 * Also increase nr_hwavail
 	 */
 	j = kring->nr_hwcur;
+	l = kring->nr_hwcur - kring->nkr_hwofs;
+	if (l < 0)
+		l += lim + 1;
 	if (j != k) {	/* userspace has read some packets. */
 		n = 0;
 		while (j != k) {
 			struct netmap_slot *slot = ring->slot + j;
-			union e1000_adv_rx_desc *curr = &rxr->rx_base[j];
-			struct igb_rx_buf *rxbuf = rxr->rx_buffers + j;
+			union e1000_adv_rx_desc *curr = &rxr->rx_base[l];
+			struct igb_rx_buf *rxbuf = rxr->rx_buffers + l;
 			void *addr = NMB(slot);
 
 			if (addr == netmap_buffer_base) { /* bad buf */
@@ -358,6 +338,7 @@ igb_netmap_rxsync(void *a, u_int ring_nr
 				BUS_DMASYNC_PREREAD);
 
 			j = (j == lim) ? 0 : j + 1;
+			l = (l == lim) ? 0 : l + 1;
 			n++;
 		}
 		kring->nr_hwavail -= n;
@@ -365,10 +346,10 @@ igb_netmap_rxsync(void *a, u_int ring_nr
 		bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
 			BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 		/* IMPORTANT: we must leave one free slot in the ring,
-		 * so move j back by one unit
+		 * so move l back by one unit
 		 */
-		j = (j == 0) ? lim : j - 1;
-		E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), j);
+		l = (l == 0) ? lim : l - 1;
+		E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), l);
 	}
 	/* tell userspace that there are new packets */
 	ring->avail = kring->nr_hwavail ;

Modified: head/sys/dev/netmap/if_lem_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_lem_netmap.h	Mon Dec  5 10:34:52 2011	(r228275)
+++ head/sys/dev/netmap/if_lem_netmap.h	Mon Dec  5 12:06:53 2011	(r228276)
@@ -25,9 +25,12 @@
 
 /*
  * $FreeBSD$
- * $Id: if_lem_netmap.h 9662 2011-11-16 13:18:06Z luigi $
+ * $Id: if_lem_netmap.h 9802 2011-12-02 18:42:37Z luigi $
  *
  * netmap support for if_lem.c
+ *
+ * For structure and details on the individual functions please see
+ * ixgbe_netmap.h
  */
 
 #include <net/netmap.h>
@@ -59,7 +62,7 @@ lem_netmap_attach(struct adapter *adapte
 	na.nm_rxsync = lem_netmap_rxsync;
 	na.nm_lock = lem_netmap_lock_wrapper;
 	na.nm_register = lem_netmap_reg;
-	na.buff_size = MCLBYTES;
+	na.buff_size = NETMAP_BUF_SIZE;
 	netmap_attach(&na, 1);
 }
 
@@ -94,7 +97,61 @@ lem_netmap_lock_wrapper(void *_a, int wh
 
 
 /*
- * Reconcile kernel and user view of the transmit ring. see ixgbe.c
+ * Register/unregister routine
+ */
+static int
+lem_netmap_reg(struct ifnet *ifp, int onoff)
+{
+	struct adapter *adapter = ifp->if_softc;
+	struct netmap_adapter *na = NA(ifp);
+	int error = 0;
+
+	if (na == NULL)
+		return EINVAL;
+
+	lem_disable_intr(adapter);
+
+	/* Tell the stack that the interface is no longer active */
+	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+	/* lem_netmap_block_tasks(adapter); */
+#ifndef EM_LEGACY_IRQ // XXX do we need this ?
+	taskqueue_block(adapter->tq);
+	taskqueue_drain(adapter->tq, &adapter->rxtx_task);
+	taskqueue_drain(adapter->tq, &adapter->link_task);
+#endif /* !EM_LEGACY_IRQ */
+	if (onoff) {
+		ifp->if_capenable |= IFCAP_NETMAP;
+
+		/* save if_transmit to restore it when exiting.
+		 * XXX what about if_start and if_qflush ?
+		 */
+		na->if_transmit = ifp->if_transmit;
+		ifp->if_transmit = netmap_start;
+
+		lem_init_locked(adapter);
+		if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) {
+			error = ENOMEM;
+			goto fail;
+		}
+	} else {
+fail:
+		/* restore non-netmap mode */
+		ifp->if_transmit = na->if_transmit;
+		ifp->if_capenable &= ~IFCAP_NETMAP;
+		lem_init_locked(adapter);	/* also enables intr */
+	}
+
+#ifndef EM_LEGACY_IRQ
+	taskqueue_unblock(adapter->tq); // XXX do we need this ?
+#endif /* !EM_LEGACY_IRQ */
+
+	return (error);
+}
+
+
+/*
+ * Reconcile kernel and user view of the transmit ring.
  */
 static int
 lem_netmap_txsync(void *a, u_int ring_nr, int do_lock)
@@ -103,13 +160,13 @@ lem_netmap_txsync(void *a, u_int ring_nr
 	struct netmap_adapter *na = NA(adapter->ifp);
 	struct netmap_kring *kring = &na->tx_rings[0];
 	struct netmap_ring *ring = kring->ring;
-	int j, k, n, lim = kring->nkr_num_slots - 1;
+	int j, k, l, n = 0, lim = kring->nkr_num_slots - 1;
 
 	/* generate an interrupt approximately every half ring */
 	int report_frequency = kring->nkr_num_slots >> 1;
 
 	k = ring->cur;
-	if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+	if (k > lim)
 		return netmap_ring_reinit(kring);
 
 	if (do_lock)
@@ -117,33 +174,18 @@ lem_netmap_txsync(void *a, u_int ring_nr
 	bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
 			BUS_DMASYNC_POSTREAD);
 
-	/* record completed transmissions TODO
-	 *
-	 * instead of using TDH, we could read the transmitted status bit.
-	 */
-	j = E1000_READ_REG(&adapter->hw, E1000_TDH(0));
-	if (j >= kring->nkr_num_slots) { /* can it happen ? */
-		D("bad TDH %d", j);
-		j -= kring->nkr_num_slots;
-	}
-	int delta = j - adapter->next_tx_to_clean;
-	if (delta) {
-		if (delta < 0)
-			delta += kring->nkr_num_slots;
-		adapter->next_tx_to_clean = j;
-		kring->nr_hwavail += delta;
-	}
-
 	/* update avail to what the hardware knows */
 	ring->avail = kring->nr_hwavail;
 
-	j = kring->nr_hwcur;
+	j = kring->nr_hwcur; /* points into the netmap ring */
 	if (j != k) {	/* we have new packets to send */
-		n = 0;
+		l = j - kring->nkr_hwofs; /* points into the NIC ring */
+		if (l < 0)
+			l += lim + 1;
 		while (j != k) {
 			struct netmap_slot *slot = &ring->slot[j];
-			struct e1000_tx_desc *curr = &adapter->tx_desc_base[j];
-			struct em_buffer *txbuf = &adapter->tx_buffer_area[j];
+			struct e1000_tx_desc *curr = &adapter->tx_desc_base[l];
+			struct em_buffer *txbuf = &adapter->tx_buffer_area[l];
 			void *addr = NMB(slot);
 			int flags = ((slot->flags & NS_REPORT) ||
 				j == 0 || j == report_frequency) ?
@@ -156,34 +198,54 @@ lem_netmap_txsync(void *a, u_int ring_nr
 				return netmap_ring_reinit(kring);
 			}
 
+			slot->flags &= ~NS_REPORT;
 			curr->upper.data = 0;
-			/* always interrupt. XXX make it conditional */
 			curr->lower.data =
 			    htole32( adapter->txd_cmd | len |
 				(E1000_TXD_CMD_EOP | flags) );

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


