Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 16 Aug 2014 15:00:01 +0000 (UTC)
From:      Luigi Rizzo <luigi@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r270063 - in head: sys/conf sys/dev/cxgbe sys/dev/e1000 sys/dev/ixgbe sys/dev/netmap sys/dev/virtio/network sys/net tools/tools/netmap
Message-ID:  <201408161500.s7GF01KC085529@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: luigi
Date: Sat Aug 16 15:00:01 2014
New Revision: 270063
URL: http://svnweb.freebsd.org/changeset/base/270063

Log:
  Update to the current version of netmap.
  Mostly bugfixes or features developed in the past 6 months,
  so this is a 10.1 candidate.
  
  Basically no user API changes (some bugfixes in sys/net/netmap_user.h).
  
  In detail:
  
  1. netmap support for virtio-net, including in netmap mode.
    Under bhyve and with a netmap backend [2] we reach over 1Mpps
    with standard APIs (e.g. libpcap), and 5-8 Mpps in netmap mode.
  
  2. (kernel) add support for multiple memory allocators, so we can
    better partition physical and virtual interfaces giving access
    to separate users. The most visible effect is one additional
    argument to the various kernel functions to compute buffer
    addresses. All netmap-supported drivers are affected, but changes
    are mechanical and trivial.
  
  3. (kernel) simplify the prototype for *txsync() and *rxsync()
    driver methods. All netmap drivers affected, changes mostly mechanical.
  
  4. add support for netmap-monitor ports. Think of it as a mirroring
    port on a physical switch: a netmap monitor port replicates traffic
    present on the main port. Restrictions apply. Drive carefully.
  
  5. if_lem.c: support for various paravirtualization features,
    experimental and disabled by default.
    Most of these are described in our ANCS'13 paper [1].
    Paravirtualized support in netmap mode is new, and beats the
    numbers in the paper by a large factor (under qemu-kvm,
    we measured guest-host throughput up to 10-12 Mpps).
  
  A lot of refactoring and additional documentation in the files
  in sys/dev/netmap, but apart from #2 and #3 above, almost nothing
  of this stuff is visible to other kernel parts.
  
  Example programs in tools/tools/netmap have been updated with bugfixes
  and to support more of the existing features.
  
  This is meant to go into 10.1 so we plan an MFC before the Aug.22 deadline.
  
  A lot of this code has been contributed by my colleagues at UNIPI,
  including Giuseppe Lettieri, Vincenzo Maffione, Stefano Garzarella.
  
  MFC after:	3 days.

Added:
  head/sys/dev/netmap/if_vtnet_netmap.h   (contents, props changed)
  head/sys/dev/netmap/netmap_monitor.c   (contents, props changed)
  head/sys/net/paravirt.h   (contents, props changed)
Modified:
  head/sys/conf/files
  head/sys/dev/cxgbe/t4_netmap.c
  head/sys/dev/e1000/if_em.c
  head/sys/dev/e1000/if_igb.c
  head/sys/dev/e1000/if_lem.c
  head/sys/dev/ixgbe/ixgbe.c
  head/sys/dev/netmap/if_em_netmap.h
  head/sys/dev/netmap/if_igb_netmap.h
  head/sys/dev/netmap/if_lem_netmap.h
  head/sys/dev/netmap/if_re_netmap.h
  head/sys/dev/netmap/ixgbe_netmap.h
  head/sys/dev/netmap/netmap.c
  head/sys/dev/netmap/netmap_freebsd.c
  head/sys/dev/netmap/netmap_generic.c
  head/sys/dev/netmap/netmap_kern.h
  head/sys/dev/netmap/netmap_mbq.h
  head/sys/dev/netmap/netmap_mem2.c
  head/sys/dev/netmap/netmap_mem2.h
  head/sys/dev/netmap/netmap_offloadings.c
  head/sys/dev/netmap/netmap_pipe.c
  head/sys/dev/netmap/netmap_vale.c
  head/sys/dev/virtio/network/if_vtnet.c
  head/sys/net/netmap.h
  head/sys/net/netmap_user.h
  head/tools/tools/netmap/pkt-gen.c
  head/tools/tools/netmap/vale-ctl.c

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Sat Aug 16 14:56:11 2014	(r270062)
+++ head/sys/conf/files	Sat Aug 16 15:00:01 2014	(r270063)
@@ -1948,6 +1948,7 @@ dev/netmap/netmap_freebsd.c	optional net
 dev/netmap/netmap_generic.c	optional netmap
 dev/netmap/netmap_mbq.c		optional netmap
 dev/netmap/netmap_mem2.c	optional netmap
+dev/netmap/netmap_monitor.c	optional netmap
 dev/netmap/netmap_offloadings.c	optional netmap
 dev/netmap/netmap_pipe.c	optional netmap
 dev/netmap/netmap_vale.c	optional netmap

Modified: head/sys/dev/cxgbe/t4_netmap.c
==============================================================================
--- head/sys/dev/cxgbe/t4_netmap.c	Sat Aug 16 14:56:11 2014	(r270062)
+++ head/sys/dev/cxgbe/t4_netmap.c	Sat Aug 16 15:00:01 2014	(r270063)
@@ -434,19 +434,18 @@ cxgbe_netmap_on(struct adapter *sc, stru
 
 	hwb = &sc->sge.hw_buf_info[0];
 	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
-		if (hwb->size == NETMAP_BUF_SIZE)
+		if (hwb->size == NETMAP_BUF_SIZE(na))
 			break;
 	}
 	if (i >= SGE_FLBUF_SIZES) {
 		if_printf(ifp, "no hwidx for netmap buffer size %d.\n",
-		    NETMAP_BUF_SIZE);
+		    NETMAP_BUF_SIZE(na));
 		return (ENXIO);
 	}
 	hwidx = i;
 
 	/* Must set caps before calling netmap_reset */
-	na->na_flags |= (NAF_NATIVE_ON | NAF_NETMAP_ON);
-	ifp->if_capenable |= IFCAP_NETMAP;
+	nm_set_native_flags(na);
 
 	for_each_nm_rxq(pi, i, nm_rxq) {
 		alloc_nm_rxq_hwq(pi, nm_rxq);
@@ -460,7 +459,7 @@ cxgbe_netmap_on(struct adapter *sc, stru
 		for (j = 0; j < nm_rxq->fl_sidx - 8; j++) {
 			uint64_t ba;
 
-			PNMB(&slot[j], &ba);
+			PNMB(na, &slot[j], &ba);
 			nm_rxq->fl_desc[j] = htobe64(ba | hwidx);
 		}
 		nm_rxq->fl_pidx = j;
@@ -512,8 +511,7 @@ cxgbe_netmap_off(struct adapter *sc, str
 	rc = -t4_enable_vi(sc, sc->mbox, pi->nm_viid, false, false);
 	if (rc != 0)
 		if_printf(ifp, "netmap disable_vi failed: %d\n", rc);
-	na->na_flags &= ~(NAF_NATIVE_ON | NAF_NETMAP_ON);
-	ifp->if_capenable &= ~IFCAP_NETMAP;
+	nm_clear_native_flags(na);
 
 	/*
 	 * XXXNM: We need to make sure that the tx queues are quiet and won't
@@ -669,7 +667,7 @@ cxgbe_nm_tx(struct adapter *sc, struct s
 
 		for (i = 0; i < n; i++) {
 			slot = &ring->slot[kring->nr_hwcur];
-			PNMB(slot, &ba);
+			PNMB(kring->na, slot, &ba);
 
 			cpl->ctrl0 = nm_txq->cpl_ctrl0;
 			cpl->pack = 0;
@@ -786,13 +784,13 @@ reclaim_nm_tx_desc(struct sge_nm_txq *nm
 }
 
 static int
-cxgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+cxgbe_netmap_txsync(struct netmap_kring *kring, int flags)
 {
-	struct netmap_kring *kring = &na->tx_rings[ring_nr];
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
 	struct port_info *pi = ifp->if_softc;
 	struct adapter *sc = pi->adapter;
-	struct sge_nm_txq *nm_txq = &sc->sge.nm_txq[pi->first_nm_txq + ring_nr];
+	struct sge_nm_txq *nm_txq = &sc->sge.nm_txq[pi->first_nm_txq + kring->ring_id];
 	const u_int head = kring->rhead;
 	u_int reclaimed = 0;
 	int n, d, npkt_remaining, ndesc_remaining;
@@ -851,14 +849,14 @@ cxgbe_netmap_txsync(struct netmap_adapte
 }
 
 static int
-cxgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+cxgbe_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
-	struct netmap_kring *kring = &na->rx_rings[ring_nr];
+	struct netmap_adapter *na = kring->na;
 	struct netmap_ring *ring = kring->ring;
 	struct ifnet *ifp = na->ifp;
 	struct port_info *pi = ifp->if_softc;
 	struct adapter *sc = pi->adapter;
-	struct sge_nm_rxq *nm_rxq = &sc->sge.nm_rxq[pi->first_nm_rxq + ring_nr];
+	struct sge_nm_rxq *nm_rxq = &sc->sge.nm_rxq[pi->first_nm_rxq + kring->ring_id];
 	u_int const head = nm_rxsync_prologue(kring);
 	u_int n;
 	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
@@ -891,7 +889,7 @@ cxgbe_netmap_rxsync(struct netmap_adapte
 
 		while (n > 0) {
 			for (i = 0; i < 8; i++, fl_pidx++, slot++) {
-				PNMB(slot, &ba);
+				PNMB(na, slot, &ba);
 				nm_rxq->fl_desc[fl_pidx] = htobe64(ba | hwidx);
 				slot->flags &= ~NS_BUF_CHANGED;
 				MPASS(fl_pidx <= nm_rxq->fl_sidx);

Modified: head/sys/dev/e1000/if_em.c
==============================================================================
--- head/sys/dev/e1000/if_em.c	Sat Aug 16 14:56:11 2014	(r270062)
+++ head/sys/dev/e1000/if_em.c	Sat Aug 16 15:00:01 2014	(r270063)
@@ -3340,10 +3340,10 @@ em_setup_transmit_ring(struct tx_ring *t
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + si, &paddr);
+			addr = PNMB(na, slot + si, &paddr);
 			txr->tx_base[i].buffer_addr = htole64(paddr);
 			/* reload the map for netmap mode */
-			netmap_load_map(txr->txtag, txbuf->map, addr);
+			netmap_load_map(na, txr->txtag, txbuf->map, addr);
 		}
 #endif /* DEV_NETMAP */
 
@@ -4082,8 +4082,8 @@ em_setup_receive_ring(struct rx_ring *rx
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + si, &paddr);
-			netmap_load_map(rxr->rxtag, rxbuf->map, addr);
+			addr = PNMB(na, slot + si, &paddr);
+			netmap_load_map(na, rxr->rxtag, rxbuf->map, addr);
 			/* Update descriptor */
 			rxr->rx_base[j].buffer_addr = htole64(paddr);
 			continue;

Modified: head/sys/dev/e1000/if_igb.c
==============================================================================
--- head/sys/dev/e1000/if_igb.c	Sat Aug 16 14:56:11 2014	(r270062)
+++ head/sys/dev/e1000/if_igb.c	Sat Aug 16 15:00:01 2014	(r270063)
@@ -3629,7 +3629,7 @@ igb_setup_transmit_ring(struct tx_ring *
 		if (slot) {
 			int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
 			/* no need to set the address */
-			netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+			netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
 		}
 #endif /* DEV_NETMAP */
 		/* clear the watch index */
@@ -4433,8 +4433,8 @@ igb_setup_receive_ring(struct rx_ring *r
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + sj, &paddr);
-			netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+			addr = PNMB(na, slot + sj, &paddr);
+			netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
 			/* Update descriptor */
 			rxr->rx_base[j].read.pkt_addr = htole64(paddr);
 			continue;

Modified: head/sys/dev/e1000/if_lem.c
==============================================================================
--- head/sys/dev/e1000/if_lem.c	Sat Aug 16 14:56:11 2014	(r270062)
+++ head/sys/dev/e1000/if_lem.c	Sat Aug 16 15:00:01 2014	(r270063)
@@ -32,6 +32,15 @@
 ******************************************************************************/
 /*$FreeBSD$*/
 
+/*
+ * Uncomment the following extensions for better performance in a VM,
+ * especially if you have support in the hypervisor.
+ * See http://info.iet.unipi.it/~luigi/netmap/
+ */
+// #define BATCH_DISPATCH
+// #define NIC_SEND_COMBINING
+// #define NIC_PARAVIRT	/* enable virtio-like synchronization */
+
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
@@ -291,6 +300,10 @@ static int lem_tx_int_delay_dflt = EM_TI
 static int lem_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR);
 static int lem_tx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_TADV);
 static int lem_rx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_RADV);
+/*
+ * increase lem_rxd and lem_txd to at least 2048 in netmap mode
+ * for better performance.
+ */
 static int lem_rxd = EM_DEFAULT_RXD;
 static int lem_txd = EM_DEFAULT_TXD;
 static int lem_smart_pwr_down = FALSE;
@@ -460,6 +473,20 @@ lem_attach(device_t dev)
 	    "max number of rx packets to process", &adapter->rx_process_limit,
 	    lem_rx_process_limit);
 
+#ifdef NIC_SEND_COMBINING
+	/* Sysctls to control mitigation */
+	lem_add_rx_process_limit(adapter, "sc_enable",
+	    "driver TDT mitigation", &adapter->sc_enable, 0);
+#endif /* NIC_SEND_COMBINING */
+#ifdef BATCH_DISPATCH
+	lem_add_rx_process_limit(adapter, "batch_enable",
+	    "driver rx batch", &adapter->batch_enable, 0);
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+	lem_add_rx_process_limit(adapter, "rx_retries",
+	    "driver rx retries", &adapter->rx_retries, 0);
+#endif /* NIC_PARAVIRT */
+
         /* Sysctl for setting the interface flow control */
 	lem_set_flow_cntrl(adapter, "flow_control",
 	    "flow control setting",
@@ -517,6 +544,49 @@ lem_attach(device_t dev)
 	 */
 	adapter->hw.mac.report_tx_early = 1;
 
+#ifdef NIC_PARAVIRT
+	device_printf(dev, "driver supports paravirt, subdev 0x%x\n",
+		adapter->hw.subsystem_device_id);
+	if (adapter->hw.subsystem_device_id == E1000_PARA_SUBDEV) {
+		uint64_t bus_addr;
+
+		device_printf(dev, "paravirt support on dev %p\n", adapter);
+		tsize = 4096; // XXX one page for the csb
+		if (lem_dma_malloc(adapter, tsize, &adapter->csb_mem, BUS_DMA_NOWAIT)) {
+			device_printf(dev, "Unable to allocate csb memory\n");
+			error = ENOMEM;
+			goto err_csb;
+		}
+		/* Setup the Base of the CSB */
+		adapter->csb = (struct paravirt_csb *)adapter->csb_mem.dma_vaddr;
+		/* force the first kick */
+		adapter->csb->host_need_txkick = 1; /* txring empty */
+		adapter->csb->guest_need_rxkick = 1; /* no rx packets */
+		bus_addr = adapter->csb_mem.dma_paddr;
+		lem_add_rx_process_limit(adapter, "csb_on",
+		    "enable paravirt.", &adapter->csb->guest_csb_on, 0);
+		lem_add_rx_process_limit(adapter, "txc_lim",
+		    "txc_lim", &adapter->csb->host_txcycles_lim, 1);
+
+		/* some stats */
+#define PA_SC(name, var, val)		\
+	lem_add_rx_process_limit(adapter, name, name, var, val)
+		PA_SC("host_need_txkick",&adapter->csb->host_need_txkick, 1);
+		PA_SC("host_rxkick_at",&adapter->csb->host_rxkick_at, ~0);
+		PA_SC("guest_need_txkick",&adapter->csb->guest_need_txkick, 0);
+		PA_SC("guest_need_rxkick",&adapter->csb->guest_need_rxkick, 1);
+		PA_SC("tdt_reg_count",&adapter->tdt_reg_count, 0);
+		PA_SC("tdt_csb_count",&adapter->tdt_csb_count, 0);
+		PA_SC("tdt_int_count",&adapter->tdt_int_count, 0);
+		PA_SC("guest_need_kick_count",&adapter->guest_need_kick_count, 0);
+		/* tell the host where the block is */
+		E1000_WRITE_REG(&adapter->hw, E1000_CSBAH,
+			(u32)(bus_addr >> 32));
+		E1000_WRITE_REG(&adapter->hw, E1000_CSBAL,
+			(u32)bus_addr);
+	}
+#endif /* NIC_PARAVIRT */
+
 	tsize = roundup2(adapter->num_tx_desc * sizeof(struct e1000_tx_desc),
 	    EM_DBA_ALIGN);
 
@@ -675,6 +745,11 @@ err_hw_init:
 err_rx_desc:
 	lem_dma_free(adapter, &adapter->txdma);
 err_tx_desc:
+#ifdef NIC_PARAVIRT
+	lem_dma_free(adapter, &adapter->csb_mem);
+err_csb:
+#endif /* NIC_PARAVIRT */
+
 err_pci:
 	if (adapter->ifp != (void *)NULL)
 		if_free_drv(adapter->ifp);
@@ -762,6 +837,12 @@ lem_detach(device_t dev)
 		adapter->rx_desc_base = NULL;
 	}
 
+#ifdef NIC_PARAVIRT
+	if (adapter->csb) {
+		lem_dma_free(adapter, &adapter->csb_mem);
+		adapter->csb = NULL;
+	}
+#endif /* NIC_PARAVIRT */
 	lem_release_hw_control(adapter);
 	free(adapter->mta, M_DEVBUF);
 	EM_TX_LOCK_DESTROY(adapter);
@@ -871,6 +952,16 @@ lem_start_locked(if_t ifp)
 	}
 	if (adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD)
 		if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
+#ifdef NIC_PARAVIRT
+	if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE && adapter->csb &&
+	    adapter->csb->guest_csb_on &&
+	    !(adapter->csb->guest_need_txkick & 1))  {
+		adapter->csb->guest_need_txkick = 1;
+		adapter->guest_need_kick_count++;
+		// XXX memory barrier
+		lem_txeof(adapter); // XXX possibly clear IFF_DRV_OACTIVE
+	}
+#endif /* NIC_PARAVIRT */
 
 	return;
 }
@@ -1716,6 +1807,37 @@ lem_xmit(struct adapter *adapter, struct
 	 */
 	bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
+#ifdef NIC_PARAVIRT
+	if (adapter->csb) {
+		adapter->csb->guest_tdt = i;
+		/* XXX memory barrier ? */
+ 		if (adapter->csb->guest_csb_on &&
+		    !(adapter->csb->host_need_txkick & 1)) {
+			/* XXX maybe useless
+			 * clean the ring. maybe do it before ?
+			 * maybe a little bit of histeresys ?
+			 */
+			if (adapter->num_tx_desc_avail <= 64) {// XXX
+				lem_txeof(adapter);
+			}
+			return (0);
+		}
+	}
+#endif /* NIC_PARAVIRT */
+
+#ifdef NIC_SEND_COMBINING
+	if (adapter->sc_enable) {
+		if (adapter->shadow_tdt & MIT_PENDING_INT) {
+			/* signal intr and data pending */
+			adapter->shadow_tdt = MIT_PENDING_TDT | (i & 0xffff);
+			return (0);
+		} else {
+			adapter->shadow_tdt = MIT_PENDING_INT;
+		}
+	}
+#endif /* NIC_SEND_COMBINING */
+
 	if (adapter->hw.mac.type == e1000_82547 &&
 	    adapter->link_duplex == HALF_DUPLEX)
 		lem_82547_move_tail(adapter);
@@ -1959,6 +2081,20 @@ lem_local_timer(void *arg)
 
 	lem_smartspeed(adapter);
 
+#ifdef NIC_PARAVIRT
+	/* recover space if needed */
+	if (adapter->csb && adapter->csb->guest_csb_on &&
+	    (adapter->watchdog_check == TRUE) &&
+	    (ticks - adapter->watchdog_time > EM_WATCHDOG) &&
+	    (adapter->num_tx_desc_avail != adapter->num_tx_desc) ) {
+		lem_txeof(adapter);
+		/*
+		 * lem_txeof() normally (except when space in the queue
+		 * runs low XXX) cleans watchdog_check so that
+		 * we do not hung.
+		 */
+	}
+#endif /* NIC_PARAVIRT */
 	/*
 	 * We check the watchdog: the time since
 	 * the last TX descriptor was cleaned.
@@ -2643,10 +2779,10 @@ lem_setup_transmit_structures(struct ada
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + si, &paddr);
+			addr = PNMB(na, slot + si, &paddr);
 			adapter->tx_desc_base[i].buffer_addr = htole64(paddr);
 			/* reload the map for netmap mode */
-			netmap_load_map(adapter->txtag, tx_buffer->map, addr);
+			netmap_load_map(na, adapter->txtag, tx_buffer->map, addr);
 		}
 #endif /* DEV_NETMAP */
 		tx_buffer->next_eop = -1;
@@ -3021,6 +3157,16 @@ lem_txeof(struct adapter *adapter)
         adapter->next_tx_to_clean = first;
         adapter->num_tx_desc_avail = num_avail;
 
+#ifdef NIC_SEND_COMBINING
+	if ((adapter->shadow_tdt & MIT_PENDING_TDT) == MIT_PENDING_TDT) {
+		/* a tdt write is pending, do it */
+		E1000_WRITE_REG(&adapter->hw, E1000_TDT(0),
+			0xffff & adapter->shadow_tdt);
+		adapter->shadow_tdt = MIT_PENDING_INT;
+	} else {
+		adapter->shadow_tdt = 0; // disable
+	}
+#endif /* NIC_SEND_COMBINING */
         /*
          * If we have enough room, clear IFF_DRV_OACTIVE to
          * tell the stack that it is OK to send packets.
@@ -3028,6 +3174,12 @@ lem_txeof(struct adapter *adapter)
          */
         if (adapter->num_tx_desc_avail > EM_TX_CLEANUP_THRESHOLD) {                
                 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
+#ifdef NIC_PARAVIRT
+		if (adapter->csb) { // XXX also csb_on ?
+			adapter->csb->guest_need_txkick = 2; /* acked */
+			// XXX memory barrier
+		}
+#endif /* NIC_PARAVIRT */
                 if (adapter->num_tx_desc_avail == adapter->num_tx_desc) {
 			adapter->watchdog_check = FALSE;
 			return;
@@ -3213,8 +3365,8 @@ lem_setup_receive_structures(struct adap
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + si, &paddr);
-			netmap_load_map(adapter->rxtag, rx_buffer->map, addr);
+			addr = PNMB(na, slot + si, &paddr);
+			netmap_load_map(na, adapter->rxtag, rx_buffer->map, addr);
 			/* Update descriptor */
 			adapter->rx_desc_base[i].buffer_addr = htole64(paddr);
 			continue;
@@ -3413,7 +3565,23 @@ lem_rxeof(struct adapter *adapter, int c
 	int		i, rx_sent = 0;
 	struct e1000_rx_desc   *current_desc;
 
+#ifdef BATCH_DISPATCH
+	struct mbuf *mh = NULL, *mt = NULL;
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+	int retries = 0;
+	struct paravirt_csb* csb = adapter->csb;
+	int csb_mode = csb && csb->guest_csb_on;
+
+	//ND("clear guest_rxkick at %d", adapter->next_rx_desc_to_check);
+	if (csb_mode && csb->guest_need_rxkick)
+		csb->guest_need_rxkick = 0;
+#endif /* NIC_PARAVIRT */
 	EM_RX_LOCK(adapter);
+
+#ifdef BATCH_DISPATCH
+    batch_again:
+#endif /* BATCH_DISPATCH */
 	i = adapter->next_rx_desc_to_check;
 	current_desc = &adapter->rx_desc_base[i];
 	bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
@@ -3426,19 +3594,45 @@ lem_rxeof(struct adapter *adapter, int c
 	}
 #endif /* DEV_NETMAP */
 
+#if 1 // XXX optimization ?
 	if (!((current_desc->status) & E1000_RXD_STAT_DD)) {
 		if (done != NULL)
 			*done = rx_sent;
 		EM_RX_UNLOCK(adapter);
 		return (FALSE);
 	}
+#endif /* 0 */
 
 	while (count != 0 && if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 		struct mbuf *m = NULL;
 
 		status = current_desc->status;
-		if ((status & E1000_RXD_STAT_DD) == 0)
+		if ((status & E1000_RXD_STAT_DD) == 0) {
+#ifdef NIC_PARAVIRT
+		    if (csb_mode) {
+			/* buffer not ready yet. Retry a few times before giving up */
+			if (++retries <= adapter->rx_retries) {
+				continue;
+			}
+			if (csb->guest_need_rxkick == 0) {
+				// ND("set guest_rxkick at %d", adapter->next_rx_desc_to_check);
+				csb->guest_need_rxkick = 1;
+				// XXX memory barrier, status volatile ?
+				continue; /* double check */
+			}
+		    }
+		    /* no buffer ready, give up */
+#endif /* NIC_PARAVIRT */
 			break;
+		}
+#ifdef NIC_PARAVIRT
+		if (csb_mode) {
+			if (csb->guest_need_rxkick)
+				// ND("clear again guest_rxkick at %d", adapter->next_rx_desc_to_check);
+			csb->guest_need_rxkick = 0;
+			retries = 0;
+		}
+#endif /* NIC_PARAVIRT */
 
 		mp = adapter->rx_buffer_area[i].m_head;
 		/*
@@ -3563,11 +3757,36 @@ discard:
 		bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
+#ifdef NIC_PARAVIRT
+		if (csb_mode) {
+			/* the buffer at i has been already replaced by lem_get_buf()
+			 * so it is safe to set guest_rdt = i and possibly send a kick.
+			 * XXX see if we can optimize it later.
+			 */
+			csb->guest_rdt = i;
+			// XXX memory barrier
+			if (i == csb->host_rxkick_at)
+				E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
+		}
+#endif /* NIC_PARAVIRT */
 		/* Advance our pointers to the next descriptor. */
 		if (++i == adapter->num_rx_desc)
 			i = 0;
 		/* Call into the stack */
 		if (m != NULL) {
+#ifdef BATCH_DISPATCH
+		    if (adapter->batch_enable) {
+			if (mh == NULL)
+				mh = mt = m;
+			else
+				mt->m_nextpkt = m;
+			mt = m;
+			m->m_nextpkt = NULL;
+			rx_sent++;
+			current_desc = &adapter->rx_desc_base[i];
+			continue;
+		    }
+#endif /* BATCH_DISPATCH */
 			adapter->next_rx_desc_to_check = i;
 			EM_RX_UNLOCK(adapter);
 			if_input(ifp, m);
@@ -3578,10 +3797,27 @@ discard:
 		current_desc = &adapter->rx_desc_base[i];
 	}
 	adapter->next_rx_desc_to_check = i;
+#ifdef BATCH_DISPATCH
+	if (mh) {
+		EM_RX_UNLOCK(adapter);
+		while ( (mt = mh) != NULL) {
+			mh = mh->m_nextpkt;
+			mt->m_nextpkt = NULL;
+			if_input(ifp, mt);
+		}
+		EM_RX_LOCK(adapter);
+		i = adapter->next_rx_desc_to_check; /* in case of interrupts */
+		if (count > 0)
+			goto batch_again;
+	}
+#endif /* BATCH_DISPATCH */
 
 	/* Advance the E1000's Receive Queue #0  "Tail Pointer". */
 	if (--i < 0)
 		i = adapter->num_rx_desc - 1;
+#ifdef NIC_PARAVIRT
+	if (!csb_mode) /* filter out writes */
+#endif /* NIC_PARAVIRT */
 	E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
 	if (done != NULL)
 		*done = rx_sent;

Modified: head/sys/dev/ixgbe/ixgbe.c
==============================================================================
--- head/sys/dev/ixgbe/ixgbe.c	Sat Aug 16 14:56:11 2014	(r270062)
+++ head/sys/dev/ixgbe/ixgbe.c	Sat Aug 16 15:00:01 2014	(r270063)
@@ -3155,7 +3155,7 @@ ixgbe_setup_transmit_ring(struct tx_ring
 		 */
 		if (slot) {
 			int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
-			netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+			netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
 		}
 #endif /* DEV_NETMAP */
 		/* Clear the EOP descriptor pointer */
@@ -4098,8 +4098,8 @@ ixgbe_setup_receive_ring(struct rx_ring 
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + sj, &paddr);
-			netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+			addr = PNMB(na, slot + sj, &paddr);
+			netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
 			/* Update descriptor and the cached value */
 			rxr->rx_base[j].read.pkt_addr = htole64(paddr);
 			rxbuf->addr = htole64(paddr);

Modified: head/sys/dev/netmap/if_em_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_em_netmap.h	Sat Aug 16 14:56:11 2014	(r270062)
+++ head/sys/dev/netmap/if_em_netmap.h	Sat Aug 16 15:00:01 2014	(r270063)
@@ -113,10 +113,10 @@ em_netmap_reg(struct netmap_adapter *na,
  * Reconcile kernel and user view of the transmit ring.
  */
 static int
-em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+em_netmap_txsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->tx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -128,7 +128,7 @@ em_netmap_txsync(struct netmap_adapter *
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
-	struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+	struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
 
 	bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
 			BUS_DMASYNC_POSTREAD);
@@ -144,7 +144,7 @@ em_netmap_txsync(struct netmap_adapter *
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			u_int len = slot->len;
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			/* device-specific */
 			struct e1000_tx_desc *curr = &txr->tx_base[nic_i];
@@ -153,12 +153,12 @@ em_netmap_txsync(struct netmap_adapter *
 				nic_i == 0 || nic_i == report_frequency) ?
 				E1000_TXD_CMD_RS : 0;
 
-			NM_CHECK_ADDR_LEN(addr, len);
+			NM_CHECK_ADDR_LEN(na, addr, len);
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				curr->buffer_addr = htole64(paddr);
 				/* buffer has changed, reload map */
-				netmap_reload_map(txr->txtag, txbuf->map, addr);
+				netmap_reload_map(na, txr->txtag, txbuf->map, addr);
 			}
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 
@@ -187,7 +187,7 @@ em_netmap_txsync(struct netmap_adapter *
 	 */
 	if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
 		/* record completed transmissions using TDH */
-		nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+		nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
 		if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
 			D("TDH wrap %d", nic_i);
 			nic_i -= kring->nkr_num_slots;
@@ -208,10 +208,10 @@ em_netmap_txsync(struct netmap_adapter *
  * Reconcile kernel and user view of the receive ring.
  */
 static int
-em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+em_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->rx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -222,7 +222,7 @@ em_netmap_rxsync(struct netmap_adapter *
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
-	struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+	struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
 
 	if (head > lim)
 		return netmap_ring_reinit(kring);
@@ -271,18 +271,18 @@ em_netmap_rxsync(struct netmap_adapter *
 		for (n = 0; nm_i != head; n++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			struct e1000_rx_desc *curr = &rxr->rx_base[nic_i];
 			struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i];
 
-			if (addr == netmap_buffer_base) /* bad buf */
+			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
 				goto ring_reset;
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				curr->buffer_addr = htole64(paddr);
-				netmap_reload_map(rxr->rxtag, rxbuf->map, addr);
+				netmap_reload_map(na, rxr->rxtag, rxbuf->map, addr);
 				slot->flags &= ~NS_BUF_CHANGED;
 			}
 			curr->status = 0;

Modified: head/sys/dev/netmap/if_igb_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_igb_netmap.h	Sat Aug 16 14:56:11 2014	(r270062)
+++ head/sys/dev/netmap/if_igb_netmap.h	Sat Aug 16 15:00:01 2014	(r270063)
@@ -81,10 +81,10 @@ igb_netmap_reg(struct netmap_adapter *na
  * Reconcile kernel and user view of the transmit ring.
  */
 static int
-igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+igb_netmap_txsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->tx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -96,7 +96,7 @@ igb_netmap_txsync(struct netmap_adapter 
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
-	struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+	struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
 	/* 82575 needs the queue index added */
 	u32 olinfo_status =
 	    (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0;
@@ -115,7 +115,7 @@ igb_netmap_txsync(struct netmap_adapter 
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			u_int len = slot->len;
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			/* device-specific */
 			union e1000_adv_tx_desc *curr =
@@ -125,11 +125,11 @@ igb_netmap_txsync(struct netmap_adapter 
 				nic_i == 0 || nic_i == report_frequency) ?
 				E1000_ADVTXD_DCMD_RS : 0;
 
-			NM_CHECK_ADDR_LEN(addr, len);
+			NM_CHECK_ADDR_LEN(na, addr, len);
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
-				netmap_reload_map(txr->txtag, txbuf->map, addr);
+				netmap_reload_map(na, txr->txtag, txbuf->map, addr);
 			}
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 
@@ -171,7 +171,7 @@ igb_netmap_txsync(struct netmap_adapter 
 	 */
 	if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
 		/* record completed transmissions using TDH */
-		nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+		nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
 		if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
 			D("TDH wrap %d", nic_i);
 			nic_i -= kring->nkr_num_slots;
@@ -190,10 +190,10 @@ igb_netmap_txsync(struct netmap_adapter 
  * Reconcile kernel and user view of the receive ring.
  */
 static int
-igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+igb_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->rx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -204,7 +204,7 @@ igb_netmap_rxsync(struct netmap_adapter 
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
-	struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+	struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
 
 	if (head > lim)
 		return netmap_ring_reinit(kring);
@@ -251,17 +251,17 @@ igb_netmap_rxsync(struct netmap_adapter 
 		for (n = 0; nm_i != head; n++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i];
 			struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i];
 
-			if (addr == netmap_buffer_base) /* bad buf */
+			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
 				goto ring_reset;
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
-				netmap_reload_map(rxr->ptag, rxbuf->pmap, addr);
+				netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr);
 				slot->flags &= ~NS_BUF_CHANGED;
 			}
 			curr->wb.upper.status_error = 0;

Modified: head/sys/dev/netmap/if_lem_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_lem_netmap.h	Sat Aug 16 14:56:11 2014	(r270062)
+++ head/sys/dev/netmap/if_lem_netmap.h	Sat Aug 16 15:00:01 2014	(r270063)
@@ -39,6 +39,7 @@
 #include <vm/pmap.h>    /* vtophys ? */
 #include <dev/netmap/netmap_kern.h>
 
+extern int netmap_adaptive_io;
 
 /*
  * Register/unregister. We are already under netmap lock.
@@ -84,10 +85,10 @@ lem_netmap_reg(struct netmap_adapter *na
  * Reconcile kernel and user view of the transmit ring.
  */
 static int
-lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+lem_netmap_txsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->tx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -98,6 +99,10 @@ lem_netmap_txsync(struct netmap_adapter 
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
+#ifdef NIC_PARAVIRT
+	struct paravirt_csb *csb = adapter->csb;
+	uint64_t *csbd = (uint64_t *)(csb + 1);
+#endif /* NIC_PARAVIRT */
 
 	bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
 			BUS_DMASYNC_POSTREAD);
@@ -108,12 +113,25 @@ lem_netmap_txsync(struct netmap_adapter 
 
 	nm_i = kring->nr_hwcur;
 	if (nm_i != head) {	/* we have new packets to send */
+#ifdef NIC_PARAVIRT
+		int do_kick = 0;
+		uint64_t t = 0; // timestamp
+		int n = head - nm_i;
+		if (n < 0)
+			n += lim + 1;
+		if (csb) {
+			t = rdtsc(); /* last timestamp */
+			csbd[16] += t - csbd[0]; /* total Wg */
+			csbd[17] += n;		/* Wg count */
+			csbd[0] = t;
+		}
+#endif /* NIC_PARAVIRT */
 		nic_i = netmap_idx_k2n(kring, nm_i);
 		while (nm_i != head) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			u_int len = slot->len;
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			/* device-specific */
 			struct e1000_tx_desc *curr = &adapter->tx_desc_base[nic_i];
@@ -122,12 +140,12 @@ lem_netmap_txsync(struct netmap_adapter 
 				nic_i == 0 || nic_i == report_frequency) ?
 				E1000_TXD_CMD_RS : 0;
 
-			NM_CHECK_ADDR_LEN(addr, len);
+			NM_CHECK_ADDR_LEN(na, addr, len);
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				curr->buffer_addr = htole64(paddr);
-				netmap_reload_map(adapter->txtag, txbuf->map, addr);
+				netmap_reload_map(na, adapter->txtag, txbuf->map, addr);
 			}
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 
@@ -140,6 +158,7 @@ lem_netmap_txsync(struct netmap_adapter 
 
 			nm_i = nm_next(nm_i, lim);
 			nic_i = nm_next(nic_i, lim);
+			// XXX might try an early kick
 		}
 		kring->nr_hwcur = head;
 
@@ -147,8 +166,38 @@ lem_netmap_txsync(struct netmap_adapter 
 		bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
 			BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
+#ifdef NIC_PARAVIRT
+		/* set unconditionally, then also kick if needed */
+		if (csb) {
+			t = rdtsc();
+			if (csb->host_need_txkick == 2) {
+				/* can compute an update of delta */
+				int64_t delta = t - csbd[3];
+				if (delta < 0)
+					delta = -delta;
+				if (csbd[8] == 0 || delta < csbd[8]) {
+					csbd[8] = delta;
+					csbd[9]++;
+				}
+				csbd[10]++;
+			}
+			csb->guest_tdt = nic_i;
+			csbd[18] += t - csbd[0]; // total wp
+			csbd[19] += n;
+		}
+		if (!csb || !csb->guest_csb_on || (csb->host_need_txkick & 1))
+			do_kick = 1;
+		if (do_kick)
+#endif /* NIC_PARAVIRT */
 		/* (re)start the tx unit up to slot nic_i (excluded) */
 		E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i);
+#ifdef NIC_PARAVIRT
+		if (do_kick) {
+			uint64_t t1 = rdtsc();
+			csbd[20] += t1 - t; // total Np
+			csbd[21]++;
+		}
+#endif /* NIC_PARAVIRT */
 	}
 
 	/*
@@ -157,6 +206,93 @@ lem_netmap_txsync(struct netmap_adapter 
 	if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
 		kring->last_reclaim = ticks;
 		/* record completed transmissions using TDH */
+#ifdef NIC_PARAVIRT
+		/* host updates tdh unconditionally, and we have
+		 * no side effects on reads, so we can read from there
+		 * instead of exiting.
+		 */
+		if (csb) {
+		    static int drain = 0, nodrain=0, good = 0, bad = 0, fail = 0;
+		    u_int x = adapter->next_tx_to_clean;
+		    csbd[19]++; // XXX count reclaims
+		    nic_i = csb->host_tdh;
+		    if (csb->guest_csb_on) {
+			if (nic_i == x) {
+			    bad++;
+		    	    csbd[24]++; // failed reclaims
+			    /* no progress, request kick and retry */
+			    csb->guest_need_txkick = 1;
+			    mb(); // XXX barrier
+		    	    nic_i = csb->host_tdh;
+			} else {
+			    good++;
+			}
+			if (nic_i != x) {
+			    csb->guest_need_txkick = 2;
+			    if (nic_i == csb->guest_tdt)
+				drain++;
+			    else
+				nodrain++;
+#if 1
+			if (netmap_adaptive_io) {
+			    /* new mechanism: last half ring (or so)
+			     * released one slot at a time.
+			     * This effectively makes the system spin.
+			     *
+			     * Take next_to_clean + 1 as a reference.
+			     * tdh must be ahead or equal
+			     * On entry, the logical order is
+			     *		x < tdh = nic_i
+			     * We first push tdh up to avoid wraps.
+			     * The limit is tdh-ll (half ring).
+			     * if tdh-256 < x we report x;
+			     * else we report tdh-256
+			     */
+			    u_int tdh = nic_i;
+			    u_int ll = csbd[15];
+			    u_int delta = lim/8;
+			    if (netmap_adaptive_io == 2 || ll > delta)
+				csbd[15] = ll = delta;
+			    else if (netmap_adaptive_io == 1 && ll > 1) {
+				csbd[15]--;
+			    }
+
+			    if (nic_i >= kring->nkr_num_slots) {
+				RD(5, "bad nic_i %d on input", nic_i);
+			    }
+			    x = nm_next(x, lim);
+			    if (tdh < x)
+				tdh += lim + 1;
+			    if (tdh <= x + ll) {
+				nic_i = x;
+				csbd[25]++; //report n + 1;
+			    } else {
+				tdh = nic_i;
+				if (tdh < ll)
+				    tdh += lim + 1;
+				nic_i = tdh - ll;
+				csbd[26]++; // report tdh - ll
+			    }
+			}
+#endif
+			} else {
+			    /* we stop, count whether we are idle or not */
+			    int bh_active = csb->host_need_txkick & 2 ? 4 : 0;
+			    csbd[27+ csb->host_need_txkick]++;
+			    if (netmap_adaptive_io == 1) {
+				if (bh_active && csbd[15] > 1)
+				    csbd[15]--;
+				else if (!bh_active && csbd[15] < lim/2)
+				    csbd[15]++;
+			    }
+			    bad--;
+			    fail++;
+			}
+		    }
+		    RD(1, "drain %d nodrain %d good %d retry %d fail %d",
+			drain, nodrain, good, bad, fail);
+		} else
+#endif /* !NIC_PARAVIRT */
 		nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0));
 		if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
 			D("TDH wrap %d", nic_i);
@@ -176,10 +312,10 @@ lem_netmap_txsync(struct netmap_adapter 
  * Reconcile kernel and user view of the receive ring.
  */
 static int
-lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+lem_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->rx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -190,10 +326,21 @@ lem_netmap_rxsync(struct netmap_adapter 
 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201408161500.s7GF01KC085529>