Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 14 Nov 2016 05:37:34 +0000 (UTC)
From:      Sepherosa Ziehau <sephe@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r308628 - stable/11/sys/dev/hyperv/netvsc
Message-ID:  <201611140537.uAE5bYjS005815@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: sephe
Date: Mon Nov 14 05:37:34 2016
New Revision: 308628
URL: https://svnweb.freebsd.org/changeset/base/308628

Log:
  MFC 308117-308120
  
  308117
      hyperv/hn: Rework temporary channel packet buffer expanding.
  
      And use large default temporary channel packer buffer; we really
      don't want it to be expanded at run time.
  
      Sponsored by:   Microsoft
      Differential Revision:  https://reviews.freebsd.org/D8367
  
  308118
      hyperv/hn: Cleanup RXBUF ack processing.
  
      - Increase the # of retries.
      - Add comment.
      - Log error, if RXBUF ack fails.
      - Add stat for RXBUF ack failures.
  
      RXBUF ack really should _not_ fail...
  
      Sponsored by:   Microsoft
      Differential Revision:  https://reviews.freebsd.org/D8368
  
  308119
      hyperv/hn: Reset do_lro, if the hash types are not TCP related.
  
      Mainly because the host side only set TCPCS and IPCS even for
      UDP datagrams.
  
      Sponsored by:   Microsoft
      Differential Revision:  https://reviews.freebsd.org/D8369
  
  308120
      hyperv/hn: Don't start shared TX taskq, if the hypervisor is not Hyper-V.
  
      - Move the SYSINIT to DRIVER/SECOND, i.e. after the vm_guest becomes
        determistic.
      - Minor style changes.
  
      Sponsored by:   Microsoft
      Differential Revision:  https://reviews.freebsd.org/D8370

Modified:
  stable/11/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
  stable/11/sys/dev/hyperv/netvsc/if_hnvar.h
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
==============================================================================
--- stable/11/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c	Mon Nov 14 05:24:05 2016	(r308627)
+++ stable/11/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c	Mon Nov 14 05:37:34 2016	(r308628)
@@ -176,6 +176,8 @@ __FBSDID("$FreeBSD$");
 	 HN_RXINFO_HASHINF |		\
 	 HN_RXINFO_HASHVAL)
 
+#define HN_PKTBUF_LEN_DEF		(16 * 1024)
+
 struct hn_txdesc {
 #ifndef HN_USE_TXDESC_BUFRING
 	SLIST_ENTRY(hn_txdesc) link;
@@ -412,7 +414,8 @@ static void hn_nvs_handle_comp(struct hn
 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr,
 		struct vmbus_channel *chan,
 		const struct vmbus_chanpkt_hdr *pkthdr);
-static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid);
+static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *,
+		uint64_t);
 
 static int hn_transmit(struct ifnet *, struct mbuf *);
 static void hn_xmit_qflush(struct ifnet *);
@@ -1751,6 +1754,13 @@ hn_rxpkt(struct hn_rx_ring *rxr, const v
 				rxr->hn_csum_udp++;
 		}
 
+		/*
+		 * XXX
+		 * As of this write (Oct 28th, 2016), host side will turn
+		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
+		 * the do_lro setting here is actually _not_ accurate.  We
+		 * depend on the RSS hash type check to reset do_lro.
+		 */
 		if ((info->csum_info &
 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
@@ -1825,9 +1835,16 @@ skip:
 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
 
+			/*
+			 * NOTE:
+			 * do_lro is resetted, if the hash types are not TCP
+			 * related.  See the comment in the above csum_flags
+			 * setup section.
+			 */
 			switch (type) {
 			case NDIS_HASH_IPV4:
 				hash_type = M_HASHTYPE_RSS_IPV4;
+				do_lro = 0;
 				break;
 
 			case NDIS_HASH_TCP_IPV4:
@@ -1836,10 +1853,12 @@ skip:
 
 			case NDIS_HASH_IPV6:
 				hash_type = M_HASHTYPE_RSS_IPV6;
+				do_lro = 0;
 				break;
 
 			case NDIS_HASH_IPV6_EX:
 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
+				do_lro = 0;
 				break;
 
 			case NDIS_HASH_TCP_IPV6:
@@ -2720,7 +2739,8 @@ hn_create_rx_data(struct hn_softc *sc, i
 		rxr->hn_ifp = sc->hn_ifp;
 		if (i < sc->hn_tx_ring_cnt)
 			rxr->hn_txr = &sc->hn_tx_ring[i];
-		rxr->hn_pktbuf = malloc(HN_PKTBUF_LEN, M_DEVBUF, M_WAITOK);
+		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
+		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
 		rxr->hn_rx_idx = i;
 		rxr->hn_rxbuf = sc->hn_rxbuf;
 
@@ -2763,6 +2783,11 @@ hn_create_rx_data(struct hn_softc *sc, i
 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
 				    &rxr->hn_rss_pkts,
 				    "# of packets w/ RSS info received");
+				SYSCTL_ADD_INT(ctx,
+				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
+				    &rxr->hn_pktbuf_len, 0,
+				    "Temporary channel packet buffer length");
 			}
 		}
 	}
@@ -2835,6 +2860,10 @@ hn_create_rx_data(struct hn_softc *sc, i
 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_rx_ring, hn_ack_failed),
+	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
@@ -4491,43 +4520,43 @@ hn_nvs_handle_rxbuf(struct hn_rx_ring *r
 	}
 
 	/*
-	 * Moved completion call back here so that all received 
-	 * messages (not just data messages) will trigger a response
-	 * message back to the host.
+	 * Ack the consumed RXBUF associated w/ this channel packet,
+	 * so that this RXBUF can be recycled by the hypervisor.
 	 */
-	hn_nvs_ack_rxbuf(chan, pkt->cp_hdr.cph_xactid);
+	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
 }
 
-/*
- * Net VSC on receive completion
- *
- * Send a receive completion packet to RNDIS device (ie NetVsp)
- */
 static void
-hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid)
+hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
+    uint64_t tid)
 {
 	struct hn_nvs_rndis_ack ack;
-	int retries = 0;
-	int ret = 0;
+	int retries, error;
 	
 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
 	ack.nvs_status = HN_NVS_STATUS_OK;
 
-retry_send_cmplt:
-	/* Send the completion */
-	ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
+	retries = 0;
+again:
+	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
-	if (ret == 0) {
-		/* success */
-		/* no-op */
-	} else if (ret == EAGAIN) {
-		/* no more room... wait a bit and attempt to retry 3 times */
+	if (__predict_false(error == EAGAIN)) {
+		/*
+		 * NOTE:
+		 * This should _not_ happen in real world, since the
+		 * consumption of the TX bufring from the TX path is
+		 * controlled.
+		 */
+		if (rxr->hn_ack_failed == 0)
+			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
+		rxr->hn_ack_failed++;
 		retries++;
-
-		if (retries < 4) {
+		if (retries < 10) {
 			DELAY(100);
-			goto retry_send_cmplt;
+			goto again;
 		}
+		/* RXBUF leaks! */
+		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
 	}
 }
 
@@ -4536,66 +4565,72 @@ hn_chan_callback(struct vmbus_channel *c
 {
 	struct hn_rx_ring *rxr = xrxr;
 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
-	void *buffer;
-	int bufferlen = HN_PKTBUF_LEN;
 
-	buffer = rxr->hn_pktbuf;
-	do {
-		struct vmbus_chanpkt_hdr *pkt = buffer;
-		uint32_t bytes_rxed;
-		int ret;
-
-		bytes_rxed = bufferlen;
-		ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);
-		if (ret == 0) {
-			switch (pkt->cph_type) {
-			case VMBUS_CHANPKT_TYPE_COMP:
-				hn_nvs_handle_comp(sc, chan, pkt);
-				break;
-			case VMBUS_CHANPKT_TYPE_RXBUF:
-				hn_nvs_handle_rxbuf(rxr, chan, pkt);
-				break;
-			case VMBUS_CHANPKT_TYPE_INBAND:
-				hn_nvs_handle_notify(sc, pkt);
-				break;
-			default:
-				if_printf(rxr->hn_ifp,
-				    "unknown chan pkt %u\n",
-				    pkt->cph_type);
-				break;
-			}
-		} else if (ret == ENOBUFS) {
-			/* Handle large packet */
-			if (bufferlen > HN_PKTBUF_LEN) {
-				free(buffer, M_DEVBUF);
-				buffer = NULL;
-			}
+	for (;;) {
+		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
+		int error, pktlen;
+
+		pktlen = rxr->hn_pktbuf_len;
+		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
+		if (__predict_false(error == ENOBUFS)) {
+			void *nbuf;
+			int nlen;
 
-			/* alloc new buffer */
-			buffer = malloc(bytes_rxed, M_DEVBUF, M_NOWAIT);
-			if (buffer == NULL) {
-				if_printf(rxr->hn_ifp,
-				    "hv_cb malloc buffer failed, len=%u\n",
-				    bytes_rxed);
-				bufferlen = 0;
-				break;
-			}
-			bufferlen = bytes_rxed;
-		} else {
-			/* No more packets */
+			/*
+			 * Expand channel packet buffer.
+			 *
+			 * XXX
+			 * Use M_WAITOK here, since allocation failure
+			 * is fatal.
+			 */
+			nlen = rxr->hn_pktbuf_len * 2;
+			while (nlen < pktlen)
+				nlen *= 2;
+			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
+
+			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
+			    rxr->hn_pktbuf_len, nlen);
+
+			free(rxr->hn_pktbuf, M_DEVBUF);
+			rxr->hn_pktbuf = nbuf;
+			rxr->hn_pktbuf_len = nlen;
+			/* Retry! */
+			continue;
+		} else if (__predict_false(error == EAGAIN)) {
+			/* No more channel packets; done! */
 			break;
 		}
-	} while (1);
+		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
 
-	if (bufferlen > HN_PKTBUF_LEN)
-		free(buffer, M_DEVBUF);
+		switch (pkt->cph_type) {
+		case VMBUS_CHANPKT_TYPE_COMP:
+			hn_nvs_handle_comp(sc, chan, pkt);
+			break;
+
+		case VMBUS_CHANPKT_TYPE_RXBUF:
+			hn_nvs_handle_rxbuf(rxr, chan, pkt);
+			break;
 
+		case VMBUS_CHANPKT_TYPE_INBAND:
+			hn_nvs_handle_notify(sc, pkt);
+			break;
+
+		default:
+			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
+			    pkt->cph_type);
+			break;
+		}
+	}
 	hn_chan_rollup(rxr, rxr->hn_txr);
 }
 
 static void
 hn_tx_taskq_create(void *arg __unused)
 {
+
+	if (vm_guest != VM_GUEST_HV)
+		return;
+
 	if (!hn_share_tx_taskq)
 		return;
 
@@ -4614,16 +4649,17 @@ hn_tx_taskq_create(void *arg __unused)
 		taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
 	}
 }
-SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST,
+SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
     hn_tx_taskq_create, NULL);
 
 static void
 hn_tx_taskq_destroy(void *arg __unused)
 {
+
 	if (hn_tx_taskq != NULL)
 		taskqueue_free(hn_tx_taskq);
 }
-SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST,
+SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
     hn_tx_taskq_destroy, NULL);
 
 static device_method_t netvsc_methods[] = {

Modified: stable/11/sys/dev/hyperv/netvsc/if_hnvar.h
==============================================================================
--- stable/11/sys/dev/hyperv/netvsc/if_hnvar.h	Mon Nov 14 05:24:05 2016	(r308627)
+++ stable/11/sys/dev/hyperv/netvsc/if_hnvar.h	Mon Nov 14 05:37:34 2016	(r308628)
@@ -39,8 +39,6 @@
 /* Claimed to be 12232B */
 #define HN_MTU_MAX			(9 * 1024)
 
-#define HN_PKTBUF_LEN			4096
-
 #define HN_TXBR_SIZE			(128 * PAGE_SIZE)
 #define HN_RXBR_SIZE			(128 * PAGE_SIZE)
 
@@ -63,6 +61,7 @@ struct hn_rx_ring {
 	struct ifnet	*hn_ifp;
 	struct hn_tx_ring *hn_txr;
 	void		*hn_pktbuf;
+	int		hn_pktbuf_len;
 	uint8_t		*hn_rxbuf;	/* shadow sc->hn_rxbuf */
 	int		hn_rx_idx;
 
@@ -78,6 +77,7 @@ struct hn_rx_ring {
 	u_long		hn_small_pkts;
 	u_long		hn_pkts;
 	u_long		hn_rss_pkts;
+	u_long		hn_ack_failed;
 
 	/* Rarely used stuffs */
 	struct sysctl_oid *hn_rx_sysctl_tree;



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201611140537.uAE5bYjS005815>