Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 20 Feb 2020 21:07:24 +0000 (UTC)
From:      Vincenzo Maffione <vmaffione@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r358180 - head/usr.sbin/bhyve
Message-ID:  <202002202107.01KL7Olw094786@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: vmaffione
Date: Thu Feb 20 21:07:23 2020
New Revision: 358180
URL: https://svnweb.freebsd.org/changeset/base/358180

Log:
  bhyve: enable virtio-net mergeable rx buffers for tap(4)
  
  This patch adds a new netbe_peek_recvlen() function to the net
  backend API. The new function allows the virtio-net receive code
  to know in advance how many virtio descriptor chains will be
  needed to receive the next packet. As a result, the implementation
  of the virtio-net mergeable rx buffers feature becomes efficient,
  so that we can enable it also with the tap(4) backend. For the
  tap(4) backend, a bounce buffer is introduced to implement the
  peek_recvlen() callback, which implies an additional packet copy
  on the receive datapath. In the future, it should be possible to
  remove the bounce buffer (and so the additional copy), by
  obtaining the length of the next packet from kevent data.
  
  Reviewed by:    grehan, aleksandr.fedorov@itglobal.com
  MFC after:      1 week
  Differential Revision:	https://reviews.freebsd.org/D23472

Modified:
  head/usr.sbin/bhyve/iov.c
  head/usr.sbin/bhyve/iov.h
  head/usr.sbin/bhyve/net_backends.c
  head/usr.sbin/bhyve/net_backends.h
  head/usr.sbin/bhyve/pci_virtio_net.c

Modified: head/usr.sbin/bhyve/iov.c
==============================================================================
--- head/usr.sbin/bhyve/iov.c	Thu Feb 20 19:07:29 2020	(r358179)
+++ head/usr.sbin/bhyve/iov.c	Thu Feb 20 21:07:23 2020	(r358180)
@@ -119,24 +119,25 @@ iov_to_buf(const struct iovec *iov, int niov, void **b
 }
 
 ssize_t
-buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
+buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov, int niov,
     size_t seek)
 {
 	struct iovec *diov;
-	int ndiov, i;
 	size_t off = 0, len;
+	int  i;
 
 	if (seek > 0) {
+		int ndiov;
+
 		diov = malloc(sizeof(struct iovec) * niov);
 		seek_iov(iov, niov, diov, &ndiov, seek);
-	} else {
-		diov = iov;
-		ndiov = niov;
+		iov = diov;
+		niov = ndiov;
 	}
 
-	for (i = 0; i < ndiov && off < buflen; i++) {
-		len = MIN(diov[i].iov_len, buflen - off);
-		memcpy(diov[i].iov_base, buf + off, len);
+	for (i = 0; i < niov && off < buflen; i++) {
+		len = MIN(iov[i].iov_len, buflen - off);
+		memcpy(iov[i].iov_base, buf + off, len);
 		off += len;
 	}
 

Modified: head/usr.sbin/bhyve/iov.h
==============================================================================
--- head/usr.sbin/bhyve/iov.h	Thu Feb 20 19:07:29 2020	(r358179)
+++ head/usr.sbin/bhyve/iov.h	Thu Feb 20 21:07:23 2020	(r358180)
@@ -38,7 +38,7 @@ void seek_iov(const struct iovec *iov1, int niov1, str
 void truncate_iov(struct iovec *iov, int *niov, size_t length);
 size_t count_iov(const struct iovec *iov, int niov);
 ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf);
-ssize_t buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
-    size_t seek);
+ssize_t buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov,
+    int niov, size_t seek);
 
 #endif	/* _IOV_H_ */

Modified: head/usr.sbin/bhyve/net_backends.c
==============================================================================
--- head/usr.sbin/bhyve/net_backends.c	Thu Feb 20 19:07:29 2020	(r358179)
+++ head/usr.sbin/bhyve/net_backends.c	Thu Feb 20 21:07:23 2020	(r358180)
@@ -103,6 +103,13 @@ struct net_backend {
 	    int iovcnt);
 
 	/*
+	 * Get the length of the next packet that can be received from
+	 * the backend. If no packets are currently available, this
+	 * function returns 0.
+	 */
+	ssize_t (*peek_recvlen)(struct net_backend *be);
+
+	/*
 	 * Called to receive a packet from the backend. When the function
 	 * returns a positive value 'len', the scatter-gather vector
 	 * provided by the caller contains a packet with such length.
@@ -167,6 +174,13 @@ SET_DECLARE(net_backend_set, struct net_backend);
 
 struct tap_priv {
 	struct mevent *mevp;
+	/*
+	 * A bounce buffer that allows us to implement the peek_recvlen
+	 * callback. In the future we may get the same information from
+	 * the kevent data.
+	 */
+	char bbuf[1 << 16];
+	ssize_t bbuflen;
 };
 
 static void
@@ -223,6 +237,9 @@ tap_init(struct net_backend *be, const char *devname,
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 #endif
 
+	memset(priv->bbuf, 0, sizeof(priv->bbuf));
+	priv->bbuflen = 0;
+
 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
 	if (priv->mevp == NULL) {
 		WPRINTF(("Could not register event"));
@@ -246,15 +263,56 @@ tap_send(struct net_backend *be, const struct iovec *i
 }
 
 static ssize_t
+tap_peek_recvlen(struct net_backend *be)
+{
+	struct tap_priv *priv = (struct tap_priv *)be->opaque;
+	ssize_t ret;
+
+	if (priv->bbuflen > 0) {
+		/*
+		 * We already have a packet in the bounce buffer.
+		 * Just return its length.
+		 */
+		return priv->bbuflen;
+	}
+
+	/*
+	 * Read the next packet (if any) into the bounce buffer, so
+	 * that we get to know its length and we can return that
+	 * to the caller.
+	 */
+	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
+	if (ret < 0 && errno == EWOULDBLOCK) {
+		return (0);
+	}
+
+	if (ret > 0)
+		priv->bbuflen = ret;
+
+	return (ret);
+}
+
+static ssize_t
 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
+	struct tap_priv *priv = (struct tap_priv *)be->opaque;
 	ssize_t ret;
 
-	/* Should never be called without a valid tap fd */
-	assert(be->fd != -1);
+	if (priv->bbuflen > 0) {
+		/*
+		 * A packet is available in the bounce buffer, so
+		 * we read it from there.
+		 */
+		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
+		    iov, iovcnt, 0);
 
-	ret = readv(be->fd, iov, iovcnt);
+		/* Mark the bounce buffer as empty. */
+		priv->bbuflen = 0;
 
+		return (ret);
+	}
+
+	ret = readv(be->fd, iov, iovcnt);
 	if (ret < 0 && errno == EWOULDBLOCK) {
 		return (0);
 	}
@@ -299,6 +357,7 @@ static struct net_backend tap_backend = {
 	.init = tap_init,
 	.cleanup = tap_cleanup,
 	.send = tap_send,
+	.peek_recvlen = tap_peek_recvlen,
 	.recv = tap_recv,
 	.recv_enable = tap_recv_enable,
 	.recv_disable = tap_recv_disable,
@@ -313,6 +372,7 @@ static struct net_backend vmnet_backend = {
 	.init = tap_init,
 	.cleanup = tap_cleanup,
 	.send = tap_send,
+	.peek_recvlen = tap_peek_recvlen,
 	.recv = tap_recv,
 	.recv_enable = tap_recv_enable,
 	.recv_disable = tap_recv_disable,
@@ -331,8 +391,7 @@ DATA_SET(net_backend_set, vmnet_backend);
 #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
 		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
 		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
-		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO | \
-		VIRTIO_NET_F_MRG_RXBUF)
+		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
 
 struct netmap_priv {
 	char ifname[IFNAMSIZ];
@@ -540,6 +599,26 @@ txsync:
 }
 
 static ssize_t
+netmap_peek_recvlen(struct net_backend *be)
+{
+	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+	struct netmap_ring *ring = priv->rx;
+	uint32_t head = ring->head;
+	ssize_t totlen = 0;
+
+	while (head != ring->tail) {
+		struct netmap_slot *slot = ring->slot + head;
+
+		totlen += slot->len;
+		if ((slot->flags & NS_MOREFRAG) == 0)
+			break;
+		head = nm_ring_next(ring, head);
+	}
+
+	return (totlen);
+}
+
+static ssize_t
 netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
@@ -628,6 +707,7 @@ static struct net_backend netmap_backend = {
 	.init = netmap_init,
 	.cleanup = netmap_cleanup,
 	.send = netmap_send,
+	.peek_recvlen = netmap_peek_recvlen,
 	.recv = netmap_recv,
 	.recv_enable = netmap_recv_enable,
 	.recv_disable = netmap_recv_disable,
@@ -642,6 +722,7 @@ static struct net_backend vale_backend = {
 	.init = netmap_init,
 	.cleanup = netmap_cleanup,
 	.send = netmap_send,
+	.peek_recvlen = netmap_peek_recvlen,
 	.recv = netmap_recv,
 	.recv_enable = netmap_recv_enable,
 	.recv_disable = netmap_recv_disable,
@@ -756,6 +837,13 @@ netbe_send(struct net_backend *be, const struct iovec 
 {
 
 	return (be->send(be, iov, iovcnt));
+}
+
+ssize_t
+netbe_peek_recvlen(struct net_backend *be)
+{
+
+	return (be->peek_recvlen(be));
 }
 
 /*

Modified: head/usr.sbin/bhyve/net_backends.h
==============================================================================
--- head/usr.sbin/bhyve/net_backends.h	Thu Feb 20 19:07:29 2020	(r358179)
+++ head/usr.sbin/bhyve/net_backends.h	Thu Feb 20 21:07:23 2020	(r358180)
@@ -45,6 +45,7 @@ int	 netbe_set_cap(net_backend_t *be, uint64_t cap,
              unsigned vnet_hdr_len);
 size_t	netbe_get_vnet_hdr_len(net_backend_t *be);
 ssize_t	netbe_send(net_backend_t *be, const struct iovec *iov, int iovcnt);
+ssize_t	netbe_peek_recvlen(net_backend_t *be);
 ssize_t	netbe_recv(net_backend_t *be, const struct iovec *iov, int iovcnt);
 ssize_t	netbe_rx_discard(net_backend_t *be);
 void	netbe_rx_disable(net_backend_t *be);

Modified: head/usr.sbin/bhyve/pci_virtio_net.c
==============================================================================
--- head/usr.sbin/bhyve/pci_virtio_net.c	Thu Feb 20 19:07:29 2020	(r358179)
+++ head/usr.sbin/bhyve/pci_virtio_net.c	Thu Feb 20 21:07:23 2020	(r358180)
@@ -228,22 +228,34 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
 	struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS];
 	struct iovec iov[VTNET_MAXSEGS + 1];
 	struct vqueue_info *vq;
-	uint32_t riov_bytes;
-	struct iovec *riov;
-	int riov_len;
-	uint32_t ulen;
-	int n_chains;
-	int len;
 
 	vq = &sc->vsc_queues[VTNET_RXQ];
 	for (;;) {
 		struct virtio_net_rxhdr *hdr;
+		uint32_t riov_bytes;
+		struct iovec *riov;
+		uint32_t ulen;
+		int riov_len;
+		int n_chains;
+		ssize_t rlen;
+		ssize_t plen;
 
+		plen = netbe_peek_recvlen(sc->vsc_be);
+		if (plen <= 0) {
+			/*
+			 * No more packets (plen == 0), or backend errored
+			 * (plen < 0). Interrupt if needed and stop.
+			 */
+			vq_endchains(vq, /*used_all_avail=*/0);
+			return;
+		}
+		plen += prepend_hdr_len;
+
 		/*
 		 * Get a descriptor chain to store the next ingress
 		 * packet. In case of mergeable rx buffers, get as
 		 * many chains as necessary in order to make room
-		 * for a maximum sized LRO packet.
+		 * for plen bytes.
 		 */
 		riov_bytes = 0;
 		riov_len = 0;
@@ -287,8 +299,7 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
 			riov_bytes += info[n_chains].len;
 			riov += n;
 			n_chains++;
-		} while (riov_bytes < VTNET_MAX_PKT_LEN &&
-			    riov_len < VTNET_MAXSEGS);
+		} while (riov_bytes < plen && riov_len < VTNET_MAXSEGS);
 
 		riov = iov;
 		hdr = riov[0].iov_base;
@@ -312,21 +323,20 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
 			memset(hdr, 0, prepend_hdr_len);
 		}
 
-		len = netbe_recv(sc->vsc_be, riov, riov_len);
-
-		if (len <= 0) {
+		rlen = netbe_recv(sc->vsc_be, riov, riov_len);
+		if (rlen != plen - prepend_hdr_len) {
 			/*
-			 * No more packets (len == 0), or backend errored
-			 * (err < 0). Return unused available buffers
-			 * and stop.
+			 * If this happens it means there is something
+			 * wrong with the backend (e.g., some other
+			 * process is stealing our packets).
 			 */
+			WPRINTF(("netbe_recv: expected %zd bytes, "
+				"got %zd", plen - prepend_hdr_len, rlen));
 			vq_retchains(vq, n_chains);
-			/* Interrupt if needed/appropriate and stop. */
-			vq_endchains(vq, /*used_all_avail=*/0);
-			return;
+			continue;
 		}
 
-		ulen = (uint32_t)(len + prepend_hdr_len);
+		ulen = (uint32_t)plen;
 
 		/*
 		 * Publish the used buffers to the guest, reporting the
@@ -346,12 +356,11 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
 				vq_relchain_prepare(vq, info[i].idx, iolen);
 				ulen -= iolen;
 				i++;
-				assert(i <= n_chains);
 			} while (ulen > 0);
 
 			hdr->vrh_bufs = i;
 			vq_relchain_publish(vq);
-			vq_retchains(vq, n_chains - i);
+			assert(i == n_chains);
 		}
 	}
 
@@ -592,7 +601,8 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *
 			free(sc);
 			return (err);
 		}
-		sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be);
+		sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF |
+		    netbe_get_cap(sc->vsc_be);
 	}
 
 	if (!mac_provided) {



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202002202107.01KL7Olw094786>