Skip site navigation (1) | Skip section navigation (2)
Date:      Wed, 30 Jan 2013 04:30:36 +0000 (UTC)
From:      Neel Natu <neel@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r246109 - head/usr.sbin/bhyve
Message-ID:  <201301300430.r0U4UaQS086091@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: neel
Date: Wed Jan 30 04:30:36 2013
New Revision: 246109
URL: http://svnweb.freebsd.org/changeset/base/246109

Log:
  Add support for MSI-X interrupts in the virtio network device and make that
  the default.
  
  The previous behavior of advertising a single MSI vector can be requested by
  setting the environment variable "BHYVE_USE_MSI" to "yes" (the comparison is
  case-insensitive). The use of MSI is not compliant with the virtio
  specification and will eventually be phased out.
  
  Submitted by:	Gopakumar T
  Obtained from:	NetApp

Modified:
  head/usr.sbin/bhyve/pci_emul.c
  head/usr.sbin/bhyve/pci_emul.h
  head/usr.sbin/bhyve/pci_virtio_net.c
  head/usr.sbin/bhyve/virtio.h

Modified: head/usr.sbin/bhyve/pci_emul.c
==============================================================================
--- head/usr.sbin/bhyve/pci_emul.c	Wed Jan 30 04:09:09 2013	(r246108)
+++ head/usr.sbin/bhyve/pci_emul.c	Wed Jan 30 04:30:36 2013	(r246109)
@@ -167,6 +167,94 @@ pci_parse_slot(char *opt, int legacy)
 }
 
 static int
+pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset)
+{
+
+	if (offset < pi->pi_msix.pba_offset)
+		return (0);
+
+	if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
+		return (0);
+	}
+
+	return (1);
+}
+
+int
+pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
+		     uint64_t value)
+{
+	int msix_entry_offset;
+	int tab_index;
+	char *dest;
+
+	/* support only 4 or 8 byte writes */
+	if (size != 4 && size != 8)
+		return (-1);
+
+	/*
+	 * Return if table index is beyond what device supports
+	 */
+	tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
+	if (tab_index >= pi->pi_msix.table_count)
+		return (-1);
+
+	msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+	/* support only aligned writes */
+	if ((msix_entry_offset % size) != 0)
+		return (-1);
+
+	dest = (char *)(pi->pi_msix.table + tab_index);
+	dest += msix_entry_offset;
+
+	if (size == 4)
+		*((uint32_t *)dest) = value;
+	else
+		*((uint64_t *)dest) = value;
+
+	return (0);
+}
+
+uint64_t
+pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size)
+{
+	char *dest;
+	int msix_entry_offset;
+	int tab_index;
+	uint64_t retval = ~0;
+
+	/* support only 4 or 8 byte reads */
+	if (size != 4 && size != 8)
+		return (retval);
+
+	msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+	/* support only aligned reads */
+	if ((msix_entry_offset % size) != 0) {
+		return (retval);
+	}
+
+	tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
+
+	if (tab_index < pi->pi_msix.table_count) {
+		/* valid MSI-X Table access */
+		dest = (char *)(pi->pi_msix.table + tab_index);
+		dest += msix_entry_offset;
+
+		if (size == 4)
+			retval = *((uint32_t *)dest);
+		else
+			retval = *((uint64_t *)dest);
+	} else if (pci_valid_pba_offset(pi, offset)) {
+		/* return 0 for PBA access */
+		retval = 0;
+	}
+
+	return (retval);
+}
+
+static int
 pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
 		    uint32_t *eax, void *arg)
 {
@@ -178,8 +266,7 @@ pci_emul_io_handler(struct vmctx *ctx, i
 	for (i = 0; i <= PCI_BARMAX; i++) {
 		if (pdi->pi_bar[i].type == PCIBAR_IO &&
 		    port >= pdi->pi_bar[i].addr &&
-		    port + bytes <=
-		        pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
+		    port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
 			offset = port - pdi->pi_bar[i].addr;
 			if (in)
 				*eax = (*pe->pe_barread)(ctx, vcpu, pdi, i,
@@ -484,13 +571,95 @@ pci_emul_add_msicap(struct pci_devinst *
 	return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
 }
 
+static void
+pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum,
+		     uint32_t msix_tab_size, int nextptr)
+{
+	CTASSERT(sizeof(struct msixcap) == 12);
+
+	assert(msix_tab_size % 4096 == 0);
+
+	bzero(msixcap, sizeof(struct msixcap));
+	msixcap->capid = PCIY_MSIX;
+	msixcap->nextptr = nextptr;
+
+	/*
+	 * Message Control Register, all fields set to
+	 * zero except for the Table Size.
+	 * Note: Table size N is encoded as N-1
+	 */
+	msixcap->msgctrl = msgnum - 1;
+
+	/*
+	 * MSI-X BAR setup:
+	 * - MSI-X table start at offset 0
+	 * - PBA table starts at a 4K aligned offset after the MSI-X table
+	 */
+	msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK;
+	msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK);
+}
+
+static void
+pci_msix_table_init(struct pci_devinst *pi, int table_entries)
+{
+	int i, table_size;
+
+	assert(table_entries > 0);
+	assert(table_entries <= MAX_MSIX_TABLE_ENTRIES);
+
+	table_size = table_entries * MSIX_TABLE_ENTRY_SIZE;
+	pi->pi_msix.table = malloc(table_size);
+	bzero(pi->pi_msix.table, table_size);
+
+	/* set mask bit of vector control register */
+	for (i = 0; i < table_entries; i++)
+		pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK;
+}
+
+int
+pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum)
+{
+	uint16_t pba_index;
+	uint32_t tab_size;
+	struct msixcap msixcap;
+
+	assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES);
+	assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0);
+	
+	tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE;
+
+	/* Align table size to nearest 4K */
+	tab_size = roundup2(tab_size, 4096);
+
+	pi->pi_msix.table_bar = barnum;
+	pi->pi_msix.pba_bar   = barnum;
+	pi->pi_msix.table_offset = 0;
+	pi->pi_msix.table_count = msgnum;
+	pi->pi_msix.pba_offset = tab_size;
+
+	/* calculate the MMIO size required for MSI-X PBA */
+	pba_index = (msgnum - 1) / (PBA_TABLE_ENTRY_SIZE * 8);
+	pi->pi_msix.pba_size = (pba_index + 1) * PBA_TABLE_ENTRY_SIZE;
+
+	pci_msix_table_init(pi, msgnum);
+
+	pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size, 0);
+
+	/* allocate memory for MSI-X Table and PBA */
+	pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32,
+				tab_size + pi->pi_msix.pba_size);
+
+	return (pci_emul_add_capability(pi, (u_char *)&msixcap,
+					sizeof(msixcap)));
+}
+
 void
 msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
 		 int bytes, uint32_t val)
 {
 	uint16_t msgctrl, rwmask;
 	int off, table_bar;
-        
+	
 	off = offset - capoff;
 	table_bar = pi->pi_msix.table_bar;
 	/* Message Control Register */
@@ -502,6 +671,7 @@ msixcap_cfgwrite(struct pci_devinst *pi,
 		val = msgctrl;
 
 		pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
+		pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK;
 	} 
 	
 	CFGWRITE(pi, offset, val, bytes);
@@ -589,6 +759,9 @@ pci_emul_capwrite(struct pci_devinst *pi
 	case PCIY_MSI:
 		msicap_cfgwrite(pi, capoff, offset, bytes, val);
 		break;
+	case PCIY_MSIX:
+		msixcap_cfgwrite(pi, capoff, offset, bytes, val);
+		break;
 	default:
 		break;
 	}
@@ -668,6 +841,35 @@ pci_msi_msgnum(struct pci_devinst *pi)
 		return (0);
 }
 
+int
+pci_msix_enabled(struct pci_devinst *pi)
+{
+
+	return (pi->pi_msix.enabled && !pi->pi_msi.enabled);
+}
+
+void
+pci_generate_msix(struct pci_devinst *pi, int index)
+{
+	struct msix_table_entry *mte;
+
+	if (!pci_msix_enabled(pi))
+		return;
+
+	if (pi->pi_msix.function_mask)
+		return;
+
+	if (index >= pi->pi_msix.table_count)
+		return;
+
+	mte = &pi->pi_msix.table[index];
+	if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+		/* XXX Set PBA bit if interrupt is disabled */
+		vm_lapic_irq(pi->pi_vmctx,
+			     (mte->addr >> 12) & 0xff, mte->msg_data & 0xff);
+	}
+}
+
 void
 pci_generate_msi(struct pci_devinst *pi, int msg)
 {

Modified: head/usr.sbin/bhyve/pci_emul.h
==============================================================================
--- head/usr.sbin/bhyve/pci_emul.h	Wed Jan 30 04:09:09 2013	(r246108)
+++ head/usr.sbin/bhyve/pci_emul.h	Wed Jan 30 04:30:36 2013	(r246109)
@@ -96,6 +96,8 @@ struct msix_table_entry {
  * for the size that should be emulated.
  */
 #define	MSIX_TABLE_ENTRY_SIZE	16
+#define MAX_MSIX_TABLE_ENTRIES	2048
+#define PBA_TABLE_ENTRY_SIZE	8
 
 struct pci_devinst {
 	struct pci_devemu *pi_d;
@@ -120,6 +122,8 @@ struct pci_devinst {
 		size_t	table_offset;
 		int	table_count;
 		size_t	pba_offset;
+		size_t	pba_size;
+		int	function_mask; 	
 		struct msix_table_entry *table;	/* allocated at runtime */
 	} pi_msix;
 
@@ -168,6 +172,10 @@ int	pci_msix_enabled(struct pci_devinst 
 int	pci_msi_msgnum(struct pci_devinst *pi);
 void	pci_parse_slot(char *opt, int legacy);
 void	pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
+int	pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum);
+int	pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
+			     uint64_t value);
+uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
 
 static __inline void 
 pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)

Modified: head/usr.sbin/bhyve/pci_virtio_net.c
==============================================================================
--- head/usr.sbin/bhyve/pci_virtio_net.c	Wed Jan 30 04:09:09 2013	(r246108)
+++ head/usr.sbin/bhyve/pci_virtio_net.c	Wed Jan 30 04:30:36 2013	(r246109)
@@ -59,17 +59,17 @@ __FBSDID("$FreeBSD$");
 /*
  * PCI config-space register offsets
  */
-#define VTNET_R_CFG0	       20
-#define VTNET_R_CFG1	       21
-#define VTNET_R_CFG2	       22
-#define VTNET_R_CFG3	       23
-#define VTNET_R_CFG4	       24
-#define VTNET_R_CFG5	       25
-#define VTNET_R_CFG6	       26
-#define VTNET_R_CFG7	       27
-#define VTNET_R_MAX	       27
+#define VTNET_R_CFG0	24
+#define VTNET_R_CFG1	25
+#define VTNET_R_CFG2	26
+#define VTNET_R_CFG3	27
+#define VTNET_R_CFG4	28
+#define VTNET_R_CFG5	29
+#define VTNET_R_CFG6	30
+#define VTNET_R_CFG7	31
+#define VTNET_R_MAX	31
 
-#define VTNET_REGSZ		VTNET_R_MAX+1
+#define VTNET_REGSZ	VTNET_R_MAX+1
 
 /*
  * Host capabilities
@@ -88,6 +88,8 @@ __FBSDID("$FreeBSD$");
 
 #define VTNET_MAXQ	3
 
+static int use_msix = 1;
+
 struct vring_hqueue {
 	/* Internal state */
 	uint16_t	hq_size;
@@ -144,9 +146,24 @@ struct pci_vtnet_softc {
 
 	uint64_t	vsc_pfn[VTNET_MAXQ];
 	struct	vring_hqueue vsc_hq[VTNET_MAXQ];
+	uint16_t	vsc_msix_table_idx[VTNET_MAXQ];
 };
 
 /*
+ * Return the size of IO BAR that maps virtio header and device specific
+ * region. The size would vary depending on whether MSI-X is enabled or
+ * not.
+ */
+static uint64_t
+pci_vtnet_iosize(struct pci_devinst *pi)
+{
+	if (pci_msix_enabled(pi))
+		return (VTNET_REGSZ);
+	else
+		return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
+}
+
+/*
  * Return the number of available descriptors in the vring taking care
  * of the 16-bit index wraparound.
  */
@@ -344,8 +361,13 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc 
 	hq->hq_cur_aidx = aidx;
 
 	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
-		sc->vsc_isr |= 1;
-		pci_generate_msi(sc->vsc_pi, 0);
+		if (use_msix) {
+			pci_generate_msix(sc->vsc_pi,
+					  sc->vsc_msix_table_idx[VTNET_RXQ]);
+		} else {
+			sc->vsc_isr |= 1;
+			pci_generate_msi(sc->vsc_pi, 0);
+		}
 	}
 }
 
@@ -438,8 +460,13 @@ pci_vtnet_proctx(struct pci_vtnet_softc 
 	 * Generate an interrupt if able
 	 */
 	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
-		sc->vsc_isr |= 1;
-		pci_generate_msi(sc->vsc_pi, 0);
+		if (use_msix) {
+			pci_generate_msix(sc->vsc_pi,
+					  sc->vsc_msix_table_idx[VTNET_TXQ]);
+		} else {
+			sc->vsc_isr |= 1;
+			pci_generate_msi(sc->vsc_pi, 0);
+		}
 	}	
 }
 
@@ -512,6 +539,7 @@ pci_vtnet_init(struct vmctx *ctx, struct
 	unsigned char digest[16];
 	char nstr[80];
 	struct pci_vtnet_softc *sc;
+	const char *env_msi;
 
 	/*
 	 * Access to guest memory is required. Fail if
@@ -527,6 +555,14 @@ pci_vtnet_init(struct vmctx *ctx, struct
 	sc->vsc_pi = pi;
 
 	pthread_mutex_init(&sc->vsc_mtx, NULL);
+ 
+	/*
+	 * Use MSI if set by user
+	 */
+	if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
+		if (strcasecmp(env_msi, "yes") == 0)
+			use_msix = 0;
+	}
 
 	/*
 	 * Attempt to open the tap device
@@ -594,7 +630,24 @@ pci_vtnet_init(struct vmctx *ctx, struct
 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
-	pci_emul_add_msicap(pi, 1);
+	
+	if (use_msix) {
+		/* MSI-X support */
+		int i;
+
+		for (i = 0; i < VTNET_MAXQ; i++)
+			sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
+
+		/*
+		 * BAR 1 used to map MSI-X table and PBA
+		 */
+		if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1))
+			return (1);
+	} else {
+		/* MSI support */
+		pci_emul_add_msicap(pi, 1);
+	}
+	
 	pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
 
 	return (0);
@@ -609,6 +662,21 @@ static void (*pci_vtnet_qnotify[VTNET_MA
 	pci_vtnet_ping_ctlq
 };
 
+static uint64_t
+vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset)
+{
+	/*
+	 * Device specific offsets used by guest would change based on
+	 * whether MSI-X capability is enabled or not
+	 */
+	if (!pci_msix_enabled(pi)) {
+		if (offset >= VTCFG_R_MSIX)
+			return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
+	}
+
+	return (offset);
+}
+
 static void
 pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 		int baridx, uint64_t offset, int size, uint64_t value)
@@ -616,9 +684,17 @@ pci_vtnet_write(struct vmctx *ctx, int v
 	struct pci_vtnet_softc *sc = pi->pi_arg;
 	void *ptr;
 
+	if (use_msix) {
+		if (baridx == pi->pi_msix.table_bar ||
+		    baridx == pi->pi_msix.pba_bar) {
+			pci_emul_msix_twrite(pi, offset, size, value);
+			return;
+		}
+	}
+
 	assert(baridx == 0);
 
-	if (offset + size > VTNET_REGSZ) {
+	if (offset + size > pci_vtnet_iosize(pi)) {
 		DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
 			 offset, size));
 		return;
@@ -626,6 +702,8 @@ pci_vtnet_write(struct vmctx *ctx, int v
 
 	pthread_mutex_lock(&sc->vsc_mtx);
 
+	offset = vtnet_adjust_offset(pi, offset);
+
 	switch (offset) {
 	case VTCFG_R_GUESTCAP:
 		assert(size == 4);
@@ -649,6 +727,15 @@ pci_vtnet_write(struct vmctx *ctx, int v
 		assert(size == 1);
 		pci_vtnet_update_status(sc, value);
 		break;
+	case VTCFG_R_CFGVEC:
+		assert(size == 2);
+		sc->vsc_msix_table_idx[VTNET_CTLQ] = value;
+		break;
+	case VTCFG_R_QVEC:
+		assert(size == 2);
+		assert(sc->vsc_curq != VTNET_CTLQ);
+		sc->vsc_msix_table_idx[sc->vsc_curq] = value;
+		break;
 	case VTNET_R_CFG0:
 	case VTNET_R_CFG1:
 	case VTNET_R_CFG2:
@@ -693,9 +780,16 @@ pci_vtnet_read(struct vmctx *ctx, int vc
 	void *ptr;
 	uint64_t value;
 
+	if (use_msix) {
+		if (baridx == pi->pi_msix.table_bar ||
+		    baridx == pi->pi_msix.pba_bar) {
+			return (pci_emul_msix_tread(pi, offset, size));
+		}
+	}
+
 	assert(baridx == 0);
 
-	if (offset + size > VTNET_REGSZ) {
+	if (offset + size > pci_vtnet_iosize(pi)) {
 		DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
 			 offset, size));
 		return (0);
@@ -703,6 +797,8 @@ pci_vtnet_read(struct vmctx *ctx, int vc
 
 	pthread_mutex_lock(&sc->vsc_mtx);
 
+	offset = vtnet_adjust_offset(pi, offset);
+
 	switch (offset) {
 	case VTCFG_R_HOSTCAP:
 		assert(size == 4);
@@ -737,21 +833,30 @@ pci_vtnet_read(struct vmctx *ctx, int vc
 		value = sc->vsc_isr;
 		sc->vsc_isr = 0;     /* a read clears this flag */
 		break;
+	case VTCFG_R_CFGVEC:
+		assert(size == 2);
+		value = sc->vsc_msix_table_idx[VTNET_CTLQ];
+		break;
+	case VTCFG_R_QVEC:
+		assert(size == 2);
+		assert(sc->vsc_curq != VTNET_CTLQ);
+		value = sc->vsc_msix_table_idx[sc->vsc_curq];
+		break;
 	case VTNET_R_CFG0:
 	case VTNET_R_CFG1:
 	case VTNET_R_CFG2:
 	case VTNET_R_CFG3:
 	case VTNET_R_CFG4:
 	case VTNET_R_CFG5:
-                assert((size + offset) <= (VTNET_R_CFG5 + 1));
-                ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
-                if (size == 1) {
-                        value = *(uint8_t *) ptr;
-                } else if (size == 2) {
-                        value = *(uint16_t *) ptr;
-                } else {
-                        value = *(uint32_t *) ptr;
-                }
+		assert((size + offset) <= (VTNET_R_CFG5 + 1));
+		ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
+		if (size == 1) {
+			value = *(uint8_t *) ptr;
+		} else if (size == 2) {
+			value = *(uint16_t *) ptr;
+		} else {
+			value = *(uint32_t *) ptr;
+		}
 		break;
 	case VTNET_R_CFG6:
 		assert(size != 4);

Modified: head/usr.sbin/bhyve/virtio.h
==============================================================================
--- head/usr.sbin/bhyve/virtio.h	Wed Jan 30 04:09:09 2013	(r246108)
+++ head/usr.sbin/bhyve/virtio.h	Wed Jan 30 04:30:36 2013	(r246109)
@@ -36,6 +36,7 @@
 #define VRING_DESC_F_INDIRECT	(1 << 2)
 
 #define VRING_AVAIL_F_NO_INTERRUPT   1
+#define VIRTIO_MSI_NO_VECTOR	0xFFFF
 
 struct virtio_desc {
 	uint64_t	vd_addr;
@@ -78,6 +79,8 @@ struct virtio_used {
 #define VTCFG_R_QNOTIFY		16
 #define VTCFG_R_STATUS		18
 #define VTCFG_R_ISR		19
+#define VTCFG_R_CFGVEC		20
+#define VTCFG_R_QVEC		22
 #define VTCFG_R_CFG0		20	/* No MSI-X */
 #define VTCFG_R_CFG1		24	/* With MSI-X */
 #define VTCFG_R_MSIX		20



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201301300430.r0U4UaQS086091>