Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 28 Apr 2012 16:28:01 +0000 (UTC)
From:      Peter Grehan <grehan@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r234761 - in projects/bhyve: lib/libvmmapi sys/amd64/include sys/amd64/vmm sys/amd64/vmm/intel sys/amd64/vmm/io usr.sbin/bhyve
Message-ID:  <201204281628.q3SGS1m4069240@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: grehan
Date: Sat Apr 28 16:28:00 2012
New Revision: 234761
URL: http://svn.freebsd.org/changeset/base/234761

Log:
  MSI-x interrupt support for PCI pass-thru devices.
  
  Includes instruction emulation for memory r/w access. This
  opens the door for io-apic, local apic, hpet timer, and
  legacy device emulation.
  
  Submitted by:	ryan dot berryhill at sandvine dot com
  Reviewed by:	grehan
  Obtained from:	Sandvine

Added:
  projects/bhyve/usr.sbin/bhyve/instruction_emul.c   (contents, props changed)
  projects/bhyve/usr.sbin/bhyve/instruction_emul.h   (contents, props changed)
Modified:
  projects/bhyve/lib/libvmmapi/vmmapi.c
  projects/bhyve/lib/libvmmapi/vmmapi.h
  projects/bhyve/sys/amd64/include/vmm.h
  projects/bhyve/sys/amd64/include/vmm_dev.h
  projects/bhyve/sys/amd64/vmm/intel/vmcs.h
  projects/bhyve/sys/amd64/vmm/intel/vmx.c
  projects/bhyve/sys/amd64/vmm/io/ppt.c
  projects/bhyve/sys/amd64/vmm/io/ppt.h
  projects/bhyve/sys/amd64/vmm/io/vlapic.c
  projects/bhyve/sys/amd64/vmm/vmm_dev.c
  projects/bhyve/usr.sbin/bhyve/Makefile
  projects/bhyve/usr.sbin/bhyve/fbsdrun.c
  projects/bhyve/usr.sbin/bhyve/pci_emul.c
  projects/bhyve/usr.sbin/bhyve/pci_emul.h
  projects/bhyve/usr.sbin/bhyve/pci_passthru.c

Modified: projects/bhyve/lib/libvmmapi/vmmapi.c
==============================================================================
--- projects/bhyve/lib/libvmmapi/vmmapi.c	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/lib/libvmmapi/vmmapi.c	Sat Apr 28 16:28:00 2012	(r234761)
@@ -454,6 +454,25 @@ vm_setup_msi(struct vmctx *ctx, int vcpu
 	return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
 }
 
+/*
+ * Issue the VM_PPTDEV_MSIX ioctl for one MSI-X vector of the passthru
+ * device at bus/slot/func.  Returns the ioctl's result.
+ */
+int
+vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+	      int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+	struct vm_pptdev_msix req;
+	int rc;
+
+	/* Clear the whole request before filling in the fields. */
+	bzero(&req, sizeof(req));
+
+	/* Which vcpu and which PCI device the request is for. */
+	req.vcpu = vcpu;
+	req.bus = bus;
+	req.slot = slot;
+	req.func = func;
+
+	/* Which vector, and the address/data/control values for it. */
+	req.idx = idx;
+	req.addr = addr;
+	req.msg = msg;
+	req.vector_control = vector_control;
+
+	rc = ioctl(ctx->fd, VM_PPTDEV_MSIX, &req);
+	return (rc);
+}
+
+
 uint64_t *
 vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
 	     int *ret_entries)

Modified: projects/bhyve/lib/libvmmapi/vmmapi.h
==============================================================================
--- projects/bhyve/lib/libvmmapi/vmmapi.h	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/lib/libvmmapi/vmmapi.h	Sat Apr 28 16:28:00 2012	(r234761)
@@ -77,6 +77,8 @@ int	vm_map_pptdev_mmio(struct vmctx *ctx
 			   vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
 int	vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
 		     int dest, int vector, int numvec);
+int	vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+		      int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
 
 /*
  * Return a pointer to the statistics buffer. Note that this is not MT-safe.

Modified: projects/bhyve/sys/amd64/include/vmm.h
==============================================================================
--- projects/bhyve/sys/amd64/include/vmm.h	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/sys/amd64/include/vmm.h	Sat Apr 28 16:28:00 2012	(r234761)
@@ -227,7 +227,8 @@ enum vm_exitcode {
 	VM_EXITCODE_HLT,
 	VM_EXITCODE_MTRAP,
 	VM_EXITCODE_PAUSE,
-	VM_EXITCODE_MAX,
+	VM_EXITCODE_PAGING,
+	VM_EXITCODE_MAX
 };
 
 struct vm_exit {
@@ -243,6 +244,9 @@ struct vm_exit {
 			uint16_t	port;
 			uint32_t	eax;		/* valid for out */
 		} inout;
+		struct {
+			uint64_t	cr3;
+		} paging;
 		/*
 		 * VMX specific payload. Used when there is no "better"
 		 * exitcode to represent the VM-exit.

Modified: projects/bhyve/sys/amd64/include/vmm_dev.h
==============================================================================
--- projects/bhyve/sys/amd64/include/vmm_dev.h	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/sys/amd64/include/vmm_dev.h	Sat Apr 28 16:28:00 2012	(r234761)
@@ -108,6 +108,17 @@ struct vm_pptdev_msi {
 	int		destcpu;
 };
 
+/* Argument structure for the VM_PPTDEV_MSIX ioctl. */
+struct vm_pptdev_msix {
+	int		vcpu;
+	int		bus;		/* bus/slot/func select the passthru device */
+	int		slot;
+	int		func;
+	int		idx;		/* index of the MSI-X vector being configured */
+	uint32_t	msg;		/* message value for the vector */
+	uint32_t	vector_control;	/* vector control word (carries the mask bit) */
+	uint64_t	addr;		/* message address for the vector */
+};
+
+
 struct vm_nmi {
 	int		cpuid;
 };
@@ -143,6 +154,7 @@ enum {
 	IOCNUM_UNBIND_PPTDEV,
 	IOCNUM_MAP_PPTDEV_MMIO,
 	IOCNUM_PPTDEV_MSI,
+	IOCNUM_PPTDEV_MSIX,
 	IOCNUM_INJECT_NMI,
 	IOCNUM_VM_STATS,
 	IOCNUM_VM_STAT_DESC,
@@ -182,6 +194,8 @@ enum {
 	_IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
 #define	VM_PPTDEV_MSI \
 	_IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
+#define	VM_PPTDEV_MSIX \
+	_IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
 #define VM_INJECT_NMI \
 	_IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
 #define	VM_STATS \

Modified: projects/bhyve/sys/amd64/vmm/intel/vmcs.h
==============================================================================
--- projects/bhyve/sys/amd64/vmm/intel/vmcs.h	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/sys/amd64/vmm/intel/vmcs.h	Sat Apr 28 16:28:00 2012	(r234761)
@@ -65,6 +65,7 @@ uint64_t vmcs_read(uint32_t encoding);
 #define	vmcs_instruction_error()	vmcs_read(VMCS_INSTRUCTION_ERROR)
 #define	vmcs_exit_reason()		(vmcs_read(VMCS_EXIT_REASON) & 0xffff)
 #define	vmcs_exit_qualification()	vmcs_read(VMCS_EXIT_QUALIFICATION)
+#define	vmcs_guest_cr3()		vmcs_read(VMCS_GUEST_CR3)
 
 #endif	/* _KERNEL */
 

Modified: projects/bhyve/sys/amd64/vmm/intel/vmx.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/intel/vmx.c	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/sys/amd64/vmm/intel/vmx.c	Sat Apr 28 16:28:00 2012	(r234761)
@@ -1185,6 +1185,10 @@ vmx_exit_process(struct vmx *vmx, int vc
 	case EXIT_REASON_CPUID:
 		handled = vmx_handle_cpuid(vcpu, vmxctx);
 		break;
+	case EXIT_REASON_EPT_FAULT:
+		vmexit->exitcode = VM_EXITCODE_PAGING;
+		vmexit->u.paging.cr3 = vmcs_guest_cr3();
+		break;
 	default:
 		break;
 	}

Modified: projects/bhyve/sys/amd64/vmm/io/ppt.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/io/ppt.c	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/sys/amd64/vmm/io/ppt.c	Sat Apr 28 16:28:00 2012	(r234761)
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
+#include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <sys/pciio.h>
@@ -56,9 +57,12 @@ __FBSDID("$FreeBSD$");
 #define	MAX_MMIOSEGS	(PCIR_MAX_BAR_0 + 1)
 #define	MAX_MSIMSGS	32
 
+MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
+
 struct pptintr_arg {				/* pptintr(pptintr_arg) */
 	struct pptdev	*pptdev;
-	int		msg;
+	int		vec;
+	int 		vcpu;
 };
 
 static struct pptdev {
@@ -75,6 +79,16 @@ static struct pptdev {
 		void	*cookie[MAX_MSIMSGS];
 		struct pptintr_arg arg[MAX_MSIMSGS];
 	} msi;
+
+	struct {
+		int num_msgs;
+		int startrid;
+		int msix_table_rid;
+		struct resource *msix_table_res;
+		struct resource **res;
+		void **cookie;
+		struct pptintr_arg *arg;
+	} msix;
 } pptdevs[32];
 
 static int num_pptdevs;
@@ -209,6 +223,57 @@ ppt_teardown_msi(struct pptdev *ppt)
 	ppt->msi.num_msgs = 0;
 }
 
+/*
+ * Undo the interrupt setup for MSI-X vector 'idx' of 'ppt': tear down the
+ * handler if a cookie is recorded, release the IRQ resource if one is held,
+ * then clear both slots.
+ */
+static void
+ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
+{
+	struct resource **resp;
+	void **cookiep;
+
+	resp = &ppt->msix.res[idx];
+	cookiep = &ppt->msix.cookie[idx];
+
+	if (*cookiep != NULL)
+		bus_teardown_intr(ppt->dev, *resp, *cookiep);
+
+	if (*resp != NULL)
+		bus_release_resource(ppt->dev, SYS_RES_IRQ,
+				     ppt->msix.startrid + idx, *resp);
+
+	*resp = NULL;
+	*cookiep = NULL;
+}
+
+
+/*
+ * Release all MSI-X state held for 'ppt': every per-vector interrupt, the
+ * MSI-X table memory resource, the per-vector bookkeeping arrays and the
+ * MSI-X allocation itself.  Does nothing when no vectors are allocated
+ * (num_msgs == 0).
+ */
+static void
+ppt_teardown_msix(struct pptdev *ppt)
+{
+	int i, error;
+
+	if (ppt->msix.num_msgs == 0)
+		return;
+
+	/* The per-vector teardown dereferences both arrays; skip if absent. */
+	if (ppt->msix.res != NULL && ppt->msix.cookie != NULL) {
+		for (i = 0; i < ppt->msix.num_msgs; i++)
+			ppt_teardown_msix_intr(ppt, i);
+	}
+
+	if (ppt->msix.msix_table_res) {
+		bus_release_resource(ppt->dev, SYS_RES_MEMORY,
+				     ppt->msix.msix_table_rid,
+				     ppt->msix.msix_table_res);
+		ppt->msix.msix_table_res = NULL;
+		ppt->msix.msix_table_rid = 0;
+	}
+
+	free(ppt->msix.res, M_PPTMSIX);
+	free(ppt->msix.cookie, M_PPTMSIX);
+	free(ppt->msix.arg, M_PPTMSIX);
+
+	/* Do not leave dangling pointers behind in the long-lived pptdev. */
+	ppt->msix.res = NULL;
+	ppt->msix.cookie = NULL;
+	ppt->msix.arg = NULL;
+
+	error = pci_release_msi(ppt->dev);
+	if (error)
+		printf("ppt_teardown_msix: Failed to release MSI-X resources (error %i)\n", error);
+
+	ppt->msix.num_msgs = 0;
+}
+
+
 int
 ppt_assign_device(struct vm *vm, int bus, int slot, int func)
 {
@@ -244,6 +309,7 @@ ppt_unassign_device(struct vm *vm, int b
 			return (EBUSY);
 		ppt_unmap_mmio(vm, ppt);
 		ppt_teardown_msi(ppt);
+		ppt_teardown_msix(ppt);
 		iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
 		ppt->vm = NULL;
 		return (0);
@@ -309,10 +375,10 @@ pptintr(void *arg)
 	
 	pptarg = arg;
 	ppt = pptarg->pptdev;
-	vec = ppt->msi.vector + pptarg->msg;
+	vec = pptarg->vec;
 
 	if (ppt->vm != NULL)
-		(void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec);
+		(void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec);
 	else {
 		/*
 		 * XXX
@@ -431,7 +497,7 @@ ppt_setup_msi(struct vm *vm, int vcpu, i
 			break;
 
 		ppt->msi.arg[i].pptdev = ppt;
-		ppt->msi.arg[i].msg = i;
+		ppt->msi.arg[i].vec = vector + i;
 
 		error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
 				       INTR_TYPE_NET | INTR_MPSAFE,
@@ -448,3 +514,110 @@ ppt_setup_msi(struct vm *vm, int vcpu, i
 
 	return (0);
 }
+
+/*
+ * Configure MSI-X vector 'idx' of the passthru device at bus/slot/func on
+ * behalf of 'vm'.
+ *
+ * On the first call for a device the MSI-X table BAR is allocated, all of
+ * the device's MSI-X vectors are allocated and the per-vector bookkeeping
+ * arrays are created.  Then, if the mask bit in 'vector_control' is clear,
+ * the vector's IRQ is (re)allocated and pptintr is installed to inject 'msg'
+ * into the vcpu taken from bits 19:12 of 'addr'; if the mask bit is set the
+ * vector's interrupt is torn down.
+ *
+ * Returns 0 on success or an errno value:
+ *	ENOENT	no such passthru device
+ *	EBUSY	the device is not owned by 'vm'
+ *	ENXIO	no PCI ivars, or the IRQ could not be allocated/set up
+ *	ENOSPC	the table BAR or the full set of vectors is unavailable
+ *	EINVAL	'idx' is outside the device's vector range
+ * or whatever pci_alloc_msix() returned.
+ *
+ * The 'vcpu' argument is currently unused.
+ */
+int
+ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+	       int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+	struct pptdev *ppt;
+	struct pci_devinfo *dinfo;
+	int numvec, vector_count, rid, error;
+
+	ppt = ppt_find(bus, slot, func);
+	if (ppt == NULL)
+		return (ENOENT);
+	if (ppt->vm != vm)		/* Make sure we own this device */
+		return (EBUSY);
+
+	dinfo = device_get_ivars(ppt->dev);
+	if (!dinfo)
+		return (ENXIO);
+
+	/*
+	 * First-time configuration:
+	 *	Allocate the MSI-X table
+	 *	Allocate the IRQ resources
+	 *	Set up some variables in ppt->msix
+	 */
+	if (!ppt->msix.msix_table_res) {
+		ppt->msix.res = NULL;
+		ppt->msix.cookie = NULL;
+		ppt->msix.arg = NULL;
+
+		rid = dinfo->cfg.msix.msix_table_bar;
+		ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
+		    SYS_RES_MEMORY, &rid, RF_ACTIVE);
+		if (ppt->msix.msix_table_res == NULL)
+			return (ENOSPC);
+
+		ppt->msix.msix_table_rid = rid;
+
+		vector_count = numvec = pci_msix_count(ppt->dev);
+
+		error = pci_alloc_msix(ppt->dev, &numvec);
+		if (error == 0 && vector_count != numvec) {
+			/* All-or-nothing: a partial allocation is no use. */
+			pci_release_msi(ppt->dev);
+			error = ENOSPC;
+		}
+		if (error != 0) {
+			/*
+			 * Give the table BAR back and clear the pointer so
+			 * that a later call retries the first-time setup
+			 * instead of running with unallocated arrays.
+			 */
+			bus_release_resource(ppt->dev, SYS_RES_MEMORY,
+			    ppt->msix.msix_table_rid,
+			    ppt->msix.msix_table_res);
+			ppt->msix.msix_table_res = NULL;
+			ppt->msix.msix_table_rid = 0;
+			return (error);
+		}
+
+		ppt->msix.num_msgs = numvec;
+
+		ppt->msix.startrid = 1;
+
+		/* M_WAITOK allocations cannot fail; M_ZERO clears them. */
+		ppt->msix.res = malloc(numvec * sizeof(ppt->msix.res[0]),
+		    M_PPTMSIX, M_WAITOK | M_ZERO);
+		ppt->msix.cookie = malloc(numvec * sizeof(ppt->msix.cookie[0]),
+		    M_PPTMSIX, M_WAITOK | M_ZERO);
+		ppt->msix.arg = malloc(numvec * sizeof(ppt->msix.arg[0]),
+		    M_PPTMSIX, M_WAITOK | M_ZERO);
+	}
+
+	/* 'idx' comes from userland; never index the arrays out of range. */
+	if (idx < 0 || idx >= ppt->msix.num_msgs)
+		return (EINVAL);
+
+	if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+		/* Tear down the IRQ if it's already set up */
+		ppt_teardown_msix_intr(ppt, idx);
+
+		/* Allocate the IRQ resource */
+		ppt->msix.cookie[idx] = NULL;
+		rid = ppt->msix.startrid + idx;
+		ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev,
+		    SYS_RES_IRQ, &rid, RF_ACTIVE);
+		if (ppt->msix.res[idx] == NULL)
+			return (ENXIO);
+
+		ppt->msix.arg[idx].pptdev = ppt;
+		ppt->msix.arg[idx].vec = msg;
+		ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF;
+
+		/* Setup the MSI-X interrupt */
+		error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
+				       INTR_TYPE_NET | INTR_MPSAFE,
+				       pptintr, NULL, &ppt->msix.arg[idx],
+				       &ppt->msix.cookie[idx]);
+
+		if (error != 0) {
+			/*
+			 * No handler was installed, so there is nothing to
+			 * tear down; only the IRQ resource must be returned.
+			 */
+			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid,
+			    ppt->msix.res[idx]);
+			ppt->msix.cookie[idx] = NULL;
+			ppt->msix.res[idx] = NULL;
+			return (ENXIO);
+		}
+	} else {
+		/* Masked, tear it down if it's already been set up */
+		ppt_teardown_msix_intr(ppt, idx);
+	}
+
+	return (0);
+}
+
+

Modified: projects/bhyve/sys/amd64/vmm/io/ppt.h
==============================================================================
--- projects/bhyve/sys/amd64/vmm/io/ppt.h	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/sys/amd64/vmm/io/ppt.h	Sat Apr 28 16:28:00 2012	(r234761)
@@ -36,5 +36,6 @@ int	ppt_map_mmio(struct vm *vm, int bus,
 		     vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
 int	ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
 		      int destcpu, int vector, int numvec);
-
+int	ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+		       int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
 #endif

Modified: projects/bhyve/sys/amd64/vmm/io/vlapic.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/io/vlapic.c	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/sys/amd64/vmm/io/vlapic.c	Sat Apr 28 16:28:00 2012	(r234761)
@@ -778,6 +778,7 @@ vlapic_init(struct vm *vm, int vcpuid)
 void
 vlapic_cleanup(struct vlapic *vlapic)
 {
+	vlapic_op_halt(vlapic);
 	vdev_unregister(vlapic);
 	free(vlapic, M_VLAPIC);
 }

Modified: projects/bhyve/sys/amd64/vmm/vmm_dev.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/vmm_dev.c	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/sys/amd64/vmm/vmm_dev.c	Sat Apr 28 16:28:00 2012	(r234761)
@@ -158,6 +158,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long c
 	struct vm_pptdev *pptdev;
 	struct vm_pptdev_mmio *pptmmio;
 	struct vm_pptdev_msi *pptmsi;
+	struct vm_pptdev_msix *pptmsix;
 	struct vm_nmi *vmnmi;
 	struct vm_stats *vmstats;
 	struct vm_stat_desc *statdesc;
@@ -240,6 +241,14 @@ vmmdev_ioctl(struct cdev *cdev, u_long c
 				      pptmsi->destcpu, pptmsi->vector,
 				      pptmsi->numvec);
 		break;
+	case VM_PPTDEV_MSIX:
+		pptmsix = (struct vm_pptdev_msix *)data;
+		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
+				       pptmsix->bus, pptmsix->slot, 
+				       pptmsix->func, pptmsix->idx,
+				       pptmsix->msg, pptmsix->vector_control,
+				       pptmsix->addr);
+		break;
 	case VM_MAP_PPTDEV_MMIO:
 		pptmmio = (struct vm_pptdev_mmio *)data;
 		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,

Modified: projects/bhyve/usr.sbin/bhyve/Makefile
==============================================================================
--- projects/bhyve/usr.sbin/bhyve/Makefile	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/usr.sbin/bhyve/Makefile	Sat Apr 28 16:28:00 2012	(r234761)
@@ -4,7 +4,8 @@
 
 PROG=	bhyve
 
-SRCS=	atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c
+SRCS=	atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c 
+SRCS+=  instruction_emul.c mevent.c
 SRCS+=	pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
 SRCS+=	pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c
 

Modified: projects/bhyve/usr.sbin/bhyve/fbsdrun.c
==============================================================================
--- projects/bhyve/usr.sbin/bhyve/fbsdrun.c	Sat Apr 28 14:42:49 2012	(r234760)
+++ projects/bhyve/usr.sbin/bhyve/fbsdrun.c	Sat Apr 28 16:28:00 2012	(r234761)
@@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
 #include "mevent.h"
 #include "pci_emul.h"
 #include "xmsr.h"
+#include "instruction_emul.h"
 
 #define	DEFAULT_GUEST_HZ	100
 #define	DEFAULT_GUEST_TSLICE	200
@@ -108,6 +109,7 @@ struct fbsdstats {
         uint64_t        vmexit_hlt;
         uint64_t        vmexit_pause;
         uint64_t        vmexit_mtrap;
+        uint64_t        vmexit_paging;
         uint64_t        cpu_switch_rotate;
         uint64_t        cpu_switch_direct;
         int             io_reset;
@@ -412,6 +414,20 @@ vmexit_mtrap(struct vmctx *ctx, struct v
 	return (VMEXIT_RESTART);
 }
 
+/*
+ * Handler for VM_EXITCODE_PAGING exits: count the exit and emulate the
+ * instruction at the exit's rip using the guest cr3 recorded with the exit.
+ * Returns VMEXIT_CONTINUE on success, VMEXIT_ABORT if emulation fails.
+ */
+static int
+vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	int error;
+
+	stats.vmexit_paging++;
+
+	error = emulate_instruction(ctx, *pvcpu, vmexit->rip,
+				    vmexit->u.paging.cr3);
+	if (error == 0)
+		return (VMEXIT_CONTINUE);
+
+	printf("Failed to emulate instruction at 0x%lx\n", vmexit->rip);
+	return (VMEXIT_ABORT);
+}
+
+
 static void
 sigalrm(int sig)
 {
@@ -446,12 +462,13 @@ setup_timeslice(void)
 }
 
 static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
-	[VM_EXITCODE_INOUT] = vmexit_inout,
-	[VM_EXITCODE_VMX]   = vmexit_vmx,
-	[VM_EXITCODE_BOGUS] = vmexit_bogus,
-	[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
-	[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
-	[VM_EXITCODE_MTRAP] = vmexit_mtrap,
+	[VM_EXITCODE_INOUT]  = vmexit_inout,
+	[VM_EXITCODE_VMX]    = vmexit_vmx,
+	[VM_EXITCODE_BOGUS]  = vmexit_bogus,
+	[VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
+	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
+	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
+	[VM_EXITCODE_PAGING] = vmexit_paging
 };
 
 static void

Added: projects/bhyve/usr.sbin/bhyve/instruction_emul.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ projects/bhyve/usr.sbin/bhyve/instruction_emul.c	Sat Apr 28 16:28:00 2012	(r234761)
@@ -0,0 +1,555 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <strings.h>
+#include <unistd.h>
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "fbsdrun.h"
+#include "instruction_emul.h"
+
+/*
+ * Instruction prefix bytes tested by is_prefix().  The two branch-hint
+ * values are the same bytes as the CS and DS segment overrides.
+ */
+#define PREFIX_LOCK 		0xF0
+#define PREFIX_REPNE 		0xF2
+#define PREFIX_REPE		0xF3
+#define PREFIX_CS_OVERRIDE	0x2E
+#define PREFIX_SS_OVERRIDE	0x36
+#define PREFIX_DS_OVERRIDE	0x3E
+#define PREFIX_ES_OVERRIDE	0x26
+#define PREFIX_FS_OVERRIDE	0x64
+#define PREFIX_GS_OVERRIDE	0x65
+#define PREFIX_BRANCH_NOT_TAKEN	0x2E
+#define PREFIX_BRANCH_TAKEN	0x3E
+#define PREFIX_OPSIZE		0x66
+#define PREFIX_ADDRSIZE 	0x67
+
+/* Opcode escape bytes. */
+#define OPCODE_2BYTE_ESCAPE	0x0F
+#define OPCODE_3BYTE_ESCAPE	0x38
+
+/* Masks and shifts that split a ModR/M byte into mod, reg and r/m. */
+#define MODRM_MOD_MASK		0xC0
+#define MODRM_MOD_SHIFT		6
+#define MODRM_RM_MASK		0x07
+#define MODRM_RM_SHIFT		0
+#define MODRM_REG_MASK		0x38
+#define MODRM_REG_SHIFT		3
+
+/* Values of the ModR/M mod field (stored as the addressing mode). */
+#define MOD_INDIRECT		0x0
+#define MOD_INDIRECT_DISP8	0x1
+#define MOD_INDIRECT_DISP32	0x2
+#define MOD_DIRECT		0x3
+
+/* Values of the ModR/M r/m field; RM_EBP shares its value with RM_DISP32. */
+#define RM_EAX			0x0
+#define RM_ECX			0x1
+#define RM_EDX			0x2
+#define RM_EBX			0x3
+#define RM_SIB			0x4
+#define RM_DISP32		0x5
+#define RM_EBP			RM_DISP32
+#define RM_ESI			0x6
+#define RM_EDI			0x7
+
+/* Register numbers; these index vm_reg_name_mappings[]. */
+#define REG_EAX			0x0
+#define REG_ECX			0x1
+#define REG_EDX			0x2
+#define REG_EBX			0x3
+#define REG_ESP			0x4
+#define REG_EBP			0x5
+#define REG_ESI			0x6
+#define REG_EDI			0x7
+#define REG_R8			0x8
+#define REG_R9			0x9
+#define REG_R10			0xA
+#define REG_R11			0xB
+#define REG_R12			0xC
+#define REG_R13			0xD
+#define REG_R14			0xE
+#define REG_R15			0xF
+
+/*
+ * Per-opcode flags kept in one_byte_opcodes[]: whether a ModR/M byte
+ * follows, and which ModR/M field is the source and which the destination.
+ */
+#define HAS_MODRM		1
+#define FROM_RM			(1<<1)
+#define FROM_REG		(1<<2)
+#define TO_RM			(1<<3)
+#define TO_REG			(1<<4)
+
+/* REX prefixes are the bytes 0x40-0x4F; the low four bits are W, R, X, B. */
+#define REX_MASK		0xF0
+#define REX_PREFIX		0x40
+#define is_rex_prefix(x) ( ((x) & REX_MASK) == REX_PREFIX )
+#define REX_W_MASK		0x8
+#define REX_R_MASK		0x4
+#define REX_X_MASK		0x2
+#define REX_B_MASK		0x1
+
+/* True for any prefix byte above; note that (x) is evaluated repeatedly. */
+#define is_prefix(x) ((x) == PREFIX_LOCK || (x) == PREFIX_REPNE || \
+		      (x) == PREFIX_REPE || (x) == PREFIX_CS_OVERRIDE || \
+		      (x) == PREFIX_SS_OVERRIDE || (x) == PREFIX_DS_OVERRIDE || \
+		      (x) == PREFIX_ES_OVERRIDE || (x) == PREFIX_FS_OVERRIDE || \
+		      (x) == PREFIX_GS_OVERRIDE || (x) == PREFIX_BRANCH_NOT_TAKEN || \
+		      (x) == PREFIX_BRANCH_TAKEN || (x) == PREFIX_OPSIZE || \
+		      (x) == PREFIX_ADDRSIZE || is_rex_prefix((x)))
+
+/*
+ * Page-table walk constants used by gla2gpa().  PAGE_FRAME_MASK is bit 7 of
+ * a table entry: when set the walk stops because the entry does not point to
+ * another table.  PML4E_OFFSET_MASK/PML4E_SHIFT select bits 47:39 of the
+ * linear address, the index used at the first level of the walk.
+ */
+#define PAGE_FRAME_MASK		0x80
+#define PAGE_OFFSET_MASK	0xFFF
+#define PAGE_TABLE_ENTRY_MASK	(~PAGE_OFFSET_MASK)
+#define PML4E_OFFSET_MASK	0x0000FF8000000000
+#define PML4E_SHIFT		39
+
+/* Capacity of emulated_regions[] and the number of entries searched. */
+#define MAX_EMULATED_REGIONS 8
+int registered_regions = 0;
+/*
+ * One emulated address range.  find_region() matches an address when
+ * start <= addr <= end, i.e. both bounds are inclusive.  memread/memwrite
+ * are the callbacks invoked for accesses that hit the range and arg is
+ * passed back to them unchanged.
+ */
+struct memory_region
+{
+	uintptr_t start;
+	uintptr_t end;
+	emulated_read_func_t memread;
+	emulated_write_func_t memwrite;
+	void *arg;
+} emulated_regions[MAX_EMULATED_REGIONS];
+
+/*
+ * Result of decoding one instruction.  The pointer members point into the
+ * instruction bytes themselves and are left NULL (the structure is zeroed
+ * by decode_instruction()) when the corresponding field is absent.
+ */
+struct decoded_instruction
+{
+	void *instruction;	/* first byte of the instruction */
+	uint8_t *opcode;	/* opcode byte, after any REX prefix */
+	uint8_t *modrm;		/* ModR/M byte, if the opcode has one */
+	uint8_t *sib;		/* SIB byte, if one is expected */
+	uint8_t *displacement;	/* displacement bytes, if any */
+	uint8_t *immediate;	/* immediate bytes (never set by the visible code) */
+
+	uint8_t opcode_flags;	/* one_byte_opcodes[] entry for the opcode */
+
+	uint8_t addressing_mode;	/* ModR/M mod field (MOD_*) */
+	uint8_t rm;		/* ModR/M r/m field, bit 3 set when REX.B is set */
+	uint8_t reg;		/* ModR/M reg field, bit 3 set when REX.R is set */
+	uint8_t rex_r;		/* non-zero when the matching REX bit is set */
+	uint8_t rex_w;
+	uint8_t rex_b;
+	uint8_t rex_x;
+
+	int32_t disp;		/* decoded displacement value */
+};
+
+/*
+ * Translation from a 4-bit register number (REG_*) to the vm_reg_name used
+ * with vm_get_register()/vm_set_register().  Entries are listed in
+ * ascending register-number order.
+ */
+static enum vm_reg_name vm_reg_name_mappings[] = {
+	[REG_EAX] = VM_REG_GUEST_RAX,
+	[REG_ECX] = VM_REG_GUEST_RCX,
+	[REG_EDX] = VM_REG_GUEST_RDX,
+	[REG_EBX] = VM_REG_GUEST_RBX,
+	[REG_ESP] = VM_REG_GUEST_RSP,
+	[REG_EBP] = VM_REG_GUEST_RBP,
+	[REG_ESI] = VM_REG_GUEST_RSI,
+	[REG_EDI] = VM_REG_GUEST_RDI,
+	[REG_R8]  = VM_REG_GUEST_R8,
+	[REG_R9]  = VM_REG_GUEST_R9,
+	[REG_R10] = VM_REG_GUEST_R10,
+	[REG_R11] = VM_REG_GUEST_R11,
+	[REG_R12] = VM_REG_GUEST_R12,
+	[REG_R13] = VM_REG_GUEST_R13,
+	[REG_R14] = VM_REG_GUEST_R14,
+	[REG_R15] = VM_REG_GUEST_R15
+};
+
+/*
+ * Flags for each one-byte opcode, indexed by the opcode value.  A zero entry
+ * means the opcode is not supported (decode_opcode() rejects it).
+ */
+uint8_t one_byte_opcodes[256] = {
+	/* 0x89: source is the reg field, destination is the r/m operand */
+	[0x89]  = HAS_MODRM | FROM_REG | TO_RM,
+	/* 0x8B: source is the r/m operand, destination is the reg field */
+	[0x8B]	= HAS_MODRM | FROM_RM | TO_REG,
+};
+
+/*
+ * Translate the guest linear address 'gla' to a guest physical address by
+ * walking the guest's 4-level page tables rooted at 'guest_cr3'.
+ *
+ * The walk stops at the first entry with PAGE_FRAME_MASK set (a large page)
+ * or at the fourth level; 'shift' is then the number of low address bits
+ * that form the offset within the mapped page.
+ *
+ * No present-bit checking is done: the caller is expected to pass an
+ * address that the guest currently has mapped.
+ */
+static uintptr_t
+gla2gpa(uint64_t gla, uint64_t guest_cr3)
+{
+	/*
+	 * Bits 51:12 of a paging-structure entry hold the physical address;
+	 * the bits above (e.g. execute-disable in bit 63) must not leak into
+	 * the address of the next table or of the final frame.
+	 */
+	const uint64_t entry_addr_mask = 0x000FFFFFFFFFF000UL;
+	uint64_t *table;
+	uint64_t mask, entry, next_gpa;
+	int level, shift;
+
+	table = paddr_guest2host(guest_cr3 & PAGE_TABLE_ENTRY_MASK);
+	mask = PML4E_OFFSET_MASK;
+	shift = PML4E_SHIFT;
+	next_gpa = 0;
+	for (level = 0; level < 4; ++level) {
+		entry = table[(gla & mask) >> shift];
+		next_gpa = entry & entry_addr_mask;
+
+		/* This entry does not point to another page table */
+		if (entry & PAGE_FRAME_MASK || level >= 3)
+			break;
+
+		table = paddr_guest2host((uintptr_t)next_gpa);
+		mask >>= 9;
+		shift -= 9;
+	}
+
+	/* 64-bit shift: 'shift' can be as large as 39. */
+	mask = ((uint64_t)1 << shift) - 1;
+	return ((uintptr_t)((next_gpa & ~mask) | (gla & mask)));
+}
+
+/*
+ * Translate a guest linear address into a host pointer: gla2gpa() followed
+ * by paddr_guest2host().
+ */
+static void *
+gla2hla(uint64_t gla, uint64_t guest_cr3)
+{
+
+	return (paddr_guest2host(gla2gpa(gla, guest_cr3)));
+}
+
+/*
+ * Handle the prefix, if any, in front of the opcode.  A single REX prefix
+ * is accepted and its W/R/X/B bits are recorded; any other prefix byte is
+ * unsupported and yields -1.  On success decoded->opcode points at the
+ * opcode byte and 0 is returned.
+ */
+static int
+decode_prefixes(struct decoded_instruction *decoded)
+{
+	uint8_t *cursor;
+	uint8_t first;
+
+	cursor = decoded->instruction;
+	first = *cursor;
+
+	if (!is_rex_prefix(first)) {
+		/* Not REX: either an unsupported prefix or the opcode itself. */
+		if (is_prefix(first))
+			return (-1);
+		decoded->opcode = cursor;
+		return (0);
+	}
+
+	decoded->rex_w = first & REX_W_MASK;
+	decoded->rex_r = first & REX_R_MASK;
+	decoded->rex_x = first & REX_X_MASK;
+	decoded->rex_b = first & REX_B_MASK;
+
+	decoded->opcode = cursor + 1;
+	return (0);
+}
+
+/*
+ * Look up the opcode byte in one_byte_opcodes[].  Returns -1 when the
+ * opcode has no entry (flags of zero).  Otherwise records the flags and,
+ * for opcodes that carry a ModR/M byte, points decoded->modrm at the byte
+ * following the opcode; returns 0.
+ */
+static int
+decode_opcode(struct decoded_instruction *decoded)
+{
+	uint8_t flags;
+
+	flags = one_byte_opcodes[*decoded->opcode];
+	if (flags == 0)
+		return (-1);
+
+	if ((flags & HAS_MODRM) != 0)
+		decoded->modrm = decoded->opcode + 1;
+
+	decoded->opcode_flags = flags;
+	return (0);
+}
+
+/*
+ * Decodes the instruction's ModR/M field: the addressing mode, the r/m and
+ * reg register numbers (extended by REX.B and REX.R), the location of the
+ * SIB byte if one is present and the location of the displacement if the
+ * addressing mode has one.
+ *
+ * A SIB byte is signalled by the low three r/m bits being RM_SIB in any
+ * memory addressing mode; REX.B does not change that, and register-direct
+ * mode never has a SIB byte.
+ *
+ * Returns 0 on success (including when the opcode has no ModR/M byte) and
+ * -1 for the mod == 00, r/m == 101 encoding (RIP-relative/disp32-only),
+ * which is not supported.
+ */
+static int
+decode_mod_rm(struct decoded_instruction *decoded)
+{
+	uint8_t modrm, rm_field;
+	uint8_t *extension_operands;
+
+	if (!decoded->modrm)
+		return (0);
+
+	modrm = *decoded->modrm;
+
+	decoded->addressing_mode = (modrm & MODRM_MOD_MASK) >> MODRM_MOD_SHIFT;
+	rm_field = (modrm & MODRM_RM_MASK) >> MODRM_RM_SHIFT;
+	decoded->rm = rm_field;
+	decoded->reg = (modrm & MODRM_REG_MASK) >> MODRM_REG_SHIFT;
+
+	if (decoded->rex_b)
+		decoded->rm |= (1<<3);
+
+	if (decoded->rex_r)
+		decoded->reg |= (1<<3);
+
+	extension_operands = decoded->modrm + 1;
+
+	if (decoded->addressing_mode != MOD_DIRECT) {
+		/* Test the raw 3-bit field, not the REX.B-extended number. */
+		if (rm_field == RM_SIB) {
+			decoded->sib = decoded->modrm + 1;
+			extension_operands = decoded->sib + 1;
+		} else if (decoded->addressing_mode == MOD_INDIRECT &&
+		    rm_field == RM_DISP32) {
+			/* No base register: not [rbp]/[r13], so refuse it. */
+			return (-1);
+		}
+	}
+
+	switch (decoded->addressing_mode) {
+	case MOD_INDIRECT:
+	case MOD_DIRECT:
+		decoded->displacement = 0;
+		break;
+	case MOD_INDIRECT_DISP8:
+	case MOD_INDIRECT_DISP32:
+		decoded->displacement = extension_operands;
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * SIB-byte addressing is not supported: return -1 when the instruction was
+ * found to have a SIB byte and 0 when it has none.
+ */
+static int
+decode_sib(struct decoded_instruction *decoded)
+{
+
+	return (decoded->sib != 0 ? -1 : 0);
+}
+
+/*
+ * Grabs and saves the instruction's displacement if one is present.  An
+ * 8-bit displacement is a signed quantity and is sign-extended to 32 bits
+ * (0xFC means -4, not +252); a 32-bit displacement is taken as is.
+ * Immediates are not currently supported, so if an immediate is present
+ * -1 is returned to indicate an error; otherwise 0 is returned.
+ */
+static int
+decode_extension_operands(struct decoded_instruction *decoded)
+{
+
+	if (decoded->displacement) {
+		if (decoded->addressing_mode == MOD_INDIRECT_DISP8) {
+			/* Go through int8_t so the sign bit is propagated. */
+			decoded->disp = (int32_t)(int8_t)*decoded->displacement;
+		} else if (decoded->addressing_mode == MOD_INDIRECT_DISP32) {
+			decoded->disp = *((int32_t*)decoded->displacement);
+		}
+	}
+
+	if (decoded->immediate) {
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Decode the instruction at 'instr' into 'decoded' by running the decode
+ * stages in order: prefixes, opcode, ModR/M, SIB, then displacement and
+ * immediate.  'decoded' is zeroed first.  Returns 0 on success or the
+ * error of the first stage that fails.
+ */
+static int
+decode_instruction(void *instr, struct decoded_instruction *decoded)
+{
+	int error;
+
+	bzero(decoded, sizeof(*decoded));
+	decoded->instruction = instr;
+
+	if ((error = decode_prefixes(decoded)) != 0)
+		return (error);
+	if ((error = decode_opcode(decoded)) != 0)
+		return (error);
+	if ((error = decode_mod_rm(decoded)) != 0)
+		return (error);
+	if ((error = decode_sib(decoded)) != 0)
+		return (error);
+
+	return (decode_extension_operands(decoded));
+}
+
+/*
+ * Return the first registered emulated region whose [start, end] range
+ * (inclusive) contains 'addr', or a null pointer if there is none.
+ */
+static struct memory_region *
+find_region(uintptr_t addr)
+{
+	struct memory_region *region, *limit;
+
+	limit = &emulated_regions[registered_regions];
+	for (region = emulated_regions; region < limit; region++) {
+		if (addr < region->start || addr > region->end)
+			continue;
+		return (region);
+	}
+
+	return (0);
+}
+
+/* Map a register number (REG_*) to its vm_reg_name via the lookup table. */
+static enum vm_reg_name
+get_vm_reg_name(uint8_t reg)
+{
+	enum vm_reg_name name;
+
+	name = vm_reg_name_mappings[reg];
+	return (name);
+}
+
+/*
+ * Fetch the source operand of a decoded instruction into '*operand'.
+ *
+ * The source is the ModR/M r/m operand (FROM_RM) or the reg register
+ * (FROM_REG).  A register source is read directly.  A memory source is
+ * addressed by register + displacement; that linear address is translated
+ * with gla2gpa() and must fall inside a registered emulated region, whose
+ * memread callback is then invoked with an access size of 4.
+ *
+ * Returns 0 on success, the error from vm_get_register() or the callback,
+ * or -1 if the opcode has no source flag, the addressing mode is unknown or
+ * the address is not in an emulated region.
+ */
+static int
+get_operand(struct vmctx *vm, int vcpu, uint64_t guest_cr3,
+	    const struct decoded_instruction *instruction, uint64_t *operand)
+{
+	enum vm_reg_name regname;
+	uint64_t reg, gla;
+	uintptr_t target;
+	int error;
+	uint8_t rm, addressing_mode;
+	struct memory_region *emulated_memory;
+
+	if (instruction->opcode_flags & FROM_RM) {
+		rm = instruction->rm;
+		addressing_mode = instruction->addressing_mode;
+	} else if (instruction->opcode_flags & FROM_REG) {
+		rm = instruction->reg;
+		addressing_mode = MOD_DIRECT;
+	} else
+		return (-1);
+
+	regname = get_vm_reg_name(rm);
+	error = vm_get_register(vm, vcpu, regname, &reg);
+	if (error)
+		return (error);
+
+	switch (addressing_mode) {
+	case MOD_DIRECT:
+		*operand = reg;
+		return (0);
+	case MOD_INDIRECT:
+	case MOD_INDIRECT_DISP8:
+	case MOD_INDIRECT_DISP32:
+		/*
+		 * The displacement is part of the linear address, so it has
+		 * to be applied before translation: base and base + disp may
+		 * lie in different pages.  disp is 0 for MOD_INDIRECT.
+		 */
+		gla = reg + (uint64_t)(int64_t)instruction->disp;
+		target = gla2gpa(gla, guest_cr3);
+		emulated_memory = find_region(target);
+		if (emulated_memory) {
+			return emulated_memory->memread(vm, vcpu, target,
+							4, operand,
+							emulated_memory->arg);
+		}
+		return (-1);
+	default:
+		return (-1);
+	}
+}
+
+/*
+ * Store 'operand' to the destination of a decoded instruction.
+ *
+ * The destination is the ModR/M r/m operand (TO_RM) or the reg register
+ * (TO_REG).  A register destination is written with vm_set_register().  A
+ * memory destination is addressed by register + displacement, exactly as
+ * in get_operand(): the linear address is translated with gla2gpa() and
+ * must fall inside a registered emulated region, whose memwrite callback is
+ * then invoked with an access size of 4.
+ *
+ * Returns 0 on success, the error from vm_get_register(),
+ * vm_set_register() or the callback, or -1 if the opcode has no
+ * destination flag, the addressing mode is unknown or the address is not in
+ * an emulated region.
+ */
+static int
+perform_write(struct vmctx *vm, int vcpu, uint64_t guest_cr3,
+	      const struct decoded_instruction *instruction, uint64_t operand)
+{
+	enum vm_reg_name regname;
+	uintptr_t target;
+	int error;
+	uint64_t reg, gla;
+	struct memory_region *emulated_memory;
+	uint8_t regnum, addressing_mode;
+
+	if (instruction->opcode_flags & TO_RM) {
+		regnum = instruction->rm;
+		addressing_mode = instruction->addressing_mode;
+	} else if (instruction->opcode_flags & TO_REG) {
+		regnum = instruction->reg;
+		addressing_mode = MOD_DIRECT;
+	} else
+		return (-1);
+
+	regname = get_vm_reg_name(regnum);
+	error = vm_get_register(vm, vcpu, regname, &reg);
+	if (error)
+		return (error);
+
+	switch(addressing_mode) {
+	case MOD_DIRECT:
+		return vm_set_register(vm, vcpu, regname, operand);
+	case MOD_INDIRECT:
+	case MOD_INDIRECT_DISP8:
+	case MOD_INDIRECT_DISP32:
+		/*
+		 * Apply the displacement to the linear address before it is
+		 * translated; disp is 0 for MOD_INDIRECT.
+		 */
+		gla = reg + (uint64_t)(int64_t)instruction->disp;
+		target = gla2gpa(gla, guest_cr3);
+		emulated_memory = find_region(target);
+		if (emulated_memory) {
+			return emulated_memory->memwrite(vm, vcpu, target,
+							 4, operand,
+							 emulated_memory->arg);
+		}
+		return (-1);
+	default:
+		return (-1);
+	}
+}
+
+/*
+ * Execute a decoded instruction: fetch its source operand, then store that
+ * value to its destination.  Returns 0 on success or the first error.
+ */
+static int
+emulate_decoded_instruction(struct vmctx *vm, int vcpu, uint64_t cr3,
+			    const struct decoded_instruction *instruction)
+{
+	uint64_t value;
+	int error;
+
+	error = get_operand(vm, vcpu, cr3, instruction, &value);
+	if (error == 0)
+		error = perform_write(vm, vcpu, cr3, instruction, value);
+
+	return (error);
+}
+
+int 
+emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, uint64_t cr3)
+{
+	struct decoded_instruction instr;
+	int error;
+	void *instruction = gla2hla(rip, cr3);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201204281628.q3SGS1m4069240>