Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 28 Nov 2012 00:02:17 +0000 (UTC)
From:      Neel Natu <neel@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r243640 - in projects/bhyve: sys/amd64/include sys/amd64/vmm sys/amd64/vmm/intel usr.sbin/bhyve
Message-ID:  <201211280002.qAS02HnU088183@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: neel
Date: Wed Nov 28 00:02:17 2012
New Revision: 243640
URL: http://svnweb.freebsd.org/changeset/base/243640

Log:
  Revamp the x86 instruction emulation in bhyve.
  
  On a nested page table fault the hypervisor will:
  - fetch the instruction using the guest %rip and %cr3
  - decode the instruction in 'struct vie'
  - emulate the instruction in host kernel context for local apic accesses
  - any other type of mmio access is punted up to user-space (e.g. ioapic)
  
  The decoded instruction is passed as collateral to the user-space process
  that is handling the PAGING exit.
  
  The emulation code is fleshed out to include more addressing modes (e.g. SIB)
  and more types of operands (e.g. imm8). The source code is unified into a
  single file (vmm_instruction_emul.c) that is compiled into vmm.ko as well
  as /usr/sbin/bhyve.
  
  Reviewed by:	grehan
  Obtained from:	NetApp

Added:
  projects/bhyve/sys/amd64/include/vmm_instruction_emul.h   (contents, props changed)
Deleted:
  projects/bhyve/sys/amd64/vmm/vmm_instruction_emul.h
  projects/bhyve/usr.sbin/bhyve/instruction_emul.c
  projects/bhyve/usr.sbin/bhyve/instruction_emul.h
Modified:
  projects/bhyve/sys/amd64/include/vmm.h
  projects/bhyve/sys/amd64/vmm/intel/vmcs.h
  projects/bhyve/sys/amd64/vmm/intel/vmx.c
  projects/bhyve/sys/amd64/vmm/vmm_instruction_emul.c
  projects/bhyve/sys/amd64/vmm/vmm_lapic.c
  projects/bhyve/sys/amd64/vmm/vmm_lapic.h
  projects/bhyve/usr.sbin/bhyve/Makefile
  projects/bhyve/usr.sbin/bhyve/fbsdrun.c
  projects/bhyve/usr.sbin/bhyve/ioapic.c
  projects/bhyve/usr.sbin/bhyve/mem.c
  projects/bhyve/usr.sbin/bhyve/mem.h
  projects/bhyve/usr.sbin/bhyve/pci_passthru.c

Modified: projects/bhyve/sys/amd64/include/vmm.h
==============================================================================
--- projects/bhyve/sys/amd64/include/vmm.h	Tue Nov 27 23:16:56 2012	(r243639)
+++ projects/bhyve/sys/amd64/include/vmm.h	Wed Nov 28 00:02:17 2012	(r243640)
@@ -150,6 +150,8 @@ void vm_interrupt_hostcpu(struct vm *vm,
 
 #endif	/* KERNEL */
 
+#include <machine/vmm_instruction_emul.h>
+
 #define	VM_MAXCPU	8			/* maximum virtual cpus */
 
 /*
@@ -268,6 +270,7 @@ struct vm_exit {
 			uint64_t	cr3;
 			uint64_t	gpa;
 			int		rwx;
+			struct vie	vie;
 		} paging;
 		/*
 		 * VMX specific payload. Used when there is no "better"

Added: projects/bhyve/sys/amd64/include/vmm_instruction_emul.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ projects/bhyve/sys/amd64/include/vmm_instruction_emul.h	Wed Nov 28 00:02:17 2012	(r243640)
@@ -0,0 +1,113 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef	_VMM_INSTRUCTION_EMUL_H_
+#define	_VMM_INSTRUCTION_EMUL_H_
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding. The only reason why their contents
+ * need to be exposed is because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+	uint8_t		op_byte;	/* actual opcode byte */
+	uint8_t		op_type;	/* type of operation (e.g. MOV) */
+	uint16_t	op_flags;
+};
+
+#define	VIE_INST_SIZE	15
+struct vie {
+	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
+	uint8_t		num_valid;		/* size of the instruction */
+	uint8_t		num_processed;
+
+	uint8_t		rex_w:1,		/* REX prefix */
+			rex_r:1,
+			rex_x:1,
+			rex_b:1;
+
+	uint8_t		mod:2,			/* ModRM byte */
+			reg:4,
+			rm:4;
+
+	uint8_t		ss:2,			/* SIB byte */
+			index:4,
+			base:4;
+
+	uint8_t		disp_bytes;
+	uint8_t		imm_bytes;
+
+	uint8_t		scale;
+	int		base_register;		/* VM_REG_GUEST_xyz */
+	int		index_register;		/* VM_REG_GUEST_xyz */
+
+	int64_t		displacement;		/* optional addr displacement */
+	int64_t		immediate;		/* optional immediate operand */
+
+	uint8_t		decoded;	/* set to 1 if successfully decoded */
+
+	struct vie_op	op;			/* opcode description */
+};
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+				 uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+				  uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * 'void *vm' should be 'struct vm *' when called from kernel context and
+ * 'struct vmctx *' when called from user context.
+ *
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+			    mem_region_read_t mrr, mem_region_write_t mrw,
+			    void *mrarg);
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+			  uint64_t rip, int inst_length, uint64_t cr3,
+			  struct vie *vie);
+
+int vmm_decode_instruction(struct vm *vm, int cpuid,
+			   uint64_t gla, struct vie *vie);
+#endif	/* _KERNEL */
+
+#endif	/* _VMM_INSTRUCTION_EMUL_H_ */

Modified: projects/bhyve/sys/amd64/vmm/intel/vmcs.h
==============================================================================
--- projects/bhyve/sys/amd64/vmm/intel/vmcs.h	Tue Nov 27 23:16:56 2012	(r243639)
+++ projects/bhyve/sys/amd64/vmm/intel/vmcs.h	Wed Nov 28 00:02:17 2012	(r243640)
@@ -67,6 +67,7 @@ uint64_t vmcs_read(uint32_t encoding);
 #define	vmcs_exit_qualification()	vmcs_read(VMCS_EXIT_QUALIFICATION)
 #define	vmcs_guest_cr3()		vmcs_read(VMCS_GUEST_CR3)
 #define	vmcs_gpa()			vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
+#define	vmcs_gla()			vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
 
 #endif	/* _KERNEL */
 

Modified: projects/bhyve/sys/amd64/vmm/intel/vmx.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/intel/vmx.c	Tue Nov 27 23:16:56 2012	(r243639)
+++ projects/bhyve/sys/amd64/vmm/intel/vmx.c	Wed Nov 28 00:02:17 2012	(r243640)
@@ -63,7 +63,6 @@ __FBSDID("$FreeBSD$");
 #include "vmx.h"
 #include "x86.h"
 #include "vmx_controls.h"
-#include "vmm_instruction_emul.h"
 
 #define	PINBASED_CTLS_ONE_SETTING					\
 	(PINBASED_EXTINT_EXITING	|				\
@@ -1150,23 +1149,11 @@ vmx_emulate_cr_access(struct vmx *vmx, i
 }
 
 static int
-vmx_lapic_fault(struct vm *vm, int cpu,
-		uint64_t gpa, uint64_t rip, int inst_length,
-		uint64_t cr3, uint64_t ept_qual)
+vmx_ept_fault(struct vm *vm, int cpu,
+	      uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
+	      uint64_t cr3, uint64_t ept_qual, struct vie *vie)
 {
-	int read, write, handled;
-	struct vie vie;
-
-	/*
-	 * For this to be a legitimate access to the local apic:
-	 * - the GPA in the local apic page
-	 * - the GPA must be aligned on a 16 byte boundary
-	 */
-	if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
-		return (UNHANDLED);
-
-	if ((gpa & 0xF) != 0)
-		return (UNHANDLED);
+	int read, write, error;
 
 	/* EPT violation on an instruction fetch doesn't make sense here */
 	if (ept_qual & EPT_VIOLATION_INST_FETCH)
@@ -1188,15 +1175,22 @@ vmx_lapic_fault(struct vm *vm, int cpu,
 	}
 
 	/* Fetch, decode and emulate the faulting instruction */
-	if (vmm_fetch_instruction(vm, rip, inst_length, cr3, &vie) != 0)
+	if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
 		return (UNHANDLED);
 
-	if (vmm_decode_instruction(&vie) != 0)
+	if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
 		return (UNHANDLED);
 
-	handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, &vie);
+	/*
+	 * Check if this is a local apic access
+	 */
+	if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
+		return (UNHANDLED);
 
-	return (handled);
+	error = vmm_emulate_instruction(vm, cpu, gpa, vie,
+					lapic_mmio_read, lapic_mmio_write, 0);
+
+	return (error ? UNHANDLED : HANDLED);
 }
 
 static int
@@ -1206,7 +1200,7 @@ vmx_exit_process(struct vmx *vmx, int vc
 	struct vmcs *vmcs;
 	struct vmxctx *vmxctx;
 	uint32_t eax, ecx, edx;
-	uint64_t qual, gpa, cr3, intr_info;
+	uint64_t qual, gla, gpa, cr3, intr_info;
 
 	handled = 0;
 	vmcs = &vmx->vmcs[vcpu];
@@ -1299,11 +1293,12 @@ vmx_exit_process(struct vmx *vmx, int vc
 		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
 		break;
 	case EXIT_REASON_EPT_FAULT:
+		gla = vmcs_gla();
 		gpa = vmcs_gpa();
 		cr3 = vmcs_guest_cr3();
-		handled = vmx_lapic_fault(vmx->vm, vcpu,
-					  gpa, vmexit->rip, vmexit->inst_length,
-					  cr3, qual);
+		handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
+					vmexit->rip, vmexit->inst_length,
+					cr3, qual, &vmexit->u.paging.vie);
 		if (!handled) {
 			vmexit->exitcode = VM_EXITCODE_PAGING;
 			vmexit->u.paging.cr3 = cr3;

Modified: projects/bhyve/sys/amd64/vmm/vmm_instruction_emul.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/vmm_instruction_emul.c	Tue Nov 27 23:16:56 2012	(r243639)
+++ projects/bhyve/sys/amd64/vmm/vmm_instruction_emul.c	Wed Nov 28 00:02:17 2012	(r243640)
@@ -30,6 +30,7 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#ifdef _KERNEL
 #include <sys/param.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
@@ -40,10 +41,60 @@ __FBSDID("$FreeBSD$");
 #include <machine/pmap.h>
 #include <machine/vmparam.h>
 #include <machine/vmm.h>
+#else	/* !_KERNEL */
+#include <sys/types.h>
+#include <sys/errno.h>
 
-#include "vmm_instruction_emul.h"
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+#endif	/* _KERNEL */
+
+
+
+/* struct vie_op.op_type */
+enum {
+	VIE_OP_TYPE_NONE = 0,
+	VIE_OP_TYPE_MOV,
+	VIE_OP_TYPE_AND,
+	VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define	VIE_OP_F_IMM		(1 << 0)	/* immediate operand present */
+#define	VIE_OP_F_IMM8		(1 << 1)	/* 8-bit immediate operand */
+
+static const struct vie_op one_byte_opcodes[256] = {
+	[0x89] = {
+		.op_byte = 0x89,
+		.op_type = VIE_OP_TYPE_MOV,
+	},
+	[0x8B] = {
+		.op_byte = 0x8B,
+		.op_type = VIE_OP_TYPE_MOV,
+	},
+	[0xC7] = {
+		.op_byte = 0xC7,
+		.op_type = VIE_OP_TYPE_MOV,
+		.op_flags = VIE_OP_F_IMM,
+	},
+	[0x23] = {
+		.op_byte = 0x23,
+		.op_type = VIE_OP_TYPE_AND,
+	}
+};
+
+/* struct vie.mod */
+#define	VIE_MOD_INDIRECT		0
+#define	VIE_MOD_INDIRECT_DISP8		1
+#define	VIE_MOD_INDIRECT_DISP32		2
+#define	VIE_MOD_DIRECT			3
+
+/* struct vie.rm */
+#define	VIE_RM_SIB			4
+#define	VIE_RM_DISP32			5
 
-#define	GB	(1024 * 1024 * 1024)
+#define	GB				(1024 * 1024 * 1024)
 
 static enum vm_reg_name gpr_map[16] = {
 	VM_REG_GUEST_RAX,
@@ -64,17 +115,232 @@ static enum vm_reg_name gpr_map[16] = {
 	VM_REG_GUEST_R15
 };
 
+static uint64_t size2mask[] = {
+	[1] = 0xff,
+	[2] = 0xffff,
+	[4] = 0xffffffff,
+	[8] = 0xffffffffffffffff,
+};
+
+static int
+vie_valid_register(enum vm_reg_name reg)
+{
+#ifdef _KERNEL
+	/*
+	 * XXX
+	 * The operand register in which we store the result of the
+	 * read must be a GPR that we can modify even if the vcpu
+	 * is "running". All the GPRs qualify except for %rsp.
+	 *
+	 * This is a limitation of the vm_set_register() API
+	 * and can be fixed if necessary.
+	 */
+	if (reg == VM_REG_GUEST_RSP)
+		return (0);
+#endif
+	return (1);
+}
+
+static int
+vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
+{
+	int error;
+
+	if (!vie_valid_register(reg))
+		return (EINVAL);
+
+	error = vm_get_register(vm, vcpuid, reg, rval);
+
+	return (error);
+}
+
+static int
+vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+		    uint64_t val, int size)
+{
+	int error;
+	uint64_t origval;
+
+	if (!vie_valid_register(reg))
+		return (EINVAL);
+
+	switch (size) {
+	case 1:
+	case 2:
+		error = vie_read_register(vm, vcpuid, reg, &origval);
+		if (error)
+			return (error);
+		val &= size2mask[size];
+		val |= origval & ~size2mask[size];
+		break;
+	case 4:
+		val &= 0xffffffffUL;
+		break;
+	case 8:
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	error = vm_set_register(vm, vcpuid, reg, val);
+	return (error);
+}
+
+/*
+ * The following simplifying assumptions are made during emulation:
+ *
+ * - guest is in 64-bit mode
+ *   - default address size is 64-bits
+ *   - default operand size is 32-bits
+ *
+ * - operand size override is not supported
+ *
+ * - address size override is not supported
+ */
+static int
+emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+	int error, size;
+	enum vm_reg_name reg;
+	uint64_t val;
+
+	size = 4;
+	error = EINVAL;
+
+	switch (vie->op.op_byte) {
+	case 0x89:
+		/*
+		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+		 * 89/r:	mov r/m32, r32
+		 * REX.W + 89/r	mov r/m64, r64
+		 */
+		if (vie->rex_w)
+			size = 8;
+		reg = gpr_map[vie->reg];
+		error = vie_read_register(vm, vcpuid, reg, &val);
+		if (error == 0) {
+			val &= size2mask[size];
+			error = memwrite(vm, vcpuid, gpa, val, size, arg);
+		}
+		break;
+	case 0x8B:
+		/*
+		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
+		 * 8B/r:	mov r32, r/m32
+		 * REX.W 8B/r:	mov r64, r/m64
+		 */
+		if (vie->rex_w)
+			size = 8;
+		error = memread(vm, vcpuid, gpa, &val, size, arg);
+		if (error == 0) {
+			reg = gpr_map[vie->reg];
+			error = vie_update_register(vm, vcpuid, reg, val, size);
+		}
+		break;
+	case 0xC7:
+		/*
+		 * MOV from imm32 to mem (ModRM:r/m)
+		 * C7/0		mov r/m32, imm32
+		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
+		 */
+		val = vie->immediate;		/* already sign-extended */
+
+		if (vie->rex_w)
+			size = 8;
+
+		if (size != 8)
+			val &= size2mask[size];
+
+		error = memwrite(vm, vcpuid, gpa, val, size, arg);
+		break;
+	default:
+		break;
+	}
+
+	return (error);
+}
+
+static int
+emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+	int error, size;
+	enum vm_reg_name reg;
+	uint64_t val1, val2;
+
+	size = 4;
+	error = EINVAL;
+
+	switch (vie->op.op_byte) {
+	case 0x23:
+		/*
+		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
+		 * result in reg.
+		 *
+		 * 23/r		and r32, r/m32
+		 * REX.W + 23/r	and r64, r/m64
+		 */
+		if (vie->rex_w)
+			size = 8;
+
+		/* get the first operand */
+		reg = gpr_map[vie->reg];
+		error = vie_read_register(vm, vcpuid, reg, &val1);
+		if (error)
+			break;
+
+		/* get the second operand */
+		error = memread(vm, vcpuid, gpa, &val2, size, arg);
+		if (error)
+			break;
+
+		/* perform the operation and write the result */
+		val1 &= val2;
+		error = vie_update_register(vm, vcpuid, reg, val1, size);
+		break;
+	default:
+		break;
+	}
+	return (error);
+}
+
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+			mem_region_read_t memread, mem_region_write_t memwrite,
+			void *memarg)
+{
+	int error;
+
+	if (!vie->decoded)
+		return (EINVAL);
+
+	switch (vie->op.op_type) {
+	case VIE_OP_TYPE_MOV:
+		error = emulate_mov(vm, vcpuid, gpa, vie,
+				    memread, memwrite, memarg);
+		break;
+	case VIE_OP_TYPE_AND:
+		error = emulate_and(vm, vcpuid, gpa, vie,
+				    memread, memwrite, memarg);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return (error);
+}
+
+#ifdef _KERNEL
 static void
 vie_init(struct vie *vie)
 {
 
 	bzero(vie, sizeof(struct vie));
 
-	vie->op_size = VIE_OP_SIZE_32BIT;
-
 	vie->base_register = VM_REG_LAST;
 	vie->index_register = VM_REG_LAST;
-	vie->operand_register = VM_REG_LAST;
 }
 
 static int
@@ -129,7 +395,7 @@ error:
 }
 
 int
-vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length,
+vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
 		      uint64_t cr3, struct vie *vie)
 {
 	int n, err;
@@ -172,6 +438,7 @@ vmm_fetch_instruction(struct vm *vm, uin
 static int
 vie_peek(struct vie *vie, uint8_t *x)
 {
+
 	if (vie->num_processed < vie->num_valid) {
 		*x = vie->inst[vie->num_processed];
 		return (0);
@@ -182,8 +449,6 @@ vie_peek(struct vie *vie, uint8_t *x)
 static void
 vie_advance(struct vie *vie)
 {
-	if (vie->num_processed >= vie->num_valid)
-		panic("vie_advance: %d/%d", vie->num_processed, vie->num_valid);
 
 	vie->num_processed++;
 }
@@ -213,24 +478,16 @@ decode_opcode(struct vie *vie)
 {
 	uint8_t x;
 
-	static const uint8_t flags[256] = {
-		[0x89] = VIE_F_HAS_MODRM | VIE_F_FROM_REG | VIE_F_TO_RM,
-		[0x8B] = VIE_F_HAS_MODRM | VIE_F_FROM_RM | VIE_F_TO_REG,
-		[0xC7] = VIE_F_HAS_MODRM | VIE_F_FROM_IMM | VIE_F_TO_RM,
-	};
-
 	if (vie_peek(vie, &x))
 		return (-1);
 
-	vie->opcode_byte = x;
-	vie->opcode_flags = flags[x];
+	vie->op = one_byte_opcodes[x];
 
-	vie_advance(vie);
-
-	if (vie->opcode_flags == 0)
+	if (vie->op.op_type == VIE_OP_TYPE_NONE)
 		return (-1);
-	else
-		return (0);
+
+	vie_advance(vie);
+	return (0);
 }
 
 /*
@@ -241,9 +498,6 @@ decode_modrm(struct vie *vie)
 {
 	uint8_t x;
 
-	if ((vie->opcode_flags & VIE_F_HAS_MODRM) == 0)
-		return (0);
-
 	if (vie_peek(vie, &x))
 		return (-1);
 
@@ -251,35 +505,40 @@ decode_modrm(struct vie *vie)
 	vie->rm =  (x >> 0) & 0x7;
 	vie->reg = (x >> 3) & 0x7;
 
+	/*
+	 * A direct addressing mode makes no sense in the context of an EPT
+	 * fault. There has to be a memory access involved to cause the
+	 * EPT fault.
+	 */
+	if (vie->mod == VIE_MOD_DIRECT)
+		return (-1);
+
 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
-			/*
-			 * Table 2-5: Special Cases of REX Encodings
-			 *
-			 * mod=0, r/m=5 is used in the compatibility mode to
-			 * indicate a disp32 without a base register.
-			 *
-			 * mod!=3, r/m=4 is used in the compatibility mode to
-			 * indicate that the SIB byte is present.
-			 *
-			 * The 'b' bit in the REX prefix is don't care in
-			 * this case.
-			 */
+		/*
+		 * Table 2-5: Special Cases of REX Encodings
+		 *
+		 * mod=0, r/m=5 is used in the compatibility mode to
+		 * indicate a disp32 without a base register.
+		 *
+		 * mod!=3, r/m=4 is used in the compatibility mode to
+		 * indicate that the SIB byte is present.
+		 *
+		 * The 'b' bit in the REX prefix is don't care in
+		 * this case.
+		 */
 	} else {
 		vie->rm |= (vie->rex_b << 3);
 	}
 
 	vie->reg |= (vie->rex_r << 3);
 
-	/* SIB addressing not supported yet */
+	/* SIB */
 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
-		return (-1);
+		goto done;
 
 	vie->base_register = gpr_map[vie->rm];
 
-	if (vie->opcode_flags & (VIE_F_FROM_REG | VIE_F_TO_REG))
-		vie->operand_register = gpr_map[vie->reg];
-
 	switch (vie->mod) {
 	case VIE_MOD_INDIRECT_DISP8:
 		vie->disp_bytes = 1;
@@ -295,12 +554,76 @@ decode_modrm(struct vie *vie)
 		break;
 	}
 
-	/* calculate the operand size */
-	if (vie->rex_w)
-		vie->op_size = VIE_OP_SIZE_64BIT;
-
-	if (vie->opcode_flags & VIE_F_FROM_IMM)
+	/* Figure out immediate operand size (if any) */
+	if (vie->op.op_flags & VIE_OP_F_IMM)
 		vie->imm_bytes = 4;
+	else if (vie->op.op_flags & VIE_OP_F_IMM8)
+		vie->imm_bytes = 1;
+
+done:
+	vie_advance(vie);
+
+	return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+	uint8_t x;
+
+	/* Proceed only if SIB byte is present */
+	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+		return (0);
+
+	if (vie_peek(vie, &x))
+		return (-1);
+
+	/* De-construct the SIB byte */
+	vie->ss = (x >> 6) & 0x3;
+	vie->index = (x >> 3) & 0x7;
+	vie->base = (x >> 0) & 0x7;
+
+	/* Apply the REX prefix modifiers */
+	vie->index |= vie->rex_x << 3;
+	vie->base |= vie->rex_b << 3;
+
+	switch (vie->mod) {
+	case VIE_MOD_INDIRECT_DISP8:
+		vie->disp_bytes = 1;
+		break;
+	case VIE_MOD_INDIRECT_DISP32:
+		vie->disp_bytes = 4;
+		break;
+	}
+
+	if (vie->mod == VIE_MOD_INDIRECT &&
+	    (vie->base == 5 || vie->base == 13)) {
+		/*
+		 * Special case when base register is unused if mod = 0
+		 * and base = %rbp or %r13.
+		 *
+		 * Documented in:
+		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+		 * Table 2-5: Special Cases of REX Encodings
+		 */
+		vie->disp_bytes = 4;
+	} else {
+		vie->base_register = gpr_map[vie->base];
+	}
+
+	/*
+	 * All encodings of 'index' are valid except for %rsp (4).
+	 *
+	 * Documented in:
+	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+	 * Table 2-5: Special Cases of REX Encodings
+	 */
+	if (vie->index != 4)
+		vie->index_register = gpr_map[vie->index];
+
+	/* 'scale' makes sense only in the context of an index register */
+	if (vie->index_register < VM_REG_LAST)
+		vie->scale = 1 << vie->ss;
 
 	vie_advance(vie);
 
@@ -348,13 +671,14 @@ decode_immediate(struct vie *vie)
 	uint8_t x;
 	union {
 		char	buf[4];
+		int8_t	signed8;
 		int32_t	signed32;
 	} u;
 
 	if ((n = vie->imm_bytes) == 0)
 		return (0);
 
-	if (n != 4)
+	if (n != 1 && n != 4)
 		panic("decode_immediate: invalid imm_bytes %d", n);
 
 	for (i = 0; i < n; i++) {
@@ -365,14 +689,62 @@ decode_immediate(struct vie *vie)
 		vie_advance(vie);
 	}
 	
-	vie->immediate = u.signed32;		/* sign-extended */
+	if (n == 1)
+		vie->immediate = u.signed8;		/* sign-extended */
+	else
+		vie->immediate = u.signed32;		/* sign-extended */
+
+	return (0);
+}
+
+#define	VERIFY_GLA
+/*
+ * Verify that the 'guest linear address' provided as collateral of the nested
+ * page table fault matches with our instruction decoding.
+ */
+#ifdef VERIFY_GLA
+static int
+verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+	int error;
+	uint64_t base, idx;
+
+	base = 0;
+	if (vie->base_register != VM_REG_LAST) {
+		error = vm_get_register(vm, cpuid, vie->base_register, &base);
+		if (error) {
+			printf("verify_gla: error %d getting base reg %d\n",
+				error, vie->base_register);
+			return (-1);
+		}
+	}
+
+	idx = 0;
+	if (vie->index_register != VM_REG_LAST) {
+		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
+		if (error) {
+			printf("verify_gla: error %d getting index reg %d\n",
+				error, vie->index_register);
+			return (-1);
+		}
+	}
+
+	if (base + vie->scale * idx + vie->displacement != gla) {
+		printf("verify_gla mismatch: "
+		       "base(0x%0lx), scale(%d), index(0x%0lx), "
+		       "disp(0x%0lx), gla(0x%0lx)\n",
+		       base, vie->scale, idx, vie->displacement, gla);
+		return (-1);
+	}
 
 	return (0);
 }
+#endif	/* VERIFY_GLA */
 
 int
-vmm_decode_instruction(struct vie *vie)
+vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
 {
+
 	if (decode_rex(vie))
 		return (-1);
 
@@ -382,11 +754,22 @@ vmm_decode_instruction(struct vie *vie)
 	if (decode_modrm(vie))
 		return (-1);
 
+	if (decode_sib(vie))
+		return (-1);
+
 	if (decode_displacement(vie))
 		return (-1);
 	
 	if (decode_immediate(vie))
 		return (-1);
 
+#ifdef VERIFY_GLA
+	if (verify_gla(vm, cpuid, gla, vie))
+		return (-1);
+#endif
+
+	vie->decoded = 1;	/* success */
+
 	return (0);
 }
+#endif	/* _KERNEL */

Modified: projects/bhyve/sys/amd64/vmm/vmm_lapic.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/vmm_lapic.c	Tue Nov 27 23:16:56 2012	(r243639)
+++ projects/bhyve/sys/amd64/vmm/vmm_lapic.c	Wed Nov 28 00:02:17 2012	(r243640)
@@ -34,12 +34,12 @@ __FBSDID("$FreeBSD$");
 #include <sys/smp.h>
 
 #include <x86/specialreg.h>
+#include <x86/apicreg.h>
 
 #include <machine/vmm.h>
 #include "vmm_ipi.h"
 #include "vmm_lapic.h"
 #include "vlapic.h"
-#include "vmm_instruction_emul.h"
 
 static int
 lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val)
@@ -177,64 +177,45 @@ lapic_wrmsr(struct vm *vm, int cpu, u_in
 }
 
 int
-lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, struct vie *vie)
+lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
+		 void *arg)
 {
-	int handled, error;
-	uint64_t val;
+	int error;
+	uint64_t off;
 	struct vlapic *vlapic;
 
-	const int UNHANDLED = 0;
+	off = gpa - DEFAULT_APIC_BASE;
+
+	/*
+	 * Memory mapped local apic accesses must be 4 bytes wide and
+	 * aligned on a 16-byte boundary.
+	 */
+	if (size != 4 || off & 0xf)
+		return (EINVAL);
 
 	vlapic = vm_lapic(vm, cpu);
+	error = vlapic_op_mem_write(vlapic, off, DWORD, wval);
+	return (error);
+}
+
+int
+lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
+		void *arg)
+{
+	int error;
+	uint64_t off;
+	struct vlapic *vlapic;
 
-	/* Only 32-bit accesses to local apic */
-	if (vie->op_size != VIE_OP_SIZE_32BIT)
-		return (UNHANDLED);
+	off = gpa - DEFAULT_APIC_BASE;
 
 	/*
-	 * XXX
-	 * The operand register in which we store the result of the
-	 * read must be a GPR that we can modify even if the vcpu
-	 * is "running". All the GPRs qualify except for %rsp.
-	 *
-	 * This is a limitation of the vm_set_register() API
-	 * and can be fixed if necessary.
+	 * Memory mapped local apic accesses must be 4 bytes wide and
+	 * aligned on a 16-byte boundary.
 	 */
-	if (vie->operand_register == VM_REG_GUEST_RSP)
-		return (UNHANDLED);
-
-	if (read) {
-		if ((vie->opcode_flags & VIE_F_TO_REG) == 0)
-			return (UNHANDLED);
-
-		if (vie->operand_register >= VM_REG_LAST)
-			return (UNHANDLED);
-
-		handled = lapic_read(vlapic, offset, &val);
-		if (handled) {
-			error = vm_set_register(vm, cpu, vie->operand_register,
-						val);
-			if (error)
-				panic("lapic_mmio: error %d setting gpr %d",
-				      error, vie->operand_register);
-		}
-	} else {
-		if ((vie->opcode_flags & VIE_F_FROM_REG) &&
-		    (vie->operand_register < VM_REG_LAST)) {
-			error = vm_get_register(vm, cpu, vie->operand_register,
-						&val);
-			if (error) {
-				panic("lapic_mmio: error %d getting gpr %d",
-				      error, vie->operand_register);
-			}
-		} else if (vie->opcode_flags & VIE_F_FROM_IMM) {
-			val = vie->immediate;
-		} else {
-			return (UNHANDLED);
-		}
-
-		handled = lapic_write(vlapic, offset, val);
-	}
+	if (size != 4 || off & 0xf)
+		return (EINVAL);
 
-	return (handled);
+	vlapic = vm_lapic(vm, cpu);
+	error = vlapic_op_mem_read(vlapic, off, DWORD, rval);
+	return (error);
 }

Modified: projects/bhyve/sys/amd64/vmm/vmm_lapic.h
==============================================================================
--- projects/bhyve/sys/amd64/vmm/vmm_lapic.h	Tue Nov 27 23:16:56 2012	(r243639)
+++ projects/bhyve/sys/amd64/vmm/vmm_lapic.h	Wed Nov 28 00:02:17 2012	(r243640)
@@ -30,13 +30,15 @@
 #define	_VMM_LAPIC_H_
 
 struct vm;
-struct vie;
 
 boolean_t lapic_msr(u_int num);
 int	lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval);
 int	lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval);
 
-int	lapic_mmio(struct vm *vm, int cpu, u_int offset, int rd, struct vie *);
+int	lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
+			uint64_t *rval, int size, void *arg);
+int	lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
+			 uint64_t wval, int size, void *arg);
 
 int	lapic_timer_tick(struct vm *vm, int cpu);
 

Modified: projects/bhyve/usr.sbin/bhyve/Makefile
==============================================================================
--- projects/bhyve/usr.sbin/bhyve/Makefile	Tue Nov 27 23:16:56 2012	(r243639)
+++ projects/bhyve/usr.sbin/bhyve/Makefile	Wed Nov 28 00:02:17 2012	(r243640)
@@ -7,11 +7,14 @@ PROG=	bhyve
 DEBUG_FLAGS= -g -O0 
 
 SRCS=	acpi.c atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c 
-SRCS+=  instruction_emul.c ioapic.c mem.c mevent.c mptbl.c
+SRCS+=  ioapic.c mem.c mevent.c mptbl.c
 SRCS+=	pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
 SRCS+=	pci_virtio_net.c pci_uart.c pit_8254.c pmtmr.c post.c rtc.c uart.c
 SRCS+=	xmsr.c spinup_ap.c
 
+.PATH:	${.CURDIR}/../../sys/amd64/vmm
+SRCS+=	vmm_instruction_emul.c

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201211280002.qAS02HnU088183>