From owner-svn-src-projects@FreeBSD.ORG Sat Apr 28 16:28:01 2012 Return-Path: Delivered-To: svn-src-projects@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [69.147.83.52]) by hub.freebsd.org (Postfix) with ESMTP id 69142106564A; Sat, 28 Apr 2012 16:28:01 +0000 (UTC) (envelope-from grehan@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 51BF58FC12; Sat, 28 Apr 2012 16:28:01 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.4/8.14.4) with ESMTP id q3SGS1dI069255; Sat, 28 Apr 2012 16:28:01 GMT (envelope-from grehan@svn.freebsd.org) Received: (from grehan@localhost) by svn.freebsd.org (8.14.4/8.14.4/Submit) id q3SGS1m4069240; Sat, 28 Apr 2012 16:28:01 GMT (envelope-from grehan@svn.freebsd.org) Message-Id: <201204281628.q3SGS1m4069240@svn.freebsd.org> From: Peter Grehan Date: Sat, 28 Apr 2012 16:28:01 +0000 (UTC) To: src-committers@freebsd.org, svn-src-projects@freebsd.org X-SVN-Group: projects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r234761 - in projects/bhyve: lib/libvmmapi sys/amd64/include sys/amd64/vmm sys/amd64/vmm/intel sys/amd64/vmm/io usr.sbin/bhyve X-BeenThere: svn-src-projects@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the src " projects" tree" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 28 Apr 2012 16:28:01 -0000 Author: grehan Date: Sat Apr 28 16:28:00 2012 New Revision: 234761 URL: http://svn.freebsd.org/changeset/base/234761 Log: MSI-x interrupt support for PCI pass-thru devices. Includes instruction emulation for memory r/w access. This opens the door for io-apic, local apic, hpet timer, and legacy device emulation. 
Submitted by: ryan dot berryhill at sandvine dot com Reviewed by: grehan Obtained from: Sandvine Added: projects/bhyve/usr.sbin/bhyve/instruction_emul.c (contents, props changed) projects/bhyve/usr.sbin/bhyve/instruction_emul.h (contents, props changed) Modified: projects/bhyve/lib/libvmmapi/vmmapi.c projects/bhyve/lib/libvmmapi/vmmapi.h projects/bhyve/sys/amd64/include/vmm.h projects/bhyve/sys/amd64/include/vmm_dev.h projects/bhyve/sys/amd64/vmm/intel/vmcs.h projects/bhyve/sys/amd64/vmm/intel/vmx.c projects/bhyve/sys/amd64/vmm/io/ppt.c projects/bhyve/sys/amd64/vmm/io/ppt.h projects/bhyve/sys/amd64/vmm/io/vlapic.c projects/bhyve/sys/amd64/vmm/vmm_dev.c projects/bhyve/usr.sbin/bhyve/Makefile projects/bhyve/usr.sbin/bhyve/fbsdrun.c projects/bhyve/usr.sbin/bhyve/pci_emul.c projects/bhyve/usr.sbin/bhyve/pci_emul.h projects/bhyve/usr.sbin/bhyve/pci_passthru.c Modified: projects/bhyve/lib/libvmmapi/vmmapi.c ============================================================================== --- projects/bhyve/lib/libvmmapi/vmmapi.c Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/lib/libvmmapi/vmmapi.c Sat Apr 28 16:28:00 2012 (r234761) @@ -454,6 +454,25 @@ vm_setup_msi(struct vmctx *ctx, int vcpu return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } +int +vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + int idx, uint32_t msg, uint32_t vector_control, uint64_t addr) +{ + struct vm_pptdev_msix pptmsix; + + bzero(&pptmsix, sizeof(pptmsix)); + pptmsix.vcpu = vcpu; + pptmsix.bus = bus; + pptmsix.slot = slot; + pptmsix.func = func; + pptmsix.idx = idx; + pptmsix.msg = msg; + pptmsix.addr = addr; + pptmsix.vector_control = vector_control; + + return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); +} + uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) Modified: projects/bhyve/lib/libvmmapi/vmmapi.h ============================================================================== --- projects/bhyve/lib/libvmmapi/vmmapi.h 
Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/lib/libvmmapi/vmmapi.h Sat Apr 28 16:28:00 2012 (r234761) @@ -77,6 +77,8 @@ int vm_map_pptdev_mmio(struct vmctx *ctx vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int dest, int vector, int numvec); +int vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + int idx, uint32_t msg, uint32_t vector_control, uint64_t addr); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. Modified: projects/bhyve/sys/amd64/include/vmm.h ============================================================================== --- projects/bhyve/sys/amd64/include/vmm.h Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/sys/amd64/include/vmm.h Sat Apr 28 16:28:00 2012 (r234761) @@ -227,7 +227,8 @@ enum vm_exitcode { VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, - VM_EXITCODE_MAX, + VM_EXITCODE_PAGING, + VM_EXITCODE_MAX }; struct vm_exit { @@ -243,6 +244,9 @@ struct vm_exit { uint16_t port; uint32_t eax; /* valid for out */ } inout; + struct { + uint64_t cr3; + } paging; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. 
Modified: projects/bhyve/sys/amd64/include/vmm_dev.h ============================================================================== --- projects/bhyve/sys/amd64/include/vmm_dev.h Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/sys/amd64/include/vmm_dev.h Sat Apr 28 16:28:00 2012 (r234761) @@ -108,6 +108,17 @@ struct vm_pptdev_msi { int destcpu; }; +struct vm_pptdev_msix { + int vcpu; + int bus; + int slot; + int func; + int idx; + uint32_t msg; + uint32_t vector_control; + uint64_t addr; +}; + struct vm_nmi { int cpuid; }; @@ -143,6 +154,7 @@ enum { IOCNUM_UNBIND_PPTDEV, IOCNUM_MAP_PPTDEV_MMIO, IOCNUM_PPTDEV_MSI, + IOCNUM_PPTDEV_MSIX, IOCNUM_INJECT_NMI, IOCNUM_VM_STATS, IOCNUM_VM_STAT_DESC, @@ -182,6 +194,8 @@ enum { _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) #define VM_PPTDEV_MSI \ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_PPTDEV_MSIX \ + _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS \ Modified: projects/bhyve/sys/amd64/vmm/intel/vmcs.h ============================================================================== --- projects/bhyve/sys/amd64/vmm/intel/vmcs.h Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/sys/amd64/vmm/intel/vmcs.h Sat Apr 28 16:28:00 2012 (r234761) @@ -65,6 +65,7 @@ uint64_t vmcs_read(uint32_t encoding); #define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) #define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) #define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) +#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) #endif /* _KERNEL */ Modified: projects/bhyve/sys/amd64/vmm/intel/vmx.c ============================================================================== --- projects/bhyve/sys/amd64/vmm/intel/vmx.c Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/sys/amd64/vmm/intel/vmx.c Sat Apr 28 16:28:00 2012 (r234761) @@ -1185,6 +1185,10 @@ vmx_exit_process(struct vmx 
*vmx, int vc case EXIT_REASON_CPUID: handled = vmx_handle_cpuid(vcpu, vmxctx); break; + case EXIT_REASON_EPT_FAULT: + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->u.paging.cr3 = vmcs_guest_cr3(); + break; default: break; } Modified: projects/bhyve/sys/amd64/vmm/io/ppt.c ============================================================================== --- projects/bhyve/sys/amd64/vmm/io/ppt.c Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/sys/amd64/vmm/io/ppt.c Sat Apr 28 16:28:00 2012 (r234761) @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -56,9 +57,12 @@ __FBSDID("$FreeBSD$"); #define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) #define MAX_MSIMSGS 32 +MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); + struct pptintr_arg { /* pptintr(pptintr_arg) */ struct pptdev *pptdev; - int msg; + int vec; + int vcpu; }; static struct pptdev { @@ -75,6 +79,16 @@ static struct pptdev { void *cookie[MAX_MSIMSGS]; struct pptintr_arg arg[MAX_MSIMSGS]; } msi; + + struct { + int num_msgs; + int startrid; + int msix_table_rid; + struct resource *msix_table_res; + struct resource **res; + void **cookie; + struct pptintr_arg *arg; + } msix; } pptdevs[32]; static int num_pptdevs; @@ -209,6 +223,57 @@ ppt_teardown_msi(struct pptdev *ppt) ppt->msi.num_msgs = 0; } +static void +ppt_teardown_msix_intr(struct pptdev *ppt, int idx) +{ + int rid; + struct resource *res; + void *cookie; + + rid = ppt->msix.startrid + idx; + res = ppt->msix.res[idx]; + cookie = ppt->msix.cookie[idx]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msix.res[idx] = NULL; + ppt->msix.cookie[idx] = NULL; +} + +static void +ppt_teardown_msix(struct pptdev *ppt) +{ + int i, error; + + if (ppt->msix.num_msgs == 0) + return; + + for (i = 0; i < ppt->msix.num_msgs; i++) + ppt_teardown_msix_intr(ppt, i); + + if (ppt->msix.msix_table_res) { + 
bus_release_resource(ppt->dev, SYS_RES_MEMORY, + ppt->msix.msix_table_rid, + ppt->msix.msix_table_res); + ppt->msix.msix_table_res = NULL; + ppt->msix.msix_table_rid = 0; + } + + free(ppt->msix.res, M_PPTMSIX); + free(ppt->msix.cookie, M_PPTMSIX); + free(ppt->msix.arg, M_PPTMSIX); + + error = pci_release_msi(ppt->dev); + if (error) + printf("ppt_teardown_msix: Failed to release MSI-X resources (error %i)\n", error); + + ppt->msix.num_msgs = 0; +} + int ppt_assign_device(struct vm *vm, int bus, int slot, int func) { @@ -244,6 +309,7 @@ ppt_unassign_device(struct vm *vm, int b return (EBUSY); ppt_unmap_mmio(vm, ppt); ppt_teardown_msi(ppt); + ppt_teardown_msix(ppt); iommu_remove_device(vm_iommu_domain(vm), bus, slot, func); ppt->vm = NULL; return (0); @@ -309,10 +375,10 @@ pptintr(void *arg) pptarg = arg; ppt = pptarg->pptdev; - vec = ppt->msi.vector + pptarg->msg; + vec = pptarg->vec; if (ppt->vm != NULL) - (void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec); + (void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec); else { /* * XXX @@ -431,7 +497,7 @@ ppt_setup_msi(struct vm *vm, int vcpu, i break; ppt->msi.arg[i].pptdev = ppt; - ppt->msi.arg[i].msg = i; + ppt->msi.arg[i].vec = vector + i; error = bus_setup_intr(ppt->dev, ppt->msi.res[i], INTR_TYPE_NET | INTR_MPSAFE, @@ -448,3 +514,110 @@ ppt_setup_msi(struct vm *vm, int vcpu, i return (0); } + +int +ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint32_t msg, uint32_t vector_control, uint64_t addr) +{ + struct pptdev *ppt; + struct pci_devinfo *dinfo; + int numvec, vector_count, rid, error; + size_t res_size, cookie_size, arg_size; + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + dinfo = device_get_ivars(ppt->dev); + if (!dinfo) + return (ENXIO); + + /* + * First-time configuration: + * Allocate the MSI-X table + * Allocate the IRQ resources + * Set up some variables in ppt->msix + */ + 
if (!ppt->msix.msix_table_res) { + ppt->msix.res = NULL; + ppt->msix.cookie = NULL; + ppt->msix.arg = NULL; + + rid = dinfo->cfg.msix.msix_table_bar; + ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, SYS_RES_MEMORY, + &rid, RF_ACTIVE); + if (ppt->msix.msix_table_res == NULL) + return (ENOSPC); + + ppt->msix.msix_table_rid = rid; + + vector_count = numvec = pci_msix_count(ppt->dev); + + error = pci_alloc_msix(ppt->dev, &numvec); + if (error) + return (error); + else if (vector_count != numvec) { + pci_release_msi(ppt->dev); + return (ENOSPC); + } + + ppt->msix.num_msgs = numvec; + + ppt->msix.startrid = 1; + + res_size = numvec * sizeof(ppt->msix.res[0]); + cookie_size = numvec * sizeof(ppt->msix.cookie[0]); + arg_size = numvec * sizeof(ppt->msix.arg[0]); + + ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK); + ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, M_WAITOK); + ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK); + if (ppt->msix.res == NULL || ppt->msix.cookie == NULL || + ppt->msix.arg == NULL) { + ppt_teardown_msix(ppt); + return (ENOSPC); + } + bzero(ppt->msix.res, res_size); + bzero(ppt->msix.cookie, cookie_size); + bzero(ppt->msix.arg, arg_size); + } + + if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* Tear down the IRQ if it's already set up */ + ppt_teardown_msix_intr(ppt, idx); + + /* Allocate the IRQ resource */ + ppt->msix.cookie[idx] = NULL; + rid = ppt->msix.startrid + idx; + ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, RF_ACTIVE); + if (ppt->msix.res[idx] == NULL) + return (ENXIO); + + ppt->msix.arg[idx].pptdev = ppt; + ppt->msix.arg[idx].vec = msg; + ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF; + + /* Setup the MSI-X interrupt */ + error = bus_setup_intr(ppt->dev, ppt->msix.res[idx], + INTR_TYPE_NET | INTR_MPSAFE, + pptintr, NULL, &ppt->msix.arg[idx], + &ppt->msix.cookie[idx]); + + if (error != 0) { + bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]); + 
bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]); + ppt->msix.cookie[idx] = NULL; + ppt->msix.res[idx] = NULL; + return (ENXIO); + } + } else { + /* Masked, tear it down if it's already been set up */ + ppt_teardown_msix_intr(ppt, idx); + } + + return (0); +} + Modified: projects/bhyve/sys/amd64/vmm/io/ppt.h ============================================================================== --- projects/bhyve/sys/amd64/vmm/io/ppt.h Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/sys/amd64/vmm/io/ppt.h Sat Apr 28 16:28:00 2012 (r234761) @@ -36,5 +36,6 @@ int ppt_map_mmio(struct vm *vm, int bus, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, int destcpu, int vector, int numvec); - +int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint32_t msg, uint32_t vector_control, uint64_t addr); #endif Modified: projects/bhyve/sys/amd64/vmm/io/vlapic.c ============================================================================== --- projects/bhyve/sys/amd64/vmm/io/vlapic.c Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/sys/amd64/vmm/io/vlapic.c Sat Apr 28 16:28:00 2012 (r234761) @@ -778,6 +778,7 @@ vlapic_init(struct vm *vm, int vcpuid) void vlapic_cleanup(struct vlapic *vlapic) { + vlapic_op_halt(vlapic); vdev_unregister(vlapic); free(vlapic, M_VLAPIC); } Modified: projects/bhyve/sys/amd64/vmm/vmm_dev.c ============================================================================== --- projects/bhyve/sys/amd64/vmm/vmm_dev.c Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/sys/amd64/vmm/vmm_dev.c Sat Apr 28 16:28:00 2012 (r234761) @@ -158,6 +158,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long c struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; + struct vm_pptdev_msix *pptmsix; struct vm_nmi *vmnmi; struct vm_stats *vmstats; struct vm_stat_desc *statdesc; @@ -240,6 +241,14 @@ vmmdev_ioctl(struct cdev *cdev, 
u_long c pptmsi->destcpu, pptmsi->vector, pptmsi->numvec); break; + case VM_PPTDEV_MSIX: + pptmsix = (struct vm_pptdev_msix *)data; + error = ppt_setup_msix(sc->vm, pptmsix->vcpu, + pptmsix->bus, pptmsix->slot, + pptmsix->func, pptmsix->idx, + pptmsix->msg, pptmsix->vector_control, + pptmsix->addr); + break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, Modified: projects/bhyve/usr.sbin/bhyve/Makefile ============================================================================== --- projects/bhyve/usr.sbin/bhyve/Makefile Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/usr.sbin/bhyve/Makefile Sat Apr 28 16:28:00 2012 (r234761) @@ -4,7 +4,8 @@ PROG= bhyve -SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c +SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c +SRCS+= instruction_emul.c mevent.c SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c Modified: projects/bhyve/usr.sbin/bhyve/fbsdrun.c ============================================================================== --- projects/bhyve/usr.sbin/bhyve/fbsdrun.c Sat Apr 28 14:42:49 2012 (r234760) +++ projects/bhyve/usr.sbin/bhyve/fbsdrun.c Sat Apr 28 16:28:00 2012 (r234761) @@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$"); #include "mevent.h" #include "pci_emul.h" #include "xmsr.h" +#include "instruction_emul.h" #define DEFAULT_GUEST_HZ 100 #define DEFAULT_GUEST_TSLICE 200 @@ -108,6 +109,7 @@ struct fbsdstats { uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; + uint64_t vmexit_paging; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; int io_reset; @@ -412,6 +414,20 @@ vmexit_mtrap(struct vmctx *ctx, struct v return (VMEXIT_RESTART); } +static int +vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_paging++; + + if (emulate_instruction(ctx, *pvcpu, vmexit->rip, 
vmexit->u.paging.cr3) != 0) { + printf("Failed to emulate instruction at 0x%lx\n", vmexit->rip); + return (VMEXIT_ABORT); + } + + return (VMEXIT_CONTINUE); +} + static void sigalrm(int sig) { @@ -446,12 +462,13 @@ setup_timeslice(void) } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { - [VM_EXITCODE_INOUT] = vmexit_inout, - [VM_EXITCODE_VMX] = vmexit_vmx, - [VM_EXITCODE_BOGUS] = vmexit_bogus, - [VM_EXITCODE_RDMSR] = vmexit_rdmsr, - [VM_EXITCODE_WRMSR] = vmexit_wrmsr, - [VM_EXITCODE_MTRAP] = vmexit_mtrap, + [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_RDMSR] = vmexit_rdmsr, + [VM_EXITCODE_WRMSR] = vmexit_wrmsr, + [VM_EXITCODE_MTRAP] = vmexit_mtrap, + [VM_EXITCODE_PAGING] = vmexit_paging }; static void Added: projects/bhyve/usr.sbin/bhyve/instruction_emul.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ projects/bhyve/usr.sbin/bhyve/instruction_emul.c Sat Apr 28 16:28:00 2012 (r234761) @@ -0,0 +1,555 @@ +/*- + * Copyright (c) 2012 Sandvine, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include + +#include "fbsdrun.h" +#include "instruction_emul.h" + +#define PREFIX_LOCK 0xF0 +#define PREFIX_REPNE 0xF2 +#define PREFIX_REPE 0xF3 +#define PREFIX_CS_OVERRIDE 0x2E +#define PREFIX_SS_OVERRIDE 0x36 +#define PREFIX_DS_OVERRIDE 0x3E +#define PREFIX_ES_OVERRIDE 0x26 +#define PREFIX_FS_OVERRIDE 0x64 +#define PREFIX_GS_OVERRIDE 0x65 +#define PREFIX_BRANCH_NOT_TAKEN 0x2E +#define PREFIX_BRANCH_TAKEN 0x3E +#define PREFIX_OPSIZE 0x66 +#define PREFIX_ADDRSIZE 0x67 + +#define OPCODE_2BYTE_ESCAPE 0x0F +#define OPCODE_3BYTE_ESCAPE 0x38 + +#define MODRM_MOD_MASK 0xC0 +#define MODRM_MOD_SHIFT 6 +#define MODRM_RM_MASK 0x07 +#define MODRM_RM_SHIFT 0 +#define MODRM_REG_MASK 0x38 +#define MODRM_REG_SHIFT 3 + +#define MOD_INDIRECT 0x0 +#define MOD_INDIRECT_DISP8 0x1 +#define MOD_INDIRECT_DISP32 0x2 +#define MOD_DIRECT 0x3 + +#define RM_EAX 0x0 +#define RM_ECX 0x1 +#define RM_EDX 0x2 +#define RM_EBX 0x3 +#define RM_SIB 0x4 +#define RM_DISP32 0x5 +#define RM_EBP RM_DISP32 +#define RM_ESI 0x6 +#define RM_EDI 0x7 + +#define REG_EAX 0x0 +#define REG_ECX 0x1 +#define REG_EDX 0x2 +#define REG_EBX 0x3 +#define REG_ESP 0x4 +#define REG_EBP 0x5 +#define REG_ESI 0x6 +#define REG_EDI 0x7 +#define REG_R8 0x8 +#define REG_R9 0x9 +#define REG_R10 0xA +#define REG_R11 0xB +#define REG_R12 0xC +#define REG_R13 0xD +#define REG_R14 0xE +#define REG_R15 0xF + +#define HAS_MODRM 1 +#define FROM_RM (1<<1) 
+#define FROM_REG (1<<2) +#define TO_RM (1<<3) +#define TO_REG (1<<4) + +#define REX_MASK 0xF0 +#define REX_PREFIX 0x40 +#define is_rex_prefix(x) ( ((x) & REX_MASK) == REX_PREFIX ) +#define REX_W_MASK 0x8 +#define REX_R_MASK 0x4 +#define REX_X_MASK 0x2 +#define REX_B_MASK 0x1 + +#define is_prefix(x) ((x) == PREFIX_LOCK || (x) == PREFIX_REPNE || \ + (x) == PREFIX_REPE || (x) == PREFIX_CS_OVERRIDE || \ + (x) == PREFIX_SS_OVERRIDE || (x) == PREFIX_DS_OVERRIDE || \ + (x) == PREFIX_ES_OVERRIDE || (x) == PREFIX_FS_OVERRIDE || \ + (x) == PREFIX_GS_OVERRIDE || (x) == PREFIX_BRANCH_NOT_TAKEN || \ + (x) == PREFIX_BRANCH_TAKEN || (x) == PREFIX_OPSIZE || \ + (x) == PREFIX_ADDRSIZE || is_rex_prefix((x))) + +#define PAGE_FRAME_MASK 0x80 +#define PAGE_OFFSET_MASK 0xFFF +#define PAGE_TABLE_ENTRY_MASK (~PAGE_OFFSET_MASK) +#define PML4E_OFFSET_MASK 0x0000FF8000000000 +#define PML4E_SHIFT 39 + +#define MAX_EMULATED_REGIONS 8 +int registered_regions = 0; +struct memory_region +{ + uintptr_t start; + uintptr_t end; + emulated_read_func_t memread; + emulated_write_func_t memwrite; + void *arg; +} emulated_regions[MAX_EMULATED_REGIONS]; + +struct decoded_instruction +{ + void *instruction; + uint8_t *opcode; + uint8_t *modrm; + uint8_t *sib; + uint8_t *displacement; + uint8_t *immediate; + + uint8_t opcode_flags; + + uint8_t addressing_mode; + uint8_t rm; + uint8_t reg; + uint8_t rex_r; + uint8_t rex_w; + uint8_t rex_b; + uint8_t rex_x; + + int32_t disp; +}; + +static enum vm_reg_name vm_reg_name_mappings[] = { + [REG_EAX] = VM_REG_GUEST_RAX, + [REG_EBX] = VM_REG_GUEST_RBX, + [REG_ECX] = VM_REG_GUEST_RCX, + [REG_EDX] = VM_REG_GUEST_RDX, + [REG_ESP] = VM_REG_GUEST_RSP, + [REG_EBP] = VM_REG_GUEST_RBP, + [REG_ESI] = VM_REG_GUEST_RSI, + [REG_EDI] = VM_REG_GUEST_RDI, + [REG_R8] = VM_REG_GUEST_R8, + [REG_R9] = VM_REG_GUEST_R9, + [REG_R10] = VM_REG_GUEST_R10, + [REG_R11] = VM_REG_GUEST_R11, + [REG_R12] = VM_REG_GUEST_R12, + [REG_R13] = VM_REG_GUEST_R13, + [REG_R14] = VM_REG_GUEST_R14, + 
[REG_R15] = VM_REG_GUEST_R15 +}; + +uint8_t one_byte_opcodes[256] = { + [0x89] = HAS_MODRM | FROM_REG | TO_RM, + [0x8B] = HAS_MODRM | FROM_RM | TO_REG, +}; + +static uintptr_t +gla2gpa(uint64_t gla, uint64_t guest_cr3) +{ + uint64_t *table; + uint64_t mask, entry; + int level, shift; + uintptr_t page_frame; + + table = paddr_guest2host(guest_cr3 & PAGE_TABLE_ENTRY_MASK); + mask = PML4E_OFFSET_MASK; + shift = PML4E_SHIFT; + for (level = 0; level < 4; ++level) + { + entry = table[(gla & mask) >> shift]; + table = (uint64_t*)(entry & PAGE_TABLE_ENTRY_MASK); + + /* This entry does not point to another page table */ + if (entry & PAGE_FRAME_MASK || level >= 3) + break; + + table = paddr_guest2host((uintptr_t)table); + mask >>= 9; + shift -= 9; + } + + mask = (1 << shift) - 1; + page_frame = ((uintptr_t)table & ~mask); + return (page_frame | (gla & mask)); +} + +static void * +gla2hla(uint64_t gla, uint64_t guest_cr3) +{ + uintptr_t gpa; + + gpa = gla2gpa(gla, guest_cr3); + return paddr_guest2host(gpa); +} + +/* + * Decodes all of the prefixes of the instruction. Only a subset of REX + * prefixes are currently supported. If any unsupported prefix is + * encountered, returns -1. + */ +static int +decode_prefixes(struct decoded_instruction *decoded) +{ + uint8_t *current_prefix; + + current_prefix = decoded->instruction; + + if (is_rex_prefix(*current_prefix)) { + decoded->rex_w = *current_prefix & REX_W_MASK; + decoded->rex_r = *current_prefix & REX_R_MASK; + decoded->rex_x = *current_prefix & REX_X_MASK; + decoded->rex_b = *current_prefix & REX_B_MASK; + current_prefix++; + } else if (is_prefix(*current_prefix)) { + return (-1); + } + + decoded->opcode = current_prefix; + return (0); +} + +/* + * Decodes the instruction's opcode. If the opcode is not understood, returns + * -1 indicating an error. Sets the instruction's mod_rm pointer to the + * location of the ModR/M field. 
+ */ +static int +decode_opcode(struct decoded_instruction *decoded) +{ + uint8_t opcode, flags; + + opcode = *decoded->opcode; + flags = one_byte_opcodes[opcode]; + + if (!flags) + return (-1); + + if (flags & HAS_MODRM) { + decoded->modrm = decoded->opcode + 1; + } + + decoded->opcode_flags = flags; + + return (0); +} + +/* + * Decodes the instruction's ModR/M field. Sets the instruction's sib pointer + * to the location of the SIB if one is expected to be present, or 0 if not. + */ +static int +decode_mod_rm(struct decoded_instruction *decoded) +{ + uint8_t modrm; + uint8_t *extension_operands; + + if (decoded->modrm) { + modrm = *decoded->modrm; + + decoded->addressing_mode = (modrm & MODRM_MOD_MASK) >> MODRM_MOD_SHIFT; + decoded->rm = (modrm & MODRM_RM_MASK) >> MODRM_RM_SHIFT; + decoded->reg = (modrm & MODRM_REG_MASK) >> MODRM_REG_SHIFT; + + if (decoded->rex_b) + decoded->rm |= (1<<3); + + if (decoded->rex_r) + decoded->reg |= (1<<3); + + extension_operands = decoded->modrm + 1; + + if (decoded->rm == RM_SIB) { + decoded->sib = decoded->modrm + 1; + extension_operands = decoded->sib + 1; + } + + switch (decoded->addressing_mode) { + case MOD_INDIRECT: + case MOD_DIRECT: + decoded->displacement = 0; + break; + case MOD_INDIRECT_DISP8: + decoded->displacement = extension_operands; + break; + case MOD_INDIRECT_DISP32: + decoded->displacement = extension_operands; + break; + } + } + + return (0); +} + +/* + * Decodes the instruction's SIB field. No such instructions are currently + * supported, so do nothing and return -1 if there is a SIB field, 0 otherwise. + */ +static int +decode_sib(struct decoded_instruction *decoded) +{ + + if (decoded->sib) + return (-1); + + return (0); +} + +/* + * Grabs and saves the instruction's immediate operand and displacement if + * they are present. Immediates are not currently supported, so if an + * immediate is present it will return -1 indicating an error. 
+ */ +static int +decode_extension_operands(struct decoded_instruction *decoded) +{ + + if (decoded->displacement) { + if (decoded->addressing_mode == MOD_INDIRECT_DISP8) { + decoded->disp = (int32_t)*decoded->displacement; + } else if (decoded->addressing_mode == MOD_INDIRECT_DISP32) { + decoded->disp = *((int32_t*)decoded->displacement); + } + } + + if (decoded->immediate) { + return (-1); + } + + return (0); +} + +static int +decode_instruction(void *instr, struct decoded_instruction *decoded) +{ + int error; + + bzero(decoded, sizeof(*decoded)); + decoded->instruction = instr; + + error = decode_prefixes(decoded); + if (error) + return (error); + + error = decode_opcode(decoded); + if (error) + return (error); + + error = decode_mod_rm(decoded); + if (error) + return (error); + + error = decode_sib(decoded); + if (error) + return (error); + + error = decode_extension_operands(decoded); + if (error) + return (error); + + return (0); +} + +static struct memory_region * +find_region(uintptr_t addr) +{ + int i; + + for (i = 0; i < registered_regions; ++i) { + if (emulated_regions[i].start <= addr && + emulated_regions[i].end >= addr) { + return &emulated_regions[i]; + } + } + + return (0); +} + +static enum vm_reg_name +get_vm_reg_name(uint8_t reg) +{ + return vm_reg_name_mappings[reg]; +} + +static int +get_operand(struct vmctx *vm, int vcpu, uint64_t guest_cr3, + const struct decoded_instruction *instruction, uint64_t *operand) +{ + enum vm_reg_name regname; + uint64_t reg; + uintptr_t target; + int error; + uint8_t rm, addressing_mode; + struct memory_region *emulated_memory; + + if (instruction->opcode_flags & FROM_RM) { + rm = instruction->rm; + addressing_mode = instruction->addressing_mode; + } else if (instruction->opcode_flags & FROM_REG) { + rm = instruction->reg; + addressing_mode = MOD_DIRECT; + } else + return (-1); + + regname = get_vm_reg_name(rm); + error = vm_get_register(vm, vcpu, regname, &reg); + if (error) + return (error); + + switch 
(addressing_mode) { + case MOD_DIRECT: + *operand = reg; + return (0); + case MOD_INDIRECT: + target = gla2gpa(reg, guest_cr3); + emulated_memory = find_region(target); + if (emulated_memory) { + return emulated_memory->memread(vm, vcpu, target, + 4, operand, + emulated_memory->arg); + } + return (-1); + case MOD_INDIRECT_DISP8: + case MOD_INDIRECT_DISP32: + target = gla2gpa(reg, guest_cr3); + target += instruction->disp; + emulated_memory = find_region(target); + if (emulated_memory) { + return emulated_memory->memread(vm, vcpu, target, + 4, operand, + emulated_memory->arg); + } + return (-1); + default: + return (-1); + } +} + +static int +perform_write(struct vmctx *vm, int vcpu, uint64_t guest_cr3, + const struct decoded_instruction *instruction, uint64_t operand) +{ + enum vm_reg_name regname; + uintptr_t target; + int error; + uint64_t reg; + struct memory_region *emulated_memory; + uint8_t addressing_mode; + + if (instruction->opcode_flags & TO_RM) { + reg = instruction->rm; + addressing_mode = instruction->addressing_mode; + } else if (instruction->opcode_flags & TO_REG) { + reg = instruction->reg; + addressing_mode = MOD_DIRECT; + } else + return (-1); + + regname = get_vm_reg_name(reg); + error = vm_get_register(vm, vcpu, regname, &reg); + if (error) + return (error); + + switch(addressing_mode) { + case MOD_DIRECT: + return vm_set_register(vm, vcpu, regname, operand); + case MOD_INDIRECT: + target = gla2gpa(reg, guest_cr3); + emulated_memory = find_region(target); + if (emulated_memory) { + return emulated_memory->memwrite(vm, vcpu, target, + 4, operand, + emulated_memory->arg); + } + return (-1); + default: + return (-1); + } +} + +static int +emulate_decoded_instruction(struct vmctx *vm, int vcpu, uint64_t cr3, + const struct decoded_instruction *instruction) +{ + uint64_t operand; + int error; + + error = get_operand(vm, vcpu, cr3, instruction, &operand); + if (error) + return (error); + + return perform_write(vm, vcpu, cr3, instruction, operand); +} + 
+int +emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, uint64_t cr3) +{ + struct decoded_instruction instr; + int error; + void *instruction = gla2hla(rip, cr3); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***