Date: Sun, 6 Oct 2013 09:37:58 +0000 (UTC) From: Mark Murray <markm@FreeBSD.org> To: src-committers@freebsd.org, svn-src-projects@freebsd.org Subject: svn commit: r256080 - in projects/random_number_generator: lib/libvmmapi sys/amd64/amd64 sys/amd64/include sys/amd64/vmm sys/amd64/vmm/amd sys/amd64/vmm/intel sys/amd64/vmm/io sys/cam/ctl sys/conf ... Message-ID: <201310060937.r969bw3v038058@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: markm Date: Sun Oct 6 09:37:57 2013 New Revision: 256080 URL: http://svnweb.freebsd.org/changeset/base/256080 Log: MFC - tracking commit Deleted: projects/random_number_generator/sys/kern/uipc_cow.c Modified: projects/random_number_generator/lib/libvmmapi/vmmapi.c projects/random_number_generator/lib/libvmmapi/vmmapi.h projects/random_number_generator/sys/amd64/amd64/machdep.c projects/random_number_generator/sys/amd64/amd64/pmap.c projects/random_number_generator/sys/amd64/amd64/trap.c projects/random_number_generator/sys/amd64/include/pcpu.h projects/random_number_generator/sys/amd64/include/pmap.h projects/random_number_generator/sys/amd64/include/vmm.h (contents, props changed) projects/random_number_generator/sys/amd64/include/vmm_dev.h (contents, props changed) projects/random_number_generator/sys/amd64/include/vmm_instruction_emul.h (contents, props changed) projects/random_number_generator/sys/amd64/vmm/amd/amdv.c projects/random_number_generator/sys/amd64/vmm/intel/ept.c projects/random_number_generator/sys/amd64/vmm/intel/ept.h projects/random_number_generator/sys/amd64/vmm/intel/vmcs.c projects/random_number_generator/sys/amd64/vmm/intel/vmcs.h projects/random_number_generator/sys/amd64/vmm/intel/vmx.c projects/random_number_generator/sys/amd64/vmm/intel/vmx.h projects/random_number_generator/sys/amd64/vmm/intel/vmx_genassym.c projects/random_number_generator/sys/amd64/vmm/intel/vmx_support.S projects/random_number_generator/sys/amd64/vmm/io/ppt.c projects/random_number_generator/sys/amd64/vmm/io/ppt.h projects/random_number_generator/sys/amd64/vmm/vmm.c projects/random_number_generator/sys/amd64/vmm/vmm_dev.c projects/random_number_generator/sys/amd64/vmm/vmm_instruction_emul.c projects/random_number_generator/sys/amd64/vmm/vmm_mem.c projects/random_number_generator/sys/amd64/vmm/vmm_mem.h projects/random_number_generator/sys/cam/ctl/ctl_frontend_iscsi.c projects/random_number_generator/sys/conf/files projects/random_number_generator/sys/dev/e1000/if_igb.c projects/random_number_generator/sys/dev/hyperv/vmbus/hv_hv.c projects/random_number_generator/sys/dev/ixgbe/ixgbe.c projects/random_number_generator/sys/dev/virtio/network/if_vtnet.c projects/random_number_generator/sys/dev/xen/timer/timer.c projects/random_number_generator/sys/i386/include/pcpu.h projects/random_number_generator/sys/i386/xen/mp_machdep.c projects/random_number_generator/sys/i386/xen/mptable.c projects/random_number_generator/sys/kern/kern_malloc.c projects/random_number_generator/sys/x86/acpica/madt.c projects/random_number_generator/sys/x86/xen/hvm.c projects/random_number_generator/sys/x86/xen/xen_intr.c projects/random_number_generator/usr.sbin/bhyve/bhyverun.c projects/random_number_generator/usr.sbin/bhyve/pci_emul.c projects/random_number_generator/usr.sbin/bhyve/rtc.c projects/random_number_generator/usr.sbin/bhyvectl/bhyvectl.c projects/random_number_generator/usr.sbin/bhyveload/bhyveload.c Directory Properties: projects/random_number_generator/ (props changed) projects/random_number_generator/lib/libvmmapi/ (props changed) projects/random_number_generator/sys/ (props changed) projects/random_number_generator/sys/amd64/vmm/ (props changed) projects/random_number_generator/sys/conf/ (props changed) projects/random_number_generator/sys/dev/hyperv/ (props changed) projects/random_number_generator/usr.sbin/bhyve/ (props changed) projects/random_number_generator/usr.sbin/bhyvectl/ (props changed) projects/random_number_generator/usr.sbin/bhyveload/ (props changed) Modified: projects/random_number_generator/lib/libvmmapi/vmmapi.c ============================================================================== --- projects/random_number_generator/lib/libvmmapi/vmmapi.c Sun Oct 6 06:57:28 2013 (r256079) +++ projects/random_number_generator/lib/libvmmapi/vmmapi.c Sun Oct 6 09:37:57 2013 (r256080) @@ -124,7 +124,8 @@ vm_destroy(struct vmctx *vm) } int -vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len) +vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, + int *wired) { int error; struct vm_memory_segment seg; @@ -133,6 +134,8 @@ vm_get_memory_seg(struct vmctx *ctx, vm_ seg.gpa = gpa; error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); *ret_len = seg.len; + if (wired != NULL) + *wired = seg.wired; return (error); } @@ -741,3 +744,23 @@ vcpu_reset(struct vmctx *vmctx, int vcpu done: return (error); } + +int +vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) +{ + int error, i; + struct vm_gpa_pte gpapte; + + bzero(&gpapte, sizeof(gpapte)); + gpapte.gpa = gpa; + + error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); + + if (error == 0) { + *num = gpapte.ptenum; + for (i = 0; i < gpapte.ptenum; i++) + pte[i] = gpapte.pte[i]; + } + + return (error); +} Modified: projects/random_number_generator/lib/libvmmapi/vmmapi.h ============================================================================== --- projects/random_number_generator/lib/libvmmapi/vmmapi.h Sun Oct 6 06:57:28 2013 (r256079) +++ projects/random_number_generator/lib/libvmmapi/vmmapi.h Sun Oct 6 09:37:57 2013 (r256080) @@ -45,9 +45,11 @@ enum vm_mmap_style { int vm_create(const char *name); struct vmctx *vm_open(const char *name); void vm_destroy(struct vmctx *ctx); -int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len); +int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, + int *wired); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, Modified: projects/random_number_generator/sys/amd64/amd64/machdep.c ============================================================================== --- projects/random_number_generator/sys/amd64/amd64/machdep.c Sun Oct 6 06:57:28 2013 (r256079) +++ projects/random_number_generator/sys/amd64/amd64/machdep.c Sun Oct 6 09:37:57 2013 (r256080) @@ -1574,7 +1574,7 @@ getmemsize(caddr_t kmdp, u_int64_t first /* * map page into kernel: valid, read/write,non-cacheable */ - *pte = pa | PG_V | PG_RW | PG_N; + *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; invltlb(); tmp = *(int *)ptr; Modified: projects/random_number_generator/sys/amd64/amd64/pmap.c ============================================================================== --- projects/random_number_generator/sys/amd64/amd64/pmap.c Sun Oct 6 06:57:28 2013 (r256079) +++ projects/random_number_generator/sys/amd64/amd64/pmap.c Sun Oct 6 09:37:57 2013 (r256080) @@ -76,6 +76,8 @@ * SUCH DAMAGE. */ +#define AMD64_NPT_AWARE + #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -143,6 +145,120 @@ __FBSDID("$FreeBSD$"); #include <machine/smp.h> #endif +static __inline boolean_t +pmap_emulate_ad_bits(pmap_t pmap) +{ + + return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); +} + +static __inline pt_entry_t +pmap_valid_bit(pmap_t pmap) +{ + pt_entry_t mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = X86_PG_V; + break; + case PT_EPT: + if (pmap_emulate_ad_bits(pmap)) + mask = EPT_PG_EMUL_V; + else + mask = EPT_PG_READ; + break; + default: + panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + +static __inline pt_entry_t +pmap_rw_bit(pmap_t pmap) +{ + pt_entry_t mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = X86_PG_RW; + break; + case PT_EPT: + if (pmap_emulate_ad_bits(pmap)) + mask = EPT_PG_EMUL_RW; + else + mask = EPT_PG_WRITE; + break; + default: + panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + +static __inline pt_entry_t +pmap_global_bit(pmap_t pmap) +{ + pt_entry_t mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = X86_PG_G; + break; + case PT_EPT: + mask = 0; + break; + default: + panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + +static __inline pt_entry_t +pmap_accessed_bit(pmap_t pmap) +{ + pt_entry_t mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = X86_PG_A; + break; + case PT_EPT: + if (pmap_emulate_ad_bits(pmap)) + mask = EPT_PG_READ; + else + mask = EPT_PG_A; + break; + default: + panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + +static __inline pt_entry_t +pmap_modified_bit(pmap_t pmap) +{ + pt_entry_t mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = X86_PG_M; + break; + case PT_EPT: + if (pmap_emulate_ad_bits(pmap)) + mask = EPT_PG_WRITE; + else + mask = EPT_PG_M; + break; + default: + panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline @@ -247,6 +363,8 @@ static struct md_page *pv_table; pt_entry_t *CMAP1 = 0; caddr_t CADDR1 = 0; +static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ + static struct unrhdr pcid_unr; static struct mtx pcid_mtx; int pmap_pcid_enabled = 1; @@ -306,12 +424,12 @@ static void pmap_fill_ptp(pt_entry_t *fi static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); -static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); +static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); -static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); +static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, @@ -323,7 +441,7 @@ static boolean_t pmap_try_insert_pv_entr vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); -static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); +static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); @@ -416,7 +534,9 @@ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; + pt_entry_t PG_V; + PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if ((*pml4e & PG_V) == 0) return (NULL); @@ -438,7 +558,9 @@ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; + pt_entry_t PG_V; + PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); @@ -460,7 +582,9 @@ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; + pt_entry_t PG_V; + PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); @@ -490,6 +614,8 @@ vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); + return (PTmap + ((va >> PAGE_SHIFT) & mask)); } @@ -498,6 +624,8 @@ vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); + return (PDmap + ((va >> PDRSHIFT) & mask)); } @@ -601,22 +729,24 @@ create_pagetables(vm_paddr_t *firstaddr) /* XXX not fully used, underneath 2M pages */ pt_p = (pt_entry_t *)KPTphys; for (i = 0; ptoa(i) < *firstaddr; i++) - pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G; + pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G; /* Now map the page tables at their location within PTmap */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) - pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; + pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) - pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G; + pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS | + X86_PG_G; /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) - pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U; + pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V | + PG_U; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If @@ -630,36 +760,36 @@ create_pagetables(vm_paddr_t *firstaddr) for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ - pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G | - PG_M | PG_A; + pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | + X86_PG_M | X86_PG_A; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ - pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G | - PG_M | PG_A; + pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | + X86_PG_M | X86_PG_A; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); - pdp_p[i] |= PG_RW | PG_V | PG_U; + pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; - p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U; + p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); - p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U; + p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); - p4_p[KPML4BASE + i] |= PG_RW | PG_V | PG_U; + p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U; } } @@ -705,6 +835,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ CPU_ZERO(&kernel_pmap->pm_save); TAILQ_INIT(&kernel_pmap->pm_pvchunk); + kernel_pmap->pm_flags = pmap_flags; /* * Initialize the global pv list lock. @@ -948,35 +1079,131 @@ SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, de * Low level helper routines..... ***************************************************/ +static pt_entry_t +pmap_swap_pat(pmap_t pmap, pt_entry_t entry) +{ + int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; + + switch (pmap->pm_type) { + case PT_X86: + /* Verify that both PAT bits are not set at the same time */ + KASSERT((entry & x86_pat_bits) != x86_pat_bits, + ("Invalid PAT bits in entry %#lx", entry)); + + /* Swap the PAT bits if one of them is set */ + if ((entry & x86_pat_bits) != 0) + entry ^= x86_pat_bits; + break; + case PT_EPT: + /* + * Nothing to do - the memory attributes are represented + * the same way for regular pages and superpages. + */ + break; + default: + panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); + } + + return (entry); +} + /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int -pmap_cache_bits(int mode, boolean_t is_pde) +pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); - /* The PAT bit is different for PTE's and PDE's. */ - pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; + switch (pmap->pm_type) { + case PT_X86: + /* The PAT bit is different for PTE's and PDE's. */ + pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; + + /* Map the caching mode to a PAT index. */ + pat_idx = pat_index[mode]; + + /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ + cache_bits = 0; + if (pat_idx & 0x4) + cache_bits |= pat_flag; + if (pat_idx & 0x2) + cache_bits |= PG_NC_PCD; + if (pat_idx & 0x1) + cache_bits |= PG_NC_PWT; + break; + + case PT_EPT: + cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); + break; - /* Map the caching mode to a PAT index. */ - pat_idx = pat_index[mode]; + default: + panic("unsupported pmap type %d", pmap->pm_type); + } - /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ - cache_bits = 0; - if (pat_idx & 0x4) - cache_bits |= pat_flag; - if (pat_idx & 0x2) - cache_bits |= PG_NC_PCD; - if (pat_idx & 0x1) - cache_bits |= PG_NC_PWT; return (cache_bits); } +static int +pmap_cache_mask(pmap_t pmap, boolean_t is_pde) +{ + int mask; + + switch (pmap->pm_type) { + case PT_X86: + mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; + break; + case PT_EPT: + mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); + break; + default: + panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); + } + + return (mask); +} + +static __inline boolean_t +pmap_ps_enabled(pmap_t pmap) +{ + + return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); +} + +static void +pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) +{ + + switch (pmap->pm_type) { + case PT_X86: + break; + case PT_EPT: + /* + * XXX + * This is a little bogus since the generation number is + * supposed to be bumped up when a region of the address + * space is invalidated in the page tables. + * + * In this case the old PDE entry is valid but yet we want + * to make sure that any mappings using the old entry are + * invalidated in the TLB. + * + * The reason this works as expected is because we rendezvous + * "all" host cpus and force any vcpu context to exit as a + * side-effect. + */ + atomic_add_acq_long(&pmap->pm_eptgen, 1); + break; + default: + panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); + } + pde_store(pde, newpde); +} + /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the @@ -985,8 +1212,17 @@ pmap_cache_bits(int mode, boolean_t is_p * The calling thread must be pinned to a processor. */ static void -pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) +pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { + pt_entry_t PG_G; + + if (pmap->pm_type == PT_EPT) + return; + + KASSERT(pmap->pm_type == PT_X86, + ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); + + PG_G = pmap_global_bit(pmap); if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ @@ -1048,12 +1284,61 @@ pmap_invalidate_page_pcid(pmap_t pmap, v * immutable. The kernel page table is always active on every * processor. */ + +/* + * Interrupt the cpus that are executing in the guest context. + * This will force the vcpu to exit and the cached EPT mappings + * will be invalidated by the host before the next vmresume. + */ +static __inline void +pmap_invalidate_ept(pmap_t pmap) +{ + + sched_pin(); + KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), + ("pmap_invalidate_ept: absurd pm_active")); + + /* + * The TLB mappings associated with a vcpu context are not + * flushed each time a different vcpu is chosen to execute. + * + * This is in contrast with a process's vtop mappings that + * are flushed from the TLB on each context switch. + * + * Therefore we need to do more than just a TLB shootdown on + * the active cpus in 'pmap->pm_active'. To do this we keep + * track of the number of invalidations performed on this pmap. + * + * Each vcpu keeps a cache of this counter and compares it + * just before a vmresume. If the counter is out-of-date an + * invept will be done to flush stale mappings from the TLB. + */ + atomic_add_acq_long(&pmap->pm_eptgen, 1); + + /* + * Force the vcpu to exit and trap back into the hypervisor. + * + * XXX this is not optimal because IPI_AST builds a trapframe + * whereas all we need is an 'eoi' followed by 'iret'. + */ + ipi_selected(pmap->pm_active, IPI_AST); + sched_unpin(); +} + void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t other_cpus; u_int cpuid; + if (pmap->pm_type == PT_EPT) { + pmap_invalidate_ept(pmap); + return; + } + + KASSERT(pmap->pm_type == PT_X86, + ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); + sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { if (!pmap_pcid_enabled) { @@ -1124,6 +1409,14 @@ pmap_invalidate_range(pmap_t pmap, vm_of vm_offset_t addr; u_int cpuid; + if (pmap->pm_type == PT_EPT) { + pmap_invalidate_ept(pmap); + return; + } + + KASSERT(pmap->pm_type == PT_X86, + ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); + sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { if (!pmap_pcid_enabled) { @@ -1175,6 +1468,14 @@ pmap_invalidate_all(pmap_t pmap) uint64_t cr3; u_int cpuid; + if (pmap->pm_type == PT_EPT) { + pmap_invalidate_ept(pmap); + return; + } + + KASSERT(pmap->pm_type == PT_X86, + ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); + sched_pin(); cpuid = PCPU_GET(cpuid); if (pmap == kernel_pmap || @@ -1243,6 +1544,7 @@ pmap_invalidate_cache(void) struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ + pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; @@ -1255,7 +1557,7 @@ pmap_update_pde_action(void *arg) struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) - pde_store(act->pde, act->newpde); + pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void @@ -1264,7 +1566,7 @@ pmap_update_pde_teardown(void *arg) struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) - pmap_update_pde_invalidate(act->va, act->newpde); + pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* @@ -1286,7 +1588,7 @@ pmap_update_pde(pmap_t pmap, vm_offset_t cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); - if (pmap == kernel_pmap) + if (pmap == kernel_pmap || pmap->pm_type == PT_EPT) active = all_cpus; else { active = pmap->pm_active; @@ -1296,6 +1598,7 @@ pmap_update_pde(pmap_t pmap, vm_offset_t act.store = cpuid; act.invalidate = active; act.va = va; + act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); @@ -1303,9 +1606,9 @@ pmap_update_pde(pmap_t pmap, vm_offset_t smp_no_rendevous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { - pde_store(pde, newpde); + pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) - pmap_update_pde_invalidate(va, newpde); + pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } @@ -1318,8 +1621,17 @@ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - invlpg(va); + switch (pmap->pm_type) { + case PT_X86: + if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + invlpg(va); + break; + case PT_EPT: + pmap->pm_eptgen++; + break; + default: + panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type); + } } PMAP_INLINE void @@ -1327,17 +1639,35 @@ pmap_invalidate_range(pmap_t pmap, vm_of { vm_offset_t addr; - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); + switch (pmap->pm_type) { + case PT_X86: + if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + break; + case PT_EPT: + pmap->pm_eptgen++; + break; + default: + panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type); + } } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - invltlb(); + switch (pmap->pm_type) { + case PT_X86: + if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + invltlb(); + break; + case PT_EPT: + pmap->pm_eptgen++; + break; + default: + panic("pmap_invalidate_all: unknown type %d", pmap->pm_type); + } } PMAP_INLINE void @@ -1351,9 +1681,9 @@ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { - pde_store(pde, newpde); + pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - pmap_update_pde_invalidate(va, newpde); + pmap_update_pde_invalidate(pmap, va, newpde); else CPU_ZERO(&pmap->pm_save); } @@ -1455,10 +1785,11 @@ pmap_extract(pmap_t pmap, vm_offset_t va { pdp_entry_t *pdpe; pd_entry_t *pde; - pt_entry_t *pte; + pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; + PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { @@ -1493,12 +1824,14 @@ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; - pt_entry_t pte; + pt_entry_t pte, PG_RW, PG_V; vm_paddr_t pa; vm_page_t m; pa = 0; m = NULL; + PG_RW = pmap_rw_bit(pmap); + PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, va); @@ -1571,16 +1904,18 @@ pmap_kenter(vm_offset_t va, vm_paddr_t p pt_entry_t *pte; pte = vtopte(va); - pte_store(pte, pa | PG_RW | PG_V | PG_G); + pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; + int cache_bits; pte = vtopte(va); - pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0)); + cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); + pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); } /* @@ -1629,20 +1964,22 @@ pmap_qenter(vm_offset_t sva, vm_page_t * { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; + int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; - pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); - if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { + cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); + pa = VM_PAGE_TO_PHYS(m) | cache_bits; + if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; - pte_store(pte, pa | PG_G | PG_RW | PG_V); + pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); } pte++; } - if (__predict_false((oldpte & PG_V) != 0)) + if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } @@ -1841,6 +2178,7 @@ pmap_pinit0(pmap_t pmap) TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1; + pmap->pm_flags = pmap_flags; } /* @@ -1848,9 +2186,10 @@ pmap_pinit0(pmap_t pmap) * such as one in a vmspace structure. */ int -pmap_pinit(pmap_t pmap) +pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pml4pg; + vm_paddr_t pml4phys; int i; /* @@ -1860,41 +2199,61 @@ pmap_pinit(pmap_t pmap) VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) VM_WAIT; - pmap->pm_cr3 = VM_PAGE_TO_PHYS(pml4pg); - pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pmap->pm_cr3); + pml4phys = VM_PAGE_TO_PHYS(pml4pg); + pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); + pmap->pm_pcid = -1; + pmap->pm_cr3 = ~0; /* initialize to an invalid value */ if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); - /* Wire in kernel global address entries. */ - for (i = 0; i < NKPML4E; i++) { - pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + (i << PAGE_SHIFT)) | - PG_RW | PG_V | PG_U; - } - for (i = 0; i < ndmpdpphys; i++) { - pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) | - PG_RW | PG_V | PG_U; - } + /* + * Do not install the host kernel mappings in the nested page + * tables. These mappings are meaningless in the guest physical + * address space. + */ + if ((pmap->pm_type = pm_type) == PT_X86) { + pmap->pm_cr3 = pml4phys; + + /* Wire in kernel global address entries. */ + for (i = 0; i < NKPML4E; i++) { + pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | + X86_PG_RW | X86_PG_V | PG_U; + } + for (i = 0; i < ndmpdpphys; i++) { + pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | + X86_PG_RW | X86_PG_V | PG_U; + } + + /* install self-referential address mapping entry(s) */ + pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | + X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; - /* install self-referential address mapping entry(s) */ - pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; + if (pmap_pcid_enabled) { + pmap->pm_pcid = alloc_unr(&pcid_unr); + if (pmap->pm_pcid != -1) + pmap->pm_cr3 |= pmap->pm_pcid; + } + } pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); - if (pmap_pcid_enabled) { - pmap->pm_pcid = alloc_unr(&pcid_unr); - if (pmap->pm_pcid != -1) - pmap->pm_cr3 |= pmap->pm_pcid; - } else { - pmap->pm_pcid = -1; - } + pmap->pm_flags = flags; + pmap->pm_eptgen = 0; CPU_ZERO(&pmap->pm_save); return (1); } +int +pmap_pinit(pmap_t pmap) +{ + + return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); +} + /* * This routine is called if the desired page table page does not exist. * @@ -1910,9 +2269,15 @@ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; + pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_V = pmap_valid_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + /* * Allocate a page table page. */ @@ -2040,9 +2405,11 @@ static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; - pdp_entry_t *pdpe; + pdp_entry_t *pdpe, PG_V; vm_page_t pdpg; + PG_V = pmap_valid_bit(pmap); + retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { @@ -2064,9 +2431,11 @@ static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; - pd_entry_t *pd; + pd_entry_t *pd, PG_V; vm_page_t m; + PG_V = pmap_valid_bit(pmap); + /* * Calculate pagetable page index */ @@ -2140,7 +2509,7 @@ pmap_release(pmap_t pmap) pmap_invalidate_all(pmap); } - m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pml4[KPML4BASE + i] = 0; @@ -2211,7 +2580,7 @@ pmap_growkernel(vm_offset_t addr) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); - if ((*pdpe & PG_V) == 0) { + if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | @@ -2221,12 +2590,12 @@ pmap_growkernel(vm_offset_t addr) if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); - *pdpe = (pdp_entry_t) - (paddr | PG_V | PG_RW | PG_A | PG_M); + *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | + X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201310060937.r969bw3v038058>