From: Kip Macy <kmacy@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Date: Mon, 19 Apr 2010 04:09:20 +0000 (UTC)
Subject: svn commit: r206826 - in user/kmacy/head_page_lock_2/sys: amd64/amd64 amd64/include cddl/contrib/opensolaris/uts/common/fs/zfs dev/agp dev/drm dev/md dev/ti fs/nfsclient fs/nwfs fs/smbfs fs/tmpfs k...

Author: kmacy
Date: Mon Apr 19 04:09:20 2010
New Revision: 206826

URL: http://svn.freebsd.org/changeset/base/206826

Log:
  - integrate page lock patch into latest version of head
  - additionally lock nwfs, nfsclient, smbfs, tmpfs, agp, and drm

Modified:
  user/kmacy/head_page_lock_2/sys/amd64/amd64/pmap.c
  user/kmacy/head_page_lock_2/sys/amd64/include/pmap.h
  user/kmacy/head_page_lock_2/sys/amd64/include/vmparam.h
  user/kmacy/head_page_lock_2/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
  user/kmacy/head_page_lock_2/sys/dev/agp/agp.c
  user/kmacy/head_page_lock_2/sys/dev/agp/agp_i810.c
  user/kmacy/head_page_lock_2/sys/dev/drm/via_dmablit.c
  user/kmacy/head_page_lock_2/sys/dev/md/md.c
  user/kmacy/head_page_lock_2/sys/dev/ti/if_ti.c
  user/kmacy/head_page_lock_2/sys/fs/nfsclient/nfs_clbio.c
  user/kmacy/head_page_lock_2/sys/fs/nwfs/nwfs_io.c
  user/kmacy/head_page_lock_2/sys/fs/smbfs/smbfs_io.c
  user/kmacy/head_page_lock_2/sys/fs/tmpfs/tmpfs_vnops.c
  user/kmacy/head_page_lock_2/sys/kern/kern_exec.c
  user/kmacy/head_page_lock_2/sys/kern/subr_uio.c
  user/kmacy/head_page_lock_2/sys/kern/subr_witness.c
  user/kmacy/head_page_lock_2/sys/kern/sys_pipe.c
  user/kmacy/head_page_lock_2/sys/kern/sys_process.c
  user/kmacy/head_page_lock_2/sys/kern/uipc_cow.c
  user/kmacy/head_page_lock_2/sys/kern/uipc_shm.c
  user/kmacy/head_page_lock_2/sys/kern/uipc_syscalls.c
  user/kmacy/head_page_lock_2/sys/kern/vfs_bio.c
  user/kmacy/head_page_lock_2/sys/net/bpf_zerocopy.c
  user/kmacy/head_page_lock_2/sys/nfsclient/nfs_bio.c
  user/kmacy/head_page_lock_2/sys/ufs/ffs/ffs_vnops.c
  user/kmacy/head_page_lock_2/sys/vm/device_pager.c
  user/kmacy/head_page_lock_2/sys/vm/pmap.h
  user/kmacy/head_page_lock_2/sys/vm/sg_pager.c
  user/kmacy/head_page_lock_2/sys/vm/swap_pager.c
  user/kmacy/head_page_lock_2/sys/vm/uma_core.c
  user/kmacy/head_page_lock_2/sys/vm/vm_contig.c
  user/kmacy/head_page_lock_2/sys/vm/vm_fault.c
  user/kmacy/head_page_lock_2/sys/vm/vm_glue.c
  user/kmacy/head_page_lock_2/sys/vm/vm_kern.c
  user/kmacy/head_page_lock_2/sys/vm/vm_map.c
  user/kmacy/head_page_lock_2/sys/vm/vm_mmap.c
  user/kmacy/head_page_lock_2/sys/vm/vm_object.c
  user/kmacy/head_page_lock_2/sys/vm/vm_page.c
  user/kmacy/head_page_lock_2/sys/vm/vm_page.h
  user/kmacy/head_page_lock_2/sys/vm/vm_pageout.c
  user/kmacy/head_page_lock_2/sys/vm/vnode_pager.c

Modified: user/kmacy/head_page_lock_2/sys/amd64/amd64/pmap.c
==============================================================================
--- user/kmacy/head_page_lock_2/sys/amd64/amd64/pmap.c	Mon Apr 19 03:07:47 2010	(r206825)
+++ user/kmacy/head_page_lock_2/sys/amd64/amd64/pmap.c	Mon Apr 19 04:09:20 2010	(r206826)
@@ -160,16 +160,33 @@ __FBSDID("$FreeBSD$");
 #define PMAP_INLINE
 #endif
 
-#define PV_STATS
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
+#define PA_LOCK_PAD	CACHE_LINE_SIZE
+
+struct vp_lock {
+	struct mtx	vp_lock;
+	unsigned char	pad[(PA_LOCK_PAD - sizeof(struct mtx))];
+};
+
 #define	pa_index(pa)	((pa) >> PDRSHIFT)
 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
 
+#define	PA_LOCKPTR(pa)	&pa_lock[pa_index((pa)) % PA_LOCK_COUNT].vp_lock
+#define	PA_LOCK(pa)	mtx_lock(PA_LOCKPTR(pa))
+#define	PA_TRYLOCK(pa)	mtx_trylock(PA_LOCKPTR(pa))
+#define	PA_UNLOCK(pa)	mtx_unlock(PA_LOCKPTR(pa))
+#define	PA_LOCK_ASSERT(pa, a)	mtx_assert(PA_LOCKPTR(pa), (a))
+
+#define	PA_LOCK_COUNT	256
+
+struct vp_lock pa_lock[PA_LOCK_COUNT] __aligned(CACHE_LINE_SIZE);
+
+
 struct pmap kernel_pmap_store;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
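The vp_lock array above is a hashed, cache-line-padded lock table: pa_index() maps every physical address inside one 2MB run to the same slot, and the pad field keeps each mutex on its own cache line so CPUs contending on different page locks never false-share. A minimal userspace analogue of the same layout, with pthread mutexes standing in for struct mtx (the 64-byte line size and every name here are assumptions for illustration, not kernel API):

    #include <pthread.h>
    #include <stdint.h>

    #define CACHE_LINE_SIZE	64	/* assumed; the kernel gets this from machine/param.h */
    #define PA_LOCK_COUNT	256
    #define PDRSHIFT		21	/* 2MB superpage shift on amd64 */

    /*
     * Pad each lock out to a whole number of cache lines, mirroring the
     * struct mtx version above.
     */
    struct vp_lock {
    	pthread_mutex_t	vp_lock;
    	unsigned char	pad[CACHE_LINE_SIZE -
    	    (sizeof(pthread_mutex_t) % CACHE_LINE_SIZE)];
    };

    static struct vp_lock pa_lock[PA_LOCK_COUNT]
        __attribute__((aligned(CACHE_LINE_SIZE)));

    /* Hash a physical address to its lock: one lock per 2MB-aligned run. */
    static pthread_mutex_t *
    pa_lockptr(uint64_t pa)
    {

    	return (&pa_lock[(pa >> PDRSHIFT) % PA_LOCK_COUNT].vp_lock);
    }

    /* Counterpart of the pmap_bootstrap() initialization loop later in the diff. */
    static void
    pa_lock_init(void)
    {
    	int i;

    	for (i = 0; i < PA_LOCK_COUNT; i++)
    		pthread_mutex_init(&pa_lock[i].vp_lock, NULL);
    }

Because unrelated pages can hash to the same slot, the kernel initializes these mutexes with MTX_RECURSE | MTX_DUPOK below: re-acquiring a colliding lock, or holding two locks of the same class, is expected rather than fatal.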
@@ -188,6 +205,15 @@ static int pg_ps_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
     "Are large page mappings enabled?");
 
+static uint64_t pmap_tryrelock_calls;
+SYSCTL_QUAD(_vm_pmap, OID_AUTO, tryrelock_calls, CTLFLAG_RD,
+    &pmap_tryrelock_calls, 0, "Number of tryrelock calls");
+
+static int pmap_tryrelock_restart;
+SYSCTL_INT(_vm_pmap, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
+    &pmap_tryrelock_restart, 0, "Number of tryrelock restarts");
+
+
 static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
 static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
 u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
@@ -199,7 +225,8 @@ static u_int64_t	DMPDPphys;	/* phys addr
 /*
  * Data for the pv entry allocation mechanism
 */
-static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
+static int pv_entry_count __aligned(CACHE_LINE_SIZE);
+static int pv_entry_max = 0, pv_entry_high_water = 0;
 static struct md_page *pv_table;
 static int shpgperproc = PMAP_SHPGPERPROC;
 
@@ -215,8 +242,9 @@ caddr_t CADDR1 = 0;
 static caddr_t crashdumpmap;
 
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
-static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
+static pv_entry_t get_pv_entry(pmap_t locked_pmap);
+static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+		    struct pv_list_head *pv_list);
 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
@@ -225,7 +253,8 @@ static pv_entry_t pmap_pvh_remove(struct
 static int pmap_pvh_wired_mappings(struct md_page *pvh, int count);
 
 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
-static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
+static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+    struct pv_list_head *pv_list);
 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va);
 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
@@ -244,7 +273,7 @@ static boolean_t pmap_protect_pde(pmap_t
     vm_prot_t prot);
 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
-    vm_page_t *free);
+    vm_page_t *free, struct pv_list_head *pv_list);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
     pd_entry_t ptepde, vm_page_t *free);
 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
@@ -252,17 +281,17 @@ static void pmap_remove_page(pmap_t pmap
     vm_page_t *free);
 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va);
-static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m);
 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     pd_entry_t newpde);
 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
 
-static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
-static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
+static vm_page_t pmap_allocpde(pmap_t pmap, vm_paddr_t pa, vm_offset_t va, int flags);
+static vm_page_t pmap_allocpte(pmap_t pmap, vm_paddr_t pa, vm_offset_t va, int flags);
 
-static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
+static vm_page_t _pmap_allocpte(pmap_t pmap, vm_paddr_t pa,
+    vm_pindex_t ptepindex, int flags);
 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_page_t* free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
@@ -271,6 +300,82 @@ static vm_offset_t pmap_kmem_choose(vm_o
 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
 
+
+#define LS_MAX	4
+struct lock_stack {
+	struct mtx	*ls_array[LS_MAX];
+	int		ls_top;
+};
+
+static void
+ls_init(struct lock_stack *ls)
+{
+
+	ls->ls_top = 0;
+}
+
+
+#define ls_push(ls, m)	_ls_push((ls), (m), LOCK_FILE, LOCK_LINE)
+
+static void
+_ls_push(struct lock_stack *ls, struct mtx *lock, char *file, int line)
+{
+
+	KASSERT(ls->ls_top < LS_MAX, ("lock stack overflow"));
+
+	ls->ls_array[ls->ls_top] = lock;
+	ls->ls_top++;
+#if LOCK_DEBUG > 0 || defined(MUTEX_NOINLINE)
+	_mtx_lock_flags(lock, 0, file, line);
+#else
+	_get_sleep_lock(lock, curthread, 0, file, line);
+#endif
+}
+
+static int
+ls_trypush(struct lock_stack *ls, struct mtx *lock)
+{
+
+	KASSERT(ls->ls_top < LS_MAX, ("lock stack overflow"));
+
+	if (mtx_trylock(lock) == 0)
+		return (0);
+
+	ls->ls_array[ls->ls_top] = lock;
+	ls->ls_top++;
+	return (1);
+}
+
+#ifdef notyet
+static void
+ls_pop(struct lock_stack *ls)
+{
+	struct mtx *lock;
+
+	KASSERT(ls->ls_top > 0, ("lock stack underflow"));
+
+	ls->ls_top--;
+	lock = ls->ls_array[ls->ls_top];
+	mtx_unlock(lock);
+}
+#endif
+
+static void
+ls_popa(struct lock_stack *ls)
+{
+	struct mtx *lock;
+
+	KASSERT(ls->ls_top > 0, ("lock stack underflow"));
+
+	while (ls->ls_top > 0) {
+		ls->ls_top--;
+		lock = ls->ls_array[ls->ls_top];
+		mtx_unlock(lock);
+	}
+}
+
+#ifdef INVARIANTS
+extern void kdb_backtrace(void);
+#endif
 /*
  * Move the kernel virtual free pointer to the next
  * 2MB.  This is used to help improve performance
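The lock_stack added above is a small fixed-depth LIFO of held mutexes: ls_push() records the lock and then acquires it, ls_trypush() records it only when a trylock succeeds, and ls_popa() releases everything in reverse order. A self-contained sketch of the same discipline using pthread mutexes (names mirror the functions above; the two-lock caller is invented for illustration):

    #include <assert.h>
    #include <pthread.h>

    #define LS_MAX	4

    struct lock_stack {
    	pthread_mutex_t	*ls_array[LS_MAX];
    	int		ls_top;
    };

    static void
    ls_init(struct lock_stack *ls)
    {

    	ls->ls_top = 0;
    }

    static void
    ls_push(struct lock_stack *ls, pthread_mutex_t *lock)
    {

    	assert(ls->ls_top < LS_MAX);
    	ls->ls_array[ls->ls_top++] = lock;
    	pthread_mutex_lock(lock);
    }

    /* Unwind the whole stack, newest lock first. */
    static void
    ls_popa(struct lock_stack *ls)
    {

    	while (ls->ls_top > 0)
    		pthread_mutex_unlock(ls->ls_array[--ls->ls_top]);
    }

    static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

    int
    main(void)
    {
    	struct lock_stack ls;

    	ls_init(&ls);
    	ls_push(&ls, &a);
    	ls_push(&ls, &b);
    	/* ... critical section holding both locks ... */
    	ls_popa(&ls);
    	return (0);
    }

Releasing in strict LIFO order keeps the unlock sequence the mirror image of the acquisition order, which is the cheapest way to stay consistent with a lock-order checker such as witness.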
@@ -420,6 +525,37 @@ vtopde(vm_offset_t va)
 	return (PDmap + ((va >> PDRSHIFT) & mask));
 }
 
+/*
+ * Try to acquire a physical address lock while a pmap is locked.  If we
+ * fail to trylock we unlock and lock the pmap directly and cache the
+ * locked pa in *locked.  The caller should then restart their loop in case
+ * the virtual to physical mapping has changed.
+ */
+static int
+pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
+{
+	vm_paddr_t lockpa;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	atomic_add_long((volatile long *)&pmap_tryrelock_calls, 1);
+	lockpa = *locked;
+	*locked = pa;
+	if (lockpa) {
+		PA_LOCK_ASSERT(lockpa, MA_OWNED);
+		if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
+			return (0);
+		PA_UNLOCK(lockpa);
+	}
+	if (PA_TRYLOCK(pa))
+		return 0;
+	PMAP_UNLOCK(pmap);
+	atomic_add_int((volatile int *)&pmap_tryrelock_restart, 1);
+	PA_LOCK(pa);
+	PMAP_LOCK(pmap);
+
+	return (EAGAIN);
+}
+
 static u_int64_t
 allocpages(vm_paddr_t *firstaddr, int n)
 {
@@ -529,6 +665,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 {
 	vm_offset_t va;
 	pt_entry_t *pte, *unused;
+	int i;
 
 	/*
	 * Create an initial set of page tables to run the kernel in.
@@ -578,6 +715,12 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 
 	/* Initialize the PAT MSR. */
 	pmap_init_pat();
+
+	/* Setup page locks. */
+	for (i = 0; i < PA_LOCK_COUNT; i++)
+		mtx_init(&pa_lock[i].vp_lock, "page lock", NULL,
+		    MTX_DEF | MTX_RECURSE | MTX_DUPOK);
+
 }
 
 /*
@@ -651,6 +794,14 @@ pmap_page_init(vm_page_t m)
 	m->md.pat_mode = PAT_WRITE_BACK;
 }
 
+struct mtx *
+pmap_page_lockptr(vm_page_t m)
+{
+
+	KASSERT(m != NULL, ("pmap_page_lockptr: NULL page"));
+	return (PA_LOCKPTR(VM_PAGE_TO_PHYS(m)));
+}
+
 /*
  * Initialize the pmap module.
  * Called by vm_init, to initialize any structures that the pmap
@@ -1184,15 +1335,20 @@ pmap_extract_and_hold(pmap_t pmap, vm_of
 {
 	pd_entry_t pde, *pdep;
 	pt_entry_t pte;
+	vm_paddr_t pa;
 	vm_page_t m;
 
+	pa = 0;
 	m = NULL;
-	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
+retry:
 	pdep = pmap_pde(pmap, va);
 	if (pdep != NULL && (pde = *pdep)) {
 		if (pde & PG_PS) {
 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
+				if (pa_tryrelock(pmap, pde & PG_PS_FRAME, &pa))
+					goto retry;
+
 				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 				    (va & PDRMASK));
 				vm_page_hold(m);
@@ -1201,12 +1357,15 @@ pmap_extract_and_hold(pmap_t pmap, vm_of
 		pte = *pmap_pde_to_pte(pdep, va);
 		if ((pte & PG_V) &&
 		    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
+			if (pa_tryrelock(pmap, pte & PG_FRAME, &pa))
+				goto retry;
 			m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 			vm_page_hold(m);
 		}
 	}
-	vm_page_unlock_queues();
+	if (pa)
+		PA_UNLOCK(pa);
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
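pa_tryrelock() above is the linchpin of the patch: with the pmap lock held, it tries to take a page lock out of the normal order; on failure it backs out, takes the two locks in the safe order, and returns EAGAIN so the caller must re-validate whatever it read, exactly as the retry: loop in pmap_extract_and_hold() does. The same protocol in self-contained userspace C, including the cached-lock optimization that makes the post-restart call succeed immediately (two pthread mutexes model the pmap lock and one hashed page lock; all names are illustrative):

    #include <errno.h>
    #include <pthread.h>

    /*
     * 'outer' (the pmap lock) is held on entry; 'inner' is the page lock we
     * want; '*held' caches the page lock currently owned, if any.  Returns 0
     * when the state observed under 'outer' is still valid, EAGAIN when
     * 'outer' was dropped and the caller must re-check.
     */
    static int
    try_relock(pthread_mutex_t *outer, pthread_mutex_t *inner,
        pthread_mutex_t **held)
    {

    	if (*held == inner)
    		return (0);		/* already holding the right page lock */
    	if (*held != NULL)
    		pthread_mutex_unlock(*held);
    	*held = inner;
    	if (pthread_mutex_trylock(inner) == 0)
    		return (0);		/* acquired out of order: no retry needed */
    	pthread_mutex_unlock(outer);	/* fall back to the safe lock order */
    	pthread_mutex_lock(inner);
    	pthread_mutex_lock(outer);
    	return (EAGAIN);		/* the world may have changed: restart */
    }

    static pthread_mutex_t pmap_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static int mapping;			/* toy stand-in for a page-table entry */

    int
    lookup_and_hold(void)
    {
    	pthread_mutex_t *held = NULL;
    	int snapshot;

    	pthread_mutex_lock(&pmap_lock);
    retry:
    	snapshot = mapping;		/* read under the pmap lock */
    	if (try_relock(&pmap_lock, &page_lock, &held))
    		goto retry;		/* mapping may have changed while unlocked */
    	/* both locks held; 'snapshot' is known to be current here */
    	if (held != NULL)
    		pthread_mutex_unlock(held);
    	pthread_mutex_unlock(&pmap_lock);
    	return (snapshot);
    }

The trylock-then-back-off shape is what prevents deadlock: the blocking path always takes the page lock before the pmap lock, so no thread ever blocks on a page lock while another blocks on its pmap lock in the opposite order.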
@@ -1604,7 +1763,7 @@ pmap_pinit(pmap_t pmap)
  * race conditions.
 */
 static vm_page_t
-_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
+_pmap_allocpte(pmap_t pmap, vm_paddr_t pa, vm_pindex_t ptepindex, int flags)
 {
 	vm_page_t m, pdppg, pdpg;
 
@@ -1619,9 +1778,9 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (flags & M_WAITOK) {
 			PMAP_UNLOCK(pmap);
-			vm_page_unlock_queues();
+			PA_UNLOCK(pa);
 			VM_WAIT;
-			vm_page_lock_queues();
+			PA_LOCK(pa);
 			PMAP_LOCK(pmap);
 		}
 
@@ -1661,7 +1820,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t
 		pml4 = &pmap->pm_pml4[pml4index];
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pdp, recurse */
-			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
+			if (_pmap_allocpte(pmap, pa, NUPDE + NUPDPE + pml4index,
 			    flags) == NULL) {
 				--m->wire_count;
 				atomic_subtract_int(&cnt.v_wire_count, 1);
@@ -1694,7 +1853,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t
 			pml4 = &pmap->pm_pml4[pml4index];
 			if ((*pml4 & PG_V) == 0) {
 				/* Have to allocate a new pd, recurse */
-				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
+				if (_pmap_allocpte(pmap, pa, NUPDE + pdpindex,
 				    flags) == NULL) {
 					--m->wire_count;
 					atomic_subtract_int(&cnt.v_wire_count, 1);
@@ -1708,7 +1867,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t
 				pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 				if ((*pdp & PG_V) == 0) {
 					/* Have to allocate a new pd, recurse */
-					if (_pmap_allocpte(pmap, NUPDE + pdpindex,
+					if (_pmap_allocpte(pmap, pa, NUPDE + pdpindex,
 					    flags) == NULL) {
 						--m->wire_count;
 						atomic_subtract_int(&cnt.v_wire_count,
@@ -1735,7 +1894,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t
 }
 
 static vm_page_t
-pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
+pmap_allocpde(pmap_t pmap, vm_paddr_t pa, vm_offset_t va, int flags)
 {
 	vm_pindex_t pdpindex, ptepindex;
 	pdp_entry_t *pdpe;
@@ -1754,7 +1913,7 @@ retry:
 		/* Allocate a pd page. */
 		ptepindex = pmap_pde_pindex(va);
 		pdpindex = ptepindex >> NPDPEPGSHIFT;
-		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
+		pdpg = _pmap_allocpte(pmap, pa, NUPDE + pdpindex, flags);
 		if (pdpg == NULL && (flags & M_WAITOK))
 			goto retry;
 	}
@@ -1762,11 +1921,12 @@ retry:
 }
 
 static vm_page_t
-pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
+pmap_allocpte(pmap_t pmap, vm_paddr_t pa, vm_offset_t va, int flags)
 {
 	vm_pindex_t ptepindex;
 	pd_entry_t *pd;
 	vm_page_t m;
+	struct pv_list_head pv_list;
 
 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
@@ -1787,7 +1947,8 @@ retry:
 	 * normal 4K page.
 	 */
 	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
-		if (!pmap_demote_pde(pmap, pd, va)) {
+		TAILQ_INIT(&pv_list);
+		if (!pmap_demote_pde(pmap, pd, va, &pv_list)) {
 			/*
 			 * Invalidation of the 2MB page mapping may have caused
 			 * the deallocation of the underlying PD page.
@@ -1808,7 +1969,7 @@ retry:
 	 * Here if the pte page isn't mapped, or if it has been
 	 * deallocated.
 	 */
-	m = _pmap_allocpte(pmap, ptepindex, flags);
+	m = _pmap_allocpte(pmap, pa, ptepindex, flags);
 	if (m == NULL && (flags & M_WAITOK))
 		goto retry;
 }
@@ -2014,6 +2175,7 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_coll
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
 */
+#ifdef nomore
 static void
 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
 {
@@ -2029,8 +2191,8 @@ pmap_collect(pmap_t locked_pmap, struct
 		if (m->hold_count || m->busy)
 			continue;
 		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
-			va = pv->pv_va;
 			pmap = PV_PMAP(pv);
+			va = pv->pv_va;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
@@ -2064,7 +2226,7 @@ pmap_collect(pmap_t locked_pmap, struct
 		}
 	}
 }
-
+#endif
 
 /*
  * free the pv_entry back to the free list
@@ -2076,11 +2238,11 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv
 	struct pv_chunk *pc;
 	int idx, field, bit;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+	atomic_add_int(&pv_entry_count, -1);
 	PV_STAT(pv_entry_frees++);
 	PV_STAT(pv_entry_spare++);
-	pv_entry_count--;
 	pc = pv_to_chunk(pv);
 	idx = pv - &pc->pc_pventry[0];
 	field = idx / 64;
@@ -2099,7 +2261,9 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv
 	/* entire chunk is free, return it */
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 	dump_drop_page(m->phys_addr);
-	vm_page_unwire(m, 0);
+	KASSERT(m->wire_count == 1, ("wire_count == %d", m->wire_count));
+	m->wire_count--;
+	atomic_subtract_int(&cnt.v_wire_count, 1);
 	vm_page_free(m);
 }
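With the global page-queues mutex no longer serializing everything, free_pv_entry() above and get_pv_entry() below switch pv_entry_count maintenance from plain "pv_entry_count--/++" under a lock to atomic_add_int(), and the variable's declaration earlier in the diff gains __aligned(CACHE_LINE_SIZE) so the hot counter does not false-share a line with its neighbors. The equivalent construction in portable C11 (the 64-byte line size is assumed; names are illustrative):

    #include <stdalign.h>
    #include <stdatomic.h>

    #define CACHE_LINE_SIZE	64	/* assumed */

    /* A contended global counter: alone on its cache line, updated locklessly. */
    static alignas(CACHE_LINE_SIZE) atomic_int pv_entry_count;

    static void
    pv_entry_count_adj(int delta)
    {

    	atomic_fetch_add(&pv_entry_count, delta);
    }

Callers would use pv_entry_count_adj(1) on allocation and pv_entry_count_adj(-1) on free, matching the two call sites in this diff.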
@@ -2108,7 +2272,7 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv
  * when needed.
 */
 static pv_entry_t
-get_pv_entry(pmap_t pmap, int try)
+get_pv_entry(pmap_t pmap)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
@@ -2120,16 +2284,15 @@ get_pv_entry(pmap_t pmap, int try)
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+
+	atomic_add_int(&pv_entry_count, 1);
 	PV_STAT(pv_entry_allocs++);
-	pv_entry_count++;
 	if (pv_entry_count > pv_entry_high_water)
 		if (ratecheck(&lastprint, &printinterval))
 			printf("Approaching the limit on PV entries, consider "
 			    "increasing either the vm.pmap.shpgperproc or the "
 			    "vm.pmap.pv_entry_max sysctl.\n");
 	pq = NULL;
-retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		for (field = 0; field < _NPCM; field++) {
@@ -2156,26 +2319,9 @@ retry:
 	    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED);
 	if (m == NULL) {
-		if (try) {
-			pv_entry_count--;
-			PV_STAT(pc_chunk_tryfail++);
-			return (NULL);
-		}
-		/*
-		 * Reclaim pv entries: At first, destroy mappings to inactive
-		 * pages.  After that, if a pv chunk entry is still needed,
-		 * destroy mappings to active pages.
-		 */
-		if (pq == NULL) {
-			PV_STAT(pmap_collect_inactive++);
-			pq = &vm_page_queues[PQ_INACTIVE];
-		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
-			PV_STAT(pmap_collect_active++);
-			pq = &vm_page_queues[PQ_ACTIVE];
-		} else
-			panic("get_pv_entry: increase vm.pmap.shpgperproc");
-		pmap_collect(pmap, pq);
-		goto retry;
+		PV_STAT(pc_chunk_tryfail++);
+		atomic_add_int(&pv_entry_count, -1);
+		return (NULL);
 	}
 	PV_STAT(pc_chunk_count++);
 	PV_STAT(pc_chunk_allocs++);
@@ -2189,9 +2335,63 @@ retry:
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(pv_entry_spare += _NPCPV - 1);
+
 	return (pv);
 }
 
+static void
+pmap_pv_list_free(pmap_t pmap, struct pv_list_head *pv_list)
+{
+	pv_entry_t pv;
+
+	while (!TAILQ_EMPTY(pv_list)) {
+		pv = TAILQ_FIRST(pv_list);
+		TAILQ_REMOVE(pv_list, pv, pv_list);
+		free_pv_entry(pmap, pv);
+	}
+}
+
+static boolean_t
+pmap_pv_list_alloc(pmap_t pmap, int count, struct pv_list_head *pv_list)
+{
+	pv_entry_t pv;
+	int i;
+	boolean_t slept;
+
+	slept = FALSE;
+	for (i = 0; i < count; i++) {
+		while ((pv = get_pv_entry(pmap)) == NULL) {
+			PMAP_UNLOCK(pmap);
+			slept = TRUE;
+			VM_WAIT;
+			PMAP_LOCK(pmap);
+		}
+		TAILQ_INSERT_HEAD(pv_list, pv, pv_list);
+	}
+
+	return (slept);
+}
+
+static boolean_t
+pmap_pv_list_try_alloc(pmap_t pmap, int count, struct pv_list_head *pv_list)
+{
+	pv_entry_t pv;
+	int i;
+	boolean_t success;
+
+	success = TRUE;
+	for (i = 0; i < count; i++) {
+		if ((pv = get_pv_entry(pmap)) == NULL) {
+			success = FALSE;
+			pmap_pv_list_free(pmap, pv_list);
+			goto done;
+		}
+		TAILQ_INSERT_HEAD(pv_list, pv, pv_list);
+	}
+done:
+	return (success);
+}
+
 /*
  * First find and then remove the pv entry for the specified pmap and virtual
  * address from the specified pv list.  Returns the pv entry if found and NULL
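The pmap_pv_list_* helpers above exist because a 2MB demotion must produce pv entries for NPTEPG - 1 constituent 4KB pages in one burst, and by the time pmap_demote_pde() runs, pmap and page locks are held, so get_pv_entry() can no longer sleep its way out of a shortage. Callers therefore pre-build a private TAILQ of entries while sleeping is still safe and consume them later without allocating. The shape of that pattern, reduced to standalone C with malloc standing in for the pv-chunk allocator (all names here are illustrative):

    #include <stdlib.h>
    #include <sys/queue.h>

    struct pv_entry {
    	TAILQ_ENTRY(pv_entry)	link;
    	unsigned long		va;	/* payload: the VA this entry will describe */
    };
    TAILQ_HEAD(pv_list_head, pv_entry);

    /* Reserve 'count' entries up front, while blocking allocation is allowed. */
    static int
    pv_list_alloc(struct pv_list_head *list, int count)
    {
    	struct pv_entry *pv;
    	int i;

    	for (i = 0; i < count; i++) {
    		if ((pv = malloc(sizeof(*pv))) == NULL)
    			return (-1);	/* caller drains whatever was queued */
    		TAILQ_INSERT_HEAD(list, pv, link);
    	}
    	return (0);
    }

    /* Consume a reserved entry in the no-sleep path. */
    static struct pv_entry *
    pv_list_take(struct pv_list_head *list)
    {
    	struct pv_entry *pv;

    	if ((pv = TAILQ_FIRST(list)) != NULL)
    		TAILQ_REMOVE(list, pv, link);
    	return (pv);
    }

This is the same TAILQ_FIRST/TAILQ_REMOVE consumption that pmap_pv_demote_pde() performs below on the pre-filled pv_list.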
@@ -2203,7 +2403,8 @@ pmap_pvh_remove(struct md_page *pvh, pma
 {
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
@@ -2219,27 +2420,37 @@ pmap_pvh_remove(struct md_page *pvh, pma
  * entries for each of the 4KB page mappings.
 */
 static void
-pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
+pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct pv_list_head *pv_list)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	PA_LOCK_ASSERT(pa, MA_OWNED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 
-	/*
-	 * Transfer the 2mpage's pv entry for this mapping to the first
-	 * page's pv list.
-	 */
+	/* Transfer the 2mpage's pv entry for this mapping to the first
+	 * page's pv list.
+	 */
 	pvh = pa_to_pvh(pa);
 	va = trunc_2mpage(va);
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 	m = PHYS_TO_VM_PAGE(pa);
+#ifdef INVARIANTS
+	if (va == 0) {
+		printf("inserting va==0\n");
+		kdb_backtrace();
+	}
+#endif
+	vm_page_lock(m);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+	vm_page_unlock(m);
+
 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
 	va_last = va + NBPDR - PAGE_SIZE;
 	do {
@@ -2247,8 +2458,20 @@ pmap_pv_demote_pde(pmap_t pmap, vm_offse
 		KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
 		    ("pmap_pv_demote_pde: page %p is not managed", m));
 		va += PAGE_SIZE;
-		pmap_insert_entry(pmap, va, m);
+		pv = TAILQ_FIRST(pv_list);
+		TAILQ_REMOVE(pv_list, pv, pv_list);
+#ifdef INVARIANTS
+		if (va == 0) {
+			printf("inserting va==0\n");
+			kdb_backtrace();
+		}
+#endif
+		pv->pv_va = va;
+		vm_page_lock(m);
+		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+		vm_page_unlock(m);
 	} while (va < va_last);
+
 }
 
 /*
@@ -2264,7 +2487,7 @@ pmap_pv_promote_pde(pmap_t pmap, vm_offs
 	vm_offset_t va_last;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PA_LOCK_ASSERT(pa, MA_OWNED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 
@@ -2310,7 +2533,8 @@ pmap_remove_entry(pmap_t pmap, vm_page_t
 {
 	struct md_page *pvh;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	vm_page_lock_assert(m, MA_OWNED);
+
 	pmap_pvh_free(&m->md, pmap, va);
 	if (TAILQ_EMPTY(&m->md.pv_list)) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -2320,22 +2544,6 @@ pmap_remove_entry(pmap_t pmap, vm_page_t
 }
 
 /*
- * Create a pv entry for page at pa for
- * (pmap, va).
- */
-static void
-pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
-{
-	pv_entry_t pv;
-
-	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	pv = get_pv_entry(pmap, FALSE);
-	pv->pv_va = va;
-	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-}
-
-/*
  * Conditionally create a pv entry.
 */
 static boolean_t
@@ -2344,9 +2552,15 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm
 	pv_entry_t pv;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	vm_page_lock_assert(m, MA_OWNED);
 	if (pv_entry_count < pv_entry_high_water &&
-	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
+	    (pv = get_pv_entry(pmap)) != NULL) {
+#ifdef INVARIANTS
+		if (va == 0) {
+			printf("inserting va==0\n");
+			kdb_backtrace();
+		}
+#endif
 		pv->pv_va = va;
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 		return (TRUE);
@@ -2363,9 +2577,16 @@ pmap_pv_insert_pde(pmap_t pmap, vm_offse
 	struct md_page *pvh;
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PA_LOCK_ASSERT(pa, MA_OWNED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pv_entry_count < pv_entry_high_water &&
-	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
+	    (pv = get_pv_entry(pmap)) != NULL) {
+#ifdef INVARIANTS
+		if (va == 0) {
+			printf("inserting va==0\n");
+			kdb_backtrace();
+		}
+#endif
 		pv->pv_va = va;
 		pvh = pa_to_pvh(pa);
 		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
@@ -2393,7 +2614,8 @@ pmap_fill_ptp(pt_entry_t *firstpte, pt_e
  * mapping is invalidated.
 */
 static boolean_t
-pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
+pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+    struct pv_list_head *pv_list)
 {
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
@@ -2429,7 +2651,7 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t
 	    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 		free = NULL;
-		pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
+		pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free, pv_list);
 		pmap_invalidate_page(pmap, trunc_2mpage(va));
 		pmap_free_zero_pages(free);
 		CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
@@ -2439,6 +2661,10 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t
 		if (va < VM_MAXUSER_ADDRESS)
 			pmap->pm_stats.resident_count++;
 	}
+	if (TAILQ_EMPTY(pv_list) && ((oldpde & PG_MANAGED) != 0)) {
+		if (pmap_pv_list_try_alloc(pmap, NPTEPG-1, pv_list) == FALSE)
+			return (FALSE);
+	}
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
@@ -2496,7 +2722,7 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t
 	 * the 2mpage to referencing the page table page.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
-		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
+		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, pv_list);
 
 	pmap_pde_demotions++;
 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
@@ -2509,7 +2735,7 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t
 */
 static int
 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
-    vm_page_t *free)
+    vm_page_t *free, struct pv_list_head *pv_list)
 {
 	struct md_page *pvh;
 	pd_entry_t oldpde;
@@ -2536,6 +2762,10 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t
 		eva = sva + NBPDR;
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++) {
+			/*
+			 * XXX do we need to individually lock each page?
+			 *
+			 */
 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(m);
 			if (oldpde & PG_A)
@@ -2546,7 +2776,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t
 		}
 	}
 	if (pmap == kernel_pmap) {
-		if (!pmap_demote_pde(pmap, pdq, sva))
+		if (!pmap_demote_pde(pmap, pdq, sva, pv_list))
 			panic("pmap_remove_pde: failed demotion");
 	} else {
 		mpte = pmap_lookup_pt_page(pmap, sva);
@@ -2563,6 +2793,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t
 	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 }
 
+
 /*
  * pmap_remove_pte: do the things to unmap a page in a process
 */
@@ -2586,6 +2817,7 @@ pmap_remove_pte(pmap_t pmap, pt_entry_t
 	pmap->pm_stats.resident_count -= 1;
 	if (oldpte & PG_MANAGED) {
 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
+		vm_page_lock_assert(m, MA_OWNED);
 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		if (oldpte & PG_A)
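pmap_prealloc_pv_list() just below is a two-pass walk: the first pass over the range only counts the 2MB mappings that could require demotion, then a second loop reserves NPTEPG - 1 (511 on amd64) pv entries per such mapping while sleeping in VM_WAIT is still safe. The same count-then-reserve idiom in miniature, reusing pv_list_alloc() and struct pv_list_head from the sketch earlier (the page-table walk is reduced to a flat array of PDE-like words; the 0x80 PS-bit placement matches x86 but everything else is illustrative):

    #include <stdint.h>

    #define PDE_PS			0x80	/* "this is a superpage" bit, as on x86 */
    #define ENTRIES_PER_DEMOTION	511	/* NPTEPG - 1 on amd64 */

    /*
     * First pass: size the worst case.  Second pass: reserve everything up
     * front, while blocking allocation is still permitted.
     */
    static int
    prealloc_for_range(const uint64_t *pdes, int npdes, struct pv_list_head *list)
    {
    	int i, batches;

    	batches = 0;
    	for (i = 0; i < npdes; i++)
    		if (pdes[i] & PDE_PS)	/* may have to be demoted later */
    			batches++;
    	for (i = 0; i < batches; i++)
    		if (pv_list_alloc(list, ENTRIES_PER_DEMOTION) != 0)
    			return (-1);	/* caller drains the partial list */
    	return (0);
    }

Reserving for the worst case may overshoot, but any surplus entries are simply returned to the allocator afterwards, which is cheaper than discovering a shortage mid-demotion with locks held.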
@@ -2602,6 +2834,7 @@ static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
 {
 	pt_entry_t *pte;
+	vm_paddr_t pa = 0;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((*pde & PG_V) == 0)
@@ -2609,10 +2842,89 @@ pmap_remove_
 	pte = pmap_pde_to_pte(pde, va);
 	if ((*pte & PG_V) == 0)
 		return;
+	if (*pte & PG_MANAGED)
+		(void)pa_tryrelock(pmap, *pte & PG_FRAME, &pa);
+
 	pmap_remove_pte(pmap, pte, va, *pde, free);
+	if (pa)
+		PA_UNLOCK(pa);
 	pmap_invalidate_page(pmap, va);
 }
 
+static void
+pmap_prealloc_pv_list(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+    struct pv_list_head *pv_list)
+{
+	vm_offset_t va_next;
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	pd_entry_t ptpaddr, *pde;
+	pt_entry_t *pte;
+	int i, alloc_count;
+
+	alloc_count = 0;
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = va_next) {
+
+		pml4e = pmap_pml4e(pmap, sva);
+		if ((*pml4e & PG_V) == 0) {
+			va_next = (sva + NBPML4) & ~PML4MASK;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+
+		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+		if ((*pdpe & PG_V) == 0) {
+			va_next = (sva + NBPDP) & ~PDPMASK;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+
+		/*
+		 * Calculate index for next page table.
		 */
+		va_next = (sva + NBPDR) & ~PDRMASK;
+		if (va_next < sva)
+			va_next = eva;
+
+		pde = pmap_pdpe_to_pde(pdpe, sva);
+		ptpaddr = *pde;
+
+		/*
+		 * Weed out invalid mappings.
+		 */
+		if (ptpaddr == 0)
+			continue;
+
+		/*
+		 * Check for large page.
+		 */
+		if ((ptpaddr & PG_PS) != 0) {
+			alloc_count++;
+			continue;
+		}
+		/*
+		 * Limit our scan to either the end of the va represented
+		 * by the current page table page, or to the end of the
+		 * range being removed.
+		 */
+		if (va_next > eva)
+			va_next = eva;
+
+		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
+		    sva += PAGE_SIZE) {
+			if (*pte == 0)
+				continue;
+		}
+	}
+	for (i = 0; i < alloc_count; i++)
+		pmap_pv_list_alloc(pmap, NPTEPG-1, pv_list);
+
+	PMAP_UNLOCK(pmap);
+}
+
 /*
  * Remove the given range of addresses from the specified map.
 *
@@ -2627,7 +2939,9 @@ pmap_remove(pmap_t pmap, vm_offset_t sva
 	pdp_entry_t *pdpe;
 	pd_entry_t ptpaddr, *pde;
 	pt_entry_t *pte;
+	vm_paddr_t pa;
 	vm_page_t free = NULL;
+	struct pv_list_head pv_list;
 	int anyvalid;
 
 	/*
@@ -2636,11 +2950,19 @@ pmap_remove(pmap_t pmap, vm_offset_t sva
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
-	anyvalid = 0;
+	pa = anyvalid = 0;
+	TAILQ_INIT(&pv_list);
 
-	vm_page_lock_queues();
-	PMAP_LOCK(pmap);
+	/*
+	 * pre-allocate pvs
+	 *
	 */
+	if ((pmap == kernel_pmap) &&
+	    (sva + PAGE_SIZE != eva))
+		pmap_prealloc_pv_list(pmap, sva, eva, &pv_list);
 
+	PMAP_LOCK(pmap);
+restart:
 	/*
	 * special handling of removing one page.  a very
	 * common operation and easy to short circuit some
@@ -2695,6 +3017,11 @@ pmap_remove(pmap_t pmap, vm_offset_t sva
 	 * Check for large page.
	 */
 	if ((ptpaddr & PG_PS) != 0) {
+		if (pa_tryrelock(pmap, ptpaddr & PG_FRAME, &pa)) {
+			va_next = sva;
+			continue;
+		}
+
 		/*
		 * Are we removing the entire large page?  If not,
		 * demote the mapping and fall through.
@@ -2706,9 +3033,9 @@ pmap_remove(pmap_t pmap, vm_offset_t sva
		 */
 		if ((ptpaddr & PG_G) == 0)
 			anyvalid = 1;
-		pmap_remove_pde(pmap, pde, sva, &free);
+		pmap_remove_pde(pmap, pde, sva, &free, &pv_list);
 		continue;
-	} else if (!pmap_demote_pde(pmap, pde, sva)) {
+	} else if (!pmap_demote_pde(pmap, pde, sva, &pv_list)) {
 		/* The large page mapping was destroyed. */
 		continue;
 	} else
@@ -2725,23 +3052,35 @@ pmap_remove(pmap_t pmap, vm_offset_t sva
 
 	for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 	    sva += PAGE_SIZE) {
+		int ret;
+
 		if (*pte == 0)
 			continue;
+		if ((*pte & PG_MANAGED) &&
+		    pa_tryrelock(pmap, *pte & PG_FRAME, &pa))
+			goto restart;
+
 		/*
		 * The TLB entry for a PG_G mapping is invalidated
		 * by pmap_remove_pte().
		 */
 		if ((*pte & PG_G) == 0)
 			anyvalid = 1;
-		if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free))
+		ret = pmap_remove_pte(pmap, pte, sva, ptpaddr, &free);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***