Date: Sat, 10 Sep 2016 17:14:57 +0000 (UTC)
From: Alan Cox <alc@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Subject: svn commit: r305688 - in user/alc/PQ_LAUNDRY: sys/amd64/amd64 sys/arm/arm sys/arm64/arm64 sys/cddl/compat/opensolaris/sys sys/i386/i386 sys/kern sys/powerpc/booke sys/powerpc/conf sys/riscv/riscv s...
Message-ID: <201609101714.u8AHEvmd031762@repo.freebsd.org>
Author: alc
Date: Sat Sep 10 17:14:57 2016
New Revision: 305688
URL: https://svnweb.freebsd.org/changeset/base/305688

Log:
  MFH r305685

Added:
  user/alc/PQ_LAUNDRY/tests/sys/kern/waitpid_nohang.c
     - copied unchanged from r305685, head/tests/sys/kern/waitpid_nohang.c
Modified:
  user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c
  user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c
  user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c
  user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h
  user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c
  user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c
  user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c
  user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c
  user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c
  user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c
  user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX
  user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c
  user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c
  user/alc/PQ_LAUNDRY/sys/vm/pmap.h
  user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile
Directory Properties:
  user/alc/PQ_LAUNDRY/   (props changed)

Modified: user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -5816,8 +5816,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
 	return (FALSE);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  * pmap_ts_referenced:
  *
@@ -5826,10 +5824,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
- *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits. This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
@@ -5898,7 +5892,7 @@ retry:
 				 */
 				vm_page_dirty(m);
 			}
-			if ((*pde & PG_A) != 0) {
+			if ((oldpde & PG_A) != 0) {
 				/*
 				 * Since this reference bit is shared by 512 4KB
 				 * pages, it should not be cleared every time it is
@@ -5919,7 +5913,7 @@ retry:
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
-			    (*pde & PG_W) == 0) {
+			    (oldpde & PG_W) == 0) {
 				if (safe_to_clear_referenced(pmap, oldpde)) {
 					atomic_clear_long(pde, PG_A);
 					pmap_invalidate_page(pmap, pv->pv_va);
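The comment block above states the rationale for this whole sweep: pmap_ts_referenced() now opportunistically transfers modified bits into the page's dirty field while it counts reference bits. A minimal caller-side sketch of why that pays off (illustrative only -- the function and loop here are hypothetical and not part of this commit; pmap_ts_referenced(), pmap_is_modified(), and vm_page_dirty() are the real interfaces named in the comment):

    /*
     * Hypothetical scan step: because pmap_ts_referenced() may already
     * have called vm_page_dirty(m), the more expensive pmap_is_modified()
     * walk over all of the page's mappings can often be skipped.
     */
    static int
    scan_page(vm_page_t m)
    {
            int act_delta;

            /* Clears at most PMAP_TS_REFERENCED_MAX reference bits. */
            act_delta = pmap_ts_referenced(m);
            if (m->dirty == 0 && pmap_is_modified(m))
                    vm_page_dirty(m);
            return (act_delta);
    }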
Modified: user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -5161,8 +5161,6 @@ pmap_is_referenced(vm_page_t m)
 	return (rv);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  * pmap_ts_referenced:
  *
@@ -5171,10 +5169,6 @@ pmap_is_referenced(vm_page_t m)
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
- *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits. This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls

Modified: user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -3880,8 +3880,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
 	return (FALSE);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  * pmap_ts_referenced:
  *
@@ -3890,9 +3888,13 @@ safe_to_clear_referenced(pmap_t pmap, pt
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
+ *	As an optimization, update the page's dirty field if a modified bit is
+ *	found while counting reference bits. This opportunistic update can be
+ *	performed at low cost and can eliminate the need for some future calls
+ *	to pmap_is_modified(). However, since this function stops after
+ *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
+ *	dirty pages. Those dirty pages will only be detected by a future call
+ *	to pmap_is_modified().
  */
 int
 pmap_ts_referenced(vm_page_t m)
@@ -3947,6 +3949,14 @@ retry:
 			    ("pmap_ts_referenced: found an invalid l1 table"));
 			pte = pmap_l1_to_l2(pde, pv->pv_va);
 			tpte = pmap_load(pte);
+			if (pmap_page_dirty(tpte)) {
+				/*
+				 * Although "tpte" is mapping a 2MB page, because
+				 * this function is called at a 4KB page granularity,
+				 * we only update the 4KB page under test.
+				 */
+				vm_page_dirty(m);
+			}
 			if ((tpte & ATTR_AF) != 0) {
 				/*
 				 * Since this reference bit is shared by 512 4KB
@@ -4043,6 +4053,8 @@ small_mappings:
 			    ("pmap_ts_referenced: found an invalid l2 table"));
 			pte = pmap_l2_to_l3(pde, pv->pv_va);
 			tpte = pmap_load(pte);
+			if (pmap_page_dirty(tpte))
+				vm_page_dirty(m);
 			if ((tpte & ATTR_AF) != 0) {
 				if (safe_to_clear_referenced(pmap, tpte)) {
 					/*

Modified: user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h	Sat Sep 10 17:14:57 2016	(r305688)
@@ -32,6 +32,6 @@
 #include_next <sys/random.h>
 
 #define	random_get_bytes(p, s)		read_random((p), (int)(s))
-#define	random_get_pseudo_bytes(p, s)	read_random((p), (int)(s))
+#define	random_get_pseudo_bytes(p, s)	arc4rand((p), (int)(s), 0)
 
 #endif	/* !_OPENSOLARIS_SYS_RANDOM_H_ */
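Note the semantic shift in the opensolaris compat shim above: random_get_pseudo_bytes() no longer draws from read_random() but maps to arc4rand(), the kernel's cheap, non-blocking pseudorandom generator, while random_get_bytes() keeps its stronger backing. A sketch of what the two macros now expand to for a caller (hedged: written from the macro bodies above; the buffer variables are illustrative):

    void   *buf;    /* caller's buffer */
    size_t  len;    /* caller's length */

    read_random(buf, (int)len);    /* random_get_bytes(): strong bytes */
    arc4rand(buf, (u_int)len, 0);  /* random_get_pseudo_bytes(): fast;
                                      the trailing 0 requests no forced
                                      reseed */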
Modified: user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -4765,8 +4765,6 @@ retry:
 	rw_wunlock(&pvh_global_lock);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  * pmap_ts_referenced:
  *
@@ -4775,10 +4773,6 @@ retry:
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
- *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits. This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls

Modified: user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -723,9 +723,9 @@ sys_wait4(struct thread *td, struct wait
 	else
 		rup = NULL;
 	error = kern_wait(td, uap->pid, &status, uap->options, rup);
-	if (uap->status != NULL && error == 0)
+	if (uap->status != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&status, uap->status, sizeof(status));
-	if (uap->rusage != NULL && error == 0)
+	if (uap->rusage != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&ru, uap->rusage, sizeof(struct rusage));
 	return (error);
 }
@@ -759,9 +759,9 @@ sys_wait6(struct thread *td, struct wait
 	 */
 	error = kern_wait6(td, idtype, id, &status, uap->options, wrup,
 	    sip);
-	if (uap->status != NULL && error == 0)
+	if (uap->status != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&status, uap->status, sizeof(status));
-	if (uap->wrusage != NULL && error == 0)
+	if (uap->wrusage != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&wru, uap->wrusage, sizeof(wru));
 	if (uap->info != NULL && error == 0)
 		error = copyout(&si, uap->info, sizeof(si));
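The new td_retval[0] != 0 guard above implements the POSIX rule that a wait call which reaps nothing (WNOHANG returning 0) must leave the caller's status and rusage buffers untouched. The userland-visible contract is (illustrative fragment only; the authoritative regression test is the waitpid_nohang.c file added at the end of this commit):

    int status = 42;
    pid_t pid = waitpid(child, &status, WNOHANG);
    if (pid == 0) {
            /* Child not done yet: after this change, status still
             * holds 42 instead of being clobbered by the kernel. */
    }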
Modified: user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -623,6 +623,14 @@ static struct witness_order_list_entry o
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
+	 * VFS namecache
+	 */
+	{ "ncglobal", &lock_class_rw },
+	{ "ncbuc", &lock_class_rw },
+	{ "vnode interlock", &lock_class_mtx_sleep },
+	{ "ncneg", &lock_class_mtx_sleep },
+	{ NULL, NULL },
+	/*
 	 * ZFS locking
 	 */
 	{ "dn->dn_mtx", &lock_class_sx },

Modified: user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -1115,7 +1115,7 @@ orecvfrom(struct thread *td, struct recv
 
 #ifdef COMPAT_OLDSOCK
 int
-orecv(struct thread *td, struct orecv_args)
+orecv(struct thread *td, struct orecv_args *uap)
 {
 	struct msghdr msg;
 	struct iovec aiov;

Modified: user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
+#include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
@@ -148,6 +149,23 @@ struct namecache_ts {
  * Upon reaching the last segment of a path, if the reference
  * is for DELETE, or NOCACHE is set (rewrite), and the
  * name is located in the cache, it will be dropped.
+ *
+ * These locks are used (in the order in which they can be taken):
+ * NAME		TYPE	ROLE
+ * cache_lock	rwlock	global, needed for all modifications
+ * bucketlock	rwlock	for access to given hash bucket
+ * ncneg_mtx	mtx	negative entry LRU management
+ *
+ * A name -> vnode lookup can be safely performed by either locking cache_lock
+ * or the relevant hash bucket.
+ *
+ * ".." and vnode -> name lookups require cache_lock.
+ *
+ * Modifications require both cache_lock and relevant bucketlock taken for
+ * writing.
+ *
+ * Negative entry LRU management requires ncneg_mtx taken on top of either
+ * cache_lock or bucketlock.
 */
 
 /*
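The comment added above is the contract that the rest of this file's diff implements. A compressed sketch of the two main paths (illustrative pseudocode assembled from the macros and helpers introduced below; "blp" is a hypothetical local, not a name from the patch):

    /* A name -> vnode lookup: the relevant bucket lock alone suffices. */
    hash = cache_get_hash(name, len, dvp);
    blp = HASH2BUCKETLOCK(hash);
    rw_rlock(blp);
    /* ... search the NCHHASH(hash) chain ... */
    rw_runlock(blp);

    /* A modification: global lock first, then the bucket, both for
     * writing -- matching the "ncglobal" before "ncbuc" witness order
     * added to subr_witness.c above. */
    CACHE_WLOCK();
    rw_wlock(blp);
    /* ... insert or zap the namecache entry ... */
    rw_wunlock(blp);
    CACHE_WUNLOCK();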
@@ -179,8 +197,9 @@ SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor
 struct nchstats	nchstats;		/* cache effectiveness statistics */
 
 static struct rwlock cache_lock;
-RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
+RW_SYSINIT(vfscache, &cache_lock, "ncglobal");
 
+#define	CACHE_TRY_WLOCK()	rw_try_wlock(&cache_lock)
 #define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
 #define	CACHE_RLOCK()		rw_rlock(&cache_lock)
 #define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
@@ -188,7 +207,12 @@ RW_SYSINIT(vfscache, &cache_lock, "Name
 #define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)
 
 static struct mtx_padalign ncneg_mtx;
-MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "Name Cache neg", MTX_DEF);
+MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF);
+
+static u_int numbucketlocks;
+static struct rwlock_padalign *bucketlocks;
+#define	HASH2BUCKETLOCK(hash) \
+	((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)]))
 
 /*
  * UMA zones for the VFS cache.
@@ -307,6 +331,8 @@ STATNODE_COUNTER(numfullpathfail4, "Numb
 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
 static long numupgrades;
 STATNODE_ULONG(numupgrades, "Number of updates of the cache after lookup (write lock + retry)");
+static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
+    "Number of times bucketlocked zap_and_exit case failed to writelock");
 
 static void cache_zap(struct namecache *ncp);
 static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
@@ -326,6 +352,39 @@ cache_get_hash(char *name, u_char len, s
 	return (hash);
 }
 
+#ifdef INVARIANTS
+static void
+cache_assert_bucket_locked(struct namecache *ncp, int mode)
+{
+	struct rwlock *bucketlock;
+	uint32_t hash;
+
+	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+	bucketlock = HASH2BUCKETLOCK(hash);
+	rw_assert(bucketlock, mode);
+}
+#else
+#define cache_assert_bucket_locked(x, y) do { } while (0)
+#endif
+
+static void
+cache_lock_all_buckets(void)
+{
+	u_int i;
+
+	for (i = 0; i < numbucketlocks; i++)
+		rw_wlock(&bucketlocks[i]);
+}
+
+static void
+cache_unlock_all_buckets(void)
+{
+	u_int i;
+
+	for (i = 0; i < numbucketlocks; i++)
+		rw_wunlock(&bucketlocks[i]);
+}
+
 static int
 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 {
@@ -442,21 +501,13 @@ SYSCTL_PROC(_debug_hashstat, OID_AUTO, n
  * Negative entries management
  */
 static void
-cache_negative_hit(struct namecache *ncp, int wlocked)
+cache_negative_hit(struct namecache *ncp)
 {
 
-	if (!wlocked) {
-		rw_assert(&cache_lock, RA_RLOCKED);
-		mtx_lock(&ncneg_mtx);
-	} else {
-		rw_assert(&cache_lock, RA_WLOCKED);
-	}
-
+	mtx_lock(&ncneg_mtx);
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
-
-	if (!wlocked)
-		mtx_unlock(&ncneg_mtx);
+	mtx_unlock(&ncneg_mtx);
 }
 
 static void
@@ -464,9 +515,12 @@ cache_negative_insert(struct namecache *
 {
 
 	rw_assert(&cache_lock, RA_WLOCKED);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	MPASS(ncp->nc_vp == NULL);
+	mtx_lock(&ncneg_mtx);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 	numneg++;
+	mtx_unlock(&ncneg_mtx);
 }
 
 static void
@@ -474,9 +528,12 @@ cache_negative_remove(struct namecache *
 {
 
 	rw_assert(&cache_lock, RA_WLOCKED);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	MPASS(ncp->nc_vp == NULL);
+	mtx_lock(&ncneg_mtx);
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	numneg--;
+	mtx_unlock(&ncneg_mtx);
 }
 
 static struct namecache *
@@ -499,10 +556,11 @@ cache_negative_zap_one(void)
  * pointer to a vnode or if it is just a negative cache entry.
 */
 static void
-cache_zap(struct namecache *ncp)
+cache_zap_locked(struct namecache *ncp)
 {
 
 	rw_assert(&cache_lock, RA_WLOCKED);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
 	if (ncp->nc_vp != NULL) {
 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
@@ -532,6 +590,21 @@ cache_zap(struct namecache *ncp)
 	numcache--;
 }
 
+static void
+cache_zap(struct namecache *ncp)
+{
+	struct rwlock *bucketlock;
+	uint32_t hash;
+
+	rw_assert(&cache_lock, RA_WLOCKED);
+
+	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+	bucketlock = HASH2BUCKETLOCK(hash);
+	rw_wlock(bucketlock);
+	cache_zap_locked(ncp);
+	rw_wunlock(bucketlock);
+}
+
 /*
  * Lookup an entry in the cache
 *
@@ -549,22 +622,42 @@ cache_zap(struct namecache *ncp)
  * not recursively acquired.
 */
 
+enum { UNLOCKED, WLOCKED, RLOCKED };
+
+static void
+cache_unlock(int cache_locked)
+{
+
+	switch (cache_locked) {
+	case UNLOCKED:
+		break;
+	case WLOCKED:
+		CACHE_WUNLOCK();
+		break;
+	case RLOCKED:
+		CACHE_RUNLOCK();
+		break;
+	}
+}
+
 int
 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
+	struct rwlock *bucketlock;
 	struct namecache *ncp;
 	uint32_t hash;
-	int error, ltype, wlocked;
+	int error, ltype, cache_locked;
 
 	if (!doingcache) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
retry:
-	wlocked = 0;
-	counter_u64_add(numcalls, 1);
+	bucketlock = NULL;
+	cache_locked = UNLOCKED;
 	error = 0;
+	counter_u64_add(numcalls, 1);
 
retry_wlocked:
 	if (cnp->cn_nameptr[0] == '.') {
@@ -598,17 +691,21 @@ retry_wlocked:
 			}
 			return (-1);
 		}
-		if (!wlocked)
-			CACHE_RLOCK();
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			counter_u64_add(dotdothits, 1);
+			if (cache_locked == UNLOCKED) {
+				CACHE_RLOCK();
+				cache_locked = RLOCKED;
+			}
+
 			if (dvp->v_cache_dd == NULL) {
 				SDT_PROBE3(vfs, namecache, lookup, miss,
 				    dvp, "..", NULL);
 				goto unlock;
 			}
 			if ((cnp->cn_flags & MAKEENTRY) == 0) {
-				if (!wlocked && !CACHE_UPGRADE_LOCK())
+				if (cache_locked != WLOCKED &&
+				    !CACHE_UPGRADE_LOCK())
 					goto wlock;
 				ncp = NULL;
 				if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) {
@@ -639,10 +736,14 @@ retry_wlocked:
 				    nc_dotdottime;
 			goto success;
 		}
-	} else if (!wlocked)
-		CACHE_RLOCK();
+	}
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+	if (cache_locked == UNLOCKED) {
+		bucketlock = HASH2BUCKETLOCK(hash);
+		rw_rlock(bucketlock);
+	}
+
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		counter_u64_add(numchecks, 1);
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
@@ -665,12 +766,7 @@ retry_wlocked:
 	/* We don't want to have an entry, so dump it */
 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
 		counter_u64_add(numposzaps, 1);
-		if (!wlocked && !CACHE_UPGRADE_LOCK())
-			goto wlock;
-		cache_zap(ncp);
-		CACHE_WUNLOCK();
-		cache_free(ncp);
-		return (0);
+		goto zap_and_exit;
 	}
 
 	/* We found a "positive" match, return the vnode */
@@ -689,25 +785,20 @@ negative_success:
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
 		counter_u64_add(numnegzaps, 1);
-		if (!wlocked && !CACHE_UPGRADE_LOCK())
-			goto wlock;
-		cache_zap(ncp);
-		CACHE_WUNLOCK();
-		cache_free(ncp);
-		return (0);
+		goto zap_and_exit;
 	}
 
 	counter_u64_add(numneghits, 1);
-	cache_negative_hit(ncp, wlocked);
+	cache_negative_hit(ncp);
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
 	    nc_get_name(ncp));
 	cache_out_ts(ncp, tsp, ticksp);
-	if (wlocked)
-		CACHE_WUNLOCK();
-	else
-		CACHE_RUNLOCK();
+	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+	if (bucketlock != NULL)
+		rw_runlock(bucketlock);
+	cache_unlock(cache_locked);
 	return (ENOENT);
 
wlock:
@@ -716,9 +807,10 @@ wlock:
 	 * a write lock and retry the operation.
 	 */
 	CACHE_RUNLOCK();
+wlock_unlocked:
 	CACHE_WLOCK();
 	numupgrades++;
-	wlocked = 1;
+	cache_locked = WLOCKED;
 	goto retry_wlocked;
 
success:
@@ -733,10 +825,10 @@ success:
 		VOP_UNLOCK(dvp, 0);
 	}
 	vhold(*vpp);
-	if (wlocked)
-		CACHE_WUNLOCK();
-	else
-		CACHE_RUNLOCK();
+	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+	if (bucketlock != NULL)
+		rw_runlock(bucketlock);
+	cache_unlock(cache_locked);
 	error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
 	if (cnp->cn_flags & ISDOTDOT) {
 		vn_lock(dvp, ltype | LK_RETRY);
@@ -758,10 +850,29 @@ success:
 	return (-1);
 
unlock:
-	if (wlocked)
-		CACHE_WUNLOCK();
-	else
-		CACHE_RUNLOCK();
+	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+	if (bucketlock != NULL)
+		rw_runlock(bucketlock);
+	cache_unlock(cache_locked);
+	return (0);
+
+zap_and_exit:
+	if (bucketlock != NULL) {
+		rw_assert(&cache_lock, RA_UNLOCKED);
+		if (!CACHE_TRY_WLOCK()) {
+			rw_runlock(bucketlock);
+			bucketlock = NULL;
+			zap_and_exit_bucket_fail++;
+			goto wlock_unlocked;
+		}
+		cache_locked = WLOCKED;
+		rw_runlock(bucketlock);
+		bucketlock = NULL;
+	} else if (cache_locked != WLOCKED && !CACHE_UPGRADE_LOCK())
+		goto wlock;
+	cache_zap(ncp);
+	CACHE_WUNLOCK();
+	cache_free(ncp);
 	return (0);
 }
@@ -772,6 +883,7 @@ void
 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
     struct timespec *tsp, struct timespec *dtsp)
 {
+	struct rwlock *bucketlock;
 	struct namecache *ncp, *n2, *ndd, *nneg;
 	struct namecache_ts *n3;
 	struct nchashhead *ncpp;
@@ -924,11 +1036,6 @@ cache_enter_time(struct vnode *dvp, stru
 		}
 	}
 
-	/*
-	 * Insert the new namecache entry into the appropriate chain
-	 * within the cache entries table.
-	 */
-	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 	if (flag != NCF_ISDOTDOT) {
 		if (LIST_EMPTY(&dvp->v_cache_src)) {
 			vhold(dvp);
@@ -937,6 +1044,15 @@ cache_enter_time(struct vnode *dvp, stru
 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	}
 
+	bucketlock = HASH2BUCKETLOCK(hash);
+	rw_wlock(bucketlock);
+
+	/*
+	 * Insert the new namecache entry into the appropriate chain
+	 * within the cache entries table.
+	 */
+	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
 	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
@@ -953,6 +1069,7 @@ cache_enter_time(struct vnode *dvp, stru
 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 		    nc_get_name(ncp));
 	}
+	rw_wunlock(bucketlock);
 	if (numneg * ncnegfactor > numcache)
 		nneg = cache_negative_zap_one();
 	CACHE_WUNLOCK();
@@ -960,12 +1077,24 @@ cache_enter_time(struct vnode *dvp, stru
 	if (nneg != NULL)
 		cache_free(nneg);
 }
 
+static u_int
+cache_roundup_2(u_int val)
+{
+	u_int res;
+
+	for (res = 1; res <= val; res <<= 1)
+		continue;
+
+	return (res);
+}
+
 /*
  * Name cache initialization, from vfs_init() when we are booting
 */
 static void
 nchinit(void *dummy __unused)
 {
+	u_int i;
 
 	TAILQ_INIT(&ncneg);
@@ -983,6 +1112,13 @@ nchinit(void *dummy __unused)
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+	numbucketlocks = cache_roundup_2(mp_ncpus * 16);
+	if (numbucketlocks > nchash)
+		numbucketlocks = nchash;
+	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < numbucketlocks; i++)
+		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK);
 
 	numcalls = counter_u64_alloc(M_WAITOK);
 	dothits = counter_u64_alloc(M_WAITOK);
@@ -1023,6 +1159,7 @@ cache_changesize(int newmaxvnodes)
 	 * because to do so, they have to be removed from the hash table.
 	 */
 	CACHE_WLOCK();
+	cache_lock_all_buckets();
 	old_nchashtbl = nchashtbl;
 	old_nchash = nchash;
 	nchashtbl = new_nchashtbl;
@@ -1035,6 +1172,7 @@ cache_changesize(int newmaxvnodes)
 			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 		}
 	}
+	cache_unlock_all_buckets();
 	CACHE_WUNLOCK();
 	free(old_nchashtbl, M_VFSCACHE);
 }
@@ -1108,20 +1246,30 @@ void
 cache_purgevfs(struct mount *mp)
 {
 	TAILQ_HEAD(, namecache) ncps;
-	struct nchashhead *ncpp;
+	struct rwlock *bucketlock;
+	struct nchashhead *bucket;
 	struct namecache *ncp, *nnp;
+	u_long i, j, n_nchash;
 
 	/* Scan hash tables for applicable entries */
 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
 	TAILQ_INIT(&ncps);
 	CACHE_WLOCK();
-	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
-		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
-			if (ncp->nc_dvp->v_mount != mp)
-				continue;
-			cache_zap(ncp);
-			TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+	n_nchash = nchash + 1;
+	for (i = 0; i < numbucketlocks; i++) {
+		bucketlock = (struct rwlock *)&bucketlocks[i];
+		rw_wlock(bucketlock);
+		for (j = i; j < n_nchash; j += numbucketlocks) {
+			bucket = &nchashtbl[j];
+			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
+				cache_assert_bucket_locked(ncp, RA_WLOCKED);
+				if (ncp->nc_dvp->v_mount != mp)
+					continue;
+				cache_zap_locked(ncp);
+				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
+			}
 		}
+		rw_wunlock(bucketlock);
 	}
 	CACHE_WUNLOCK();
 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
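On the sizing in nchinit() above: cache_roundup_2() returns the smallest power of two strictly greater than its argument, because the loop keeps doubling while res <= val. Worked values (illustrative):

    cache_roundup_2(1);     /* -> 2 */
    cache_roundup_2(127);   /* -> 128 */
    cache_roundup_2(128);   /* -> 256, note the "<=" in the loop */
    /* e.g. mp_ncpus == 8: 8 * 16 == 128 -> 256 bucket locks,
     * capped at nchash so no lock ever goes unused. */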
Modified: user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -2499,9 +2499,13 @@ mmu_booke_clear_modify(mmu_t mmu, vm_pag
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
+ *	As an optimization, update the page's dirty field if a modified bit is
+ *	found while counting reference bits. This opportunistic update can be
+ *	performed at low cost and can eliminate the need for some future calls
+ *	to pmap_is_modified(). However, since this function stops after
+ *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
+ *	dirty pages. Those dirty pages will only be detected by a future call
+ *	to pmap_is_modified().
 */
 static int
 mmu_booke_ts_referenced(mmu_t mmu, vm_page_t m)
@@ -2518,6 +2522,8 @@ mmu_booke_ts_referenced(mmu_t mmu, vm_pa
 		PMAP_LOCK(pv->pv_pmap);
 		if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL &&
 		    PTE_ISVALID(pte)) {
+			if (PTE_ISMODIFIED(pte))
+				vm_page_dirty(m);
 			if (PTE_ISREFERENCED(pte)) {
 				mtx_lock_spin(&tlbivax_mutex);
 				tlb_miss_lock();
@@ -2528,7 +2534,7 @@ mmu_booke_ts_referenced(mmu_t mmu, vm_pa
 			tlb_miss_unlock();
 			mtx_unlock_spin(&tlbivax_mutex);
 
-			if (++count > 4) {
+			if (++count >= PMAP_TS_REFERENCED_MAX) {
 				PMAP_UNLOCK(pv->pv_pmap);
 				break;
 			}

Modified: user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX	Sat Sep 10 17:14:57 2016	(r305688)
@@ -89,6 +89,7 @@ device		tun
 device		uart
 options 	USB_DEBUG	# enable debug msgs
 #device		uhci
+device		ehci
 device		umass
 device		usb
 device		vlan
Modified: user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -2991,8 +2991,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
 	return (FALSE);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  * pmap_ts_referenced:
 *
@@ -3001,9 +2999,13 @@ safe_to_clear_referenced(pmap_t pmap, pt
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
+ *	As an optimization, update the page's dirty field if a modified bit is
+ *	found while counting reference bits. This opportunistic update can be
+ *	performed at low cost and can eliminate the need for some future calls
+ *	to pmap_is_modified(). However, since this function stops after
+ *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
+ *	dirty pages. Those dirty pages will only be detected by a future call
+ *	to pmap_is_modified().
 */
 int
 pmap_ts_referenced(vm_page_t m)
@@ -3012,7 +3014,7 @@ pmap_ts_referenced(vm_page_t m)
 	pmap_t pmap;
 	struct rwlock *lock;
 	pd_entry_t *l2;
-	pt_entry_t *l3;
+	pt_entry_t *l3, old_l3;
 	vm_paddr_t pa;
 	int cleared, md_gen, not_cleared;
 	struct spglist free;
@@ -3050,15 +3052,18 @@ retry:
 		KASSERT(l2 != NULL,
 		    ("pmap_ts_referenced: found an invalid l2 table"));
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
-		if ((pmap_load(l3) & PTE_A) != 0) {
-			if (safe_to_clear_referenced(pmap, pmap_load(l3))) {
+		old_l3 = pmap_load(l3);
+		if (pmap_page_dirty(old_l3))
+			vm_page_dirty(m);
+		if ((old_l3 & PTE_A) != 0) {
+			if (safe_to_clear_referenced(pmap, old_l3)) {
 				/*
 				 * TODO: We don't handle the access flag
 				 * at all. We need to be able to set it in
 				 * the exception handler.
 				 */
 				panic("RISCVTODO: safe_to_clear_referenced\n");
-			} else if ((pmap_load(l3) & PTE_SW_WIRED) == 0) {
+			} else if ((old_l3 & PTE_SW_WIRED) == 0) {
 				/*
 				 * Wired pages cannot be paged out so
 				 * doing accessed bit emulation for

Modified: user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -2073,18 +2073,12 @@ pmap_page_is_mapped(vm_page_t m)
 	return (rv);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  * Return a count of reference bits for a page, clearing those bits.
  * It is not necessary for every reference bit to be cleared, but it
  * is necessary that 0 only be returned when there are truly no
  * reference bits set.
  *
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
- *
 * As an optimization, update the page's dirty field if a modified bit is
 * found while counting reference bits. This opportunistic update can be
 * performed at low cost and can eliminate the need for some future calls

Modified: user/alc/PQ_LAUNDRY/sys/vm/pmap.h
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/vm/pmap.h	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/vm/pmap.h	Sat Sep 10 17:14:57 2016	(r305688)
@@ -104,6 +104,16 @@ extern vm_offset_t kernel_vm_end;
 #define	PMAP_ENTER_NOSLEEP	0x0100
 #define	PMAP_ENTER_WIRED	0x0200
 
+/*
+ * Define the maximum number of machine-dependent reference bits that are
+ * cleared by a call to pmap_ts_referenced().  This limit serves two purposes.
+ * First, it bounds the cost of reference bit maintenance on widely shared
+ * pages.  Second, it prevents numeric overflow during maintenance of a
+ * widely shared page's "act_count" field.  An overflow could result in the
+ * premature deactivation of the page.
+ */
+#define	PMAP_TS_REFERENCED_MAX	5
+
 void		 pmap_activate(struct thread *td);
 void		 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 		    int advice);
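The overflow scenario that the new pmap.h comment guards against can be made concrete. The return value feeds a page's act_count, a small clamped counter, in the page-daemon scan; a sketch in the spirit of vm_pageout (illustrative, not part of this commit; ACT_ADVANCE and ACT_MAX are the stock aging constants):

    act_delta = pmap_ts_referenced(m);  /* now at most PMAP_TS_REFERENCED_MAX */
    m->act_count += act_delta + ACT_ADVANCE;
    if (m->act_count > ACT_MAX)         /* clamp */
            m->act_count = ACT_MAX;
    /*
     * act_count is a narrow field: without the bound, a page mapped by
     * thousands of pmaps could return a count large enough to wrap the
     * field before the clamp applies, deactivating the page prematurely.
     */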
Modified: user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile
==============================================================================
--- user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile	Sat Sep 10 17:14:57 2016	(r305688)
@@ -12,6 +12,7 @@ PLAIN_TESTS_C+=	subr_unit_test
 ATF_TESTS_C+=	unix_seqpacket_test
 ATF_TESTS_C+=	unix_passfd_test
 TEST_METADATA.unix_seqpacket_test+=	timeout="15"
+ATF_TESTS_C+=	waitpid_nohang
 
 LIBADD.ptrace_test+=	pthread
 LIBADD.unix_seqpacket_test+=	pthread

Copied: user/alc/PQ_LAUNDRY/tests/sys/kern/waitpid_nohang.c (from r305685, head/tests/sys/kern/waitpid_nohang.c)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/alc/PQ_LAUNDRY/tests/sys/kern/waitpid_nohang.c	Sat Sep 10 17:14:57 2016	(r305688, copy of r305685, head/tests/sys/kern/waitpid_nohang.c)
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2016 Jilles Tjoelker
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/wait.h>
+
+#include <atf-c.h>
+#include <signal.h>
+#include <unistd.h>
+
+ATF_TC_WITHOUT_HEAD(waitpid_nohang);
+ATF_TC_BODY(waitpid_nohang, tc)
+{
+	pid_t child, pid;
+	int status, r;
+
+	child = fork();
+	ATF_REQUIRE(child != -1);
+	if (child == 0) {
+		sleep(10);
+		_exit(1);
+	}
+
+	status = 42;
+	pid = waitpid(child, &status, WNOHANG);
+	ATF_REQUIRE(pid == 0);
+	ATF_CHECK(status == 42);
+
+	r = kill(child, SIGTERM);
+	ATF_REQUIRE(r == 0);
+	r = waitid(P_PID, child, NULL, WEXITED | WNOWAIT);
+	ATF_REQUIRE(r == 0);
+
+	status = -1;
+	pid = waitpid(child, &status, WNOHANG);
+	ATF_REQUIRE(pid == child);
+	ATF_CHECK(WIFSIGNALED(status) && WTERMSIG(status) == SIGTERM);
+}
+
+ATF_TP_ADD_TCS(tp)
+{
+
+	ATF_TP_ADD_TC(tp, waitpid_nohang);
+	return (atf_no_error());
+}
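For anyone following along in the branch: once the test suite is built and installed, the new test can be exercised on its own with the stock kyua tooling (the path assumes the default test install location):

    # cd /usr/tests/sys/kern
    # kyua test waitpid_nohang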