Date:      Sat, 10 Sep 2016 17:14:57 +0000 (UTC)
From:      Alan Cox <alc@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r305688 - in user/alc/PQ_LAUNDRY: sys/amd64/amd64 sys/arm/arm sys/arm64/arm64 sys/cddl/compat/opensolaris/sys sys/i386/i386 sys/kern sys/powerpc/booke sys/powerpc/conf sys/riscv/riscv s...
Message-ID:  <201609101714.u8AHEvmd031762@repo.freebsd.org>

Author: alc
Date: Sat Sep 10 17:14:57 2016
New Revision: 305688
URL: https://svnweb.freebsd.org/changeset/base/305688

Log:
  MFH r305685

Added:
  user/alc/PQ_LAUNDRY/tests/sys/kern/waitpid_nohang.c
     - copied unchanged from r305685, head/tests/sys/kern/waitpid_nohang.c
Modified:
  user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c
  user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c
  user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c
  user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h
  user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c
  user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c
  user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c
  user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c
  user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c
  user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c
  user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX
  user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c
  user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c
  user/alc/PQ_LAUNDRY/sys/vm/pmap.h
  user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile
Directory Properties:
  user/alc/PQ_LAUNDRY/   (props changed)

Modified: user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -5816,8 +5816,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
 		return (FALSE);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  *	pmap_ts_referenced:
  *
@@ -5826,10 +5824,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
- *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
@@ -5898,7 +5892,7 @@ retry:
 			 */
 			vm_page_dirty(m);
 		}
-		if ((*pde & PG_A) != 0) {
+		if ((oldpde & PG_A) != 0) {
 			/*
 			 * Since this reference bit is shared by 512 4KB
 			 * pages, it should not be cleared every time it is
@@ -5919,7 +5913,7 @@ retry:
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
-			    (*pde & PG_W) == 0) {
+			    (oldpde & PG_W) == 0) {
 				if (safe_to_clear_referenced(pmap, oldpde)) {
 					atomic_clear_long(pde, PG_A);
 					pmap_invalidate_page(pmap, pv->pv_va);

Modified: user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -5161,8 +5161,6 @@ pmap_is_referenced(vm_page_t m)
 	return (rv);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  *	pmap_ts_referenced:
  *
@@ -5171,10 +5169,6 @@ pmap_is_referenced(vm_page_t m)
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
- *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls

Modified: user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -3880,8 +3880,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
 	return (FALSE);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  *	pmap_ts_referenced:
  *
@@ -3890,9 +3888,13 @@ safe_to_clear_referenced(pmap_t pmap, pt
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
+ *	As an optimization, update the page's dirty field if a modified bit is
+ *	found while counting reference bits.  This opportunistic update can be
+ *	performed at low cost and can eliminate the need for some future calls
+ *	to pmap_is_modified().  However, since this function stops after
+ *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
+ *	dirty pages.  Those dirty pages will only be detected by a future call
+ *	to pmap_is_modified().
  */
 int
 pmap_ts_referenced(vm_page_t m)
@@ -3947,6 +3949,14 @@ retry:
 		    ("pmap_ts_referenced: found an invalid l1 table"));
 		pte = pmap_l1_to_l2(pde, pv->pv_va);
 		tpte = pmap_load(pte);
+		if (pmap_page_dirty(tpte)) {
+			/*
+			 * Although "tpte" is mapping a 2MB page, because
+			 * this function is called at a 4KB page granularity,
+			 * we only update the 4KB page under test.
+			 */
+			vm_page_dirty(m);
+		}
 		if ((tpte & ATTR_AF) != 0) {
 			/*
 			 * Since this reference bit is shared by 512 4KB
@@ -4043,6 +4053,8 @@ small_mappings:
 		    ("pmap_ts_referenced: found an invalid l2 table"));
 		pte = pmap_l2_to_l3(pde, pv->pv_va);
 		tpte = pmap_load(pte);
+		if (pmap_page_dirty(tpte))
+			vm_page_dirty(m);
 		if ((tpte & ATTR_AF) != 0) {
 			if (safe_to_clear_referenced(pmap, tpte)) {
 				/*

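A note on the pmap_ts_referenced() hunks in this commit: every architecture now follows the same per-mapping pattern, opportunistically updating the page's dirty field when a modified bit is observed and stopping once PMAP_TS_REFERENCED_MAX reference bits have been counted. A simplified, machine-independent sketch of that pattern (the pte_is_dirty/pte_is_referenced/pte_clear_referenced helpers are placeholders, not real pmap interfaces):

	static int
	example_ts_referenced(vm_page_t m, pt_entry_t **ptes, int npte)
	{
		int cleared, i;

		cleared = 0;
		for (i = 0; i < npte; i++) {
			/* Opportunistic dirty-field update (placeholder test). */
			if (pte_is_dirty(ptes[i]))
				vm_page_dirty(m);
			if (!pte_is_referenced(ptes[i]))
				continue;
			pte_clear_referenced(ptes[i]);
			/* Bound the work done on widely shared pages. */
			if (++cleared >= PMAP_TS_REFERENCED_MAX)
				break;
		}
		return (cleared);
	}
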
Modified: user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h	Sat Sep 10 17:14:57 2016	(r305688)
@@ -32,6 +32,6 @@
 #include_next <sys/random.h>
 
 #define	random_get_bytes(p, s)		read_random((p), (int)(s))
-#define	random_get_pseudo_bytes(p, s)	read_random((p), (int)(s))
+#define	random_get_pseudo_bytes(p, s)	arc4rand((p), (int)(s), 0)
 
 #endif	/* !_OPENSOLARIS_SYS_RANDOM_H_ */

Modified: user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -4765,8 +4765,6 @@ retry:
 	rw_wunlock(&pvh_global_lock);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  *	pmap_ts_referenced:
  *
@@ -4775,10 +4773,6 @@ retry:
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
- *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls

Modified: user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -723,9 +723,9 @@ sys_wait4(struct thread *td, struct wait
 	else
 		rup = NULL;
 	error = kern_wait(td, uap->pid, &status, uap->options, rup);
-	if (uap->status != NULL && error == 0)
+	if (uap->status != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&status, uap->status, sizeof(status));
-	if (uap->rusage != NULL && error == 0)
+	if (uap->rusage != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&ru, uap->rusage, sizeof(struct rusage));
 	return (error);
 }
@@ -759,9 +759,9 @@ sys_wait6(struct thread *td, struct wait
 	 */
 	error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip);
 
-	if (uap->status != NULL && error == 0)
+	if (uap->status != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&status, uap->status, sizeof(status));
-	if (uap->wrusage != NULL && error == 0)
+	if (uap->wrusage != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&wru, uap->wrusage, sizeof(wru));
 	if (uap->info != NULL && error == 0)
 		error = copyout(&si, uap->info, sizeof(si));

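The kern_exit.c hunks make wait4() and wait6() skip the status and rusage copyouts when kern_wait() returned without reaping a process (td->td_retval[0] == 0, as happens for WNOHANG with no exited child), so the caller's buffers are left untouched in that case. The waitpid_nohang test added at the end of this commit exercises exactly this; the essential userland pattern is:

	int status = 42;
	pid_t pid = waitpid(child, &status, WNOHANG);
	if (pid == 0) {
		/* No child has changed state: status was not written
		   and still holds 42. */
	}
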
Modified: user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -623,6 +623,14 @@ static struct witness_order_list_entry o
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
+	 * VFS namecache
+	 */
+	{ "ncglobal", &lock_class_rw },
+	{ "ncbuc", &lock_class_rw },
+	{ "vnode interlock", &lock_class_mtx_sleep },
+	{ "ncneg", &lock_class_mtx_sleep },
+	{ NULL, NULL },
+	/*
 	 * ZFS locking
 	 */
 	{ "dn->dn_mtx", &lock_class_sx },

Modified: user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -1115,7 +1115,7 @@ orecvfrom(struct thread *td, struct recv
 
 #ifdef COMPAT_OLDSOCK
 int
-orecv(struct thread *td, struct orecv_args)
+orecv(struct thread *td, struct orecv_args *uap)
 {
 	struct msghdr msg;
 	struct iovec aiov;

Modified: user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
+#include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
@@ -148,6 +149,23 @@ struct	namecache_ts {
  * Upon reaching the last segment of a path, if the reference
  * is for DELETE, or NOCACHE is set (rewrite), and the
  * name is located in the cache, it will be dropped.
+ *
+ * These locks are used (in the order in which they can be taken):
+ * NAME         TYPE    ROLE
+ * cache_lock   rwlock  global, needed for all modifications
+ * bucketlock   rwlock  for access to given hash bucket
+ * ncneg_mtx    mtx     negative entry LRU management
+ *
+ * A name -> vnode lookup can be safely performed by either locking cache_lock
+ * or the relevant hash bucket.
+ *
+ * ".." and vnode -> name lookups require cache_lock.
+ *
+ * Modifications require both cache_lock and relevant bucketlock taken for
+ * writing.
+ *
+ * Negative entry LRU management requires ncneg_mtx taken on top of either
+ * cache_lock or bucketlock.
  */
 
 /*
@@ -179,8 +197,9 @@ SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor
 struct nchstats	nchstats;		/* cache effectiveness statistics */
 
 static struct rwlock cache_lock;
-RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
+RW_SYSINIT(vfscache, &cache_lock, "ncglobal");
 
+#define	CACHE_TRY_WLOCK()	rw_try_wlock(&cache_lock)
 #define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
 #define	CACHE_RLOCK()		rw_rlock(&cache_lock)
 #define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
@@ -188,7 +207,12 @@ RW_SYSINIT(vfscache, &cache_lock, "Name 
 #define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)
 
 static struct mtx_padalign ncneg_mtx;
-MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "Name Cache neg", MTX_DEF);
+MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF);
+
+static u_int   numbucketlocks;
+static struct rwlock_padalign  *bucketlocks;
+#define	HASH2BUCKETLOCK(hash) \
+	((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)]))
 
 /*
  * UMA zones for the VFS cache.
@@ -307,6 +331,8 @@ STATNODE_COUNTER(numfullpathfail4, "Numb
 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
 static long numupgrades; STATNODE_ULONG(numupgrades,
     "Number of updates of the cache after lookup (write lock + retry)");
+static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
+    "Number of times bucketlocked zap_and_exit case failed to writelock");
 
 static void cache_zap(struct namecache *ncp);
 static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
@@ -326,6 +352,39 @@ cache_get_hash(char *name, u_char len, s
 	return (hash);
 }
 
+#ifdef INVARIANTS
+static void
+cache_assert_bucket_locked(struct namecache *ncp, int mode)
+{
+	struct rwlock *bucketlock;
+	uint32_t hash;
+
+	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+	bucketlock = HASH2BUCKETLOCK(hash);
+	rw_assert(bucketlock, mode);
+}
+#else
+#define cache_assert_bucket_locked(x, y) do { } while (0)
+#endif
+
+static void
+cache_lock_all_buckets(void)
+{
+	u_int i;
+
+	for (i = 0; i < numbucketlocks; i++)
+		rw_wlock(&bucketlocks[i]);
+}
+
+static void
+cache_unlock_all_buckets(void)
+{
+	u_int i;
+
+	for (i = 0; i < numbucketlocks; i++)
+		rw_wunlock(&bucketlocks[i]);
+}
+
 static int
 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 {
@@ -442,21 +501,13 @@ SYSCTL_PROC(_debug_hashstat, OID_AUTO, n
  * Negative entries management
  */
 static void
-cache_negative_hit(struct namecache *ncp, int wlocked)
+cache_negative_hit(struct namecache *ncp)
 {
 
-	if (!wlocked) {
-		rw_assert(&cache_lock, RA_RLOCKED);
-		mtx_lock(&ncneg_mtx);
-	} else {
-		rw_assert(&cache_lock, RA_WLOCKED);
-	}
-
+	mtx_lock(&ncneg_mtx);
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
-
-	if (!wlocked)
-		mtx_unlock(&ncneg_mtx);
+	mtx_unlock(&ncneg_mtx);
 }
 
 static void
@@ -464,9 +515,12 @@ cache_negative_insert(struct namecache *
 {
 
 	rw_assert(&cache_lock, RA_WLOCKED);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	MPASS(ncp->nc_vp == NULL);
+	mtx_lock(&ncneg_mtx);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 	numneg++;
+	mtx_unlock(&ncneg_mtx);
 }
 
 static void
@@ -474,9 +528,12 @@ cache_negative_remove(struct namecache *
 {
 
 	rw_assert(&cache_lock, RA_WLOCKED);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	MPASS(ncp->nc_vp == NULL);
+	mtx_lock(&ncneg_mtx);
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	numneg--;
+	mtx_unlock(&ncneg_mtx);
 }
 
 static struct namecache *
@@ -499,10 +556,11 @@ cache_negative_zap_one(void)
  *   pointer to a vnode or if it is just a negative cache entry.
  */
 static void
-cache_zap(struct namecache *ncp)
+cache_zap_locked(struct namecache *ncp)
 {
 
 	rw_assert(&cache_lock, RA_WLOCKED);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
 	if (ncp->nc_vp != NULL) {
 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
@@ -532,6 +590,21 @@ cache_zap(struct namecache *ncp)
 	numcache--;
 }
 
+static void
+cache_zap(struct namecache *ncp)
+{
+	struct rwlock *bucketlock;
+	uint32_t hash;
+
+	rw_assert(&cache_lock, RA_WLOCKED);
+
+	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+	bucketlock = HASH2BUCKETLOCK(hash);
+	rw_wlock(bucketlock);
+	cache_zap_locked(ncp);
+	rw_wunlock(bucketlock);
+}
+
 /*
  * Lookup an entry in the cache
  *
@@ -549,22 +622,42 @@ cache_zap(struct namecache *ncp)
  * not recursively acquired.
  */
 
+enum { UNLOCKED, WLOCKED, RLOCKED };
+
+static void
+cache_unlock(int cache_locked)
+{
+
+	switch (cache_locked) {
+	case UNLOCKED:
+		break;
+	case WLOCKED:
+		CACHE_WUNLOCK();
+		break;
+	case RLOCKED:
+		CACHE_RUNLOCK();
+		break;
+	}
+}
+
 int
 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
+	struct rwlock *bucketlock;
 	struct namecache *ncp;
 	uint32_t hash;
-	int error, ltype, wlocked;
+	int error, ltype, cache_locked;
 
 	if (!doingcache) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
 retry:
-	wlocked = 0;
-	counter_u64_add(numcalls, 1);
+	bucketlock = NULL;
+	cache_locked = UNLOCKED;
 	error = 0;
+	counter_u64_add(numcalls, 1);
 
 retry_wlocked:
 	if (cnp->cn_nameptr[0] == '.') {
@@ -598,17 +691,21 @@ retry_wlocked:
 			}
 			return (-1);
 		}
-		if (!wlocked)
-			CACHE_RLOCK();
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			counter_u64_add(dotdothits, 1);
+			if (cache_locked == UNLOCKED) {
+				CACHE_RLOCK();
+				cache_locked = RLOCKED;
+			}
+
 			if (dvp->v_cache_dd == NULL) {
 				SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 				    "..", NULL);
 				goto unlock;
 			}
 			if ((cnp->cn_flags & MAKEENTRY) == 0) {
-				if (!wlocked && !CACHE_UPGRADE_LOCK())
+				if (cache_locked != WLOCKED &&
+				    !CACHE_UPGRADE_LOCK())
 					goto wlock;
 				ncp = NULL;
 				if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) {
@@ -639,10 +736,14 @@ retry_wlocked:
 				    nc_dotdottime;
 			goto success;
 		}
-	} else if (!wlocked)
-		CACHE_RLOCK();
+	}
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+	if (cache_locked == UNLOCKED) {
+		bucketlock = HASH2BUCKETLOCK(hash);
+		rw_rlock(bucketlock);
+	}
+
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		counter_u64_add(numchecks, 1);
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
@@ -665,12 +766,7 @@ retry_wlocked:
 	/* We don't want to have an entry, so dump it */
 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
 		counter_u64_add(numposzaps, 1);
-		if (!wlocked && !CACHE_UPGRADE_LOCK())
-			goto wlock;
-		cache_zap(ncp);
-		CACHE_WUNLOCK();
-		cache_free(ncp);
-		return (0);
+		goto zap_and_exit;
 	}
 
 	/* We found a "positive" match, return the vnode */
@@ -689,25 +785,20 @@ negative_success:
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
 		counter_u64_add(numnegzaps, 1);
-		if (!wlocked && !CACHE_UPGRADE_LOCK())
-			goto wlock;
-		cache_zap(ncp);
-		CACHE_WUNLOCK();
-		cache_free(ncp);
-		return (0);
+		goto zap_and_exit;
 	}
 
 	counter_u64_add(numneghits, 1);
-	cache_negative_hit(ncp, wlocked);
+	cache_negative_hit(ncp);
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
 	    nc_get_name(ncp));
 	cache_out_ts(ncp, tsp, ticksp);
-	if (wlocked)
-		CACHE_WUNLOCK();
-	else
-		CACHE_RUNLOCK();
+	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+	if (bucketlock != NULL)
+		rw_runlock(bucketlock);
+	cache_unlock(cache_locked);
 	return (ENOENT);
 
 wlock:
@@ -716,9 +807,10 @@ wlock:
 	 * a write lock and retry the operation.
 	 */
 	CACHE_RUNLOCK();
+wlock_unlocked:
 	CACHE_WLOCK();
 	numupgrades++;
-	wlocked = 1;
+	cache_locked = WLOCKED;
 	goto retry_wlocked;
 
 success:
@@ -733,10 +825,10 @@ success:
 		VOP_UNLOCK(dvp, 0);
 	}
 	vhold(*vpp);
-	if (wlocked)
-		CACHE_WUNLOCK();
-	else
-		CACHE_RUNLOCK();
+	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+	if (bucketlock != NULL)
+		rw_runlock(bucketlock);
+	cache_unlock(cache_locked);
 	error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
 	if (cnp->cn_flags & ISDOTDOT) {
 		vn_lock(dvp, ltype | LK_RETRY);
@@ -758,10 +850,29 @@ success:
 	return (-1);
 
 unlock:
-	if (wlocked)
-		CACHE_WUNLOCK();
-	else
-		CACHE_RUNLOCK();
+	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+	if (bucketlock != NULL)
+		rw_runlock(bucketlock);
+	cache_unlock(cache_locked);
+	return (0);
+
+zap_and_exit:
+	if (bucketlock != NULL) {
+		rw_assert(&cache_lock, RA_UNLOCKED);
+		if (!CACHE_TRY_WLOCK()) {
+			rw_runlock(bucketlock);
+			bucketlock = NULL;
+			zap_and_exit_bucket_fail++;
+			goto wlock_unlocked;
+		}
+		cache_locked = WLOCKED;
+		rw_runlock(bucketlock);
+		bucketlock = NULL;
+	} else if (cache_locked != WLOCKED && !CACHE_UPGRADE_LOCK())
+		goto wlock;
+	cache_zap(ncp);
+	CACHE_WUNLOCK();
+	cache_free(ncp);
 	return (0);
 }
 
@@ -772,6 +883,7 @@ void
 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
     struct timespec *tsp, struct timespec *dtsp)
 {
+	struct rwlock *bucketlock;
 	struct namecache *ncp, *n2, *ndd, *nneg;
 	struct namecache_ts *n3;
 	struct nchashhead *ncpp;
@@ -924,11 +1036,6 @@ cache_enter_time(struct vnode *dvp, stru
 		}
 	}
 
-	/*
-	 * Insert the new namecache entry into the appropriate chain
-	 * within the cache entries table.
-	 */
-	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 	if (flag != NCF_ISDOTDOT) {
 		if (LIST_EMPTY(&dvp->v_cache_src)) {
 			vhold(dvp);
@@ -937,6 +1044,15 @@ cache_enter_time(struct vnode *dvp, stru
 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	}
 
+	bucketlock = HASH2BUCKETLOCK(hash);
+	rw_wlock(bucketlock);
+
+	/*
+	 * Insert the new namecache entry into the appropriate chain
+	 * within the cache entries table.
+	 */
+	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
 	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
@@ -953,6 +1069,7 @@ cache_enter_time(struct vnode *dvp, stru
 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 		    nc_get_name(ncp));
 	}
+	rw_wunlock(bucketlock);
 	if (numneg * ncnegfactor > numcache)
 		nneg = cache_negative_zap_one();
 	CACHE_WUNLOCK();
@@ -960,12 +1077,24 @@ cache_enter_time(struct vnode *dvp, stru
 	cache_free(nneg);
 }
 
+static u_int
+cache_roundup_2(u_int val)
+{
+	u_int res;
+
+	for (res = 1; res <= val; res <<= 1)
+		continue;
+
+	return (res);
+}
+
 /*
  * Name cache initialization, from vfs_init() when we are booting
  */
 static void
 nchinit(void *dummy __unused)
 {
+	u_int i;
 
 	TAILQ_INIT(&ncneg);
 
@@ -983,6 +1112,13 @@ nchinit(void *dummy __unused)
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+	numbucketlocks = cache_roundup_2(mp_ncpus * 16);
+	if (numbucketlocks > nchash)
+		numbucketlocks = nchash;
+	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < numbucketlocks; i++)
+		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK);
 
 	numcalls = counter_u64_alloc(M_WAITOK);
 	dothits = counter_u64_alloc(M_WAITOK);
@@ -1023,6 +1159,7 @@ cache_changesize(int newmaxvnodes)
 	 * because to do so, they have to be removed from the hash table.
 	 */
 	CACHE_WLOCK();
+	cache_lock_all_buckets();
 	old_nchashtbl = nchashtbl;
 	old_nchash = nchash;
 	nchashtbl = new_nchashtbl;
@@ -1035,6 +1172,7 @@ cache_changesize(int newmaxvnodes)
 			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 		}
 	}
+	cache_unlock_all_buckets();
 	CACHE_WUNLOCK();
 	free(old_nchashtbl, M_VFSCACHE);
 }
@@ -1108,20 +1246,30 @@ void
 cache_purgevfs(struct mount *mp)
 {
 	TAILQ_HEAD(, namecache) ncps;
-	struct nchashhead *ncpp;
+	struct rwlock *bucketlock;
+	struct nchashhead *bucket;
 	struct namecache *ncp, *nnp;
+	u_long i, j, n_nchash;
 
 	/* Scan hash tables for applicable entries */
 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
 	TAILQ_INIT(&ncps);
 	CACHE_WLOCK();
-	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
-		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
-			if (ncp->nc_dvp->v_mount != mp)
-				continue;
-			cache_zap(ncp);
-			TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+	n_nchash = nchash + 1;
+	for (i = 0; i < numbucketlocks; i++) {
+		bucketlock = (struct rwlock *)&bucketlocks[i];
+		rw_wlock(bucketlock);
+		for (j = i; j < n_nchash; j += numbucketlocks) {
+			bucket = &nchashtbl[j];
+			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
+				cache_assert_bucket_locked(ncp, RA_WLOCKED);
+				if (ncp->nc_dvp->v_mount != mp)
+					continue;
+				cache_zap_locked(ncp);
+				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
+			}
 		}
+		rw_wunlock(bucketlock);
 	}
 	CACHE_WUNLOCK();
 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {

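The locking comment added near the top of vfs_cache.c spells out the new order: the global "ncglobal" rwlock, then a per-chain "ncbuc" bucket rwlock, then the "ncneg" mutex for negative-entry LRU maintenance. A minimal sketch of that order for a modification (illustrative only; the real code paths are cache_zap(), cache_negative_insert() and cache_enter_time() above):

	CACHE_WLOCK();				/* "ncglobal": required for all modifications */
	rw_wlock(HASH2BUCKETLOCK(hash));	/* "ncbuc": protects the entry's hash chain */
	/* ... insert or remove the namecache entry ... */
	mtx_lock(&ncneg_mtx);			/* "ncneg": negative-entry LRU list */
	/* ... update the ncneg TAILQ ... */
	mtx_unlock(&ncneg_mtx);
	rw_wunlock(HASH2BUCKETLOCK(hash));
	CACHE_WUNLOCK();
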
Modified: user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -2499,9 +2499,13 @@ mmu_booke_clear_modify(mmu_t mmu, vm_pag
  * is necessary that 0 only be returned when there are truly no
  * reference bits set.
  *
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
+ * As an optimization, update the page's dirty field if a modified bit is
+ * found while counting reference bits.  This opportunistic update can be
+ * performed at low cost and can eliminate the need for some future calls
+ * to pmap_is_modified().  However, since this function stops after
+ * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
+ * dirty pages.  Those dirty pages will only be detected by a future call
+ * to pmap_is_modified().
  */
 static int
 mmu_booke_ts_referenced(mmu_t mmu, vm_page_t m)
@@ -2518,6 +2522,8 @@ mmu_booke_ts_referenced(mmu_t mmu, vm_pa
 		PMAP_LOCK(pv->pv_pmap);
 		if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL &&
 		    PTE_ISVALID(pte)) {
+			if (PTE_ISMODIFIED(pte))
+				vm_page_dirty(m);
 			if (PTE_ISREFERENCED(pte)) {
 				mtx_lock_spin(&tlbivax_mutex);
 				tlb_miss_lock();
@@ -2528,7 +2534,7 @@ mmu_booke_ts_referenced(mmu_t mmu, vm_pa
 				tlb_miss_unlock();
 				mtx_unlock_spin(&tlbivax_mutex);
 
-				if (++count > 4) {
+				if (++count >= PMAP_TS_REFERENCED_MAX) {
 					PMAP_UNLOCK(pv->pv_pmap);
 					break;
 				}

Modified: user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX	Sat Sep 10 17:14:57 2016	(r305688)
@@ -89,6 +89,7 @@ device		tun
 device		uart
 options 	USB_DEBUG	# enable debug msgs
 #device		uhci
+device		ehci
 device		umass
 device		usb
 device		vlan

Modified: user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -2991,8 +2991,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
 	return (FALSE);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  *	pmap_ts_referenced:
  *
@@ -3001,9 +2999,13 @@ safe_to_clear_referenced(pmap_t pmap, pt
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
- *	XXX: The exact number of bits to check and clear is a matter that
- *	should be tested and standardized at some point in the future for
- *	optimal aging of shared pages.
+ *	As an optimization, update the page's dirty field if a modified bit is
+ *	found while counting reference bits.  This opportunistic update can be
+ *	performed at low cost and can eliminate the need for some future calls
+ *	to pmap_is_modified().  However, since this function stops after
+ *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
+ *	dirty pages.  Those dirty pages will only be detected by a future call
+ *	to pmap_is_modified().
  */
 int
 pmap_ts_referenced(vm_page_t m)
@@ -3012,7 +3014,7 @@ pmap_ts_referenced(vm_page_t m)
 	pmap_t pmap;
 	struct rwlock *lock;
 	pd_entry_t *l2;
-	pt_entry_t *l3;
+	pt_entry_t *l3, old_l3;
 	vm_paddr_t pa;
 	int cleared, md_gen, not_cleared;
 	struct spglist free;
@@ -3050,15 +3052,18 @@ retry:
 		    ("pmap_ts_referenced: found an invalid l2 table"));
 
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
-		if ((pmap_load(l3) & PTE_A) != 0) {
-			if (safe_to_clear_referenced(pmap, pmap_load(l3))) {
+		old_l3 = pmap_load(l3);
+		if (pmap_page_dirty(old_l3))
+			vm_page_dirty(m);
+		if ((old_l3 & PTE_A) != 0) {
+			if (safe_to_clear_referenced(pmap, old_l3)) {
 				/*
 				 * TODO: We don't handle the access flag
 				 * at all. We need to be able to set it in
 				 * the exception handler.
 				 */
 				panic("RISCVTODO: safe_to_clear_referenced\n");
-			} else if ((pmap_load(l3) & PTE_SW_WIRED) == 0) {
+			} else if ((old_l3 & PTE_SW_WIRED) == 0) {
 				/*
 				 * Wired pages cannot be paged out so
 				 * doing accessed bit emulation for

Modified: user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c	Sat Sep 10 17:14:57 2016	(r305688)
@@ -2073,18 +2073,12 @@ pmap_page_is_mapped(vm_page_t m)
 	return (rv);
 }
 
-#define	PMAP_TS_REFERENCED_MAX	5
-
 /*
  * Return a count of reference bits for a page, clearing those bits.
  * It is not necessary for every reference bit to be cleared, but it
  * is necessary that 0 only be returned when there are truly no
  * reference bits set.
  *
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
- *
  * As an optimization, update the page's dirty field if a modified bit is
  * found while counting reference bits.  This opportunistic update can be
  * performed at low cost and can eliminate the need for some future calls

Modified: user/alc/PQ_LAUNDRY/sys/vm/pmap.h
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/vm/pmap.h	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/sys/vm/pmap.h	Sat Sep 10 17:14:57 2016	(r305688)
@@ -104,6 +104,16 @@ extern vm_offset_t kernel_vm_end;
 #define	PMAP_ENTER_NOSLEEP	0x0100
 #define	PMAP_ENTER_WIRED	0x0200
 
+/*
+ * Define the maximum number of machine-dependent reference bits that are
+ * cleared by a call to pmap_ts_referenced().  This limit serves two purposes.
+ * First, it bounds the cost of reference bit maintenance on widely shared
+ * pages.  Second, it prevents numeric overflow during maintenance of a
+ * widely shared page's "act_count" field.  An overflow could result in the
+ * premature deactivation of the page.
+ */
+#define	PMAP_TS_REFERENCED_MAX	5
+
 void		 pmap_activate(struct thread *td);
 void		 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 		    int advice);

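PMAP_TS_REFERENCED_MAX is now defined once in vm/pmap.h rather than privately in each pmap, and the new comment gives the rationale: it bounds the work done on widely shared pages and keeps the value returned by pmap_ts_referenced() small enough that folding it into a page's "act_count" cannot overflow. A rough sketch of the consumer side (the page daemon's actual code differs; the ACT_MAX clamp here is only illustrative):

	act_delta = pmap_ts_referenced(m);	/* at most PMAP_TS_REFERENCED_MAX */
	if (m->act_count + act_delta > ACT_MAX)
		m->act_count = ACT_MAX;
	else
		m->act_count += act_delta;
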
Modified: user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile
==============================================================================
--- user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile	Sat Sep 10 17:00:08 2016	(r305687)
+++ user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile	Sat Sep 10 17:14:57 2016	(r305688)
@@ -12,6 +12,7 @@ PLAIN_TESTS_C+=	subr_unit_test
 ATF_TESTS_C+=	unix_seqpacket_test
 ATF_TESTS_C+=	unix_passfd_test
 TEST_METADATA.unix_seqpacket_test+=	timeout="15"
+ATF_TESTS_C+=	waitpid_nohang
 
 LIBADD.ptrace_test+=			pthread
 LIBADD.unix_seqpacket_test+=		pthread

Copied: user/alc/PQ_LAUNDRY/tests/sys/kern/waitpid_nohang.c (from r305685, head/tests/sys/kern/waitpid_nohang.c)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/alc/PQ_LAUNDRY/tests/sys/kern/waitpid_nohang.c	Sat Sep 10 17:14:57 2016	(r305688, copy of r305685, head/tests/sys/kern/waitpid_nohang.c)
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2016 Jilles Tjoelker
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/wait.h>
+
+#include <atf-c.h>
+#include <signal.h>
+#include <unistd.h>
+
+ATF_TC_WITHOUT_HEAD(waitpid_nohang);
+ATF_TC_BODY(waitpid_nohang, tc)
+{
+	pid_t child, pid;
+	int status, r;
+
+	child = fork();
+	ATF_REQUIRE(child != -1);
+	if (child == 0) {
+		sleep(10);
+		_exit(1);
+	}
+
+	status = 42;
+	pid = waitpid(child, &status, WNOHANG);
+	ATF_REQUIRE(pid == 0);
+	ATF_CHECK(status == 42);
+
+	r = kill(child, SIGTERM);
+	ATF_REQUIRE(r == 0);
+	r = waitid(P_PID, child, NULL, WEXITED | WNOWAIT);
+	ATF_REQUIRE(r == 0);
+
+	status = -1;
+	pid = waitpid(child, &status, WNOHANG);
+	ATF_REQUIRE(pid == child);
+	ATF_CHECK(WIFSIGNALED(status) && WTERMSIG(status) == SIGTERM);
+}
+
+ATF_TP_ADD_TCS(tp)
+{
+
+	ATF_TP_ADD_TC(tp, waitpid_nohang);
+	return (atf_no_error());
+}


