Date:      Mon, 5 Feb 2018 23:01:49 +0000 (UTC)
From:      Jeff Roberson <jeff@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r328904 - in user/jeff/numa/sys: kern sys
Message-ID:  <201802052301.w15N1nEv036995@repo.freebsd.org>

Author: jeff
Date: Mon Feb  5 23:01:49 2018
New Revision: 328904
URL: https://svnweb.freebsd.org/changeset/base/328904

Log:
  Re-implement the buffer queues as a number of independent silos, each
  with its own space allotment and bufspace daemon.
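
  A minimal userland sketch of the per-silo accounting (the names, limits,
  and wakeup function are illustrative, not the committed code): each silo
  charges reservations against its own allotment and wakes only its own
  daemon when a reservation crosses the silo's threshold, much like the
  reworked bufspace_reserve() further down in the diff.

      #include <errno.h>
      #include <stdatomic.h>
      #include <stdio.h>

      struct silo {
              _Atomic long    bufspace;       /* space charged to this silo */
              long            hibufspace;     /* per-silo allotment */
              long            bufspacethresh; /* per-silo daemon wakeup point */
      };

      static void
      wake_silo_daemon(struct silo *s)
      {
              printf("wake the daemon for silo %p\n", (void *)s);
      }

      static int
      silo_reserve(struct silo *s, long size)
      {
              long space, new;

              do {
                      space = atomic_load(&s->bufspace);
                      new = space + size;
                      if (new > s->hibufspace)
                              return (ENOSPC);
              } while (!atomic_compare_exchange_weak(&s->bufspace, &space, new));
              /* Wake the silo's daemon only on the transition across the threshold. */
              if (space < s->bufspacethresh && new >= s->bufspacethresh)
                      wake_silo_daemon(s);
              return (0);
      }

      int
      main(void)
      {
              struct silo s = { .hibufspace = 1L << 20, .bufspacethresh = 1L << 19 };

              atomic_init(&s.bufspace, 0);
              silo_reserve(&s, 600 * 1024);   /* crosses the threshold */
              return (0);
      }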
  
  Use a per-cpu clean queue cache in front of the silo clean queue.
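
  A userland sketch of the clean-queue caching (the spill policy and limit
  below are assumptions, not the committed code): a released clean buffer
  lands on the releasing CPU's queue, which is drained into the shared silo
  queue once it grows past a small per-CPU limit, so the shared lock is
  taken once per batch instead of once per buffer.

      #include <sys/queue.h>
      #include <stdio.h>

      struct buf {
              int                     id;
              TAILQ_ENTRY(buf)        b_freelist;
      };
      TAILQ_HEAD(bufq, buf);

      #define NCPU            4
      #define PERCPU_LIMIT    2       /* illustrative analogue of bd_lim */

      static struct bufq      cpu_cache[NCPU];
      static int              cpu_len[NCPU];
      static struct bufq      cleanq = TAILQ_HEAD_INITIALIZER(cleanq);

      static void
      release_clean(int cpu, struct buf *bp)
      {

              TAILQ_INSERT_TAIL(&cpu_cache[cpu], bp, b_freelist);
              if (++cpu_len[cpu] > PERCPU_LIMIT) {
                      /* Spill the whole per-CPU cache into the shared queue. */
                      TAILQ_CONCAT(&cleanq, &cpu_cache[cpu], b_freelist);
                      cpu_len[cpu] = 0;
              }
      }

      int
      main(void)
      {
              struct buf bufs[8], *bp;
              int i;

              for (i = 0; i < NCPU; i++)
                      TAILQ_INIT(&cpu_cache[i]);
              for (i = 0; i < 8; i++) {
                      bufs[i].id = i;
                      release_clean(0, &bufs[i]);
              }
              TAILQ_FOREACH(bp, &cleanq, b_freelist)
                      printf("spilled buf %d\n", bp->id);
              return (0);
      }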
  
  Move the common queue variables (queue, len, lock) into a structure so they
  can be aligned and packed together.
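
  A quick userland check of the layout goal (64 is an assumed cache-line
  size and the mutex is a stand-in for the kernel's padded mutex): once the
  queue head, its length, and its lock live in one cache-line-aligned
  structure, adjacent queues in an array can no longer share a line.

      #include <sys/queue.h>
      #include <pthread.h>
      #include <stdio.h>

      #define CACHE_LINE_SIZE 64      /* assumed; the kernel uses its own constant */

      struct buf;
      struct bufqueue {
              pthread_mutex_t         bq_lock;        /* stand-in for mtx_padalign */
              TAILQ_HEAD(, buf)       bq_queue;
              int                     bq_len;
      } __attribute__((aligned(CACHE_LINE_SIZE)));

      int
      main(void)
      {

              printf("sizeof(struct bufqueue) = %zu, alignment = %zu\n",
                  sizeof(struct bufqueue), _Alignof(struct bufqueue));
              return (0);
      }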
  
  Implement a REUSE flag to operate as a second chance in buf_recycle() so
  we don't have to requeue frequently re-used buffers.
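
  A userland sketch of the second-chance policy (locking omitted; the names
  mirror the diff but this is not the committed code): a buffer found at the
  head of the queue with B_REUSE set is cleared and requeued at the tail
  rather than reclaimed, so a frequently re-used buffer survives one more
  pass before it can be recycled.

      #include <sys/queue.h>
      #include <stdio.h>

      #define B_REUSE 0x1

      struct buf {
              int                     b_flags;
              int                     id;
              TAILQ_ENTRY(buf)        b_freelist;
      };
      static TAILQ_HEAD(, buf)        cleanq = TAILQ_HEAD_INITIALIZER(cleanq);

      /* Return the first buffer that does not earn a second chance. */
      static struct buf *
      recycle_one(void)
      {
              struct buf *bp, *nbp;

              TAILQ_FOREACH_SAFE(bp, &cleanq, b_freelist, nbp) {
                      if (bp->b_flags & B_REUSE) {
                              /* Second chance: requeue at the tail, clear the flag. */
                              TAILQ_REMOVE(&cleanq, bp, b_freelist);
                              TAILQ_INSERT_TAIL(&cleanq, bp, b_freelist);
                              bp->b_flags &= ~B_REUSE;
                              continue;
                      }
                      TAILQ_REMOVE(&cleanq, bp, b_freelist);
                      return (bp);
              }
              return (NULL);
      }

      int
      main(void)
      {
              struct buf bufs[3] = { { B_REUSE, 0 }, { 0, 1 }, { B_REUSE, 2 } };
              struct buf *bp;
              int i;

              for (i = 0; i < 3; i++)
                      TAILQ_INSERT_TAIL(&cleanq, &bufs[i], b_freelist);
              bp = recycle_one();
              printf("recycled buf %d\n", bp != NULL ? bp->id : -1);  /* buf 1 */
              return (0);
      }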
  
  Move counters to the counter API.
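
  The converted statistics use the standard counter(9) pattern; a kernel-style
  sketch with a hypothetical counter name (the sysctl node and init hook are
  illustrative, the calls themselves match the diff):

      #include <sys/param.h>
      #include <sys/systm.h>
      #include <sys/counter.h>
      #include <sys/malloc.h>
      #include <sys/sysctl.h>

      static counter_u64_t example_hits;              /* hypothetical counter */
      SYSCTL_COUNTER_U64(_vfs, OID_AUTO, example_hits, CTLFLAG_RD,
          &example_hits, "Hypothetical event count");

      /* Called once during init; the diff allocates its counters in bufinit(). */
      static void
      example_counter_init(void)
      {

              example_hits = counter_u64_alloc(M_WAITOK);
      }

      static void
      example_event(void)
      {

              /* Per-CPU increment: no atomics and no shared cache line dirtied. */
              counter_u64_add(example_hits, 1);
      }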

Modified:
  user/jeff/numa/sys/kern/vfs_bio.c
  user/jeff/numa/sys/kern/vfs_subr.c
  user/jeff/numa/sys/sys/buf.h
  user/jeff/numa/sys/sys/bufobj.h

Modified: user/jeff/numa/sys/kern/vfs_bio.c
==============================================================================
--- user/jeff/numa/sys/kern/vfs_bio.c	Mon Feb  5 22:21:51 2018	(r328903)
+++ user/jeff/numa/sys/kern/vfs_bio.c	Mon Feb  5 23:01:49 2018	(r328904)
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
+#include <sys/counter.h>
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
@@ -105,7 +106,6 @@ caddr_t unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
 struct proc *bufdaemonproc;
-struct proc *bufspacedaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -124,11 +124,8 @@ static int vfs_bio_clcheck(struct vnode *vp, int size,
 static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
 		void (*)(struct buf *));
 static int buf_flush(struct vnode *vp, int);
-static int buf_recycle(bool);
-static int buf_scan(bool);
 static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
-static void bremfreel(struct buf *bp);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 static void bufkva_reclaim(vmem_t *, int);
@@ -137,28 +134,17 @@ static int buf_import(void *, void **, int, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
 
-#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
-    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
-#endif
-
 int vmiodirenable = TRUE;
 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
     "Use the VM system for directory writes");
 long runningbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
     "Amount of presently outstanding async buffer io");
-static long bufspace;
-#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
-    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
-    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
-#else
-SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
-    "Physical memory used for buffers");
-#endif
-static long bufkvaspace;
-SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
+    NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
+static counter_u64_t bufkvaspace;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
@@ -178,11 +164,11 @@ SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &h
 long bufspacethresh;
 SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
     0, "Bufspace consumed before waking the daemon to free some");
-static int buffreekvacnt;
-SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
+static counter_u64_t buffreekvacnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
     "Number of times we have freed the KVA space from some buffer");
-static int bufdefragcnt;
-SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
+static counter_u64_t bufdefragcnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
     "Number of times we have had to repeat buffer allocation to defragment");
 static long lorunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
@@ -225,24 +211,26 @@ SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, 
 static int hifreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "Threshold for clean buffer recycling");
-static int getnewbufcalls;
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
-   "Number of calls to getnewbuf");
-static int getnewbufrestarts;
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
+static counter_u64_t getnewbufcalls;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
+   &getnewbufcalls, "Number of calls to getnewbuf");
+static counter_u64_t getnewbufrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
+    &getnewbufrestarts,
     "Number of times getnewbuf has had to restart a buffer acquisition");
-static int mappingrestarts;
-SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+static counter_u64_t mappingrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
+    &mappingrestarts,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
-static int numbufallocfails;
-SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
-    "Number of times buffer allocations failed");
+static counter_u64_t numbufallocfails;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
+    &numbufallocfails, "Number of times buffer allocations failed");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
-static long notbufdflushes;
-SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
+static counter_u64_t notbufdflushes;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
@@ -266,11 +254,6 @@ static struct mtx_padalign __exclusive_cache_line bdlo
 static struct mtx_padalign __exclusive_cache_line rbreqlock;
 
 /*
- * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
- */
-static struct rwlock_padalign __exclusive_cache_line nblock;
-
-/*
  * Lock that protects bdirtywait.
  */
 static struct mtx_padalign __exclusive_cache_line bdirtylock;
@@ -283,11 +266,6 @@ static struct mtx_padalign __exclusive_cache_line bdir
 static int bd_request;
 
 /*
- * Request/wakeup point for the bufspace daemon.
- */
-static int bufspace_request;
-
-/*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuf.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
@@ -302,15 +280,6 @@ static int bd_speedupreq;
  */
 static int runningbufreq;
 
-/* 
- * Synchronization (sleep/wakeup) variable for buffer requests.
- * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
- * by and/or.
- * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
- * getnewbuf(), and getblk().
- */
-static volatile int needsbuffer;
-
 /*
  * Synchronization for bwillwrite() waiters.
  */
@@ -323,29 +292,65 @@ static int bdirtywait;
 #define QUEUE_EMPTY	1	/* empty buffer headers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
 #define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
-#define QUEUE_SENTINEL	1024	/* not an queue index, but mark for sentinel */
+#define QUEUE_SENTINEL	4	/* not a queue index, but mark for sentinel */
 
-/* Maximum number of clean buffer queues. */
-#define	CLEAN_QUEUES	16
+struct bufqueue {
+	struct mtx_padalign	bq_lock;
+	TAILQ_HEAD(, buf)	bq_queue;
+	uint8_t			bq_index;
+	uint16_t		bq_cpu;
+	int			bq_len;
+} __aligned(CACHE_LINE_SIZE);
 
+#define	BQ_LOCKPTR(bq)		(&(bq)->bq_lock)
+#define	BQ_LOCK(bq)		mtx_lock(BQ_LOCKPTR((bq)))
+#define	BQ_UNLOCK(bq)		mtx_unlock(BQ_LOCKPTR((bq)))
+#define	BQ_ASSERT_LOCKED(bq)	mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
+
+struct bufqueue __exclusive_cache_line bqempty;
+struct bufqueue __exclusive_cache_line bqdirty;
+
+struct bufdomain {
+	struct bufqueue	bd_cpuq[MAXCPU];
+	struct bufqueue	bd_cleanq;
+	/* Constants */
+	long		bd_maxbufspace;
+	long		bd_hibufspace;
+	long 		bd_lobufspace;
+	long 		bd_bufspacethresh;
+	int		bd_hifreebuffers;
+	int		bd_lofreebuffers;
+	int		bd_lim;
+	/* atomics */
+	int		bd_wanted;
+	int  __aligned(CACHE_LINE_SIZE)	bd_request;
+	long __aligned(CACHE_LINE_SIZE) bd_bufspace;
+	int __aligned(CACHE_LINE_SIZE)	bd_freebuffers;
+} __aligned(CACHE_LINE_SIZE);
+
+#define	BD_LOCKPTR(bd)		(&(bd)->bd_cleanq.bq_lock)
+#define	BD_LOCK(bd)		mtx_lock(BD_LOCKPTR((bd)))
+#define	BD_UNLOCK(bd)		mtx_unlock(BD_LOCKPTR((bd)))
+#define	BD_ASSERT_LOCKED(bd)	mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
+#define	BD_DOMAIN(bd)		(bd - bdclean)
+
+/* Maximum number of clean buffer domains. */
+#define	CLEAN_DOMAINS	8
+
 /* Configured number of clean queues. */
-static int clean_queues;
+static int __read_mostly clean_domains;
 
-/* Maximum number of buffer queues. */
-#define BUFFER_QUEUES	(QUEUE_CLEAN + CLEAN_QUEUES)
+struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS];
 
-/* Queues for free buffers with various properties */
-static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
-#ifdef INVARIANTS
-static int bq_len[BUFFER_QUEUES];
-#endif
+static void bq_remove(struct bufqueue *bq, struct buf *bp);
+static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
+static int buf_recycle(struct bufdomain *, bool kva);
+static void bq_init(struct bufqueue *bq, int qindex, int cpu,
+	    const char *lockname);
+static void bd_init(struct bufdomain *bd);
+static int bd_flushall(struct bufdomain *bd);
 
 /*
- * Lock for each bufqueue
- */
-static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES];
-
-/*
  * per-cpu empty buffer cache.
  */
 uma_zone_t buf_zone;
@@ -391,46 +396,34 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
 	long lvalue;
 	int ivalue;
+	int i;
 
+	lvalue = 0;
+	for (i = 0; i < clean_domains; i++)
+		lvalue += bdclean[i].bd_bufspace;
 	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
-		return (sysctl_handle_long(oidp, arg1, arg2, req));
-	lvalue = *(long *)arg1;
+		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	if (lvalue > INT_MAX)
 		/* On overflow, still write out a long to trigger ENOMEM. */
 		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	ivalue = lvalue;
 	return (sysctl_handle_int(oidp, &ivalue, 0, req));
 }
-#endif
-
+#else
 static int
-bqcleanq(void)
+sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
-	static int nextq;
+	long lvalue;
+	int i;
 
-	return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
+	lvalue = 0;
+	for (i = 0; i < clean_domains; i++)
+		lvalue += bdclean[i].bd_bufspace;
+	return (sysctl_handle_int(oidp, &lvalue, 0, req));
 }
+#endif
 
-static int
-bqisclean(int qindex)
-{
-
-	return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
-}
-
 /*
- *	bqlock:
- *
- *	Return the appropriate queue lock based on the index.
- */
-static inline struct mtx *
-bqlock(int qindex)
-{
-
-	return (struct mtx *)&bqlocks[qindex];
-}
-
-/*
  *	bdirtywakeup:
  *
  *	Wakeup any bwillwrite() waiters.
@@ -481,50 +474,23 @@ bdirtyadd(void)
 }
 
 /*
- *	bufspace_wakeup:
+ *	bufspace_daemonwakeup:
  *
- *	Called when buffer space is potentially available for recovery.
- *	getnewbuf() will block on this flag when it is unable to free 
- *	sufficient buffer space.  Buffer space becomes recoverable when 
- *	bp's get placed back in the queues.
+ *	Wakeup the daemons responsible for freeing clean bufs.
  */
 static void
-bufspace_wakeup(void)
+bufspace_daemonwakeup(struct bufdomain *bd)
 {
 
-	/*
-	 * If someone is waiting for bufspace, wake them up.
-	 *
-	 * Since needsbuffer is set prior to doing an additional queue
-	 * scan it is safe to check for the flag prior to acquiring the
-	 * lock.  The thread that is preparing to scan again before
-	 * blocking would discover the buf we released.
-	 */
-	if (needsbuffer) {
-		rw_rlock(&nblock);
-		if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
-			wakeup(__DEVOLATILE(void *, &needsbuffer));
-		rw_runlock(&nblock);
+	if (atomic_fetchadd_int(&bd->bd_request, 1) == 0) {
+		BD_LOCK(bd);
+		bd->bd_request = 1;
+		wakeup(&bd->bd_request);
+		BD_UNLOCK(bd);
 	}
 }
 
 /*
- *	bufspace_daemonwakeup:
- *
- *	Wakeup the daemon responsible for freeing clean bufs.
- */
-static void
-bufspace_daemonwakeup(void)
-{
-	rw_rlock(&nblock);
-	if (bufspace_request == 0) {
-		bufspace_request = 1;
-		wakeup(&bufspace_request);
-	}
-	rw_runlock(&nblock);
-}
-
-/*
  *	bufspace_adjust:
  *
  *	Adjust the reported bufspace for a KVA managed buffer, possibly
@@ -533,20 +499,22 @@ bufspace_daemonwakeup(void)
 static void
 bufspace_adjust(struct buf *bp, int bufsize)
 {
+	struct bufdomain *bd;
 	long space;
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) == 0,
 	    ("bufspace_adjust: malloc buf %p", bp));
+	bd = &bdclean[bp->b_domain];
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0) {
-		atomic_subtract_long(&bufspace, -diff);
-		bufspace_wakeup();
+		atomic_subtract_long(&bd->bd_bufspace, -diff);
 	} else {
-		space = atomic_fetchadd_long(&bufspace, diff);
+		space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
 		/* Wake up the daemon on the transition. */
-		if (space < bufspacethresh && space + diff >= bufspacethresh)
-			bufspace_daemonwakeup();
+		if (space < bd->bd_bufspacethresh &&
+		    space + diff >= bd->bd_bufspacethresh)
+			bufspace_daemonwakeup(bd);
 	}
 	bp->b_bufsize = bufsize;
 }
@@ -558,24 +526,25 @@ bufspace_adjust(struct buf *bp, int bufsize)
  *	different space limit than data.
  */
 static int
-bufspace_reserve(int size, bool metadata)
+bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
 {
-	long limit;
+	long limit, new;
 	long space;
 
 	if (metadata)
-		limit = maxbufspace;
+		limit = bd->bd_maxbufspace;
 	else
-		limit = hibufspace;
+		limit = bd->bd_hibufspace;
 	do {
-		space = bufspace;
-		if (space + size > limit)
+		space = bd->bd_bufspace;
+		new = space + size;
+		if (new > limit)
 			return (ENOSPC);
-	} while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
+	} while (atomic_cmpset_long(&bd->bd_bufspace, space, new) == 0);
 
 	/* Wake up the daemon on the transition. */
-	if (space < bufspacethresh && space + size >= bufspacethresh)
-		bufspace_daemonwakeup();
+	if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
+		bufspace_daemonwakeup(bd);
 
 	return (0);
 }
@@ -586,21 +555,22 @@ bufspace_reserve(int size, bool metadata)
  *	Release reserved bufspace after bufspace_adjust() has consumed it.
  */
 static void
-bufspace_release(int size)
+bufspace_release(struct bufdomain *bd, int size)
 {
-	atomic_subtract_long(&bufspace, size);
-	bufspace_wakeup();
+
+	atomic_subtract_long(&bd->bd_bufspace, size);
 }
 
 /*
  *	bufspace_wait:
  *
  *	Wait for bufspace, acting as the buf daemon if a locked vnode is
- *	supplied.  needsbuffer must be set in a safe fashion prior to
- *	polling for space.  The operation must be re-tried on return.
+ *	supplied.  bd_wanted must be set prior to polling for space.  The
+ *	operation must be re-tried on return.
  */
 static void
-bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
+bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
+    int slpflag, int slptimeo)
 {
 	struct thread *td;
 	int error, fl, norunbuf;
@@ -609,11 +579,11 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
 		return;
 
 	td = curthread;
-	rw_wlock(&nblock);
-	while (needsbuffer != 0) {
+	BD_LOCK(bd);
+	while (bd->bd_wanted) {
 		if (vp != NULL && vp->v_type != VCHR &&
 		    (td->td_pflags & TDP_BUFNEED) == 0) {
-			rw_wunlock(&nblock);
+			BD_UNLOCK(bd);
 			/*
 			 * getblk() is called with a vnode locked, and
 			 * some majority of the dirty buffers may as
@@ -636,18 +606,18 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
 			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
 			fl = buf_flush(vp, flushbufqtarget);
 			td->td_pflags &= norunbuf;
-			rw_wlock(&nblock);
+			BD_LOCK(bd);
 			if (fl != 0)
 				continue;
-			if (needsbuffer == 0)
+			if (bd->bd_wanted == 0)
 				break;
 		}
-		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+		error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
 		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
 		if (error != 0)
 			break;
 	}
-	rw_wunlock(&nblock);
+	BD_UNLOCK(bd);
 }
 
 
@@ -659,10 +629,13 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
  *	block nor work to reclaim buffers.
  */
 static void
-bufspace_daemon(void)
+bufspace_daemon(void *arg)
 {
+	struct bufdomain *bd;
+
+	bd = arg;
 	for (;;) {
-		kproc_suspend_check(bufspacedaemonproc);
+		kproc_suspend_check(curproc);
 
 		/*
 		 * Free buffers from the clean queue until we meet our
@@ -689,46 +662,35 @@ bufspace_daemon(void)
 		 *	which will inefficiently trade bufs with bqrelse
 		 *	until we return to condition 2.
 		 */
-		while (bufspace > lobufspace ||
-		    numfreebuffers < hifreebuffers) {
-			if (buf_recycle(false) != 0) {
-				atomic_set_int(&needsbuffer, 1);
-				if (buf_recycle(false) != 0) {
-					rw_wlock(&nblock);
-					if (needsbuffer)
-						rw_sleep(__DEVOLATILE(void *,
-						    &needsbuffer), &nblock,
-						    PRIBIO|PDROP, "bufspace",
-						    hz/10);
-					else
-						rw_wunlock(&nblock);
-				}
+		do {
+			if (buf_recycle(bd, false) != 0) {
+				if (bd_flushall(bd))
+					continue;
+				BD_LOCK(bd);
+				if (bd->bd_wanted) {
+					msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
+					    PRIBIO|PDROP, "bufspace", hz/10);
+				} else
+					BD_UNLOCK(bd);
 			}
 			maybe_yield();
-		}
+		} while (bd->bd_bufspace > bd->bd_lobufspace ||
+		    bd->bd_freebuffers < bd->bd_hifreebuffers);
 
 		/*
-		 * Re-check our limits under the exclusive nblock.
+		 * Re-check our limits and sleep.
 		 */
-		rw_wlock(&nblock);
-		if (bufspace < bufspacethresh &&
-		    numfreebuffers > lofreebuffers) {
-			bufspace_request = 0;
-			rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
+		BD_LOCK(bd);
+		if (bd->bd_bufspace < bd->bd_bufspacethresh &&
+		    bd->bd_freebuffers > bd->bd_lofreebuffers) {
+			bd->bd_request = 0;
+			msleep(&bd->bd_request, BD_LOCKPTR(bd), PRIBIO|PDROP,
 			    "-", hz);
 		} else
-			rw_wunlock(&nblock);
+			BD_UNLOCK(bd);
 	}
 }
 
-static struct kproc_desc bufspace_kp = {
-	"bufspacedaemon",
-	bufspace_daemon,
-	&bufspacedaemonproc
-};
-SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
-    &bufspace_kp);
-
 /*
  *	bufmallocadjust:
  *
@@ -1038,38 +1000,32 @@ bufinit(void)
 	KASSERT(maxbcachebuf >= MAXBSIZE,
 	    ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
 	    MAXBSIZE));
-	mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
-	mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
-	for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
-		mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
+	bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
+	bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock");
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
-	rw_init(&nblock, "needsbuffer lock");
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
-	/* next, make a null set of free lists */
-	for (i = 0; i < BUFFER_QUEUES; i++)
-		TAILQ_INIT(&bufqueues[i]);
-
 	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 
 	/* finally, initialize each buffer header and stick on empty q */
+	BQ_LOCK(&bqempty);
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
 		bp->b_flags = B_INVAL;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
-		bp->b_qindex = QUEUE_EMPTY;
+		bp->b_qindex = QUEUE_NONE;
+		bp->b_domain = -1;
+		bp->b_cpu = -1;
 		bp->b_xflags = 0;
 		bp->b_data = bp->b_kvabase = unmapped_buf;
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
-		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
-#ifdef INVARIANTS
-		bq_len[QUEUE_EMPTY]++;
-#endif
+		bq_insert(&bqempty, bp, false);
 	}
+	BQ_UNLOCK(&bqempty);
 
 	/*
 	 * maxbufspace is the absolute maximum amount of buffer space we are 
@@ -1150,8 +1106,31 @@ bufinit(void)
 	 * One queue per-256mb up to the max.  More queues gives better
 	 * concurrency but less accurate LRU.
 	 */
-	clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
+	clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS);
+	for (i = 0 ; i < clean_domains; i++) {
+		struct bufdomain *bd;
 
+		bd = &bdclean[i];
+		bd_init(bd);
+		bd->bd_freebuffers = nbuf / clean_domains;
+		bd->bd_hifreebuffers = hifreebuffers / clean_domains;
+		bd->bd_lofreebuffers = lofreebuffers / clean_domains;
+		bd->bd_bufspace = 0;
+		bd->bd_maxbufspace = maxbufspace / clean_domains;
+		bd->bd_hibufspace = hibufspace / clean_domains;
+		bd->bd_lobufspace = lobufspace / clean_domains;
+		bd->bd_bufspacethresh = bufspacethresh / clean_domains;
+		/* Don't allow more than 2% of bufs in the per-cpu caches. */
+		bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus;
+	}
+	getnewbufcalls = counter_u64_alloc(M_WAITOK);
+	getnewbufrestarts = counter_u64_alloc(M_WAITOK);
+	mappingrestarts = counter_u64_alloc(M_WAITOK);
+	numbufallocfails = counter_u64_alloc(M_WAITOK);
+	notbufdflushes = counter_u64_alloc(M_WAITOK);
+	buffreekvacnt = counter_u64_alloc(M_WAITOK);
+	bufdefragcnt = counter_u64_alloc(M_WAITOK);
+	bufkvaspace = counter_u64_alloc(M_WAITOK);
 }
 
 #ifdef INVARIANTS
@@ -1326,58 +1305,77 @@ bpmap_qenter(struct buf *bp)
 	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
+static struct bufqueue *
+bufqueue(struct buf *bp)
+{
+	struct bufdomain *bd;
+
+	switch (bp->b_qindex) {
+	case QUEUE_NONE:
+		/* FALLTHROUGH */
+	case QUEUE_SENTINEL:
+		return (NULL);
+	case QUEUE_EMPTY:
+		return (&bqempty);
+	case QUEUE_DIRTY:
+		return (&bqdirty);
+	case QUEUE_CLEAN:
+		/* FALLTHROUGH */
+		break;
+	default:
+		panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex);
+	}
+	bd = &bdclean[bp->b_domain];
+	if (bp->b_cpu > mp_maxid)
+		return (&bd->bd_cleanq);
+	return (&bd->bd_cpuq[bp->b_cpu]);
+
+}
+
 /*
  *	binsfree:
  *
- *	Insert the buffer into the appropriate free list.
+ *	Insert the buffer into the appropriate free list.  Requires a
+ *	locked buffer on entry and buffer is unlocked before return.
  */
 static void
 binsfree(struct buf *bp, int qindex)
 {
-	struct mtx *olock, *nlock;
+	struct bufdomain *bd;
+	struct bufqueue *bq;
 
-	if (qindex != QUEUE_EMPTY) {
-		BUF_ASSERT_XLOCKED(bp);
-	}
+	KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY,
+	    ("binsfree: Invalid qindex %d", qindex));
+	BUF_ASSERT_XLOCKED(bp);
 
 	/*
-	 * Stick to the same clean queue for the lifetime of the buf to
-	 * limit locking below.  Otherwise pick ont sequentially.
-	 */
-	if (qindex == QUEUE_CLEAN) {
-		if (bqisclean(bp->b_qindex))
-			qindex = bp->b_qindex;
-		else
-			qindex = bqcleanq();
-	}
-
-	/*
 	 * Handle delayed bremfree() processing.
 	 */
-	nlock = bqlock(qindex);
 	if (bp->b_flags & B_REMFREE) {
-		olock = bqlock(bp->b_qindex);
-		mtx_lock(olock);
-		bremfreel(bp);
-		if (olock != nlock) {
-			mtx_unlock(olock);
-			mtx_lock(nlock);
+		if (bp->b_qindex == qindex) {
+			bp->b_flags |= B_REUSE;
+			bp->b_flags &= ~B_REMFREE;
+			BUF_UNLOCK(bp);
+			return;
 		}
+		bq = bufqueue(bp);
+		BQ_LOCK(bq);
+		bq_remove(bq, bp);
+		BQ_UNLOCK(bq);
+	}
+	if (qindex == QUEUE_CLEAN) {
+		bd = &bdclean[bp->b_domain];
+		if (bd->bd_lim != 0)
+			bq = &bd->bd_cpuq[PCPU_GET(cpuid)];
+		else
+			bq = &bd->bd_cleanq;
 	} else
-		mtx_lock(nlock);
+		bq = &bqdirty;
+	BQ_LOCK(bq);
+	bq_insert(bq, bp, true);
+	BQ_UNLOCK(bq);
 
-	if (bp->b_qindex != QUEUE_NONE)
-		panic("binsfree: free buffer onto another queue???");
-
-	bp->b_qindex = qindex;
-	if (bp->b_flags & B_AGE)
-		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
-	else
-		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
-#ifdef INVARIANTS
-	bq_len[bp->b_qindex]++;
-#endif
-	mtx_unlock(nlock);
+	return;
 }
 
 /*
@@ -1404,10 +1402,9 @@ buf_free(struct buf *bp)
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	bufkva_free(bp);
+	atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1);
 	BUF_UNLOCK(bp);
 	uma_zfree(buf_zone, bp);
-	atomic_add_int(&numfreebuffers, 1);
-	bufspace_wakeup();
 }
 
 /*
@@ -1424,15 +1421,15 @@ buf_import(void *arg, void **store, int cnt, int domai
 	struct buf *bp;
 	int i;
 
-	mtx_lock(&bqlocks[QUEUE_EMPTY]);
+	BQ_LOCK(&bqempty);
 	for (i = 0; i < cnt; i++) {
-		bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+		bp = TAILQ_FIRST(&bqempty.bq_queue);
 		if (bp == NULL)
 			break;
-		bremfreel(bp);
+		bq_remove(&bqempty, bp);
 		store[i] = bp;
 	}
-	mtx_unlock(&bqlocks[QUEUE_EMPTY]);
+	BQ_UNLOCK(&bqempty);
 
 	return (i);
 }
@@ -1447,8 +1444,10 @@ buf_release(void *arg, void **store, int cnt)
 {
         int i;
 
+	BQ_LOCK(&bqempty);
         for (i = 0; i < cnt; i++)
-		binsfree(store[i], QUEUE_EMPTY);
+		bq_insert(&bqempty, store[i], false);
+	BQ_UNLOCK(&bqempty);
 }
 
 /*
@@ -1457,22 +1456,31 @@ buf_release(void *arg, void **store, int cnt)
  *	Allocate an empty buffer header.
  */
 static struct buf *
-buf_alloc(void)
+buf_alloc(struct bufdomain *bd)
 {
 	struct buf *bp;
+	int freebufs;
 
-	bp = uma_zalloc(buf_zone, M_NOWAIT);
+	/*
+	 * We can only run out of bufs in the buf zone if the average buf
+	 * is less than BKVASIZE.  In this case the actual wait/block will
+	 * come from buf_recycle() failing to flush one of these small bufs.
+	 */
+	bp = NULL;
+	freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1);
+	if (freebufs > 0)
+		bp = uma_zalloc(buf_zone, M_NOWAIT);
 	if (bp == NULL) {
-		bufspace_daemonwakeup();
-		atomic_add_int(&numbufallocfails, 1);
+		atomic_fetchadd_int(&bd->bd_freebuffers, 1);
+		bufspace_daemonwakeup(bd);
+		counter_u64_add(numbufallocfails, 1);
 		return (NULL);
 	}
-
 	/*
-	 * Wake-up the bufspace daemon on transition.
+	 * Wake-up the bufspace daemon on transition below threshold.
 	 */
-	if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
-		bufspace_daemonwakeup();
+	if (freebufs == bd->bd_lofreebuffers)
+		bufspace_daemonwakeup(bd);
 
 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
@@ -1488,6 +1496,7 @@ buf_alloc(void)
 	KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
 	KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
 
+	bp->b_domain = BD_DOMAIN(bd);
 	bp->b_flags = 0;
 	bp->b_ioflags = 0;
 	bp->b_xflags = 0;
@@ -1512,22 +1521,26 @@ buf_alloc(void)
 }
 
 /*
- *	buf_qrecycle:
+ *	buf_recycle:
  *
  *	Free a buffer from the given bufqueue.  kva controls whether the
  *	freed buf must own some kva resources.  This is used for
  *	defragmenting.
  */
 static int
-buf_qrecycle(int qindex, bool kva)
+buf_recycle(struct bufdomain *bd, bool kva)
 {
+	struct bufqueue *bq;
 	struct buf *bp, *nbp;
 
 	if (kva)
-		atomic_add_int(&bufdefragcnt, 1);
+		counter_u64_add(bufdefragcnt, 1);
 	nbp = NULL;
-	mtx_lock(&bqlocks[qindex]);
-	nbp = TAILQ_FIRST(&bufqueues[qindex]);
+	bq = &bd->bd_cleanq;
+	BQ_LOCK(bq);
+	KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd),
+	    ("buf_recycle: Locks don't match"));
+	nbp = TAILQ_FIRST(&bq->bq_queue);
 
 	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
@@ -1551,6 +1564,18 @@ buf_qrecycle(int qindex, bool kva)
 			continue;
 
 		/*
+		 * Implement a second chance algorithm for frequently
+		 * accessed buffers.
+		 */
+		if ((bp->b_flags & B_REUSE) != 0) {
+			TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
+			TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+			bp->b_flags &= ~B_REUSE;
+			BUF_UNLOCK(bp);
+			continue;
+		}
+
+		/*
 		 * Skip buffers with background writes in progress.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
@@ -1558,14 +1583,18 @@ buf_qrecycle(int qindex, bool kva)
 			continue;
 		}
 
-		KASSERT(bp->b_qindex == qindex,
-		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+		KASSERT(bp->b_qindex == QUEUE_CLEAN,
+		    ("buf_recycle: inconsistent queue %d bp %p",
+		    bp->b_qindex, bp));
+		KASSERT(bp->b_domain == BD_DOMAIN(bd),
+		    ("getnewbuf: queue domain %d doesn't match request %ld",
+		    bp->b_domain, BD_DOMAIN(bd)));
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
 		 */
-		bremfreel(bp);
-		mtx_unlock(&bqlocks[qindex]);
+		bq_remove(bq, bp);
+		BQ_UNLOCK(bq);
 
 		/*
 		 * Requeue the background write buffer with error and
@@ -1573,70 +1602,21 @@ buf_qrecycle(int qindex, bool kva)
 		 */
 		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
 			bqrelse(bp);
-			mtx_lock(&bqlocks[qindex]);
-			nbp = TAILQ_FIRST(&bufqueues[qindex]);
+			BQ_LOCK(bq);
+			nbp = TAILQ_FIRST(&bq->bq_queue);
 			continue;
 		}
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 		return (0);
 	}
-	mtx_unlock(&bqlocks[qindex]);
+	bd->bd_wanted = 1;
+	BQ_UNLOCK(bq);
 
 	return (ENOBUFS);
 }
 
 /*
- *	buf_recycle:
- *
- *	Iterate through all clean queues until we find a buf to recycle or
- *	exhaust the search.
- */
-static int
-buf_recycle(bool kva)
-{
-	int qindex, first_qindex;
-
-	qindex = first_qindex = bqcleanq();
-	do {
-		if (buf_qrecycle(qindex, kva) == 0)
-			return (0);
-		if (++qindex == QUEUE_CLEAN + clean_queues)
-			qindex = QUEUE_CLEAN;
-	} while (qindex != first_qindex);
-
-	return (ENOBUFS);
-}
-
-/*
- *	buf_scan:
- *
- *	Scan the clean queues looking for a buffer to recycle.  needsbuffer
- *	is set on failure so that the caller may optionally bufspace_wait()
- *	in a race-free fashion.
- */
-static int
-buf_scan(bool defrag)
-{
-	int error;
-
-	/*
-	 * To avoid heavy synchronization and wakeup races we set
-	 * needsbuffer and re-poll before failing.  This ensures that
-	 * no frees can be missed between an unsuccessful poll and
-	 * going to sleep in a synchronized fashion.
-	 */
-	if ((error = buf_recycle(defrag)) != 0) {
-		atomic_set_int(&needsbuffer, 1);
-		bufspace_daemonwakeup();
-		error = buf_recycle(defrag);
-	}
-	if (error == 0)
-		atomic_add_int(&getnewbufrestarts, 1);
-	return (error);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


