Date: Mon, 5 Feb 2018 23:01:49 +0000 (UTC) From: Jeff Roberson <jeff@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r328904 - in user/jeff/numa/sys: kern sys Message-ID: <201802052301.w15N1nEv036995@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: jeff Date: Mon Feb 5 23:01:49 2018 New Revision: 328904 URL: https://svnweb.freebsd.org/changeset/base/328904 Log: Re-implement the buffer queues with a number of independent silos each having their own space allotment and bufspace daemon. Use a per-cpu clean queue cache in front of the silo clean queue. Move the common queue variables (queue, len, lock) into a structure so they can be aligned and packed together. Implement a REUSE flag to operate as a second chance in buf_recycle() so we don't have to requeue frequently re-used buffers. Move counters to the counter API. Modified: user/jeff/numa/sys/kern/vfs_bio.c user/jeff/numa/sys/kern/vfs_subr.c user/jeff/numa/sys/sys/buf.h user/jeff/numa/sys/sys/bufobj.h Modified: user/jeff/numa/sys/kern/vfs_bio.c ============================================================================== --- user/jeff/numa/sys/kern/vfs_bio.c Mon Feb 5 22:21:51 2018 (r328903) +++ user/jeff/numa/sys/kern/vfs_bio.c Mon Feb 5 23:01:49 2018 (r328904) @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/bio.h> #include <sys/conf.h> +#include <sys/counter.h> #include <sys/buf.h> #include <sys/devicestat.h> #include <sys/eventhandler.h> @@ -105,7 +106,6 @@ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; -struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); @@ -124,11 +124,8 @@ static int vfs_bio_clcheck(struct vnode *vp, int size, static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, int); -static int buf_recycle(bool); -static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); -static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); 
static void bufkva_reclaim(vmem_t *, int); @@ -137,28 +134,17 @@ static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); -#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); -#endif - int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); -static long bufspace; -#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, - &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); -#else -SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, - "Physical memory used for buffers"); -#endif -static long bufkvaspace; -SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, + NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); +static counter_u64_t bufkvaspace; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, @@ -178,11 +164,11 @@ SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &h long bufspacethresh; SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 0, "Bufspace consumed before waking the daemon to free some"); -static int buffreekvacnt; -SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, +static counter_u64_t buffreekvacnt; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, "Number of times we have freed the KVA space 
from some buffer"); -static int bufdefragcnt; -SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, +static counter_u64_t bufdefragcnt; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | @@ -225,24 +211,26 @@ SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "Threshold for clean buffer recycling"); -static int getnewbufcalls; -SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, - "Number of calls to getnewbuf"); -static int getnewbufrestarts; -SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, +static counter_u64_t getnewbufcalls; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, + &getnewbufcalls, "Number of calls to getnewbuf"); +static counter_u64_t getnewbufrestarts; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, + &getnewbufrestarts, "Number of times getnewbuf has had to restart a buffer acquisition"); -static int mappingrestarts; -SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, +static counter_u64_t mappingrestarts; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, + &mappingrestarts, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); -static int numbufallocfails; -SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, - "Number of times buffer allocations failed"); +static counter_u64_t numbufallocfails; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, + &numbufallocfails, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount 
of work to do in flushbufqueues when helping bufdaemon"); -static long notbufdflushes; -SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0, +static counter_u64_t notbufdflushes; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, @@ -266,11 +254,6 @@ static struct mtx_padalign __exclusive_cache_line bdlo static struct mtx_padalign __exclusive_cache_line rbreqlock; /* - * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. - */ -static struct rwlock_padalign __exclusive_cache_line nblock; - -/* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; @@ -283,11 +266,6 @@ static struct mtx_padalign __exclusive_cache_line bdir static int bd_request; /* - * Request/wakeup point for the bufspace daemon. - */ -static int bufspace_request; - -/* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty @@ -302,15 +280,6 @@ static int bd_speedupreq; */ static int runningbufreq; -/* - * Synchronization (sleep/wakeup) variable for buffer requests. - * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done - * by and/or. - * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), - * getnewbuf(), and getblk(). - */ -static volatile int needsbuffer; - /* * Synchronization for bwillwrite() waiters. 
*/ @@ -323,29 +292,65 @@ static int bdirtywait; #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ -#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ +#define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ -/* Maximum number of clean buffer queues. */ -#define CLEAN_QUEUES 16 +struct bufqueue { + struct mtx_padalign bq_lock; + TAILQ_HEAD(, buf) bq_queue; + uint8_t bq_index; + uint16_t bq_cpu; + int bq_len; +} __aligned(CACHE_LINE_SIZE); +#define BQ_LOCKPTR(bq) (&(bq)->bq_lock) +#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) +#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) +#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) + +struct bufqueue __exclusive_cache_line bqempty; +struct bufqueue __exclusive_cache_line bqdirty; + +struct bufdomain { + struct bufqueue bd_cpuq[MAXCPU]; + struct bufqueue bd_cleanq; + /* Constants */ + long bd_maxbufspace; + long bd_hibufspace; + long bd_lobufspace; + long bd_bufspacethresh; + int bd_hifreebuffers; + int bd_lofreebuffers; + int bd_lim; + /* atomics */ + int bd_wanted; + int __aligned(CACHE_LINE_SIZE) bd_request; + long __aligned(CACHE_LINE_SIZE) bd_bufspace; + int __aligned(CACHE_LINE_SIZE) bd_freebuffers; +} __aligned(CACHE_LINE_SIZE); + +#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq.bq_lock) +#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) +#define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) +#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) +#define BD_DOMAIN(bd) (bd - bdclean) + +/* Maximum number of clean buffer domains. */ +#define CLEAN_DOMAINS 8 + /* Configured number of clean queues. */ -static int clean_queues; +static int __read_mostly clean_domains; -/* Maximum number of buffer queues. 
*/ -#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) +struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS]; -/* Queues for free buffers with various properties */ -static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; -#ifdef INVARIANTS -static int bq_len[BUFFER_QUEUES]; -#endif +static void bq_remove(struct bufqueue *bq, struct buf *bp); +static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); +static int buf_recycle(struct bufdomain *, bool kva); +static void bq_init(struct bufqueue *bq, int qindex, int cpu, + const char *lockname); +static void bd_init(struct bufdomain *bd); +static int bd_flushall(struct bufdomain *bd); /* - * Lock for each bufqueue - */ -static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES]; - -/* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; @@ -391,46 +396,34 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; + int i; + lvalue = 0; + for (i = 0; i < clean_domains; i++) + lvalue += bdclean[i].bd_bufspace; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) - return (sysctl_handle_long(oidp, arg1, arg2, req)); - lvalue = *(long *)arg1; + return (sysctl_handle_long(oidp, &lvalue, 0, req)); if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } -#endif - +#else static int -bqcleanq(void) +sysctl_bufspace(SYSCTL_HANDLER_ARGS) { - static int nextq; + long lvalue; + int i; - return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); + lvalue = 0; + for (i = 0; i < clean_domains; i++) + lvalue += bdclean[i].bd_bufspace; + return (sysctl_handle_int(oidp, &lvalue, 0, req)); } +#endif -static int -bqisclean(int qindex) -{ - - return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); -} - /* - * bqlock: - * - * Return the appropriate queue lock based on the index. 
- */ -static inline struct mtx * -bqlock(int qindex) -{ - - return (struct mtx *)&bqlocks[qindex]; -} - -/* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. @@ -481,50 +474,23 @@ bdirtyadd(void) } /* - * bufspace_wakeup: + * bufspace_daemonwakeup: * - * Called when buffer space is potentially available for recovery. - * getnewbuf() will block on this flag when it is unable to free - * sufficient buffer space. Buffer space becomes recoverable when - * bp's get placed back in the queues. + * Wakeup the daemons responsible for freeing clean bufs. */ static void -bufspace_wakeup(void) +bufspace_daemonwakeup(struct bufdomain *bd) { - /* - * If someone is waiting for bufspace, wake them up. - * - * Since needsbuffer is set prior to doing an additional queue - * scan it is safe to check for the flag prior to acquiring the - * lock. The thread that is preparing to scan again before - * blocking would discover the buf we released. - */ - if (needsbuffer) { - rw_rlock(&nblock); - if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) - wakeup(__DEVOLATILE(void *, &needsbuffer)); - rw_runlock(&nblock); + if (atomic_fetchadd_int(&bd->bd_request, 1) == 0) { + BD_LOCK(bd); + bd->bd_request = 1; + wakeup(&bd->bd_request); + BD_UNLOCK(bd); } } /* - * bufspace_daemonwakeup: - * - * Wakeup the daemon responsible for freeing clean bufs. 
- */ -static void -bufspace_daemonwakeup(void) -{ - rw_rlock(&nblock); - if (bufspace_request == 0) { - bufspace_request = 1; - wakeup(&bufspace_request); - } - rw_runlock(&nblock); -} - -/* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly @@ -533,20 +499,22 @@ bufspace_daemonwakeup(void) static void bufspace_adjust(struct buf *bp, int bufsize) { + struct bufdomain *bd; long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); + bd = &bdclean[bp->b_domain]; diff = bufsize - bp->b_bufsize; if (diff < 0) { - atomic_subtract_long(&bufspace, -diff); - bufspace_wakeup(); + atomic_subtract_long(&bd->bd_bufspace, -diff); } else { - space = atomic_fetchadd_long(&bufspace, diff); + space = atomic_fetchadd_long(&bd->bd_bufspace, diff); /* Wake up the daemon on the transition. */ - if (space < bufspacethresh && space + diff >= bufspacethresh) - bufspace_daemonwakeup(); + if (space < bd->bd_bufspacethresh && + space + diff >= bd->bd_bufspacethresh) + bufspace_daemonwakeup(bd); } bp->b_bufsize = bufsize; } @@ -558,24 +526,25 @@ bufspace_adjust(struct buf *bp, int bufsize) * different space limit than data. */ static int -bufspace_reserve(int size, bool metadata) +bufspace_reserve(struct bufdomain *bd, int size, bool metadata) { - long limit; + long limit, new; long space; if (metadata) - limit = maxbufspace; + limit = bd->bd_maxbufspace; else - limit = hibufspace; + limit = bd->bd_hibufspace; do { - space = bufspace; - if (space + size > limit) + space = bd->bd_bufspace; + new = space + size; + if (new > limit) return (ENOSPC); - } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); + } while (atomic_cmpset_long(&bd->bd_bufspace, space, new) == 0); /* Wake up the daemon on the transition. 
*/ - if (space < bufspacethresh && space + size >= bufspacethresh) - bufspace_daemonwakeup(); + if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) + bufspace_daemonwakeup(bd); return (0); } @@ -586,21 +555,22 @@ bufspace_reserve(int size, bool metadata) * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void -bufspace_release(int size) +bufspace_release(struct bufdomain *bd, int size) { - atomic_subtract_long(&bufspace, size); - bufspace_wakeup(); + + atomic_subtract_long(&bd->bd_bufspace, size); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is - * supplied. needsbuffer must be set in a safe fashion prior to - * polling for space. The operation must be re-tried on return. + * supplied. bd_wanted must be set prior to polling for space. The + * operation must be re-tried on return. */ static void -bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) +bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, + int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; @@ -609,11 +579,11 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl return; td = curthread; - rw_wlock(&nblock); - while (needsbuffer != 0) { + BD_LOCK(bd); + while (bd->bd_wanted) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { - rw_wunlock(&nblock); + BD_UNLOCK(bd); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as @@ -636,18 +606,18 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; - rw_wlock(&nblock); + BD_LOCK(bd); if (fl != 0) continue; - if (needsbuffer == 0) + if (bd->bd_wanted == 0) break; } - error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, + error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; 
} - rw_wunlock(&nblock); + BD_UNLOCK(bd); } @@ -659,10 +629,13 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl * block nor work to reclaim buffers. */ static void -bufspace_daemon(void) +bufspace_daemon(void *arg) { + struct bufdomain *bd; + + bd = arg; for (;;) { - kproc_suspend_check(bufspacedaemonproc); + kproc_suspend_check(curproc); /* * Free buffers from the clean queue until we meet our @@ -689,46 +662,35 @@ bufspace_daemon(void) * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ - while (bufspace > lobufspace || - numfreebuffers < hifreebuffers) { - if (buf_recycle(false) != 0) { - atomic_set_int(&needsbuffer, 1); - if (buf_recycle(false) != 0) { - rw_wlock(&nblock); - if (needsbuffer) - rw_sleep(__DEVOLATILE(void *, - &needsbuffer), &nblock, - PRIBIO|PDROP, "bufspace", - hz/10); - else - rw_wunlock(&nblock); - } + do { + if (buf_recycle(bd, false) != 0) { + if (bd_flushall(bd)) + continue; + BD_LOCK(bd); + if (bd->bd_wanted) { + msleep(&bd->bd_wanted, BD_LOCKPTR(bd), + PRIBIO|PDROP, "bufspace", hz/10); + } else + BD_UNLOCK(bd); } maybe_yield(); - } + } while (bd->bd_bufspace > bd->bd_lobufspace || + bd->bd_freebuffers < bd->bd_hifreebuffers); /* - * Re-check our limits under the exclusive nblock. + * Re-check our limits and sleep. 
*/ - rw_wlock(&nblock); - if (bufspace < bufspacethresh && - numfreebuffers > lofreebuffers) { - bufspace_request = 0; - rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, + BD_LOCK(bd); + if (bd->bd_bufspace < bd->bd_bufspacethresh && + bd->bd_freebuffers > bd->bd_lofreebuffers) { + bd->bd_request = 0; + msleep(&bd->bd_request, BD_LOCKPTR(bd), PRIBIO|PDROP, "-", hz); } else - rw_wunlock(&nblock); + BD_UNLOCK(bd); } } -static struct kproc_desc bufspace_kp = { - "bufspacedaemon", - bufspace_daemon, - &bufspacedaemonproc -}; -SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, - &bufspace_kp); - /* * bufmallocadjust: * @@ -1038,38 +1000,32 @@ bufinit(void) KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); - mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); - mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); - for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) - mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); + bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); + bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock"); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); - rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); - /* next, make a null set of free lists */ - for (i = 0; i < BUFFER_QUEUES; i++) - TAILQ_INIT(&bufqueues[i]); - unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ + BQ_LOCK(&bqempty); for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; - bp->b_qindex = QUEUE_EMPTY; + bp->b_qindex = QUEUE_NONE; + bp->b_domain = -1; + bp->b_cpu = -1; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], 
bp, b_freelist); -#ifdef INVARIANTS - bq_len[QUEUE_EMPTY]++; -#endif + bq_insert(&bqempty, bp, false); } + BQ_UNLOCK(&bqempty); /* * maxbufspace is the absolute maximum amount of buffer space we are @@ -1150,8 +1106,31 @@ bufinit(void) * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ - clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); + clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS); + for (i = 0 ; i < clean_domains; i++) { + struct bufdomain *bd; + bd = &bdclean[i]; + bd_init(bd); + bd->bd_freebuffers = nbuf / clean_domains; + bd->bd_hifreebuffers = hifreebuffers / clean_domains; + bd->bd_lofreebuffers = lofreebuffers / clean_domains; + bd->bd_bufspace = 0; + bd->bd_maxbufspace = maxbufspace / clean_domains; + bd->bd_hibufspace = hibufspace / clean_domains; + bd->bd_lobufspace = lobufspace / clean_domains; + bd->bd_bufspacethresh = bufspacethresh / clean_domains; + /* Don't allow more than 2% of bufs in the per-cpu caches. 
*/ + bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus; + } + getnewbufcalls = counter_u64_alloc(M_WAITOK); + getnewbufrestarts = counter_u64_alloc(M_WAITOK); + mappingrestarts = counter_u64_alloc(M_WAITOK); + numbufallocfails = counter_u64_alloc(M_WAITOK); + notbufdflushes = counter_u64_alloc(M_WAITOK); + buffreekvacnt = counter_u64_alloc(M_WAITOK); + bufdefragcnt = counter_u64_alloc(M_WAITOK); + bufkvaspace = counter_u64_alloc(M_WAITOK); } #ifdef INVARIANTS @@ -1326,58 +1305,77 @@ bpmap_qenter(struct buf *bp) (vm_offset_t)(bp->b_offset & PAGE_MASK)); } +static struct bufqueue * +bufqueue(struct buf *bp) +{ + struct bufdomain *bd; + + switch (bp->b_qindex) { + case QUEUE_NONE: + /* FALLTHROUGH */ + case QUEUE_SENTINEL: + return (NULL); + case QUEUE_EMPTY: + return (&bqempty); + case QUEUE_DIRTY: + return (&bqdirty); + case QUEUE_CLEAN: + /* FALLTHROUGH */ + break; + default: + panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); + } + bd = &bdclean[bp->b_domain]; + if (bp->b_cpu > mp_maxid) + return (&bd->bd_cleanq); + return (&bd->bd_cpuq[bp->b_cpu]); + +} + /* * binsfree: * - * Insert the buffer into the appropriate free list. + * Insert the buffer into the appropriate free list. Requires a + * locked buffer on entry and buffer is unlocked before return. */ static void binsfree(struct buf *bp, int qindex) { - struct mtx *olock, *nlock; + struct bufdomain *bd; + struct bufqueue *bq; - if (qindex != QUEUE_EMPTY) { - BUF_ASSERT_XLOCKED(bp); - } + KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, + ("binsfree: Invalid qindex %d", qindex)); + BUF_ASSERT_XLOCKED(bp); /* - * Stick to the same clean queue for the lifetime of the buf to - * limit locking below. Otherwise pick ont sequentially. - */ - if (qindex == QUEUE_CLEAN) { - if (bqisclean(bp->b_qindex)) - qindex = bp->b_qindex; - else - qindex = bqcleanq(); - } - - /* * Handle delayed bremfree() processing. 
*/ - nlock = bqlock(qindex); if (bp->b_flags & B_REMFREE) { - olock = bqlock(bp->b_qindex); - mtx_lock(olock); - bremfreel(bp); - if (olock != nlock) { - mtx_unlock(olock); - mtx_lock(nlock); + if (bp->b_qindex == qindex) { + bp->b_flags |= B_REUSE; + bp->b_flags &= ~B_REMFREE; + BUF_UNLOCK(bp); + return; } + bq = bufqueue(bp); + BQ_LOCK(bq); + bq_remove(bq, bp); + BQ_UNLOCK(bq); + } + if (qindex == QUEUE_CLEAN) { + bd = &bdclean[bp->b_domain]; + if (bd->bd_lim != 0) + bq = &bd->bd_cpuq[PCPU_GET(cpuid)]; + else + bq = &bd->bd_cleanq; } else - mtx_lock(nlock); + bq = &bqdirty; + BQ_LOCK(bq); + bq_insert(bq, bp, true); + BQ_UNLOCK(bq); - if (bp->b_qindex != QUEUE_NONE) - panic("binsfree: free buffer onto another queue???"); - - bp->b_qindex = qindex; - if (bp->b_flags & B_AGE) - TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); - else - TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); -#ifdef INVARIANTS - bq_len[bp->b_qindex]++; -#endif - mtx_unlock(nlock); + return; } /* @@ -1404,10 +1402,9 @@ buf_free(struct buf *bp) if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); + atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); - atomic_add_int(&numfreebuffers, 1); - bufspace_wakeup(); } /* @@ -1424,15 +1421,15 @@ buf_import(void *arg, void **store, int cnt, int domai struct buf *bp; int i; - mtx_lock(&bqlocks[QUEUE_EMPTY]); + BQ_LOCK(&bqempty); for (i = 0; i < cnt; i++) { - bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + bp = TAILQ_FIRST(&bqempty.bq_queue); if (bp == NULL) break; - bremfreel(bp); + bq_remove(&bqempty, bp); store[i] = bp; } - mtx_unlock(&bqlocks[QUEUE_EMPTY]); + BQ_UNLOCK(&bqempty); return (i); } @@ -1447,8 +1444,10 @@ buf_release(void *arg, void **store, int cnt) { int i; + BQ_LOCK(&bqempty); for (i = 0; i < cnt; i++) - binsfree(store[i], QUEUE_EMPTY); + bq_insert(&bqempty, store[i], false); + BQ_UNLOCK(&bqempty); } /* @@ -1457,22 +1456,31 @@ buf_release(void *arg, void 
**store, int cnt) * Allocate an empty buffer header. */ static struct buf * -buf_alloc(void) +buf_alloc(struct bufdomain *bd) { struct buf *bp; + int freebufs; - bp = uma_zalloc(buf_zone, M_NOWAIT); + /* + * We can only run out of bufs in the buf zone if the average buf + * is less than BKVASIZE. In this case the actual wait/block will + * come from buf_reycle() failing to flush one of these small bufs. + */ + bp = NULL; + freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); + if (freebufs > 0) + bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { - bufspace_daemonwakeup(); - atomic_add_int(&numbufallocfails, 1); + atomic_fetchadd_int(&bd->bd_freebuffers, 1); + bufspace_daemonwakeup(bd); + counter_u64_add(numbufallocfails, 1); return (NULL); } - /* - * Wake-up the bufspace daemon on transition. + * Wake-up the bufspace daemon on transition below threshold. */ - if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) - bufspace_daemonwakeup(); + if (freebufs == bd->bd_lofreebuffers) + bufspace_daemonwakeup(bd); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); @@ -1488,6 +1496,7 @@ buf_alloc(void) KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); + bp->b_domain = BD_DOMAIN(bd); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; @@ -1512,22 +1521,26 @@ buf_alloc(void) } /* - * buf_qrecycle: + * buf_recycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. 
*/ static int -buf_qrecycle(int qindex, bool kva) +buf_recycle(struct bufdomain *bd, bool kva) { + struct bufqueue *bq; struct buf *bp, *nbp; if (kva) - atomic_add_int(&bufdefragcnt, 1); + counter_u64_add(bufdefragcnt, 1); nbp = NULL; - mtx_lock(&bqlocks[qindex]); - nbp = TAILQ_FIRST(&bufqueues[qindex]); + bq = &bd->bd_cleanq; + BQ_LOCK(bq); + KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), + ("buf_recycle: Locks don't match")); + nbp = TAILQ_FIRST(&bq->bq_queue); /* * Run scan, possibly freeing data and/or kva mappings on the fly @@ -1551,6 +1564,18 @@ buf_qrecycle(int qindex, bool kva) continue; /* + * Implement a second chance algorithm for frequently + * accessed buffers. + */ + if ((bp->b_flags & B_REUSE) != 0) { + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + bp->b_flags &= ~B_REUSE; + BUF_UNLOCK(bp); + continue; + } + + /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { @@ -1558,14 +1583,18 @@ buf_qrecycle(int qindex, bool kva) continue; } - KASSERT(bp->b_qindex == qindex, - ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); + KASSERT(bp->b_qindex == QUEUE_CLEAN, + ("buf_recycle: inconsistent queue %d bp %p", + bp->b_qindex, bp)); + KASSERT(bp->b_domain == BD_DOMAIN(bd), + ("getnewbuf: queue domain %d doesn't match request %ld", + bp->b_domain, BD_DOMAIN(bd))); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. 
*/ - bremfreel(bp); - mtx_unlock(&bqlocks[qindex]); + bq_remove(bq, bp); + BQ_UNLOCK(bq); /* * Requeue the background write buffer with error and @@ -1573,70 +1602,21 @@ buf_qrecycle(int qindex, bool kva) */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); - mtx_lock(&bqlocks[qindex]); - nbp = TAILQ_FIRST(&bufqueues[qindex]); + BQ_LOCK(bq); + nbp = TAILQ_FIRST(&bq->bq_queue); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } - mtx_unlock(&bqlocks[qindex]); + bd->bd_wanted = 1; + BQ_UNLOCK(bq); return (ENOBUFS); } /* - * buf_recycle: - * - * Iterate through all clean queues until we find a buf to recycle or - * exhaust the search. - */ -static int -buf_recycle(bool kva) -{ - int qindex, first_qindex; - - qindex = first_qindex = bqcleanq(); - do { - if (buf_qrecycle(qindex, kva) == 0) - return (0); - if (++qindex == QUEUE_CLEAN + clean_queues) - qindex = QUEUE_CLEAN; - } while (qindex != first_qindex); - - return (ENOBUFS); -} - -/* - * buf_scan: - * - * Scan the clean queues looking for a buffer to recycle. needsbuffer - * is set on failure so that the caller may optionally bufspace_wait() - * in a race-free fashion. - */ -static int -buf_scan(bool defrag) -{ - int error; - - /* - * To avoid heavy synchronization and wakeup races we set - * needsbuffer and re-poll before failing. This ensures that - * no frees can be missed between an unsuccessful poll and - * going to sleep in a synchronized fashion. - */ - if ((error = buf_recycle(defrag)) != 0) { - atomic_set_int(&needsbuffer, 1); - bufspace_daemonwakeup(); - error = buf_recycle(defrag); - } - if (error == 0) - atomic_add_int(&getnewbufrestarts, 1); - return (error); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201802052301.w15N1nEv036995>