From: Mateusz Guzik <mjg@FreeBSD.org>
Date: Fri, 30 Sep 2016 17:27:18 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r306512 - in head/sys: kern sys

Author: mjg
Date: Fri Sep 30 17:27:17 2016
New Revision: 306512
URL: https://svnweb.freebsd.org/changeset/base/306512

Log:
  vfs: batch free vnodes in per-mnt lists

  Previously free vnodes would always be returned directly to the global
  LRU list. With this change, up to mnt_free_list_batch vnodes are
  collected on a per-mount list first. Syncer runs always return the
  batch regardless of its size.

  While vnodes on per-mnt lists are not counted as free, they can still
  be returned in case of a vnode shortage.

  Reviewed by:	kib
  Tested by:	pho
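The idea behind the change can be shown with a small user-space model: instead of pushing every freed node straight onto a global LRU protected by a single mutex, nodes are first staged on a per-mount list and only moved to the global list once a batch fills up, or when a syncer-style flush runs. This is only a sketch of the pattern, not the kernel code; struct mnt, node_free() and BATCH are hypothetical stand-ins for struct mount, the vdrop() path and vfs.mnt_free_list_batch.

/* Hypothetical user-space model of per-mount free-list batching (not kernel code). */
#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define	BATCH	128		/* mirrors the default of vfs.mnt_free_list_batch */

struct node {
	TAILQ_ENTRY(node) link;
	int id;
};
TAILQ_HEAD(nodelist, node);

static struct nodelist global_free = TAILQ_HEAD_INITIALIZER(global_free);
static pthread_mutex_t global_mtx = PTHREAD_MUTEX_INITIALIZER;
static unsigned long global_freecount;

struct mnt {
	struct nodelist	tmpfree;	/* per-mount staging list */
	unsigned long	tmpfreecount;
	pthread_mutex_t	listmtx;	/* protects only this mount's list */
};

/* Move the whole staged batch to the global list; caller holds listmtx. */
static void
mnt_return_batch_locked(struct mnt *mp)
{
	pthread_mutex_lock(&global_mtx);
	TAILQ_CONCAT(&global_free, &mp->tmpfree, link);
	global_freecount += mp->tmpfreecount;
	mp->tmpfreecount = 0;
	pthread_mutex_unlock(&global_mtx);
}

/* Syncer-style flush: return whatever is staged, regardless of size. */
static void
mnt_return_batch(struct mnt *mp)
{
	pthread_mutex_lock(&mp->listmtx);
	mnt_return_batch_locked(mp);
	pthread_mutex_unlock(&mp->listmtx);
}

/* Free a node: stage it on the per-mount list, flush when the batch is full. */
static void
node_free(struct mnt *mp, struct node *np)
{
	pthread_mutex_lock(&mp->listmtx);
	TAILQ_INSERT_TAIL(&mp->tmpfree, np, link);
	if (++mp->tmpfreecount >= BATCH)
		mnt_return_batch_locked(mp);
	pthread_mutex_unlock(&mp->listmtx);
}

int
main(void)
{
	struct mnt m;
	struct node *np;
	int i;

	TAILQ_INIT(&m.tmpfree);
	m.tmpfreecount = 0;
	pthread_mutex_init(&m.listmtx, NULL);

	for (i = 0; i < 1000; i++) {
		if ((np = malloc(sizeof(*np))) == NULL)
			abort();
		np->id = i;
		node_free(&m, np);
	}
	mnt_return_batch(&m);
	printf("nodes on global free list: %lu\n", global_freecount);
	return (0);
}

The split between mnt_return_batch() and mnt_return_batch_locked() mirrors the patch below: the per-mount lock is taken on every free, while the global lock is only taken once per batch.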
Modified:
  head/sys/kern/vfs_mount.c
  head/sys/kern/vfs_subr.c
  head/sys/sys/mount.h
  head/sys/sys/vnode.h

Modified: head/sys/kern/vfs_mount.c
==============================================================================
--- head/sys/kern/vfs_mount.c	Fri Sep 30 17:19:43 2016	(r306511)
+++ head/sys/kern/vfs_mount.c	Fri Sep 30 17:27:17 2016	(r306512)
@@ -109,6 +109,7 @@ mount_init(void *mem, int size, int flag
 
 	mp = (struct mount *)mem;
 	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
+	mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
 	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
 	return (0);
 }
@@ -120,6 +121,7 @@ mount_fini(void *mem, int size)
 
 	mp = (struct mount *)mem;
 	lockdestroy(&mp->mnt_explock);
+	mtx_destroy(&mp->mnt_listmtx);
 	mtx_destroy(&mp->mnt_mtx);
 }
 
@@ -461,6 +463,8 @@ vfs_mount_alloc(struct vnode *vp, struct
 	mp->mnt_nvnodelistsize = 0;
 	TAILQ_INIT(&mp->mnt_activevnodelist);
 	mp->mnt_activevnodelistsize = 0;
+	TAILQ_INIT(&mp->mnt_tmpfreevnodelist);
+	mp->mnt_tmpfreevnodelistsize = 0;
 	mp->mnt_ref = 0;
 	(void) vfs_busy(mp, MBF_NOWAIT);
 	atomic_add_acq_int(&vfsp->vfc_refcount, 1);

Modified: head/sys/kern/vfs_subr.c
==============================================================================
--- head/sys/kern/vfs_subr.c	Fri Sep 30 17:19:43 2016	(r306511)
+++ head/sys/kern/vfs_subr.c	Fri Sep 30 17:27:17 2016	(r306512)
@@ -112,6 +112,7 @@ static void	vfs_knllock(void *arg);
 static void	vfs_knlunlock(void *arg);
 static void	vfs_knl_assert_locked(void *arg);
 static void	vfs_knl_assert_unlocked(void *arg);
+static void	vnlru_return_batches(struct vfsops *mnt_op);
 static void	destroy_vpollinfo(struct vpollinfo *vi);
 
 /*
@@ -127,6 +128,10 @@ static u_long vnodes_created;
 SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 0,
     "Number of vnodes created by getnewvnode");
 
+static u_long mnt_free_list_batch = 128;
+SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW,
+    &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list");
+
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.
@@ -953,7 +958,9 @@ vnlru_free_locked(int count, struct vfso
 {
 	struct vnode *vp;
 	struct mount *mp;
+	bool tried_batches;
 
+	tried_batches = false;
 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 	if (count > max_vnlru_free)
 		count = max_vnlru_free;
@@ -963,8 +970,16 @@ vnlru_free_locked(int count, struct vfso
 		 * The list can be modified while the free_list_mtx
 		 * has been dropped and vp could be NULL here.
 		 */
-		if (!vp)
-			break;
+		if (vp == NULL) {
+			if (tried_batches)
+				break;
+			mtx_unlock(&vnode_free_list_mtx);
+			vnlru_return_batches(mnt_op);
+			tried_batches = true;
+			mtx_lock(&vnode_free_list_mtx);
+			continue;
+		}
+
 		VNASSERT(vp->v_op != NULL, vp,
 		    ("vnlru_free: vnode already reclaimed."));
 		KASSERT((vp->v_iflag & VI_FREE) != 0,
@@ -1041,6 +1056,63 @@ vspace(void)
 	return (space);
 }
 
+static void
+vnlru_return_batch_locked(struct mount *mp)
+{
+	struct vnode *vp;
+
+	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+
+	if (mp->mnt_tmpfreevnodelistsize == 0)
+		return;
+
+	mtx_lock(&vnode_free_list_mtx);
+	TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) {
+		VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp,
+		    ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist"));
+		vp->v_mflag &= ~VMP_TMPMNTFREELIST;
+	}
+	TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist);
+	freevnodes += mp->mnt_tmpfreevnodelistsize;
+	mp->mnt_tmpfreevnodelistsize = 0;
+	mtx_unlock(&vnode_free_list_mtx);
+}
+
+static void
+vnlru_return_batch(struct mount *mp)
+{
+
+	mtx_lock(&mp->mnt_listmtx);
+	vnlru_return_batch_locked(mp);
+	mtx_unlock(&mp->mnt_listmtx);
+}
+
+static void
+vnlru_return_batches(struct vfsops *mnt_op)
+{
+	struct mount *mp, *nmp;
+	bool need_unbusy;
+
+	mtx_lock(&mountlist_mtx);
+	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+		need_unbusy = false;
+		if (mnt_op != NULL && mp->mnt_op != mnt_op)
+			goto next;
+		if (mp->mnt_tmpfreevnodelistsize == 0)
+			goto next;
+		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
+			vnlru_return_batch(mp);
+			need_unbusy = true;
+			mtx_lock(&mountlist_mtx);
+		}
+next:
+		nmp = TAILQ_NEXT(mp, mnt_list);
+		if (need_unbusy)
+			vfs_unbusy(mp);
+	}
+	mtx_unlock(&mountlist_mtx);
+}
+
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrurecycle() from the bowels of filesystem code has some
@@ -1068,9 +1140,8 @@ vnlru_proc(void)
 		 * adjusted using its sysctl, or emergency growth), first
 		 * try to reduce it by discarding from the free list.
 		 */
-		if (numvnodes > desiredvnodes && freevnodes > 0)
-			vnlru_free_locked(ulmin(numvnodes - desiredvnodes,
-			    freevnodes), NULL);
+		if (numvnodes > desiredvnodes)
+			vnlru_free_locked(numvnodes - desiredvnodes, NULL);
 		/*
 		 * Sleep if the vnode cache is in a good state. This is
 		 * when it is not over-full and has space for about a 4%
@@ -1457,10 +1528,10 @@ delmntque(struct vnode *vp)
 	active = vp->v_iflag & VI_ACTIVE;
 	vp->v_iflag &= ~VI_ACTIVE;
 	if (active) {
-		mtx_lock(&vnode_free_list_mtx);
+		mtx_lock(&mp->mnt_listmtx);
 		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
 		mp->mnt_activevnodelistsize--;
-		mtx_unlock(&vnode_free_list_mtx);
+		mtx_unlock(&mp->mnt_listmtx);
 	}
 	vp->v_mount = NULL;
 	VI_UNLOCK(vp);
@@ -1525,10 +1596,10 @@ insmntque1(struct vnode *vp, struct moun
 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
 	    ("Activating already active vnode"));
 	vp->v_iflag |= VI_ACTIVE;
-	mtx_lock(&vnode_free_list_mtx);
+	mtx_lock(&mp->mnt_listmtx);
 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
 	mp->mnt_activevnodelistsize++;
-	mtx_unlock(&vnode_free_list_mtx);
+	mtx_unlock(&mp->mnt_listmtx);
 	VI_UNLOCK(vp);
 	MNT_IUNLOCK(mp);
 	return (0);
@@ -2753,17 +2824,25 @@ _vhold(struct vnode *vp, bool locked)
 	 * Remove a vnode from the free list, mark it as in use,
 	 * and put it on the active list.
 	 */
-	mtx_lock(&vnode_free_list_mtx);
-	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
-	freevnodes--;
-	vp->v_iflag &= ~VI_FREE;
+	mp = vp->v_mount;
+	mtx_lock(&mp->mnt_listmtx);
+	if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) {
+		TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
+		mp->mnt_tmpfreevnodelistsize--;
+		vp->v_mflag &= ~VMP_TMPMNTFREELIST;
+	} else {
+		mtx_lock(&vnode_free_list_mtx);
+		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
+		freevnodes--;
+		mtx_unlock(&vnode_free_list_mtx);
+	}
 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
 	    ("Activating already active vnode"));
+	vp->v_iflag &= ~VI_FREE;
 	vp->v_iflag |= VI_ACTIVE;
-	mp = vp->v_mount;
 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
 	mp->mnt_activevnodelistsize++;
-	mtx_unlock(&vnode_free_list_mtx);
+	mtx_unlock(&mp->mnt_listmtx);
 	refcount_acquire(&vp->v_holdcnt);
 	if (!locked)
 		VI_UNLOCK(vp);
@@ -2819,21 +2898,25 @@ _vdrop(struct vnode *vp, bool locked)
 		if ((vp->v_iflag & VI_OWEINACT) == 0) {
 			vp->v_iflag &= ~VI_ACTIVE;
 			mp = vp->v_mount;
-			mtx_lock(&vnode_free_list_mtx);
+			mtx_lock(&mp->mnt_listmtx);
 			if (active) {
 				TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
 				    v_actfreelist);
 				mp->mnt_activevnodelistsize--;
 			}
-			TAILQ_INSERT_TAIL(&vnode_free_list, vp,
+			TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp,
 			    v_actfreelist);
-			freevnodes++;
+			mp->mnt_tmpfreevnodelistsize++;
 			vp->v_iflag |= VI_FREE;
-			mtx_unlock(&vnode_free_list_mtx);
+			vp->v_mflag |= VMP_TMPMNTFREELIST;
+			VI_UNLOCK(vp);
+			if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch)
+				vnlru_return_batch_locked(mp);
+			mtx_unlock(&mp->mnt_listmtx);
 		} else {
+			VI_UNLOCK(vp);
 			atomic_add_long(&free_owe_inact, 1);
 		}
-		VI_UNLOCK(vp);
 		return;
 	}
 	/*
@@ -3926,6 +4009,9 @@ vfs_msync(struct mount *mp, int flags)
 	struct vm_object *obj;
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+
+	vnlru_return_batch(mp);
+
 	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
 		obj = vp->v_object;
 		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
@@ -5236,7 +5322,7 @@ mnt_vnode_next_active(struct vnode **mvp
 {
 	struct vnode *vp, *nvp;
 
-	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 restart:
 	vp = TAILQ_NEXT(*mvp, v_actfreelist);
@@ -5249,9 +5335,9 @@ restart:
 		if (!VI_TRYLOCK(vp)) {
 			if (mp_ncpus == 1 || should_yield()) {
 				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
-				mtx_unlock(&vnode_free_list_mtx);
+				mtx_unlock(&mp->mnt_listmtx);
 				pause("vnacti", 1);
-				mtx_lock(&vnode_free_list_mtx);
+				mtx_lock(&mp->mnt_listmtx);
 				goto restart;
 			}
 			continue;
@@ -5268,12 +5354,12 @@ restart:
 
 	/* Check if we are done */
 	if (vp == NULL) {
-		mtx_unlock(&vnode_free_list_mtx);
+		mtx_unlock(&mp->mnt_listmtx);
 		mnt_vnode_markerfree_active(mvp, mp);
 		return (NULL);
 	}
 	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
-	mtx_unlock(&vnode_free_list_mtx);
+	mtx_unlock(&mp->mnt_listmtx);
 	ASSERT_VI_LOCKED(vp, "active iter");
 	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
 	return (vp);
@@ -5285,7 +5371,7 @@ __mnt_vnode_next_active(struct vnode **m
 
 	if (should_yield())
 		kern_yield(PRI_USER);
-	mtx_lock(&vnode_free_list_mtx);
+	mtx_lock(&mp->mnt_listmtx);
 	return (mnt_vnode_next_active(mvp, mp));
 }
 
@@ -5301,10 +5387,10 @@ __mnt_vnode_first_active(struct vnode **
 	(*mvp)->v_type = VMARKER;
 	(*mvp)->v_mount = mp;
 
-	mtx_lock(&vnode_free_list_mtx);
+	mtx_lock(&mp->mnt_listmtx);
 	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
 	if (vp == NULL) {
-		mtx_unlock(&vnode_free_list_mtx);
+		mtx_unlock(&mp->mnt_listmtx);
 		mnt_vnode_markerfree_active(mvp, mp);
 		return (NULL);
 	}
@@ -5319,8 +5405,8 @@ __mnt_vnode_markerfree_active(struct vno
 	if (*mvp == NULL)
 		return;
 
-	mtx_lock(&vnode_free_list_mtx);
+	mtx_lock(&mp->mnt_listmtx);
 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
-	mtx_unlock(&vnode_free_list_mtx);
+	mtx_unlock(&mp->mnt_listmtx);
 	mnt_vnode_markerfree_active(mvp, mp);
 }

Modified: head/sys/sys/mount.h
==============================================================================
--- head/sys/sys/mount.h	Fri Sep 30 17:19:43 2016	(r306511)
+++ head/sys/sys/mount.h	Fri Sep 30 17:27:17 2016	(r306512)
@@ -147,6 +147,7 @@ struct vfsopt {
 * put on a doubly linked list.
 *
 * Lock reference:
+ *	l - mnt_listmtx
 *	m - mountlist_mtx
 *	i - interlock
 *	v - vnode freelist mutex
@@ -166,8 +167,6 @@ struct mount {
 	int		mnt_ref;		/* (i) Reference count */
 	struct vnodelst	mnt_nvnodelist;		/* (i) list of vnodes */
 	int		mnt_nvnodelistsize;	/* (i) # of vnodes */
-	struct vnodelst	mnt_activevnodelist;	/* (v) list of active vnodes */
-	int		mnt_activevnodelistsize;/* (v) # of active vnodes */
 	int		mnt_writeopcount;	/* (i) write syscalls pending */
 	int		mnt_kern_flag;		/* (i) kernel only flags */
 	uint64_t	mnt_flag;		/* (i) flags shared with user */
@@ -188,6 +187,11 @@ struct mount {
 	struct thread	*mnt_susp_owner;	/* (i) thread owning suspension */
 #define	mnt_endzero	mnt_gjprovider
 	char		*mnt_gjprovider;	/* gjournal provider name */
+	struct mtx	mnt_listmtx;
+	struct vnodelst	mnt_activevnodelist;	/* (l) list of active vnodes */
+	int		mnt_activevnodelistsize;/* (l) # of active vnodes */
+	struct vnodelst	mnt_tmpfreevnodelist;	/* (l) list of free vnodes */
+	int		mnt_tmpfreevnodelistsize;/* (l) # of free vnodes */
 	struct lock	mnt_explock;		/* vfs_export walkers lock */
 	TAILQ_ENTRY(mount) mnt_upper_link;	/* (m) we in the all uppers */
 	TAILQ_HEAD(, mount) mnt_uppers;		/* (m) upper mounts over us */

Modified: head/sys/sys/vnode.h
==============================================================================
--- head/sys/sys/vnode.h	Fri Sep 30 17:19:43 2016	(r306511)
+++ head/sys/sys/vnode.h	Fri Sep 30 17:27:17 2016	(r306512)
@@ -75,8 +75,8 @@ struct vpollinfo {
 *
 * Lock reference:
 *	c - namecache mutex
- *	f - freelist mutex
 *	i - interlock
+ *	l - mp mnt_listmtx or freelist mutex
 *	I - updated with atomics, 0->1 and 1->0 transitions with interlock held
 *	m - mount point interlock
 *	p - pollinfo lock
@@ -144,7 +144,7 @@ struct vnode {
 	/*
 	 * The machinery of being a vnode
 	 */
-	TAILQ_ENTRY(vnode) v_actfreelist;	/* f vnode active/free lists */
+	TAILQ_ENTRY(vnode) v_actfreelist;	/* l vnode active/free lists */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
@@ -167,6 +167,7 @@ struct vnode {
 	u_int	v_usecount;			/* I ref count of users */
 	u_int	v_iflag;			/* i vnode flags (see below) */
 	u_int	v_vflag;			/* v vnode flags */
+	u_int	v_mflag;			/* l mnt-specific vnode flags */
 	int	v_writecount;			/* v ref count of writers */
 	u_int	v_hash;
 	enum	vtype v_type;			/* u vnode type */
@@ -256,6 +257,8 @@ struct xvnode {
 #define	VV_MD		0x0800	/* vnode backs the md device */
 #define	VV_FORCEINSMQ	0x1000	/* force the insmntque to succeed */
 
+#define	VMP_TMPMNTFREELIST	0x0001	/* Vnode is on mnt's tmp free list */
+
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
  * is unavailable (getattr) or which is not to be changed (setattr).
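The batch size itself is exported as a read-write sysctl, vfs.mnt_free_list_batch (default 128). A minimal user-space sketch of inspecting and tuning it with sysctlbyname(3), assuming a kernel that includes this revision; the value 256 below is only an example, not a recommendation:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	unsigned long batch, newbatch = 256;	/* example value only */
	size_t len = sizeof(batch);

	/* Read the current per-mount free-list batch limit. */
	if (sysctlbyname("vfs.mnt_free_list_batch", &batch, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("vfs.mnt_free_list_batch = %lu\n", batch);

	/* Writing requires root; a larger batch means fewer trips to the
	   global free-list lock but more vnodes parked per mount between
	   flushes. */
	if (sysctlbyname("vfs.mnt_free_list_batch", NULL, NULL, &newbatch,
	    sizeof(newbatch)) == -1)
		perror("sysctlbyname (set)");
	return (0);
}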