Date:      Mon, 16 Sep 2019 21:31:02 +0000 (UTC)
From:      Mateusz Guzik <mjg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r352424 - in head/sys: cddl/compat/opensolaris/kern kern sys vm
Message-ID:  <201909162131.x8GLV2bt001656@repo.freebsd.org>

Author: mjg
Date: Mon Sep 16 21:31:02 2019
New Revision: 352424
URL: https://svnweb.freebsd.org/changeset/base/352424

Log:
  vfs: manage mnt_ref with atomics
  
  A new primitive is introduced to denote sections which can operate
  locklessly on aspects of struct mount, but which can also be disabled if
  necessary. This provides an opportunity to start scaling common-case
  modifications while providing a stable state of the struct when facing
  unmount, write suspension or other events.
  
  mnt_ref is the first counter to start being managed in this manner with
  the intent to make it per-cpu.
  
  Reviewed by:	kib, jeff
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D21425
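
  An illustrative sketch (not part of the commit message) of the intended
  pairing: code which needs the counters stable brackets its work with the
  new calls, as dounmount() does in the diff below.

	vfs_op_enter(mp);	/* disable lockless updates, drain all CPUs */
	MNT_ILOCK(mp);
	/*
	 * Until vfs_op_exit(), mnt_ref can only change under MNT_ILOCK(),
	 * so the value inspected here cannot change behind our back.
	 */
	MNT_IUNLOCK(mp);
	vfs_op_exit(mp);	/* re-enable the lockless fast path */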

Modified:
  head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
  head/sys/kern/subr_pcpu.c
  head/sys/kern/vfs_default.c
  head/sys/kern/vfs_mount.c
  head/sys/kern/vfs_mountroot.c
  head/sys/kern/vfs_subr.c
  head/sys/sys/mount.h
  head/sys/vm/uma.h

Modified: head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
==============================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c	Mon Sep 16 20:43:20 2019	(r352423)
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c	Mon Sep 16 21:31:02 2019	(r352424)
@@ -242,6 +242,7 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const cha
 	if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
 		panic("mount: lost mount");
 	VOP_UNLOCK(vp, 0);
+	vfs_op_exit(mp);
 	vfs_unbusy(mp);
 	*vpp = mvp;
 	return (0);

Modified: head/sys/kern/subr_pcpu.c
==============================================================================
--- head/sys/kern/subr_pcpu.c	Mon Sep 16 20:43:20 2019	(r352423)
+++ head/sys/kern/subr_pcpu.c	Mon Sep 16 21:31:02 2019	(r352424)
@@ -131,15 +131,19 @@ SYSINIT(dpcpu, SI_SUB_KLD, SI_ORDER_FIRST, dpcpu_start
 
 /*
  * UMA_PCPU_ZONE zones, that are available for all kernel
- * consumers. Right now 64 bit zone is used for counter(9).
+ * consumers. Right now 64 bit zone is used for counter(9)
+ * and int zone is used for mount point counters.
  */
 
+uma_zone_t pcpu_zone_int;
 uma_zone_t pcpu_zone_64;
 
 static void
 pcpu_zones_startup(void)
 {
 
+	pcpu_zone_int = uma_zcreate("int pcpu", sizeof(int),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU);
 	pcpu_zone_64 = uma_zcreate("64 pcpu", sizeof(uint64_t),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU);
 }
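
A minimal usage sketch (illustrative only) for the new int-sized per-CPU
zone; mount_init()/mount_fini() and the vfs_op_thread_enter() macro in the
diffs below are the real consumers:

	int *slots;

	slots = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO);
	critical_enter();			/* stay on the current CPU */
	*(int *)zpcpu_get(slots) = 1;		/* touch only this CPU's slot */
	critical_exit();
	uma_zfree_pcpu(pcpu_zone_int, slots);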

Modified: head/sys/kern/vfs_default.c
==============================================================================
--- head/sys/kern/vfs_default.c	Mon Sep 16 20:43:20 2019	(r352423)
+++ head/sys/kern/vfs_default.c	Mon Sep 16 21:31:02 2019	(r352424)
@@ -601,17 +601,24 @@ vop_stdgetwritemount(ap)
 	 */
 	vp = ap->a_vp;
 	mp = vp->v_mount;
-	if (mp == NULL)
-		goto out;
-	MNT_ILOCK(mp);
-	if (mp != vp->v_mount) {
+	if (mp == NULL) {
+		*(ap->a_mpp) = NULL;
+		return (0);
+	}
+	if (vfs_op_thread_enter(mp)) {
+		if (mp == vp->v_mount)
+			MNT_REF_UNLOCKED(mp);
+		else
+			mp = NULL;
+		vfs_op_thread_exit(mp);
+	} else {
+		MNT_ILOCK(mp);
+		if (mp == vp->v_mount)
+			MNT_REF(mp);
+		else
+			mp = NULL;
 		MNT_IUNLOCK(mp);
-		mp = NULL;
-		goto out;
 	}
-	MNT_REF(mp);
-	MNT_IUNLOCK(mp);
-out:
 	*(ap->a_mpp) = mp;
 	return (0);
 }

Modified: head/sys/kern/vfs_mount.c
==============================================================================
--- head/sys/kern/vfs_mount.c	Mon Sep 16 20:43:20 2019	(r352423)
+++ head/sys/kern/vfs_mount.c	Mon Sep 16 21:31:02 2019	(r352424)
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/conf.h>
+#include <sys/smp.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
@@ -123,6 +124,10 @@ mount_init(void *mem, int size, int flags)
 	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
 	mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
 	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
+	mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_int,
+	    M_WAITOK | M_ZERO);
+	mp->mnt_ref = 0;
+	mp->mnt_vfs_ops = 1;
 	return (0);
 }
 
@@ -132,6 +137,7 @@ mount_fini(void *mem, int size)
 	struct mount *mp;
 
 	mp = (struct mount *)mem;
+	uma_zfree_pcpu(pcpu_zone_int, mp->mnt_thread_in_ops_pcpu);
 	lockdestroy(&mp->mnt_explock);
 	mtx_destroy(&mp->mnt_listmtx);
 	mtx_destroy(&mp->mnt_mtx);
@@ -445,6 +451,12 @@ vfs_ref(struct mount *mp)
 {
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+	if (vfs_op_thread_enter(mp)) {
+		MNT_REF_UNLOCKED(mp);
+		vfs_op_thread_exit(mp);
+		return;
+	}
+
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
@@ -455,6 +467,12 @@ vfs_rel(struct mount *mp)
 {
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+	if (vfs_op_thread_enter(mp)) {
+		MNT_REL_UNLOCKED(mp);
+		vfs_op_thread_exit(mp);
+		return;
+	}
+
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
@@ -478,7 +496,12 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp
 	mp->mnt_activevnodelistsize = 0;
 	TAILQ_INIT(&mp->mnt_tmpfreevnodelist);
 	mp->mnt_tmpfreevnodelistsize = 0;
-	mp->mnt_ref = 0;
+	if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 ||
+	    mp->mnt_writeopcount != 0)
+		panic("%s: non-zero counters on new mp %p\n", __func__, mp);
+	if (mp->mnt_vfs_ops != 1)
+		panic("%s: vfs_ops should be 1 but %d found\n", __func__,
+		    mp->mnt_vfs_ops);
 	(void) vfs_busy(mp, MBF_NOWAIT);
 	atomic_add_acq_int(&vfsp->vfc_refcount, 1);
 	mp->mnt_op = vfsp->vfc_vfsops;
@@ -507,6 +530,9 @@ void
 vfs_mount_destroy(struct mount *mp)
 {
 
+	if (mp->mnt_vfs_ops == 0)
+		panic("%s: entered with zero vfs_ops\n", __func__);
+
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_REFEXPIRE;
 	if (mp->mnt_kern_flag & MNTK_MWAIT) {
@@ -540,6 +566,11 @@ vfs_mount_destroy(struct mount *mp)
 	if (mp->mnt_lockref != 0)
 		panic("vfs_mount_destroy: nonzero lock refcount");
 	MNT_IUNLOCK(mp);
+
+	if (mp->mnt_vfs_ops != 1)
+		panic("%s: vfs_ops should be 1 but %d found\n", __func__,
+		    mp->mnt_vfs_ops);
+
 	if (mp->mnt_vnodecovered != NULL)
 		vrele(mp->mnt_vnodecovered);
 #ifdef MAC
@@ -951,6 +982,7 @@ vfs_domount_first(
 	vrele(newdp);
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		vfs_allocate_syncvnode(mp);
+	vfs_op_exit(mp);
 	vfs_unbusy(mp);
 	return (0);
 }
@@ -1019,6 +1051,8 @@ vfs_domount_update(
 	VI_UNLOCK(vp);
 	VOP_UNLOCK(vp, 0);
 
+	vfs_op_enter(mp);
+
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		MNT_IUNLOCK(mp);
@@ -1100,6 +1134,7 @@ vfs_domount_update(
 	else
 		vfs_deallocate_syncvnode(mp);
 end:
+	vfs_op_exit(mp);
 	vfs_unbusy(mp);
 	VI_LOCK(vp);
 	vp->v_iflag &= ~VI_MOUNT;
@@ -1328,6 +1363,7 @@ dounmount_cleanup(struct mount *mp, struct vnode *cove
 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
 		wakeup(mp);
 	}
+	vfs_op_exit_locked(mp);
 	MNT_IUNLOCK(mp);
 	if (coveredvp != NULL) {
 		VOP_UNLOCK(coveredvp, 0);
@@ -1337,6 +1373,69 @@ dounmount_cleanup(struct mount *mp, struct vnode *cove
 }
 
 /*
+ * There are various reference counters associated with the mount point.
+ * Normally it is permitted to modify them without taking the mnt ilock,
+ * but this behavior can be temporarily disabled if stable value is needed
+ * or callers are expected to block (e.g. to not allow new users during
+ * forced unmount).
+ */
+void
+vfs_op_enter(struct mount *mp)
+{
+
+	MNT_ILOCK(mp);
+	mp->mnt_vfs_ops++;
+	if (mp->mnt_vfs_ops > 1) {
+		MNT_IUNLOCK(mp);
+		return;
+	}
+	/*
+	 * Paired with a fence in vfs_op_thread_enter(). See the comment
+	 * above it for details.
+	 */
+	atomic_thread_fence_seq_cst();
+	vfs_op_barrier_wait(mp);
+	MNT_IUNLOCK(mp);
+}
+
+void
+vfs_op_exit_locked(struct mount *mp)
+{
+
+	mtx_assert(MNT_MTX(mp), MA_OWNED);
+
+	if (mp->mnt_vfs_ops <= 0)
+		panic("%s: invalid vfs_ops count %d for mp %p\n",
+		    __func__, mp->mnt_vfs_ops, mp);
+	mp->mnt_vfs_ops--;
+}
+
+void
+vfs_op_exit(struct mount *mp)
+{
+
+	MNT_ILOCK(mp);
+	vfs_op_exit_locked(mp);
+	MNT_IUNLOCK(mp);
+}
+
+/*
+ * It is assumed the caller already posted at least an acquire barrier.
+ */
+void
+vfs_op_barrier_wait(struct mount *mp)
+{
+	int *in_op;
+	int cpu;
+
+	CPU_FOREACH(cpu) {
+		in_op = zpcpu_get_cpu(mp->mnt_thread_in_ops_pcpu, cpu);
+		while (atomic_load_int(in_op))
+			cpu_spinwait();
+	}
+}
+
+/*
  * Do the actual filesystem unmount.
  */
 int
@@ -1379,6 +1478,8 @@ dounmount(struct mount *mp, int flags, struct thread *
 		return (error);
 	}
 
+	vfs_op_enter(mp);
+
 	vn_start_write(NULL, &mp, V_WAIT | V_MNTREF);
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
@@ -1469,6 +1570,7 @@ dounmount(struct mount *mp, int flags, struct thread *
 			mp->mnt_kern_flag &= ~MNTK_MWAIT;
 			wakeup(mp);
 		}
+		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0);
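
The sequentially consistent fence in vfs_op_enter() above pairs with the one
in vfs_op_thread_enter() (see the comment added to sys/mount.h below). A
rough userspace analogue of that handshake, using C11 atomics and a single
flag in place of the per-CPU array (purely illustrative, not kernel code):

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_int in_section;	/* stands in for a mnt_thread_in_ops_pcpu slot */
	static atomic_int vfs_ops;	/* stands in for mnt_vfs_ops */

	static void section_exit(void);

	/* Fast path, analogous to vfs_op_thread_enter(). */
	static bool
	section_enter(void)
	{
		atomic_store_explicit(&in_section, 1, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);
		if (atomic_load_explicit(&vfs_ops, memory_order_relaxed) > 0) {
			section_exit();		/* writer active, back off */
			return (false);
		}
		return (true);
	}

	/* Analogous to vfs_op_thread_exit(). */
	static void
	section_exit(void)
	{
		atomic_thread_fence(memory_order_release);
		atomic_store_explicit(&in_section, 0, memory_order_relaxed);
	}

	/* Writer side, analogous to vfs_op_enter() + vfs_op_barrier_wait(). */
	static void
	ops_enter(void)
	{
		atomic_fetch_add_explicit(&vfs_ops, 1, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);
		while (atomic_load_explicit(&in_section, memory_order_acquire))
			;		/* wait for the fast path to drain */
	}

Either section_enter() observes vfs_ops > 0 and backs off, or ops_enter()
observes in_section != 0 and waits; the two seq_cst fences rule out both
sides missing each other.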

Modified: head/sys/kern/vfs_mountroot.c
==============================================================================
--- head/sys/kern/vfs_mountroot.c	Mon Sep 16 20:43:20 2019	(r352423)
+++ head/sys/kern/vfs_mountroot.c	Mon Sep 16 21:31:02 2019	(r352424)
@@ -273,6 +273,7 @@ vfs_mountroot_devfs(struct thread *td, struct mount **
 
 		*mpp = mp;
 		rootdevmp = mp;
+		vfs_op_exit(mp);
 	}
 
 	set_rootvnode();

Modified: head/sys/kern/vfs_subr.c
==============================================================================
--- head/sys/kern/vfs_subr.c	Mon Sep 16 20:43:20 2019	(r352423)
+++ head/sys/kern/vfs_subr.c	Mon Sep 16 21:31:02 2019	(r352424)
@@ -4032,6 +4032,7 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	    mp->mnt_secondary_accwrites);
 	db_printf("    mnt_gjprovider = %s\n",
 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
+	db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
 
 	db_printf("\n\nList of active vnodes\n");
 	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {

Modified: head/sys/sys/mount.h
==============================================================================
--- head/sys/sys/mount.h	Mon Sep 16 20:43:20 2019	(r352423)
+++ head/sys/sys/mount.h	Mon Sep 16 21:31:02 2019	(r352424)
@@ -226,6 +226,8 @@ struct mount {
 	struct lock	mnt_explock;		/* vfs_export walkers lock */
 	TAILQ_ENTRY(mount) mnt_upper_link;	/* (m) we in the all uppers */
 	TAILQ_HEAD(, mount) mnt_uppers;		/* (m) upper mounts over us*/
+	int		mnt_vfs_ops;		/* (i) pending vfs ops */
+	int		*mnt_thread_in_ops_pcpu;
 };
 
 /*
@@ -265,15 +267,26 @@ void          __mnt_vnode_markerfree_active(struct vno
 #define	MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx)
 #define	MNT_IUNLOCK(mp)	mtx_unlock(&(mp)->mnt_mtx)
 #define	MNT_MTX(mp)	(&(mp)->mnt_mtx)
+
+#define	MNT_REF_UNLOCKED(mp)	do {					\
+	atomic_add_int(&(mp)->mnt_ref, 1);				\
+} while (0)
+#define	MNT_REL_UNLOCKED(mp)	do {					\
+	int _c;								\
+	_c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1;		\
+	KASSERT(_c >= 0, ("negative mnt_ref %d", _c));			\
+} while (0)
+
 #define	MNT_REF(mp)	do {						\
 	mtx_assert(MNT_MTX(mp), MA_OWNED);				\
-	(mp)->mnt_ref++;						\
+	atomic_add_int(&(mp)->mnt_ref, 1);				\
 } while (0)
 #define	MNT_REL(mp)	do {						\
+	int _c;								\
 	mtx_assert(MNT_MTX(mp), MA_OWNED);				\
-	KASSERT((mp)->mnt_ref > 0, ("negative mnt_ref"));		\
-	(mp)->mnt_ref--;						\
-	if ((mp)->mnt_ref == 0)						\
+	_c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1;		\
+	KASSERT(_c >= 0, ("negative mnt_ref %d", _c));			\
+	if (_c == 0)							\
 		wakeup((mp));						\
 } while (0)
 
@@ -940,6 +953,48 @@ vfs_sysctl_t		vfs_stdsysctl;
 
 void	syncer_suspend(void);
 void	syncer_resume(void);
+
+void	vfs_op_barrier_wait(struct mount *);
+void	vfs_op_enter(struct mount *);
+void	vfs_op_exit_locked(struct mount *);
+void	vfs_op_exit(struct mount *);
+
+/*
+ * We mark ourselves as entering the section and post a sequentially consistent
+ * fence, meaning the store is completed before we get into the section and
+ * mnt_vfs_ops is only read afterwards.
+ *
+ * Any thread transitioning the ops counter 0->1 does things in the opposite
+ * order - first bumps the count, posts a sequentially consistent fence and
+ * observes all CPUs not executing within the section.
+ *
+ * This provides an invariant that by the time the last CPU is observed not
+ * executing, everyone else entering will see the counter > 0 and exit.
+ *
+ * Note there is no barrier between vfs_ops and the rest of the code in the
+ * section. It is not necessary as the writer has to wait for everyone to drain
+ * before making any changes or only make changes safe while the section is
+ * executed.
+ */
+
+#define vfs_op_thread_enter(mp) ({				\
+	struct mount *_mp = (mp);				\
+	bool _retval = true;					\
+	critical_enter();					\
+	*(int *)zpcpu_get(_mp->mnt_thread_in_ops_pcpu) = 1;	\
+	atomic_thread_fence_seq_cst();				\
+	if (__predict_false(_mp->mnt_vfs_ops > 0)) {		\
+		vfs_op_thread_exit(_mp);			\
+		_retval = false;				\
+	}							\
+	_retval;						\
+})
+
+#define vfs_op_thread_exit(mp) do {				\
+	atomic_thread_fence_rel();				\
+	*(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 0;	\
+	critical_exit();					\
+} while (0)
 
 #else /* !_KERNEL */
 

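A hedged usage sketch of the macro pair added above (essentially the shape
vfs_ref() takes elsewhere in this commit). Because the macros run inside
critical_enter()/critical_exit(), the lockless section must be short and
must not sleep:

	if (vfs_op_thread_enter(mp)) {
		/* No vfs_op_enter() caller is active; update locklessly. */
		MNT_REF_UNLOCKED(mp);
		vfs_op_thread_exit(mp);
	} else {
		/* Lockless operation is disabled; fall back to the interlock. */
		MNT_ILOCK(mp);
		MNT_REF(mp);
		MNT_IUNLOCK(mp);
	}
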
Modified: head/sys/vm/uma.h
==============================================================================
--- head/sys/vm/uma.h	Mon Sep 16 20:43:20 2019	(r352423)
+++ head/sys/vm/uma.h	Mon Sep 16 21:31:02 2019	(r352424)
@@ -650,6 +650,7 @@ int uma_zone_exhausted_nolock(uma_zone_t zone);
 /*
  * Common UMA_ZONE_PCPU zones.
  */
+extern uma_zone_t pcpu_zone_int;
 extern uma_zone_t pcpu_zone_64;
 
 /*


