Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 1 Feb 2021 12:40:30 GMT
From:      Mateusz Guzik <mjg@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org
Subject:   git: dcee9964238b - stable/13 - cache: add symlink support to lockless lookup
Message-ID:  <202102011240.111CeURH094618@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch stable/13 has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=dcee9964238b12a8e55917f292139f074b1a80b2

commit dcee9964238b12a8e55917f292139f074b1a80b2
Author:     Mateusz Guzik <mjg@FreeBSD.org>
AuthorDate: 2021-01-23 13:40:48 +0000
Commit:     Mateusz Guzik <mjg@FreeBSD.org>
CommitDate: 2021-02-01 12:37:46 +0000

    cache: add symlink support to lockless lookup
    
    Reviewed by:    kib (previous version)
    Tested by:      pho (previous version)
    Differential Revision:  https://reviews.freebsd.org/D27488
---
 sys/fs/deadfs/dead_vnops.c |   1 +
 sys/kern/vfs_cache.c       | 214 ++++++++++++++++++++++++++++++++++++++-------
 sys/kern/vfs_lookup.c      |  11 ++-
 sys/kern/vfs_subr.c        |  13 +++
 sys/kern/vnode_if.src      |  10 +++
 sys/sys/namei.h            |   4 +-
 sys/sys/param.h            |   2 +-
 sys/sys/vnode.h            |   9 ++
 8 files changed, 227 insertions(+), 37 deletions(-)

diff --git a/sys/fs/deadfs/dead_vnops.c b/sys/fs/deadfs/dead_vnops.c
index 49f852791387..09c3c996ee0e 100644
--- a/sys/fs/deadfs/dead_vnops.c
+++ b/sys/fs/deadfs/dead_vnops.c
@@ -80,6 +80,7 @@ struct vop_vector dead_vnodeops = {
 	.vop_unset_text =	dead_unset_text,
 	.vop_write =		dead_write,
 	.vop_fplookup_vexec =	VOP_EOPNOTSUPP,
+	.vop_fplookup_symlink =	VOP_EAGAIN,
 };
 VFS_VOP_VECTOR_REGISTER(dead_vnodeops);
 
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index cfa25dc59ee4..770c8ebf061b 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -562,6 +562,36 @@ static uma_zone_t __read_mostly cache_zone_small_ts;
 static uma_zone_t __read_mostly cache_zone_large;
 static uma_zone_t __read_mostly cache_zone_large_ts;
 
+char *
+cache_symlink_alloc(size_t size, int flags)
+{
+
+	if (size < CACHE_ZONE_SMALL_SIZE) {
+		return (uma_zalloc_smr(cache_zone_small, flags));
+	}
+	if (size < CACHE_ZONE_LARGE_SIZE) {
+		return (uma_zalloc_smr(cache_zone_large, flags));
+	}
+	return (NULL);
+}
+
+void
+cache_symlink_free(char *string, size_t size)
+{
+
+	MPASS(string != NULL);
+
+	if (size < CACHE_ZONE_SMALL_SIZE) {
+		uma_zfree_smr(cache_zone_small, string);
+		return;
+	}
+	if (size < CACHE_ZONE_LARGE_SIZE) {
+		uma_zfree_smr(cache_zone_large, string);
+		return;
+	}
+	__assert_unreachable();
+}
+
 static struct namecache *
 cache_alloc_uma(int len, bool ts)
 {
@@ -3584,6 +3614,7 @@ cache_fast_lookup_enabled_recalc(void)
 
 #ifdef MAC
 	mac_on = mac_vnode_check_lookup_enabled();
+	mac_on |= mac_vnode_check_readlink_enabled();
 #else
 	mac_on = 0;
 #endif
@@ -3615,6 +3646,7 @@ SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MP
  * need restoring in case fast path lookup fails.
  */
 struct nameidata_outer {
+	size_t ni_pathlen;
 	int cn_flags;
 };
 
@@ -3656,8 +3688,10 @@ static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
+static int cache_fplookup_preparse(struct cache_fpl *fpl);
 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
+static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
 
 static void
@@ -3698,6 +3732,7 @@ static void
 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
 {
 
+	fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
 	fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
 }
 
@@ -3731,6 +3766,7 @@ cache_fpl_restore_abort(struct cache_fpl *fpl)
 	 */
 	fpl->ndp->ni_resflags = 0;
 	fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
+	fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
 }
 
 #ifdef INVARIANTS
@@ -3752,6 +3788,7 @@ cache_fpl_assert_status(struct cache_fpl *fpl)
 	case CACHE_FPL_STATUS_UNSET:
 		__assert_unreachable();
 		break;
+	case CACHE_FPL_STATUS_DESTROYED:
 	case CACHE_FPL_STATUS_ABORTED:
 	case CACHE_FPL_STATUS_PARTIAL:
 	case CACHE_FPL_STATUS_HANDLED:
@@ -3804,6 +3841,11 @@ cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
 static int __noinline
 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
 {
+	struct nameidata *ndp;
+	struct componentname *cnp;
+
+	ndp = fpl->ndp;
+	cnp = fpl->cnp;
 
 	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
 		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
@@ -3815,6 +3857,14 @@ cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
 	if (fpl->in_smr)
 		cache_fpl_smr_exit(fpl);
 	cache_fpl_restore_abort(fpl);
+	/*
+	 * Resolving symlinks overwrites data passed by the caller.
+	 * Let namei know.
+	 */
+	if (ndp->ni_loopcnt > 0) {
+		fpl->status = CACHE_FPL_STATUS_DESTROYED;
+		cache_fpl_cleanup_cnp(cnp);
+	}
 	return (CACHE_FPL_FAILED);
 }
 
@@ -3955,13 +4005,6 @@ cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
 	return (0);
 }
 
-static bool
-cache_fplookup_vnode_supported(struct vnode *vp)
-{
-
-	return (vp->v_type != VLNK);
-}
-
 static int __noinline
 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
     uint32_t hash)
@@ -4242,8 +4285,7 @@ cache_fplookup_final_modifying(struct cache_fpl *fpl)
 	 * Since we expect this to be the terminal vnode it should
 	 * almost never be true.
 	 */
-	if (__predict_false(!cache_fplookup_vnode_supported(tvp) ||
-	    cache_fplookup_is_mp(fpl))) {
+	if (__predict_false(tvp->v_type == VLNK || cache_fplookup_is_mp(fpl))) {
 		vput(dvp);
 		vput(tvp);
 		return (cache_fpl_aborted(fpl));
@@ -4546,8 +4588,7 @@ cache_fplookup_noentry(struct cache_fpl *fpl)
 		return (cache_fpl_handled(fpl));
 	}
 
-	if (__predict_false(!cache_fplookup_vnode_supported(tvp) ||
-	    cache_fplookup_is_mp(fpl))) {
+	if (__predict_false(tvp->v_type == VLNK || cache_fplookup_is_mp(fpl))) {
 		vput(dvp);
 		vput(tvp);
 		return (cache_fpl_aborted(fpl));
@@ -4688,6 +4729,102 @@ cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
 	return (cache_fpl_handled_error(fpl, ENOENT));
 }
 
+/*
+ * Resolve a symlink. Called by filesystem-specific routines.
+ *
+ * Code flow is:
+ * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
+ */
+int
+cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
+{
+	struct nameidata *ndp;
+	struct componentname *cnp;
+
+	ndp = fpl->ndp;
+	cnp = fpl->cnp;
+
+	if (__predict_false(len == 0)) {
+		return (ENOENT);
+	}
+
+	ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
+#ifdef INVARIANTS
+	if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
+		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
+		    __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
+		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
+	}
+#endif
+
+	if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
+		return (ENAMETOOLONG);
+	}
+
+	if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
+		return (ELOOP);
+	}
+
+	if (ndp->ni_pathlen > 1) {
+		bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
+	} else {
+		cnp->cn_pnbuf[len] = '\0';
+	}
+	bcopy(string, cnp->cn_pnbuf, len);
+
+	ndp->ni_pathlen += len;
+	cache_fpl_pathlen_add(fpl, len);
+	cnp->cn_nameptr = cnp->cn_pnbuf;
+	fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
+
+	return (0);
+}
+
+static int __noinline
+cache_fplookup_symlink(struct cache_fpl *fpl)
+{
+	struct nameidata *ndp;
+	struct componentname *cnp;
+	struct vnode *dvp, *tvp;
+	int error;
+
+	ndp = fpl->ndp;
+	cnp = fpl->cnp;
+	dvp = fpl->dvp;
+	tvp = fpl->tvp;
+
+	if (cache_fpl_islastcn(ndp)) {
+		if ((cnp->cn_flags & FOLLOW) == 0) {
+			return (cache_fplookup_final(fpl));
+		}
+	}
+
+	error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
+	if (__predict_false(error != 0)) {
+		switch (error) {
+		case EAGAIN:
+			return (cache_fpl_partial(fpl));
+		case ENOENT:
+		case ENAMETOOLONG:
+		case ELOOP:
+			cache_fpl_smr_exit(fpl);
+			return (cache_fpl_handled_error(fpl, error));
+		default:
+			return (cache_fpl_aborted(fpl));
+		}
+	}
+
+	if (*(cnp->cn_nameptr) == '/') {
+		fpl->dvp = cache_fpl_handle_root(fpl);
+		fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
+		if (seqc_in_modify(fpl->dvp_seqc)) {
+			return (cache_fpl_aborted(fpl));
+		}
+	}
+
+	return (cache_fplookup_preparse(fpl));
+}
+
 static int
 cache_fplookup_next(struct cache_fpl *fpl)
 {
@@ -4743,10 +4880,6 @@ cache_fplookup_next(struct cache_fpl *fpl)
 		return (cache_fpl_partial(fpl));
 	}
 
-	if (!cache_fplookup_vnode_supported(tvp)) {
-		return (cache_fpl_partial(fpl));
-	}
-
 	counter_u64_add(numposhits, 1);
 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
 
@@ -4931,7 +5064,16 @@ static void
 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
 {
 
-	fpl->debug.ni_pathlen++;
+	cache_fpl_pathlen_add(fpl, 1);
+}
+
+static void
+cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
+{
+
+	fpl->debug.ni_pathlen += n;
+	KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
+	    ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
 }
 
 static void
@@ -4953,13 +5095,18 @@ cache_fpl_pathlen_inc(struct cache_fpl *fpl)
 {
 }
 
+static void
+cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
+{
+}
+
 static void
 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
 {
 }
 #endif
 
-static int
+static int __always_inline
 cache_fplookup_preparse(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
@@ -5238,8 +5385,6 @@ cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
 		return (cache_fpl_aborted(fpl));
 	}
 
-	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
-
 	error = cache_fplookup_preparse(fpl);
 	if (__predict_false(cache_fpl_terminated(fpl))) {
 		return (error);
@@ -5251,8 +5396,6 @@ cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
 			break;
 		}
 
-		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
-
 		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
 		if (__predict_false(error != 0)) {
 			error = cache_fplookup_failed_vexec(fpl, error);
@@ -5266,20 +5409,27 @@ cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
 
 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
 
-		if (cache_fpl_islastcn(ndp)) {
-			error = cache_fplookup_final(fpl);
-			break;
-		}
+		if (fpl->tvp->v_type == VLNK) {
+			error = cache_fplookup_symlink(fpl);
+			if (cache_fpl_terminated(fpl)) {
+				break;
+			}
+		} else {
+			if (cache_fpl_islastcn(ndp)) {
+				error = cache_fplookup_final(fpl);
+				break;
+			}
 
-		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
-			error = cache_fpl_aborted(fpl);
-			break;
-		}
+			if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
+				error = cache_fpl_aborted(fpl);
+				break;
+			}
 
-		fpl->dvp = fpl->tvp;
-		fpl->dvp_seqc = fpl->tvp_seqc;
+			fpl->dvp = fpl->tvp;
+			fpl->dvp_seqc = fpl->tvp_seqc;
+			cache_fplookup_parse_advance(fpl);
+		}
 
-		cache_fplookup_parse_advance(fpl);
 		cache_fpl_checkpoint(fpl);
 	}
 
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index b6529623ecbb..ad65ab11bb1d 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -578,6 +578,7 @@ namei(struct nameidata *ndp)
 	    ndp->ni_startdir->v_type == VBAD);
 
 	ndp->ni_lcf = 0;
+	ndp->ni_loopcnt = 0;
 	ndp->ni_vp = NULL;
 
 	error = namei_getpath(ndp);
@@ -611,8 +612,16 @@ namei(struct nameidata *ndp)
 		TAILQ_INIT(&ndp->ni_cap_tracker);
 		dp = ndp->ni_startdir;
 		break;
+	case CACHE_FPL_STATUS_DESTROYED:
+		ndp->ni_loopcnt = 0;
+		error = namei_getpath(ndp);
+		if (__predict_false(error != 0)) {
+			return (error);
+		}
+		/* FALLTHROUGH */
 	case CACHE_FPL_STATUS_ABORTED:
 		TAILQ_INIT(&ndp->ni_cap_tracker);
+		MPASS(ndp->ni_lcf == 0);
 		error = namei_setup(ndp, &dp, &pwd);
 		if (error != 0) {
 			namei_cleanup_cnp(cnp);
@@ -621,8 +630,6 @@ namei(struct nameidata *ndp)
 		break;
 	}
 
-	ndp->ni_loopcnt = 0;
-
 	/*
 	 * Locked lookup.
 	 */
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 8461fd0d49b5..047e4c54f0c5 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -5452,6 +5452,19 @@ vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused)
 	VFS_SMR_ASSERT_ENTERED();
 }
 
+void
+vop_fplookup_symlink_debugpre(void *ap __unused)
+{
+
+	VFS_SMR_ASSERT_ENTERED();
+}
+
+void
+vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused)
+{
+
+	VFS_SMR_ASSERT_ENTERED();
+}
 void
 vop_strategy_debugpre(void *ap)
 {
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index ed6f3bf9d61b..5d15d4a0c863 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -156,6 +156,16 @@ vop_fplookup_vexec {
 };
 
 
+%% fplookup_symlink	vp	- - -
+%! fplookup_symlink	debugpre	vop_fplookup_symlink_debugpre
+%! fplookup_symlink	debugpost	vop_fplookup_symlink_debugpost
+
+vop_fplookup_symlink {
+	IN struct vnode *vp;
+	IN struct cache_fpl *fpl;
+};
+
+
 %% access	vp	L L L
 
 vop_access {
diff --git a/sys/sys/namei.h b/sys/sys/namei.h
index c045bc7c6bc5..1f9bae9e2753 100644
--- a/sys/sys/namei.h
+++ b/sys/sys/namei.h
@@ -116,8 +116,8 @@ struct nameidata {
 
 #ifdef _KERNEL
 
-enum cache_fpl_status { CACHE_FPL_STATUS_ABORTED, CACHE_FPL_STATUS_PARTIAL,
-    CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET };
+enum cache_fpl_status { CACHE_FPL_STATUS_DESTROYED, CACHE_FPL_STATUS_ABORTED,
+    CACHE_FPL_STATUS_PARTIAL, CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET };
 int	cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
     struct pwd **pwdp);
 
diff --git a/sys/sys/param.h b/sys/sys/param.h
index 97ec07ed3a4d..afa5d9a2e3a8 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -60,7 +60,7 @@
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1300137	/* Master, propagated to newvers */
+#define __FreeBSD_version 1300138	/* Master, propagated to newvers */
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 0eadfec02313..ffd0bdad940d 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -66,6 +66,7 @@ enum vgetstate	{ VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT };
  */
 
 struct namecache;
+struct cache_fpl;
 
 struct vpollinfo {
 	struct	mtx vpi_lock;		/* lock to protect below */
@@ -646,6 +647,10 @@ void	cache_purge(struct vnode *vp);
 void	cache_purge_vgone(struct vnode *vp);
 void	cache_purge_negative(struct vnode *vp);
 void	cache_purgevfs(struct mount *mp);
+char	*cache_symlink_alloc(size_t size, int flags);
+void	cache_symlink_free(char *string, size_t size);
+int	cache_symlink_resolve(struct cache_fpl *fpl, const char *string,
+	    size_t len);
 void	cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp);
 void	cache_vop_rmdir(struct vnode *dvp, struct vnode *vp);
@@ -900,6 +905,8 @@ int	vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a);
 #ifdef DEBUG_VFS_LOCKS
 void	vop_fplookup_vexec_debugpre(void *a);
 void	vop_fplookup_vexec_debugpost(void *a, int rc);
+void	vop_fplookup_symlink_debugpre(void *a);
+void	vop_fplookup_symlink_debugpost(void *a, int rc);
 void	vop_strategy_debugpre(void *a);
 void	vop_lock_debugpre(void *a);
 void	vop_lock_debugpost(void *a, int rc);
@@ -910,6 +917,8 @@ void	vop_mkdir_debugpost(void *a, int rc);
 #else
 #define	vop_fplookup_vexec_debugpre(x)		do { } while (0)
 #define	vop_fplookup_vexec_debugpost(x, y)	do { } while (0)
+#define	vop_fplookup_symlink_debugpre(x)	do { } while (0)
+#define	vop_fplookup_symlink_debugpost(x, y)	do { } while (0)
 #define	vop_strategy_debugpre(x)		do { } while (0)
 #define	vop_lock_debugpre(x)			do { } while (0)
 #define	vop_lock_debugpost(x, y)		do { } while (0)



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202102011240.111CeURH094618>