Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 9 May 2013 10:20:46 +0300
From:      Konstantin Belousov <kostikbel@gmail.com>
To:        Peter Holm <peter@holm.cc>
Cc:        freebsd-stable@freebsd.org
Subject:   Re: Nullfs leaks i-nodes
Message-ID:  <20130509072046.GN3047@kib.kiev.ua>
In-Reply-To: <20130509070256.GA15884@x2.osted.lan>
References:  <B799E3B928B18B9E6C68F912@[172.16.2.62]> <20130508091317.GJ3047@kib.kiev.ua> <20130509070256.GA15884@x2.osted.lan>

next in thread | previous in thread | raw e-mail | index | archive | help

--CzJ9BTjEYwHt6w7v
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Thu, May 09, 2013 at 09:02:56AM +0200, Peter Holm wrote:
> On Wed, May 08, 2013 at 12:13:17PM +0300, Konstantin Belousov wrote:
> > On Tue, May 07, 2013 at 08:30:06AM +0200, G??ran L??wkrantz wrote:
> > > I created a PR, kern/178238, on this but would like to know if anyone=
 has=20
> > > any ideas or patches?
> > >=20
> > > Have updated the system where I see this to FreeBSD 9.1-STABLE #0 r25=
0229=20
> > > and still have the problem.
> >=20
> > The patch below should fix the issue for you, at least it did so in my
> > limited testing.
> >=20
> > What is does:
> > 1. When inactivating a nullfs vnode, check if the lower vnode is
> >    unlinked, and reclaim upper vnode if so. [This fixes your case].
> >=20
> > 2. Besides a callback to the upper filesystems for the lower vnode
> >    reclaimation, it also calls the upper fs for lower vnode unlink.
> >    This allows nullfs to purge cached vnodes for the unlinked lower.
> >    [This fixes an opposite case, when the vnode is removed from the
> >    lower mount, but upper aliases prevent the vnode from being
> >    recycled].
> >=20
> > 3. Fix a wart which existed from the introduction of the nullfs caching,
> >    do not unlock lower vnode in the nullfs_reclaim_lowervp().  It should
> >    be completely innocent, but now it is also formally safe.
> >=20
> > 4. Fix vnode reference leak in nullfs_reclaim_lowervp().
> >=20
> > Please note that the patch is basically not tested, I only verified your
> > scenario and a mirror of it as described in item 2.
> >=20
> > diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h
> > index 4f37020..a624be6 100644
> > --- a/sys/fs/nullfs/null.h
>=20
> The page fault seen in fifo_close() seems unrelated to this patch,
> which I will continue testing some more.
>=20
> The scenario triggering the page fault is the "rm":
>=20
> mdconfig -a -t swap -s 1g -u 5
> bsdlabel -w md5 auto
> newfs -U md5a
> mount /dev/md5a /mnt
> mount -t nullfs /mnt /mnt2
> mkfifo /mnt2/fifo
> rm /mnt/fifo
Yeah, I figured this out from the backtrace.

The panic in kostik563 is related and directly caused by the patch,
since patch erronously performs vgone() on the active (removed) vnode.
As result, a spurious VOP_CLOSE() call is performed on the vnode,
which is more or less innocent for regular files, but fatal for fifos
and probably nfsv4 as well.

Updated patch is below.

>=20
> Not a problem on 8.3-STABLE r247938.
Yes, new nullfs is only in HEAD and stable/9.

diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h
index 4f37020..0972dfc 100644
--- a/sys/fs/nullfs/null.h
+++ b/sys/fs/nullfs/null.h
@@ -53,8 +53,12 @@ struct null_node {
 	LIST_ENTRY(null_node)	null_hash;	/* Hash list */
 	struct vnode	        *null_lowervp;	/* VREFed once */
 	struct vnode		*null_vnode;	/* Back pointer */
+	u_int			null_flags;
 };
=20
+#define	NULLV_NOUNLOCK	0x0001
+#define	NULLV_DROP	0x0002
+
 #define	MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))
 #define	VTONULL(vp) ((struct null_node *)(vp)->v_data)
 #define	NULLTOV(xp) ((xp)->null_vnode)
diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c
index 02932bd..ad02236 100644
--- a/sys/fs/nullfs/null_vfsops.c
+++ b/sys/fs/nullfs/null_vfsops.c
@@ -65,7 +65,6 @@ static vfs_statfs_t	nullfs_statfs;
 static vfs_unmount_t	nullfs_unmount;
 static vfs_vget_t	nullfs_vget;
 static vfs_extattrctl_t	nullfs_extattrctl;
-static vfs_reclaim_lowervp_t nullfs_reclaim_lowervp;
=20
 /*
  * Mount null layer
@@ -391,8 +390,37 @@ nullfs_reclaim_lowervp(struct mount *mp, struct vnode =
*lowervp)
 	vp =3D null_hashget(mp, lowervp);
 	if (vp =3D=3D NULL)
 		return;
+	VTONULL(vp)->null_flags |=3D NULLV_NOUNLOCK;
 	vgone(vp);
-	vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
+	vput(vp);
+}
+
+static void
+nullfs_unlink_lowervp(struct mount *mp, struct vnode *lowervp)
+{
+	struct vnode *vp;
+	struct null_node *xp;
+
+	vp =3D null_hashget(mp, lowervp);
+	if (vp =3D=3D NULL)
+		return;
+	xp =3D VTONULL(vp);
+	xp->null_flags |=3D NULLV_DROP | NULLV_NOUNLOCK;
+	vhold(vp);
+	vunref(vp);
+
+	/*
+	 * If vunref() dropped the last use reference on the nullfs
+	 * vnode, it must be reclaimed, and its lock was split from
+	 * the lower vnode lock.  Need to do extra unlock before
+	 * allowing the final vdrop() to free the vnode.
+	 */
+	if (vp->v_usecount =3D=3D 0) {
+		KASSERT((vp->v_iflag & VI_DOOMED) !=3D 0,
+		    ("not reclaimed %p", vp));
+		VOP_UNLOCK(vp, 0);
+	}
+	vdrop(vp);
 }
=20
 static struct vfsops null_vfsops =3D {
@@ -408,6 +436,7 @@ static struct vfsops null_vfsops =3D {
 	.vfs_unmount =3D		nullfs_unmount,
 	.vfs_vget =3D		nullfs_vget,
 	.vfs_reclaim_lowervp =3D	nullfs_reclaim_lowervp,
+	.vfs_unlink_lowervp =3D	nullfs_unlink_lowervp,
 };
=20
 VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK | VFCF_JAIL);
diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c
index f59865f..6ff15ee 100644
--- a/sys/fs/nullfs/null_vnops.c
+++ b/sys/fs/nullfs/null_vnops.c
@@ -692,18 +692,24 @@ null_unlock(struct vop_unlock_args *ap)
 static int
 null_inactive(struct vop_inactive_args *ap __unused)
 {
-	struct vnode *vp;
+	struct vnode *vp, *lvp;
+	struct null_node *xp;
 	struct mount *mp;
 	struct null_mount *xmp;
=20
 	vp =3D ap->a_vp;
+	xp =3D VTONULL(vp);
+	lvp =3D NULLVPTOLOWERVP(vp);
 	mp =3D vp->v_mount;
 	xmp =3D MOUNTTONULLMOUNT(mp);
-	if ((xmp->nullm_flags & NULLM_CACHE) =3D=3D 0) {
+	if ((xmp->nullm_flags & NULLM_CACHE) =3D=3D 0 ||
+	    (xp->null_flags & NULLV_DROP) !=3D 0 ||
+	    (lvp->v_vflag & VV_NOSYNC) !=3D 0) {
 		/*
 		 * If this is the last reference and caching of the
-		 * nullfs vnodes is not enabled, then free up the
-		 * vnode so as not to tie up the lower vnodes.
+		 * nullfs vnodes is not enabled, or the lower vnode is
+		 * deleted, then free up the vnode so as not to tie up
+		 * the lower vnodes.
 		 */
 		vp->v_object =3D NULL;
 		vrecycle(vp);
@@ -748,7 +754,10 @@ null_reclaim(struct vop_reclaim_args *ap)
 	 */
 	if (vp->v_writecount > 0)
 		VOP_ADD_WRITECOUNT(lowervp, -1);
-	vput(lowervp);
+	if ((xp->null_flags & NULLV_NOUNLOCK) !=3D 0)
+		vunref(lowervp);
+	else
+		vput(lowervp);
 	free(xp, M_NULLFSNODE);
=20
 	return (0);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index de118f7..988a114 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2700,19 +2700,20 @@ vgone(struct vnode *vp)
 }
=20
 static void
-vgonel_reclaim_lowervp_vfs(struct mount *mp __unused,
+notify_lowervp_vfs_dummy(struct mount *mp __unused,
     struct vnode *lowervp __unused)
 {
 }
=20
 /*
- * Notify upper mounts about reclaimed vnode.
+ * Notify upper mounts about reclaimed or unlinked vnode.
  */
-static void
-vgonel_reclaim_lowervp(struct vnode *vp)
+void
+vfs_notify_upper(struct vnode *vp, int event)
 {
 	static struct vfsops vgonel_vfsops =3D {
-		.vfs_reclaim_lowervp =3D vgonel_reclaim_lowervp_vfs
+		.vfs_reclaim_lowervp =3D notify_lowervp_vfs_dummy,
+		.vfs_unlink_lowervp =3D notify_lowervp_vfs_dummy,
 	};
 	struct mount *mp, *ump, *mmp;
=20
@@ -2736,7 +2737,17 @@ vgonel_reclaim_lowervp(struct vnode *vp)
 		}
 		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
 		MNT_IUNLOCK(mp);
-		VFS_RECLAIM_LOWERVP(ump, vp);
+		switch (event) {
+		case VFS_NOTIFY_UPPER_RECLAIM:
+			VFS_RECLAIM_LOWERVP(ump, vp);
+			break;
+		case VFS_NOTIFY_UPPER_UNLINK:
+			VFS_UNLINK_LOWERVP(ump, vp);
+			break;
+		default:
+			KASSERT(0, ("invalid event %d", event));
+			break;
+		}
 		MNT_ILOCK(mp);
 		ump =3D TAILQ_NEXT(mmp, mnt_upper_link);
 		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
@@ -2783,7 +2794,7 @@ vgonel(struct vnode *vp)
 	active =3D vp->v_usecount;
 	oweinact =3D (vp->v_iflag & VI_OWEINACT);
 	VI_UNLOCK(vp);
-	vgonel_reclaim_lowervp(vp);
+	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
=20
 	/*
 	 * Clean out any buffers associated with the vnode.
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 29cb7bd..a004ea0 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1846,6 +1846,7 @@ restart:
 		if (error)
 			goto out;
 #endif
+		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 		error =3D VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 #ifdef MAC
 out:
@@ -3825,6 +3826,7 @@ restart:
 			return (error);
 		goto restart;
 	}
+	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 	error =3D VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 	vn_finished_write(mp);
 out:
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index a9c86f6..a953dae 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -608,7 +608,7 @@ typedef	int vfs_mount_t(struct mount *mp);
 typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op,
 		    struct sysctl_req *req);
 typedef void vfs_susp_clean_t(struct mount *mp);
-typedef void vfs_reclaim_lowervp_t(struct mount *mp, struct vnode *lowervp=
);
+typedef void vfs_notify_lowervp_t(struct mount *mp, struct vnode *lowervp);
=20
 struct vfsops {
 	vfs_mount_t		*vfs_mount;
@@ -626,7 +626,8 @@ struct vfsops {
 	vfs_extattrctl_t	*vfs_extattrctl;
 	vfs_sysctl_t		*vfs_sysctl;
 	vfs_susp_clean_t	*vfs_susp_clean;
-	vfs_reclaim_lowervp_t	*vfs_reclaim_lowervp;
+	vfs_notify_lowervp_t	*vfs_reclaim_lowervp;
+	vfs_notify_lowervp_t	*vfs_unlink_lowervp;
 };
=20
 vfs_statfs_t	__vfs_statfs;
@@ -747,6 +748,14 @@ vfs_statfs_t	__vfs_statfs;
 	}								\
 } while (0)
=20
+#define	VFS_UNLINK_LOWERVP(MP, VP) do {					\
+	if (*(MP)->mnt_op->vfs_unlink_lowervp !=3D NULL) {		\
+		VFS_PROLOGUE(MP);					\
+		(*(MP)->mnt_op->vfs_unlink_lowervp)((MP), (VP));	\
+		VFS_EPILOGUE(MP);					\
+	}								\
+} while (0)
+
 #define VFS_KNOTE_LOCKED(vp, hint) do					\
 {									\
 	if (((vp)->v_vflag & VV_NOKNOTE) =3D=3D 0)				\
@@ -759,6 +768,9 @@ vfs_statfs_t	__vfs_statfs;
 		VN_KNOTE((vp), (hint), 0);				\
 } while (0)
=20
+#define	VFS_NOTIFY_UPPER_RECLAIM	1
+#define	VFS_NOTIFY_UPPER_UNLINK		2
+
 #include <sys/module.h>
=20
 /*
@@ -840,6 +852,7 @@ int	vfs_modevent(module_t, int, void *);
 void	vfs_mount_error(struct mount *, const char *, ...);
 void	vfs_mountroot(void);			/* mount our root filesystem */
 void	vfs_mountedfrom(struct mount *, const char *from);
+void	vfs_notify_upper(struct vnode *, int);
 void	vfs_oexport_conv(const struct oexport_args *oexp,
 	    struct export_args *exp);
 void	vfs_ref(struct mount *);

--CzJ9BTjEYwHt6w7v
Content-Type: application/pgp-signature

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.19 (FreeBSD)

iQIcBAEBAgAGBQJRi05NAAoJEJDCuSvBvK1BuFkP/i4qzpF1cWvS3zmNFxrMK4HN
QKEcbbUByJhzuwxTsgAKPI/VXI2zs8PW0by3ul4Tl4x9forfja578esLdaTW83QZ
VGntUXYfJXYBEg5kC3IdfOWAyHZa5S84OZI/Dd3wur0ftxvQPPydE809Ri+Zh4Nw
vzL0uroV6k6N3PUgE1HVTeIR99Ft+U1vy7aymQC/kSFYPahur0N9P7Z6U9cIQWkp
SvOQ7xlQZd3o5FJ3hNT+D+IgjSz9PveePSobe7LqGgB1/Pj3wvW7jsaQKZ3OlV+N
DkA6COQ2VaaOH4Cr9UGw+G0Vk1u7sKBKnDsP+7iIjv6T/nk5G2ry2ICmQFKTSlzb
Y4eWp1h9VWh1+QalYBpXi8b3kdLGRf7pRO7TT8gX6ixZnhVYsY9mrSfSGZWeLVNO
g5OHPXSVof7eje34LW9WEnUGrmteHzbzgIJGrC7++kEXdEdTgS59jhQwzoiHFObY
U4brSroNEN8WJhftBU5cz/ahbNeAoEVt1aT1ZosZwOSLWQ6LpeIxe/0Bd1VrDKb1
W4CZ4saV43CbDcL77h2Fja1+MyxZqAgOxd9bZ4zlEROYcZWuYI7PelDQY7ICCvDv
gF51ho0PwncD8+r/0R4bT8vjL4PGBV2T+CG2WRJFtGs61H4lyMUJJDrQK3STMM2y
aTZhLmZTD55gEkC0QM+J
=cA3q
-----END PGP SIGNATURE-----

--CzJ9BTjEYwHt6w7v--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20130509072046.GN3047>