Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 8 May 2013 12:13:17 +0300
From:      Konstantin Belousov <kostikbel@gmail.com>
To:        G??ran L??wkrantz <goran.lowkrantz@ismobile.com>
Cc:        pho@freebsd.org, freebsd-stable@freebsd.org
Subject:   Re: Nullfs leaks i-nodes
Message-ID:  <20130508091317.GJ3047@kib.kiev.ua>
In-Reply-To: <B799E3B928B18B9E6C68F912@[172.16.2.62]>
References:  <B799E3B928B18B9E6C68F912@[172.16.2.62]>

next in thread | previous in thread | raw e-mail | index | archive | help

--wICysDeFgFMelglS
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Tue, May 07, 2013 at 08:30:06AM +0200, G??ran L??wkrantz wrote:
> I created a PR, kern/178238, on this but would like to know if anyone has=
=20
> any ideas or patches?
>=20
> Have updated the system where I see this to FreeBSD 9.1-STABLE #0 r250229=
=20
> and still have the problem.

The patch below should fix the issue for you, at least it did so in my
limited testing.

What is does:
1. When inactivating a nullfs vnode, check if the lower vnode is
   unlinked, and reclaim upper vnode if so. [This fixes your case].

2. Besides a callback to the upper filesystems for the lower vnode
   reclaimation, it also calls the upper fs for lower vnode unlink.
   This allows nullfs to purge cached vnodes for the unlinked lower.
   [This fixes an opposite case, when the vnode is removed from the
   lower mount, but upper aliases prevent the vnode from being
   recycled].

3. Fix a wart which existed from the introduction of the nullfs caching,
   do not unlock lower vnode in the nullfs_reclaim_lowervp().  It should
   be completely innocent, but now it is also formally safe.

4. Fix vnode reference leak in nullfs_reclaim_lowervp().

Please note that the patch is basically not tested, I only verified your
scenario and a mirror of it as described in item 2.

diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h
index 4f37020..a624be6 100644
--- a/sys/fs/nullfs/null.h
+++ b/sys/fs/nullfs/null.h
@@ -53,8 +53,11 @@ struct null_node {
 	LIST_ENTRY(null_node)	null_hash;	/* Hash list */
 	struct vnode	        *null_lowervp;	/* VREFed once */
 	struct vnode		*null_vnode;	/* Back pointer */
+	u_int			null_flags;
 };
=20
+#define	NULLV_NOUNLOCK	0x0001
+
 #define	MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))
 #define	VTONULL(vp) ((struct null_node *)(vp)->v_data)
 #define	NULLTOV(xp) ((xp)->null_vnode)
diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c
index 02932bd..544c358 100644
--- a/sys/fs/nullfs/null_vfsops.c
+++ b/sys/fs/nullfs/null_vfsops.c
@@ -65,7 +65,6 @@ static vfs_statfs_t	nullfs_statfs;
 static vfs_unmount_t	nullfs_unmount;
 static vfs_vget_t	nullfs_vget;
 static vfs_extattrctl_t	nullfs_extattrctl;
-static vfs_reclaim_lowervp_t nullfs_reclaim_lowervp;
=20
 /*
  * Mount null layer
@@ -391,8 +390,22 @@ nullfs_reclaim_lowervp(struct mount *mp, struct vnode =
*lowervp)
 	vp =3D null_hashget(mp, lowervp);
 	if (vp =3D=3D NULL)
 		return;
+	VTONULL(vp)->null_flags |=3D NULLV_NOUNLOCK;
 	vgone(vp);
-	vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
+	vput(vp);
+}
+
+static void
+nullfs_unlink_lowervp(struct mount *mp, struct vnode *lowervp)
+{
+	struct vnode *vp;
+
+	vp =3D null_hashget(mp, lowervp);
+	if (vp =3D=3D NULL || vp->v_usecount > 1)
+		return;
+	VTONULL(vp)->null_flags |=3D NULLV_NOUNLOCK;
+	vgone(vp);
+	vput(vp);
 }
=20
 static struct vfsops null_vfsops =3D {
@@ -408,6 +421,7 @@ static struct vfsops null_vfsops =3D {
 	.vfs_unmount =3D		nullfs_unmount,
 	.vfs_vget =3D		nullfs_vget,
 	.vfs_reclaim_lowervp =3D	nullfs_reclaim_lowervp,
+	.vfs_unlink_lowervp =3D	nullfs_unlink_lowervp,
 };
=20
 VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK | VFCF_JAIL);
diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c
index f59865f..3c41124 100644
--- a/sys/fs/nullfs/null_vnops.c
+++ b/sys/fs/nullfs/null_vnops.c
@@ -692,18 +692,21 @@ null_unlock(struct vop_unlock_args *ap)
 static int
 null_inactive(struct vop_inactive_args *ap __unused)
 {
-	struct vnode *vp;
+	struct vnode *vp, *lvp;
 	struct mount *mp;
 	struct null_mount *xmp;
=20
 	vp =3D ap->a_vp;
+	lvp =3D NULLVPTOLOWERVP(vp);
 	mp =3D vp->v_mount;
 	xmp =3D MOUNTTONULLMOUNT(mp);
-	if ((xmp->nullm_flags & NULLM_CACHE) =3D=3D 0) {
+	if ((xmp->nullm_flags & NULLM_CACHE) =3D=3D 0 ||
+	    (lvp->v_vflag & VV_NOSYNC) !=3D 0) {
 		/*
 		 * If this is the last reference and caching of the
-		 * nullfs vnodes is not enabled, then free up the
-		 * vnode so as not to tie up the lower vnodes.
+		 * nullfs vnodes is not enabled, or the lower vnode is
+		 * deleted, then free up the vnode so as not to tie up
+		 * the lower vnodes.
 		 */
 		vp->v_object =3D NULL;
 		vrecycle(vp);
@@ -748,7 +751,10 @@ null_reclaim(struct vop_reclaim_args *ap)
 	 */
 	if (vp->v_writecount > 0)
 		VOP_ADD_WRITECOUNT(lowervp, -1);
-	vput(lowervp);
+	if ((xp->null_flags & NULLV_NOUNLOCK) !=3D 0)
+		vunref(lowervp);
+	else
+		vput(lowervp);
 	free(xp, M_NULLFSNODE);
=20
 	return (0);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index de118f7..988a114 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2700,19 +2700,20 @@ vgone(struct vnode *vp)
 }
=20
 static void
-vgonel_reclaim_lowervp_vfs(struct mount *mp __unused,
+notify_lowervp_vfs_dummy(struct mount *mp __unused,
     struct vnode *lowervp __unused)
 {
 }
=20
 /*
- * Notify upper mounts about reclaimed vnode.
+ * Notify upper mounts about reclaimed or unlinked vnode.
  */
-static void
-vgonel_reclaim_lowervp(struct vnode *vp)
+void
+vfs_notify_upper(struct vnode *vp, int event)
 {
 	static struct vfsops vgonel_vfsops =3D {
-		.vfs_reclaim_lowervp =3D vgonel_reclaim_lowervp_vfs
+		.vfs_reclaim_lowervp =3D notify_lowervp_vfs_dummy,
+		.vfs_unlink_lowervp =3D notify_lowervp_vfs_dummy,
 	};
 	struct mount *mp, *ump, *mmp;
=20
@@ -2736,7 +2737,17 @@ vgonel_reclaim_lowervp(struct vnode *vp)
 		}
 		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
 		MNT_IUNLOCK(mp);
-		VFS_RECLAIM_LOWERVP(ump, vp);
+		switch (event) {
+		case VFS_NOTIFY_UPPER_RECLAIM:
+			VFS_RECLAIM_LOWERVP(ump, vp);
+			break;
+		case VFS_NOTIFY_UPPER_UNLINK:
+			VFS_UNLINK_LOWERVP(ump, vp);
+			break;
+		default:
+			KASSERT(0, ("invalid event %d", event));
+			break;
+		}
 		MNT_ILOCK(mp);
 		ump =3D TAILQ_NEXT(mmp, mnt_upper_link);
 		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
@@ -2783,7 +2794,7 @@ vgonel(struct vnode *vp)
 	active =3D vp->v_usecount;
 	oweinact =3D (vp->v_iflag & VI_OWEINACT);
 	VI_UNLOCK(vp);
-	vgonel_reclaim_lowervp(vp);
+	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
=20
 	/*
 	 * Clean out any buffers associated with the vnode.
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 29cb7bd..a004ea0 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1846,6 +1846,7 @@ restart:
 		if (error)
 			goto out;
 #endif
+		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 		error =3D VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 #ifdef MAC
 out:
@@ -3825,6 +3826,7 @@ restart:
 			return (error);
 		goto restart;
 	}
+	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 	error =3D VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 	vn_finished_write(mp);
 out:
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index a9c86f6..a953dae 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -608,7 +608,7 @@ typedef	int vfs_mount_t(struct mount *mp);
 typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op,
 		    struct sysctl_req *req);
 typedef void vfs_susp_clean_t(struct mount *mp);
-typedef void vfs_reclaim_lowervp_t(struct mount *mp, struct vnode *lowervp=
);
+typedef void vfs_notify_lowervp_t(struct mount *mp, struct vnode *lowervp);
=20
 struct vfsops {
 	vfs_mount_t		*vfs_mount;
@@ -626,7 +626,8 @@ struct vfsops {
 	vfs_extattrctl_t	*vfs_extattrctl;
 	vfs_sysctl_t		*vfs_sysctl;
 	vfs_susp_clean_t	*vfs_susp_clean;
-	vfs_reclaim_lowervp_t	*vfs_reclaim_lowervp;
+	vfs_notify_lowervp_t	*vfs_reclaim_lowervp;
+	vfs_notify_lowervp_t	*vfs_unlink_lowervp;
 };
=20
 vfs_statfs_t	__vfs_statfs;
@@ -747,6 +748,14 @@ vfs_statfs_t	__vfs_statfs;
 	}								\
 } while (0)
=20
+#define	VFS_UNLINK_LOWERVP(MP, VP) do {					\
+	if (*(MP)->mnt_op->vfs_unlink_lowervp !=3D NULL) {		\
+		VFS_PROLOGUE(MP);					\
+		(*(MP)->mnt_op->vfs_unlink_lowervp)((MP), (VP));	\
+		VFS_EPILOGUE(MP);					\
+	}								\
+} while (0)
+
 #define VFS_KNOTE_LOCKED(vp, hint) do					\
 {									\
 	if (((vp)->v_vflag & VV_NOKNOTE) =3D=3D 0)				\
@@ -759,6 +768,9 @@ vfs_statfs_t	__vfs_statfs;
 		VN_KNOTE((vp), (hint), 0);				\
 } while (0)
=20
+#define	VFS_NOTIFY_UPPER_RECLAIM	1
+#define	VFS_NOTIFY_UPPER_UNLINK		2
+
 #include <sys/module.h>
=20
 /*
@@ -840,6 +852,7 @@ int	vfs_modevent(module_t, int, void *);
 void	vfs_mount_error(struct mount *, const char *, ...);
 void	vfs_mountroot(void);			/* mount our root filesystem */
 void	vfs_mountedfrom(struct mount *, const char *from);
+void	vfs_notify_upper(struct vnode *, int);
 void	vfs_oexport_conv(const struct oexport_args *oexp,
 	    struct export_args *exp);
 void	vfs_ref(struct mount *);

--wICysDeFgFMelglS
Content-Type: application/pgp-signature

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.19 (FreeBSD)

iQIcBAEBAgAGBQJRihcsAAoJEJDCuSvBvK1BTKAP/3X9R+Z3LWS3qarpGobad/oJ
Ro4NJaOX+T8Bq84dJKC4vUfaMT0tU10v5erO/7991qWzb7XBoBOzph7WGbdy5LmR
IBr88bqyjMUeX+PZXlRNP57GRtjNGPRa6vLnlYmtevX/jP6gN+Laa+ck+xZN9pQJ
CijODfARyvt0eWLhqPYbEhbYkgAmmjV1g3qotFN6BCrwUemddJ+nvVlq1fYPvHUE
9r+d8nNbWyTZOuIoNuS5icGesNxNvMJUQi5HgBJLFCvsoJz8GjPPNXTv4xaAG+X2
k3xaqQy/q6SeAHtbqOdZLbSWFPHTd+3PGOh6I/28kutVXU6IxdJI0Nz6qt/ezks8
smidVcggmuyqKl3u1vdntgfijaW2JR0EKMs2QR/Hvvk+28DPK3YyA+VrXrBcCbjY
eD6OIDQAVKX1NVjqEline6lObqOExIT4Nkjh+jhcYBxDHBe65pgPf1px6fs5Aw86
ITMVlJbCQp+IeUiS2zcdzTvLuNoYkGQ2+hV7iMf3ZSt5NM94K2awWhYiHRXL44k5
g5f3ObJBT86lGTsAIPNpl/fIJUGbIBs5Q+9VrGvlLKZOk0XLoC+agVMijmQNjZCe
Ncjj9/xJsLvxn4hDRylhkQ0zgHrJVIe2rtB6G1ccFid/UhLbrfSsJdn9aRGU6ygL
1v8Gn/5YdSjEYFRe6WjF
=/jR1
-----END PGP SIGNATURE-----

--wICysDeFgFMelglS--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20130508091317.GJ3047>