Date:      Tue, 2 Apr 2019 13:59:04 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r345803 - head/sys/fs/tmpfs
Message-ID:  <201904021359.x32Dx4RD090274@repo.freebsd.org>

Author: kib
Date: Tue Apr  2 13:59:04 2019
New Revision: 345803
URL: https://svnweb.freebsd.org/changeset/base/345803

Log:
  tmpfs: plug holes on rw->ro mount update.
  
  In particular:
  - suspend the mount around vflush() to prevent new writes from
    arriving after a vnode has been processed;
  - flush pending metadata updates (mostly node times);
  - remap all rw mappings of files from the mount into ro.
  
  It is not clear to me how best to handle writeable mappings on rw->ro
  updates for tmpfs.  Other filesystems, which use the vnode vm object,
  call vgone() on vnodes with writers, which sets the vm object type to
  OBJT_DEAD and keeps the resident pages and installed ptes as they are.
  In particular, the existing mappings continue to work as long as the
  application only accesses resident pages, but changes are no longer
  flushed to the file.
  
  For tmpfs the vm object of VREG vnodes also serves as the data pages
  container, giving a single copy of the mapped pages, so it cannot be
  set to OBJT_DEAD.  Alternatives for making rw mappings ro would be
  either invalidating them outright, or marking them as CoW.
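  
  For illustration, a minimal userspace sketch of what a process with a
  live rw mapping observes across a forced rw->ro update (the mount
  point, the file name, and the omitted error checking are illustrative
  only, not part of this change):
  
  	#include <sys/mman.h>
  
  	#include <fcntl.h>
  	#include <unistd.h>
  
  	int
  	main(void)
  	{
  		char *p;
  		int fd;
  
  		fd = open("/mnt/tmp/f", O_RDWR | O_CREAT, 0644);
  		ftruncate(fd, 4096);
  		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
  		    fd, 0);
  		p[0] = 'a';	/* succeeds while the mount is rw */
  		sleep(30);	/* meanwhile: mount -fu -o ro /mnt/tmp;
  				   without MNT_FORCE the live rw mapping
  				   makes the update fail with EBUSY */
  		p[0] = 'b';	/* faults with SIGSEGV: VM_PROT_WRITE was
  				   revoked from the map entry and ptes */
  		return (0);
  	}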
  
  Tested by:	pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks
  Differential revision:	https://reviews.freebsd.org/D19737

Modified:
  head/sys/fs/tmpfs/tmpfs_vfsops.c

Modified: head/sys/fs/tmpfs/tmpfs_vfsops.c
==============================================================================
--- head/sys/fs/tmpfs/tmpfs_vfsops.c	Tue Apr  2 13:58:31 2019	(r345802)
+++ head/sys/fs/tmpfs/tmpfs_vfsops.c	Tue Apr  2 13:59:04 2019	(r345803)
@@ -60,10 +60,14 @@ __FBSDID("$FreeBSD$");
 #include <sys/kernel.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
+#include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_param.h>
 
@@ -137,14 +142,235 @@ tmpfs_node_fini(void *mem, int size)
 	mtx_destroy(&node->tn_interlock);
 }
 
+/*
+ * Handle updates of time from writes to mmapped regions.  Use
+ * MNT_VNODE_FOREACH_ALL instead of MNT_VNODE_FOREACH_ACTIVE, since
+ * unmap of the tmpfs-backed vnode does not call vinactive(), because
+ * the vm object type is OBJT_SWAP.
+ * If lazy, only handle the delayed update of mtime due to writes to
+ * mapped files.
+ */
+static void
+tmpfs_update_mtime(struct mount *mp, bool lazy)
+{
+	struct vnode *vp, *mvp;
+	struct vm_object *obj;
+
+	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+		if (vp->v_type != VREG) {
+			VI_UNLOCK(vp);
+			continue;
+		}
+		obj = vp->v_object;
+		KASSERT((obj->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) ==
+		    (OBJ_TMPFS_NODE | OBJ_TMPFS), ("non-tmpfs obj"));
+
+		/*
+		 * In the lazy case, do an unlocked read to avoid
+		 * taking the vnode lock if it is not needed.  A lost
+		 * update will be handled on the next call.
+		 * In the non-lazy case, we must flush all pending
+		 * metadata changes now.
+		 */
+		if (!lazy || (obj->flags & OBJ_TMPFS_DIRTY) != 0) {
+			if (vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
+			    curthread) != 0)
+				continue;
+			tmpfs_check_mtime(vp);
+			if (!lazy)
+				tmpfs_update(vp);
+			vput(vp);
+		} else {
+			VI_UNLOCK(vp);
+			continue;
+		}
+	}
+}
+
+struct tmpfs_check_rw_maps_arg {
+	bool found;
+};
+
+static bool
+tmpfs_check_rw_maps_cb(struct mount *mp __unused, vm_map_t map __unused,
+    vm_map_entry_t entry __unused, void *arg)
+{
+	struct tmpfs_check_rw_maps_arg *a;
+
+	a = arg;
+	a->found = true;
+	return (true);
+}
+
+/*
+ * Revoke write permissions from all mappings of regular files
+ * belonging to the specified tmpfs mount.
+ */
+static bool
+tmpfs_revoke_rw_maps_cb(struct mount *mp __unused, vm_map_t map,
+    vm_map_entry_t entry, void *arg __unused)
+{
+
+	/*
+	 * XXXKIB: might it be better to invalidate
+	 * the mapping instead?  The process is not
+	 * going to be happy either way.
+	 */
+	entry->max_protection &= ~VM_PROT_WRITE;
+	if ((entry->protection & VM_PROT_WRITE) != 0) {
+		entry->protection &= ~VM_PROT_WRITE;
+		pmap_protect(map->pmap, entry->start, entry->end,
+		    entry->protection);
+	}
+	return (false);
+}
+
+static void
+tmpfs_all_rw_maps(struct mount *mp, bool (*cb)(struct mount *mp, vm_map_t,
+    vm_map_entry_t, void *), void *cb_arg)
+{
+	struct proc *p;
+	struct vmspace *vm;
+	vm_map_t map;
+	vm_map_entry_t entry;
+	vm_object_t object;
+	struct vnode *vp;
+	int gen;
+	bool terminate;
+
+	terminate = false;
+	sx_slock(&allproc_lock);
+again:
+	gen = allproc_gen;
+	FOREACH_PROC_IN_SYSTEM(p) {
+		PROC_LOCK(p);
+		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
+		    P_SYSTEM | P_WEXIT)) != 0) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		vm = vmspace_acquire_ref(p);
+		_PHOLD_LITE(p);
+		PROC_UNLOCK(p);
+		if (vm == NULL) {
+			PRELE(p);
+			continue;
+		}
+		sx_sunlock(&allproc_lock);
+		map = &vm->vm_map;
+
+		vm_map_lock(map);
+		if (map->busy)
+			vm_map_wait_busy(map);
+		for (entry = map->header.next; entry != &map->header;
+		    entry = entry->next) {
+			if ((entry->eflags & (MAP_ENTRY_GUARD |
+			    MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_COW)) != 0 ||
+			    (entry->max_protection & VM_PROT_WRITE) == 0)
+				continue;
+			object = entry->object.vm_object;
+			if (object == NULL || object->type != OBJT_SWAP ||
+			    (object->flags & OBJ_TMPFS_NODE) == 0)
+				continue;
+			/*
+			 * No need to dig into the shadow chain; a mapping
+			 * of an object not at the top is read-only.
+			 */
+
+			VM_OBJECT_RLOCK(object);
+			if (object->type == OBJT_DEAD) {
+				VM_OBJECT_RUNLOCK(object);
+				continue;
+			}
+			MPASS(object->ref_count > 1);
+			if ((object->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) !=
+			    (OBJ_TMPFS_NODE | OBJ_TMPFS)) {
+				VM_OBJECT_RUNLOCK(object);
+				continue;
+			}
+			vp = object->un_pager.swp.swp_tmpfs;
+			if (vp->v_mount != mp) {
+				VM_OBJECT_RUNLOCK(object);
+				continue;
+			}
+
+			terminate = cb(mp, map, entry, cb_arg);
+			VM_OBJECT_RUNLOCK(object);
+			if (terminate)
+				break;
+		}
+		vm_map_unlock(map);
+
+		vmspace_free(vm);
+		sx_slock(&allproc_lock);
+		PRELE(p);
+		if (terminate)
+			break;
+	}
+	if (!terminate && gen != allproc_gen)
+		goto again;
+	sx_sunlock(&allproc_lock);
+}
+
+static bool
+tmpfs_check_rw_maps(struct mount *mp)
+{
+	struct tmpfs_check_rw_maps_arg ca;
+
+	ca.found = false;
+	tmpfs_all_rw_maps(mp, tmpfs_check_rw_maps_cb, &ca);
+	return (ca.found);
+}
+
 static int
+tmpfs_rw_to_ro(struct mount *mp)
+{
+	int error, flags;
+	bool forced;
+
+	forced = (mp->mnt_flag & MNT_FORCE) != 0;
+	flags = WRITECLOSE | (forced ? FORCECLOSE : 0);
+
+	if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
+		return (error);
+	error = vfs_write_suspend_umnt(mp);
+	if (error != 0)
+		return (error);
+	if (!forced && tmpfs_check_rw_maps(mp)) {
+		error = EBUSY;
+		goto out;
+	}
+	VFS_TO_TMPFS(mp)->tm_ronly = 1;
+	MNT_ILOCK(mp);
+	mp->mnt_flag |= MNT_RDONLY;
+	MNT_IUNLOCK(mp);
+	for (;;) {
+		tmpfs_all_rw_maps(mp, tmpfs_revoke_rw_maps_cb, NULL);
+		tmpfs_update_mtime(mp, false);
+		error = vflush(mp, 0, flags, curthread);
+		if (error != 0) {
+			VFS_TO_TMPFS(mp)->tm_ronly = 0;
+			MNT_ILOCK(mp);
+			mp->mnt_flag &= ~MNT_RDONLY;
+			MNT_IUNLOCK(mp);
+			goto out;
+		}
+		if (!tmpfs_check_rw_maps(mp))
+			break;
+	}
+out:
+	vfs_write_resume(mp, 0);
+	return (error);
+}
+
+static int
 tmpfs_mount(struct mount *mp)
 {
 	const size_t nodes_per_page = howmany(PAGE_SIZE,
 	    sizeof(struct tmpfs_dirent) + sizeof(struct tmpfs_node));
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *root;
-	int error, flags;
+	int error;
 	bool nonc;
 	/* Size counters. */
 	u_quad_t pages;
@@ -178,19 +404,7 @@ tmpfs_mount(struct mount *mp)
 		if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) &&
 		    !(VFS_TO_TMPFS(mp)->tm_ronly)) {
 			/* RW -> RO */
-			error = VFS_SYNC(mp, MNT_WAIT);
-			if (error)
-				return (error);
-			flags = WRITECLOSE;
-			if (mp->mnt_flag & MNT_FORCE)
-				flags |= FORCECLOSE;
-			error = vflush(mp, 0, flags, curthread);
-			if (error)
-				return (error);
-			VFS_TO_TMPFS(mp)->tm_ronly = 1;
-			MNT_ILOCK(mp);
-			mp->mnt_flag |= MNT_RDONLY;
-			MNT_IUNLOCK(mp);
+			return (tmpfs_rw_to_ro(mp));
 		} else if (!vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) &&
 		    VFS_TO_TMPFS(mp)->tm_ronly) {
 			/* RO -> RW */
@@ -469,45 +683,13 @@ tmpfs_statfs(struct mount *mp, struct statfs *sbp)
 static int
 tmpfs_sync(struct mount *mp, int waitfor)
 {
-	struct vnode *vp, *mvp;
-	struct vm_object *obj;
 
 	if (waitfor == MNT_SUSPEND) {
 		MNT_ILOCK(mp);
 		mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
 		MNT_IUNLOCK(mp);
 	} else if (waitfor == MNT_LAZY) {
-		/*
-		 * Handle lazy updates of mtime from writes to mmaped
-		 * regions.  Use MNT_VNODE_FOREACH_ALL instead of
-		 * MNT_VNODE_FOREACH_ACTIVE, since unmap of the
-		 * tmpfs-backed vnode does not call vinactive(), due
-		 * to vm object type is OBJT_SWAP.
-		 */
-		MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
-			if (vp->v_type != VREG) {
-				VI_UNLOCK(vp);
-				continue;
-			}
-			obj = vp->v_object;
-			KASSERT((obj->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) ==
-			    (OBJ_TMPFS_NODE | OBJ_TMPFS), ("non-tmpfs obj"));
-
-			/*
-			 * Unlocked read, avoid taking vnode lock if
-			 * not needed.  Lost update will be handled on
-			 * the next call.
-			 */
-			if ((obj->flags & OBJ_TMPFS_DIRTY) == 0) {
-				VI_UNLOCK(vp);
-				continue;
-			}
-			if (vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
-			    curthread) != 0)
-				continue;
-			tmpfs_check_mtime(vp);
-			vput(vp);
-		}
+		tmpfs_update_mtime(mp, true);
 	}
 	return (0);
 }
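
A minimal sketch, assuming a tmpfs mounted at /mnt/tmp (hypothetical),
of how the new EBUSY path can be exercised from userspace through
nmount(2); run it while another process holds a shared writeable
mapping of a file on that mount:

	#include <sys/uio.h>
	#include <sys/mount.h>

	#include <err.h>
	#include <errno.h>

	int
	main(void)
	{
		struct iovec iov[6];

		/* Name the mount and request the rw->ro update. */
		iov[0].iov_base = "fstype";   iov[0].iov_len = sizeof("fstype");
		iov[1].iov_base = "tmpfs";    iov[1].iov_len = sizeof("tmpfs");
		iov[2].iov_base = "fspath";   iov[2].iov_len = sizeof("fspath");
		iov[3].iov_base = "/mnt/tmp"; iov[3].iov_len = sizeof("/mnt/tmp");
		iov[4].iov_base = "ro";       iov[4].iov_len = sizeof("ro");
		iov[5].iov_base = NULL;       iov[5].iov_len = 0;

		/*
		 * Without MNT_FORCE, tmpfs_check_rw_maps() spots the
		 * live rw mapping and tmpfs_rw_to_ro() returns EBUSY;
		 * with MNT_FORCE, the mapping is remapped read-only.
		 */
		if (nmount(iov, 6, MNT_UPDATE) == -1 && errno == EBUSY)
			warnx("update refused: live rw mappings exist");
		return (0);
	}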


