Date:      Wed, 2 Nov 2016 08:46:05 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r308208 - stable/11/sys/ufs/ffs
Message-ID:  <201611020846.uA28k5sl009715@repo.freebsd.org>

Author: kib
Date: Wed Nov  2 08:46:04 2016
New Revision: 308208
URL: https://svnweb.freebsd.org/changeset/base/308208

Log:
  MFC r307626:
  Add FFS pager, which uses buffer cache read operations to validate pages.
  
  For now, the pager is disabled by default in the stable branch.
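
Note: since the new knob is declared CTLFLAG_RWTUN in the change below, it
can be flipped at runtime or set as a loader tunable.  A usage sketch (the
name vfs.ffs.use_buf_pager comes from the sysctl added in this commit):

	sysctl vfs.ffs.use_buf_pager=1
	# or, persistently across boots:
	echo 'vfs.ffs.use_buf_pager=1' >> /boot/loader.conf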

Modified:
  stable/11/sys/ufs/ffs/ffs_vnops.c
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/ufs/ffs/ffs_vnops.c
==============================================================================
--- stable/11/sys/ufs/ffs/ffs_vnops.c	Wed Nov  2 08:41:01 2016	(r308207)
+++ stable/11/sys/ufs/ffs/ffs_vnops.c	Wed Nov  2 08:46:04 2016	(r308208)
@@ -77,6 +77,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/priv.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
+#include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
@@ -86,6 +87,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_pageout.h>
 #include <vm/vnode_pager.h>
 
 #include <ufs/ufs/extattr.h>
@@ -102,8 +104,9 @@ __FBSDID("$FreeBSD$");
 #ifdef DIRECTIO
 extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 #endif
-static vop_fsync_t	ffs_fsync;
 static vop_fdatasync_t	ffs_fdatasync;
+static vop_fsync_t	ffs_fsync;
+static vop_getpages_t	ffs_getpages;
 static vop_lock1_t	ffs_lock;
 static vop_read_t	ffs_read;
 static vop_write_t	ffs_write;
@@ -119,13 +122,12 @@ static vop_openextattr_t	ffs_openextattr
 static vop_setextattr_t	ffs_setextattr;
 static vop_vptofh_t	ffs_vptofh;
 
-
 /* Global vfs data structures for ufs. */
 struct vop_vector ffs_vnodeops1 = {
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_fdatasync =	ffs_fdatasync,
-	.vop_getpages =		vnode_pager_local_getpages,
+	.vop_getpages =		ffs_getpages,
 	.vop_getpages_async =	vnode_pager_local_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
@@ -147,7 +149,7 @@ struct vop_vector ffs_vnodeops2 = {
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_fdatasync =	ffs_fdatasync,
-	.vop_getpages =		vnode_pager_local_getpages,
+	.vop_getpages =		ffs_getpages,
 	.vop_getpages_async =	vnode_pager_local_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
@@ -1784,3 +1786,165 @@ vop_vptofh {
 	ufhp->ufid_gen = ip->i_gen;
 	return (0);
 }
+
+SYSCTL_DECL(_vfs_ffs);
+static int use_buf_pager = 0;
+SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
+    "Always use buffer pager instead of bmap");
+static int buf_pager_relbuf;
+SYSCTL_INT(_vfs_ffs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
+    &buf_pager_relbuf, 0,
+    "Make buffer pager release buffers after reading");
+
+/*
+ * The FFS pager.  It uses buffer reads to validate pages.
+ *
+ * In contrast to the generic local pager from vm/vnode_pager.c, this
+ * pager correctly and easily handles volumes where the underlying
+ * device block size is greater than the machine page size.  The
+ * buffer cache transparently extends the requested page run to be
+ * aligned at the block boundaries and, in the added portions of
+ * the run, substitutes the bogus page for already valid pages so
+ * that the read does not obliterate their contents.
+ *
+ * The only non-trivial issue is that the exclusive busy state for
+ * pages, which is assumed by the vm_pager_getpages() interface, is
+ * incompatible with the VMIO buffer cache's desire to share-busy the
+ * pages.  This function performs a trivial downgrade of the pages'
+ * state before reading buffers, and a less trivial upgrade from the
+ * shared-busy to excl-busy state after the read.
+ */
+static int
+ffs_getpages(struct vop_getpages_args *ap)
+{
+	struct vnode *vp;
+	vm_page_t *ma, m;
+	vm_object_t object;
+	struct buf *bp;
+	struct ufsmount *um;
+	ufs_lbn_t lbn, lbnp;
+	vm_ooffset_t la, lb;
+	long bsize;
+	int bo_bs, count, error, i;
+	bool redo, lpart;
+
+	vp = ap->a_vp;
+	ma = ap->a_m;
+	count = ap->a_count;
+
+	um = VFSTOUFS(ap->a_vp->v_mount);
+	bo_bs = um->um_devvp->v_bufobj.bo_bsize;
+	if (!use_buf_pager && bo_bs <= PAGE_SIZE)
+		return (vnode_pager_generic_getpages(vp, ma, count,
+		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
+
+	object = vp->v_object;
+	la = IDX_TO_OFF(ma[count - 1]->pindex);
+	if (la >= object->un_pager.vnp.vnp_size)
+		return (VM_PAGER_BAD);
+	lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size;
+	if (ap->a_rbehind != NULL) {
+		lb = IDX_TO_OFF(ma[0]->pindex);
+		*ap->a_rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
+	}
+	if (ap->a_rahead != NULL) {
+		*ap->a_rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);
+		if (la + IDX_TO_OFF(*ap->a_rahead) >=
+		    object->un_pager.vnp.vnp_size) {
+			*ap->a_rahead = OFF_TO_IDX(roundup2(object->un_pager.
+			    vnp.vnp_size, PAGE_SIZE) - la);
+		}
+	}
+	VM_OBJECT_WLOCK(object);
+again:
+	for (i = 0; i < count; i++)
+		vm_page_busy_downgrade(ma[i]);
+	VM_OBJECT_WUNLOCK(object);
+
+	lbnp = -1;
+	error = 0;
+	for (i = 0; i < count; i++) {
+		m = ma[i];
+
+		/*
+		 * Pages are shared busy and the object lock is not
+		 * owned, which together allow for the pages'
+		 * invalidation.  The racy test for validity avoids
+		 * useless creation of the buffer in the most typical
+		 * case, when no invalidation occurred during a redo
+		 * pass or due to a parallel read.  The shared->excl
+		 * upgrade loop at the end of the function catches
+		 * the race in a reliable way (protected by the
+		 * object lock).
+		 */
+		if (m->valid == VM_PAGE_BITS_ALL)
+			continue;
+
+		lbn = lblkno(um->um_fs, IDX_TO_OFF(m->pindex));
+		if (lbn != lbnp) {
+			bsize = blksize(um->um_fs, VTOI(vp), lbn);
+			error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED,
+			    &bp);
+			if (error != 0)
+				break;
+			KASSERT(1 /* racy, enable for debugging */ ||
+			    m->valid == VM_PAGE_BITS_ALL || i == count - 1,
+			    ("buf %d %p invalid", i, m));
+			if (i == count - 1 && lpart) {
+				VM_OBJECT_WLOCK(object);
+				if (m->valid != 0 &&
+				    m->valid != VM_PAGE_BITS_ALL)
+					vm_page_zero_invalid(m, TRUE);
+				VM_OBJECT_WUNLOCK(object);
+			}
+			if (LIST_EMPTY(&bp->b_dep)) {
+				/*
+				 * Invalidation clears m->valid, but
+				 * may leave the B_CACHE flag set if
+				 * the buffer existed at invalidation
+				 * time.  In that case, recycle the
+				 * buffer so that the next bread()
+				 * after a redo does a real read.
+				 *
+				 * Otherwise B_RELBUF is not strictly
+				 * necessary; enable it to reduce
+				 * buffer cache pressure.
+				 */
+				if (buf_pager_relbuf ||
+				    m->valid != VM_PAGE_BITS_ALL)
+					bp->b_flags |= B_RELBUF;
+
+				bp->b_flags &= ~B_NOCACHE;
+				brelse(bp);
+			} else {
+				bqrelse(bp);
+			}
+			lbnp = lbn;
+		}
+	}
+
+	VM_OBJECT_WLOCK(object);
+	redo = false;
+	for (i = 0; i < count; i++) {
+		vm_page_sunbusy(ma[i]);
+		ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
+
+		/*
+		 * Since the pages were only sbusy while neither the
+		 * buffer nor the object lock was held by us, or were
+		 * reallocated while vm_page_grab() slept waiting for
+		 * the busy state to be relinquished, they could have
+		 * been invalidated.  Recheck the valid bits and
+		 * re-read as needed.
+		 *
+		 * Note that the last page is made fully valid in the
+		 * read loop, and partial validity for the page at
+		 * index count - 1 could mean that the page was
+		 * invalidated or removed, so we must restart for
+		 * safety as well.
+		 */
+		if (ma[i]->valid != VM_PAGE_BITS_ALL)
+			redo = true;
+	}
+	if (redo && error == 0)
+		goto again;
+	VM_OBJECT_WUNLOCK(object);
+	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
+}
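
The read-behind/read-ahead reporting above reduces to rounding the byte
offsets of the first and last requested pages to the enclosing device
block.  A minimal stand-alone sketch of that arithmetic, using the
rounddown2()/roundup2() macros with the same definitions as FreeBSD's
sys/param.h (the 32K block size, 4K page size, and page indexes below
are illustrative values, not taken from the commit):

#include <stdio.h>
#include <stdint.h>

/* Same definitions as sys/param.h; y must be a power of two. */
#define	rounddown2(x, y) ((x) & ~((y) - 1))
#define	roundup2(x, y)	 (((x) + ((y) - 1)) & ~((y) - 1))

#define	PAGE_SHIFT	12			/* assume 4K pages */
#define	IDX_TO_OFF(i)	((uint64_t)(i) << PAGE_SHIFT)
#define	OFF_TO_IDX(o)	((o) >> PAGE_SHIFT)

int
main(void)
{
	uint64_t bo_bs = 32768;		/* hypothetical 32K device block */
	uint64_t lb = IDX_TO_OFF(3);	/* first requested page */
	uint64_t la = IDX_TO_OFF(5);	/* last requested page */

	/* Pages between the block start and the first requested page. */
	uint64_t rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
	/* Page-lengths from the start of the last requested page to the
	   block end, per the same formula as ffs_getpages(). */
	uint64_t rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);

	/* Prints "rbehind 3 rahead 3": pages 0-2 precede the run inside
	   the 32K block, and the block read extends through page 7. */
	printf("rbehind %ju rahead %ju\n", (uintmax_t)rbehind,
	    (uintmax_t)rahead);
	return (0);
}

The sketch shows only the power-of-two rounding; the additional clamping
of the read-ahead against vnp_size done in ffs_getpages() is omitted.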


