Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 27 Feb 2012 19:00:55 GMT
From:      John Baldwin <jhb@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 206989 for review
Message-ID:  <201202271900.q1RJ0toJ064427@skunkworks.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://p4web.freebsd.org/@@206989?ac=10

Change 206989 by jhb@jhb_jhbbsd on 2012/02/27 19:00:32

	Import my current WIP to implement POSIX_FADV_WILLNEED for UFS.

Affected files ...

.. //depot/projects/fadvise/sys/kern/vfs_bio.c#5 edit
.. //depot/projects/fadvise/sys/kern/vfs_cluster.c#3 edit
.. //depot/projects/fadvise/sys/sys/buf.h#2 edit
.. //depot/projects/fadvise/sys/ufs/ffs/ffs_vnops.c#3 edit

Differences ...

==== //depot/projects/fadvise/sys/kern/vfs_bio.c#5 (text+ko) ====

@@ -2664,8 +2664,10 @@
 		if (error == ENOLCK)
 			goto loop;
 		/* We timed out or were interrupted. */
-		else if (error)
+		else if (error) {
+			CTR4(KTR_BUF, "getblk(%p, %ld, %d) failed %d", vp, (long)blkno, size, error);
 			return (NULL);
+		}
 
 		/*
 		 * The buffer is locked.  B_CACHE is cleared if the buffer is 
@@ -2787,8 +2789,16 @@
 
 		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
 		if (bp == NULL) {
-			if (slpflag || slptimeo)
+			/*
+			 * XXX: Should this also return NULL if
+			 * GB_NOWAIT_BD is set?
+			 */
+			if (slpflag || slptimeo) {
+				CTR3(KTR_BUF,
+				    "getblk(%p, %ld, %d) failed getnewbuf()",
+				    vp, (long)blkno, size);
 				return NULL;
+			}
 			goto loop;
 		}
 

==== //depot/projects/fadvise/sys/kern/vfs_cluster.c#3 (text+ko) ====

@@ -39,6 +39,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
+#include <sys/ktr.h>
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
@@ -64,8 +65,10 @@
 	cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
 static struct buf *
 	cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
-			 daddr_t blkno, long size, int run, struct buf *fbp);
+	    daddr_t blkno, long size, int run, struct buf *fbp, int gbflags);
 static void cluster_callback(struct buf *);
+static void cluster_ra(struct vnode *vp, u_quad_t filesize, daddr_t flbn,
+	    daddr_t elbn, long size, int racluster, int gbflags);
 
 static int write_behind = 1;
 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
@@ -75,6 +78,19 @@
 SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
     "Cluster read-ahead max block count");
 
+SYSCTL_NODE(_vfs, OID_AUTO, cluster, CTLFLAG_RD, NULL,
+    "Cluster read-ahead statistics");
+/* Debug counters bumped by cluster_ra() and cluster_rbuild(). */
+static int ra_fails, rbuild_fails, ra_clusters, ra_singles;
+SYSCTL_INT(_vfs_cluster, OID_AUTO, ra_fails, CTLFLAG_RD, &ra_fails, 0,
+    "Read-ahead getblk() failures");
+SYSCTL_INT(_vfs_cluster, OID_AUTO, rbuild_fails, CTLFLAG_RD, &rbuild_fails, 0,
+    "cluster_rbuild() getblk() failures");
+SYSCTL_INT(_vfs_cluster, OID_AUTO, ra_clusters, CTLFLAG_RD, &ra_clusters, 0,
+    "Read-ahead clusters scheduled");
+SYSCTL_INT(_vfs_cluster, OID_AUTO, ra_singles, CTLFLAG_RD, &ra_singles, 0,
+    "Read-ahead single blocks scheduled");
+
 /* Page expended to mark partially backed buffers */
 extern vm_page_t	bogus_page;
 
@@ -208,7 +224,7 @@
 			if (ncontig < nblks)
 				nblks = ncontig;
 			bp = cluster_rbuild(vp, filesize, lblkno,
-				blkno, size, nblks, bp);
+			    blkno, size, nblks, bp, 0);
 			lblkno += (bp->b_bufsize / size);
 		} else {
 			bp->b_flags |= B_RAM;
@@ -236,11 +252,69 @@
 	/*
 	 * If we have been doing sequential I/O, then do some read-ahead.
 	 */
-	while (lblkno < (origblkno + maxra)) {
+	cluster_ra(vp, filesize, lblkno, origblkno + maxra, size, racluster, 0);
+
+	if (reqbp)
+		return (bufwait(reqbp));
+	else
+		return (error);
+}
+
+/*
+ * Schedule asynchronous clustered read-ahead starting at logical block
+ * 'lblkno' of 'vp', using 'size'-byte blocks.  Returns the size in bytes
+ * of the window it tried to schedule (never negative); this is 0 when
+ * no whole block is left before 'filesize', so callers must handle 0.
+ */
+long
+cluster_readahead(vp, filesize, lblkno, size)
+	struct vnode *vp;
+	u_quad_t filesize;
+	daddr_t lblkno;
+	long size;
+{
+	int maxra, racluster;
+
+	/* Cap the window by ad-hoc limits and by the whole blocks left. */
+	racluster = vp->v_mount->mnt_iosize_max / size;
+	maxra = min(nbuf/8, read_max);
+	if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
+		maxra = (u_quad_t)lblkno < filesize / size ?
+		    (filesize / size) - lblkno : 0;
+	CTR3(KTR_BUF, "cluster_readahead(%p, %ld) using maxra %d", vp,
+	    (long)lblkno, maxra);
+	cluster_ra(vp, filesize, lblkno, lblkno + maxra, size, racluster,
+	    /* GB_NOWAIT_BD | */ GB_LOCK_NOWAIT);
+	return (maxra * size);
+}
+
+static void
+cluster_ra(vp, filesize, flbn, elbn, size, racluster, gbflags)
+	struct vnode *vp;
+	u_quad_t filesize;
+	daddr_t flbn;
+	daddr_t elbn;
+	long size;
+	int racluster;
+	int gbflags;
+{
+	struct buf *rbp;
+	daddr_t blkno, lblkno;
+#ifdef KTR
+	daddr_t old;
+#endif
+	int error, ncontig;
+
+	for (lblkno = flbn; lblkno < elbn; ) {
+#ifdef KTR
+		old = lblkno;
+#endif
 		error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
 		if (error)
 			break;
 
+		CTR4(KTR_BUF, "cluster_ra: VOP_BMAP(%p, %ld) returned %ld, %d",
+		    vp, lblkno, blkno, ncontig);
 		if (blkno == -1)
 			break;
 
@@ -252,22 +326,46 @@
 		if (ncontig) {
 			ncontig = min(ncontig + 1, racluster);
 			rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
-				size, ncontig, NULL);
+			    size, ncontig, NULL, gbflags);
+			if (rbp == NULL) {
+				CTR2(KTR_BUF, "cluster_rbuild(%p, %ld) failed",
+				    vp, lblkno);
+				lblkno += 1;
+				continue;
+			}
 			lblkno += (rbp->b_bufsize / size);
 			if (rbp->b_flags & B_DELWRI) {
+				CTR2(KTR_BUF,
+				    "cluster_ra: cluster for %ld,%d has B_DELWRI",
+				    old, rbp->b_bufsize / size);
 				bqrelse(rbp);
 				continue;
 			}
+			CTR2(KTR_BUF,
+			    "cluster_ra: scheduling cluster %ld,%d",
+			    old, rbp->b_bufsize / size);
+			ra_clusters++;
 		} else {
-			rbp = getblk(vp, lblkno, size, 0, 0, 0);
+			rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
 			lblkno += 1;
+			if (rbp == NULL) {
+				CTR2(KTR_BUF,
+				    "cluster_ra: getblk(%p, %ld) failed",
+				    vp, lblkno);
+				ra_fails++;
+				continue;
+			}
 			if (rbp->b_flags & B_DELWRI) {
+				CTR1(KTR_BUF,
+				    "cluster_ra: block %ld has B_DELWRI", old);
 				bqrelse(rbp);
 				continue;
 			}
 			rbp->b_flags |= B_ASYNC | B_RAM;
 			rbp->b_iocmd = BIO_READ;
 			rbp->b_blkno = blkno;
+			CTR1(KTR_BUF, "cluster_ra: scheduling block %ld", old);
+			ra_singles++;
 		}
 		if (rbp->b_flags & B_CACHE) {
 			rbp->b_flags &= ~B_ASYNC;
@@ -285,11 +383,6 @@
 		bstrategy(rbp);
 		curthread->td_ru.ru_inblock++;
 	}
-
-	if (reqbp)
-		return (bufwait(reqbp));
-	else
-		return (error);
 }
 
 /*
@@ -298,7 +391,7 @@
  * and then parcel them up into logical blocks in the buffer hash table.
  */
 static struct buf *
-cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
+cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp, gbflags)
 	struct vnode *vp;
 	u_quad_t filesize;
 	daddr_t lbn;
@@ -306,6 +399,7 @@
 	long size;
 	int run;
 	struct buf *fbp;
+	int gbflags;
 {
 	struct bufobj *bo;
 	struct buf *bp, *tbp;
@@ -329,8 +423,10 @@
 		tbp = fbp;
 		tbp->b_iocmd = BIO_READ; 
 	} else {
-		tbp = getblk(vp, lbn, size, 0, 0, 0);
-		if (tbp->b_flags & B_CACHE)
+		tbp = getblk(vp, lbn, size, 0, 0, gbflags);
+		if (tbp == NULL)
+			rbuild_fails++;
+		if (tbp == NULL || tbp->b_flags & B_CACHE)
 			return tbp;
 		tbp->b_flags |= B_ASYNC | B_RAM;
 		tbp->b_iocmd = BIO_READ;

==== //depot/projects/fadvise/sys/sys/buf.h#2 (text+ko) ====

@@ -504,6 +504,7 @@
 
 int	cluster_read(struct vnode *, u_quad_t, daddr_t, long,
 	    struct ucred *, long, int, struct buf **);
+long	cluster_readahead(struct vnode *, u_quad_t, daddr_t, long);
 int	cluster_wbuild(struct vnode *, long, daddr_t, int);
 void	cluster_write(struct vnode *, struct buf *, u_quad_t, int);
 void	vfs_bio_set_valid(struct buf *, int base, int size);

==== //depot/projects/fadvise/sys/ufs/ffs/ffs_vnops.c#3 (text+ko) ====

@@ -70,6 +70,7 @@
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/extattr.h>
+#include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
@@ -100,6 +101,7 @@
 #ifdef DIRECTIO
 extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 #endif
+static vop_advise_t	ffs_advise;
 static vop_fsync_t	ffs_fsync;
 static vop_lock1_t	ffs_lock;
 static vop_getpages_t	ffs_getpages;
@@ -124,6 +126,7 @@
 	.vop_fsync =		ffs_fsync,
 	.vop_getpages =		ffs_getpages,
 	.vop_lock1 =		ffs_lock,
+	.vop_advise =		ffs_advise,
 	.vop_read =		ffs_read,
 	.vop_reallocblks =	ffs_reallocblks,
 	.vop_write =		ffs_write,
@@ -143,6 +146,7 @@
 	.vop_fsync =		ffs_fsync,
 	.vop_getpages =		ffs_getpages,
 	.vop_lock1 =		ffs_lock,
+	.vop_advise =		ffs_advise,
 	.vop_read =		ffs_read,
 	.vop_reallocblks =	ffs_reallocblks,
 	.vop_write =		ffs_write,
@@ -399,6 +403,96 @@
 #endif
 }
 
+/*
+ * Vnode op for file access advice.  POSIX_FADV_WILLNEED schedules
+ * read-ahead for the byte range [a_start, a_end] of a regular file,
+ * clamped to EOF; all other advice is passed to vop_stdadvise().
+ */
+static int
+ffs_advise(ap)
+	struct vop_advise_args /* {
+		struct vnode *a_vp;
+		off_t a_start;
+		off_t a_end;
+		int a_advice;
+	} */ *ap;
+{
+	struct vnode *vp;
+	struct inode *ip;
+	struct fs *fs;
+	off_t start, end;
+	size_t resid;
+	ufs_lbn_t lbn, endblkno;
+	long size, blkoffset, xfersize;
+	int rasize;
+
+	switch (ap->a_advice) {
+	case POSIX_FADV_WILLNEED:
+		vp = ap->a_vp;
+		start = ap->a_start;
+		end = ap->a_end;
+		vn_lock(vp, LK_SHARED | LK_RETRY);
+		if (vp->v_iflag & VI_DOOMED) {
+			VOP_UNLOCK(vp, 0);
+			return (EBADF);
+		}
+		KASSERT(vp->v_type == VREG, ("FADV_WILLNEED on bad vnode"));
+		ip = VTOI(vp);
+		/* Nothing to prefetch at or past EOF; clamp the end to EOF. */
+		if (start >= ip->i_size) {
+			VOP_UNLOCK(vp, 0);
+			return (0);
+		}
+		if (end >= ip->i_size)
+			end = ip->i_size - 1;
+		resid = end - start + 1;
+		fs = ip->i_fs;
+
+		/* HACK: Prefetch indirect blocks for this range. */
+		endblkno = lblkno(fs, end);
+		for (lbn = NDADDR; lbn < endblkno; lbn += NINDIR(fs))
+			breada(vp, &lbn, &fs->fs_bsize, 1, NOCRED);
+
+		while (resid > 0) {
+			/* Limit the number of read ahead buffers. */
+			if (runningbufspace > hibufspace / 2)
+				break;
+			lbn = lblkno(fs, start);
+			size = blksize(fs, ip, lbn);
+			blkoffset = blkoff(fs, start);
+			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
+				/* Clustered reads start on a block boundary. */
+				resid += blkoffset;
+				start -= blkoffset;
+				xfersize = cluster_readahead(vp, ip->i_size,
+				    lbn, size);
+			} else {
+				rasize = fs->fs_bsize - blkoffset;
+				if (resid < rasize)
+					rasize = resid;
+				breada(vp, &lbn, &rasize, 1, NOCRED);
+				xfersize = rasize;
+			}
+			/*
+			 * cluster_readahead() returns 0 once lbn reaches
+			 * i_size / size, and may cover more than resid.
+			 * resid is unsigned, so stop on no progress and
+			 * never subtract more than remains.
+			 */
+			if (xfersize <= 0)
+				break;
+			if ((size_t)xfersize > resid)
+				xfersize = resid;
+			resid -= xfersize;
+			start += xfersize;
+		}
+		VOP_UNLOCK(vp, 0);
+		return (0);
+	default:
+		return (vop_stdadvise(ap));
+	}
+}
+
 /*
  * Vnode op for reading.
  */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201202271900.q1RJ0toJ064427>