Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 17 May 2001 21:40:44 -0700 (PDT)
From:      Matt Dillon <dillon@earth.backplane.com>
To:        Tor.Egge@fast.no
Cc:        arch@FreeBSD.ORG
Subject:   Final O_DIRECT patch (first stage, without rawread/rawwrite)
Message-ID:  <200105180440.f4I4eiB05429@earth.backplane.com>
References:  <200105162222.f4GMMpC81247@earth.backplane.com> <200105162331.BAA04708@midten.fast.no>

next in thread | previous in thread | raw e-mail | index | archive | help
    Ok, I've done some pretty good testing of this patch.  The problem
    with write() not freeing the buffer was due to the clustering code:
    bdwrite() called bqrelse(), which cleared B_RELBUF and B_DIRECT.  That
    was easy to fix.  This patch should cause O_DIRECT I/O to operate
    without polluting the buffer cache.  As an added bonus, I managed to
    keep the write clustering code intact, so the I/O should be mostly
    optimal (as far as that goes).

    The patch below is for -stable.  I will continue testing it on -stable
    through the weekend.  I'll probably commit the -current version over
    the weekend and the -stable version the weekend after.

    This is the first stage.  The second stage will be to figure out how
    best to implement the zero-copy rawread/rawwrite functionality using
    Tor's code as a reference point.

						-Matt

Index: kern/vfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.242.2.7
diff -u -r1.242.2.7 vfs_bio.c
--- kern/vfs_bio.c	2001/03/02 16:45:12	1.242.2.7
+++ kern/vfs_bio.c	2001/05/18 04:32:52
@@ -1230,7 +1230,7 @@
 
 	/* unlock */
 	BUF_UNLOCK(bp);
-	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
 	splx(s);
 }
 
@@ -1242,6 +1242,8 @@
  * biodone() to requeue an async I/O on completion.  It is also used when
  * known good buffers need to be requeued but we think we may need the data
  * again soon.
+ *
+ * XXX we should be able to leave the B_RELBUF hint set on completion.
  */
 void
 bqrelse(struct buf * bp)
@@ -1328,12 +1330,15 @@
 			vm_page_flag_clear(m, PG_ZERO);
 			/*
 			 * Might as well free the page if we can and it has
-			 * no valid data.
+			 * no valid data.  We also free the page if the
+			 * buffer was used for direct I/O
 			 */
 			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
 				vm_page_busy(m);
 				vm_page_protect(m, VM_PROT_NONE);
 				vm_page_free(m);
+			} else if (bp->b_flags & B_DIRECT) {
+				vm_page_try_to_free(m);
 			} else if (vm_page_count_severe()) {
 				vm_page_try_to_cache(m);
 			}
Index: kern/vfs_cluster.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_cluster.c,v
retrieving revision 1.92.2.5
diff -u -r1.92.2.5 vfs_cluster.c
--- kern/vfs_cluster.c	2001/03/02 16:45:12	1.92.2.5
+++ kern/vfs_cluster.c	2001/05/18 04:33:46
@@ -490,6 +490,15 @@
 		} else {
 			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
 			tbp->b_flags &= ~(B_ERROR|B_INVAL);
+			/*
+			 * XXX the bdwrite()/bqrelse() issued during
+			 * cluster building clears B_RELBUF (see bqrelse()
+			 * comment).  If direct I/O was specified, we have
+			 * to restore it here to allow the buffer and VM
+			 * to be freed.
+			 */
+			if (tbp->b_flags & B_DIRECT)
+			    tbp->b_flags |= B_RELBUF;
 		}
 		biodone(tbp);
 	}
Index: kern/vfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.87.2.6
diff -u -r1.87.2.6 vfs_vnops.c
--- kern/vfs_vnops.c	2001/02/26 04:23:16	1.87.2.6
+++ kern/vfs_vnops.c	2001/05/17 05:17:55
@@ -334,6 +334,8 @@
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
+	if (fp->f_flag & O_DIRECT)
+		ioflag |= IO_DIRECT;
 	VOP_LEASE(vp, p, cred, LEASE_READ);
 	vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
 	if ((flags & FOF_OFFSET) == 0)
@@ -374,6 +376,8 @@
 		ioflag |= IO_APPEND;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
+	if (fp->f_flag & O_DIRECT)
+		ioflag |= IO_DIRECT;
 	if ((fp->f_flag & O_FSYNC) ||
 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
 		ioflag |= IO_SYNC;
Index: sys/buf.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/buf.h,v
retrieving revision 1.88.2.3
diff -u -r1.88.2.3 buf.h
--- sys/buf.h	2000/12/30 01:51:10	1.88.2.3
+++ sys/buf.h	2001/05/18 04:02:02
@@ -191,12 +191,16 @@
  *			if b_bufsize and b_bcount are not.  ( b_bufsize is 
  *			always at least DEV_BSIZE aligned, though ).
  *	
+ *	B_DIRECT	Hint that we should attempt to completely free
+ *			the pages underlying the buffer.   B_DIRECT is 
+ *			sticky until the buffer is released and typically
+ *			only has an effect when B_RELBUF is also set.
  */
 
 #define	B_AGE		0x00000001	/* Move to age queue when I/O done. */
 #define	B_NEEDCOMMIT	0x00000002	/* Append-write in progress. */
 #define	B_ASYNC		0x00000004	/* Start I/O, do not wait. */
-#define	B_UNUSED0	0x00000008	/* Old B_BAD */
+#define	B_DIRECT	0x00000008	/* direct I/O flag (pls free vmio) */
 #define	B_DEFERRED	0x00000010	/* Skipped over for cleaning */
 #define	B_CACHE		0x00000020	/* Bread found us in the cache. */
 #define	B_CALL		0x00000040	/* Call b_iodone from biodone. */
@@ -231,7 +235,7 @@
 	"\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \
 	"\25read\24raw\23phys\22clusterok\21malloc\20nocache" \
 	"\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \
-	"\10delwri\7call\6cache\4bad\3async\2needcommit\1age"
+	"\10delwri\7call\6cache\4direct\3async\2needcommit\1age"
 
 /*
  * These flags are kept in b_xflags.
Index: sys/fcntl.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/fcntl.h,v
retrieving revision 1.9.2.1
diff -u -r1.9.2.1 fcntl.h
--- sys/fcntl.h	2000/08/22 01:46:30	1.9.2.1
+++ sys/fcntl.h	2001/05/17 04:01:47
@@ -98,15 +98,18 @@
 /* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */
 #define	O_NOCTTY	0x8000		/* don't assign controlling terminal */
 
+/* Attempt to bypass buffer cache */
+#define O_DIRECT	0x00010000
+
 #ifdef _KERNEL
 /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
 #define	FFLAGS(oflags)	((oflags) + 1)
 #define	OFLAGS(fflags)	((fflags) - 1)
 
 /* bits to save after open */
-#define	FMASK		(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK)
+#define	FMASK		(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT)
 /* bits settable by fcntl(F_SETFL, ...) */
-#define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM)
+#define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT)
 #endif
 
 /*
Index: sys/file.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/file.h,v
retrieving revision 1.22.2.5
diff -u -r1.22.2.5 file.h
--- sys/file.h	2001/02/26 04:23:21	1.22.2.5
+++ sys/file.h	2001/05/17 04:34:53
@@ -56,15 +56,14 @@
  */
 struct file {
 	LIST_ENTRY(file) f_list;/* list of active files */
-	short	f_flag;		/* see fcntl.h */
+	short	f_FILLER3;	/* (old f_flag) */
 #define	DTYPE_VNODE	1	/* file */
 #define	DTYPE_SOCKET	2	/* communications endpoint */
 #define	DTYPE_PIPE	3	/* pipe */
 #define	DTYPE_FIFO	4	/* fifo (named pipe) */
 #define	DTYPE_KQUEUE	5	/* event queue */
 	short	f_type;		/* descriptor type */
-	short	f_FILLER1;	/* (OLD) reference count */
-	short	f_FILLER2;	/* (OLD) references from message queue */
+	u_int	f_flag;		/* see fcntl.h */
 	struct	ucred *f_cred;	/* credentials associated with descriptor */
 	struct	fileops {
 		int	(*fo_read)	__P((struct file *fp, struct uio *uio,
Index: sys/vnode.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/vnode.h,v
retrieving revision 1.111.2.4
diff -u -r1.111.2.4 vnode.h
--- sys/vnode.h	2000/12/30 01:51:10	1.111.2.4
+++ sys/vnode.h	2001/05/17 04:49:14
@@ -213,6 +213,7 @@
 #define	IO_VMIO		0x20		/* data already in VMIO space */
 #define	IO_INVAL	0x40		/* invalidate after I/O */
 #define IO_ASYNC	0x80		/* bawrite rather then bdwrite */
+#define IO_DIRECT	0x100		/* attempt to bypass buffer cache */
 
 /*
  *  Modes.  Some values same as Ixxx entries from inode.h for now.
Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.65.2.6
diff -u -r1.65.2.6 ufs_readwrite.c
--- ufs/ufs/ufs_readwrite.c	2000/12/30 01:51:11	1.65.2.6
+++ ufs/ufs/ufs_readwrite.c	2001/05/18 03:51:55
@@ -278,6 +278,15 @@
 		}
 
 		/*
+		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
+		 * will cause us to attempt to release the buffer later on
+		 * and will cause the buffer cache to attempt to free the
+		 * underlying pages.
+		 */
+		if (ioflag & IO_DIRECT)
+			bp->b_flags |= B_DIRECT;
+
+		/*
 		 * We should only get non-zero b_resid when an I/O error
 		 * has occurred, which should cause us to break above.
 		 * However, if the short read did not cause an error,
@@ -319,12 +328,12 @@
 		if (error)
 			break;
 
-		if ((ioflag & IO_VMIO) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL)) {
+		if ((ioflag & (IO_VMIO|IO_DIRECT)) && 
+		    (LIST_FIRST(&bp->b_dep) == NULL)) {
 			/*
-			 * If there are no dependencies, and
-			 * it's VMIO, then we don't need the buf,
-			 * mark it available for freeing. The VM has the data.
+			 * If there are no dependencies, and it's VMIO,
+			 * then we don't need the buf, mark it available
+			 * for freeing. The VM has the data.
 			 */
 			bp->b_flags |= B_RELBUF;
 			brelse(bp);
@@ -346,8 +355,8 @@
 	 * so it must have come from a 'break' statement
 	 */
 	if (bp != NULL) {
-		if ((ioflag & IO_VMIO) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL)) {
+		if ((ioflag & (IO_VMIO|IO_DIRECT)) && 
+		    (LIST_FIRST(&bp->b_dep) == NULL)) {
 			bp->b_flags |= B_RELBUF;
 			brelse(bp);
 		} else {
@@ -486,6 +495,8 @@
 		    ap->a_cred, flags, &bp);
 		if (error != 0)
 			break;
+		if (ioflag & IO_DIRECT)
+			bp->b_flags |= B_DIRECT;
 
 		if (uio->uio_offset + xfersize > ip->i_size) {
 			ip->i_size = uio->uio_offset + xfersize;
@@ -498,10 +509,19 @@
 
 		error =
 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
-		if ((ioflag & IO_VMIO) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL))
+		if ((ioflag & (IO_VMIO|IO_DIRECT)) && 
+		    (LIST_FIRST(&bp->b_dep) == NULL)) {
 			bp->b_flags |= B_RELBUF;
+		}
 
+		/*
+		 * If IO_SYNC each buffer is written synchronously.  Otherwise
+		 * if we have a severe page deficiency write the buffer 
+		 * asynchronously.  Otherwise try to cluster, and if that
+		 * doesn't do it then either do an async write (if O_DIRECT),
+		 * or a delayed write (if not).
+		 */
+
 		if (ioflag & IO_SYNC) {
 			(void)bwrite(bp);
 		} else if (vm_page_count_severe() || 
@@ -516,6 +536,9 @@
 			} else {
 				bawrite(bp);
 			}
+		} else if (ioflag & IO_DIRECT) {
+			bp->b_flags |= B_CLUSTEROK;
+			bawrite(bp);
 		} else {
 			bp->b_flags |= B_CLUSTEROK;
 			bdwrite(bp);
Index: vm/vm_page.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.c,v
retrieving revision 1.147.2.6
diff -u -r1.147.2.6 vm_page.c
--- vm/vm_page.c	2001/03/03 23:06:09	1.147.2.6
+++ vm/vm_page.c	2001/05/17 04:22:38
@@ -1353,6 +1353,31 @@
 }
 
 /*
+ * vm_page_try_to_free()
+ *
+ *	Attempt to free the page.  If we cannot free it, we do nothing.
+ *	1 is returned on success, 0 on failure.
+ */
+
+int
+vm_page_try_to_free(m)
+	vm_page_t m;
+{
+	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
+	    (m->flags & (PG_BUSY|PG_UNMANAGED))) {
+		return(0);
+	}
+	vm_page_test_dirty(m);
+	if (m->dirty)
+		return(0);
+	vm_page_busy(m);
+	vm_page_protect(m, VM_PROT_NONE);
+	vm_page_free(m);
+	return(1);
+}
+
+
+/*
  * vm_page_cache
  *
  * Put the specified page onto the page cache queue (if appropriate).
Index: vm/vm_page.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.h,v
retrieving revision 1.75.2.5
diff -u -r1.75.2.5 vm_page.h
--- vm/vm_page.h	2000/12/30 01:51:11	1.75.2.5
+++ vm/vm_page.h	2001/05/17 04:23:05
@@ -406,6 +406,7 @@
 vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
 void vm_page_cache __P((register vm_page_t));
 int vm_page_try_to_cache __P((vm_page_t));
+int vm_page_try_to_free __P((vm_page_t));
 void vm_page_dontneed __P((register vm_page_t));
 static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
 static __inline void vm_page_free __P((vm_page_t));

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200105180440.f4I4eiB05429>