Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 16 May 2001 23:47:46 -0700 (PDT)
From:      Matt Dillon <dillon@earth.backplane.com>
To:        Tor.Egge@fast.no
Cc:        arch@FreeBSD.ORG
Subject:   Preliminary O_DIRECT patch (for review only, not yet tested!)
Message-ID:  <200105170647.f4H6lkk88458@earth.backplane.com>
References:  <200105162222.f4GMMpC81247@earth.backplane.com> <200105162331.BAA04708@midten.fast.no>

next in thread | previous in thread | raw e-mail | index | archive | help
    This is my preliminary O_DIRECT patch so far, against -stable at the
    moment (obviously it will be committed to -current first, but I have
    to test it on -stable).  It seems to work for reads.  It doesn't work
    for writes yet (the buffers still get cached).

    Basically it takes Tor's infrastructure with some minor modifications,
    removes the rawread/rawwrite stuff, and then adds a B_DIRECT flag
    to the buffer cache.  write()s are converted to synchronous writes,
    and both read()s and write()s release the buffer and attempt to
    completely free the underlying VM pages.

    I need to figure out how to free underlying buffers/VM for write()
    operations before I can commit any of this.  It could be a while.

    --

    I've looked at the rawread/rawwrite issue and I believe it may be 
    possible to use the already-existing B_VMIO flag coupled with 
    some VM magic to achieve the equivalent in the buffer cache itself
    rather than having to write a rawread/rawwrite function for each
    filesystem.  Filesystems already support B_VMIO.  If it is possible,
    then we'll have a general raw I/O solution.


						-Matt

Index: kern/vfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.242.2.7
diff -u -r1.242.2.7 vfs_bio.c
--- kern/vfs_bio.c	2001/03/02 16:45:12	1.242.2.7
+++ kern/vfs_bio.c	2001/05/17 04:21:37
@@ -1230,7 +1230,7 @@
 
 	/* unlock */
 	BUF_UNLOCK(bp);
-	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
 	splx(s);
 }
 
@@ -1296,7 +1296,7 @@
 
 	/* unlock */
 	BUF_UNLOCK(bp);
-	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
 	splx(s);
 }
 
@@ -1328,12 +1328,15 @@
 			vm_page_flag_clear(m, PG_ZERO);
 			/*
 			 * Might as well free the page if we can and it has
-			 * no valid data.
+			 * no valid data.  We also free the page if the
+			 * buffer was used for direct I/O
 			 */
 			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
 				vm_page_busy(m);
 				vm_page_protect(m, VM_PROT_NONE);
 				vm_page_free(m);
+			} else if (bp->b_flags & B_DIRECT) {
+				vm_page_try_to_free(m);
 			} else if (vm_page_count_severe()) {
 				vm_page_try_to_cache(m);
 			}
@@ -2187,7 +2190,7 @@
 		}
 
 		splx(s);
-		bp->b_flags &= ~B_DONE;
+		bp->b_flags &= ~(B_DONE | B_DIRECT);
 	} else {
 		/*
 		 * Buffer is not in-core, create new buffer.  The buffer
@@ -2267,7 +2270,7 @@
 		allocbuf(bp, size);
 
 		splx(s);
-		bp->b_flags &= ~B_DONE;
+		bp->b_flags &= ~(B_DONE | B_DIRECT);
 	}
 	return (bp);
 }
Index: kern/vfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.87.2.6
diff -u -r1.87.2.6 vfs_vnops.c
--- kern/vfs_vnops.c	2001/02/26 04:23:16	1.87.2.6
+++ kern/vfs_vnops.c	2001/05/17 05:17:55
@@ -334,6 +334,8 @@
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
+	if (fp->f_flag & O_DIRECT)
+		ioflag |= IO_DIRECT;
 	VOP_LEASE(vp, p, cred, LEASE_READ);
 	vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
 	if ((flags & FOF_OFFSET) == 0)
@@ -374,6 +376,8 @@
 		ioflag |= IO_APPEND;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
+	if (fp->f_flag & O_DIRECT)
+		ioflag |= IO_DIRECT;
 	if ((fp->f_flag & O_FSYNC) ||
 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
 		ioflag |= IO_SYNC;
Index: sys/buf.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/buf.h,v
retrieving revision 1.88.2.3
diff -u -r1.88.2.3 buf.h
--- sys/buf.h	2000/12/30 01:51:10	1.88.2.3
+++ sys/buf.h	2001/05/17 04:18:35
@@ -191,12 +191,14 @@
  *			if b_bufsize and b_bcount are not.  ( b_bufsize is 
  *			always at least DEV_BSIZE aligned, though ).
  *	
+ *	B_DIRECT	Hint (along with B_RELBUF) that we should attempt to
+ *			completely free the pages underlying the buffer.
  */
 
 #define	B_AGE		0x00000001	/* Move to age queue when I/O done. */
 #define	B_NEEDCOMMIT	0x00000002	/* Append-write in progress. */
 #define	B_ASYNC		0x00000004	/* Start I/O, do not wait. */
-#define	B_UNUSED0	0x00000008	/* Old B_BAD */
+#define	B_DIRECT	0x00000008	/* direct I/O flag (pls free vmio) */
 #define	B_DEFERRED	0x00000010	/* Skipped over for cleaning */
 #define	B_CACHE		0x00000020	/* Bread found us in the cache. */
 #define	B_CALL		0x00000040	/* Call b_iodone from biodone. */
@@ -231,7 +233,7 @@
 	"\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \
 	"\25read\24raw\23phys\22clusterok\21malloc\20nocache" \
 	"\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \
-	"\10delwri\7call\6cache\4bad\3async\2needcommit\1age"
+	"\10delwri\7call\6cache\4direct\3async\2needcommit\1age"
 
 /*
  * These flags are kept in b_xflags.
Index: sys/fcntl.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/fcntl.h,v
retrieving revision 1.9.2.1
diff -u -r1.9.2.1 fcntl.h
--- sys/fcntl.h	2000/08/22 01:46:30	1.9.2.1
+++ sys/fcntl.h	2001/05/17 04:01:47
@@ -98,15 +98,18 @@
 /* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */
 #define	O_NOCTTY	0x8000		/* don't assign controlling terminal */
 
+/* Attempt to bypass buffer cache */
+#define O_DIRECT	0x00010000
+
 #ifdef _KERNEL
 /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
 #define	FFLAGS(oflags)	((oflags) + 1)
 #define	OFLAGS(fflags)	((fflags) - 1)
 
 /* bits to save after open */
-#define	FMASK		(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK)
+#define	FMASK		(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT)
 /* bits settable by fcntl(F_SETFL, ...) */
-#define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM)
+#define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT)
 #endif
 
 /*
Index: sys/file.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/file.h,v
retrieving revision 1.22.2.5
diff -u -r1.22.2.5 file.h
--- sys/file.h	2001/02/26 04:23:21	1.22.2.5
+++ sys/file.h	2001/05/17 04:34:53
@@ -56,15 +56,14 @@
  */
 struct file {
 	LIST_ENTRY(file) f_list;/* list of active files */
-	short	f_flag;		/* see fcntl.h */
+	short	f_FILLER3;	/* (old f_flag) */
 #define	DTYPE_VNODE	1	/* file */
 #define	DTYPE_SOCKET	2	/* communications endpoint */
 #define	DTYPE_PIPE	3	/* pipe */
 #define	DTYPE_FIFO	4	/* fifo (named pipe) */
 #define	DTYPE_KQUEUE	5	/* event queue */
 	short	f_type;		/* descriptor type */
-	short	f_FILLER1;	/* (OLD) reference count */
-	short	f_FILLER2;	/* (OLD) references from message queue */
+	u_int	f_flag;		/* see fcntl.h */
 	struct	ucred *f_cred;	/* credentials associated with descriptor */
 	struct	fileops {
 		int	(*fo_read)	__P((struct file *fp, struct uio *uio,
Index: sys/vnode.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/vnode.h,v
retrieving revision 1.111.2.4
diff -u -r1.111.2.4 vnode.h
--- sys/vnode.h	2000/12/30 01:51:10	1.111.2.4
+++ sys/vnode.h	2001/05/17 04:49:14
@@ -213,6 +213,7 @@
 #define	IO_VMIO		0x20		/* data already in VMIO space */
 #define	IO_INVAL	0x40		/* invalidate after I/O */
 #define IO_ASYNC	0x80		/* bawrite rather then bdwrite */
+#define IO_DIRECT	0x100		/* attempt to bypass buffer cache */
 
 /*
  *  Modes.  Some values same as Ixxx entries from inode.h for now.
Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.65.2.6
diff -u -r1.65.2.6 ufs_readwrite.c
--- ufs/ufs/ufs_readwrite.c	2000/12/30 01:51:11	1.65.2.6
+++ ufs/ufs/ufs_readwrite.c	2001/05/17 06:26:16
@@ -278,6 +278,15 @@
 		}
 
 		/*
+		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
+		 * will cause us to attempt to release the buffer later on
+		 * and will cause the buffer cache to attempt to free the
+		 * underlying pages.
+		 */
+		if (ioflag & IO_DIRECT)
+			bp->b_flags |= B_DIRECT;
+
+		/*
 		 * We should only get non-zero b_resid when an I/O error
 		 * has occurred, which should cause us to break above.
 		 * However, if the short read did not cause an error,
@@ -319,12 +328,12 @@
 		if (error)
 			break;
 
-		if ((ioflag & IO_VMIO) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL)) {
+		if ((ioflag & (IO_VMIO|IO_DIRECT)) && 
+		    (LIST_FIRST(&bp->b_dep) == NULL)) {
 			/*
-			 * If there are no dependencies, and
-			 * it's VMIO, then we don't need the buf,
-			 * mark it available for freeing. The VM has the data.
+			 * If there are no dependencies, and it's VMIO,
+			 * then we don't need the buf, mark it available
+			 * for freeing. The VM has the data.
 			 */
 			bp->b_flags |= B_RELBUF;
 			brelse(bp);
@@ -346,8 +355,8 @@
 	 * so it must have come from a 'break' statement
 	 */
 	if (bp != NULL) {
-		if ((ioflag & IO_VMIO) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL)) {
+		if ((ioflag & (IO_VMIO|IO_DIRECT)) && 
+		    (LIST_FIRST(&bp->b_dep) == NULL)) {
 			bp->b_flags |= B_RELBUF;
 			brelse(bp);
 		} else {
@@ -449,7 +458,7 @@
 	resid = uio->uio_resid;
 	osize = ip->i_size;
 	flags = 0;
-	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
+	if ((ioflag & (IO_SYNC|IO_DIRECT)) && !DOINGASYNC(vp))
 		flags = B_SYNC;
 
 	if (object && (object->flags & OBJ_OPT)) {
@@ -486,6 +495,8 @@
 		    ap->a_cred, flags, &bp);
 		if (error != 0)
 			break;
+		if (ioflag & IO_DIRECT)
+			bp->b_flags |= B_DIRECT;
 
 		if (uio->uio_offset + xfersize > ip->i_size) {
 			ip->i_size = uio->uio_offset + xfersize;
@@ -498,11 +509,12 @@
 
 		error =
 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
-		if ((ioflag & IO_VMIO) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL))
+		if ((ioflag & (IO_VMIO|IO_DIRECT)) && 
+		    (LIST_FIRST(&bp->b_dep) == NULL)) {
 			bp->b_flags |= B_RELBUF;
+		}
 
-		if (ioflag & IO_SYNC) {
+		if (ioflag & (IO_SYNC|IO_DIRECT)) {
 			(void)bwrite(bp);
 		} else if (vm_page_count_severe() || 
 			    buf_dirty_count_severe() ||
Index: vm/vm_page.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.c,v
retrieving revision 1.147.2.6
diff -u -r1.147.2.6 vm_page.c
--- vm/vm_page.c	2001/03/03 23:06:09	1.147.2.6
+++ vm/vm_page.c	2001/05/17 04:22:38
@@ -1353,6 +1353,31 @@
 }
 
 /*
+ * vm_page_try_to_free()
+ *
+ *	Attempt to free the page.  If we cannot free it, we do nothing.
+ *	1 is returned on success, 0 on failure.
+ */
+
+int
+vm_page_try_to_free(m)
+	vm_page_t m;
+{
+	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
+	    (m->flags & (PG_BUSY|PG_UNMANAGED))) {
+		return(0);
+	}
+	vm_page_test_dirty(m);
+	if (m->dirty)
+		return(0);
+	vm_page_busy(m);
+	vm_page_protect(m, VM_PROT_NONE);
+	vm_page_free(m);
+	return(1);
+}
+
+
+/*
  * vm_page_cache
  *
  * Put the specified page onto the page cache queue (if appropriate).
Index: vm/vm_page.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.h,v
retrieving revision 1.75.2.5
diff -u -r1.75.2.5 vm_page.h
--- vm/vm_page.h	2000/12/30 01:51:11	1.75.2.5
+++ vm/vm_page.h	2001/05/17 04:23:05
@@ -406,6 +406,7 @@
 vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
 void vm_page_cache __P((register vm_page_t));
 int vm_page_try_to_cache __P((vm_page_t));
+int vm_page_try_to_free __P((vm_page_t));
 void vm_page_dontneed __P((register vm_page_t));
 static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
 static __inline void vm_page_free __P((vm_page_t));

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200105170647.f4H6lkk88458>