Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 16 May 2001 22:18:25 +0200
From:      Tor.Egge@fast.no
To:        dillon@earth.backplane.com
Cc:        arch@FreeBSD.ORG
Subject:   Re: on load control / process swapping
Message-ID:  <200105162018.WAA99982@midten.fast.no>
In-Reply-To: Your message of "Wed, 16 May 2001 11:01:24 -0700 (PDT)"
References:  <200105161801.f4GI1Oc73283@earth.backplane.com>

next in thread | previous in thread | raw e-mail | index | archive | help
> 
>     I think someone tried to implement O_DIRECT a while back, but it
>     was fairly complex to try to do away with caching entirely.
> 
>     I think our best bet to 'start' an implementation of O_DIRECT is
>     to support the flag in open() and fcntl(), and have it simply
>     modify the sequential detection heuristic to throw away pages
>     and buffers rather than simply depressing their priority.
> 
>     Eventually we can implement the direct-I/O piece of the equation.
> 
>     I could do this first part in an hour, I think.  When I get home....

I've used something like the following patch since FreeBSD 3.3-STABLE.

On a Dell 2450 machine running a FreeBSD 4.3-RELEASE SMP kernel it
increases idle time from 0% to 95% when running a test program with
100 threads that each read 256K from random sector-aligned locations
in a 10 GB file.  Read speed is increased from 120 MB/s to 160 MB/s.

This implementation is not semantically correct since it doesn't check
for dirty pages in the vm object.


Index: sys/sys/vnode.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/vnode.h,v
retrieving revision 1.150
diff -u -r1.150 vnode.h
--- sys/sys/vnode.h	2001/05/01 08:34:44	1.150
+++ sys/sys/vnode.h	2001/05/09 16:09:32
@@ -220,6 +220,7 @@
 #define	IO_VMIO		0x20		/* data already in VMIO space */
 #define	IO_INVAL	0x40		/* invalidate after I/O */
 #define	IO_ASYNC	0x80		/* bawrite rather then bdwrite */
+#define	IO_NOBUFFER	0x100		/* bypass buffer cache */
 
 /*
  *  Modes.  Some values same as Ixxx entries from inode.h for now.
Index: sys/sys/file.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/file.h,v
retrieving revision 1.28
diff -u -r1.28 file.h
--- sys/sys/file.h	2001/02/15 16:34:10	1.28
+++ sys/sys/file.h	2001/02/15 19:14:53
@@ -56,7 +56,7 @@
  */
 struct file {
 	LIST_ENTRY(file) f_list;/* list of active files */
-	short	f_flag;		/* see fcntl.h */
+	int	f_flag;		/* see fcntl.h */
 #define	DTYPE_VNODE	1	/* file */
 #define	DTYPE_SOCKET	2	/* communications endpoint */
 #define	DTYPE_PIPE	3	/* pipe */
Index: sys/sys/fcntl.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/fcntl.h,v
retrieving revision 1.10
diff -u -r1.10 fcntl.h
--- sys/sys/fcntl.h	2000/04/22 15:22:21	1.10
+++ sys/sys/fcntl.h	2000/04/25 19:33:55
@@ -98,15 +98,18 @@
 /* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */
 #define	O_NOCTTY	0x8000		/* don't assign controlling terminal */
 
+/* Bypass buffer cache */
+#define O_DIRECT	0x00010000
+
 #ifdef _KERNEL
 /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
 #define	FFLAGS(oflags)	((oflags) + 1)
 #define	OFLAGS(fflags)	((fflags) - 1)
 
 /* bits to save after open */
-#define	FMASK		(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK)
+#define	FMASK		(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT)
 /* bits settable by fcntl(F_SETFL, ...) */
-#define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM)
+#define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT)
 #endif
 
 /*
Index: sys/kern/vfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.116
diff -u -r1.116 vfs_vnops.c
--- sys/kern/vfs_vnops.c	2001/04/29 02:44:49	1.116
+++ sys/kern/vfs_vnops.c	2001/05/09 16:09:02
@@ -352,6 +360,10 @@
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
+#ifdef DIRECTIO
+	if (fp->f_flag & O_DIRECT)
+		ioflag |= IO_NOBUFFER;
+#endif
 	VOP_LEASE(vp, p, cred, LEASE_READ);
 	vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
 	if ((flags & FOF_OFFSET) == 0)
Index: sys/ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.77
diff -u -r1.77 ufs_readwrite.c
--- sys/ufs/ufs/ufs_readwrite.c	2001/05/01 08:34:45	1.77
+++ sys/ufs/ufs/ufs_readwrite.c	2001/05/09 16:09:33
@@ -42,6 +42,12 @@
 #define	WRITE			ffs_write
 #define	WRITE_S			"ffs_write"
 
+#ifdef DIRECTIO
+extern int allowrawread;	/* debug.allowrawread sysctl knob; defined in ufs_rawread.c */
+extern int ffs_rawread __P((struct vnode *vp,
+			    struct uio *uio));
+#endif
+
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
@@ -86,6 +92,14 @@
 	mode = ip->i_mode;
 	uio = ap->a_uio;
 	ioflag = ap->a_ioflag;
+#ifdef DIRECTIO
+	if ((ioflag & IO_NOBUFFER) != 0 && allowrawread != 0 &&
+	    uio->uio_iovcnt == 1 && 
+	    (uio->uio_offset & (DEV_BSIZE - 1)) == 0 &&
+	    (uio->uio_resid & (DEV_BSIZE - 1)) == 0 &&
+	    uio->uio_resid == uio->uio_iov->iov_len)
+		return ffs_rawread(vp, uio);
+#endif
 
 #ifdef DIAGNOSTIC
 	if (uio->uio_rw != UIO_READ)
@@ -251,7 +265,7 @@
 			 * doing sequential access.
 			 */
 			error = cluster_read(vp, ip->i_size, lbn,
-				size, NOCRED, uio->uio_resid, seqcount, &bp);
+				size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
 		else if (seqcount > 1) {
 			/*
 			 * If we are NOT allowed to cluster, then
--- /dev/null	Wed May 16 21:49:24 2001
+++ sys/ufs/ufs/ufs_rawread.c	Sun Nov 26 06:01:31 2000
@@ -0,0 +1,307 @@
+/*-
+ * Copyright (c) 2000 Tor Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD:$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+
+#include <machine/limits.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+static int ffs_rawread_readahead __P((struct vnode *vp,
+				      caddr_t udata,
+				      off_t offset,
+				      size_t len,
+				      struct proc *p,
+				      struct buf *bp,
+				      caddr_t sa));	/* issues one read; does not wait for it */
+int ffs_rawread __P((struct vnode *vp,	/* entry point; called from ufs_readwrite.c */
+		     struct uio *uio));
+
+static void ffs_rawreadwakeup __P((struct buf *bp));	/* installed as bp->b_iodone */
+
+
+static int rawbufcnt = 350;	/* pbuf limit handed to getpbuf()/trypbuf()/relpbuf() */
+SYSCTL_INT(_debug, OID_AUTO, rawbufcnt, CTLFLAG_RD, &rawbufcnt, 0, "");
+
+int allowrawread = 1;		/* int, as SYSCTL_INT and the extern in ufs_readwrite.c expect */
+SYSCTL_INT(_debug, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, "");
+
+static int rawreadahead = 1;	/* nonzero: ffs_rawread() may start a readahead buffer */
+SYSCTL_INT(_debug, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, "");
+
+static int
+ffs_rawread_readahead(vp, udata, offset, len, p, bp, sa)
+	struct vnode *vp;	/* file being read */
+	caddr_t udata;		/* user address the data is read into */
+	off_t offset;		/* byte offset in the file */
+	size_t len;		/* bytes wanted; the transfer may be shortened */
+	struct proc *p;		/* not referenced in this function */
+	struct buf *bp;		/* buffer that carries the I/O */
+	caddr_t sa;		/* recorded in bp->b_saveaddr */
+{
+	int error;
+	u_int iolen;
+	off_t blockno;
+	int blockoff;
+	int bsize;
+	struct vnode *dp;
+	int bforwards;
+	/* Fill in bp for one read into udata and hand it to VOP_STRATEGY. */
+	bsize = vp->v_mount->mnt_stat.f_iosize;
+	/* iolen is udata's offset within its page; it counts against b_kvasize. */
+	iolen = ((vm_offset_t) udata) & PAGE_MASK;
+	bp->b_bcount = len;
+	if (bp->b_bcount + iolen > bp->b_kvasize) {
+		bp->b_bcount = bp->b_kvasize;
+		if (iolen != 0)
+			bp->b_bcount -= PAGE_SIZE;
+	}
+	bp->b_flags = B_PHYS;
+	bp->b_iocmd = BIO_READ;
+	bp->b_iodone = ffs_rawreadwakeup;
+	bp->b_data = udata;
+	bp->b_saveaddr = sa;
+	bp->b_offset = offset;
+	blockno = bp->b_offset / bsize;
+	blockoff = (bp->b_offset % bsize) / DEV_BSIZE;
+	if ((daddr_t) blockno != blockno) {
+		return EINVAL; /* blockno overflow */
+	}
+	
+	bp->b_lblkno = bp->b_blkno = blockno;
+	if (!useracc(bp->b_data, bp->b_bcount, VM_PROT_WRITE)) {
+		return EFAULT;
+	}
+	/* VOP_BMAP supplies dp, bp->b_blkno and bforwards used below. */
+	error = VOP_BMAP(vp, bp->b_lblkno, &dp, &bp->b_blkno, &bforwards,
+			 NULL);
+	if (error != 0) {
+		return error;
+	}
+	/* NOTE(review): a hole (b_blkno of -1 from VOP_BMAP?) is not special-cased -- confirm. */
+	if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
+		bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;
+	bp->b_bufsize = bp->b_bcount;
+	bp->b_blkno += blockoff;
+	bp->b_dev = dp->v_rdev;
+	
+	vmapbuf(bp);
+	/* No wait here: ffs_rawreadwakeup() is the b_iodone hook and callers sleep on bp. */
+	(void) VOP_STRATEGY(dp, bp);
+	return 0;
+}
+
+int
+ffs_rawread(vp, uio)
+	struct vnode *vp;
+	struct uio *uio;
+{
+	int error, nerror;
+	struct buf *bp, *nbp, *tbp;
+	caddr_t sa, nsa, tsa;
+	u_int iolen;
+	int spl;
+	caddr_t udata;
+	long resid;
+	off_t offset;
+	struct proc *p;
+	/*
+	 * Read into the first iovec's buffer through pbufs with at most one
+	 * readahead in flight; the uio is advanced past the bytes read.
+	 */
+	udata = uio->uio_iov->iov_base;
+	resid = uio->uio_resid;
+	offset = uio->uio_offset;
+	p = uio->uio_procp ? uio->uio_procp : curproc;
+
+	if ((offset % DEV_BSIZE) != 0 || (resid % DEV_BSIZE) != 0)
+		return EINVAL;
+	
+	/*
+	 * keep the process from being swapped
+	 */
+	PHOLD(p);
+	
+	error = 0;
+	nerror = 0;
+	bp = NULL;
+	nbp = NULL;
+	sa = NULL;
+	nsa = NULL;
+	
+	while (resid > 0) {
+		if (bp == NULL) { /* Setup first read */
+			/* XXX: Leave some bufs for swap */
+			bp = getpbuf(&rawbufcnt);
+			sa = bp->b_data;
+			bp->b_vp = vp;
+			bp->b_error = 0;
+			error = ffs_rawread_readahead(vp, udata, offset,
+						     resid, p, bp, sa);
+			if (error != 0)
+				break;
+			if (resid > bp->b_bufsize) { /* Setup first readahead */
+				/* XXX: Leave bufs for swap */
+				if (rawreadahead != 0)
+					nbp = trypbuf(&rawbufcnt);
+				else
+					nbp = NULL;
+				if (nbp != NULL) {
+					nsa = nbp->b_data;
+					nbp->b_vp = vp;
+					nbp->b_error = 0;
+					nerror = ffs_rawread_readahead(vp,
+								       udata +
+								       bp->b_bufsize,
+								       offset +
+								       bp->b_bufsize,
+								       resid -
+								       bp->b_bufsize,
+								       p,
+								       nbp,
+								       nsa);
+					if (nerror) {
+						relpbuf(nbp, &rawbufcnt);
+						nbp = NULL;
+					}
+				}
+			}
+		}
+		
+		spl = splbio();
+		while ((bp->b_flags & B_DONE) == 0) {
+			tsleep((caddr_t)bp, PRIBIO, "rawrd", 0);
+		}
+		splx(spl);
+		vunmapbuf(bp);
+		iolen = bp->b_bcount - bp->b_resid;
+		if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
+			nerror = 0;	/* Ignore possible beyond EOF error */
+			break; /* EOF */
+		}
+		
+		if ((bp->b_ioflags & BIO_ERROR) != 0) {
+			error = bp->b_error;
+			break;
+		}
+		resid -= iolen;
+		udata += iolen;
+		offset += iolen;
+		if (iolen < bp->b_bufsize) {
+			/* Incomplete read.  Try to read remaining part */
+			error = ffs_rawread_readahead(vp,
+						      udata,
+						      offset,
+						      bp->b_bufsize - iolen,
+						      p,
+						      bp,
+						      sa);
+			if (error)
+				break;
+		} else if (nbp != NULL) { /* Complete read with readahead */
+			tbp = bp;
+			bp = nbp;
+			nbp = tbp;
+			
+			tsa = sa;
+			sa = nsa;
+			nsa = tsa;
+			
+			if (resid <= bp->b_bufsize) { /* No more readaheads */
+				relpbuf(nbp, &rawbufcnt);
+				nbp = NULL;
+			} else { /* Setup next readahead */
+				nerror = ffs_rawread_readahead(vp,
+							       udata +
+							       bp->b_bufsize,
+							       offset +
+							       bp->b_bufsize,
+							       resid -
+							       bp->b_bufsize,
+							       p,
+							       nbp,
+							       nsa);
+				if (nerror != 0) {
+					relpbuf(nbp, &rawbufcnt);
+					nbp = NULL;
+				}
+			}
+		} else if (nerror != 0) {/* Deferred Readahead error */
+			break;
+		}  else if (resid > 0) { /* More to read, no readahead */
+			error = ffs_rawread_readahead(vp, udata, offset,
+						      resid, p, bp, sa);
+			if (error != 0)
+				break;
+		}
+	}
+	
+	if (bp != NULL)
+		relpbuf(bp, &rawbufcnt);
+	if (nbp != NULL) {			/* Run down readahead buffer */
+		spl = splbio();
+		while ((nbp->b_flags & B_DONE) == 0) {
+			tsleep((caddr_t)nbp, PRIBIO, "rawrd", 0);
+		}
+		splx(spl);
+		vunmapbuf(nbp);
+		relpbuf(nbp, &rawbufcnt);
+	}
+	
+	if (error == 0)
+		error = nerror;
+	PRELE(p);
+	/* Advance the uio like uiomove(), so the caller sees the new offset. */
+	uio->uio_iov->iov_base = udata;
+	uio->uio_iov->iov_len -= uio->uio_resid - resid;
+	uio->uio_offset = offset;
+	uio->uio_resid = resid;
+	return error;
+}
+
+/* b_iodone hook: wake whoever is sleeping on bp in ffs_rawread(). */
+static void
+ffs_rawreadwakeup(struct buf *bp)
+{
+	wakeup((caddr_t)bp);
+}
+
Index: sys/conf/options
===================================================================
RCS file: /home/ncvs/src/sys/conf/options,v
retrieving revision 1.271
diff -u -r1.271 options
--- sys/conf/options	2001/05/13 20:52:36	1.271
+++ sys/conf/options	2001/05/16 17:36:04
@@ -378,6 +380,7 @@
 REGRESSION		opt_global.h
 SIMPLELOCK_DEBUG	opt_global.h
 VFS_BIO_DEBUG		opt_global.h
+DIRECTIO		opt_global.h
 
 # These are VM related options
 VM_KMEM_SIZE		opt_vm.h



- Tor Egge

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200105162018.WAA99982>