From owner-freebsd-arch Wed May 16 23:48:38 2001 Delivered-To: freebsd-arch@freebsd.org Received: from earth.backplane.com (earth-nat-cw.backplane.com [208.161.114.67]) by hub.freebsd.org (Postfix) with ESMTP id 33F0037B423 for ; Wed, 16 May 2001 23:48:24 -0700 (PDT) (envelope-from dillon@earth.backplane.com) Received: (from dillon@localhost) by earth.backplane.com (8.11.3/8.11.2) id f4H6lkk88458; Wed, 16 May 2001 23:47:46 -0700 (PDT) (envelope-from dillon) Date: Wed, 16 May 2001 23:47:46 -0700 (PDT) From: Matt Dillon Message-Id: <200105170647.f4H6lkk88458@earth.backplane.com> To: Tor.Egge@fast.no Cc: arch@FreeBSD.ORG Subject: Preliminary O_DIRECT patch (for review only, not yet tested!) References: <200105162222.f4GMMpC81247@earth.backplane.com> <200105162331.BAA04708@midten.fast.no> Sender: owner-freebsd-arch@FreeBSD.ORG Precedence: bulk X-Loop: FreeBSD.ORG This is my preliminary O_DIRECT patch so far, against -stable at the moment (obviously it will be committed to -current first, but I have to test it on -stable). It seems to work for reads. It doesn't work for writes yet (the buffers still get cached). Basically it takes Tor's infrastructure with some minor modifications, removes the rawread/rawwrite stuff, and then adds a B_DIRECT flag to the buffer cache. write()s are converted to synchronous writes, and both read()s and write()s attempt to completely free the underlying VM pages, and the buffer is released. I need to figure out how to free underlying buffers/VM for write() operations before I can commit any of this. It could be a while. -- I've looked at the rawread/rawwrite issue and I believe it may be possible to use the already-existing B_VMIO flag coupled with some VM magic to achieve the equivalent in the buffer cache itself rather than having to write a rawread/rawwrite function for each filesystem. Filesystems already support B_VMIO. If it is possible, then we'll have a general raw I/O solution. 
-Matt Index: kern/vfs_bio.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v retrieving revision 1.242.2.7 diff -u -r1.242.2.7 vfs_bio.c --- kern/vfs_bio.c 2001/03/02 16:45:12 1.242.2.7 +++ kern/vfs_bio.c 2001/05/17 04:21:37 @@ -1230,7 +1230,7 @@ /* unlock */ BUF_UNLOCK(bp); - bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); splx(s); } @@ -1296,7 +1296,7 @@ /* unlock */ BUF_UNLOCK(bp); - bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); splx(s); } @@ -1328,12 +1328,15 @@ vm_page_flag_clear(m, PG_ZERO); /* * Might as well free the page if we can and it has - * no valid data. + * no valid data. We also free the page if the + * buffer was used for direct I/O */ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); + } else if (bp->b_flags & B_DIRECT) { + vm_page_try_to_free(m); } else if (vm_page_count_severe()) { vm_page_try_to_cache(m); } @@ -2187,7 +2190,7 @@ } splx(s); - bp->b_flags &= ~B_DONE; + bp->b_flags &= ~(B_DONE | B_DIRECT); } else { /* * Buffer is not in-core, create new buffer. 
The buffer @@ -2267,7 +2270,7 @@ allocbuf(bp, size); splx(s); - bp->b_flags &= ~B_DONE; + bp->b_flags &= ~(B_DONE | B_DIRECT); } return (bp); } Index: kern/vfs_vnops.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_vnops.c,v retrieving revision 1.87.2.6 diff -u -r1.87.2.6 vfs_vnops.c --- kern/vfs_vnops.c 2001/02/26 04:23:16 1.87.2.6 +++ kern/vfs_vnops.c 2001/05/17 05:17:55 @@ -334,6 +334,8 @@ ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; VOP_LEASE(vp, p, cred, LEASE_READ); vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p); if ((flags & FOF_OFFSET) == 0) @@ -374,6 +376,8 @@ ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; Index: sys/buf.h =================================================================== RCS file: /home/ncvs/src/sys/sys/buf.h,v retrieving revision 1.88.2.3 diff -u -r1.88.2.3 buf.h --- sys/buf.h 2000/12/30 01:51:10 1.88.2.3 +++ sys/buf.h 2001/05/17 04:18:35 @@ -191,12 +191,14 @@ * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * + * B_DIRECT Hint (along with B_RELBUF) that we should attempt to + * completely free the pages underlying the buffer. */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ -#define B_UNUSED0 0x00000008 /* Old B_BAD */ +#define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. 
*/ @@ -231,7 +233,7 @@ "\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \ "\25read\24raw\23phys\22clusterok\21malloc\20nocache" \ "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \ - "\10delwri\7call\6cache\4bad\3async\2needcommit\1age" + "\10delwri\7call\6cache\4direct\3async\2needcommit\1age" /* * These flags are kept in b_xflags. Index: sys/fcntl.h =================================================================== RCS file: /home/ncvs/src/sys/sys/fcntl.h,v retrieving revision 1.9.2.1 diff -u -r1.9.2.1 fcntl.h --- sys/fcntl.h 2000/08/22 01:46:30 1.9.2.1 +++ sys/fcntl.h 2001/05/17 04:01:47 @@ -98,15 +98,18 @@ /* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */ #define O_NOCTTY 0x8000 /* don't assign controlling terminal */ +/* Attempt to bypass buffer cache */ +#define O_DIRECT 0x00010000 + #ifdef _KERNEL /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ #define FFLAGS(oflags) ((oflags) + 1) #define OFLAGS(fflags) ((fflags) - 1) /* bits to save after open */ -#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK) +#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT) /* bits settable by fcntl(F_SETFL, ...) 
*/ -#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM) +#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT) #endif /* Index: sys/file.h =================================================================== RCS file: /home/ncvs/src/sys/sys/file.h,v retrieving revision 1.22.2.5 diff -u -r1.22.2.5 file.h --- sys/file.h 2001/02/26 04:23:21 1.22.2.5 +++ sys/file.h 2001/05/17 04:34:53 @@ -56,15 +56,14 @@ */ struct file { LIST_ENTRY(file) f_list;/* list of active files */ - short f_flag; /* see fcntl.h */ + short f_FILLER3; /* (old f_flag) */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ short f_type; /* descriptor type */ - short f_FILLER1; /* (OLD) reference count */ - short f_FILLER2; /* (OLD) references from message queue */ + u_int f_flag; /* see fcntl.h */ struct ucred *f_cred; /* credentials associated with descriptor */ struct fileops { int (*fo_read) __P((struct file *fp, struct uio *uio, Index: sys/vnode.h =================================================================== RCS file: /home/ncvs/src/sys/sys/vnode.h,v retrieving revision 1.111.2.4 diff -u -r1.111.2.4 vnode.h --- sys/vnode.h 2000/12/30 01:51:10 1.111.2.4 +++ sys/vnode.h 2001/05/17 04:49:14 @@ -213,6 +213,7 @@ #define IO_VMIO 0x20 /* data already in VMIO space */ #define IO_INVAL 0x40 /* invalidate after I/O */ #define IO_ASYNC 0x80 /* bawrite rather then bdwrite */ +#define IO_DIRECT 0x100 /* attempt to bypass buffer cache */ /* * Modes. Some values same as Ixxx entries from inode.h for now. 
Index: ufs/ufs/ufs_readwrite.c =================================================================== RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v retrieving revision 1.65.2.6 diff -u -r1.65.2.6 ufs_readwrite.c --- ufs/ufs/ufs_readwrite.c 2000/12/30 01:51:11 1.65.2.6 +++ ufs/ufs/ufs_readwrite.c 2001/05/17 06:26:16 @@ -278,6 +278,15 @@ } /* + * If IO_DIRECT then set B_DIRECT for the buffer. This + * will cause us to attempt to release the buffer later on + * and will cause the buffer cache to attempt to free the + * underlying pages. + */ + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; + + /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, @@ -319,12 +328,12 @@ if (error) break; - if ((ioflag & IO_VMIO) && - (LIST_FIRST(&bp->b_dep) == NULL)) { + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { /* - * If there are no dependencies, and - * it's VMIO, then we don't need the buf, - * mark it available for freeing. The VM has the data. + * If there are no dependencies, and it's VMIO, + * then we don't need the buf, mark it available + * for freeing. The VM has the data. 
*/ bp->b_flags |= B_RELBUF; brelse(bp); @@ -346,8 +355,8 @@ * so it must have come from a 'break' statement */ if (bp != NULL) { - if ((ioflag & IO_VMIO) && - (LIST_FIRST(&bp->b_dep) == NULL)) { + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { @@ -449,7 +458,7 @@ resid = uio->uio_resid; osize = ip->i_size; flags = 0; - if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) + if ((ioflag & (IO_SYNC|IO_DIRECT)) && !DOINGASYNC(vp)) flags = B_SYNC; if (object && (object->flags & OBJ_OPT)) { @@ -486,6 +495,8 @@ ap->a_cred, flags, &bp); if (error != 0) break; + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; @@ -498,11 +509,12 @@ error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); - if ((ioflag & IO_VMIO) && - (LIST_FIRST(&bp->b_dep) == NULL)) + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; + } - if (ioflag & IO_SYNC) { + if (ioflag & (IO_SYNC|IO_DIRECT)) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || Index: vm/vm_page.c =================================================================== RCS file: /home/ncvs/src/sys/vm/vm_page.c,v retrieving revision 1.147.2.6 diff -u -r1.147.2.6 vm_page.c --- vm/vm_page.c 2001/03/03 23:06:09 1.147.2.6 +++ vm/vm_page.c 2001/05/17 04:22:38 @@ -1353,6 +1353,31 @@ } /* + * vm_page_try_to_free() + * + * Attempt to free the page. If we cannot free it, we do nothing. + * 1 is returned on success, 0 on failure. 
+ */ + +int +vm_page_try_to_free(m) + vm_page_t m; +{ + if (m->dirty || m->hold_count || m->busy || m->wire_count || + (m->flags & (PG_BUSY|PG_UNMANAGED))) { + return(0); + } + vm_page_test_dirty(m); + if (m->dirty) + return(0); + vm_page_busy(m); + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + return(1); +} + + +/* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). Index: vm/vm_page.h =================================================================== RCS file: /home/ncvs/src/sys/vm/vm_page.h,v retrieving revision 1.75.2.5 diff -u -r1.75.2.5 vm_page.h --- vm/vm_page.h 2000/12/30 01:51:11 1.75.2.5 +++ vm/vm_page.h 2001/05/17 04:23:05 @@ -406,6 +406,7 @@ vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int)); void vm_page_cache __P((register vm_page_t)); int vm_page_try_to_cache __P((vm_page_t)); +int vm_page_try_to_free __P((vm_page_t)); void vm_page_dontneed __P((register vm_page_t)); static __inline void vm_page_copy __P((vm_page_t, vm_page_t)); static __inline void vm_page_free __P((vm_page_t)); To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-arch" in the body of the message