Date: Wed, 12 Oct 2011 21:29:13 GMT From: John Baldwin <jhb@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 200123 for review Message-ID: <201110122129.p9CLTDqE071590@skunkworks.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://p4web.freebsd.org/@@200123?ac=10

Change 200123 by jhb@jhb_jhbbsd on 2011/10/12 21:28:22

	Checkpoint WIP for fadvise(2).  Next I need to add the logic in the vnode f_ops to actually act on the hints during an I/O.  Was hoping to leverage vm_object_madvise(MADV_DONTNEED) to handle FADV_NOREUSE, but that doesn't actually do what I need (it still leaves the pages on the inactive queue, and I really want them in the cache queue to avoid the degenerate splay tree case in pagedaemon; in fact, what I'd really like is to not have pagedaemon run at _all_).  Perhaps we can add a new internal variant of MADV_DONTNEED that forces pages into cache instead of the inactive queue.

Affected files ...

.. //depot/projects/fadvise/sys/compat/freebsd32/freebsd32_misc.c#2 edit
.. //depot/projects/fadvise/sys/compat/freebsd32/syscalls.master#2 edit
.. //depot/projects/fadvise/sys/kern/kern_descrip.c#2 edit
.. //depot/projects/fadvise/sys/kern/syscalls.master#2 edit
.. //depot/projects/fadvise/sys/kern/vfs_syscalls.c#2 edit
.. //depot/projects/fadvise/sys/sys/fcntl.h#2 edit
.. //depot/projects/fadvise/sys/sys/file.h#2 edit

Differences ...
==== //depot/projects/fadvise/sys/compat/freebsd32/freebsd32_misc.c#2 (text+ko) ====

@@ -2815,3 +2815,15 @@
 	ap.len = (uap->lenlo | ((off_t)uap->lenhi << 32));
 	return (sys_posix_fallocate(td, &ap));
 }
+
+int
+freebsd32_fadvise(struct thread *td, struct freebsd32_fadvise_args *uap)
+{
+	struct fadvise_args ap;
+
+	ap.fd = uap->fd;
+	ap.offset = (uap->offsetlo | ((off_t)uap->offsethi << 32));
+	ap.len = (uap->lenlo | ((off_t)uap->lenhi << 32));
+	ap.advice = uap->advice;
+	return (sys_fadvise(td, &ap));
+}

==== //depot/projects/fadvise/sys/compat/freebsd32/syscalls.master#2 (text+ko) ====

@@ -989,6 +989,9 @@
 				    size_t inbuflen, void *outbufp, \
 				    size_t outbuflen); }
 530	AUE_NULL	STD	{ int freebsd32_posix_fallocate(int fd,\
-				    uint32_t offsetlo, uint32_t offsethi,\
-				    uint32_t lenlo, uint32_t lenhi); }
-531	AUE_NULL	UNIMPL	posix_fadvise
+				    uint32_t offsetlo, uint32_t offsethi,\
+				    uint32_t lenlo, uint32_t lenhi); }
+531	AUE_NULL	STD	{ int freebsd32_fadvise(int fd, \
+				    uint32_t offsetlo, uint32_t offsethi,\
+				    uint32_t lenlo, uint32_t lenhi, \
+				    int advice); }

==== //depot/projects/fadvise/sys/kern/kern_descrip.c#2 (text+ko) ====

@@ -1654,6 +1654,7 @@
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fp->f_vnode = NULL;
+	fp->f_advice = FADV_NORMAL;
 	*resultfp = fp;
 	return (0);
 }

==== //depot/projects/fadvise/sys/kern/syscalls.master#2 (text+ko) ====

@@ -947,6 +947,7 @@
 				    size_t outbuflen); }
 530	AUE_NULL	STD	{ int posix_fallocate(int fd, \
 				    off_t offset, off_t len); }
-531	AUE_NULL	UNIMPL	posix_fadvise
+531	AUE_NULL	STD	{ int fadvise(int fd, off_t offset, \
+				    off_t len, int advice); }
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master

==== //depot/projects/fadvise/sys/kern/vfs_syscalls.c#2 (text+ko) ====

@@ -4845,3 +4845,141 @@
 
 	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
 }
+
+/*
+ * Unlike madvise(2), we do not make a best effort to remember every
+ * possible caching hint.  Instead, we remember the last setting with
+ * the exception that we will allow FADV_NORMAL to adjust the region
+ * of any current setting.  A region length of zero means the region
+ * extends to the end of the file.
+ *
+ * Returns EINVAL for a negative or wrapping range or an unknown
+ * advice value, ESPIPE for pipes and FIFOs, and ENODEV for any other
+ * descriptor that is not a regular file.
+ */
+int
+sys_fadvise(struct thread *td, struct fadvise_args *uap)
+{
+	struct file *fp;
+	struct vnode *vp;
+	off_t newoff, start, end;
+	int error, vfslocked;
+
+	if (uap->offset < 0 || uap->len < 0 ||
+	    uap->offset + uap->len < uap->offset)
+		return (EINVAL);
+	switch (uap->advice) {
+	case FADV_NORMAL:
+	case FADV_SEQUENTIAL:
+	case FADV_RANDOM:
+	case FADV_WILLNEED:
+	case FADV_DONTNEED:
+	case FADV_NOREUSE:
+		break;
+	default:
+		return (EINVAL);
+	}
+	/* XXX: CAP_FADVISE? */
+	error = fget(td, uap->fd, 0, &fp);
+	if (error != 0)
+		return (error);
+
+	switch (fp->f_type) {
+	case DTYPE_VNODE:
+		break;
+	case DTYPE_PIPE:
+	case DTYPE_FIFO:
+		error = ESPIPE;
+		goto out;
+	default:
+		error = ENODEV;
+		goto out;
+	}
+	vp = fp->f_vnode;
+	if (vp->v_type != VREG) {
+		error = ENODEV;
+		goto out;
+	}
+	switch (uap->advice) {
+	case FADV_SEQUENTIAL:
+	case FADV_RANDOM:
+	case FADV_NOREUSE:
+		mtx_pool_lock(mtxpool_sleep, fp);
+		fp->f_advice = uap->advice;
+		fp->f_adviceoff = uap->offset;
+		fp->f_advicelen = uap->len;
+		mtx_pool_unlock(mtxpool_sleep, fp);
+		break;
+	case FADV_NORMAL:
+		/*
+		 * If the "normal" region overlaps with an existing
+		 * non-standard region, trim or remove the
+		 * non-standard region.
+		 */
+		mtx_pool_lock(mtxpool_sleep, fp);
+		if (fp->f_advice != FADV_NORMAL) {
+			if (uap->len == 0 && fp->f_advicelen == 0) {
+				if (uap->offset > fp->f_adviceoff)
+					fp->f_advicelen =
+					    uap->offset - fp->f_adviceoff;
+				else
+					fp->f_advice = FADV_NORMAL;
+			} else if (uap->len == 0) {
+				if (uap->offset <= fp->f_adviceoff)
+					fp->f_advice = FADV_NORMAL;
+				else if (fp->f_adviceoff + fp->f_advicelen >
+				    uap->offset)
+					fp->f_advicelen =
+					    uap->offset - fp->f_adviceoff;
+			} else if (fp->f_advicelen == 0) {
+				if (uap->offset + uap->len > fp->f_adviceoff)
+					fp->f_adviceoff =
+					    uap->offset + uap->len;
+			} else if (fp->f_adviceoff < uap->offset + uap->len &&
+			    fp->f_adviceoff + fp->f_advicelen > uap->offset) {
+				/*
+				 * Both regions are bounded and overlap:
+				 * keep the head of the hinted region,
+				 * drop it if it is entirely covered, or
+				 * keep its tail.
+				 */
+				if (uap->offset > fp->f_adviceoff)
+					fp->f_advicelen =
+					    uap->offset - fp->f_adviceoff;
+				else if (uap->offset + uap->len >=
+				    fp->f_adviceoff + fp->f_advicelen)
+					fp->f_advice = FADV_NORMAL;
+				else {
+					newoff = uap->offset + uap->len;
+					fp->f_advicelen -=
+					    (newoff - fp->f_adviceoff);
+					fp->f_adviceoff = newoff;
+				}
+			}
+		}
+		mtx_pool_unlock(mtxpool_sleep, fp);
+		break;
+	case FADV_WILLNEED:
+	case FADV_DONTNEED:
+		/*
+		 * Apply the request to the backing VM object.  Note
+		 * that the FADV_* constants map directly to the same
+		 * madvise(2) constants.  The end offset is exclusive
+		 * and rounded up so that a partially covered last
+		 * page is included.
+		 */
+		start = trunc_page(uap->offset);
+		end = round_page(uap->offset + uap->len);
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		vn_lock(vp, LK_SHARED | LK_RETRY);
+		if (vp->v_object != NULL)
+			vm_object_madvise(vp->v_object, OFF_TO_IDX(start),
+			    atop(end - start), uap->advice);
+		VOP_UNLOCK(vp, 0);
+		VFS_UNLOCK_GIANT(vfslocked);
+		break;
+	}
+out:
+	fdrop(fp, td);
+	return (error);
+}

==== //depot/projects/fadvise/sys/sys/fcntl.h#2 (text+ko) ====

@@ -278,6 +278,34 @@
 #endif
 
 /*
+ * Advice to fadvise
+ */
+#define	_FADV_NORMAL		0	/* no special treatment */
+#define	_FADV_RANDOM		1	/* expect random page references */
+#define	_FADV_SEQUENTIAL	2	/* expect sequential page references */
+#define	_FADV_WILLNEED		3	/* will need these pages */
+#define	_FADV_DONTNEED		4	/* don't need these pages */
+#define	_FADV_NOREUSE		5	/* access data only once */
+
+#if __BSD_VISIBLE
+#define	FADV_NORMAL		_FADV_NORMAL
+#define	FADV_RANDOM		_FADV_RANDOM
+#define	FADV_SEQUENTIAL		_FADV_SEQUENTIAL
+#define	FADV_WILLNEED		_FADV_WILLNEED
+#define	FADV_DONTNEED		_FADV_DONTNEED
+#define	FADV_NOREUSE		_FADV_NOREUSE
+#endif
+
+#if __POSIX_VISIBLE >= 200112
+#define	POSIX_FADV_NORMAL	_FADV_NORMAL
+#define	POSIX_FADV_RANDOM	_FADV_RANDOM
+#define	POSIX_FADV_SEQUENTIAL	_FADV_SEQUENTIAL
+#define	POSIX_FADV_WILLNEED	_FADV_WILLNEED
+#define	POSIX_FADV_DONTNEED	_FADV_DONTNEED
+#define	POSIX_FADV_NOREUSE	_FADV_NOREUSE
+#endif
+
+/*
  * XXX missing posix_fadvise() and POSIX_FADV_* macros.
  */
 
@@ -289,6 +317,12 @@
 #if __BSD_VISIBLE || __POSIX_VISIBLE >= 200809
 int	openat(int, const char *, int, ...);
 #endif
+#if __BSD_VISIBLE
+int	fadvise(int, off_t, off_t, int);
+#endif
+#if __POSIX_VISIBLE >= 200112
+int	posix_fadvise(int, off_t, off_t, int);
+#endif
 #if __BSD_VISIBLE || __POSIX_VISIBLE >= 200112
 int	posix_fallocate(int, off_t, off_t);
 #endif

==== //depot/projects/fadvise/sys/sys/file.h#2 (text+ko) ====

@@ -137,6 +137,9 @@
 	int		f_seqcount;	/* Count of sequential accesses. */
 	off_t		f_nextoff;	/* next expected read/write offset. */
 	struct cdev_privdata *f_cdevpriv; /* (d) Private data for the cdev. */
+	int		f_advice;	/* (f) FADV_* type. */
+	off_t		f_adviceoff;	/* (f) fadvice region offset. */
+	off_t		f_advicelen;	/* (f) fadvice region length. */
 	/*
 	 * DFLAG_SEEKABLE specific fields
 	 */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201110122129.p9CLTDqE071590>