Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 12 Oct 2011 21:29:13 GMT
From:      John Baldwin <jhb@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 200123 for review
Message-ID:  <201110122129.p9CLTDqE071590@skunkworks.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://p4web.freebsd.org/@@200123?ac=10

Change 200123 by jhb@jhb_jhbbsd on 2011/10/12 21:28:22

	Checkpoint WIP for fadvise(2).  Next I need to add the logic in
	the vnode f_ops to actually act on the hints during an I/O.  Was
	hoping to leverage vm_object_madvise(MADV_DONTNEED) to handle
	FADV_NOREUSE but that doesn't actually do what I need (it still
	leaves the pages on the inactive queue, and I really want them
	in the cache queue to avoid the degenerate splay tree case in
	pagedaemon; in fact, what I'd really like is to not have pagedaemon
	run at _all_).  Perhaps we can add a new internal variant
	of MADV_DONTNEED that forces pages into cache instead of the
	inactive queue.

Affected files ...

.. //depot/projects/fadvise/sys/compat/freebsd32/freebsd32_misc.c#2 edit
.. //depot/projects/fadvise/sys/compat/freebsd32/syscalls.master#2 edit
.. //depot/projects/fadvise/sys/kern/kern_descrip.c#2 edit
.. //depot/projects/fadvise/sys/kern/syscalls.master#2 edit
.. //depot/projects/fadvise/sys/kern/vfs_syscalls.c#2 edit
.. //depot/projects/fadvise/sys/sys/fcntl.h#2 edit
.. //depot/projects/fadvise/sys/sys/file.h#2 edit

Differences ...

==== //depot/projects/fadvise/sys/compat/freebsd32/freebsd32_misc.c#2 (text+ko) ====

@@ -2815,3 +2815,15 @@
 	ap.len = (uap->lenlo | ((off_t)uap->lenhi << 32));
 	return (sys_posix_fallocate(td, &ap));
 }
+
+int
+freebsd32_fadvise(struct thread *td, struct freebsd32_fadvise_args *uap)
+{
+	struct fadvise_args native;	/* lo/hi halves rejoined as off_t */
+
+	native.advice = uap->advice;
+	native.fd = uap->fd;
+	native.len = ((off_t)uap->lenhi << 32) | uap->lenlo;
+	native.offset = ((off_t)uap->offsethi << 32) | uap->offsetlo;
+	return (sys_fadvise(td, &native));
+}

==== //depot/projects/fadvise/sys/compat/freebsd32/syscalls.master#2 (text+ko) ====

@@ -989,6 +989,9 @@
 				    size_t inbuflen, void *outbufp, \
 				    size_t outbuflen); }
 530	AUE_NULL	STD	{ int freebsd32_posix_fallocate(int fd,\
-				     uint32_t offsetlo, uint32_t offsethi,\
-				     uint32_t lenlo, uint32_t lenhi); }
-531	AUE_NULL	UNIMPL	posix_fadvise
+				    uint32_t offsetlo, uint32_t offsethi,\
+				    uint32_t lenlo, uint32_t lenhi); }
+531	AUE_NULL	STD	{ int freebsd32_fadvise(int fd, \
+				    uint32_t offsetlo, uint32_t offsethi,\
+				    uint32_t lenlo, uint32_t lenhi, \
+				    int advice); }

==== //depot/projects/fadvise/sys/kern/kern_descrip.c#2 (text+ko) ====

@@ -1654,6 +1654,7 @@
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fp->f_vnode = NULL;
+	fp->f_advice = FADV_NORMAL;
 	*resultfp = fp;
 	return (0);
 }

==== //depot/projects/fadvise/sys/kern/syscalls.master#2 (text+ko) ====

@@ -947,6 +947,7 @@
 				    size_t outbuflen); }
 530	AUE_NULL	STD	{ int posix_fallocate(int fd, \
 				    off_t offset, off_t len); }
-531	AUE_NULL	UNIMPL	posix_fadvise
+531	AUE_NULL	STD	{ int fadvise(int fd, off_t offset, \
+				    off_t len, int advice); }
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master

==== //depot/projects/fadvise/sys/kern/vfs_syscalls.c#2 (text+ko) ====

@@ -4845,3 +4845,124 @@
 
 	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
 }
+
+/*
+ * Unlike madvise(2), we do not make a best effort to remember every
+ * possible caching hint.  Instead, we remember the last setting with
+ * the exception that we will allow FADV_NORMAL to adjust the region
+ * of any current setting.  A zero length means "no upper bound".
+ *
+ * Returns 0, EINVAL for a bad range or advice, ESPIPE for pipes and
+ * FIFOs, ENODEV for anything else that is not a regular file, or the
+ * error from fget().
+ */
+int
+sys_fadvise(struct thread *td, struct fadvise_args *uap)
+{
+	struct file *fp;
+	struct vnode *vp;
+	off_t aend, astart, end, nend, start;
+	int aopen, error, nopen, vfslocked;
+
+	/* NOTE(review): the wrap test relies on signed overflow wrapping. */
+	if (uap->offset < 0 || uap->len < 0 ||
+	    uap->offset + uap->len < uap->offset)
+		return (EINVAL);
+	switch (uap->advice) {
+	case FADV_NORMAL:
+	case FADV_SEQUENTIAL:
+	case FADV_RANDOM:
+	case FADV_WILLNEED:
+	case FADV_DONTNEED:
+	case FADV_NOREUSE:
+		break;
+	default:
+		return (EINVAL);
+	}
+	/* XXX: CAP_FADVISE? */
+	error = fget(td, uap->fd, 0, &fp);
+	if (error != 0)
+		return (error);
+
+	switch (fp->f_type) {
+	case DTYPE_VNODE:
+		break;
+	case DTYPE_PIPE:
+	case DTYPE_FIFO:
+		error = ESPIPE;
+		goto out;
+	default:
+		error = ENODEV;
+		goto out;
+	}
+	vp = fp->f_vnode;
+	if (vp->v_type != VREG) {
+		error = ENODEV;
+		goto out;
+	}
+	switch (uap->advice) {
+	case FADV_SEQUENTIAL:
+	case FADV_RANDOM:
+	case FADV_NOREUSE:
+		mtx_pool_lock(mtxpool_sleep, fp);
+		fp->f_advice = uap->advice;
+		fp->f_adviceoff = uap->offset;
+		fp->f_advicelen = uap->len;
+		mtx_pool_unlock(mtxpool_sleep, fp);
+		break;
+	case FADV_NORMAL:
+		/*
+		 * If the "normal" region overlaps with an existing
+		 * non-standard region, trim or remove the
+		 * non-standard region.  An overlap that begins inside
+		 * the region keeps only the part in front of it;
+		 * otherwise the region's start moves past the overlap
+		 * or, if nothing would remain, the hint is dropped.
+		 */
+		mtx_pool_lock(mtxpool_sleep, fp);
+		if (fp->f_advice != FADV_NORMAL) {
+			astart = fp->f_adviceoff;
+			aend = astart + fp->f_advicelen;
+			aopen = (fp->f_advicelen == 0);
+			nend = uap->offset + uap->len;
+			nopen = (uap->len == 0);
+			if ((!nopen && nend <= astart) ||
+			    (!aopen && uap->offset >= aend)) {
+				/* No overlap; leave the hint alone. */
+			} else if (uap->offset > astart)
+				fp->f_advicelen = uap->offset - astart;
+			else if (nopen || (!aopen && nend >= aend))
+				fp->f_advice = FADV_NORMAL;
+			else {
+				/* Advance the front; stay open-ended. */
+				fp->f_adviceoff = nend;
+				if (!aopen)
+					fp->f_advicelen = aend - nend;
+			}
+		}
+		mtx_pool_unlock(mtxpool_sleep, fp);
+		break;
+	case FADV_WILLNEED:
+	case FADV_DONTNEED:
+		/*
+		 * Apply the request to the backing VM object.  Note
+		 * that the FADV_* constants map directly to the same
+		 * madvise(2) constants.  The end is exclusive so that
+		 * the page holding the last byte is included.
+		 * XXX: a zero length (to end of file) is not handled.
+		 */
+		start = trunc_page(uap->offset);
+		end = round_page(uap->offset + uap->len);
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		vn_lock(vp, LK_SHARED | LK_RETRY);
+		if (vp->v_object != NULL)
+			vm_object_madvise(vp->v_object, OFF_TO_IDX(start),
+			    atop(end - start), uap->advice);
+		VOP_UNLOCK(vp, 0);
+		VFS_UNLOCK_GIANT(vfslocked);
+		break;
+	}
+out:
+	fdrop(fp, td);
+	return (error);
+}

==== //depot/projects/fadvise/sys/sys/fcntl.h#2 (text+ko) ====

@@ -278,6 +278,34 @@
 #endif
 
 /*
+ * Advice values for fadvise() and posix_fadvise().
+ */
+#define	_FADV_NORMAL	0	/* no special treatment */
+#define	_FADV_RANDOM	1	/* expect random page references */
+#define	_FADV_SEQUENTIAL 2	/* expect sequential page references */
+#define	_FADV_WILLNEED	3	/* will need these pages */
+#define	_FADV_DONTNEED	4	/* do not need these pages */
+#define	_FADV_NOREUSE	5	/* access data only once */
+
+#if __BSD_VISIBLE
+#define	FADV_NORMAL	_FADV_NORMAL
+#define	FADV_RANDOM	_FADV_RANDOM
+#define	FADV_SEQUENTIAL	_FADV_SEQUENTIAL
+#define	FADV_WILLNEED	_FADV_WILLNEED
+#define	FADV_DONTNEED	_FADV_DONTNEED
+#define	FADV_NOREUSE	_FADV_NOREUSE
+#endif
+
+#if __POSIX_VISIBLE >= 200112
+#define	POSIX_FADV_NORMAL	_FADV_NORMAL
+#define	POSIX_FADV_RANDOM	_FADV_RANDOM
+#define	POSIX_FADV_SEQUENTIAL	_FADV_SEQUENTIAL
+#define	POSIX_FADV_WILLNEED	_FADV_WILLNEED
+#define	POSIX_FADV_DONTNEED	_FADV_DONTNEED
+#define	POSIX_FADV_NOREUSE	_FADV_NOREUSE
+#endif
+
+/*
  * XXX missing posix_fadvise() and POSIX_FADV_* macros.
  */
 
@@ -289,6 +317,12 @@
 #if __BSD_VISIBLE || __POSIX_VISIBLE >= 200809
 int	openat(int, const char *, int, ...);
 #endif
+#if __BSD_VISIBLE
+int	fadvise(int, off_t, off_t, int);
+#endif
+#if __POSIX_VISIBLE >= 200112
+int	posix_fadvise(int, off_t, off_t, int);
+#endif
 #if __BSD_VISIBLE || __POSIX_VISIBLE >= 200112
 int	posix_fallocate(int, off_t, off_t);
 #endif

==== //depot/projects/fadvise/sys/sys/file.h#2 (text+ko) ====

@@ -137,6 +137,9 @@
 	int		f_seqcount;	/* Count of sequential accesses. */
 	off_t		f_nextoff;	/* next expected read/write offset. */
 	struct cdev_privdata *f_cdevpriv; /* (d) Private data for the cdev. */
+	int		f_advice;	/* (f) FADV_* type. */
+	off_t		f_adviceoff;	/* (f) fadvice region offset. */
+	off_t		f_advicelen;	/* (f) fadvice region length. */
 	/*
 	 *  DFLAG_SEEKABLE specific fields
 	 */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201110122129.p9CLTDqE071590>