Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 12 Nov 2005 13:29:09 -0500
From:      Craig Rodrigues <rodrigc@crodrigues.org>
To:        freebsd-arch@freebsd.org
Cc:        kan@freebsd.org, jeff@freebsd.org
Subject:   [RFC] vfs_bio additions, motivated by XFS for FreeBSD project
Message-ID:  <20051112182909.GA4301@crodrigues.org>

next in thread | raw e-mail | index | archive | help
Hi,

Now that FreeBSD 6.0 is released, I would like to work
on integrating code from the XFS for FreeBSD project into
FreeBSD-CURRENT.

Alexander Kabaev made some changes to vfs_bio.c which are
needed by the XFS for FreeBSD code.  In addition to some
new functions, this patch adds three new fields
to struct buf (b_fsprivate1, b_fsprivate2, b_fsprivate3).
You don't see their use here, but in the XFS for FreeBSD code
(which you can get from http://people.freebsd.org/~rodrigc/xfs/ ),
they are used to cache certain information.

Comments?


--- //depot/vendor/freebsd/src/sys/kern/vfs_bio.c	2005/10/08 15:01:11
+++ //depot/projects/src/sys/kern/vfs_bio.c	2005/10/08 16:09:54
@@ -216,7 +216,7 @@
  */
 static struct mtx rbreqlock;
 
-/* 
+/*
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
@@ -233,8 +233,12 @@
 /*
  * Lock that protects against bwait()/bdone()/B_DONE races.
  */
+static struct mtx bdonelock;
 
-static struct mtx bdonelock;
+/*
+ * Lock that protects b_pin_count and the bunpin_wait() sleep/wakeup.
+ */
+static struct mtx bpinlock;
 
 /*
  * Definitions for the buffer free lists.
@@ -523,6 +527,7 @@
 	mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF);
+	mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF);
 
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
@@ -636,7 +641,7 @@
  *	bremfree:
  *
  *	Mark the buffer for removal from the appropriate free list in brelse.
- *	
+ *
  */
 void
 bremfree(struct buf *bp)
@@ -720,18 +725,51 @@
 }
 
 /*
+ * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
+ * clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is set,
+ * the buffer is valid and we do not have to do anything.
+ */
+void
+breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
+    int cnt, struct ucred * cred)
+{
+	struct buf *rabp;
+	int i;
+
+	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+		if (inmem(vp, *rablkno))
+			continue;
+		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+
+		if ((rabp->b_flags & B_CACHE) == 0) {
+			if (curthread != PCPU_GET(idlethread))
+				curthread->td_proc->p_stats->p_ru.ru_inblock++;
+			rabp->b_flags |= B_ASYNC;
+			rabp->b_flags &= ~B_INVAL;
+			rabp->b_ioflags &= ~BIO_ERROR;
+			rabp->b_iocmd = BIO_READ;
+			if (rabp->b_rcred == NOCRED && cred != NOCRED)
+				rabp->b_rcred = crhold(cred);
+			vfs_busy_pages(rabp, 0);
+			BUF_KERNPROC(rabp);
+			rabp->b_iooffset = dbtob(rabp->b_blkno);
+			bstrategy(rabp);
+		} else {
+			brelse(rabp);
+		}
+	}
+}
+
+/*
  * Operates like bread, but also starts asynchronous I/O on
- * read-ahead blocks.  We must clear BIO_ERROR and B_INVAL prior
- * to initiating I/O . If B_CACHE is set, the buffer is valid 
- * and we do not have to do anything.
+ * read-ahead blocks.
  */
 int
 breadn(struct vnode * vp, daddr_t blkno, int size,
     daddr_t * rablkno, int *rabsize,
     int cnt, struct ucred * cred, struct buf **bpp)
 {
-	struct buf *bp, *rabp;
-	int i;
+	struct buf *bp;
 	int rv = 0, readwait = 0;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
@@ -752,29 +790,8 @@
 		++readwait;
 	}
 
-	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
-		if (inmem(vp, *rablkno))
-			continue;
-		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+	breada(vp, rablkno, rabsize, cnt, cred);
 
-		if ((rabp->b_flags & B_CACHE) == 0) {
-			if (curthread != PCPU_GET(idlethread))
-				curthread->td_proc->p_stats->p_ru.ru_inblock++;
-			rabp->b_flags |= B_ASYNC;
-			rabp->b_flags &= ~B_INVAL;
-			rabp->b_ioflags &= ~BIO_ERROR;
-			rabp->b_iocmd = BIO_READ;
-			if (rabp->b_rcred == NOCRED && cred != NOCRED)
-				rabp->b_rcred = crhold(cred);
-			vfs_busy_pages(rabp, 0);
-			BUF_KERNPROC(rabp);
-			rabp->b_iooffset = dbtob(rabp->b_blkno);
-			bstrategy(rabp);
-		} else {
-			brelse(rabp);
-		}
-	}
-
 	if (readwait) {
 		rv = bufwait(bp);
 	}
@@ -807,6 +824,10 @@
 
 	if (BUF_REFCNT(bp) == 0)
 		panic("bufwrite: buffer is not busy???");
+
+	if (bp->b_pin_count > 0)
+		bunpin_wait(bp);
+
 	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
 	    ("FFS background buffer should not get here %p", bp));
 
@@ -1117,6 +1138,11 @@
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
+	if (bp->b_flags & B_MANAGED) {
+		bqrelse(bp);
+		return;
+	}
+
 	if (bp->b_iocmd == BIO_WRITE &&
 	    (bp->b_ioflags & BIO_ERROR) &&
 	    !(bp->b_flags & B_INVAL)) {
@@ -1286,7 +1312,7 @@
 		}
 
 	}
-			
+
 	if (BUF_REFCNT(bp) > 1) {
 		/* do not release to free list */
 		BUF_UNLOCK(bp);
@@ -1394,6 +1420,18 @@
 		BUF_UNLOCK(bp);
 		return;
 	}
+
+	if (bp->b_flags & B_MANAGED) {
+		if (bp->b_flags & B_REMFREE) {
+			mtx_lock(&bqlock);
+			bremfreel(bp);
+			mtx_unlock(&bqlock);
+		}
+		bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+		BUF_UNLOCK(bp);
+		return;
+	}
+
 	mtx_lock(&bqlock);
 	/* Handle delayed bremfree() processing. */
 	if (bp->b_flags & B_REMFREE)
@@ -1821,6 +1859,10 @@
 		bp->b_npages = 0;
 		bp->b_dirtyoff = bp->b_dirtyend = 0;
 		bp->b_bufobj = NULL;
+		bp->b_pin_count = 0;
+		bp->b_fsprivate1 = NULL;
+		bp->b_fsprivate2 = NULL;
+		bp->b_fsprivate3 = NULL;
 
 		LIST_INIT(&bp->b_dep);
 
@@ -2059,6 +2101,10 @@
 
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
+		if (bp->b_pin_count > 0) {
+			BUF_UNLOCK(bp);
+			continue;
+		}
 		BO_LOCK(bp->b_bufobj);
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
 		    (bp->b_flags & B_DELWRI) == 0) {
@@ -2393,6 +2439,19 @@
 			if ((bp->b_flags & B_VMIO) == 0 ||
 			    (size > bp->b_kvasize)) {
 				if (bp->b_flags & B_DELWRI) {
+					/*
+					 * If buffer is pinned and caller does
+					 * not want to sleep waiting for it to
+					 * be unpinned, bail out.
+					 */
+					if (bp->b_pin_count > 0) {
+						if (flags & GB_LOCK_NOWAIT) {
+							bqrelse(bp);
+							return (NULL);
+						} else {
+							bunpin_wait(bp);
+						}
+					}
 					bp->b_flags |= B_NOCACHE;
 					bwrite(bp);
 				} else {
@@ -3034,11 +3093,11 @@
 	struct bufobj *dropobj;
 	void    (*biodone)(struct buf *);
 
-
 	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	dropobj = NULL;
 
-	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
+	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
+	    BUF_REFCNT(bp)));
 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 
 	runningbufwakeup(bp);
@@ -3053,6 +3112,19 @@
 			bufobj_wdrop(dropobj);
 		return;
 	}
+
+	bufdone_finish(bp);
+
+	if (dropobj)
+		bufobj_wdrop(dropobj);
+}
+
+void
+bufdone_finish(struct buf *bp)
+{
+	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
+	    BUF_REFCNT(bp)));
+
 	if (LIST_FIRST(&bp->b_dep) != NULL)
 		buf_complete(bp);
 
@@ -3118,7 +3190,8 @@
 				if (m == NULL)
 					panic("biodone: page disappeared!");
 				bp->b_pages[i] = m;
-				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+				    bp->b_pages, bp->b_npages);
 			}
 #if defined(VFS_BIO_DEBUG)
 			if (OFF_TO_IDX(foff) != m->pindex) {
@@ -3130,7 +3203,7 @@
 
 			/*
 			 * In the write case, the valid and clean bits are
-			 * already changed correctly ( see bdwrite() ), so we 
+			 * already changed correctly ( see bdwrite() ), so we
 			 * only need to do this here in the read case.
 			 */
 			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
@@ -3185,8 +3258,6 @@
 			bqrelse(bp);
 	} else
 		bdone(bp);
-	if (dropobj)
-		bufobj_wdrop(dropobj);
 }
 
 /*
@@ -3742,6 +3813,32 @@
 	return (error);
 }
 
+void
+bpin(struct buf *bp)
+{
+	mtx_lock(&bpinlock);
+	bp->b_pin_count ++;
+	mtx_unlock(&bpinlock);
+}
+
+void
+bunpin(struct buf *bp)
+{
+	mtx_lock(&bpinlock);
+	if ( --bp->b_pin_count == 0)
+		wakeup(bp);
+	mtx_unlock(&bpinlock);
+}
+
+void
+bunpin_wait(struct buf *bp)
+{
+	mtx_lock(&bpinlock);
+	while (bp->b_pin_count > 0)
+		msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0);
+	mtx_unlock(&bpinlock);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -3794,3 +3891,4 @@
 	}
 }
 #endif /* DDB */
+
--- //depot/vendor/freebsd/src/sys/kern/vfs_cluster.c	2005/08/14 09:53:08
+++ //depot/projects/src/sys/kern/vfs_cluster.c	2005/08/14 10:01:58
@@ -765,6 +765,12 @@
 			--len;
 			continue;
 		}
+		if (tbp->b_pin_count >  0) {
+			BUF_UNLOCK(tbp);
+			++start_lbn;
+			--len;
+			continue;
+		}
 		bremfree(tbp);
 		tbp->b_flags &= ~B_DONE;
 
@@ -868,6 +874,15 @@
 					BUF_UNLOCK(tbp);
 					break;
 				}
+
+				/*
+				 * Do not pull in pinned buffers.
+				 */
+				if (tbp->b_pin_count > 0) {
+					BUF_UNLOCK(tbp);
+					break;
+				}
+
 				/*
 				 * Ok, it's passed all the tests,
 				 * so remove it from the free list
@@ -979,3 +994,4 @@
 	buflist->bs_nchildren = i + 1;
 	return (buflist);
 }
+
--- //depot/vendor/freebsd/src/sys/sys/buf.h	2005/10/08 15:01:11
+++ //depot/projects/src/sys/sys/buf.h	2005/10/08 16:09:54
@@ -135,6 +135,10 @@
 	struct	vm_page *b_pages[btoc(MAXPHYS)];
 	int		b_npages;
 	struct	workhead b_dep;		/* (D) List of filesystem dependencies. */
+	void	*b_fsprivate1;
+	void	*b_fsprivate2;
+	void	*b_fsprivate3;
+	int	b_pin_count;
 };
 
 #define b_object	b_bufobj->bo_object
@@ -214,7 +218,7 @@
 #define	B_01000000	0x01000000	/* Available flag. */
 #define	B_02000000	0x02000000	/* Available flag. */
 #define	B_PAGING	0x04000000	/* volatile paging I/O -- bypass VMIO */
-#define	B_08000000	0x08000000	/* Available flag. */
+#define B_MANAGED	0x08000000	/* Managed by FS. */
 #define B_RAM		0x10000000	/* Read ahead mark (flag) */
 #define B_VMIO		0x20000000	/* VMIO flag */
 #define B_CLUSTER	0x40000000	/* pagein op, so swap() can count it */
@@ -486,6 +490,7 @@
 void	bremfree(struct buf *);
 void	bremfreef(struct buf *);	/* XXX Force bremfree, only for nfs. */
 int	bread(struct vnode *, daddr_t, int, struct ucred *, struct buf **);
+void	breada(struct vnode *, daddr_t *, int *, int, struct ucred *);
 int	breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int,
 	    struct ucred *, struct buf **);
 void	bdwrite(struct buf *);
@@ -504,6 +509,7 @@
 int	bufwait(struct buf *);
 int	bufwrite(struct buf *);
 void	bufdone(struct buf *);
+void	bufdone_finish(struct buf *);
 
 int	cluster_read(struct vnode *, u_quad_t, daddr_t, long,
 	    struct ucred *, long, int, struct buf **);
@@ -527,7 +533,11 @@
 struct	buf *trypbuf(int *);
 void	bwait(struct buf *, u_char, const char *);
 void	bdone(struct buf *);
+void	bpin(struct buf *);
+void	bunpin(struct buf *);
+void 	bunpin_wait(struct buf *);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_BUF_H_ */
+
-- 
Craig Rodrigues        
rodrigc@crodrigues.org



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20051112182909.GA4301>