FreeBSD Mail Archives

Date:      Sun, 4 Nov 2001 17:12:39 -0800 (PST)
From:      Matthew Dillon <dillon@apollo.backplane.com>
To:        Mark Santcroos <marks@ripe.net>, current@FreeBSD.ORG
Subject:   patch #2 (was Re: buf_daemon() lockup)
Message-ID:  <200111050112.fA51Cdc42844@apollo.backplane.com>
References:  <20011101092118.A434@laptop.6bone.nl> <200111042259.fA4MxSc93566@apollo.backplane.com> <200111050006.fA506f309535@apollo.backplane.com> <200111050015.fA50Fdk09561@apollo.backplane.com>


:
:    Hmm..  that last patch didn't do it.  I've noticed some errors on the
:    console before the lockup:
:
:unexpected md driver lock: 0xe1813900: type VREG, usecount 2, writecount 1, refcount 3871, flags (VOBJBUF)
:        tag VT_UFS, ino 4, on dev da0s1h (13, 131079) lock type inode: EXCL (count 1) by pid 6

    Ok.  I think these unexpected md driver lock messages are bogus... I'll
    leave it to Poul to remove them.  The syncer or buf_daemon can be flushing
    buffers associated with the underlying file simultaneously with other
    processes doing MD ops.

    Here's a new patch.  It's the same as the old one except I fixed a
    missing B_NOWDRAIN flag in the clustering code, and I added B_NOWDRAIN
    support to the nfs client code.  I think the missing drain flag in the
    clustering code was the problem.  Try this patch.

						-Matt

Index: dev/md/md.c
===================================================================
RCS file: /home/ncvs/src/sys/dev/md/md.c,v
retrieving revision 1.47
diff -u -r1.47 md.c
--- dev/md/md.c	2001/10/11 23:38:13	1.47
+++ dev/md/md.c	2001/11/04 23:54:18
@@ -388,13 +388,18 @@
 		auio.uio_td = curthread;
 		if (VOP_ISLOCKED(sc->vnode, NULL))
 			vprint("unexpected md driver lock", sc->vnode);
+		/*
+		 * When reading set IO_DIRECT to try to avoid double-caching
+		 * the data.  When writing IO_DIRECT is not optimal, but we
+		 * must set IO_NOWDRAIN to avoid a wdrain deadlock.
+		 */
 		if (bp->bio_cmd == BIO_READ) {
 			vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
-			error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
+			error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
 		} else {
 			(void) vn_start_write(sc->vnode, &mp, V_WAIT);
 			vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
-			error = VOP_WRITE(sc->vnode, &auio, 0, sc->cred);
+			error = VOP_WRITE(sc->vnode, &auio, IO_NOWDRAIN, sc->cred);
 			vn_finished_write(mp);
 		}
 		VOP_UNLOCK(sc->vnode, 0, curthread);
Index: kern/vfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.291
diff -u -r1.291 vfs_bio.c
--- kern/vfs_bio.c	2001/10/21 06:26:55	1.291
+++ kern/vfs_bio.c	2001/11/04 23:41:19
@@ -758,11 +758,15 @@
 		int rtval = bufwait(bp);
 		brelse(bp);
 		return (rtval);
-	} else {
+	} else if ((oldflags & B_NOWDRAIN) == 0) {
 		/*
 		 * don't allow the async write to saturate the I/O
-		 * system.  There is no chance of deadlock here because
-		 * we are blocking on I/O that is already in-progress.
+		 * system.  Deadlocks can occur only if a device strategy
+		 * routine (like in MD) turns around and issues another
+		 * high-level write, in which case B_NOWDRAIN is expected
+		 * to be set.  Otherwise we will not deadlock here because
+		 * we are blocking waiting for I/O that is already in-progress
+		 * to complete.
 		 */
 		waitrunningbufspace();
 	}
@@ -1286,7 +1290,8 @@
 
 	/* unlock */
 	BUF_UNLOCK(bp);
-	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
+	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | 
+			B_DIRECT | B_NOWDRAIN);
 	bp->b_ioflags &= ~BIO_ORDERED;
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("brelse: not dirty");
Index: kern/vfs_cluster.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_cluster.c,v
retrieving revision 1.114
diff -u -r1.114 vfs_cluster.c
--- kern/vfs_cluster.c	2001/10/25 22:49:48	1.114
+++ kern/vfs_cluster.c	2001/11/05 00:49:33
@@ -836,7 +836,7 @@
 		bp->b_data = (char *)((vm_offset_t)bp->b_data |
 		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
 		bp->b_flags |= B_CLUSTER |
-				(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
+				(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT | B_NOWDRAIN));
 		bp->b_iodone = cluster_callback;
 		pbgetvp(vp, bp);
 		/*
Index: nfsclient/nfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/nfsclient/nfs_bio.c,v
retrieving revision 1.102
diff -u -r1.102 nfs_bio.c
--- nfsclient/nfs_bio.c	2001/10/11 23:38:16	1.102
+++ nfsclient/nfs_bio.c	2001/11/05 01:07:42
@@ -961,6 +961,12 @@
 			}
 			vfs_bio_set_validclean(bp, on, n);
 		}
+		/*
+		 * If IO_NOWDRAIN then set B_NOWDRAIN (nfs-backed MD 
+		 * filesystem)
+		 */
+		if (ioflag & IO_NOWDRAIN)
+			bp->b_flags |= B_NOWDRAIN;
 
 		/*
 		 * If IO_SYNC do bwrite().
Index: sys/buf.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/buf.h,v
retrieving revision 1.121
diff -u -r1.121 buf.h
--- sys/buf.h	2001/09/12 08:38:04	1.121
+++ sys/buf.h	2001/11/04 23:30:25
@@ -192,6 +192,11 @@
  *			the pages underlying the buffer.  B_DIRECT is
  *			sticky until the buffer is released and typically
  *			only has an effect when B_RELBUF is also set.
+ *
+ *	B_NOWDRAIN	This flag should be set when a device (like MD)
+ *			does a turn-around VOP_WRITE from its strategy
+ *			routine.  This flag prevents bwrite() from blocking
+ *			in wdrain, avoiding a deadlock situation.
  */
 
 #define	B_AGE		0x00000001	/* Move to age queue when I/O done. */
@@ -204,7 +209,7 @@
 #define	B_DELWRI	0x00000080	/* Delay I/O until buffer reused. */
 #define	B_DONE		0x00000200	/* I/O completed. */
 #define	B_EINTR		0x00000400	/* I/O was interrupted */
-#define	B_00000800	0x00000800	/* Available flag. */
+#define	B_NOWDRAIN	0x00000800	/* Avoid wdrain deadlock */
 #define	B_SCANNED	0x00001000	/* VOP_FSYNC funcs mark written bufs */
 #define	B_INVAL		0x00002000	/* Does not contain valid info. */
 #define	B_LOCKED	0x00004000	/* Locked in core (not reusable). */
Index: sys/vnode.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/vnode.h,v
retrieving revision 1.162
diff -u -r1.162 vnode.h
--- sys/vnode.h	2001/10/27 19:58:55	1.162
+++ sys/vnode.h	2001/11/04 23:27:40
@@ -222,6 +222,7 @@
 #define	IO_INVAL	0x40		/* invalidate after I/O */
 #define	IO_ASYNC	0x80		/* bawrite rather then bdwrite */
 #define IO_DIRECT	0x100		/* attempt to bypass buffer cache */
+#define IO_NOWDRAIN	0x200		/* do not block on wdrain */
 
 /*
  *  Modes.  Some values same as Ixxx entries from inode.h for now.
Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.82
diff -u -r1.82 ufs_readwrite.c
--- ufs/ufs/ufs_readwrite.c	2001/09/12 08:38:10	1.82
+++ ufs/ufs/ufs_readwrite.c	2001/11/04 23:29:15
@@ -511,6 +511,8 @@
 			break;
 		if (ioflag & IO_DIRECT)
 			bp->b_flags |= B_DIRECT;
+		if (ioflag & IO_NOWDRAIN)
+			bp->b_flags |= B_NOWDRAIN;
 
 		if (uio->uio_offset + xfersize > ip->i_size) {
 			ip->i_size = uio->uio_offset + xfersize;

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-current" in the body of the message

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200111050112.fA51Cdc42844>

Header And Logo

Peripheral Links

Site Navigation

Header And Logo

Peripheral Links

Search

Site Navigation