Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 19 Apr 2005 11:18:00 -0400
From:      Brian Fundakowski Feldman <green@freebsd.org>
To:        Marc Olzheim <marcolz@stack.nl>
Cc:        freebsd-current@freebsd.org
Subject:   Re: NFS client/buffer cache deadlock
Message-ID:  <20050419151800.GE1157@green.homeunix.org>
In-Reply-To: <20050419133227.GA11612@stack.nl>
References:  <20050415050821.GO981@green.homeunix.org> <20050415132108.GC85922@stack.nl> <20050415150708.GP981@green.homeunix.org> <20050418092550.GA97539@stack.nl> <20050418092752.GB97539@stack.nl> <20050418202213.GC1157@green.homeunix.org> <20050418203321.GA88774@stack.nl> <20050419133227.GA11612@stack.nl>

next in thread | previous in thread | raw e-mail | index | archive | help
On Tue, Apr 19, 2005 at 03:32:27PM +0200, Marc Olzheim wrote:
> On Mon, Apr 18, 2005 at 10:33:21PM +0200, Marc Olzheim wrote:
> > On Mon, Apr 18, 2005 at 04:22:13PM -0400, Brian Fundakowski Feldman wrote:
> > > > > http://green.homeunix.org/~green/nfs_client.deadlock.patch
> > > > Hmm, could you change it into a diff -u ?
> > > 
> > > I replaced the patch with one with -u for you.
> > 
> > Ok, great, I'll do some test on my -CURRENT machine at work
> > in the morning (CEST).
> 
> Hmm, sys/vnode.h has changed a lot from RELENG_5 to -CURRENT... Perhaps
> someone with insight in this overhaul could change the patch to HEAD ?

Does this work for you?

cvs diff: Diffing sys
Index: sys/buf.h
===================================================================
RCS file: /usr/ncvs/src/sys/sys/buf.h,v
retrieving revision 1.185
diff -u -r1.185 buf.h
--- sys/buf.h	10 Feb 2005 12:17:48 -0000	1.185
+++ sys/buf.h	19 Apr 2005 14:30:54 -0000
@@ -465,6 +465,7 @@
 extern int	maxswzone;		/* Max KVA for swap structures */
 extern int	maxbcache;		/* Max KVA for buffer cache */
 extern int	runningbufspace;
+extern int	hibufspace;
 extern int      buf_maxio;              /* nominal maximum I/O for buffer */
 extern struct	buf *buf;		/* The buffer headers. */
 extern char	*buffers;		/* The buffer contents. */
cvs diff: Diffing kern
Index: kern/vfs_bio.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.482
diff -u -r1.482 vfs_bio.c
--- kern/vfs_bio.c	25 Mar 2005 00:20:37 -0000	1.482
+++ kern/vfs_bio.c	19 Apr 2005 14:30:54 -0000
@@ -127,7 +127,7 @@
 static int lobufspace;
 SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
     "Minimum amount of buffers we want to have");
-static int hibufspace;
+int hibufspace;
 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
     "Maximum allowed value of bufspace (excluding buf_daemon)");
 static int bufreusecnt;
cvs diff: Diffing nfsclient
Index: nfsclient/nfs_bio.c
===================================================================
RCS file: /usr/ncvs/src/sys/nfsclient/nfs_bio.c,v
retrieving revision 1.150
diff -u -r1.150 nfs_bio.c
--- nfsclient/nfs_bio.c	18 Mar 2005 21:23:32 -0000	1.150
+++ nfsclient/nfs_bio.c	19 Apr 2005 15:11:13 -0000
@@ -844,6 +844,7 @@
 	struct vattr vattr;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	daddr_t lbn;
+	off_t commitleft; /* if non-zero, the amount left we may write */
 	int bcount;
 	int n, on, error = 0;
 	int haverslock = 0;
@@ -873,6 +874,14 @@
 	 */
 	if (ioflag & (IO_APPEND | IO_SYNC)) {
 		if (np->n_flag & NMODIFIED) {
+flush_and_restart:
+			/*
+			 * Require non-blocking, synchronous writes to
+			 * dirty files to inform the program it needs
+			 * to fsync(2) explicitly.
+			 */
+			if (ioflag & IO_NDELAY)
+				return (EAGAIN);
 			np->n_attrstamp = 0;
 			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error)
@@ -953,12 +962,65 @@
 	}
 
 	biosize = vp->v_mount->mnt_stat.f_iosize;
+	commitleft = 0;
+	/*
+	 * If there are possible modifications, find all of this file's
+	 * B_NEEDCOMMIT buffers.  If our writes would exceed the local
+	 * maximum per-file write commit size when combined with those,
+	 * we decide to flush and/or do a short write.
+	 */
+	if ((ioflag & (IO_SYNC | IO_INVAL)) != (IO_SYNC | IO_INVAL)) {
+		commitleft = nmp->nm_wcommitsize;
+		if (np->n_flag & NMODIFIED) {
+			int wouldcommit = 0;
+			BO_LOCK(vp->v_bufobj);
+			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
+				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
+				    b_bobufs) {
+					if (bp->b_flags & B_NEEDCOMMIT)
+						wouldcommit += bp->b_bcount;
+				}
+			}
+			BO_UNLOCK(vp->v_bufobj);
+			/*
+			 * Since we're not operating synchronously and
+			 * bypassing the buffer cache, we are in a commit
+			 * and holding all of these buffers whether
+			 * transmitted or not.  If not limited, this
+			 * will lead to the buffer cache deadlocking,
+			 * as no one else can flush our uncommitted buffers.
+			 */
+			wouldcommit += uio->uio_resid;
+			/*
+			 * If we would initially exceed the maximum
+			 * outstanding write commit size, flush and restart.
+			 */
+			if (wouldcommit > commitleft) {
+				if (haverslock) {
+					nfs_rsunlock(np, td);
+					haverslock = 0;
+				}
+				goto flush_and_restart;
+			}
+		} else {
+			/*
+			 * With no outstanding commits, we are limited only
+			 * by commitleft as to how far we can go.
+			 */
+		}
+	}
 
 	do {
 		nfsstats.biocache_writes++;
 		lbn = uio->uio_offset / biosize;
 		on = uio->uio_offset & (biosize-1);
 		n = min((unsigned)(biosize - on), uio->uio_resid);
+		/* Always allow at least the very first write. */
+		if (commitleft > 0) {
+			commitleft -= n;
+			if (commitleft == 0)
+				commitleft = -1;
+		}
 again:
 		/*
 		 * Handle direct append and file extension cases, calculate
@@ -1146,10 +1208,21 @@
 		} else {
 			bdwrite(bp);
 		}
-	} while (uio->uio_resid > 0 && n > 0);
+	} while (uio->uio_resid > 0 && n > 0 && commitleft >= 0);
 
-	if (haverslock)
+	if (haverslock) {
+		haverslock = 0;
 		nfs_rsunlock(np, td);
+	}
+
+	/*
+	 * On the off chance that we're trying to do a very large
+	 * and non-atomic write, go ahead and let that just continue
+	 * within the same call after flushing out what's been written.
+	 */
+	if (error == 0 && uio->uio_resid > 0 && n > 0 && commitleft < 0 &&
+	    !(ioflag & IO_UNIT))
+		goto flush_and_restart;
 
 	return (error);
 }
Index: nfsclient/nfs_vfsops.c
===================================================================
RCS file: /usr/ncvs/src/sys/nfsclient/nfs_vfsops.c,v
retrieving revision 1.172
diff -u -r1.172 nfs_vfsops.c
--- nfsclient/nfs_vfsops.c	24 Mar 2005 07:37:22 -0000	1.172
+++ nfsclient/nfs_vfsops.c	19 Apr 2005 14:32:19 -0000
@@ -41,6 +41,8 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
@@ -626,6 +628,12 @@
 		else
 			nmp->nm_readahead = NFS_MAXRAHEAD;
 	}
+	if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) {
+		if (argp->wcommitsize < nmp->nm_wsize)
+			nmp->nm_wcommitsize = nmp->nm_wsize;
+		else
+			nmp->nm_wcommitsize = argp->wcommitsize;
+	}
 	if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) {
 		if (argp->deadthresh <= NFS_MAXDEADTHRESH)
 			nmp->nm_deadthresh = argp->deadthresh;
@@ -808,6 +816,7 @@
 		nmp->nm_wsize = NFS_WSIZE;
 		nmp->nm_rsize = NFS_RSIZE;
 	}
+	nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000);
 	nmp->nm_readdirsize = NFS_READDIRSIZE;
 	nmp->nm_numgrps = NFS_MAXGRPS;
 	nmp->nm_readahead = NFS_DEFRAHEAD;
Index: nfsclient/nfsargs.h
===================================================================
RCS file: /usr/ncvs/src/sys/nfsclient/nfsargs.h,v
retrieving revision 1.67
diff -u -r1.67 nfsargs.h
--- nfsclient/nfsargs.h	7 Jan 2005 01:45:51 -0000	1.67
+++ nfsclient/nfsargs.h	19 Apr 2005 14:30:54 -0000
@@ -56,7 +56,7 @@
 	int		retrans;	/* times to retry send */
 	int		maxgrouplist;	/* Max. size of group list */
 	int		readahead;	/* # of blocks to readahead */
-	int		__pad1;		/* was "leaseterm" */
+	int		wcommitsize;	/* Max. write commit size in bytes */
 	int		deadthresh;	/* Retrans threshold */
 	char		*hostname;	/* server's name */
 	int		acregmin;	/* cache attrs for reg files min time */
@@ -80,7 +80,7 @@
 #define	NFSMNT_NFSV3		0x00000200  /* Use NFS Version 3 protocol */
 /* 0x400 free, was NFSMNT_KERB */
 #define	NFSMNT_DUMBTIMR		0x00000800  /* Don't estimate rtt dynamically */
-/* 0x1000 free, was NFSMNT_LEASETERM */
+#define NFSMNT_WCOMMITSIZE	0x00001000  /* set max write commit size */
 #define	NFSMNT_READAHEAD	0x00002000  /* set read ahead */
 #define	NFSMNT_DEADTHRESH	0x00004000  /* set dead server retry thresh */
 #define	NFSMNT_RESVPORT		0x00008000  /* Allocate a reserved port */
Index: nfsclient/nfsmount.h
===================================================================
RCS file: /usr/ncvs/src/sys/nfsclient/nfsmount.h,v
retrieving revision 1.29
diff -u -r1.29 nfsmount.h
--- nfsclient/nfsmount.h	7 Jan 2005 01:45:51 -0000	1.29
+++ nfsclient/nfsmount.h	19 Apr 2005 14:30:54 -0000
@@ -74,6 +74,7 @@
 	int	nm_wsize;		/* Max size of write rpc */
 	int	nm_readdirsize;		/* Size of a readdir rpc */
 	int	nm_readahead;		/* Num. of blocks to readahead */
+	int	nm_wcommitsize;		/* Max size of commit for write */
 	int	nm_acdirmin;		/* Directory attr cache min lifetime */
 	int	nm_acdirmax;		/* Directory attr cache max lifetime */
 	int	nm_acregmin;		/* Reg file attr cache min lifetime */

-- 
Brian Fundakowski Feldman                           \'[ FreeBSD ]''''''''''\
  <> green@FreeBSD.org                               \  The Power to Serve! \
 Opinions expressed are my own.                       \,,,,,,,,,,,,,,,,,,,,,,\



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20050419151800.GE1157>