FreeBSD Mail Archives

Date:      Thu, 13 Dec 2001 10:35:56 -0800 (PST)
From:      Matthew Dillon <dillon@apollo.backplane.com>
To:        David Greenman <dg@root.com>
Cc:        Jordan Hubbard <jkh@winston.freebsd.org>, Peter Wemm <peter@wemm.org>, Mike Smith <msmith@FreeBSD.ORG>, hackers@FreeBSD.ORG, msmith@mass.dis.org
Subject:   NFS Patch #4 -- survived overnight test. (was Re: Found NFS data corruption bug... (was Re:...))
Message-ID:  <200112131835.fBDIZuL70031@apollo.backplane.com>
References:  <58885.1008217148@winston.freebsd.org> <200112130608.fBD689K49906@apollo.backplane.com> <20011212224927.A73226@nexus.root.com>

    Ok, here is the latest patch set.  This patch set survived an 
    overnight run of the nfs torture test that Jordan posted... it
    got through 597,000 test calls over NFSv3, 367,000 over NFSv2, and
    1.35 million on a local filesystem.

    This patch set is for -stable.  It again has the old version of the
    softupdates patch (the new version will be MFC'd in two days from
    -current).

    In addition to previous bug fixes this patch fixes:

	* An edge case with shared R+W mmap()'s and truncate

	* A bug in my previous patch set related to a junked buffer

	* The straddle case for VM pages and buffer cache buffers when
	  truncating.

    That's 8 significant bug fixes total, all rolled into one :-)

    I am going to forward port and test this stuff on -current today and
    probably commit it to -current tonight, with a 3-day turn to -stable.

    I would appreciate other VM gurus taking a look at the 
    vm_page_set_validclean() changes.

    Are there still bugs in NFS?  You bet!   I'm sure there are bugs
    related to multiple clients and/or the server modifying files out
    from under a client, and I think the potential issue with nfsiod
    ordering that was posted to the list is a definite possibility.  But
    this should go a long way to improving the rock solid reputation of
    our stack!

						-Matt

Index: kern/vfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.242.2.13
diff -u -r1.242.2.13 vfs_bio.c
--- kern/vfs_bio.c	2001/11/18 07:10:59	1.242.2.13
+++ kern/vfs_bio.c	2001/12/13 05:52:55
@@ -2189,9 +2189,22 @@
 		 * to softupdates re-dirtying the buffer.  In the latter
 		 * case, B_CACHE is set after the first write completes,
 		 * preventing further loops.
+		 *
+		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
+		 * above while extending the buffer, we cannot allow the
+		 * buffer to remain with B_CACHE set after the write
+		 * completes or it will represent a corrupt state.  To
+		 * deal with this we set B_NOCACHE to scrap the buffer
+		 * after the write.
+		 *
+		 * We might be able to do something fancy, like setting
+		 * B_CACHE in bwrite() except if B_DELWRI is already set,
+		 * so the below call doesn't set B_CACHE, but that gets real
+		 * confusing.  This is much easier.
 		 */
 
 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+			bp->b_flags |= B_NOCACHE;
 			VOP_BWRITE(bp->b_vp, bp);
 			goto loop;
 		}
Index: nfs/nfs.h
===================================================================
RCS file: /home/ncvs/src/sys/nfs/Attic/nfs.h,v
retrieving revision 1.53.2.3
diff -u -r1.53.2.3 nfs.h
--- nfs/nfs.h	2001/09/11 09:49:54	1.53.2.3
+++ nfs/nfs.h	2001/12/13 06:47:28
@@ -721,6 +721,8 @@
 			 struct proc *procp, struct mbuf **mrq));
 void	nfsrv_rcv __P((struct socket *so, void *arg, int waitflag));
 void	nfsrv_slpderef __P((struct nfssvc_sock *slp));
+int	nfs_meta_setsize __P((struct vnode *vp, struct ucred *cred, struct proc *p, u_quad_t nsize));
+
 #endif	/* _KERNEL */
 
 #endif
Index: nfs/nfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/nfs/Attic/nfs_bio.c,v
retrieving revision 1.83.2.1
diff -u -r1.83.2.1 nfs_bio.c
--- nfs/nfs_bio.c	2001/11/18 07:10:59	1.83.2.1
+++ nfs/nfs_bio.c	2001/12/13 18:24:01
@@ -193,8 +193,14 @@
 			vm_page_set_validclean(m, 0, size - toff);
 			/* handled by vm_fault now	  */
 			/* vm_page_zero_invalid(m, TRUE); */
+		} else {
+			/*
+			 * Read operation was short.  If no error occured
+			 * we may have hit a zero-fill section.   We simply
+			 * leave valid set to 0.
+			 */
+			;
 		}
-		
 		if (i != ap->a_reqpage) {
 			/*
 			 * Whether or not to leave the page activated is up in
@@ -896,9 +902,7 @@
 				else
 					bcount = np->n_size - (off_t)lbn * biosize;
 			}
-
 			bp = nfs_getcacheblk(vp, lbn, bcount, p);
-
 			if (uio->uio_offset + n > np->n_size) {
 				np->n_size = uio->uio_offset + n;
 				np->n_flag |= NMODIFIED;
@@ -1409,11 +1413,13 @@
 	    io.iov_len = uiop->uio_resid = bp->b_bcount;
 	    io.iov_base = bp->b_data;
 	    uiop->uio_rw = UIO_READ;
+
 	    switch (vp->v_type) {
 	    case VREG:
 		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
 		nfsstats.read_bios++;
 		error = nfs_readrpc(vp, uiop, cr);
+
 		if (!error) {
 		    if (uiop->uio_resid) {
 			/*
@@ -1425,7 +1431,7 @@
 			 * writes, but that is not possible any longer.
 			 */
 			int nread = bp->b_bcount - uiop->uio_resid;
-			int left  = bp->b_bcount - nread;
+			int left  = uiop->uio_resid;
 
 			if (left > 0)
 				bzero((char *)bp->b_data + nread, left);
@@ -1597,3 +1603,48 @@
 	biodone(bp);
 	return (error);
 }
+
+/*
+ * Used to aid in handling ftruncate() operations on the NFS client side.
+ * Truncation creates a number of special problems for NFS.  We have to
+ * throw away VM pages and buffer cache buffers that are beyond EOF, and
+ * we have to properly handle VM pages or (potentially dirty) buffers
+ * that straddle the truncation point.
+ */
+
+int
+nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct proc *p, u_quad_t nsize)
+{
+	struct nfsnode *np = VTONFS(vp);
+	u_quad_t tsize = np->n_size;
+	int biosize = vp->v_mount->mnt_stat.f_iosize;
+	int error = 0;
+
+	np->n_size = nsize;
+
+	if (np->n_size < tsize) {
+		struct buf *bp;
+		daddr_t lbn;
+		int bufsize;
+
+		/*
+		 * vtruncbuf() doesn't get the buffer overlapping the 
+		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
+		 * buffer that now needs to be truncated.
+		 */
+		error = vtruncbuf(vp, cred, p, nsize, biosize);
+		lbn = nsize / biosize;
+		bufsize = nsize & (biosize - 1);
+		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
+		if (bp->b_dirtyoff > bp->b_bcount)
+			bp->b_dirtyoff = bp->b_bcount;
+		if (bp->b_dirtyend > bp->b_bcount)
+			bp->b_dirtyend = bp->b_bcount;
+		bp->b_flags |= B_RELBUF;  /* don't leave garbage around */
+		brelse(bp);
+	} else {
+		vnode_pager_setsize(vp, nsize);
+	}
+	return(error);
+}
+
Index: nfs/nfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/nfs/Attic/nfs_vnops.c,v
retrieving revision 1.150.2.4
diff -u -r1.150.2.4 nfs_vnops.c
--- nfs/nfs_vnops.c	2001/08/05 00:23:58	1.150.2.4
+++ nfs/nfs_vnops.c	2001/12/13 06:46:32
@@ -710,7 +710,17 @@
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
-			vnode_pager_setsize(vp, vap->va_size);
+
+			/*
+			 * We run vnode_pager_setsize() early (why?),
+			 * we must set np->n_size now to avoid vinvalbuf
+			 * V_SAVE races that might setsize a lower
+			 * value.
+			 */
+
+			error = nfs_meta_setsize(vp, ap->a_cred, 
+						ap->a_p, vap->va_size);
+
  			if (np->n_flag & NMODIFIED) {
  			    if (vap->va_size == 0)
  				error = nfs_vinvalbuf(vp, 0,
@@ -719,12 +729,12 @@
  				error = nfs_vinvalbuf(vp, V_SAVE,
  					ap->a_cred, ap->a_p, 1);
  			    if (error) {
+				np->n_size = tsize;
 				vnode_pager_setsize(vp, np->n_size);
  				return (error);
 			    }
  			}
- 			tsize = np->n_size;
- 			np->n_size = np->n_vattr.va_size = vap->va_size;
+			np->n_vattr.va_size = vap->va_size;
   		};
   	} else if ((vap->va_mtime.tv_sec != VNOVAL ||
 		vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
@@ -1119,10 +1129,12 @@
 		m_freem(mrep);
 		tsiz -= retlen;
 		if (v3) {
-			if (eof || retlen == 0)
+			if (eof || retlen == 0) {
 				tsiz = 0;
-		} else if (retlen < len)
+			}
+		} else if (retlen < len) {
 			tsiz = 0;
+		}
 	}
 nfsmout:
 	return (error);
@@ -3397,3 +3409,4 @@
 	}
 	return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap));
 }
+
Index: ufs/ffs/ffs_inode.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_inode.c,v
retrieving revision 1.56.2.3
diff -u -r1.56.2.3 ffs_inode.c
--- ufs/ffs/ffs_inode.c	2001/11/20 20:27:27	1.56.2.3
+++ ufs/ffs/ffs_inode.c	2001/12/12 23:43:36
@@ -244,6 +244,10 @@
 		if (error) {
 			return (error);
 		}
+		if (length > 0 && DOINGSOFTDEP(ovp)) {
+			if ((error = VOP_FSYNC(ovp, cred, MNT_WAIT, p)) != 0)
+				return (error);
+		}
 		oip->i_size = length;
 		size = blksize(fs, oip, lbn);
 		if (ovp->v_type != VDIR)
@@ -359,6 +363,10 @@
 		if (newspace == 0)
 			panic("ffs_truncate: newspace");
 		if (oldspace - newspace > 0) {
+			/*
+			 * XXX Softupdates, adjust queued directblock
+			 *     in place rather then the second FSYNC above?
+			 */
 			/*
 			 * Block number of space to be free'd is
 			 * the old block # plus the number of frags
Index: vm/vm_page.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.c,v
retrieving revision 1.147.2.12
diff -u -r1.147.2.12 vm_page.c
--- vm/vm_page.c	2001/11/03 19:59:28	1.147.2.12
+++ vm/vm_page.c	2001/12/13 10:42:52
@@ -1625,10 +1625,27 @@
 	 * use this opportunity to clear the PG_NOSYNC flag.  If a process
 	 * takes a write fault on a MAP_NOSYNC memory area the flag will
 	 * be set again.
+	 *
+	 * We set valid bits inclusive of any overlap, but we can only
+	 * clear dirty bits for DEV_BSIZE chunks that are fully within
+	 * the range.
 	 */
 
 	pagebits = vm_page_bits(base, size);
 	m->valid |= pagebits;
+#if 1
+	if ((base & (DEV_BSIZE - 1)) || (size & (DEV_BSIZE - 1))) {
+	    int adj;
+
+	    adj = DEV_BSIZE - (base & (DEV_BSIZE - 1));
+	    base += adj;
+	    if (size < adj)
+		size = 0;
+	    else
+		size = (size - adj) & ~(DEV_BSIZE - 1);
+	    pagebits = vm_page_bits(base, size);
+	}
+#endif
 	m->dirty &= ~pagebits;
 	if (base == 0 && size == PAGE_SIZE) {
 		pmap_clear_modify(m);
Index: vm/vnode_pager.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vnode_pager.c,v
retrieving revision 1.116.2.5
diff -u -r1.116.2.5 vnode_pager.c
--- vm/vnode_pager.c	2001/11/09 03:21:22	1.116.2.5
+++ vm/vnode_pager.c	2001/12/13 10:39:21
@@ -290,14 +290,18 @@
 		}
 		/*
 		 * this gets rid of garbage at the end of a page that is now
-		 * only partially backed by the vnode...
+		 * only partially backed by the vnode.
+		 *
+		 * XXX for some reason (I don't know yet), if we take a
+		 * completely invalid page and mark it partially valid
+		 * it can screw up NFS reads, so we don't allow the case.
 		 */
 		if (nsize & PAGE_MASK) {
 			vm_offset_t kva;
 			vm_page_t m;
 
 			m = vm_page_lookup(object, OFF_TO_IDX(nsize));
-			if (m) {
+			if (m && m->valid) {
 				int base = (int)nsize & PAGE_MASK;
 				int size = PAGE_SIZE - base;
 
@@ -310,6 +314,20 @@
 				vm_pager_unmap_page(kva);
 
 				/*
+				 * XXX work around SMP data integrity race
+				 * by unmapping the page from user processes.
+				 * The garbage we just cleared may be mapped
+				 * to a user process running on another cpu
+				 * and this code is not running through normal
+				 * I/O channels which handle SMP issues for
+				 * us, so unmap page to synchronize all cpus.
+				 *
+				 * XXX should vm_pager_unmap_page() have
+				 * dealt with this?
+				 */
+				vm_page_protect(m, VM_PROT_NONE);
+
+				/*
 				 * Clear out partial-page dirty bits.  This
 				 * has the side effect of setting the valid
 				 * bits, but that is ok.  There are a bunch
@@ -317,6 +335,10 @@
 				 * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
 				 * case is one of them.  If the page is still
 				 * partially dirty, make it fully dirty.
+				 *
+				 * note that we do not clear out the valid
+				 * bits.  This would prevent bogus_page
+				 * replacement from working properly.
 				 */
 				vm_page_set_validclean(m, base, size);
 				if (m->dirty != 0)
@@ -960,6 +982,9 @@
 	 * may not properly clear the dirty bits for the entire page (which
 	 * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
 	 * With the page locked we are free to fix-up the dirty bits here.
+	 *
+	 * We do not under any circumstances truncate the valid bits, as
+	 * this will screw up bogus page replacement.
 	 */
 	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
 		if (object->un_pager.vnp.vnp_size > poffset) {

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-hackers" in the body of the message

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200112131835.fBDIZuL70031>

Header And Logo

Peripheral Links

Site Navigation

Header And Logo

Peripheral Links

Search

Site Navigation