Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 16 Aug 2016 21:02:30 +0000 (UTC)
From:      Kirk McKusick <mckusick@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r304239 - head/sys/ufs/ffs
Message-ID:  <201608162102.u7GL2Uix072272@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mckusick
Date: Tue Aug 16 21:02:30 2016
New Revision: 304239
URL: https://svnweb.freebsd.org/changeset/base/304239

Log:
  Bug 211013 reports that a write error to a UFS filesystem running
  with softupdates panics the kernel. The problem that has been pointed
  out is that when there is a transient write error on certain metadata
  blocks, specifically directory blocks (PAGEDEP), inode blocks
  (INODEDEP), indirect pointer blocks (INDIRDEPS), and cylinder group
  (BMSAFEMAP, but only when journaling is enabled), we get a panic
  in one of the routines called by softdep_disk_io_initiation that
  the I/O is "already started" when we retry the write.
  
  These dependency types potentially need to do roll-backs when called
  by softdep_disk_io_initiation before doing a write and then a
  roll-forward when called by softdep_disk_write_complete after the
  I/O completes.  The panic happens when there is a transient error.
  At the top of softdep_disk_write_complete we check to see if the
  write had an error and if an error occurred we just return.  This
  return is correct most of the time because the main role of the routines
  called by softdep_disk_write_complete is to process the now-completed
  dependencies so that the next I/O steps can happen.
  
  But for the four types listed above, they do not get to do their
  rollback operations. This causes the panic when softdep_disk_io_initiation
  gets called on the second attempt to do the write and the roll-back
  routines find that the roll-backs have already been done. As an
  aside I note that there is also the problem that the buffer will
  have been unlocked and thus made visible to the filesystem and to
  user applications with the roll-backs in place.
  
  The way to resolve the problem is to add a flag to the routines called
  by softdep_disk_write_complete for the four dependency types noted
  that indicates whether the write was successful (WRITESUCCEEDED).
  If the write does not succeed, they do just the roll-backs and then
  return. If the write was successful they also do their usual
  processing of the now-completed dependencies.
  
  The fix was tested by selectively injecting write errors for buffers
  holding dependencies of each of the four types noted above and then
  verifying that the kernel no longer paniced and that following the
  successful retry of the write that the filesystem could be unmounted
  and successfully checked cleanly.
  
  PR: 211013
  Reviewed by: kib

Modified:
  head/sys/ufs/ffs/ffs_softdep.c
  head/sys/ufs/ffs/softdep.h

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c	Tue Aug 16 20:35:36 2016	(r304238)
+++ head/sys/ufs/ffs/ffs_softdep.c	Tue Aug 16 21:02:30 2016	(r304239)
@@ -752,16 +752,16 @@ static	int flush_newblk_dep(struct vnode
 static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
 static	int flush_deplist(struct allocdirectlst *, int, int *);
 static	int sync_cgs(struct mount *, int);
-static	int handle_written_filepage(struct pagedep *, struct buf *);
+static	int handle_written_filepage(struct pagedep *, struct buf *, int);
 static	int handle_written_sbdep(struct sbdep *, struct buf *);
 static	void initiate_write_sbdep(struct sbdep *);
 static	void diradd_inode_written(struct diradd *, struct inodedep *);
 static	int handle_written_indirdep(struct indirdep *, struct buf *,
-	    struct buf**);
-static	int handle_written_inodeblock(struct inodedep *, struct buf *);
+	    struct buf**, int);
+static	int handle_written_inodeblock(struct inodedep *, struct buf *, int);
 static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
 	    uint8_t *);
-static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
+static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
 static	void handle_written_jaddref(struct jaddref *);
 static	void handle_written_jremref(struct jremref *);
 static	void handle_written_jseg(struct jseg *, struct buf *);
@@ -10874,6 +10874,10 @@ initiate_write_bmsafemap(bmsafemap, bp)
 	struct fs *fs;
 	ino_t ino;
 
+	/*
+	 * If this is a background write, we did this at the time that
+	 * the copy was made, so do not need to do it again.
+	 */
 	if (bmsafemap->sm_state & IOSTARTED)
 		return;
 	bmsafemap->sm_state |= IOSTARTED;
@@ -10947,10 +10951,39 @@ softdep_disk_write_complete(bp)
 
 	/*
 	 * If an error occurred while doing the write, then the data
-	 * has not hit the disk and the dependencies cannot be unrolled.
+	 * has not hit the disk and the dependencies cannot be processed.
+	 * But we do have to go through and roll forward any dependencies
+	 * that were rolled back before the disk write.
 	 */
-	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
+	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
+		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
+			switch (wk->wk_type) {
+
+			case D_PAGEDEP:
+				handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
+				continue;
+
+			case D_INODEDEP:
+				handle_written_inodeblock(WK_INODEDEP(wk),
+				    bp, 0);
+				continue;
+
+			case D_BMSAFEMAP:
+				handle_written_bmsafemap(WK_BMSAFEMAP(wk),
+				    bp, 0);
+				continue;
+
+			case D_INDIRDEP:
+				handle_written_indirdep(WK_INDIRDEP(wk),
+				    bp, &sbp, 0);
+				continue;
+			default:
+				/* nothing to roll forward */
+				continue;
+			}
+		}
 		return;
+	}
 	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
 		return;
 	ump = VFSTOUFS(wk->wk_mp);
@@ -10970,17 +11003,20 @@ softdep_disk_write_complete(bp)
 		switch (wk->wk_type) {
 
 		case D_PAGEDEP:
-			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
+			if (handle_written_filepage(WK_PAGEDEP(wk), bp,
+			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_INODEDEP:
-			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
+			if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
+			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_BMSAFEMAP:
-			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
+			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
+			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
@@ -10999,7 +11035,8 @@ softdep_disk_write_complete(bp)
 			continue;
 
 		case D_INDIRDEP:
-			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
+			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
+			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
@@ -11299,12 +11336,17 @@ handle_bufwait(inodedep, refhd)
  * Called from within softdep_disk_write_complete above to restore
  * in-memory inode block contents to their most up-to-date state. Note
  * that this routine is always called from interrupt level with further
- * splbio interrupts blocked.
+ * interrupts from this device blocked.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
  */
 static int 
-handle_written_inodeblock(inodedep, bp)
+handle_written_inodeblock(inodedep, bp, flags)
 	struct inodedep *inodedep;
 	struct buf *bp;		/* buffer containing the inode block */
+	int flags;
 {
 	struct freefile *freefile;
 	struct allocdirect *adp, *nextadp;
@@ -11334,7 +11376,8 @@ handle_written_inodeblock(inodedep, bp)
 	/*
 	 * Leave this inodeblock dirty until it's in the list.
 	 */
-	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
+	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
+	    (flags & WRITESUCCEEDED)) {
 		struct inodedep *inon;
 
 		inon = TAILQ_NEXT(inodedep, id_unlinked);
@@ -11373,7 +11416,8 @@ handle_written_inodeblock(inodedep, bp)
 			goto bufwait;
 		return (1);
 	}
-	inodedep->id_state |= COMPLETE;
+	if (flags & WRITESUCCEEDED)
+		inodedep->id_state |= COMPLETE;
 	/*
 	 * Roll forward anything that had to be rolled back before 
 	 * the inode could be updated.
@@ -11488,6 +11532,13 @@ handle_written_inodeblock(inodedep, bp)
 		bdirty(bp);
 bufwait:
 	/*
+	 * If the write did not succeed, we have done all the roll-forward
+	 * operations, but we cannot take the actions that will allow its
+	 * dependencies to be processed.
+	 */
+	if ((flags & WRITESUCCEEDED) == 0)
+		return (hadchanges);
+	/*
 	 * Process any allocdirects that completed during the update.
 	 */
 	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
@@ -11544,11 +11595,20 @@ bufwait:
 	return (hadchanges);
 }
 
+/*
+ * Perform needed roll-forwards and kick off any dependencies that
+ * can now be processed.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
+ */
 static int
-handle_written_indirdep(indirdep, bp, bpp)
+handle_written_indirdep(indirdep, bp, bpp, flags)
 	struct indirdep *indirdep;
 	struct buf *bp;
 	struct buf **bpp;
+	int flags;
 {
 	struct allocindir *aip;
 	struct buf *sbp;
@@ -11573,6 +11633,16 @@ handle_written_indirdep(indirdep, bp, bp
 	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
 	indirdep->ir_state |= ATTACHED;
 	/*
+	 * If the write did not succeed, we have done all the roll-forward
+	 * operations, but we cannot take the actions that will allow its
+	 * dependencies to be processed.
+	 */
+	if ((flags & WRITESUCCEEDED) == 0) {
+		stat_indir_blk_ptrs++;
+		bdirty(bp);
+		return (1);
+	}
+	/*
 	 * Move allocindirs with written pointers to the completehd if
 	 * the indirdep's pointer is not yet written.  Otherwise
 	 * free them here.
@@ -11726,11 +11796,16 @@ jnewblk_rollforward(jnewblk, fs, cgp, bl
  * Complete a write to a bmsafemap structure.  Roll forward any bitmap
  * changes if it's not a background write.  Set all written dependencies 
  * to DEPCOMPLETE and free the structure if possible.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
  */
 static int
-handle_written_bmsafemap(bmsafemap, bp)
+handle_written_bmsafemap(bmsafemap, bp, flags)
 	struct bmsafemap *bmsafemap;
 	struct buf *bp;
+	int flags;
 {
 	struct newblk *newblk;
 	struct inodedep *inodedep;
@@ -11746,15 +11821,20 @@ handle_written_bmsafemap(bmsafemap, bp)
 	int chgs;
 
 	if ((bmsafemap->sm_state & IOSTARTED) == 0)
-		panic("initiate_write_bmsafemap: Not started\n");
+		panic("handle_written_bmsafemap: Not started\n");
 	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
 	chgs = 0;
 	bmsafemap->sm_state &= ~IOSTARTED;
 	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
 	/*
-	 * Release journal work that was waiting on the write.
+	 * If write was successful, release journal work that was waiting
+	 * on the write. Otherwise move the work back.
 	 */
-	handle_jwork(&bmsafemap->sm_freewr);
+	if (flags & WRITESUCCEEDED)
+		handle_jwork(&bmsafemap->sm_freewr);
+	else
+		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
+		    worklist, wk_list);
 
 	/*
 	 * Restore unwritten inode allocation pending jaddref writes.
@@ -11804,6 +11884,20 @@ handle_written_bmsafemap(bmsafemap, bp)
 			free_jnewblk(jnewblk);
 		}
 	}
+	/*
+	 * If the write did not succeed, we have done all the roll-forward
+	 * operations, but we cannot take the actions that will allow its
+	 * dependencies to be processed.
+	 */
+	if ((flags & WRITESUCCEEDED) == 0) {
+		LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
+		    newblk, nb_deps);
+		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
+		    worklist, wk_list);
+		if (foreground)
+			bdirty(bp);
+		return (1);
+	}
 	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
 		newblk->nb_state |= DEPCOMPLETE;
 		newblk->nb_state &= ~ONDEPLIST;
@@ -11907,12 +12001,17 @@ free_pagedep(pagedep)
  * A write operation was just completed. Removed inodes can
  * now be freed and associated block pointers may be committed.
  * Note that this routine is always called from interrupt level
- * with further splbio interrupts blocked.
+ * with further interrupts from this device blocked.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
  */
 static int 
-handle_written_filepage(pagedep, bp)
+handle_written_filepage(pagedep, bp, flags)
 	struct pagedep *pagedep;
 	struct buf *bp;		/* buffer containing the written page */
+	int flags;
 {
 	struct dirrem *dirrem;
 	struct diradd *dap, *nextdap;
@@ -11922,6 +12021,8 @@ handle_written_filepage(pagedep, bp)
 	if ((pagedep->pd_state & IOSTARTED) == 0)
 		panic("handle_written_filepage: not started");
 	pagedep->pd_state &= ~IOSTARTED;
+	if ((flags & WRITESUCCEEDED) == 0)
+		goto rollforward;
 	/*
 	 * Process any directory removals that have been committed.
 	 */
@@ -11941,6 +12042,7 @@ handle_written_filepage(pagedep, bp)
 	if ((pagedep->pd_state & NEWBLOCK) == 0)
 		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 			free_diradd(dap, NULL);
+rollforward:
 	/*
 	 * Uncommitted directory entries must be restored.
 	 */
@@ -11973,7 +12075,7 @@ handle_written_filepage(pagedep, bp)
 	 * marked dirty so that its will eventually get written back in
 	 * its correct form.
 	 */
-	if (chgs) {
+	if (chgs || (flags & WRITESUCCEEDED) == 0) {
 		if ((bp->b_flags & B_DELWRI) == 0)
 			stat_dir_entry++;
 		bdirty(bp);

Modified: head/sys/ufs/ffs/softdep.h
==============================================================================
--- head/sys/ufs/ffs/softdep.h	Tue Aug 16 20:35:36 2016	(r304238)
+++ head/sys/ufs/ffs/softdep.h	Tue Aug 16 21:02:30 2016	(r304239)
@@ -140,6 +140,7 @@
 #define	UNLINKPREV	0x100000 /* inodedep is pointed at in the unlink list */
 #define	UNLINKONLIST	0x200000 /* inodedep is in the unlinked list on disk */
 #define	UNLINKLINKS	(UNLINKNEXT | UNLINKPREV)
+#define	WRITESUCCEEDED	0x400000 /* the disk write completed successfully */
 
 #define	ALLCOMPLETE	(ATTACHED | COMPLETE | DEPCOMPLETE)
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201608162102.u7GL2Uix072272>