Skip site navigation (1) Skip section navigation (2)
Date:      Wed, 20 Sep 2017 06:34:48 +0000 (UTC)
From:      Andriy Gapon <avg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r323789 - vendor-sys/illumos/dist/uts/common/fs/zfs
Message-ID:  <201709200634.v8K6Ym39087165@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: avg
Date: Wed Sep 20 06:34:48 2017
New Revision: 323789
URL: https://svnweb.freebsd.org/changeset/base/323789

Log:
  8473 scrub does not detect errors on active spares
  
  illumos/illumos-gate@554675eee75dd2d7398d960aa5c81083ceb8505a
  https://github.com/illumos/illumos-gate/commit/554675eee75dd2d7398d960aa5c81083ceb8505a
  
  https://www.illumos.org/issues/8473
    Scrubbing is supposed to detect and repair all errors in the pool. However, it
    wrongly ignores active spare devices. The problem can easily be reproduced in
    OpenZFS at git rev 0ef125d with these commands:
  
    truncate -s 64m /tmp/a /tmp/b /tmp/c
    sudo zpool create testpool mirror /tmp/a /tmp/b spare /tmp/c
    sudo zpool replace testpool /tmp/a /tmp/c
    /bin/dd if=/dev/zero bs=1024k count=63 oseek=1 conv=notrunc of=/tmp/c
    sync
    sudo zpool scrub testpool
    zpool status testpool # Will show 0 errors, which is wrong
    sudo zpool offline testpool /tmp/a
    sudo zpool scrub testpool
    zpool status testpool # Will show errors on /tmp/c, which should've already been fixed
  
    FreeBSD head is partially affected: the first scrub will detect some errors,
    but the second scrub will detect more.
  
  Reviewed by: Andy Stormont <astormont@racktopsystems.com>
  Reviewed by: Matt Ahrens <mahrens@delphix.com>
  Reviewed by: George Wilson <george.wilson@delphix.com>
  Approved by: Richard Lowe <richlowe@richlowe.net>
  Author: Alan Somers <asomers@gmail.com>

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c	Wed Sep 20 06:29:11 2017	(r323788)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c	Wed Sep 20 06:34:48 2017	(r323789)
@@ -29,6 +29,9 @@
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/abd.h>
@@ -49,7 +52,7 @@ typedef struct mirror_child {
 
 typedef struct mirror_map {
 	int		mm_children;
-	int		mm_replacing;
+	int		mm_resilvering;
 	int		mm_preferred;
 	int		mm_root;
 	mirror_child_t	mm_child[1];
@@ -86,7 +89,7 @@ vdev_mirror_map_alloc(zio_t *zio)
 
 		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
 		mm->mm_children = c;
-		mm->mm_replacing = B_FALSE;
+		mm->mm_resilvering = B_FALSE;
 		mm->mm_preferred = spa_get_random(c);
 		mm->mm_root = B_TRUE;
 
@@ -109,13 +112,51 @@ vdev_mirror_map_alloc(zio_t *zio)
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 		}
 	} else {
+		int replacing;
+
 		c = vd->vdev_children;
 
 		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
 		mm->mm_children = c;
-		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+		/*
+		 * If we are resilvering, then we should handle scrub reads
+		 * differently; we shouldn't issue them to the resilvering
+		 * device because it might not have those blocks.
+		 *
+		 * We are resilvering iff:
+		 * 1) We are a replacing vdev (ie our name is "replacing-1" or
+		 *    "spare-1" or something like that), and
+		 * 2) The pool is currently being resilvered.
+		 *
+		 * We cannot simply check vd->vdev_resilver_txg, because it's
+		 * not set in this path.
+		 *
+		 * Nor can we just check our vdev_ops; there are cases (such as
+		 * when a user types "zpool replace pool odev spare_dev" and
+		 * spare_dev is in the spare list, or when a spare device is
+		 * automatically used to replace a DEGRADED device) when
+		 * resilvering is complete but both the original vdev and the
+		 * spare vdev remain in the pool.  That behavior is intentional.
+		 * It helps implement the policy that a spare should be
+		 * automatically removed from the pool after the user replaces
+		 * the device that originally failed.
+		 */
+		replacing = (vd->vdev_ops == &vdev_replacing_ops ||
 		    vd->vdev_ops == &vdev_spare_ops);
-		mm->mm_preferred = mm->mm_replacing ? 0 :
+		/*
+		 * If a spa load is in progress, then spa_dsl_pool may be
+		 * uninitialized.  But we shouldn't be resilvering during a spa
+		 * load anyway.
+		 */
+		if (replacing &&
+		    (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) &&
+		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) {
+			mm->mm_resilvering = B_TRUE;
+		} else {
+			mm->mm_resilvering = B_FALSE;
+		}
+
+		mm->mm_preferred = mm->mm_resilvering ? 0 :
 		    (zio->io_offset >> vdev_mirror_shift) % c;
 		mm->mm_root = B_FALSE;
 
@@ -271,7 +312,7 @@ vdev_mirror_io_start(zio_t *zio)
 	mm = vdev_mirror_map_alloc(zio);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
-		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
+		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
 			/*
 			 * For scrubbing reads we need to allocate a read
 			 * buffer for each child and issue reads to all
@@ -408,7 +449,7 @@ vdev_mirror_io_done(zio_t *zio)
 	if (good_copies && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
-	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
+	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201709200634.v8K6Ym39087165>