Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 28 Mar 2018 18:12:07 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r331695 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumo...
Message-ID:  <201803281812.w2SIC7Ti011304@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Wed Mar 28 18:12:06 2018
New Revision: 331695
URL: https://svnweb.freebsd.org/changeset/base/331695

Log:
  9166 zfs storage pool checkpoint
  
  illumos/illumos-gate@8671400134a11c848244896ca51a7db4d0f69da4
  
  The idea of Storage Pool Checkpoint (aka zpool checkpoint) deals with
  exactly that.  It can be thought of as a “pool-wide snapshot” (or a
  variation of extreme rewind that doesn’t corrupt your data).  It remembers
  the entire state of the pool at the point that it was taken and the user
  can revert back to it later or discard it.  Its generic use case is an
  administrator that is about to perform a set of destructive actions to ZFS
  as part of a critical procedure.  She takes a checkpoint of the pool before
  performing the actions, then rewinds back to it if one of them fails or puts
  the pool into an unexpected state.  Otherwise, she discards it.  With the
  assumption that no one else is making modifications to ZFS, she basically
  wraps all these actions into a “high-level transaction”.
  
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Reviewed by: John Kennedy <john.kennedy@delphix.com>
  Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
  Approved by: Richard Lowe <richlowe@richlowe.net>
  Author: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>

Modified:
  vendor/illumos/dist/cmd/zdb/zdb.c
  vendor/illumos/dist/cmd/zdb/zdb_il.c
  vendor/illumos/dist/cmd/zpool/zpool_main.c
  vendor/illumos/dist/cmd/ztest/ztest.c
  vendor/illumos/dist/lib/libzfs/common/libzfs.h
  vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c
  vendor/illumos/dist/lib/libzfs/common/libzfs_util.c
  vendor/illumos/dist/lib/libzfs_core/common/libzfs_core.c
  vendor/illumos/dist/lib/libzfs_core/common/libzfs_core.h
  vendor/illumos/dist/man/man1m/zdb.1m
  vendor/illumos/dist/man/man1m/zpool.1m
  vendor/illumos/dist/man/man5/zpool-features.5

Changes in other areas also in this revision:
Modified:
  vendor-sys/illumos/dist/common/zfs/zfeature_common.c
  vendor-sys/illumos/dist/common/zfs/zfeature_common.h
  vendor-sys/illumos/dist/common/zfs/zpool_prop.c
  vendor-sys/illumos/dist/uts/common/Makefile.files
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_destroy.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_synctask.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_userhold.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_dir.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_pool.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_synctask.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/range_tree.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/uberblock_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_removal.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zthr.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/uberblock.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_label.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zcp.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zcp_synctask.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zthr.c
  vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h

Modified: vendor/illumos/dist/cmd/zdb/zdb.c
==============================================================================
--- vendor/illumos/dist/cmd/zdb/zdb.c	Wed Mar 28 17:54:34 2018	(r331694)
+++ vendor/illumos/dist/cmd/zdb/zdb.c	Wed Mar 28 18:12:06 2018	(r331695)
@@ -131,7 +131,7 @@ static void
 usage(void)
 {
 	(void) fprintf(stderr,
-	    "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p <path> ...]] "
+	    "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
 	    "[-I <inflight I/Os>]\n"
 	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
 	    "\t\t[<poolname> [<object> ...]]\n"
@@ -168,6 +168,8 @@ usage(void)
 	(void) fprintf(stderr, "        -h pool history\n");
 	(void) fprintf(stderr, "        -i intent logs\n");
 	(void) fprintf(stderr, "        -l read label contents\n");
+	(void) fprintf(stderr, "        -k examine the checkpointed state "
+	    "of the pool\n");
 	(void) fprintf(stderr, "        -L disable leak tracking (do not "
 	    "load spacemaps)\n");
 	(void) fprintf(stderr, "        -m metaslabs\n");
@@ -729,6 +731,22 @@ get_prev_obsolete_spacemap_refcount(spa_t *spa)
 }
 
 static int
+get_checkpoint_refcount(vdev_t *vd)
+{
+	int refcount = 0;
+
+	if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
+	    zap_contains(spa_meta_objset(vd->vdev_spa),
+	    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
+		refcount++;
+
+	for (uint64_t c = 0; c < vd->vdev_children; c++)
+		refcount += get_checkpoint_refcount(vd->vdev_child[c]);
+
+	return (refcount);
+}
+
+static int
 verify_spacemap_refcounts(spa_t *spa)
 {
 	uint64_t expected_refcount = 0;
@@ -741,6 +759,7 @@ verify_spacemap_refcounts(spa_t *spa)
 	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
 	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
 	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
+	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
 
 	if (expected_refcount != actual_refcount) {
 		(void) printf("space map refcount mismatch: expected %lld != "
@@ -814,8 +833,8 @@ static void
 dump_metaslab_stats(metaslab_t *msp)
 {
 	char maxbuf[32];
-	range_tree_t *rt = msp->ms_tree;
-	avl_tree_t *t = &msp->ms_size_tree;
+	range_tree_t *rt = msp->ms_allocatable;
+	avl_tree_t *t = &msp->ms_allocatable_by_size;
 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
 	/* max sure nicenum has enough space */
@@ -851,7 +870,7 @@ dump_metaslab(metaslab_t *msp)
 		metaslab_load_wait(msp);
 		if (!msp->ms_loaded) {
 			VERIFY0(metaslab_load(msp));
-			range_tree_stat_verify(msp->ms_tree);
+			range_tree_stat_verify(msp->ms_allocatable);
 		}
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
@@ -2264,6 +2283,8 @@ dump_uberblock(uberblock_t *ub, const char *header, co
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
 		(void) printf("\trootbp = %s\n", blkbuf);
 	}
+	(void) printf("\tcheckpoint_txg = %llu\n",
+	    (u_longlong_t)ub->ub_checkpoint_txg);
 	(void) printf("%s", footer ? footer : "");
 }
 
@@ -2624,6 +2645,7 @@ static const char *zdb_ot_extname[] = {
 typedef struct zdb_cb {
 	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
 	uint64_t	zcb_removing_size;
+	uint64_t	zcb_checkpoint_size;
 	uint64_t	zcb_dedup_asize;
 	uint64_t	zcb_dedup_blocks;
 	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
@@ -2723,7 +2745,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const b
 	}
 
 	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
-	    refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
+	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
 	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
 }
 
@@ -2923,7 +2945,7 @@ claim_segment_impl_cb(uint64_t inner_offset, vdev_t *v
 	ASSERT(vdev_is_concrete(vd));
 
 	VERIFY0(metaslab_claim_impl(vd, offset, size,
-	    spa_first_txg(vd->vdev_spa)));
+	    spa_min_claim_txg(vd->vdev_spa)));
 }
 
 static void
@@ -2984,70 +3006,6 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
-/*
- * vm_idxp is an in-out parameter which (for indirect vdevs) is the
- * index in vim_entries that has the first entry in this metaslab.  On
- * return, it will be set to the first entry after this metaslab.
- */
-static void
-zdb_leak_init_ms(metaslab_t *msp, uint64_t *vim_idxp)
-{
-	metaslab_group_t *mg = msp->ms_group;
-	vdev_t *vd = mg->mg_vd;
-	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
-
-	mutex_enter(&msp->ms_lock);
-	metaslab_unload(msp);
-
-	/*
-	 * We don't want to spend the CPU manipulating the size-ordered
-	 * tree, so clear the range_tree ops.
-	 */
-	msp->ms_tree->rt_ops = NULL;
-
-	(void) fprintf(stderr,
-	    "\rloading vdev %llu of %llu, metaslab %llu of %llu ...",
-	    (longlong_t)vd->vdev_id,
-	    (longlong_t)rvd->vdev_children,
-	    (longlong_t)msp->ms_id,
-	    (longlong_t)vd->vdev_ms_count);
-
-	/*
-	 * For leak detection, we overload the metaslab ms_tree to
-	 * contain allocated segments instead of free segments. As a
-	 * result, we can't use the normal metaslab_load/unload
-	 * interfaces.
-	 */
-	if (vd->vdev_ops == &vdev_indirect_ops) {
-		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-		for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
-		    (*vim_idxp)++) {
-			vdev_indirect_mapping_entry_phys_t *vimep =
-			    &vim->vim_entries[*vim_idxp];
-			uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
-			uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
-			ASSERT3U(ent_offset, >=, msp->ms_start);
-			if (ent_offset >= msp->ms_start + msp->ms_size)
-				break;
-
-			/*
-			 * Mappings do not cross metaslab boundaries,
-			 * because we create them by walking the metaslabs.
-			 */
-			ASSERT3U(ent_offset + ent_len, <=,
-			    msp->ms_start + msp->ms_size);
-			range_tree_add(msp->ms_tree, ent_offset, ent_len);
-		}
-	} else if (msp->ms_sm != NULL) {
-		VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC));
-	}
-
-	if (!msp->ms_loaded) {
-		msp->ms_loaded = B_TRUE;
-	}
-	mutex_exit(&msp->ms_lock);
-}
-
 /* ARGSUSED */
 static int
 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
@@ -3104,7 +3062,243 @@ zdb_load_obsolete_counts(vdev_t *vd)
 	return (counts);
 }
 
+typedef struct checkpoint_sm_exclude_entry_arg {
+	vdev_t *cseea_vd;
+	uint64_t cseea_checkpoint_size;
+} checkpoint_sm_exclude_entry_arg_t;
+
+static int
+checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
+    void *arg)
+{
+	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
+	vdev_t *vd = cseea->cseea_vd;
+	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+	uint64_t end = offset + size;
+
+	ASSERT(type == SM_FREE);
+
+	/*
+	 * Since the vdev_checkpoint_sm exists in the vdev level
+	 * and the ms_sm space maps exist in the metaslab level,
+	 * an entry in the checkpoint space map could theoretically
+	 * cross the boundaries of the metaslab that it belongs.
+	 *
+	 * In reality, because of the way that we populate and
+	 * manipulate the checkpoint's space maps currently,
+	 * there shouldn't be any entries that cross metaslabs.
+	 * Hence the assertion below.
+	 *
+	 * That said, there is no fundamental requirement that
+	 * the checkpoint's space map entries should not cross
+	 * metaslab boundaries. So if needed we could add code
+	 * that handles metaslab-crossing segments in the future.
+	 */
+	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+	/*
+	 * By removing the entry from the allocated segments we
+	 * also verify that the entry is there to begin with.
+	 */
+	mutex_enter(&ms->ms_lock);
+	range_tree_remove(ms->ms_allocatable, offset, size);
+	mutex_exit(&ms->ms_lock);
+
+	cseea->cseea_checkpoint_size += size;
+	return (0);
+}
+
 static void
+zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
+{
+	spa_t *spa = vd->vdev_spa;
+	space_map_t *checkpoint_sm = NULL;
+	uint64_t checkpoint_sm_obj;
+
+	/*
+	 * If there is no vdev_top_zap, we are in a pool whose
+	 * version predates the pool checkpoint feature.
+	 */
+	if (vd->vdev_top_zap == 0)
+		return;
+
+	/*
+	 * If there is no reference of the vdev_checkpoint_sm in
+	 * the vdev_top_zap, then one of the following scenarios
+	 * is true:
+	 *
+	 * 1] There is no checkpoint
+	 * 2] There is a checkpoint, but no checkpointed blocks
+	 *    have been freed yet
+	 * 3] The current vdev is indirect
+	 *
+	 * In these cases we return immediately.
+	 */
+	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+		return;
+
+	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
+	    &checkpoint_sm_obj));
+
+	checkpoint_sm_exclude_entry_arg_t cseea;
+	cseea.cseea_vd = vd;
+	cseea.cseea_checkpoint_size = 0;
+
+	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+	space_map_update(checkpoint_sm);
+
+	VERIFY0(space_map_iterate(checkpoint_sm,
+	    checkpoint_sm_exclude_entry_cb, &cseea));
+	space_map_close(checkpoint_sm);
+
+	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
+}
+
+static void
+zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+		ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
+		zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
+	}
+}
+
+static void
+load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+
+		ASSERT3U(i, ==, vd->vdev_id);
+
+		if (vd->vdev_ops == &vdev_indirect_ops)
+			continue;
+
+		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+			metaslab_t *msp = vd->vdev_ms[m];
+
+			(void) fprintf(stderr,
+			    "\rloading concrete vdev %llu, "
+			    "metaslab %llu of %llu ...",
+			    (longlong_t)vd->vdev_id,
+			    (longlong_t)msp->ms_id,
+			    (longlong_t)vd->vdev_ms_count);
+
+			mutex_enter(&msp->ms_lock);
+			metaslab_unload(msp);
+
+			/*
+			 * We don't want to spend the CPU manipulating the
+			 * size-ordered tree, so clear the range_tree ops.
+			 */
+			msp->ms_allocatable->rt_ops = NULL;
+
+			if (msp->ms_sm != NULL) {
+				VERIFY0(space_map_load(msp->ms_sm,
+				    msp->ms_allocatable, maptype));
+			}
+			if (!msp->ms_loaded)
+				msp->ms_loaded = B_TRUE;
+			mutex_exit(&msp->ms_lock);
+		}
+	}
+}
+
+/*
+ * vm_idxp is an in-out parameter which (for indirect vdevs) is the
+ * index in vim_entries that has the first entry in this metaslab.
+ * On return, it will be set to the first entry after this metaslab.
+ */
+static void
+load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
+    uint64_t *vim_idxp)
+{
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+	mutex_enter(&msp->ms_lock);
+	metaslab_unload(msp);
+
+	/*
+	 * We don't want to spend the CPU manipulating the
+	 * size-ordered tree, so clear the range_tree ops.
+	 */
+	msp->ms_allocatable->rt_ops = NULL;
+
+	for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
+	    (*vim_idxp)++) {
+		vdev_indirect_mapping_entry_phys_t *vimep =
+		    &vim->vim_entries[*vim_idxp];
+		uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+		uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
+		ASSERT3U(ent_offset, >=, msp->ms_start);
+		if (ent_offset >= msp->ms_start + msp->ms_size)
+			break;
+
+		/*
+		 * Mappings do not cross metaslab boundaries,
+		 * because we create them by walking the metaslabs.
+		 */
+		ASSERT3U(ent_offset + ent_len, <=,
+		    msp->ms_start + msp->ms_size);
+		range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
+	}
+
+	if (!msp->ms_loaded)
+		msp->ms_loaded = B_TRUE;
+	mutex_exit(&msp->ms_lock);
+}
+
+static void
+zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *vd = rvd->vdev_child[c];
+
+		ASSERT3U(c, ==, vd->vdev_id);
+
+		if (vd->vdev_ops != &vdev_indirect_ops)
+			continue;
+
+		/*
+		 * Note: we don't check for mapping leaks on
+		 * removing vdevs because their ms_allocatable's
+		 * are used to look for leaks in allocated space.
+		 */
+		zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
+
+		/*
+		 * Normally, indirect vdevs don't have any
+		 * metaslabs.  We want to set them up for
+		 * zio_claim().
+		 */
+		VERIFY0(vdev_metaslab_init(vd, 0));
+
+		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+		uint64_t vim_idx = 0;
+		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+
+			(void) fprintf(stderr,
+			    "\rloading indirect vdev %llu, "
+			    "metaslab %llu of %llu ...",
+			    (longlong_t)vd->vdev_id,
+			    (longlong_t)vd->vdev_ms[m]->ms_id,
+			    (longlong_t)vd->vdev_ms_count);
+
+			load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
+			    &vim_idx);
+		}
+		ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
+	}
+}
+
+static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	zcb->zcb_spa = spa;
@@ -3115,7 +3309,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 
 		/*
 		 * We are going to be changing the meaning of the metaslab's
-		 * ms_tree.  Ensure that the allocator doesn't try to
+		 * ms_allocatable.  Ensure that the allocator doesn't try to
 		 * use the tree.
 		 */
 		spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
@@ -3125,39 +3319,37 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 		    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
 		    UMEM_NOFAIL);
 
+		/*
+		 * For leak detection, we overload the ms_allocatable trees
+		 * to contain allocated segments instead of free segments.
+		 * As a result, we can't use the normal metaslab_load/unload
+		 * interfaces.
+		 */
+		zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+		load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
 
-		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
-			vdev_t *vd = rvd->vdev_child[c];
-			uint64_t vim_idx = 0;
+		/*
+		 * On load_concrete_ms_allocatable_trees() we loaded all the
+		 * allocated entries from the ms_sm to the ms_allocatable for
+		 * each metaslab. If the pool has a checkpoint or is in the
+		 * middle of discarding a checkpoint, some of these blocks
+		 * may have been freed but their ms_sm may not have been
+		 * updated because they are referenced by the checkpoint. In
+		 * order to avoid false-positives during leak-detection, we
+		 * go through the vdev's checkpoint space map and exclude all
+		 * its entries from their relevant ms_allocatable.
+		 *
+		 * We also aggregate the space held by the checkpoint and add
+		 * it to zcb_checkpoint_size.
+		 *
+		 * Note that at this point we are also verifying that all the
+		 * entries on the checkpoint_sm are marked as allocated in
+		 * the ms_sm of their relevant metaslab.
+		 * [see comment in checkpoint_sm_exclude_entry_cb()]
+		 */
+		zdb_leak_init_exclude_checkpoint(spa, zcb);
 
-			ASSERT3U(c, ==, vd->vdev_id);
-
-			/*
-			 * Note: we don't check for mapping leaks on
-			 * removing vdevs because their ms_tree's are
-			 * used to look for leaks in allocated space.
-			 */
-			if (vd->vdev_ops == &vdev_indirect_ops) {
-				zcb->zcb_vd_obsolete_counts[c] =
-				    zdb_load_obsolete_counts(vd);
-
-				/*
-				 * Normally, indirect vdevs don't have any
-				 * metaslabs.  We want to set them up for
-				 * zio_claim().
-				 */
-				VERIFY0(vdev_metaslab_init(vd, 0));
-			}
-
-			for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
-				zdb_leak_init_ms(vd->vdev_ms[m], &vim_idx);
-			}
-			if (vd->vdev_ops == &vdev_indirect_ops) {
-				ASSERT3U(vim_idx, ==,
-				    vdev_indirect_mapping_num_entries(
-				    vd->vdev_indirect_mapping));
-			}
-		}
+		/* for cleaner progress output */
 		(void) fprintf(stderr, "\n");
 
 		if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
@@ -3166,12 +3358,16 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 			(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
 			    increment_indirect_mapping_cb, zcb, NULL);
 		}
+	} else {
+		/*
+		 * If leak tracing is disabled, we still need to consider
+		 * any checkpointed space in our space verification.
+		 */
+		zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
 	}
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
 	zdb_ddt_leak_init(spa, zcb);
-
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
@@ -3198,7 +3394,7 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb
 		for (uint64_t inner_offset = 0;
 		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
 		    inner_offset += 1 << vd->vdev_ashift) {
-			if (range_tree_contains(msp->ms_tree,
+			if (range_tree_contains(msp->ms_allocatable,
 			    offset + inner_offset, 1 << vd->vdev_ashift)) {
 				obsolete_bytes += 1 << vd->vdev_ashift;
 			}
@@ -3264,23 +3460,23 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
 				ASSERT3P(mg, ==, msp->ms_group);
 
 				/*
-				 * The ms_tree has been overloaded to
-				 * contain allocated segments. Now that we
-				 * finished traversing all blocks, any
-				 * block that remains in the ms_tree
+				 * ms_allocatable has been overloaded
+				 * to contain allocated segments. Now that
+				 * we finished traversing all blocks, any
+				 * block that remains in the ms_allocatable
 				 * represents an allocated block that we
 				 * did not claim during the traversal.
 				 * Claimed blocks would have been removed
-				 * from the ms_tree.  For indirect vdevs,
-				 * space remaining in the tree represents
-				 * parts of the mapping that are not
-				 * referenced, which is not a bug.
+				 * from the ms_allocatable.  For indirect
+				 * vdevs, space remaining in the tree
+				 * represents parts of the mapping that are
+				 * not referenced, which is not a bug.
 				 */
 				if (vd->vdev_ops == &vdev_indirect_ops) {
-					range_tree_vacate(msp->ms_tree,
+					range_tree_vacate(msp->ms_allocatable,
 					    NULL, NULL);
 				} else {
-					range_tree_vacate(msp->ms_tree,
+					range_tree_vacate(msp->ms_allocatable,
 					    zdb_leak, vd);
 				}
 
@@ -3403,7 +3599,7 @@ dump_block_stats(spa_t *spa)
 
 	total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
 	total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
-	    zcb.zcb_removing_size;
+	    zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
 
 	if (total_found == total_alloc) {
 		if (!dump_opt['L'])
@@ -3812,7 +4008,385 @@ verify_device_removal_feature_counts(spa_t *spa)
 	return (ret);
 }
 
+#define	BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
+/*
+ * Import the checkpointed state of the pool specified by the target
+ * parameter as readonly. The function also accepts a pool config
+ * as an optional parameter, else it attempts to infer the config by
+ * the name of the target pool.
+ *
+ * Note that the checkpointed state's pool name will be the name of
+ * the original pool with the above suffix appened to it. In addition,
+ * if the target is not a pool name (e.g. a path to a dataset) then
+ * the new_path parameter is populated with the updated path to
+ * reflect the fact that we are looking into the checkpointed state.
+ *
+ * The function returns a newly-allocated copy of the name of the
+ * pool containing the checkpointed state. When this copy is no
+ * longer needed it should be freed with free(3C). Same thing
+ * applies to the new_path parameter if allocated.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+	int error = 0;
+	char *poolname, *bogus_name;
+
+	/* If the target is not a pool, the extract the pool name */
+	char *path_start = strchr(target, '/');
+	if (path_start != NULL) {
+		size_t poolname_len = path_start - target;
+		poolname = strndup(target, poolname_len);
+	} else {
+		poolname = target;
+	}
+
+	if (cfg == NULL) {
+		error = spa_get_stats(poolname, &cfg, NULL, 0);
+		if (error != 0) {
+			fatal("Tried to read config of pool \"%s\" but "
+			    "spa_get_stats() failed with error %d\n",
+			    poolname, error);
+		}
+	}
+
+	(void) asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX);
+	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
+	error = spa_import(bogus_name, cfg, NULL,
+	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT);
+	if (error != 0) {
+		fatal("Tried to import pool \"%s\" but spa_import() failed "
+		    "with error %d\n", bogus_name, error);
+	}
+
+	if (new_path != NULL && path_start != NULL)
+		(void) asprintf(new_path, "%s%s", bogus_name, path_start);
+
+	if (target != poolname)
+		free(poolname);
+
+	return (bogus_name);
+}
+
+typedef struct verify_checkpoint_sm_entry_cb_arg {
+	vdev_t *vcsec_vd;
+
+	/* the following fields are only used for printing progress */
+	uint64_t vcsec_entryid;
+	uint64_t vcsec_num_entries;
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+#define	ENTRIES_PER_PROGRESS_UPDATE 10000
+
+static int
+verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
+    void *arg)
+{
+	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+	vdev_t *vd = vcsec->vcsec_vd;
+	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+	uint64_t end = offset + size;
+
+	ASSERT(type == SM_FREE);
+
+	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+		(void) fprintf(stderr,
+		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
+		    (longlong_t)vd->vdev_id,
+		    (longlong_t)vcsec->vcsec_entryid,
+		    (longlong_t)vcsec->vcsec_num_entries);
+	}
+	vcsec->vcsec_entryid++;
+
+	/*
+	 * See comment in checkpoint_sm_exclude_entry_cb()
+	 */
+	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+	/*
+	 * The entries in the vdev_checkpoint_sm should be marked as
+	 * allocated in the checkpointed state of the pool, therefore
+	 * their respective ms_allocateable trees should not contain them.
+	 */
+	mutex_enter(&ms->ms_lock);
+	range_tree_verify(ms->ms_allocatable, offset, size);
+	mutex_exit(&ms->ms_lock);
+
+	return (0);
+}
+
+/*
+ * Verify that all segments in the vdev_checkpoint_sm are allocated
+ * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
+ * ms_allocatable).
+ *
+ * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
+ * each vdev in the current state of the pool to the metaslab space maps
+ * (ms_sm) of the checkpointed state of the pool.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of the current spa_t. The entries of these ms_allocatable
+ * trees are cleared out and then repopulated from with the free
+ * entries of their respective ms_sm space maps.
+ */
 static void
+verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+	vdev_t *current_rvd = current->spa_root_vdev;
+
+	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
+
+	for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
+		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
+		vdev_t *current_vd = current_rvd->vdev_child[c];
+
+		space_map_t *checkpoint_sm = NULL;
+		uint64_t checkpoint_sm_obj;
+
+		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+			/*
+			 * Since we don't allow device removal in a pool
+			 * that has a checkpoint, we expect that all removed
+			 * vdevs were removed from the pool before the
+			 * checkpoint.
+			 */
+			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+			continue;
+		}
+
+		/*
+		 * If the checkpoint space map doesn't exist, then nothing
+		 * here is checkpointed so there's nothing to verify.
+		 */
+		if (current_vd->vdev_top_zap == 0 ||
+		    zap_contains(spa_meta_objset(current),
+		    current_vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+			continue;
+
+		VERIFY0(zap_lookup(spa_meta_objset(current),
+		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
+		    checkpoint_sm_obj, 0, current_vd->vdev_asize,
+		    current_vd->vdev_ashift));
+		space_map_update(checkpoint_sm);
+
+		verify_checkpoint_sm_entry_cb_arg_t vcsec;
+		vcsec.vcsec_vd = ckpoint_vd;
+		vcsec.vcsec_entryid = 0;
+		vcsec.vcsec_num_entries =
+		    space_map_length(checkpoint_sm) / sizeof (uint64_t);
+		VERIFY0(space_map_iterate(checkpoint_sm,
+		    verify_checkpoint_sm_entry_cb, &vcsec));
+		dump_spacemap(current->spa_meta_objset, checkpoint_sm);
+		space_map_close(checkpoint_sm);
+	}
+
+	/*
+	 * If we've added vdevs since we took the checkpoint, ensure
+	 * that their checkpoint space maps are empty.
+	 */
+	if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
+		for (uint64_t c = ckpoint_rvd->vdev_children;
+		    c < current_rvd->vdev_children; c++) {
+			vdev_t *current_vd = current_rvd->vdev_child[c];
+			ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
+		}
+	}
+
+	/* for cleaner progress output */
+	(void) fprintf(stderr, "\n");
+}
+
+/*
+ * Verifies that all space that's allocated in the checkpoint is
+ * still allocated in the current version, by checking that everything
+ * in checkpoint's ms_allocatable (which is actually allocated, not
+ * allocatable/free) is not present in current's ms_allocatable.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of both spas when called. The entries of all ms_allocatable
+ * trees are cleared out and then repopulated from their respective
+ * ms_sm space maps. In the checkpointed state we load the allocated
+ * entries, and in the current state we load the free entries.
+ */
+static void
+verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+	vdev_t *current_rvd = current->spa_root_vdev;
+
+	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
+	load_concrete_ms_allocatable_trees(current, SM_FREE);
+
+	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
+		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
+		vdev_t *current_vd = current_rvd->vdev_child[i];
+
+		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+			/*
+			 * See comment in verify_checkpoint_vdev_spacemaps()
+			 */
+			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+			continue;
+		}
+
+		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
+			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
+			metaslab_t *current_msp = current_vd->vdev_ms[m];
+
+			(void) fprintf(stderr,
+			    "\rverifying vdev %llu of %llu, "
+			    "metaslab %llu of %llu ...",
+			    (longlong_t)current_vd->vdev_id,
+			    (longlong_t)current_rvd->vdev_children,
+			    (longlong_t)current_vd->vdev_ms[m]->ms_id,
+			    (longlong_t)current_vd->vdev_ms_count);
+
+			/*
+			 * We walk through the ms_allocatable trees that
+			 * are loaded with the allocated blocks from the
+			 * ms_sm spacemaps of the checkpoint. For each
+			 * one of these ranges we ensure that none of them
+			 * exists in the ms_allocatable trees of the
+			 * current state which are loaded with the ranges
+			 * that are currently free.
+			 *
+			 * This way we ensure that none of the blocks that
+			 * are part of the checkpoint were freed by mistake.
+			 */
+			range_tree_walk(ckpoint_msp->ms_allocatable,
+			    (range_tree_func_t *)range_tree_verify,
+			    current_msp->ms_allocatable);
+		}
+	}
+
+	/* for cleaner progress output */
+	(void) fprintf(stderr, "\n");
+}
+
+static void
+verify_checkpoint_blocks(spa_t *spa)
+{
+	spa_t *checkpoint_spa;
+	char *checkpoint_pool;
+	nvlist_t *config = NULL;
+	int error = 0;
+
+	/*
+	 * We import the checkpointed state of the pool (under a different
+	 * name) so we can do verification on it against the current state
+	 * of the pool.
+	 */
+	checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
+	    NULL);
+	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
+
+	error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
+	if (error != 0) {
+		fatal("Tried to open pool \"%s\" but spa_open() failed with "
+		    "error %d\n", checkpoint_pool, error);
+	}
+
+	/*
+	 * Ensure that ranges in the checkpoint space maps of each vdev
+	 * are allocated according to the checkpointed state's metaslab
+	 * space maps.
+	 */
+	verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
+
+	/*
+	 * Ensure that allocated ranges in the checkpoint's metaslab
+	 * space maps remain allocated in the metaslab space maps of
+	 * the current state.
+	 */
+	verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
+
+	/*
+	 * Once we are done, we get rid of the checkpointed state.
+	 */
+	spa_close(checkpoint_spa, FTAG);
+	free(checkpoint_pool);
+}
+
+static void
+dump_leftover_checkpoint_blocks(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+
+		space_map_t *checkpoint_sm = NULL;
+		uint64_t checkpoint_sm_obj;
+
+		if (vd->vdev_top_zap == 0)
+			continue;
+
+		if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+			continue;
+
+		VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+		    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+		space_map_update(checkpoint_sm);
+		dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
+		space_map_close(checkpoint_sm);
+	}
+}
+
+static int
+verify_checkpoint(spa_t *spa)
+{
+	uberblock_t checkpoint;
+	int error;
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+		return (0);
+
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+	if (error == ENOENT) {
+		/*
+		 * If the feature is active but the uberblock is missing
+		 * then we must be in the middle of discarding the
+		 * checkpoint.
+		 */
+		(void) printf("\nPartially discarded checkpoint "
+		    "state found:\n");
+		dump_leftover_checkpoint_blocks(spa);
+		return (0);
+	} else if (error != 0) {
+		(void) printf("lookup error %d when looking for "
+		    "checkpointed uberblock in MOS\n", error);
+		return (error);
+	}
+	dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
+
+	if (checkpoint.ub_checkpoint_txg == 0) {
+		(void) printf("\nub_checkpoint_txg not set in checkpointed "
+		    "uberblock\n");
+		error = 3;
+	}
+
+	if (error == 0)
+		verify_checkpoint_blocks(spa);
+
+	return (error);
+}
+
+static void
 dump_zpool(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
@@ -3911,6 +4485,9 @@ dump_zpool(spa_t *spa)
 	if (dump_opt['h'])
 		dump_history(spa);
 
+	if (rc == 0 && !dump_opt['L'])
+		rc = verify_checkpoint(spa);
+
 	if (rc != 0) {
 		dump_debug_buffer();
 		exit(rc);
@@ -4413,6 +4990,7 @@ main(int argc, char **argv)
 	int rewind = ZPOOL_NEVER_REWIND;
 	char *spa_config_path_env;
 	boolean_t target_is_spa = B_TRUE;
+	nvlist_t *cfg = NULL;
 
 	(void) setrlimit(RLIMIT_NOFILE, &rl);
 	(void) enable_extended_FILE_stdio(-1, -1);
@@ -4429,7 +5007,7 @@ main(int argc, char **argv)
 		spa_config_path = spa_config_path_env;
 
 	while ((c = getopt(argc, argv,
-	    "AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
+	    "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
 		switch (c) {
 		case 'b':
 		case 'c':
@@ -4454,6 +5032,7 @@ main(int argc, char **argv)
 		case 'A':
 		case 'e':
 		case 'F':
+		case 'k':
 		case 'L':
 		case 'P':
 		case 'q':
@@ -4559,7 +5138,7 @@ main(int argc, char **argv)
 		verbose = MAX(verbose, 1);
 
 	for (c = 0; c < 256; c++) {
-		if (dump_all && strchr("AeEFlLOPRSX", c) == NULL)
+		if (dump_all && strchr("AeEFklLOPRSX", c) == NULL)
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201803281812.w2SIC7Ti011304>