Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 18 Feb 2018 01:21:52 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r329502 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumos/dist/uts/common/sys/fs vendor/ill...
Message-ID:  <201802180121.w1I1LqQd091337@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Sun Feb 18 01:21:52 2018
New Revision: 329502
URL: https://svnweb.freebsd.org/changeset/base/329502

Log:
  7614 zfs device evacuation/removal
  
  illumos/illumos-gate@5cabbc6b49070407fb9610cfe73d4c0e0dea3e77
  
  https://www.illumos.org/issues/7614:
  This project allows top-level vdevs to be removed from the storage pool with
  “zpool remove”, reducing the total amount of storage in the pool. This
  operation copies all allocated regions of the device to be removed onto other
  devices, recording the mapping from old to new location. After the removal is
  complete, read and free operations to the removed (now “indirect”) vdev must
  be remapped and performed at the new location on disk. The indirect mapping
  table is kept in memory whenever the pool is loaded, so there is minimal
  performance overhead when doing operations on the indirect vdev.
  
  The size of the in-memory mapping table will be reduced when its entries
  become “obsolete” because they are no longer used by any block pointers in
  the pool. An entry becomes obsolete when all the blocks that use it are
  freed. An entry can also become obsolete when all the snapshots that
  reference it are deleted, and the block pointers that reference it have been
  “remapped” in all filesystems/zvols (and clones). Whenever an indirect block
  is written, all the block pointers in it will be “remapped” to their new
  (concrete) locations if possible. This process can be accelerated by using
  the “zfs remap” command to proactively rewrite all indirect blocks that
  reference indirect (removed) vdevs.
  
  Note that when a device is removed, we do not verify the checksum of the data
  that is copied. This makes the process much faster, but if it were used on
  redundant vdevs (i.e. mirror or raidz vdevs), it would be possible to copy
  the wrong data, when we have the correct data on e.g. the other side of the
  mirror. Therefore, mirror and raidz devices can not be removed.
  
  Reviewed by: Alex Reece <alex@delphix.com>
  Reviewed by: George Wilson <george.wilson@delphix.com>
  Reviewed by: John Kennedy <john.kennedy@delphix.com>
  Reviewed by: Prakash Surya <prakash.surya@delphix.com>
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Reviewed by: Richard Laager <rlaager@wiktel.com>
  Reviewed by: Tim Chase <tim@chase2k.com>
  Approved by: Garrett D'Amore <garrett@damore.org>
  Author: Prashanth Sreenivasa <pks@delphix.com>

Added:
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_indirect_births.h   (contents, props changed)
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_indirect_mapping.h   (contents, props changed)
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_removal.h   (contents, props changed)
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c   (contents, props changed)
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect_births.c   (contents, props changed)
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect_mapping.c   (contents, props changed)
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c   (contents, props changed)
Modified:
  vendor-sys/illumos/dist/common/zfs/zfeature_common.c
  vendor-sys/illumos/dist/common/zfs/zfeature_common.h
  vendor-sys/illumos/dist/common/zfs/zfs_deleg.c
  vendor-sys/illumos/dist/common/zfs/zfs_deleg.h
  vendor-sys/illumos/dist/common/zfs/zfs_prop.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/bpobj.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_zfetch.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_deadlist.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_destroy.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_config.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/space_reftree.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/bpobj.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_dataset.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_deadlist.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_deleg.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_dir.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_pool.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_scan.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/range_tree.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_debug.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zil.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_priority.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/txg.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_label.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_missing.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_raidz.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_root.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zcp_get.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
  vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/zdb/zdb.c
  vendor/illumos/dist/cmd/zfs/zfs_main.c
  vendor/illumos/dist/cmd/zpool/zpool_main.c
  vendor/illumos/dist/cmd/ztest/ztest.c
  vendor/illumos/dist/lib/libzfs/common/libzfs.h
  vendor/illumos/dist/lib/libzfs/common/libzfs_dataset.c
  vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c
  vendor/illumos/dist/lib/libzfs/common/libzfs_util.c
  vendor/illumos/dist/lib/libzfs_core/common/libzfs_core.c
  vendor/illumos/dist/lib/libzfs_core/common/libzfs_core.h
  vendor/illumos/dist/man/man1m/zfs.1m
  vendor/illumos/dist/man/man1m/zpool.1m
  vendor/illumos/dist/man/man5/zpool-features.5

Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -246,4 +246,20 @@ zpool_feature_init(void)
 	    "org.illumos:edonr", "edonr",
 	    "Edon-R hash algorithm.",
 	    ZFEATURE_FLAG_PER_DATASET, NULL);
+
+	zfeature_register(SPA_FEATURE_DEVICE_REMOVAL,
+	    "com.delphix:device_removal", "device_removal",
+	    "Top-level vdevs can be removed, reducing logical pool size.",
+	    ZFEATURE_FLAG_MOS, NULL);
+
+	static const spa_feature_t obsolete_counts_deps[] = {
+		SPA_FEATURE_EXTENSIBLE_DATASET,
+		SPA_FEATURE_DEVICE_REMOVAL,
+		SPA_FEATURE_NONE
+	};
+	zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS,
+	    "com.delphix:obsolete_counts", "obsolete_counts",
+	    "Reduce memory used by removed devices when their blocks are "
+	    "freed or remapped.",
+	    ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps);
 }

Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.h
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.h	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.h	Sun Feb 18 01:21:52 2018	(r329502)
@@ -56,6 +56,8 @@ typedef enum spa_feature {
 	SPA_FEATURE_SHA512,
 	SPA_FEATURE_SKEIN,
 	SPA_FEATURE_EDONR,
+	SPA_FEATURE_DEVICE_REMOVAL,
+	SPA_FEATURE_OBSOLETE_COUNTS,
 	SPA_FEATURES
 } spa_feature_t;
 

Modified: vendor-sys/illumos/dist/common/zfs/zfs_deleg.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfs_deleg.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/common/zfs/zfs_deleg.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  */
 
@@ -53,6 +53,7 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
 	{ZFS_DELEG_PERM_MOUNT},
 	{ZFS_DELEG_PERM_PROMOTE},
 	{ZFS_DELEG_PERM_RECEIVE},
+	{ZFS_DELEG_PERM_REMAP},
 	{ZFS_DELEG_PERM_RENAME},
 	{ZFS_DELEG_PERM_ROLLBACK},
 	{ZFS_DELEG_PERM_SNAPSHOT},

Modified: vendor-sys/illumos/dist/common/zfs/zfs_deleg.h
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfs_deleg.h	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/common/zfs/zfs_deleg.h	Sun Feb 18 01:21:52 2018	(r329502)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef	_ZFS_DELEG_H
@@ -67,6 +67,7 @@ typedef enum {
 	ZFS_DELEG_NOTE_RELEASE,
 	ZFS_DELEG_NOTE_DIFF,
 	ZFS_DELEG_NOTE_BOOKMARK,
+	ZFS_DELEG_NOTE_REMAP,
 	ZFS_DELEG_NOTE_NONE
 } zfs_deleg_note_t;
 

Modified: vendor-sys/illumos/dist/common/zfs/zfs_prop.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfs_prop.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/common/zfs/zfs_prop.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -420,6 +420,8 @@ zfs_prop_init(void)
 	/* hidden properties */
 	zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "CREATETXG");
+	zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER,
+	    PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG");
 	zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
 	zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -5009,7 +5009,7 @@ top:
 			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
 			addr = hdr->b_l2hdr.b_daddr;
 			/*
-			 * Lock out device removal.
+			 * Lock out L2ARC device removal.
 			 */
 			if (vdev_is_dead(vd) ||
 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/bpobj.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/bpobj.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/bpobj.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -176,6 +176,12 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object
 	return (0);
 }
 
+boolean_t
+bpobj_is_open(const bpobj_t *bpo)
+{
+	return (bpo->bpo_object != 0);
+}
+
 void
 bpobj_close(bpobj_t *bpo)
 {
@@ -194,11 +200,11 @@ bpobj_close(bpobj_t *bpo)
 	mutex_destroy(&bpo->bpo_lock);
 }
 
-static boolean_t
-bpobj_hasentries(bpobj_t *bpo)
+boolean_t
+bpobj_is_empty(bpobj_t *bpo)
 {
-	return (bpo->bpo_phys->bpo_num_blkptrs != 0 ||
-	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0));
+	return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
+	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
 }
 
 static int
@@ -211,11 +217,9 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, vo
 	int err = 0;
 	dmu_buf_t *dbuf = NULL;
 
+	ASSERT(bpobj_is_open(bpo));
 	mutex_enter(&bpo->bpo_lock);
 
-	if (!bpobj_hasentries(bpo))
-		goto out;
-
 	if (free)
 		dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 
@@ -345,7 +349,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, vo
 
 out:
 	/* If there are no entries, there should be no bytes. */
-	if (!bpobj_hasentries(bpo)) {
+	if (bpobj_is_empty(bpo)) {
 		ASSERT0(bpo->bpo_phys->bpo_bytes);
 		ASSERT0(bpo->bpo_phys->bpo_comp);
 		ASSERT0(bpo->bpo_phys->bpo_uncomp);
@@ -380,6 +384,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dm
 	bpobj_t subbpo;
 	uint64_t used, comp, uncomp, subsubobjs;
 
+	ASSERT(bpobj_is_open(bpo));
+	ASSERT(subobj != 0);
 	ASSERT(bpo->bpo_havesubobj);
 	ASSERT(bpo->bpo_havecomp);
 	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
@@ -392,7 +398,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dm
 	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
 	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
 
-	if (!bpobj_hasentries(&subbpo)) {
+	if (bpobj_is_empty(&subbpo)) {
 		/* No point in having an empty subobj. */
 		bpobj_close(&subbpo);
 		bpobj_free(bpo->bpo_os, subobj, tx);
@@ -466,6 +472,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx
 	int blkoff;
 	blkptr_t *bparray;
 
+	ASSERT(bpobj_is_open(bpo));
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
@@ -551,6 +558,7 @@ space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t
 int
 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
+	ASSERT(bpobj_is_open(bpo));
 	mutex_enter(&bpo->bpo_lock);
 
 	*usedp = bpo->bpo_phys->bpo_bytes;
@@ -576,6 +584,8 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint6
 {
 	struct space_range_arg sra = { 0 };
 	int err;
+
+	ASSERT(bpobj_is_open(bpo));
 
 	/*
 	 * As an optimization, if they want the whole txg range, just

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -47,6 +47,7 @@
 #include <sys/range_tree.h>
 #include <sys/callb.h>
 #include <sys/abd.h>
+#include <sys/vdev.h>
 
 uint_t zfs_dbuf_evict_key;
 
@@ -3007,6 +3008,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
+
 	dbuf_write(dr, db->db_buf, tx);
 
 	zio = dr->dr_zio;
@@ -3478,6 +3480,141 @@ dbuf_write_override_done(zio_t *zio)
 		abd_put(zio->io_abd);
 }
 
+typedef struct dbuf_remap_impl_callback_arg {
+	objset_t	*drica_os;
+	uint64_t	drica_blk_birth;
+	dmu_tx_t	*drica_tx;
+} dbuf_remap_impl_callback_arg_t;
+
+static void
+dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
+    void *arg)
+{
+	dbuf_remap_impl_callback_arg_t *drica = arg;
+	objset_t *os = drica->drica_os;
+	spa_t *spa = dmu_objset_spa(os);
+	dmu_tx_t *tx = drica->drica_tx;
+
+	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+	if (os == spa_meta_objset(spa)) {
+		spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
+	} else {
+		dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
+		    size, drica->drica_blk_birth, tx);
+	}
+}
+
+static void
+dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
+{
+	blkptr_t bp_copy = *bp;
+	spa_t *spa = dmu_objset_spa(dn->dn_objset);
+	dbuf_remap_impl_callback_arg_t drica;
+
+	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+	drica.drica_os = dn->dn_objset;
+	drica.drica_blk_birth = bp->blk_birth;
+	drica.drica_tx = tx;
+	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
+	    &drica)) {
+		/*
+		 * The struct_rwlock prevents dbuf_read_impl() from
+		 * dereferencing the BP while we are changing it.  To
+		 * avoid lock contention, only grab it when we are actually
+		 * changing the BP.
+		 */
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		*bp = bp_copy;
+		rw_exit(&dn->dn_struct_rwlock);
+	}
+}
+
+/*
+ * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting
+ * to remap a copy of every bp in the dbuf.
+ */
+boolean_t
+dbuf_can_remap(const dmu_buf_impl_t *db)
+{
+	spa_t *spa = dmu_objset_spa(db->db_objset);
+	blkptr_t *bp = db->db.db_data;
+	boolean_t ret = B_FALSE;
+
+	ASSERT3U(db->db_level, >, 0);
+	ASSERT3S(db->db_state, ==, DB_CACHED);
+
+	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+	for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
+		blkptr_t bp_copy = bp[i];
+		if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
+			ret = B_TRUE;
+			break;
+		}
+	}
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	return (ret);
+}
+
+boolean_t
+dnode_needs_remap(const dnode_t *dn)
+{
+	spa_t *spa = dmu_objset_spa(dn->dn_objset);
+	boolean_t ret = B_FALSE;
+
+	if (dn->dn_phys->dn_nlevels == 0) {
+		return (B_FALSE);
+	}
+
+	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+	for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) {
+		blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j];
+		if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
+			ret = B_TRUE;
+			break;
+		}
+	}
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	return (ret);
+}
+
+/*
+ * Remap any existing BP's to concrete vdevs, if possible.
+ */
+static void
+dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_objset_spa(db->db_objset);
+	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
+		return;
+
+	if (db->db_level > 0) {
+		blkptr_t *bp = db->db.db_data;
+		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
+			dbuf_remap_impl(dn, &bp[i], tx);
+		}
+	} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+		dnode_phys_t *dnp = db->db.db_data;
+		ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
+		    DMU_OT_DNODE);
+		for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) {
+			for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
+				dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
+			}
+		}
+	}
+}
+
+
 /* Issue I/O to commit a dirty buffer to disk. */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
@@ -3511,6 +3648,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, d
 			} else {
 				dbuf_release_bp(db);
 			}
+			dbuf_remap(dn, db, tx);
 		}
 	}
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -704,15 +704,15 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t a
 	for (type = 0; type < DDT_TYPES; type++) {
 		for (class = 0; class < DDT_CLASSES; class++) {
 			error = ddt_object_lookup(ddt, type, class, dde);
-			if (error != ENOENT)
+			if (error != ENOENT) {
+				ASSERT0(error);
 				break;
+			}
 		}
 		if (error != ENOENT)
 			break;
 	}
 
-	ASSERT(error == 0 || error == ENOENT);
-
 	ddt_enter(ddt);
 
 	ASSERT(dde->dde_loaded == B_FALSE);
@@ -1099,7 +1099,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
 {
 	dmu_tx_t *tx;
 	zio_t *rio = zio_root(spa, NULL, NULL,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
 
 	ASSERT(spa_syncing_txg(spa) == txg);
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -65,6 +65,13 @@ int zfs_nopwrite_enabled = 1;
  */
 uint32_t zfs_per_txg_dirty_frees_percent = 30;
 
+/*
+ * This can be used for testing, to ensure that certain actions happen
+ * while in the middle of a remap (which might otherwise complete too
+ * quickly).
+ */
+int zfs_object_remap_one_indirect_delay_ticks = 0;
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
 	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
@@ -1012,6 +1019,123 @@ dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint6
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+static int
+dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
+    uint64_t last_removal_txg, uint64_t offset)
+{
+	uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
+	int err = 0;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+	ASSERT3P(dbuf, !=, NULL);
+
+	/*
+	 * If the block hasn't been written yet, this default will ensure
+	 * we don't try to remap it.
+	 */
+	uint64_t birth = UINT64_MAX;
+	ASSERT3U(last_removal_txg, !=, UINT64_MAX);
+	if (dbuf->db_blkptr != NULL)
+		birth = dbuf->db_blkptr->blk_birth;
+	rw_exit(&dn->dn_struct_rwlock);
+
+	/*
+	 * If this L1 was already written after the last removal, then we've
+	 * already tried to remap it.
+	 */
+	if (birth <= last_removal_txg &&
+	    dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
+	    dbuf_can_remap(dbuf)) {
+		dmu_tx_t *tx = dmu_tx_create(os);
+		dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err == 0) {
+			(void) dbuf_dirty(dbuf, tx);
+			dmu_tx_commit(tx);
+		} else {
+			dmu_tx_abort(tx);
+		}
+	}
+
+	dbuf_rele(dbuf, FTAG);
+
+	delay(zfs_object_remap_one_indirect_delay_ticks);
+
+	return (err);
+}
+
+/*
+ * Remap all blockpointers in the object, if possible, so that they reference
+ * only concrete vdevs.
+ *
+ * To do this, iterate over the L0 blockpointers and remap any that reference
+ * an indirect vdev. Note that we only examine L0 blockpointers; since we
+ * cannot guarantee that we can remap all blockpointers anyway (due to split
+ * blocks), we do not want to make the code unnecessarily complicated to
+ * catch the unlikely case that there is an L1 block on an indirect vdev that
+ * contains no indirect blockpointers.
+ */
+int
+dmu_object_remap_indirects(objset_t *os, uint64_t object,
+    uint64_t last_removal_txg)
+{
+	uint64_t offset, l1span;
+	int err;
+	dnode_t *dn;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err != 0) {
+		return (err);
+	}
+
+	if (dn->dn_nlevels <= 1) {
+		if (issig(JUSTLOOKING) && issig(FORREAL)) {
+			err = SET_ERROR(EINTR);
+		}
+
+		/*
+		 * If the dnode has no indirect blocks, we cannot dirty them.
+		 * We still want to remap the blkptr(s) in the dnode if
+		 * appropriate, so mark it as dirty.
+		 */
+		if (err == 0 && dnode_needs_remap(dn)) {
+			dmu_tx_t *tx = dmu_tx_create(os);
+			dmu_tx_hold_bonus(tx, dn->dn_object);
+			if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
+				dnode_setdirty(dn, tx);
+				dmu_tx_commit(tx);
+			} else {
+				dmu_tx_abort(tx);
+			}
+		}
+
+		dnode_rele(dn, FTAG);
+		return (err);
+	}
+
+	offset = 0;
+	l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
+	    dn->dn_datablkshift);
+	/*
+	 * Find the next L1 indirect that is not a hole.
+	 */
+	while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
+		if (issig(JUSTLOOKING) && issig(FORREAL)) {
+			err = SET_ERROR(EINTR);
+			break;
+		}
+		if ((err = dmu_object_remap_one_indirect(os, dn,
+		    last_removal_txg, offset)) != 0) {
+			break;
+		}
+		offset += l1span;
+	}
+
+	dnode_rele(dn, FTAG);
+	return (err);
 }
 
 void

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -53,6 +53,7 @@
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
 #include <sys/vdev.h>
+#include <sys/zfeature.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
@@ -348,6 +349,17 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl
 
 	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 
+	/*
+	 * The $ORIGIN dataset (if it exists) doesn't have an associated
+	 * objset, so there's no reason to open it. The $ORIGIN dataset
+	 * will not exist on pools older than SPA_VERSION_ORIGIN.
+	 */
+	if (ds != NULL && spa_get_dsl(spa) != NULL &&
+	    spa_get_dsl(spa)->dp_origin_snap != NULL) {
+		ASSERT3P(ds->ds_dir, !=,
+		    spa_get_dsl(spa)->dp_origin_snap->ds_dir);
+	}
+
 	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
 	os->os_dsl_dataset = ds;
 	os->os_spa = spa;
@@ -1050,6 +1062,101 @@ dmu_objset_clone(const char *clone, const char *origin
 	return (dsl_sync_task(clone,
 	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
 	    5, ZFS_SPACE_CHECK_NORMAL));
+}
+
+static int
+dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg)
+{
+	int error = 0;
+	uint64_t object = 0;
+	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
+		error = dmu_object_remap_indirects(os, object,
+		    last_removed_txg);
+		/*
+		 * If the ZPL removed the object before we managed to dnode_hold
+		 * it, we would get an ENOENT. If the ZPL declares its intent
+		 * to remove the object (dnode_free) before we manage to
+		 * dnode_hold it, we would get an EEXIST. In either case, we
+		 * want to continue remapping the other objects in the objset;
+		 * in all other cases, we want to break early.
+		 */
+		if (error != 0 && error != ENOENT && error != EEXIST) {
+			break;
+		}
+	}
+	if (error == ESRCH) {
+		error = 0;
+	}
+	return (error);
+}
+
+int
+dmu_objset_remap_indirects(const char *fsname)
+{
+	int error = 0;
+	objset_t *os = NULL;
+	uint64_t last_removed_txg;
+	uint64_t remap_start_txg;
+	dsl_dir_t *dd;
+
+	error = dmu_objset_hold(fsname, FTAG, &os);
+	if (error != 0) {
+		return (error);
+	}
+	dd = dmu_objset_ds(os)->ds_dir;
+
+	if (!spa_feature_is_enabled(dmu_objset_spa(os),
+	    SPA_FEATURE_OBSOLETE_COUNTS)) {
+		dmu_objset_rele(os, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) {
+		dmu_objset_rele(os, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * If there has not been a removal, we're done.
+	 */
+	last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os));
+	if (last_removed_txg == -1ULL) {
+		dmu_objset_rele(os, FTAG);
+		return (0);
+	}
+
+	/*
+	 * If we have remapped since the last removal, we're done.
+	 */
+	if (dsl_dir_is_zapified(dd)) {
+		uint64_t last_remap_txg;
+		if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)),
+		    dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
+		    sizeof (last_remap_txg), 1, &last_remap_txg) == 0 &&
+		    last_remap_txg > last_removed_txg) {
+			dmu_objset_rele(os, FTAG);
+			return (0);
+		}
+	}
+
+	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
+	dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+	remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os));
+	error = dmu_objset_remap_indirects_impl(os, last_removed_txg);
+	if (error == 0) {
+		/*
+		 * We update the last_remap_txg to be the start txg so that
+		 * we can guarantee that every block older than last_remap_txg
+		 * that can be remapped has been remapped.
+		 */
+		error = dsl_dir_update_last_remap_txg(dd, remap_start_txg);
+	}
+
+	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
+	dsl_dataset_rele(dmu_objset_ds(os), FTAG);
+
+	return (error);
 }
 
 int

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -301,6 +301,23 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint6
 }
 
 void
+dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
+{
+	dmu_tx_hold_t *txh;
+
+	ASSERT(tx->tx_txg == 0);
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    object, THT_WRITE, 0, 0);
+	if (txh == NULL)
+		return;
+
+	dnode_t *dn = txh->txh_dnode;
+	(void) refcount_add_many(&txh->txh_space_towrite,
+	    1ULL << dn->dn_indblkshift, FTAG);
+	dmu_tx_count_dnode(txh);
+}
+
+void
 dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
 {
 	dmu_tx_hold_t *txh;

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_zfetch.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_zfetch.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_zfetch.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -211,8 +211,19 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblk
 	int64_t pf_ahead_blks, max_blks;
 	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
 	uint64_t end_of_access_blkid = blkid + nblks;
+	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
 
 	if (zfs_prefetch_disable)
+		return;
+
+	/*
+	 * If we haven't yet loaded the indirect vdevs' mappings, we
+	 * can only read from blocks that we carefully ensure are on
+	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
+	 * can't allow the predictive prefetcher to attempt reads of other
+	 * blocks (e.g. of the MOS's dnode object).
+	 */
+	if (!spa_indirect_vdevs_loaded(spa))
 		return;
 
 	/*

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -1690,8 +1690,7 @@ done:
 	mutex_enter(&dn->dn_mtx);
 	int txgoff = tx->tx_txg & TXG_MASK;
 	if (dn->dn_free_ranges[txgoff] == NULL) {
-		dn->dn_free_ranges[txgoff] =
-		    range_tree_create(NULL, NULL, &dn->dn_mtx);
+		dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
 	}
 	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
 	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c	Sun Feb 18 01:16:37 2018	(r329501)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c	Sun Feb 18 01:21:52 2018	(r329502)
@@ -46,6 +46,7 @@
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
+#include <sys/vdev.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
@@ -81,6 +82,11 @@ int zfs_max_recordsize = 1 * 1024 * 1024;
 
 extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
 
+static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
+    uint64_t obj, dmu_tx_t *tx);
+static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
+    dmu_tx_t *tx);
+
 extern int spa_asize_inflation;
 
 static zil_header_t zero_zil;
@@ -154,6 +160,47 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr
 	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
 }
 
+/*
+ * Called when the specified segment has been remapped, and is thus no
+ * longer referenced in the head dataset.  The vdev must be indirect.
+ *
+ * If the segment is referenced by a snapshot, put it on the remap deadlist.
+ * Otherwise, add this segment to the obsolete spacemap.
+ */
+void
+dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
+    uint64_t size, uint64_t birth, dmu_tx_t *tx)
+{
+	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(birth <= tx->tx_txg);
+	ASSERT(!ds->ds_is_snapshot);
+
+	if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+		spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
+	} else {
+		blkptr_t fakebp;
+		dva_t *dva = &fakebp.blk_dva[0];
+
+		ASSERT(ds != NULL);
+
+		mutex_enter(&ds->ds_remap_deadlist_lock);
+		if (!dsl_dataset_remap_deadlist_exists(ds)) {
+			dsl_dataset_create_remap_deadlist(ds, tx);
+		}
+		mutex_exit(&ds->ds_remap_deadlist_lock);
+
+		BP_ZERO(&fakebp);
+		fakebp.blk_birth = birth;
+		DVA_SET_VDEV(dva, vdev);
+		DVA_SET_OFFSET(dva, offset);
+		DVA_SET_ASIZE(dva, size);
+
+		dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx);
+	}
+}
+
 int
 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
     boolean_t async)
@@ -272,8 +319,10 @@ dsl_dataset_evict_async(void *dbu)
 	}
 
 	bplist_destroy(&ds->ds_pending_deadlist);
-	if (ds->ds_deadlist.dl_os != NULL)
+	if (dsl_deadlist_is_open(&ds->ds_deadlist))
 		dsl_deadlist_close(&ds->ds_deadlist);
+	if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+		dsl_deadlist_close(&ds->ds_remap_deadlist);
 	if (ds->ds_dir)
 		dsl_dir_async_rele(ds->ds_dir, ds);
 
@@ -283,6 +332,7 @@ dsl_dataset_evict_async(void *dbu)
 	mutex_destroy(&ds->ds_lock);
 	mutex_destroy(&ds->ds_opening_lock);
 	mutex_destroy(&ds->ds_sendstream_lock);
+	mutex_destroy(&ds->ds_remap_deadlist_lock);
 	refcount_destroy(&ds->ds_longholds);
 	rrw_destroy(&ds->ds_bp_rwlock);
 
@@ -407,15 +457,23 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, v
 		ds->ds_object = dsobj;
 		ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
 
+		err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj,
+		    NULL, ds, &ds->ds_dir);
+		if (err != 0) {
+			kmem_free(ds, sizeof (dsl_dataset_t));
+			dmu_buf_rele(dbuf, tag);
+			return (err);
+		}
+
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&ds->ds_remap_deadlist_lock,
+		    NULL, MUTEX_DEFAULT, NULL);
 		rrw_init(&ds->ds_bp_rwlock, B_FALSE);
 		refcount_create(&ds->ds_longholds);
 
 		bplist_create(&ds->ds_pending_deadlist);
-		dsl_deadlist_open(&ds->ds_deadlist,
-		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
 
 		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
 		    offsetof(dmu_sendarg_t, dsa_link));
@@ -439,20 +497,6 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, v
 			}
 		}
 
-		err = dsl_dir_hold_obj(dp,
-		    dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
-		if (err != 0) {
-			mutex_destroy(&ds->ds_lock);
-			mutex_destroy(&ds->ds_opening_lock);
-			mutex_destroy(&ds->ds_sendstream_lock);
-			refcount_destroy(&ds->ds_longholds);
-			bplist_destroy(&ds->ds_pending_deadlist);
-			dsl_deadlist_close(&ds->ds_deadlist);
-			kmem_free(ds, sizeof (dsl_dataset_t));
-			dmu_buf_rele(dbuf, tag);
-			return (err);
-		}
-
 		if (!ds->ds_is_snapshot) {
 			ds->ds_snapname[0] = '\0';
 			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
@@ -493,6 +537,15 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, v
 			ds->ds_reserved = ds->ds_quota = 0;
 		}
 
+		dsl_deadlist_open(&ds->ds_deadlist,
+		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
+		uint64_t remap_deadlist_obj =
+		    dsl_dataset_get_remap_deadlist_object(ds);
+		if (remap_deadlist_obj != 0) {
+			dsl_deadlist_open(&ds->ds_remap_deadlist, mos,
+			    remap_deadlist_obj);
+		}
+
 		dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
 		    dsl_dataset_evict_async, &ds->ds_dbuf);
 		if (err == 0)
@@ -501,6 +554,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, v
 		if (err != 0 || winner != NULL) {
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
+			if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+				dsl_deadlist_close(&ds->ds_remap_deadlist);
 			if (ds->ds_prev)
 				dsl_dataset_rele(ds->ds_prev, ds);
 			dsl_dir_rele(ds->ds_dir, ds);
@@ -1393,10 +1448,27 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, cons
 	dsl_deadlist_add_key(&ds->ds_deadlist,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
 
+	if (dsl_dataset_remap_deadlist_exists(ds)) {
+		uint64_t remap_deadlist_obj =
+		    dsl_dataset_get_remap_deadlist_object(ds);
+		/*
+		 * Move the remap_deadlist to the snapshot.  The head
+		 * will create a new remap deadlist on demand, from
+		 * dsl_dataset_block_remapped().
+		 */
+		dsl_dataset_unset_remap_deadlist_object(ds, tx);
+		dsl_deadlist_close(&ds->ds_remap_deadlist);
+
+		dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST,
+		    sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx));
+	}
+
 	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
 	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
 	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
 	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
+
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
@@ -3233,6 +3305,41 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone
 	return (0);
 }
 
+static void
+dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone,
+    dsl_dataset_t *origin, dmu_tx_t *tx)
+{
+	uint64_t clone_remap_dl_obj, origin_remap_dl_obj;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+
+	ASSERT(dsl_pool_sync_context(dp));
+
+	clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone);
+	origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin);
+
+	if (clone_remap_dl_obj != 0) {
+		dsl_deadlist_close(&clone->ds_remap_deadlist);
+		dsl_dataset_unset_remap_deadlist_object(clone, tx);
+	}
+	if (origin_remap_dl_obj != 0) {
+		dsl_deadlist_close(&origin->ds_remap_deadlist);
+		dsl_dataset_unset_remap_deadlist_object(origin, tx);
+	}
+
+	if (clone_remap_dl_obj != 0) {
+		dsl_dataset_set_remap_deadlist_object(origin,
+		    clone_remap_dl_obj, tx);
+		dsl_deadlist_open(&origin->ds_remap_deadlist,
+		    dp->dp_meta_objset, clone_remap_dl_obj);
+	}
+	if (origin_remap_dl_obj != 0) {
+		dsl_dataset_set_remap_deadlist_object(clone,
+		    origin_remap_dl_obj, tx);
+		dsl_deadlist_open(&clone->ds_remap_deadlist,
+		    dp->dp_meta_objset, origin_remap_dl_obj);
+	}
+}
+
 void
 dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, dmu_tx_t *tx)
@@ -3402,6 +3509,7 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
 	    dsl_dataset_phys(clone)->ds_deadlist_obj);
 	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
 	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
+	dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);
 
 	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
 
@@ -3908,4 +4016,91 @@ dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds
 	return (dsl_dataset_is_zapified(ds) &&
 	    zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
 	    ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
+}
+
+uint64_t
+dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds)
+{
+	uint64_t remap_deadlist_obj;
+	int err;
+
+	if (!dsl_dataset_is_zapified(ds))
+		return (0);
+
+	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
+	    DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1,
+	    &remap_deadlist_obj);
+
+	if (err != 0) {
+		VERIFY3S(err, ==, ENOENT);
+		return (0);
+	}
+
+	ASSERT(remap_deadlist_obj != 0);
+	return (remap_deadlist_obj);
+}
+
+boolean_t
+dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds)
+{
+	EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist),
+	    dsl_dataset_get_remap_deadlist_object(ds) != 0);
+	return (dsl_deadlist_is_open(&ds->ds_remap_deadlist));
+}
+
+static void
+dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj,
+    dmu_tx_t *tx)
+{
+	ASSERT(obj != 0);
+	dsl_dataset_zapify(ds, tx);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201802180121.w1I1LqQd091337>