Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 1 Jul 2014 07:29:42 +0000 (UTC)
From:      Xin LI <delphij@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r268079 - in head: cddl/contrib/opensolaris/cmd/zdb cddl/contrib/opensolaris/lib/libzfs/common sys/cddl/contrib/opensolaris/common/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys...
Message-ID:  <201407010729.s617Tgts010424@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: delphij
Date: Tue Jul  1 07:29:42 2014
New Revision: 268079
URL: http://svnweb.freebsd.org/changeset/base/268079

Log:
  MFV r267566:
  
  4390 i/o errors when deleting filesystem/zvol can lead to space map corruption
  
  MFC after:	2 weeks

Modified:
  head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
  head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
  head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
Directory Properties:
  head/cddl/contrib/opensolaris/   (props changed)
  head/cddl/contrib/opensolaris/lib/libzfs/   (props changed)
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -75,9 +75,9 @@
 	DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
 
 #ifndef lint
-extern int zfs_recover;
+extern boolean_t zfs_recover;
 #else
-int zfs_recover;
+boolean_t zfs_recover;
 #endif
 
 const char cmdname[] = "zdb";

Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -303,6 +303,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpoo
 		case ZPOOL_PROP_ALLOCATED:
 		case ZPOOL_PROP_FREE:
 		case ZPOOL_PROP_FREEING:
+		case ZPOOL_PROP_LEAKED:
 		case ZPOOL_PROP_EXPANDSZ:
 			if (literal) {
 				(void) snprintf(buf, len, "%llu",

Modified: head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -81,6 +81,8 @@ zpool_prop_init(void)
 	    ZFS_TYPE_POOL, "<size>", "FREE");
 	zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "FREEING");
+	zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
+	    ZFS_TYPE_POOL, "<size>", "LEAKED");
 	zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
 	    PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
 	zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, 
 	return (dmu_object_free(os, obj, tx));
 }
 
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+	boolean_t rv;
+
+	VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	rv = (bt->bt_begin == bt->bt_end);
+	dmu_buf_rele(db, FTAG);
+	return (rv);
+}
+
 void
 bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	bptree_phys_t *bt;
-	bptree_entry_phys_t bte;
+	bptree_entry_phys_t bte = { 0 };
 
 	/*
 	 * bptree objects are in the pool mos, therefore they can only be
@@ -122,7 +136,6 @@ bptree_add(objset_t *os, uint64_t obj, b
 
 	bte.be_birth_txg = birth_txg;
 	bte.be_bp = *bp;
-	bzero(&bte.be_zb, sizeof (bte.be_zb));
 	dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
 
 	dmu_buf_will_dirty(db, tx);
@@ -153,10 +166,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zil
 	return (err);
 }
 
+/*
+ * If "free" is set:
+ *  - It is assumed that "func" will be freeing the block pointers.
+ *  - If "func" returns nonzero, the bookmark will be remembered and
+ *    iteration will be restarted from this point on next invocation.
+ *  - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ *    bptree_iterate will remember the bookmark, continue traversing
+ *    any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
 int
 bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
     void *arg, dmu_tx_t *tx)
 {
+	boolean_t ioerr = B_FALSE;
 	int err;
 	uint64_t i;
 	dmu_buf_t *db;
@@ -182,49 +212,82 @@ bptree_iterate(objset_t *os, uint64_t ob
 		bptree_entry_phys_t bte;
 		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
 
-		ASSERT(!free || i == ba.ba_phys->bt_begin);
-
 		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
 		    &bte, DMU_READ_NO_PREFETCH);
 		if (err != 0)
 			break;
 
-		if (zfs_recover)
+		if (zfs_free_leak_on_eio)
 			flags |= TRAVERSE_HARD;
+		zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
+		    "bookmark %lld/%lld/%lld/%lld",
+		    i, (longlong_t)bte.be_birth_txg,
+		    (longlong_t)bte.be_zb.zb_objset,
+		    (longlong_t)bte.be_zb.zb_object,
+		    (longlong_t)bte.be_zb.zb_level,
+		    (longlong_t)bte.be_zb.zb_blkid);
 		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
 		    bte.be_birth_txg, &bte.be_zb, flags,
 		    bptree_visit_cb, &ba);
 		if (free) {
-			if (err == ERESTART) {
+			/*
+			 * The callback has freed the visited block pointers.
+			 * Record our traversal progress on disk, either by
+			 * updating this record's bookmark, or by logically
+			 * removing this record by advancing bt_begin.
+			 */
+			if (err != 0) {
 				/* save bookmark for future resume */
 				ASSERT3U(bte.be_zb.zb_objset, ==,
 				    ZB_DESTROYED_OBJSET);
 				ASSERT0(bte.be_zb.zb_level);
 				dmu_write(os, obj, i * sizeof (bte),
 				    sizeof (bte), &bte, tx);
-				break;
-			}
-			if (err != 0) {
+				if (err == EIO || err == ECKSUM ||
+				    err == ENXIO) {
+					/*
+					 * Skip the rest of this tree and
+					 * continue on to the next entry.
+					 */
+					err = 0;
+					ioerr = B_TRUE;
+				} else {
+					break;
+				}
+			} else if (ioerr) {
 				/*
-				 * We can not properly handle an i/o
-				 * error, because the traversal code
-				 * does not know how to resume from an
-				 * arbitrary bookmark.
+				 * This entry is finished, but there were
+				 * i/o errors on previous entries, so we
+				 * can't adjust bt_begin.  Set this entry's
+				 * be_birth_txg such that it will be
+				 * treated as a no-op in future traversals.
 				 */
-				zfs_panic_recover("error %u from "
-				    "traverse_dataset_destroyed()", err);
+				bte.be_birth_txg = UINT64_MAX;
+				dmu_write(os, obj, i * sizeof (bte),
+				    sizeof (bte), &bte, tx);
 			}
 
-			ba.ba_phys->bt_begin++;
-			(void) dmu_free_range(os, obj,
-			    i * sizeof (bte), sizeof (bte), tx);
+			if (!ioerr) {
+				ba.ba_phys->bt_begin++;
+				(void) dmu_free_range(os, obj,
+				    i * sizeof (bte), sizeof (bte), tx);
+			}
+		} else if (err != 0) {
+			break;
 		}
 	}
 
-	ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+	ASSERT(!free || err != 0 || ioerr ||
+	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
 
 	/* if all blocks are free there should be no used space */
 	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+		if (zfs_free_leak_on_eio) {
+			ba.ba_phys->bt_bytes = 0;
+			ba.ba_phys->bt_comp = 0;
+			ba.ba_phys->bt_uncomp = 0;
+		}
+
 		ASSERT0(ba.ba_phys->bt_bytes);
 		ASSERT0(ba.ba_phys->bt_comp);
 		ASSERT0(ba.ba_phys->bt_uncomp);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -58,6 +58,7 @@ typedef struct traverse_data {
 	zbookmark_t *td_resume;
 	int td_flags;
 	prefetch_data_t *td_pfd;
+	boolean_t td_paused;
 	blkptr_cb_t *td_func;
 	void *td_arg;
 } traverse_data_t;
@@ -163,7 +164,6 @@ resume_skip_check(traverse_data_t *td, c
 		 * If we found the block we're trying to resume from, zero
 		 * the bookmark out to indicate that we have resumed.
 		 */
-		ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
 		if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
 			bzero(td->td_resume, sizeof (*zb));
 			if (td->td_flags & TRAVERSE_POST)
@@ -174,14 +174,6 @@ resume_skip_check(traverse_data_t *td, c
 }
 
 static void
-traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
-{
-	ASSERT(td->td_resume != NULL);
-	ASSERT0(zb->zb_level);
-	bcopy(zb, td->td_resume, sizeof (*td->td_resume));
-}
-
-static void
 traverse_prefetch_metadata(traverse_data_t *td,
     const blkptr_t *bp, const zbookmark_t *zb)
 {
@@ -210,11 +202,10 @@ traverse_visitbp(traverse_data_t *td, co
     const blkptr_t *bp, const zbookmark_t *zb)
 {
 	zbookmark_t czb;
-	int err = 0, lasterr = 0;
+	int err = 0;
 	arc_buf_t *buf = NULL;
 	prefetch_data_t *pd = td->td_pfd;
 	boolean_t hard = td->td_flags & TRAVERSE_HARD;
-	boolean_t pause = B_FALSE;
 
 	switch (resume_skip_check(td, dnp, zb)) {
 	case RESUME_SKIP_ALL:
@@ -253,7 +244,9 @@ traverse_visitbp(traverse_data_t *td, co
 
 	if (BP_IS_HOLE(bp)) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
-		return (err);
+		if (err != 0)
+			goto post;
+		return (0);
 	}
 
 	if (pd && !pd->pd_exited &&
@@ -273,8 +266,6 @@ traverse_visitbp(traverse_data_t *td, co
 		    td->td_arg);
 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
-		if (err == ERESTART)
-			pause = B_TRUE; /* handle pausing at a common point */
 		if (err != 0)
 			goto post;
 	}
@@ -288,7 +279,7 @@ traverse_visitbp(traverse_data_t *td, co
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
-			return (err);
+			goto post;
 		cbp = buf->b_data;
 
 		for (i = 0; i < epb; i++) {
@@ -304,11 +295,8 @@ traverse_visitbp(traverse_data_t *td, co
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
-			if (err != 0) {
-				if (!hard)
-					break;
-				lasterr = err;
-			}
+			if (err != 0)
+				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		uint32_t flags = ARC_WAIT;
@@ -318,7 +306,7 @@ traverse_visitbp(traverse_data_t *td, co
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
-			return (err);
+			goto post;
 		dnp = buf->b_data;
 
 		for (i = 0; i < epb; i++) {
@@ -330,11 +318,8 @@ traverse_visitbp(traverse_data_t *td, co
 		for (i = 0; i < epb; i++) {
 			err = traverse_dnode(td, &dnp[i], zb->zb_objset,
 			    zb->zb_blkid * epb + i);
-			if (err != 0) {
-				if (!hard)
-					break;
-				lasterr = err;
-			}
+			if (err != 0)
+				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		uint32_t flags = ARC_WAIT;
@@ -344,7 +329,7 @@ traverse_visitbp(traverse_data_t *td, co
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
-			return (err);
+			goto post;
 
 		osp = buf->b_data;
 		dnp = &osp->os_meta_dnode;
@@ -359,19 +344,11 @@ traverse_visitbp(traverse_data_t *td, co
 
 		err = traverse_dnode(td, dnp, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
-		if (err && hard) {
-			lasterr = err;
-			err = 0;
-		}
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			dnp = &osp->os_groupused_dnode;
 			err = traverse_dnode(td, dnp, zb->zb_objset,
 			    DMU_GROUPUSED_OBJECT);
 		}
-		if (err && hard) {
-			lasterr = err;
-			err = 0;
-		}
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			dnp = &osp->os_userused_dnode;
 			err = traverse_dnode(td, dnp, zb->zb_objset,
@@ -383,19 +360,37 @@ traverse_visitbp(traverse_data_t *td, co
 		(void) arc_buf_remove_ref(buf, &buf);
 
 post:
-	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+	if (err == 0 && (td->td_flags & TRAVERSE_POST))
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
-		if (err == ERESTART)
-			pause = B_TRUE;
+
+	if (hard && (err == EIO || err == ECKSUM)) {
+		/*
+		 * Ignore this disk error as requested by the HARD flag,
+		 * and continue traversal.
+		 */
+		err = 0;
 	}
 
-	if (pause && td->td_resume != NULL) {
-		ASSERT3U(err, ==, ERESTART);
-		ASSERT(!hard);
-		traverse_pause(td, zb);
+	/*
+	 * If we are stopping here, set td_resume.
+	 */
+	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
+		td->td_resume->zb_objset = zb->zb_objset;
+		td->td_resume->zb_object = zb->zb_object;
+		td->td_resume->zb_level = 0;
+		/*
+		 * If we have stopped on an indirect block (e.g. due to
+		 * i/o error), we have not visited anything below it.
+		 * Set the bookmark to the first level-0 block that we need
+		 * to visit.  This way, the resuming code does not need to
+		 * deal with resuming from indirect blocks.
+		 */
+		td->td_resume->zb_blkid = zb->zb_blkid <<
+		    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+		td->td_paused = B_TRUE;
 	}
 
-	return (err != 0 ? err : lasterr);
+	return (err);
 }
 
 static void
@@ -420,30 +415,21 @@ static int
 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
-	int j, err = 0, lasterr = 0;
+	int j, err = 0;
 	zbookmark_t czb;
-	boolean_t hard = (td->td_flags & TRAVERSE_HARD);
 
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
-		if (err != 0) {
-			if (!hard)
-				break;
-			lasterr = err;
-		}
+		if (err != 0)
+			break;
 	}
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
-		if (err != 0) {
-			if (!hard)
-				return (err);
-			lasterr = err;
-		}
 	}
-	return (err != 0 ? err : lasterr);
+	return (err);
 }
 
 /* ARGSUSED */
@@ -530,6 +516,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t 
 	td.td_arg = arg;
 	td.td_pfd = &pd;
 	td.td_flags = flags;
+	td.td_paused = B_FALSE;
 
 	pd.pd_blks_max = zfs_pd_blks_max;
 	pd.pd_flags = flags;
@@ -603,7 +590,7 @@ int
 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
-	int err, lasterr = 0;
+	int err;
 	uint64_t obj;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	objset_t *mos = dp->dp_meta_objset;
@@ -616,16 +603,15 @@ traverse_pool(spa_t *spa, uint64_t txg_s
 		return (err);
 
 	/* visit each dataset */
-	for (obj = 1; err == 0 || (err != ESRCH && hard);
+	for (obj = 1; err == 0;
 	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
 		dmu_object_info_t doi;
 
 		err = dmu_object_info(mos, obj, &doi);
 		if (err != 0) {
-			if (!hard)
-				return (err);
-			lasterr = err;
-			continue;
+			if (hard)
+				continue;
+			break;
 		}
 
 		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
@@ -636,23 +622,19 @@ traverse_pool(spa_t *spa, uint64_t txg_s
 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 			dsl_pool_config_exit(dp, FTAG);
 			if (err != 0) {
-				if (!hard)
-					return (err);
-				lasterr = err;
-				continue;
+				if (hard)
+					continue;
+				break;
 			}
 			if (ds->ds_phys->ds_prev_snap_txg > txg)
 				txg = ds->ds_phys->ds_prev_snap_txg;
 			err = traverse_dataset(ds, txg, flags, func, arg);
 			dsl_dataset_rele(ds, FTAG);
-			if (err != 0) {
-				if (!hard)
-					return (err);
-				lasterr = err;
-			}
+			if (err != 0)
+				break;
 		}
 	}
 	if (err == ESRCH)
 		err = 0;
-	return (err != 0 ? err : lasterr);
+	return (err);
 }

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -329,6 +329,13 @@ dsl_pool_open(dsl_pool_t *dp)
 		    dp->dp_meta_objset, obj));
 	}
 
+	/*
+	 * Note: errors ignored, because the leak dir will not exist if we
+	 * have not encountered a leak yet.
+	 */
+	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
+	    &dp->dp_leak_dir);
+
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
@@ -376,6 +383,8 @@ dsl_pool_close(dsl_pool_t *dp)
 		dsl_dir_rele(dp->dp_mos_dir, dp);
 	if (dp->dp_free_dir)
 		dsl_dir_rele(dp->dp_free_dir, dp);
+	if (dp->dp_leak_dir)
+		dsl_dir_rele(dp->dp_leak_dir, dp);
 	if (dp->dp_root_dir)
 		dsl_dir_rele(dp->dp_root_dir, dp);
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -52,9 +52,7 @@
 
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
 
-static scan_cb_t dsl_scan_defrag_cb;
 static scan_cb_t dsl_scan_scrub_cb;
-static scan_cb_t dsl_scan_remove_cb;
 static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
 
@@ -68,7 +66,7 @@ unsigned int zfs_free_min_time_ms = 1000
 unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
 						 per txg */
 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN,
@@ -1380,7 +1378,7 @@ dsl_scan_active(dsl_scan_t *scn)
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
 	if (scn->scn_phys.scn_state == DSS_SCANNING ||
-	    scn->scn_async_destroying)
+	    (scn->scn_async_destroying && !scn->scn_async_stalled))
 		return (B_TRUE);
 
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
@@ -1395,7 +1393,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
-	int err;
+	int err = 0;
 
 	/*
 	 * Check for scn_restart_txg before checking spa_load_state, so
@@ -1413,7 +1411,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *
 		dsl_scan_setup_sync(&func, tx);
 	}
 
-	if (!dsl_scan_active(scn) ||
+	/*
+	 * If the scan is inactive due to a stalled async destroy, try again.
+	 */
+	if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
 	    spa_sync_pass(dp->dp_spa) > 1)
 		return;
 
@@ -1423,10 +1424,11 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *
 	spa->spa_scrub_active = B_TRUE;
 
 	/*
-	 * First process the free list.  If we pause the free, don't do
-	 * any scanning.  This ensures that there is no free list when
-	 * we are scanning, so the scan code doesn't have to worry about
-	 * traversing it.
+	 * First process the async destroys.  If we pause, don't do
+	 * any scrubbing or resilvering.  This ensures that there are no
+	 * async destroys while we are scanning, so the scan code doesn't
+	 * have to worry about traversing it.  It is also faster to free the
+	 * blocks than to scrub them.
 	 */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		scn->scn_is_bptree = B_FALSE;
@@ -1436,48 +1438,92 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *
 		    dsl_scan_free_block_cb, scn, tx);
 		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
 
-		if (err == 0 && spa_feature_is_active(spa,
-		    SPA_FEATURE_ASYNC_DESTROY)) {
-			ASSERT(scn->scn_async_destroying);
-			scn->scn_is_bptree = B_TRUE;
-			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
-			    NULL, ZIO_FLAG_MUSTSUCCEED);
-			err = bptree_iterate(dp->dp_meta_objset,
-			    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
-			    scn, tx);
-			VERIFY0(zio_wait(scn->scn_zio_root));
-
-			if (err == 0) {
-				/* finished; deactivate async destroy feature */
-				spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY,
-				    tx);
-				ASSERT(!spa_feature_is_active(spa,
-				    SPA_FEATURE_ASYNC_DESTROY));
-				VERIFY0(zap_remove(dp->dp_meta_objset,
-				    DMU_POOL_DIRECTORY_OBJECT,
-				    DMU_POOL_BPTREE_OBJ, tx));
-				VERIFY0(bptree_free(dp->dp_meta_objset,
-				    dp->dp_bptree_obj, tx));
-				dp->dp_bptree_obj = 0;
-				scn->scn_async_destroying = B_FALSE;
-			}
-		}
-		if (scn->scn_visited_this_txg) {
-			zfs_dbgmsg("freed %llu blocks in %llums from "
-			    "free_bpobj/bptree txg %llu",
-			    (longlong_t)scn->scn_visited_this_txg,
-			    (longlong_t)
-			    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
-			    (longlong_t)tx->tx_txg);
-			scn->scn_visited_this_txg = 0;
-			/*
-			 * Re-sync the ddt so that we can further modify
-			 * it when doing bprewrite.
-			 */
-			ddt_sync(spa, tx->tx_txg);
+		if (err != 0 && err != ERESTART)
+			zfs_panic_recover("error %u from bpobj_iterate()", err);
+	}
+
+	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+		ASSERT(scn->scn_async_destroying);
+		scn->scn_is_bptree = B_TRUE;
+		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		    NULL, ZIO_FLAG_MUSTSUCCEED);
+		err = bptree_iterate(dp->dp_meta_objset,
+		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
+		VERIFY0(zio_wait(scn->scn_zio_root));
+
+		if (err == EIO || err == ECKSUM) {
+			err = 0;
+		} else if (err != 0 && err != ERESTART) {
+			zfs_panic_recover("error %u from "
+			    "traverse_dataset_destroyed()", err);
 		}
-		if (err == ERESTART)
-			return;
+
+		/*
+		 * If we didn't make progress, mark the async destroy as
+		 * stalled, so that we will not initiate a spa_sync() on
+		 * its behalf.
+		 */
+		scn->scn_async_stalled = (scn->scn_visited_this_txg == 0);
+
+		if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
+			/* finished; deactivate async destroy feature */
+			spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
+			ASSERT(!spa_feature_is_active(spa,
+			    SPA_FEATURE_ASYNC_DESTROY));
+			VERIFY0(zap_remove(dp->dp_meta_objset,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_BPTREE_OBJ, tx));
+			VERIFY0(bptree_free(dp->dp_meta_objset,
+			    dp->dp_bptree_obj, tx));
+			dp->dp_bptree_obj = 0;
+			scn->scn_async_destroying = B_FALSE;
+		}
+	}
+	if (scn->scn_visited_this_txg) {
+		zfs_dbgmsg("freed %llu blocks in %llums from "
+		    "free_bpobj/bptree txg %llu; err=%u",
+		    (longlong_t)scn->scn_visited_this_txg,
+		    (longlong_t)
+		    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+		    (longlong_t)tx->tx_txg, err);
+		scn->scn_visited_this_txg = 0;
+
+		/*
+		 * Write out changes to the DDT that may be required as a
+		 * result of the blocks freed.  This ensures that the DDT
+		 * is clean when a scrub/resilver runs.
+		 */
+		ddt_sync(spa, tx->tx_txg);
+	}
+	if (err != 0)
+		return;
+	if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
+	    (dp->dp_free_dir->dd_phys->dd_used_bytes != 0 ||
+	    dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 ||
+	    dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) {
+		/*
+		 * We have finished background destroying, but there is still
+		 * some space left in the dp_free_dir. Transfer this leaked
+		 * space to the dp_leak_dir.
+		 */
+		if (dp->dp_leak_dir == NULL) {
+			rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+			    LEAK_DIR_NAME, tx);
+			VERIFY0(dsl_pool_open_special_dir(dp,
+			    LEAK_DIR_NAME, &dp->dp_leak_dir));
+			rrw_exit(&dp->dp_config_rwlock, FTAG);
+		}
+		dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+		    dp->dp_free_dir->dd_phys->dd_used_bytes,
+		    dp->dp_free_dir->dd_phys->dd_compressed_bytes,
+		    dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
+		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+		    -dp->dp_free_dir->dd_phys->dd_used_bytes,
+		    -dp->dp_free_dir->dd_phys->dd_compressed_bytes,
+		    -dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
+	}
+	if (!scn->scn_async_destroying) {
 		/* finished; verify that space accounting went to zero */
 		ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
 		ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -257,19 +257,25 @@ spa_prop_get_config(spa_t *spa, nvlist_t
 	}
 
 	if (pool != NULL) {
-		dsl_dir_t *freedir = pool->dp_free_dir;
-
 		/*
 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
 		 * when opening pools before this version freedir will be NULL.
 		 */
-		if (freedir != NULL) {
+		if (pool->dp_free_dir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
-			    freedir->dd_phys->dd_used_bytes, src);
+			    pool->dp_free_dir->dd_phys->dd_used_bytes, src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 			    NULL, 0, src);
 		}
+
+		if (pool->dp_leak_dir != NULL) {
+			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
+			    pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
+		} else {
+			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
+			    NULL, 0, src);
+		}
 	}
 
 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -254,12 +254,43 @@ SYSCTL_INT(_debug, OID_AUTO, zfs_flags, 
  * This should only be used as a last resort, as it typically results
  * in leaked space, or worse.
  */
-int zfs_recover = 0;
+boolean_t zfs_recover = B_FALSE;
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
     "Try to recover from otherwise-fatal errors.");
 
 /*
+ * If destroy encounters an EIO while reading metadata (e.g. indirect
+ * blocks), space referenced by the missing metadata can not be freed.
+ * Normally this causes the background destroy to become "stalled", as
+ * it is unable to make forward progress.  While in this stalled state,
+ * all remaining space to free from the error-encountering filesystem is
+ * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
+ * permanently leak the space from indirect blocks that can not be read,
+ * and continue to free everything else that it can.
+ *
+ * The default, "stalling" behavior is useful if the storage partially
+ * fails (i.e. some but not all i/os fail), and then later recovers.  In
+ * this case, we will be able to continue pool operations while it is
+ * partially failed, and when it recovers, we can continue to free the
+ * space, with no leaks.  However, note that this case is actually
+ * fairly rare.
+ *
+ * Typically pools either (a) fail completely (but perhaps temporarily,
+ * e.g. a top-level vdev going offline), or (b) have localized,
+ * permanent errors (e.g. disk returns the wrong data due to bit flip or
+ * firmware bug).  In case (a), this setting does not matter because the
+ * pool will be suspended and the sync thread will not be able to make
+ * forward progress regardless.  In case (b), because the error is
+ * permanent, the best we can do is leak the minimum amount of space,
+ * which is what setting this flag will do.  Therefore, it is reasonable
+ * for this flag to normally be set, but we chose the more conservative
+ * approach of not setting it, so that there is no possibility of
+ * leaking space in the "partial temporary" failure case.
+ */
+boolean_t zfs_free_leak_on_eio = B_FALSE;
+
+/*
  * Expiration time in milliseconds. This value has two meanings. First it is
  * used to determine when the spa_deadman() logic should fire. By default the
  * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h	Tue Jul  1 07:29:42 2014	(r268079)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_BPTREE_H
@@ -50,6 +50,7 @@ typedef int bptree_itor_t(void *arg, con
 
 uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
 int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
 
 void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	Tue Jul  1 07:29:42 2014	(r268079)
@@ -253,7 +253,6 @@ void zfs_znode_byteswap(void *buf, size_
 
 #define	DMU_USERUSED_OBJECT	(-1ULL)
 #define	DMU_GROUPUSED_OBJECT	(-2ULL)
-#define	DMU_DEADLIST_OBJECT	(-3ULL)
 
 /*
  * artificial blkids for bonus buffer and spill blocks

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h	Tue Jul  1 07:29:42 2014	(r268079)
@@ -160,6 +160,7 @@ boolean_t dsl_dir_is_zapified(dsl_dir_t 
 #define	ORIGIN_DIR_NAME "$ORIGIN"
 #define	XLATION_DIR_NAME "$XLATION"
 #define	FREE_DIR_NAME "$FREE"
+#define	LEAK_DIR_NAME "$LEAK"
 
 #ifdef ZFS_DEBUG
 #define	dprintf_dd(dd, fmt, ...) do { \

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	Tue Jul  1 07:29:42 2014	(r268079)
@@ -84,6 +84,7 @@ typedef struct dsl_pool {
 	struct dsl_dir *dp_root_dir;
 	struct dsl_dir *dp_mos_dir;
 	struct dsl_dir *dp_free_dir;
+	struct dsl_dir *dp_leak_dir;
 	struct dsl_dataset *dp_origin_snap;
 	uint64_t dp_root_dir_obj;
 	struct taskq *dp_vnrele_taskq;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h	Tue Jul  1 07:29:42 2014	(r268079)
@@ -114,6 +114,7 @@ typedef struct dsl_scan {
 	/* for freeing blocks */
 	boolean_t scn_is_bptree;
 	boolean_t scn_async_destroying;
+	boolean_t scn_async_stalled;
 
 	/* for debugging / information */
 	uint64_t scn_visited_this_txg;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h	Tue Jul  1 07:29:42 2014	(r268079)
@@ -49,7 +49,8 @@ extern "C" {
 #endif
 
 extern int zfs_flags;
-extern int zfs_recover;
+extern boolean_t zfs_recover;
+extern boolean_t zfs_free_leak_on_eio;
 
 #define	ZFS_DEBUG_DPRINTF	(1<<0)
 #define	ZFS_DEBUG_DBUF_VERIFY	(1<<1)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -28,7 +28,7 @@
 list_t zfs_dbgmsgs;
 int zfs_dbgmsg_size;
 kmutex_t zfs_dbgmsgs_lock;
-int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
 
 void
 zfs_dbgmsg_init(void)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Tue Jul  1 07:29:42 2014	(r268079)
@@ -3378,13 +3378,6 @@ zbookmark_is_before(const dnode_phys_t *
 	ASSERT(zb1->zb_objset == zb2->zb_objset);
 	ASSERT(zb2->zb_level == 0);
 
-	/*
-	 * A bookmark in the deadlist is considered to be after
-	 * everything else.
-	 */
-	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-		return (B_TRUE);
-
 	/* The objset_phys_t isn't before anything. */
 	if (dnp == NULL)
 		return (B_FALSE);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h	Tue Jul  1 07:13:41 2014	(r268078)
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h	Tue Jul  1 07:29:42 2014	(r268079)
@@ -193,6 +193,7 @@ typedef enum {
 	ZPOOL_PROP_COMMENT,
 	ZPOOL_PROP_EXPANDSZ,
 	ZPOOL_PROP_FREEING,
+	ZPOOL_PROP_LEAKED,
 	ZPOOL_NUM_PROPS
 } zpool_prop_t;
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201407010729.s617Tgts010424>