Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 19 Jun 2015 06:58:06 +0000 (UTC)
From:      Andriy Gapon <avg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r284593 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Message-ID:  <201506190658.t5J6w6L0042911@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: avg
Date: Fri Jun 19 06:58:05 2015
New Revision: 284593
URL: https://svnweb.freebsd.org/changeset/base/284593

Log:
  MFV r284412: 5911 ZFS "hangs" while deleting file
  
  Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
  Reviewed by: Alek Pinchuk <alek@nexenta.com>
  Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
  Reviewed by: Dan McDonald <danmcd@omniti.com>
  Approved by: Richard Lowe <richlowe@richlowe.net>
  Author: Matthew Ahrens <mahrens@delphix.com>
  
  illumos/illumos-gate@46e1baa6cf6d5432f5fd231bb588df8f9570c858
  
  https://www.illumos.org/issues/5911
  Sometimes ZFS appears to hang while deleting a file. It is actually
  making slow progress at the file deletion, but other operations
  (administrative and writes via the data path) "hang" until the file
  removal completes, which can take a long time if the file has many
  blocks. The deletion (or most of it) happens in a single txg, and the
  sync thread spends most of its time reading indirect blocks via this
  stack trace:
  	swtch+0x141()
  	cv_wait+0x70()
  	zio_wait+0x5b()
  	dbuf_read+0x2c0()
  	free_children+0x50()
  	free_children+0x12a()
  	free_children+0x12a()
  	free_children+0x12a()
  	dnode_sync_free_range_impl+0xdf()
  	dnode_sync_free_range+0x52()
  	range_tree_vacate+0x65()
  	dnode_sync+0x1d8()
  	dmu_objset_sync_dnodes+0x77()
  	dmu_objset_sync+0x19f()
  	dsl_dataset_sync+0x51()
  	dsl_pool_sync+0x9a()
  	spa_sync+0x2ff()
  	txg_sync_thread+0x21f()
  	thread_start+8()
  One way to reproduce the problem is if we are over the arc_meta_limit,
  e.g. because lots of indirect blocks are pinned because we have L0
  dbufs under them.  It could be that most of the L1 indirects are cached,
  in which case when dmu_free_long_range_impl() calls dmu_tx_hold_free(),
  it will complete very quickly. This allows dmu_free_long_range_impl() to
  put many (perhaps all of its) transactions in the same TXG. However,
  dmu_free_long_range_impl() calls dnode_evict_dbufs (and
  dnode_free_range()), which removes the L0 dbufs, thus reducing the hold
  count on the L1 indirect blocks above it, allowing them to be evicted.
  Because we are over the arc_meta_limit(), these L1 blocks will be
  evicted ASAP. Thus when we get to syncing context, the L1 indirects are
  no longer cached and must be read in.
  
  Obtained from:	illumos
  MFC after:	15 days

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Fri Jun 19 06:48:55 2015	(r284592)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Fri Jun 19 06:58:05 2015	(r284593)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
@@ -1298,6 +1298,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 	dbuf_dirty_record_t *dr, **drp;
 
 	ASSERT(txg != 0);
+
+	/*
+	 * Due to our use of dn_nlevels below, this can only be called
+	 * in open context, unless we are operating on the MOS.
+	 * From syncing context, dn_nlevels may be different from the
+	 * dn_nlevels used when dbuf was dirtied.
+	 */
+	ASSERT(db->db_objset ==
+	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1320,11 +1330,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 
 	ASSERT(db->db.db_size != 0);
 
-	/*
-	 * Any space we accounted for in dp_dirty_* will be cleaned up by
-	 * dsl_pool_sync().  This is relatively rare so the discrepancy
-	 * is not a big deal.
-	 */
+	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+	    dr->dr_accounted, txg);
 
 	*drp = dr->dr_next;
 
@@ -1339,7 +1346,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
-	    db->db_level+1 == dn->dn_nlevels) {
+	    db->db_level + 1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
@@ -1356,11 +1363,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
 	}
 
-	if (db->db_level != 0) {
-		mutex_destroy(&dr->dt.di.dr_mtx);
-		list_destroy(&dr->dt.di.dr_children);
-	}
-
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
@@ -2318,7 +2320,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *
 
 	zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
-	dbuf_sync_list(&dr->dt.di.dr_children, tx);
+	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	zio_nowait(zio);
@@ -2464,7 +2466,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, 
 }
 
 void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
@@ -2481,6 +2483,10 @@ dbuf_sync_list(list_t *list, dmu_tx_t *t
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
+		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+		}
 		list_remove(list, dr);
 		if (dr->dr_dbuf->db_level > 0)
 			dbuf_sync_indirect(dr, tx);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Fri Jun 19 06:48:55 2015	(r284592)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Fri Jun 19 06:58:05 2015	(r284593)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -686,7 +686,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t 
 			uint64_t ibyte = i << shift;
 			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
-			if (err == ESRCH)
+			if (err == ESRCH || i > end)
 				break;
 			if (err) {
 				tx->tx_err = err;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	Fri Jun 19 06:48:55 2015	(r284592)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	Fri Jun 19 06:58:05 2015	(r284593)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -1492,6 +1492,16 @@ out:
 		rw_downgrade(&dn->dn_struct_rwlock);
 }
 
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+	if (db != NULL) {
+		dmu_buf_will_dirty(&db->db, tx);
+		dbuf_rele(db, FTAG);
+	}
+}
+
 void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
@@ -1612,27 +1622,67 @@ dnode_free_range(dnode_t *dn, uint64_t o
 		nblks += 1;
 
 	/*
-	 * Dirty the first and last indirect blocks, as they (and/or their
-	 * parents) will need to be written out if they were only
-	 * partially freed.  Interior indirect blocks will be themselves freed,
-	 * by free_children(), so they need not be dirtied.  Note that these
-	 * interior blocks have already been prefetched by dmu_tx_hold_free().
+	 * Dirty all the indirect blocks in this range.  Note that only
+	 * the first and last indirect blocks can actually be written
+	 * (if they were partially freed) -- they must be dirtied, even if
+	 * they do not exist on disk yet.  The interior blocks will
+	 * be freed by free_children(), so they will not actually be written.
+	 * Even though these interior blocks will not be written, we
+	 * dirty them for two reasons:
+	 *
+	 *  - It ensures that the indirect blocks remain in memory until
+	 *    syncing context.  (They have already been prefetched by
+	 *    dmu_tx_hold_free(), so we don't have to worry about reading
+	 *    them serially here.)
+	 *
+	 *  - The dirty space accounting will put pressure on the txg sync
+	 *    mechanism to begin syncing, and to delay transactions if there
+	 *    is a large amount of freeing.  Even though these indirect
+	 *    blocks will not be written, we could need to write the same
+	 *    amount of space if we copy the freed BPs into deadlists.
 	 */
 	if (dn->dn_nlevels > 1) {
 		uint64_t first, last;
 
 		first = blkid >> epbs;
-		if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
-			dmu_buf_will_dirty(&db->db, tx);
-			dbuf_rele(db, FTAG);
-		}
+		dnode_dirty_l1(dn, first, tx);
 		if (trunc)
 			last = dn->dn_maxblkid >> epbs;
 		else
 			last = (blkid + nblks - 1) >> epbs;
-		if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
-			dmu_buf_will_dirty(&db->db, tx);
-			dbuf_rele(db, FTAG);
+		if (last != first)
+			dnode_dirty_l1(dn, last, tx);
+
+		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		    SPA_BLKPTRSHIFT;
+		for (uint64_t i = first + 1; i < last; i++) {
+			/*
+			 * Set i to the blockid of the next non-hole
+			 * level-1 indirect block at or after i.  Note
+			 * that dnode_next_offset() operates in terms of
+			 * level-0-equivalent bytes.
+			 */
+			uint64_t ibyte = i << shift;
+			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+			    &ibyte, 2, 1, 0);
+			i = ibyte >> shift;
+			if (i >= last)
+				break;
+
+			/*
+			 * Normally we should not see an error, either
+			 * from dnode_next_offset() or dbuf_hold_level()
+			 * (except for ESRCH from dnode_next_offset).
+			 * If there is an i/o error, then when we read
+			 * this block in syncing context, it will use
+			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+			 * to the "failmode" property.  dnode_next_offset()
+			 * doesn't have a flag to indicate MUSTSUCCEED.
+			 */
+			if (err != 0)
+				break;
+
+			dnode_dirty_l1(dn, i, tx);
 		}
 	}
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c	Fri Jun 19 06:48:55 2015	(r284592)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c	Fri Jun 19 06:58:05 2015	(r284593)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -712,7 +712,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 		mutex_exit(&dn->dn_mtx);
 	}
 
-	dbuf_sync_list(list, tx);
+	dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
 
 	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		ASSERT3P(list_head(list), ==, NULL);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	Fri Jun 19 06:48:55 2015	(r284592)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	Fri Jun 19 06:58:05 2015	(r284593)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
@@ -287,7 +287,7 @@ void dbuf_evict(dmu_buf_impl_t *db);
 
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201506190658.t5J6w6L0042911>