Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 2 Aug 2018 23:59:52 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r337210 - in vendor-sys/illumos/dist/uts/common/fs/zfs: . sys
Message-ID:  <201808022359.w72NxqhY029681@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Thu Aug  2 23:59:52 2018
New Revision: 337210
URL: https://svnweb.freebsd.org/changeset/base/337210

Log:
  9577 remove zfs_dbuf_evict_key tsd
  
  The zfs_dbuf_evict_key TSD (thread-specific data) is not necessary - we can
  instead pass a flag down in a few places to prevent recursive dbuf eviction.
  Making this change has 3 benefits:
  
  1. The code semantics are easier to understand.
  2. On Linux, performance is improved, because creating/removing TSD values
  (by setting to NULL vs non-NULL) is expensive, and we do it very often.
  3. According to Nexenta, the current semantics can cause a deadlock when
  concurrently calling dmu_objset_evict_dbufs() (which is rare today, but they
  are working on a "parallel unmount" change that triggers this more easily)
  
  illumos/illumos-gate@c2919acbea007fa95c709b60d073db9a24526e01
  
  Reviewed by: George Wilson <george.wilson@delphix.com>
  Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
  Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
  Reviewed by: Andy Stormont <astormont@racktopsystems.com>
  Approved by: Richard Lowe <richlowe@richlowe.net>
  Author:     Matthew Ahrens <mahrens@delphix.com>

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Thu Aug  2 23:56:07 2018	(r337209)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Thu Aug  2 23:59:52 2018	(r337210)
@@ -51,8 +51,6 @@
 #include <sys/cityhash.h>
 #include <sys/spa_impl.h>
 
-uint_t zfs_dbuf_evict_key;
-
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
@@ -524,14 +522,6 @@ dbuf_evict_one(void)
 
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
-	/*
-	 * Set the thread's tsd to indicate that it's processing evictions.
-	 * Once a thread stops evicting from the dbuf cache it will
-	 * reset its tsd to NULL.
-	 */
-	ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
-	(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
-
 	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
 	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
 		db = multilist_sublist_prev(mls, db);
@@ -551,7 +541,6 @@ dbuf_evict_one(void)
 	} else {
 		multilist_sublist_unlock(mls);
 	}
-	(void) tsd_set(zfs_dbuf_evict_key, NULL);
 }
 
 /*
@@ -605,30 +594,7 @@ dbuf_evict_thread(void *unused)
 static void
 dbuf_evict_notify(void)
 {
-
 	/*
-	 * We use thread specific data to track when a thread has
-	 * started processing evictions. This allows us to avoid deeply
-	 * nested stacks that would have a call flow similar to this:
-	 *
-	 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
-	 *	^						|
-	 *	|						|
-	 *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
-	 *
-	 * The dbuf_eviction_thread will always have its tsd set until
-	 * that thread exits. All other threads will only set their tsd
-	 * if they are participating in the eviction process. This only
-	 * happens if the eviction thread is unable to process evictions
-	 * fast enough. To keep the dbuf cache size in check, other threads
-	 * can evict from the dbuf cache directly. Those threads will set
-	 * their tsd values so that we ensure that they only evict one dbuf
-	 * from the dbuf cache.
-	 */
-	if (tsd_get(zfs_dbuf_evict_key) != NULL)
-		return;
-
-	/*
 	 * We check if we should evict without holding the dbuf_evict_lock,
 	 * because it's OK to occasionally make the wrong decision here,
 	 * and grabbing the lock results in massive lock contention.
@@ -704,7 +670,6 @@ retry:
 		refcount_create(&dbuf_caches[dcs].size);
 	}
 
-	tsd_create(&zfs_dbuf_evict_key, NULL);
 	dbuf_evict_thread_exit = B_FALSE;
 	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
@@ -731,7 +696,6 @@ dbuf_fini(void)
 		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
 	}
 	mutex_exit(&dbuf_evict_lock);
-	tsd_destroy(&zfs_dbuf_evict_key);
 
 	mutex_destroy(&dbuf_evict_lock);
 	cv_destroy(&dbuf_evict_cv);
@@ -1011,7 +975,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		db->db_state = DB_CACHED;
 	}
 	cv_broadcast(&db->db_changed);
-	dbuf_rele_and_unlock(db, NULL);
+	dbuf_rele_and_unlock(db, NULL, B_FALSE);
 }
 
 static void
@@ -2171,7 +2135,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
 		 * release any lock.
 		 */
-		dnode_rele(dn, db);
+		mutex_enter(&dn->dn_mtx);
+		dnode_rele_and_unlock(dn, db, B_TRUE);
 		db->db_dnode_handle = NULL;
 
 		dbuf_hash_remove(db);
@@ -2198,8 +2163,10 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
-	if (parent && parent != dndb)
-		dbuf_rele(parent, db);
+	if (parent && parent != dndb) {
+		mutex_enter(&parent->db_mtx);
+		dbuf_rele_and_unlock(parent, db, B_TRUE);
+	}
 }
 
 /*
@@ -2823,7 +2790,7 @@ void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
 	mutex_enter(&db->db_mtx);
-	dbuf_rele_and_unlock(db, tag);
+	dbuf_rele_and_unlock(db, tag, B_FALSE);
 }
 
 void
@@ -2834,10 +2801,19 @@ dmu_buf_rele(dmu_buf_t *db, void *tag)
 
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
- * db_dirtycnt and db_holds to be updated atomically.
+ * db_dirtycnt and db_holds to be updated atomically.  The 'evicting'
+ * argument should be set if we are already in the dbuf-evicting code
+ * path, in which case we don't want to recursively evict.  This allows us to
+ * avoid deeply nested stacks that would have a call flow similar to this:
+ *
+ * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+ *	^						|
+ *	|						|
+ *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *
  */
 void
-dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
 {
 	int64_t holds;
 
@@ -2940,7 +2916,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
 				    db->db.db_size, db);
 				mutex_exit(&db->db_mtx);
 
-				if (db->db_caching_status == DB_DBUF_CACHE) {
+				if (db->db_caching_status == DB_DBUF_CACHE &&
+				    !evicting) {
 					dbuf_evict_notify();
 				}
 			}
@@ -3203,7 +3180,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
-		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
 		return;
 	}
 
@@ -3553,7 +3530,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
-	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
+	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 }
 
 static void

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c	Thu Aug  2 23:56:07 2018	(r337209)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c	Thu Aug  2 23:59:52 2018	(r337210)
@@ -1229,11 +1229,11 @@ void
 dnode_rele(dnode_t *dn, void *tag)
 {
 	mutex_enter(&dn->dn_mtx);
-	dnode_rele_and_unlock(dn, tag);
+	dnode_rele_and_unlock(dn, tag, B_FALSE);
 }
 
 void
-dnode_rele_and_unlock(dnode_t *dn, void *tag)
+dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
 {
 	uint64_t refs;
 	/* Get while the hold prevents the dnode from moving. */
@@ -1264,7 +1264,8 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag)
 		 * that the handle has zero references, but that will be
 		 * asserted anyway when the handle gets destroyed.
 		 */
-		dbuf_rele(db, dnh);
+		mutex_enter(&db->db_mtx);
+		dbuf_rele_and_unlock(db, dnh, evicting);
 	}
 }
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c	Thu Aug  2 23:56:07 2018	(r337209)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c	Thu Aug  2 23:59:52 2018	(r337210)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -439,6 +439,19 @@ dnode_evict_dbufs(dnode_t *dn)
 			avl_insert_here(&dn->dn_dbufs, &db_marker, db,
 			    AVL_BEFORE);
 
+			/*
+			 * We need to use the "marker" dbuf rather than
+			 * simply getting the next dbuf, because
+			 * dbuf_destroy() may actually remove multiple dbufs.
+			 * It can call itself recursively on the parent dbuf,
+			 * which may also be removed from dn_dbufs.  The code
+			 * flow would look like:
+			 *
+			 * dbuf_destroy():
+			 *   dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
+			 *	if (!cacheable || pending_evict)
+			 *	  dbuf_destroy()
+			 */
 			dbuf_destroy(db);
 
 			db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
@@ -497,7 +510,7 @@ dnode_undirty_dbufs(list_t *list)
 			list_destroy(&dr->dt.di.dr_children);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
-		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
 	}
 }
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h	Thu Aug  2 23:56:07 2018	(r337209)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h	Thu Aug  2 23:59:52 2018	(r337210)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
@@ -303,7 +303,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
-void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting);
 
 dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
     uint64_t blkid);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h	Thu Aug  2 23:56:07 2018	(r337209)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h	Thu Aug  2 23:59:52 2018	(r337210)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -291,7 +291,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object
     void *ref, dnode_t **dnp);
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
-void dnode_rele_and_unlock(dnode_t *dn, void *tag);
+void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
 void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201808022359.w72NxqhY029681>