Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 11 Aug 2018 19:45:05 +0000 (UTC)
From:      Matt Macy <mmacy@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r337653 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Message-ID:  <201808111945.w7BJj5PN023832@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mmacy
Date: Sat Aug 11 19:45:04 2018
New Revision: 337653
URL: https://svnweb.freebsd.org/changeset/base/337653

Log:
  Limit the amount of dnode metadata in the ARC
  
  In addition import most recent arc_prune_async implementation as dependency
  
  commit 25458cbef9e59ef9ee6a7e729ab2522ed308f88f
  Author: Tim Chase <tim@chase2k.com>
  Date:   Wed Jul 13 07:42:40 2016 -0500
  
      Limit the amount of dnode metadata in the ARC
  
      Metadata-intensive workloads can cause the ARC to become permanently
      filled with dnode_t objects as they're pinned by the VFS layer.
      Subsequent data-intensive workloads may only benefit from about
      25% of the potential ARC (arc_c_max - arc_meta_limit).
  
      In order to help track metadata usage more precisely, the other_size
      metadata arcstat has replaced with dbuf_size, dnode_size and bonus_size.
  
      The new zfs_arc_dnode_limit tunable, which defaults to 10% of
      zfs_arc_meta_limit, defines the minimum number of bytes which is desirable
      to be consumed by dnodes.  Attempts to evict non-metadata will trigger
      async prune tasks if the space used by dnodes exceeds this limit.
  
      The new zfs_arc_dnode_reduce_percent tunable specifies the amount by
      which the excess dnode space is attempted to be pruned as a percentage of
      the amount by which zfs_arc_dnode_limit is being exceeded.  By default,
      it tries to unpin 10% of the dnodes.
  
      The problem of dnode metadata pinning was observed with the following
      testing procedure (in this example, zfs_arc_max is set to 4GiB):
  
          - Create a large number of small files until arc_meta_used exceeds
            arc_meta_limit (3GiB with default tuning) and arc_prune
            starts increasing.
  
          - Create a 3GiB file with dd.  Observe arc_mata_used.  It will still
            be around 3GiB.
  
          - Repeatedly read the 3GiB file and observe arc_meta_limit as before.
            It will continue to stay around 3GiB.
  
      With this modification, space for the 3GiB file is gradually made
      available as subsequent demands on the ARC are made.  The previous behavior
      can be restored by setting zfs_arc_dnode_limit to the same value as the
      zfs_arc_meta_limit.
  
      Signed-off-by: Tim Chase <tim@chase2k.com>
      Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
      Issue #4345
      Issue #4512
      Issue #4773
      Closes #4858

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Sat Aug 11 19:21:53 2018	(r337652)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Sat Aug 11 19:45:04 2018	(r337653)
@@ -99,6 +99,14 @@
  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  * the active state mutex must be held before the ghost state mutex.
  *
+ * It as also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted.  In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored.  For example,
+ * when using the ZPL each dentry holds a references on a znode.  These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
@@ -367,6 +375,8 @@ uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
 uint64_t zfs_arc_meta_min = 0;
+uint64_t zfs_arc_dnode_limit = 0;
+uint64_t zfs_arc_dnode_reduce_percent = 10;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_no_grow_shift = 0;
@@ -624,15 +634,18 @@ typedef struct arc_stats {
 	 */
 	kstat_named_t arcstat_metadata_size;
 	/*
-	 * Number of bytes consumed by various buffers and structures
-	 * not actually backed with ARC buffers. This includes bonus
-	 * buffers (allocated directly via zio_buf_* functions),
-	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
-	 * cache), and dnode_t structures (allocated via dnode_t cache).
-	 * Not updated directly; only synced in arc_kstat_update.
+	 * Number of bytes consumed by dmu_buf_impl_t objects.
 	 */
-	kstat_named_t arcstat_other_size;
+	kstat_named_t arcstat_dbuf_size;
 	/*
+	 * Number of bytes consumed by dnode_t objects.
+	 */
+	kstat_named_t arcstat_dnode_size;
+	/*
+	 * Number of bytes consumed by bonus buffers.
+	 */
+	kstat_named_t arcstat_bonus_size;
+	/*
 	 * Total number of bytes consumed by ARC buffers residing in the
 	 * arc_anon state. This includes *all* buffers in the arc_anon
 	 * state; e.g. data, metadata, evictable, and unevictable buffers
@@ -782,9 +795,19 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_l2_write_buffer_list_iter;
 	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
 	kstat_named_t arcstat_memory_throttle_count;
+	kstat_named_t arcstat_memory_direct_count;
+	kstat_named_t arcstat_memory_indirect_count;
+	kstat_named_t arcstat_memory_all_bytes;
+	kstat_named_t arcstat_memory_free_bytes;
+	kstat_named_t arcstat_memory_available_bytes;
+	kstat_named_t arcstat_no_grow;
+	kstat_named_t arcstat_tempreserve;
+	kstat_named_t arcstat_loaned_bytes;
+	kstat_named_t arcstat_prune;
 	/* Not updated directly; only synced in arc_kstat_update. */
 	kstat_named_t arcstat_meta_used;
 	kstat_named_t arcstat_meta_limit;
+	kstat_named_t arcstat_dnode_limit;
 	kstat_named_t arcstat_meta_max;
 	kstat_named_t arcstat_meta_min;
 	kstat_named_t arcstat_async_upgrade_sync;
@@ -833,7 +856,9 @@ static arc_stats_t arc_stats = {
 	{ "hdr_size",			KSTAT_DATA_UINT64 },
 	{ "data_size",			KSTAT_DATA_UINT64 },
 	{ "metadata_size",		KSTAT_DATA_UINT64 },
-	{ "other_size",			KSTAT_DATA_UINT64 },
+	{ "dbuf_size",			KSTAT_DATA_UINT64 },
+	{ "dnode_size",			KSTAT_DATA_UINT64 },
+	{ "bonus_size",			KSTAT_DATA_UINT64 },
 	{ "anon_size",			KSTAT_DATA_UINT64 },
 	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
@@ -882,8 +907,18 @@ static arc_stats_t arc_stats = {
 	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
 	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
+	{ "memory_direct_count",	KSTAT_DATA_UINT64 },
+	{ "memory_indirect_count",	KSTAT_DATA_UINT64 },
+	{ "memory_all_bytes",		KSTAT_DATA_UINT64 },
+	{ "memory_free_bytes",		KSTAT_DATA_UINT64 },
+	{ "memory_available_bytes",	KSTAT_DATA_UINT64 },
+	{ "arc_no_grow",		KSTAT_DATA_UINT64 },
+	{ "arc_tempreserve",		KSTAT_DATA_UINT64 },
+	{ "arc_loaned_bytes",		KSTAT_DATA_UINT64 },
+	{ "arc_prune",			KSTAT_DATA_UINT64 },
 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
+	{ "arc_dnode_limit",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
 	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
@@ -950,8 +985,12 @@ static arc_state_t	*arc_l2c_only;
 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
 #define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define	arc_dnode_limit	ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
 #define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
 #define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
+#define	arc_dbuf_size	ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
+#define	arc_dnode_size	ARCSTAT(arcstat_dnode_size) /* dnode metadata */
+#define	arc_bonus_size	ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
 
 /* compressed size of entire arc */
 #define	arc_compressed_size	ARCSTAT(arcstat_compressed_size)
@@ -973,9 +1012,15 @@ aggsum_t arc_meta_used;
 aggsum_t astat_data_size;
 aggsum_t astat_metadata_size;
 aggsum_t astat_hdr_size;
-aggsum_t astat_other_size;
+aggsum_t astat_bonus_size;
+aggsum_t astat_dnode_size;
+aggsum_t astat_dbuf_size;
 aggsum_t astat_l2_hdr_size;
 
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
+static taskq_t *arc_prune_taskq;
+
 static int		arc_no_grow;	/* Don't try to grow cache size */
 static uint64_t		arc_tempreserve;
 static uint64_t		arc_loaned_bytes;
@@ -1472,6 +1517,7 @@ static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
 static boolean_t arc_is_overflowing();
 static void arc_buf_watch(arc_buf_t *);
+static void arc_prune_async(int64_t);
 
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -2703,9 +2749,15 @@ arc_space_consume(uint64_t space, arc_space_type_t typ
 	case ARC_SPACE_META:
 		aggsum_add(&astat_metadata_size, space);
 		break;
-	case ARC_SPACE_OTHER:
-		aggsum_add(&astat_other_size, space);
+	case ARC_SPACE_BONUS:
+		aggsum_add(&astat_bonus_size, space);
 		break;
+	case ARC_SPACE_DNODE:
+		aggsum_add(&astat_dnode_size, space);
+		break;
+	case ARC_SPACE_DBUF:
+		aggsum_add(&astat_dbuf_size, space);
+		break;
 	case ARC_SPACE_HDRS:
 		aggsum_add(&astat_hdr_size, space);
 		break;
@@ -2732,9 +2784,15 @@ arc_space_return(uint64_t space, arc_space_type_t type
 	case ARC_SPACE_META:
 		aggsum_add(&astat_metadata_size, -space);
 		break;
-	case ARC_SPACE_OTHER:
-		aggsum_add(&astat_other_size, -space);
+	case ARC_SPACE_BONUS:
+		aggsum_add(&astat_bonus_size, -space);
 		break;
+	case ARC_SPACE_DNODE:
+		aggsum_add(&astat_dnode_size, -space);
+		break;
+	case ARC_SPACE_DBUF:
+		aggsum_add(&astat_dbuf_size, -space);
+		break;
 	case ARC_SPACE_HDRS:
 		aggsum_add(&astat_hdr_size, -space);
 		break;
@@ -3833,16 +3891,28 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int6
 	 * we're evicting all available buffers.
 	 */
 	while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
+		int sublist_idx = multilist_get_random_index(ml);
+		uint64_t scan_evicted = 0;
+
 		/*
+		 * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
+		 * Request that 10% of the LRUs be scanned by the superblock
+		 * shrinker.
+		 */
+		if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
+		    arc_dnode_limit) > 0) {
+			arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
+			    arc_dnode_limit) / sizeof (dnode_t) /
+			    zfs_arc_dnode_reduce_percent);
+		}
+
+		/*
 		 * Start eviction using a randomly selected sublist,
 		 * this is to try and evenly balance eviction across all
 		 * sublists. Always starting at the same sublist
 		 * (e.g. index 0) would cause evictions to favor certain
 		 * sublists over others.
 		 */
-		int sublist_idx = multilist_get_random_index(ml);
-		uint64_t scan_evicted = 0;
-
 		for (int i = 0; i < num_sublists; i++) {
 			uint64_t bytes_remaining;
 			uint64_t bytes_evicted;
@@ -3933,6 +4003,57 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_
 }
 
 /*
+ * Helper function for arc_prune_async() it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *ptr)
+{
+	arc_prune_t *ap = (arc_prune_t *)ptr;
+	arc_prune_func_t *func = ap->p_pfunc;
+
+	if (func != NULL)
+		func(ap->p_adjust, ap->p_private);
+
+	refcount_remove(&ap->p_refcnt, func);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffered they reference.  This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread().  A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+static void
+arc_prune_async(int64_t adjust)
+{
+	arc_prune_t *ap;
+
+	mutex_enter(&arc_prune_mtx);
+	for (ap = list_head(&arc_prune_list); ap != NULL;
+	    ap = list_next(&arc_prune_list, ap)) {
+
+		if (refcount_count(&ap->p_refcnt) >= 2)
+			continue;
+
+		refcount_add(&ap->p_refcnt, ap->p_pfunc);
+		ap->p_adjust = adjust;
+		if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
+		    ap, TQ_SLEEP) == TASKQID_INVALID) {
+			refcount_remove(&ap->p_refcnt, ap->p_pfunc);
+			continue;
+		}
+		ARCSTAT_BUMP(arcstat_prune);
+	}
+	mutex_exit(&arc_prune_mtx);
+}
+
+/*
  * Evict the specified number of bytes from the state specified,
  * restricting eviction to the spa and type given. This function
  * prevents us from trying to evict more from a state's list than
@@ -5764,6 +5885,43 @@ top:
 	return (0);
 }
 
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+	arc_prune_t *p;
+
+	p = kmem_alloc(sizeof (*p), KM_SLEEP);
+	p->p_pfunc = func;
+	p->p_private = private;
+	list_link_init(&p->p_node);
+	refcount_create(&p->p_refcnt);
+
+	mutex_enter(&arc_prune_mtx);
+	refcount_add(&p->p_refcnt, &arc_prune_list);
+	list_insert_head(&arc_prune_list, p);
+	mutex_exit(&arc_prune_mtx);
+
+	return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+	boolean_t wait = B_FALSE;
+	mutex_enter(&arc_prune_mtx);
+	list_remove(&arc_prune_list, p);
+	if (refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
+		wait = B_TRUE;
+	mutex_exit(&arc_prune_mtx);
+
+	/* wait for arc_prune_task to finish */
+	if (wait)
+		taskq_wait(arc_prune_taskq);
+	ASSERT0(refcount_count(&p->p_refcnt));
+	refcount_destroy(&p->p_refcnt);
+	kmem_free(p, sizeof (*p));
+}
+
 /*
  * Notify the arc that a block was freed, and thus will never be used again.
  */
@@ -6476,7 +6634,9 @@ arc_kstat_update(kstat_t *ksp, int rw)
 		ARCSTAT(arcstat_metadata_size) =
 		    aggsum_value(&astat_metadata_size);
 		ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
-		ARCSTAT(arcstat_other_size) = aggsum_value(&astat_other_size);
+		ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
+		ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
+		ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
 		ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
 	}
 
@@ -6616,7 +6776,9 @@ arc_state_init(void)
 	aggsum_init(&astat_data_size, 0);
 	aggsum_init(&astat_metadata_size, 0);
 	aggsum_init(&astat_hdr_size, 0);
-	aggsum_init(&astat_other_size, 0);
+	aggsum_init(&astat_bonus_size, 0);
+	aggsum_init(&astat_dnode_size, 0);
+	aggsum_init(&astat_dbuf_size, 0);
 	aggsum_init(&astat_l2_hdr_size, 0);
 }
 
@@ -6730,6 +6892,7 @@ arc_init(void)
 	 */
 #ifdef __FreeBSD__
 	arc_meta_limit = MIN(arc_meta_limit, uma_limit() / 2);
+	arc_dnode_limit = arc_meta_limit / 10;
 #else
 	arc_meta_limit = MIN(arc_meta_limit,
 	    vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
@@ -6749,6 +6912,12 @@ arc_init(void)
 		arc_meta_min = arc_c_min / 2;
 	}
 
+	/* Valid range: <arc_meta_min> - <arc_c_max> */
+	if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) &&
+	    (zfs_arc_dnode_limit >= zfs_arc_meta_min) &&
+	    (zfs_arc_dnode_limit <= arc_c_max))
+		arc_dnode_limit = zfs_arc_dnode_limit;
+
 	if (zfs_arc_grow_retry > 0)
 		arc_grow_retry = zfs_arc_grow_retry;
 
@@ -6778,6 +6947,13 @@ arc_init(void)
 	arc_state_init();
 	buf_init();
 
+	list_create(&arc_prune_list, sizeof (arc_prune_t),
+	    offsetof(arc_prune_t, p_node));
+	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+	arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
+	    max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
 	arc_reclaim_thread_exit = B_FALSE;
 	arc_dnlc_evicts_thread_exit = FALSE;
 
@@ -6859,6 +7035,8 @@ arc_init(void)
 void
 arc_fini(void)
 {
+	arc_prune_t *p;
+
 #ifdef _KERNEL
 	if (arc_event_lowmem != NULL)
 		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
@@ -6898,6 +7076,20 @@ arc_fini(void)
 		arc_ksp = NULL;
 	}
 
+	taskq_wait(arc_prune_taskq);
+	taskq_destroy(arc_prune_taskq);
+
+	mutex_enter(&arc_prune_mtx);
+	while ((p = list_head(&arc_prune_list)) != NULL) {
+		list_remove(&arc_prune_list, p);
+		refcount_remove(&p->p_refcnt, &arc_prune_list);
+		refcount_destroy(&p->p_refcnt);
+		kmem_free(p, sizeof (*p));
+	}
+	mutex_exit(&arc_prune_mtx);
+
+	list_destroy(&arc_prune_list);
+	mutex_destroy(&arc_prune_mtx);
 	mutex_destroy(&arc_reclaim_lock);
 	cv_destroy(&arc_reclaim_thread_cv);
 	cv_destroy(&arc_reclaim_waiters_cv);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Sat Aug 11 19:21:53 2018	(r337652)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Sat Aug 11 19:45:04 2018	(r337653)
@@ -1014,7 +1014,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_
 
 		ASSERT3U(bonuslen, <=, db->db.db_size);
 		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
-		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_BONUS);
 		if (bonuslen < DN_MAX_BONUSLEN)
 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
 		if (bonuslen)
@@ -1124,7 +1124,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		/* Note that the data bufs here are zio_bufs */
 		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
-		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_BONUS);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		int size = arc_buf_size(db->db_buf);
@@ -2098,7 +2098,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(db->db.db_data != NULL);
 		zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
-		arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+		arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_BONUS);
 		db->db_state = DB_UNCACHED;
 	}
 
@@ -2172,7 +2172,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	ASSERT(!multilist_link_active(&db->db_cache_link));
 
 	kmem_cache_free(dbuf_kmem_cache, db);
-	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 
 	/*
 	 * If this dbuf is referenced from an indirect dbuf,
@@ -2311,7 +2311,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid
 		db->db_state = DB_UNCACHED;
 		db->db_caching_status = DB_NO_CACHE;
 		/* the bonus dbuf is not placed in the hash table */
-		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 		return (db);
 	} else if (blkid == DMU_SPILL_BLKID) {
 		db->db.db_size = (blkptr != NULL) ?
@@ -2344,7 +2344,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid
 	db->db_state = DB_UNCACHED;
 	db->db_caching_status = DB_NO_CACHE;
 	mutex_exit(&dn->dn_dbufs_mtx);
-	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 
 	if (parent && parent != dn->dn_dbuf)
 		dbuf_add_ref(parent, db);
@@ -3191,7 +3191,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 
 		if (*datap != db->db.db_data) {
 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
-			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_BONUS);
 		}
 		db->db_data_pending = NULL;
 		drp = &db->db_last_dirty;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	Sat Aug 11 19:21:53 2018	(r337652)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	Sat Aug 11 19:45:04 2018	(r337653)
@@ -472,7 +472,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_
 	dnh->dnh_dnode = dn;
 	mutex_exit(&os->os_lock);
 
-	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
+	arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
 	return (dn);
 }
 
@@ -527,7 +527,7 @@ dnode_destroy(dnode_t *dn)
 
 	dmu_zfetch_fini(&dn->dn_zfetch);
 	kmem_cache_free(dnode_cache, dn);
-	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+	arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
 
 	if (complete_os_eviction)
 		dmu_objset_evict_done(os);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h	Sat Aug 11 19:21:53 2018	(r337652)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h	Sat Aug 11 19:45:04 2018	(r337653)
@@ -58,14 +58,42 @@ _NOTE(CONSTCOND) } while (0)
 
 typedef struct arc_buf_hdr arc_buf_hdr_t;
 typedef struct arc_buf arc_buf_t;
+typedef struct arc_prune arc_prune_t;
+
+/*
+ * Because the ARC can store encrypted data, errors (not due to bugs) may arise
+ * while transforming data into its desired format - specifically, when
+ * decrypting, the key may not be present, or the HMAC may not be correct
+ * which signifies deliberate tampering with the on-disk state
+ * (assuming that the checksum was correct). If any error occurs, the "buf"
+ * parameter will be NULL.
+ */
 typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb,
-    const blkptr_t *bp, arc_buf_t *buf, void *priv);
-typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
+    const blkptr_t *bp, arc_buf_t *buf, void *private);
+typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
+typedef void arc_prune_func_t(int64_t bytes, void *private);
 
+/* Shared module parameters */
+extern uint64_t zfs_arc_average_blocksize;
+
 /* generic arc_done_func_t's which you can use */
 arc_read_done_func_t arc_bcopy_func;
 arc_read_done_func_t arc_getbuf_func;
 
+/* generic arc_prune_func_t wrapper for callbacks */
+struct arc_prune {
+	arc_prune_func_t	*p_pfunc;
+	void			*p_private;
+	uint64_t		p_adjust;
+	list_node_t		p_node;
+	refcount_t		p_refcnt;
+};
+
+typedef enum arc_strategy {
+	ARC_STRATEGY_META_ONLY		= 0, /* Evict only meta data buffers */
+	ARC_STRATEGY_META_BALANCED	= 1, /* Evict data buffers if needed */
+} arc_strategy_t;
+
 typedef enum arc_flags
 {
 	/*
@@ -151,7 +179,9 @@ typedef enum arc_space_type {
 	ARC_SPACE_META,
 	ARC_SPACE_HDRS,
 	ARC_SPACE_L2HDRS,
-	ARC_SPACE_OTHER,
+	ARC_SPACE_DBUF,
+	ARC_SPACE_DNODE,
+	ARC_SPACE_BONUS,
 	ARC_SPACE_NUMTYPES
 } arc_space_type_t;
 
@@ -190,6 +220,9 @@ zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     arc_write_done_func_t *physdone, arc_write_done_func_t *done,
     void *priv, zio_priority_t priority, int zio_flags,
     const zbookmark_phys_t *zb);
+
+arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *private);
+void arc_remove_prune_callback(arc_prune_t *p);
 void arc_freed(spa_t *spa, const blkptr_t *bp);
 
 void arc_flush(spa_t *spa, boolean_t retry);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201808111945.w7BJj5PN023832>