Date:      Wed, 20 Nov 2013 10:41:11 +0000 (UTC)
From:      Andriy Gapon <avg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r258371 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c vendor-s...
Message-ID:  <201311201041.rAKAfBHw052390@svn.freebsd.org>

Author: avg
Date: Wed Nov 20 10:41:10 2013
New Revision: 258371
URL: http://svnweb.freebsd.org/changeset/base/258371

Log:
  4101 metaslab_debug should allow for fine-grained control
  
  4102 space_maps should store more information about themselves
  4103 space map object blocksize should be increased
  4104 ::spa_space no longer works
  4105 removing a mirrored log device results in a leaked object
  4106 asynchronously load metaslab
  
  Revision illumos/illumos-gate@0713e232b7712cd27d99e1e935ebb8d5de61c57d

Added:
  vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c   (contents, props changed)
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_reftree.h   (contents, props changed)
Modified:
  vendor-sys/illumos/dist/common/zfs/zfeature_common.c
  vendor-sys/illumos/dist/common/zfs/zfeature_common.h
  vendor-sys/illumos/dist/uts/common/Makefile.files
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfeature.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_label.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfeature.c

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/zdb/zdb.c
  vendor/illumos/dist/cmd/ztest/ztest.c
  vendor/illumos/dist/man/man5/zpool-features.5
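
For orientation before the diff: the metaslab.c changes below retire the
per-allocator space_map_ops_t vtable in favor of a much smaller
metaslab_ops_t, and move the size-ordered AVL bookkeeping into shared
range tree callbacks (metaslab_rt_ops).  A condensed sketch of the new
interface follows; it is assembled from the hunks below and is not part
of the commit -- only msop_alloc appears by name in the diff, so the
second member name and the forward declaration are illustrative
assumptions.

#include <sys/types.h>			/* uint64_t, boolean_t on illumos */

typedef struct metaslab metaslab_t;	/* declared in the modified headers */

typedef struct metaslab_ops {
	/* carve 'size' bytes out of the metaslab's loaded range tree */
	uint64_t	(*msop_alloc)(metaslab_t *msp, uint64_t size);
	/* report whether this policy considers the metaslab fragmented */
	boolean_t	(*msop_fragmented)(metaslab_t *msp);	/* name inferred */
} metaslab_ops_t;

/*
 * Each allocator now supplies just those two callbacks, e.g. the default
 * dynamic-fit allocator in the diff:
 *
 *	static metaslab_ops_t metaslab_df_ops = {
 *		metaslab_df_alloc,
 *		metaslab_df_fragmented
 *	};
 *
 * Loading, unloading and size-tree maintenance, formerly duplicated per
 * allocator, are handled once through the range tree callbacks
 * metaslab_rt_create/_destroy/_add/_remove/_vacate in metaslab_rt_ops.
 */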

Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.c	Wed Nov 20 10:38:37 2013	(r258370)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.c	Wed Nov 20 10:41:10 2013	(r258371)
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
@@ -164,4 +164,7 @@ zpool_feature_init(void)
 	zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
 	    "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
 	    "Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE, NULL);
+	zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
+	    "com.delphix:spacemap_histogram", "spacemap_histogram",
+	    "Spacemaps maintain space histograms.", B_TRUE, B_FALSE, NULL);
 }

Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.h
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.h	Wed Nov 20 10:38:37 2013	(r258370)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.h	Wed Nov 20 10:41:10 2013	(r258371)
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
@@ -56,6 +56,7 @@ enum spa_feature {
 	SPA_FEATURE_EMPTY_BPOBJ,
 	SPA_FEATURE_LZ4_COMPRESS,
 	SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+	SPA_FEATURE_SPACEMAP_HISTOGRAM,
 	SPA_FEATURES
 } spa_feature_t;
 

Modified: vendor-sys/illumos/dist/uts/common/Makefile.files
==============================================================================
--- vendor-sys/illumos/dist/uts/common/Makefile.files	Wed Nov 20 10:38:37 2013	(r258370)
+++ vendor-sys/illumos/dist/uts/common/Makefile.files	Wed Nov 20 10:41:10 2013	(r258371)
@@ -22,7 +22,7 @@
 #
 # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2013 by Delphix. All rights reserved.
 # Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 #
 
@@ -1359,6 +1359,7 @@ ZFS_COMMON_OBJS +=		\
 	lz4.o			\
 	lzjb.o			\
 	metaslab.o		\
+	range_tree.o		\
 	refcount.o		\
 	rrwlock.o		\
 	sa.o			\
@@ -1369,6 +1370,7 @@ ZFS_COMMON_OBJS +=		\
 	spa_history.o		\
 	spa_misc.o		\
 	space_map.o		\
+	space_reftree.o		\
 	txg.o			\
 	uberblock.o		\
 	unique.o		\

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c	Wed Nov 20 10:38:37 2013	(r258370)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c	Wed Nov 20 10:41:10 2013	(r258371)
@@ -1334,7 +1334,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t si
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
 	/* Check for any allocated blocks beyond the first */
-	if (dn->dn_phys->dn_maxblkid != 0)
+	if (dn->dn_maxblkid != 0)
 		goto fail;
 
 	mutex_enter(&dn->dn_dbufs_mtx);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	Wed Nov 20 10:38:37 2013	(r258370)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	Wed Nov 20 10:41:10 2013	(r258371)
@@ -31,6 +31,7 @@
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
+#include <sys/spa_impl.h>
 
 /*
  * Allow allocations to switch to gang blocks quickly. We do this to
@@ -44,6 +45,11 @@
 	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
 	METASLAB_GANG_AVOID)))
 
+#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
+#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
+#define	METASLAB_ACTIVE_MASK		\
+	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
@@ -79,9 +85,14 @@ int zfs_mg_alloc_failures = 0;
 int zfs_mg_noalloc_threshold = 0;
 
 /*
- * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ * When set will load all metaslabs when pool is first opened.
  */
-static int metaslab_debug = 0;
+int metaslab_debug_load = 0;
+
+/*
+ * When set will prevent metaslabs from being unloaded.
+ */
+int metaslab_debug_unload = 0;
 
 /*
  * Minimum size which forces the dynamic allocator to change
@@ -106,14 +117,16 @@ int metaslab_df_free_pct = 4;
 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
 
 /*
- * Max number of space_maps to prefetch.
+ * Percentage of all cpus that can be used by the metaslab taskq.
  */
-int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+int metaslab_load_pct = 50;
 
 /*
- * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ * Determines how many txgs a metaslab may remain loaded without having any
+ * allocations from it. As long as a metaslab continues to be used we will
+ * keep it loaded.
  */
-int metaslab_smo_bonus_pct = 150;
+int metaslab_unload_delay = TXG_SIZE * 2;
 
 /*
  * Should we be willing to write data to degraded vdevs?
@@ -121,12 +134,28 @@ int metaslab_smo_bonus_pct = 150;
 boolean_t zfs_write_to_degraded = B_FALSE;
 
 /*
+ * Max number of metaslabs per group to preload.
+ */
+int metaslab_preload_limit = SPA_DVAS_PER_BP;
+
+/*
+ * Enable/disable preloading of metaslab.
+ */
+boolean_t metaslab_preload_enabled = B_TRUE;
+
+/*
+ * Enable/disable additional weight factor for each metaslab.
+ */
+boolean_t metaslab_weight_factor_enable = B_FALSE;
+
+
+/*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 metaslab_class_t *
-metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 {
 	metaslab_class_t *mc;
 
@@ -230,9 +259,9 @@ metaslab_compare(const void *x1, const v
 	/*
 	 * If the weights are identical, use the offset to force uniqueness.
 	 */
-	if (m1->ms_map->sm_start < m2->ms_map->sm_start)
+	if (m1->ms_start < m2->ms_start)
 		return (-1);
-	if (m1->ms_map->sm_start > m2->ms_map->sm_start)
+	if (m1->ms_start > m2->ms_start)
 		return (1);
 
 	ASSERT3P(m1, ==, m2);
@@ -300,6 +329,9 @@ metaslab_group_create(metaslab_class_t *
 	mg->mg_class = mc;
 	mg->mg_activation_count = 0;
 
+	mg->mg_taskq = taskq_create("metaslab_group_tasksq", metaslab_load_pct,
+	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
+
 	return (mg);
 }
 
@@ -368,6 +400,8 @@ metaslab_group_passivate(metaslab_group_
 		return;
 	}
 
+	taskq_wait(mg->mg_taskq);
+
 	mgprev = mg->mg_prev;
 	mgnext = mg->mg_next;
 
@@ -447,130 +481,200 @@ metaslab_group_allocatable(metaslab_grou
 
 /*
  * ==========================================================================
- * Common allocator routines
+ * Range tree callbacks
  * ==========================================================================
  */
+
+/*
+ * Comparison function for the private size-ordered tree. Tree is sorted
+ * by size, larger sizes at the end of the tree.
+ */
 static int
-metaslab_segsize_compare(const void *x1, const void *x2)
+metaslab_rangesize_compare(const void *x1, const void *x2)
 {
-	const space_seg_t *s1 = x1;
-	const space_seg_t *s2 = x2;
-	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
-	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+	const range_seg_t *r1 = x1;
+	const range_seg_t *r2 = x2;
+	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
-	if (ss_size1 < ss_size2)
+	if (rs_size1 < rs_size2)
 		return (-1);
-	if (ss_size1 > ss_size2)
+	if (rs_size1 > rs_size2)
 		return (1);
 
-	if (s1->ss_start < s2->ss_start)
+	if (r1->rs_start < r2->rs_start)
 		return (-1);
-	if (s1->ss_start > s2->ss_start)
+
+	if (r1->rs_start > r2->rs_start)
 		return (1);
 
 	return (0);
 }
 
 /*
- * This is a helper function that can be used by the allocator to find
- * a suitable block to allocate. This will search the specified AVL
- * tree looking for a block that matches the specified criteria.
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
  */
-static uint64_t
-metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
-    uint64_t align)
+static void
+metaslab_rt_create(range_tree_t *rt, void *arg)
 {
-	space_seg_t *ss, ssearch;
-	avl_index_t where;
-
-	ssearch.ss_start = *cursor;
-	ssearch.ss_end = *cursor + size;
-
-	ss = avl_find(t, &ssearch, &where);
-	if (ss == NULL)
-		ss = avl_nearest(t, where, AVL_AFTER);
+	metaslab_t *msp = arg;
 
-	while (ss != NULL) {
-		uint64_t offset = P2ROUNDUP(ss->ss_start, align);
-
-		if (offset + size <= ss->ss_end) {
-			*cursor = offset + size;
-			return (offset);
-		}
-		ss = AVL_NEXT(t, ss);
-	}
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT(msp->ms_tree == NULL);
 
-	/*
-	 * If we know we've searched the whole map (*cursor == 0), give up.
-	 * Otherwise, reset the cursor to the beginning and try again.
-	 */
-	if (*cursor == 0)
-		return (-1ULL);
-
-	*cursor = 0;
-	return (metaslab_block_picker(t, cursor, size, align));
+	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 }
 
+/*
+ * Destroy the block allocator specific components.
+ */
 static void
-metaslab_pp_load(space_map_t *sm)
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
 {
-	space_seg_t *ss;
+	metaslab_t *msp = arg;
 
-	ASSERT(sm->sm_ppd == NULL);
-	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+	ASSERT0(avl_numnodes(&msp->ms_size_tree));
 
-	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
-	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
-
-	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-		avl_add(sm->sm_pp_root, ss);
+	avl_destroy(&msp->ms_size_tree);
 }
 
 static void
-metaslab_pp_unload(space_map_t *sm)
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
-	void *cookie = NULL;
-
-	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
-	sm->sm_ppd = NULL;
+	metaslab_t *msp = arg;
 
-	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
-		/* tear down the tree */
-	}
-
-	avl_destroy(sm->sm_pp_root);
-	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
-	sm->sm_pp_root = NULL;
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+	VERIFY(!msp->ms_condensing);
+	avl_add(&msp->ms_size_tree, rs);
 }
 
-/* ARGSUSED */
 static void
-metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
-	/* No need to update cursor */
+	metaslab_t *msp = arg;
+
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+	VERIFY(!msp->ms_condensing);
+	avl_remove(&msp->ms_size_tree, rs);
 }
 
-/* ARGSUSED */
 static void
-metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
 {
-	/* No need to update cursor */
+	metaslab_t *msp = arg;
+
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+
+	/*
+	 * Normally one would walk the tree freeing nodes along the way.
+	 * Since the nodes are shared with the range trees we can avoid
+	 * walking all nodes and just reinitialize the avl tree. The nodes
+	 * will be freed by the range tree, so we don't want to free them here.
+	 */
+	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 }
 
+static range_tree_ops_t metaslab_rt_ops = {
+	metaslab_rt_create,
+	metaslab_rt_destroy,
+	metaslab_rt_add,
+	metaslab_rt_remove,
+	metaslab_rt_vacate
+};
+
+/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
 /*
  * Return the maximum contiguous segment within the metaslab.
  */
 uint64_t
-metaslab_pp_maxsize(space_map_t *sm)
+metaslab_block_maxsize(metaslab_t *msp)
 {
-	avl_tree_t *t = sm->sm_pp_root;
-	space_seg_t *ss;
+	avl_tree_t *t = &msp->ms_size_tree;
+	range_seg_t *rs;
 
-	if (t == NULL || (ss = avl_last(t)) == NULL)
+	if (t == NULL || (rs = avl_last(t)) == NULL)
 		return (0ULL);
 
-	return (ss->ss_end - ss->ss_start);
+	return (rs->rs_end - rs->rs_start);
+}
+
+uint64_t
+metaslab_block_alloc(metaslab_t *msp, uint64_t size)
+{
+	uint64_t start;
+	range_tree_t *rt = msp->ms_tree;
+
+	VERIFY(!msp->ms_condensing);
+
+	start = msp->ms_ops->msop_alloc(msp, size);
+	if (start != -1ULL) {
+		vdev_t *vd = msp->ms_group->mg_vd;
+
+		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
+		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
+		range_tree_remove(rt, start, size);
+	}
+	return (start);
+}
+
+/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+
+/*
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
+ */
+static uint64_t
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+    uint64_t align)
+{
+	range_seg_t *rs, rsearch;
+	avl_index_t where;
+
+	rsearch.rs_start = *cursor;
+	rsearch.rs_end = *cursor + size;
+
+	rs = avl_find(t, &rsearch, &where);
+	if (rs == NULL)
+		rs = avl_nearest(t, where, AVL_AFTER);
+
+	while (rs != NULL) {
+		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
+
+		if (offset + size <= rs->rs_end) {
+			*cursor = offset + size;
+			return (offset);
+		}
+		rs = AVL_NEXT(t, rs);
+	}
+
+	/*
+	 * If we know we've searched the whole map (*cursor == 0), give up.
+	 * Otherwise, reset the cursor to the beginning and try again.
+	 */
+	if (*cursor == 0)
+		return (-1ULL);
+
+	*cursor = 0;
+	return (metaslab_block_picker(t, cursor, size, align));
 }
 
 /*
@@ -579,29 +683,31 @@ metaslab_pp_maxsize(space_map_t *sm)
  * ==========================================================================
  */
 static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
+	/*
+	 * Find the largest power of 2 block size that evenly divides the
+	 * requested size. This is used to try to allocate blocks with similar
+	 * alignment from the same area of the metaslab (i.e. same cursor
+	 * bucket) but it does not guarantee that other allocations sizes
+	 * may exist in the same region.
+	 */
 	uint64_t align = size & -size;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+	uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
+	avl_tree_t *t = &msp->ms_tree->rt_root;
 
 	return (metaslab_block_picker(t, cursor, size, align));
 }
 
 /* ARGSUSED */
-boolean_t
-metaslab_ff_fragmented(space_map_t *sm)
+static boolean_t
+metaslab_ff_fragmented(metaslab_t *msp)
 {
 	return (B_TRUE);
 }
 
-static space_map_ops_t metaslab_ff_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
+static metaslab_ops_t metaslab_ff_ops = {
 	metaslab_ff_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
 	metaslab_ff_fragmented
 };
 
@@ -614,16 +720,24 @@ static space_map_ops_t metaslab_ff_ops =
  * ==========================================================================
  */
 static uint64_t
-metaslab_df_alloc(space_map_t *sm, uint64_t size)
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
+	/*
+	 * Find the largest power of 2 block size that evenly divides the
+	 * requested size. This is used to try to allocate blocks with similar
+	 * alignment from the same area of the metaslab (i.e. same cursor
+	 * bucket) but it does not guarantee that other allocations sizes
+	 * may exist in the same region.
+	 */
 	uint64_t align = size & -size;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-	int free_pct = sm->sm_space * 100 / sm->sm_size;
+	uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
+	range_tree_t *rt = msp->ms_tree;
+	avl_tree_t *t = &rt->rt_root;
+	uint64_t max_size = metaslab_block_maxsize(msp);
+	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 
 	if (max_size < size)
 		return (-1ULL);
@@ -634,7 +748,7 @@ metaslab_df_alloc(space_map_t *sm, uint6
 	 */
 	if (max_size < metaslab_df_alloc_threshold ||
 	    free_pct < metaslab_df_free_pct) {
-		t = sm->sm_pp_root;
+		t = &msp->ms_size_tree;
 		*cursor = 0;
 	}
 
@@ -642,10 +756,11 @@ metaslab_df_alloc(space_map_t *sm, uint6
 }
 
 static boolean_t
-metaslab_df_fragmented(space_map_t *sm)
+metaslab_df_fragmented(metaslab_t *msp)
 {
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-	int free_pct = sm->sm_space * 100 / sm->sm_size;
+	range_tree_t *rt = msp->ms_tree;
+	uint64_t max_size = metaslab_block_maxsize(msp);
+	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
 	if (max_size >= metaslab_df_alloc_threshold &&
 	    free_pct >= metaslab_df_free_pct)
@@ -654,182 +769,228 @@ metaslab_df_fragmented(space_map_t *sm)
 	return (B_TRUE);
 }
 
-static space_map_ops_t metaslab_df_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
+static metaslab_ops_t metaslab_df_ops = {
 	metaslab_df_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
 	metaslab_df_fragmented
 };
 
 /*
  * ==========================================================================
- * Other experimental allocators
+ * Cursor fit block allocator -
+ * Select the largest region in the metaslab, set the cursor to the beginning
+ * of the range and the cursor_end to the end of the range. As allocations
+ * are made advance the cursor. Continue allocating from the cursor until
+ * the range is exhausted and then find a new range.
  * ==========================================================================
  */
 static uint64_t
-metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
-	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-	uint64_t rsize = size;
+	range_tree_t *rt = msp->ms_tree;
+	avl_tree_t *t = &msp->ms_size_tree;
+	uint64_t *cursor = &msp->ms_lbas[0];
+	uint64_t *cursor_end = &msp->ms_lbas[1];
 	uint64_t offset = 0;
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
-
-	if (max_size < size)
-		return (-1ULL);
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
 
-	ASSERT3U(*extent_end, >=, *cursor);
+	ASSERT3U(*cursor_end, >=, *cursor);
 
-	/*
-	 * If we're running low on space switch to using the size
-	 * sorted AVL tree (best-fit).
-	 */
-	if ((*cursor + size) > *extent_end) {
+	if ((*cursor + size) > *cursor_end) {
+		range_seg_t *rs;
 
-		t = sm->sm_pp_root;
-		*cursor = *extent_end = 0;
+		rs = avl_last(&msp->ms_size_tree);
+		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
+			return (-1ULL);
 
-		if (max_size > 2 * SPA_MAXBLOCKSIZE)
-			rsize = MIN(metaslab_min_alloc_size, max_size);
-		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
-		if (offset != -1)
-			*cursor = offset + size;
-	} else {
-		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+		*cursor = rs->rs_start;
+		*cursor_end = rs->rs_end;
 	}
-	ASSERT3U(*cursor, <=, *extent_end);
+
+	offset = *cursor;
+	*cursor += size;
+
 	return (offset);
 }
 
 static boolean_t
-metaslab_cdf_fragmented(space_map_t *sm)
+metaslab_cf_fragmented(metaslab_t *msp)
 {
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-
-	if (max_size > (metaslab_min_alloc_size * 10))
-		return (B_FALSE);
-	return (B_TRUE);
+	return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
 }
 
-static space_map_ops_t metaslab_cdf_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
-	metaslab_cdf_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
-	metaslab_cdf_fragmented
+static metaslab_ops_t metaslab_cf_ops = {
+	metaslab_cf_alloc,
+	metaslab_cf_fragmented
 };
 
+/*
+ * ==========================================================================
+ * New dynamic fit allocator -
+ * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
+ * contiguous blocks. If no region is found then just use the largest segment
+ * that remains.
+ * ==========================================================================
+ */
+
+/*
+ * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
+ * to request from the allocator.
+ */
 uint64_t metaslab_ndf_clump_shift = 4;
 
 static uint64_t
-metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
+	avl_tree_t *t = &msp->ms_tree->rt_root;
 	avl_index_t where;
-	space_seg_t *ss, ssearch;
+	range_seg_t *rs, rsearch;
 	uint64_t hbit = highbit(size);
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
-	uint64_t max_size = metaslab_pp_maxsize(sm);
+	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
+	uint64_t max_size = metaslab_block_maxsize(msp);
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 
 	if (max_size < size)
 		return (-1ULL);
 
-	ssearch.ss_start = *cursor;
-	ssearch.ss_end = *cursor + size;
+	rsearch.rs_start = *cursor;
+	rsearch.rs_end = *cursor + size;
 
-	ss = avl_find(t, &ssearch, &where);
-	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
-		t = sm->sm_pp_root;
+	rs = avl_find(t, &rsearch, &where);
+	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
+		t = &msp->ms_size_tree;
 
-		ssearch.ss_start = 0;
-		ssearch.ss_end = MIN(max_size,
+		rsearch.rs_start = 0;
+		rsearch.rs_end = MIN(max_size,
 		    1ULL << (hbit + metaslab_ndf_clump_shift));
-		ss = avl_find(t, &ssearch, &where);
-		if (ss == NULL)
-			ss = avl_nearest(t, where, AVL_AFTER);
-		ASSERT(ss != NULL);
+		rs = avl_find(t, &rsearch, &where);
+		if (rs == NULL)
+			rs = avl_nearest(t, where, AVL_AFTER);
+		ASSERT(rs != NULL);
 	}
 
-	if (ss != NULL) {
-		if (ss->ss_start + size <= ss->ss_end) {
-			*cursor = ss->ss_start + size;
-			return (ss->ss_start);
-		}
+	if ((rs->rs_end - rs->rs_start) >= size) {
+		*cursor = rs->rs_start + size;
+		return (rs->rs_start);
 	}
 	return (-1ULL);
 }
 
 static boolean_t
-metaslab_ndf_fragmented(space_map_t *sm)
+metaslab_ndf_fragmented(metaslab_t *msp)
 {
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-
-	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
-		return (B_FALSE);
-	return (B_TRUE);
+	return (metaslab_block_maxsize(msp) <=
+	    (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
 }
 
-
-static space_map_ops_t metaslab_ndf_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
+static metaslab_ops_t metaslab_ndf_ops = {
 	metaslab_ndf_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
 	metaslab_ndf_fragmented
 };
 
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 
 /*
  * ==========================================================================
  * Metaslabs
  * ==========================================================================
  */
+
+/*
+ * Wait for any in-progress metaslab loads to complete.
+ */
+void
+metaslab_load_wait(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	while (msp->ms_loading) {
+		ASSERT(!msp->ms_loaded);
+		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
+	}
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+	int error = 0;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT(!msp->ms_loaded);
+	ASSERT(!msp->ms_loading);
+
+	msp->ms_loading = B_TRUE;
+
+	/*
+	 * If the space map has not been allocated yet, then treat
+	 * all the space in the metaslab as free and add it to the
+	 * ms_tree.
+	 */
+	if (msp->ms_sm != NULL)
+		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
+	else
+		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
+
+	msp->ms_loaded = (error == 0);
+	msp->ms_loading = B_FALSE;
+
+	if (msp->ms_loaded) {
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			range_tree_walk(msp->ms_defertree[t],
+			    range_tree_remove, msp->ms_tree);
+		}
+	}
+	cv_broadcast(&msp->ms_load_cv);
+	return (error);
+}
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	range_tree_vacate(msp->ms_tree, NULL, NULL);
+	msp->ms_loaded = B_FALSE;
+	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+}
+
 metaslab_t *
-metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
-	uint64_t start, uint64_t size, uint64_t txg)
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
 {
 	vdev_t *vd = mg->mg_vd;
+	objset_t *mos = vd->vdev_spa->spa_meta_objset;
 	metaslab_t *msp;
 
 	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
+	msp->ms_id = id;
+	msp->ms_start = id << vd->vdev_ms_shift;
+	msp->ms_size = 1ULL << vd->vdev_ms_shift;
 
-	msp->ms_smo_syncing = *smo;
+	/*
+	 * We only open space map objects that already exist. All others
+	 * will be opened when we finally allocate an object for it.
+	 */
+	if (object != 0) {
+		VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
+		    msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
+		ASSERT(msp->ms_sm != NULL);
+	}
 
 	/*
-	 * We create the main space map here, but we don't create the
-	 * allocmaps and freemaps until metaslab_sync_done().  This serves
+	 * We create the main range tree here, but we don't create the
+	 * alloctree and freetree until metaslab_sync_done().  This serves
 	 * two purposes: it allows metaslab_sync_done() to detect the
 	 * addition of new space; and for debugging, it ensures that we'd
 	 * data fault on any attempt to use this metaslab before it's ready.
 	 */
-	msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
-	space_map_create(msp->ms_map, start, size,
-	    vd->vdev_ashift, &msp->ms_lock);
-
+	msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
 	metaslab_group_add(mg, msp);
 
-	if (metaslab_debug && smo->smo_object != 0) {
-		mutex_enter(&msp->ms_lock);
-		VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
-		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
-		mutex_exit(&msp->ms_lock);
-	}
+	msp->ms_ops = mg->mg_class->mc_ops;
 
 	/*
 	 * If we're opening an existing pool (txg == 0) or creating
@@ -840,6 +1001,17 @@ metaslab_init(metaslab_group_t *mg, spac
 	if (txg <= TXG_INITIAL)
 		metaslab_sync_done(msp, 0);
 
+	/*
+	 * If metaslab_debug_load is set and we're initializing a metaslab
+	 * that has an allocated space_map object then load the its space
+	 * map so that can verify frees.
+	 */
+	if (metaslab_debug_load && msp->ms_sm != NULL) {
+		mutex_enter(&msp->ms_lock);
+		VERIFY0(metaslab_load(msp));
+		mutex_exit(&msp->ms_lock);
+	}
+
 	if (txg != 0) {
 		vdev_dirty(vd, 0, NULL, txg);
 		vdev_dirty(vd, VDD_METASLAB, msp, txg);
@@ -853,48 +1025,103 @@ metaslab_fini(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 
-	vdev_space_update(mg->mg_vd,
-	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
-
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
 
-	space_map_unload(msp->ms_map);
-	space_map_destroy(msp->ms_map);
-	kmem_free(msp->ms_map, sizeof (*msp->ms_map));
+	VERIFY(msp->ms_group == NULL);
+	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
+	    0, -msp->ms_size);
+	space_map_close(msp->ms_sm);
+
+	metaslab_unload(msp);
+	range_tree_destroy(msp->ms_tree);
 
 	for (int t = 0; t < TXG_SIZE; t++) {
-		space_map_destroy(msp->ms_allocmap[t]);
-		space_map_destroy(msp->ms_freemap[t]);
-		kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
-		kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
+		range_tree_destroy(msp->ms_alloctree[t]);
+		range_tree_destroy(msp->ms_freetree[t]);
 	}
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-		space_map_destroy(msp->ms_defermap[t]);
-		kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
+		range_tree_destroy(msp->ms_defertree[t]);
 	}
 
 	ASSERT0(msp->ms_deferspace);
 
 	mutex_exit(&msp->ms_lock);
+	cv_destroy(&msp->ms_load_cv);
 	mutex_destroy(&msp->ms_lock);
 
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
-#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
-#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
-#define	METASLAB_ACTIVE_MASK		\
-	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+/*
+ * Apply a weighting factor based on the histogram information for this
+ * metaslab. The current weighting factor is somewhat arbitrary and requires
+ * additional investigation. The implementation provides a measure of
+ * "weighted" free space and gives a higher weighting for larger contiguous
+ * regions. The weighting factor is determined by counting the number of
+ * sm_shift sectors that exist in each region represented by the histogram.
+ * That value is then multiplied by the power of 2 exponent and the sm_shift
+ * value.
+ *
+ * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
+ * metaslab has an sm_shift value of 9 (512B):
+ *
+ * 1) calculate the number of sm_shift sectors in the region:
+ *	2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
+ * 2) multiply by the power of 2 exponent and the sm_shift value:
+ *	16384 * 21 * 9 = 3096576
+ * This value will be added to the weighting of the metaslab.
+ */
+static uint64_t
+metaslab_weight_factor(metaslab_t *msp)
+{
+	uint64_t factor = 0;
+	uint64_t sectors;
+	int i;
+
+	/*
+	 * A null space map means that the entire metaslab is free,
+	 * calculate a weight factor that spans the entire size of the
+	 * metaslab.

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
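
The diff above is cut off inside the metaslab_weight_factor() comment,
but the arithmetic that comment describes can be checked on its own.
A minimal, self-contained C example using the comment's own numbers (a
2^21 histogram bucket holding four regions, sm_shift = 9); this is an
illustration, not code from the commit:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t sm_shift = 9;		/* 512-byte sectors (2^9) */
	uint64_t bucket_shift = 21;	/* 2^21 = 2MB regions */
	uint64_t nregions = 4;		/* regions in that histogram bucket */

	/* 1) number of sm_shift-sized sectors represented by the bucket */
	uint64_t sectors = (1ULL << (bucket_shift - sm_shift)) * nregions;

	/* 2) scale by the power-of-2 exponent and by sm_shift */
	uint64_t factor = sectors * bucket_shift * sm_shift;

	/* prints "sectors=16384 factor=3096576", matching the comment */
	printf("sectors=%llu factor=%llu\n",
	    (unsigned long long)sectors, (unsigned long long)factor);
	return (0);
}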


