From owner-svn-src-all@FreeBSD.ORG Tue Jul 29 07:08:34 2014 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 8C98A315; Tue, 29 Jul 2014 07:08:34 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 77434205E; Tue, 29 Jul 2014 07:08:34 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.9/8.14.9) with ESMTP id s6T78YjJ000191; Tue, 29 Jul 2014 07:08:34 GMT (envelope-from delphij@svn.freebsd.org) Received: (from delphij@localhost) by svn.freebsd.org (8.14.9/8.14.9/Submit) id s6T78XTR000184; Tue, 29 Jul 2014 07:08:33 GMT (envelope-from delphij@svn.freebsd.org) Message-Id: <201407290708.s6T78XTR000184@svn.freebsd.org> From: Xin LI Date: Tue, 29 Jul 2014 07:08:33 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org Subject: svn commit: r269223 - vendor-sys/illumos/dist/common/avl vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumos/dist/uts/common/sys vendor/illumo... X-SVN-Group: vendor-sys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.18 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 29 Jul 2014 07:08:34 -0000 Author: delphij Date: Tue Jul 29 07:08:32 2014 New Revision: 269223 URL: http://svnweb.freebsd.org/changeset/base/269223 Log: 4873 zvol unmap calls can take a very long time for larger datasets Reviewed by: George Wilson Reviewed by: Matthew Ahrens Reviewed by: Paul Dagnelie Reviewed by: Basil Crow Reviewed by: Dan McDonald Approved by: Robert Mustacchi illumos/illumos-gate@0f6d88aded0d165f5954688a9b13bac76c38da84 Modified: vendor-sys/illumos/dist/common/avl/avl.c vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h vendor-sys/illumos/dist/uts/common/sys/avl.h Changes in other areas also in this revision: Modified: vendor/illumos/dist/common/avl/avl.c Modified: vendor-sys/illumos/dist/common/avl/avl.c ============================================================================== --- vendor-sys/illumos/dist/common/avl/avl.c Tue Jul 29 06:57:13 2014 (r269222) +++ vendor-sys/illumos/dist/common/avl/avl.c Tue Jul 29 07:08:32 2014 (r269223) @@ -24,6 +24,10 @@ */ /* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +/* * AVL - generic AVL tree implementation for kernel use * * A complete description of AVL trees can be found in many CS textbooks. @@ -85,6 +89,12 @@ * is a modified "avl_node_t *". The bottom bit (normally 0 for a * pointer) is set to indicate if that the new node has a value greater * than the value of the indicated "avl_node_t *". + * + * Note - in addition to userland (e.g. libavl and libutil) and the kernel + * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module, + * which each have their own compilation environments and subsequent + * requirements. Each of these environments must be considered when adding + * dependencies from avl.c. */ #include @@ -864,6 +874,24 @@ avl_update(avl_tree_t *t, void *obj) return (B_FALSE); } +void +avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) +{ + avl_node_t *temp_node; + ulong_t temp_numnodes; + + ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); + ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); + ASSERT3U(tree1->avl_size, ==, tree2->avl_size); + + temp_node = tree1->avl_root; + temp_numnodes = tree1->avl_numnodes; + tree1->avl_root = tree2->avl_root; + tree1->avl_numnodes = tree2->avl_numnodes; + tree2->avl_root = temp_node; + tree2->avl_numnodes = temp_numnodes; +} + /* * initialize a new AVL tree */ Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c Tue Jul 29 06:57:13 2014 (r269222) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c Tue Jul 29 07:08:32 2014 (r269223) @@ -69,6 +69,9 @@ dbuf_cons(void *vdb, void *unused, int k mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); refcount_create(&db->db_holds); + + db->db_creation = gethrtime(); + return (0); } @@ -330,7 +333,7 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || - !list_is_empty(&dn->dn_dbufs)); + !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); @@ -803,18 +806,30 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * receive; see comment below for details. */ void -dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) +dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next; + dmu_buf_impl_t *db, *db_next, db_search; uint64_t txg = tx->tx_txg; + avl_index_t where; - if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) - end = dn->dn_maxblkid; - dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID)) + end_blkid = dn->dn_maxblkid; + dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); + + db_search.db_level = 0; + db_search.db_blkid = start_blkid; + db_search.db_creation = 0; mutex_enter(&dn->dn_dbufs_mtx); - if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) { + if (start_blkid >= dn->dn_unlisted_l0_blkid) { /* There can't be any dbufs in this range; no need to search. */ +#ifdef DEBUG + db = avl_find(&dn->dn_dbufs, &db_search, &where); + ASSERT3P(db, ==, NULL); + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + ASSERT(db == NULL || db->db_level > 0); +#endif mutex_exit(&dn->dn_dbufs_mtx); return; } else if (dmu_objset_is_receiving(dn->dn_objset)) { @@ -828,14 +843,18 @@ dbuf_free_range(dnode_t *dn, uint64_t st atomic_inc_64(&zfs_free_range_recv_miss); } - for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); + db = avl_find(&dn->dn_dbufs, &db_search, &where); + ASSERT3P(db, ==, NULL); + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + for (; db != NULL; db = db_next) { + db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); - if (db->db_level != 0) - continue; - if (db->db_blkid < start || db->db_blkid > end) - continue; + if (db->db_level != 0 || db->db_blkid > end_blkid) { + break; + } + ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); @@ -1585,7 +1604,7 @@ dbuf_clear(dmu_buf_impl_t *db) dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { - list_remove(&dn->dn_dbufs, db); + avl_remove(&dn->dn_dbufs, db); (void) atomic_dec_32_nv(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); @@ -1748,7 +1767,7 @@ dbuf_create(dnode_t *dn, uint8_t level, mutex_exit(&dn->dn_dbufs_mtx); return (odb); } - list_insert_head(&dn->dn_dbufs, db); + avl_add(&dn->dn_dbufs, db); if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid) dn->dn_unlisted_l0_blkid = db->db_blkid + 1; @@ -1807,7 +1826,7 @@ dbuf_destroy(dmu_buf_impl_t *db) DB_DNODE_ENTER(db); dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); - list_remove(&dn->dn_dbufs, db); + avl_remove(&dn->dn_dbufs, db); (void) atomic_dec_32_nv(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); DB_DNODE_EXIT(db); @@ -1825,7 +1844,6 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_parent = NULL; db->db_buf = NULL; - ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c Tue Jul 29 06:57:13 2014 (r269222) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c Tue Jul 29 07:08:32 2014 (r269223) @@ -59,6 +59,43 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); +static int +dbuf_compare(const void *x1, const void *x2) +{ + const dmu_buf_impl_t *d1 = x1; + const dmu_buf_impl_t *d2 = x2; + + if (d1->db_level < d2->db_level) { + return (-1); + } else if (d1->db_level > d2->db_level) { + return (1); + } + + if (d1->db_blkid < d2->db_blkid) { + return (-1); + } else if (d1->db_blkid > d2->db_blkid) { + return (1); + } + + /* + * If a dbuf is being evicted while dn_dbufs_mutex is not held, we set + * the db_state to DB_EVICTING but do not remove it from dn_dbufs. If + * another thread creates a dbuf of the same blkid before the dbuf is + * removed from dn_dbufs, we can reach a state where there are two + * dbufs of the same blkid and level in db_dbufs. To maintain the avl + * invariant that there cannot be duplicate items, we distinguish + * between these two dbufs based on the time they were created. + */ + if (d1->db_creation < d2->db_creation) { + return (-1); + } else if (d1->db_creation > d2->db_creation) { + return (1); + } else { + ASSERT3P(d1, ==, d2); + return (0); + } +} + /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) @@ -113,7 +150,7 @@ dnode_cons(void *arg, void *unused, int dn->dn_dbufs_count = 0; dn->dn_unlisted_l0_blkid = 0; - list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), + avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; @@ -166,7 +203,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_dbufs_count); ASSERT0(dn->dn_unlisted_l0_blkid); - list_destroy(&dn->dn_dbufs); + avl_destroy(&dn->dn_dbufs); } void @@ -502,7 +539,7 @@ dnode_allocate(dnode_t *dn, dmu_object_t ASSERT0(dn->dn_assigned_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { ASSERT0(dn->dn_next_nblkptr[i]); @@ -687,8 +724,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *n ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(refcount_count(&odn->dn_tx_holds) == 0); refcount_transfer(&ndn->dn_holds, &odn->dn_holds); - ASSERT(list_is_empty(&ndn->dn_dbufs)); - list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); + ASSERT(avl_is_empty(&ndn->dn_dbufs)); + avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid; ndn->dn_bonus = odn->dn_bonus; @@ -722,7 +759,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *n */ odn->dn_dbuf = NULL; odn->dn_handle = NULL; - list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), + avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; odn->dn_unlisted_l0_blkid = 0; @@ -1231,7 +1268,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx return; } - ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); + ASSERT(!refcount_is_zero(&dn->dn_holds) || + !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); @@ -1304,7 +1342,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx) int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next; + dmu_buf_impl_t *db; int err; if (size == 0) @@ -1327,9 +1365,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t si goto fail; mutex_enter(&dn->dn_dbufs_mtx); - for (db = list_head(&dn->dn_dbufs); db; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); - + for (db = avl_first(&dn->dn_dbufs); db != NULL; + db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c Tue Jul 29 06:57:13 2014 (r269222) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c Tue Jul 29 07:08:32 2014 (r269223) @@ -400,16 +400,13 @@ dnode_evict_dbufs(dnode_t *dn) int pass = 0; do { - dmu_buf_impl_t *db, marker; + dmu_buf_impl_t *db, *db_next; int evicting = FALSE; progress = FALSE; mutex_enter(&dn->dn_dbufs_mtx); - list_insert_tail(&dn->dn_dbufs, &marker); - db = list_head(&dn->dn_dbufs); - for (; db != ▮ db = list_head(&dn->dn_dbufs)) { - list_remove(&dn->dn_dbufs, db); - list_insert_tail(&dn->dn_dbufs, db); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { + db_next = AVL_NEXT(&dn->dn_dbufs, db); #ifdef DEBUG DB_DNODE_ENTER(db); ASSERT3P(DB_DNODE(db), ==, dn); @@ -429,7 +426,6 @@ dnode_evict_dbufs(dnode_t *dn) } } - list_remove(&dn->dn_dbufs, &marker); /* * NB: we need to drop dn_dbufs_mtx between passes so * that any DB_EVICTING dbufs can make progress. @@ -500,7 +496,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT(avl_is_empty(&dn->dn_dbufs)); ASSERT3P(dn->dn_bonus, ==, NULL); /* Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h Tue Jul 29 06:57:13 2014 (r269222) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h Tue Jul 29 07:08:32 2014 (r269223) @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ @@ -213,11 +213,14 @@ typedef struct dmu_buf_impl { /* pointer to most recent dirty record for this buffer */ dbuf_dirty_record_t *db_last_dirty; + /* Creation time of dbuf (see comment in dbuf_compare). */ + hrtime_t db_creation; + /* * Our link on the owner dnodes's dn_dbufs list. * Protected by its dn_dbufs_mtx. */ - list_node_t db_link; + avl_node_t db_link; /* Data which is unique to data (leaf) blocks: */ Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h Tue Jul 29 06:57:13 2014 (r269222) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h Tue Jul 29 07:08:32 2014 (r269223) @@ -211,7 +211,7 @@ typedef struct dnode { refcount_t dn_holds; kmutex_t dn_dbufs_mtx; - list_t dn_dbufs; /* descendent dbufs */ + avl_tree_t dn_dbufs; /* descendent dbufs */ /* protected by dn_struct_rwlock */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ Modified: vendor-sys/illumos/dist/uts/common/sys/avl.h ============================================================================== --- vendor-sys/illumos/dist/uts/common/sys/avl.h Tue Jul 29 06:57:13 2014 (r269222) +++ vendor-sys/illumos/dist/uts/common/sys/avl.h Tue Jul 29 07:08:32 2014 (r269223) @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + #ifndef _AVL_H #define _AVL_H @@ -260,6 +264,11 @@ extern boolean_t avl_update_lt(avl_tree_ extern boolean_t avl_update_gt(avl_tree_t *, void *); /* + * Swaps the contents of the two trees. + */ +extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2); + +/* * Return the number of nodes in the tree */ extern ulong_t avl_numnodes(avl_tree_t *tree);