From owner-svn-src-all@FreeBSD.ORG Tue Jan 20 20:11:32 2015 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id E5CC6F26; Tue, 20 Jan 2015 20:11:32 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id CF8FB3F0; Tue, 20 Jan 2015 20:11:32 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.9/8.14.9) with ESMTP id t0KKBWAt004788; Tue, 20 Jan 2015 20:11:32 GMT (envelope-from delphij@FreeBSD.org) Received: (from delphij@localhost) by svn.freebsd.org (8.14.9/8.14.9/Submit) id t0KKBULc004776; Tue, 20 Jan 2015 20:11:30 GMT (envelope-from delphij@FreeBSD.org) Message-Id: <201501202011.t0KKBULc004776@svn.freebsd.org> X-Authentication-Warning: svn.freebsd.org: delphij set sender to delphij@FreeBSD.org using -f From: Xin LI Date: Tue, 20 Jan 2015 20:11:30 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org Subject: svn commit: r277428 - in vendor-sys/illumos/dist/uts/common/fs/zfs: . sys X-SVN-Group: vendor-sys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.18-1 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 20 Jan 2015 20:11:33 -0000 Author: delphij Date: Tue Jan 20 20:11:30 2015 New Revision: 277428 URL: https://svnweb.freebsd.org/changeset/base/277428 Log: 5056 ZFS deadlock on db_mtx and dn_holds Reviewed by: Will Andrews Reviewed by: Matt Ahrens Reviewed by: George Wilson Approved by: Dan McDonald Author: Justin Gibbs illumos/illumos-gate@bc9014e6a81272073b9854d9f65dd59e18d18c35 Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_bookmark.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_deadlist.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_deleg.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_destroy.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_prop.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_userhold.c vendor-sys/illumos/dist/uts/common/fs/zfs/sa.c vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_objset.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_dataset.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_dir.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/sa.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/sa_impl.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zap_impl.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zap_leaf.h vendor-sys/illumos/dist/uts/common/fs/zfs/zap.c vendor-sys/illumos/dist/uts/common/fs/zfs/zap_micro.c vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_sa.c vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c Tue Jan 20 20:10:03 2015 (r277427) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c Tue Jan 20 20:11:30 2015 (r277428) @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -54,10 +55,16 @@ static void dbuf_destroy(dmu_buf_impl_t static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); +#ifndef __lint +extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, + dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp); +#endif /* ! __lint */ + /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_cache; +static taskq_t *dbu_evict_taskq; /* ARGSUSED */ static int @@ -215,17 +222,72 @@ dbuf_hash_remove(dmu_buf_impl_t *db) static arc_evict_func_t dbuf_do_evict; +typedef enum { + DBVU_EVICTING, + DBVU_NOT_EVICTING +} dbvu_verify_type_t; + +static void +dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) +{ +#ifdef ZFS_DEBUG + int64_t holds; + + if (db->db_user == NULL) + return; + + /* Only data blocks support the attachment of user data. */ + ASSERT(db->db_level == 0); + + /* Clients must resolve a dbuf before attaching user data. */ + ASSERT(db->db.db_data != NULL); + ASSERT3U(db->db_state, ==, DB_CACHED); + + holds = refcount_count(&db->db_holds); + if (verify_type == DBVU_EVICTING) { + /* + * Immediate eviction occurs when holds == dirtycnt. + * For normal eviction buffers, holds is zero on + * eviction, except when dbuf_fix_old_data() calls + * dbuf_clear_data(). However, the hold count can grow + * during eviction even though db_mtx is held (see + * dmu_bonus_hold() for an example), so we can only + * test the generic invariant that holds >= dirtycnt. + */ + ASSERT3U(holds, >=, db->db_dirtycnt); + } else { + if (db->db_immediate_evict == TRUE) + ASSERT3U(holds, >=, db->db_dirtycnt); + else + ASSERT3U(holds, >, 0); + } +#endif +} + static void dbuf_evict_user(dmu_buf_impl_t *db) { + dmu_buf_user_t *dbu = db->db_user; + ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level != 0 || db->db_evict_func == NULL) + if (dbu == NULL) return; - db->db_evict_func(&db->db, db->db_user_ptr); - db->db_user_ptr = NULL; - db->db_evict_func = NULL; + dbuf_verify_user(db, DBVU_EVICTING); + db->db_user = NULL; + +#ifdef ZFS_DEBUG + if (dbu->dbu_clear_on_evict_dbufp != NULL) + *dbu->dbu_clear_on_evict_dbufp = NULL; +#endif + + /* + * Invoke the callback from a taskq to avoid lock order reversals + * and limit stack depth. + */ + taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, + &dbu->dbu_tqent); } boolean_t @@ -286,6 +348,12 @@ retry: for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + + /* + * All entries are queued via taskq_dispatch_ent(), so min/maxalloc + * configuration is not required. + */ + dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); } void @@ -298,6 +366,7 @@ dbuf_fini(void) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); kmem_cache_destroy(dbuf_cache); + taskq_destroy(dbu_evict_taskq); } /* @@ -415,21 +484,27 @@ dbuf_verify(dmu_buf_impl_t *db) #endif static void +dbuf_clear_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + dbuf_evict_user(db); + db->db_buf = NULL; + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; +} + +static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(buf != NULL); + db->db_buf = buf; - if (buf != NULL) { - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); - } else { - dbuf_evict_user(db); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; - } + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); } /* @@ -451,7 +526,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); - dbuf_set_data(db, NULL); + dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); @@ -687,7 +762,7 @@ dbuf_noread(dmu_buf_impl_t *db) dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { - dbuf_set_data(db, NULL); + dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } @@ -743,7 +818,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, ui dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { - dbuf_set_data(db, NULL); + dbuf_clear_data(db); } } @@ -794,7 +869,8 @@ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next, db_search; + dmu_buf_impl_t db_search; + dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; @@ -1370,7 +1446,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_ arc_buf_t *buf = db->db_buf; ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_set_data(db, NULL); + dbuf_clear_data(db); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); return (B_TRUE); @@ -1710,8 +1786,7 @@ dbuf_create(dnode_t *dn, uint8_t level, db->db_parent = parent; db->db_blkptr = blkptr; - db->db_user_ptr = NULL; - db->db_evict_func = NULL; + db->db_user = NULL; db->db_immediate_evict = 0; db->db_freed_in_flight = 0; @@ -2114,7 +2189,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, /* * This dbuf has anonymous data associated with it. */ - dbuf_set_data(db, NULL); + dbuf_clear_data(db); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); } else { @@ -2147,7 +2222,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, } else { dbuf_clear(db); } - } else if (arc_buf_eviction_needed(db->db_buf)) { + } else if (db->db_objset->os_evicting || + arc_buf_eviction_needed(db->db_buf)) { dbuf_clear(db); } else { mutex_exit(&db->db_mtx); @@ -2166,51 +2242,57 @@ dbuf_refcount(dmu_buf_impl_t *db) } void * -dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user) { - return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + mutex_enter(&db->db_mtx); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + if (db->db_user == old_user) + db->db_user = new_user; + else + old_user = db->db_user; + dbuf_verify_user(db, DBVU_NOT_EVICTING); + mutex_exit(&db->db_mtx); + + return (old_user); } void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_immediate_evict = TRUE; - return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); + return (dmu_buf_replace_user(db_fake, NULL, user)); } void * -dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_level == 0); - - ASSERT((user_ptr == NULL) == (evict_func == NULL)); - - mutex_enter(&db->db_mtx); - if (db->db_user_ptr == old_user_ptr) { - db->db_user_ptr = user_ptr; - db->db_evict_func = evict_func; - } else { - old_user_ptr = db->db_user_ptr; - } + db->db_immediate_evict = TRUE; + return (dmu_buf_set_user(db_fake, user)); +} - mutex_exit(&db->db_mtx); - return (old_user_ptr); +void * +dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(!refcount_is_zero(&db->db_holds)); - return (db->db_user_ptr); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + return (db->db_user); +} + +void +dmu_buf_user_evict_wait() +{ + taskq_wait(dbu_evict_taskq); } boolean_t Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c Tue Jan 20 20:10:03 2015 (r277427) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c Tue Jan 20 20:11:30 2015 (r277428) @@ -23,6 +23,7 @@ * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -355,7 +356,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), @@ -417,7 +418,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat os->os_secondary_cache = ZFS_CACHE_ALL; } - if (ds == NULL || !dsl_dataset_is_snapshot(ds)) + if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); @@ -436,16 +437,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - DMU_META_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, - &os->os_meta_dnode); + dnode_special_open(os, &os->os_phys->os_meta_dnode, + DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - DMU_USERUSED_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, - &os->os_userused_dnode); - DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, - &os->os_groupused_dnode); + dnode_special_open(os, &os->os_phys->os_userused_dnode, + DMU_USERUSED_OBJECT, &os->os_userused_dnode); + dnode_special_open(os, &os->os_phys->os_groupused_dnode, + DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } *osp = os; @@ -533,7 +531,7 @@ dmu_objset_own(const char *name, dmu_obj } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EINVAL)); - } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + } else if (!readonly && ds->ds_is_snapshot) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EROFS)); } @@ -589,41 +587,53 @@ dmu_objset_disown(objset_t *os, void *ta void dmu_objset_evict_dbufs(objset_t *os) { + dnode_t dn_marker; dnode_t *dn; mutex_enter(&os->os_lock); - - /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&os->os_dnodes, DMU_META_DNODE(os)); - list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); - - /* - * Find the first dnode with holds. We have to do this dance - * because dnode_add_ref() only works if you already have a - * hold. If there are no holds then it has no dbufs so OK to - * skip. - */ - for (dn = list_head(&os->os_dnodes); - dn && !dnode_add_ref(dn, FTAG); - dn = list_next(&os->os_dnodes, dn)) - continue; - - while (dn) { - dnode_t *next_dn = dn; - - do { - next_dn = list_next(&os->os_dnodes, next_dn); - } while (next_dn && !dnode_add_ref(next_dn, FTAG)); - - mutex_exit(&os->os_lock); - dnode_evict_dbufs(dn); - dnode_rele(dn, FTAG); - mutex_enter(&os->os_lock); - dn = next_dn; + dn = list_head(&os->os_dnodes); + while (dn != NULL) { + /* + * Skip dnodes without holds. We have to do this dance + * because dnode_add_ref() only works if there is already a + * hold. If the dnode has no holds, then it has no dbufs. + */ + if (dnode_add_ref(dn, FTAG)) { + list_insert_after(&os->os_dnodes, dn, &dn_marker); + mutex_exit(&os->os_lock); + + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); + + mutex_enter(&os->os_lock); + dn = list_next(&os->os_dnodes, &dn_marker); + list_remove(&os->os_dnodes, &dn_marker); + } else { + dn = list_next(&os->os_dnodes, dn); + } } mutex_exit(&os->os_lock); + + if (DMU_USERUSED_DNODE(os) != NULL) { + dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); + dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); + } + dnode_evict_dbufs(DMU_META_DNODE(os)); } +/* + * Objset eviction processing is split into into two pieces. + * The first marks the objset as evicting, evicts any dbufs that + * have a refcount of zero, and then queues up the objset for the + * second phase of eviction. Once os->os_dnodes has been cleared by + * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. + * The second phase closes the special dnodes, dequeues the objset from + * the list of those undergoing eviction, and finally frees the objset. + * + * NOTE: Due to asynchronous eviction processing (invocation of + * dnode_buf_pageout()), it is possible for the meta dnode for the + * objset to have no holds even though os->os_dnodes is not empty. + */ void dmu_objset_evict(objset_t *os) { @@ -633,7 +643,7 @@ dmu_objset_evict(objset_t *os) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); @@ -670,8 +680,24 @@ dmu_objset_evict(objset_t *os) if (os->os_sa) sa_tear_down(os); + os->os_evicting = B_TRUE; dmu_objset_evict_dbufs(os); + mutex_enter(&os->os_lock); + spa_evicting_os_register(os->os_spa, os); + if (list_is_empty(&os->os_dnodes)) { + mutex_exit(&os->os_lock); + dmu_objset_evict_done(os); + } else { + mutex_exit(&os->os_lock); + } +} + +void +dmu_objset_evict_done(objset_t *os) +{ + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); @@ -679,8 +705,6 @@ dmu_objset_evict(objset_t *os) } zil_free(os->os_zil); - ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* @@ -695,6 +719,7 @@ dmu_objset_evict(objset_t *os) mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); + spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } @@ -903,7 +928,7 @@ dmu_objset_clone_check(void *arg, dmu_tx } /* You can only clone snapshots, not the head datasets. */ - if (!dsl_dataset_is_snapshot(origin)) { + if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } @@ -1467,7 +1492,7 @@ int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); + return (os->os_dsl_dataset->ds_is_snapshot); else return (B_FALSE); } Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c Tue Jan 20 20:10:03 2015 (r277427) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c Tue Jan 20 20:11:30 2015 (r277428) @@ -611,7 +611,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, fromtxg = fromzb->zbm_creation_txg; } dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } @@ -818,7 +818,7 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl ASSERT(dsl_pool_config_held(dp)); /* tosnap must be a snapshot */ - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* @@ -1065,7 +1065,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t dsl_dataset_rele(ds, FTAG); return (error); } - if (!dsl_dataset_is_snapshot(origin)) { + if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c Tue Jan 20 20:10:03 2015 (r277427) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c Tue Jan 20 20:11:30 2015 (r277428) @@ -532,7 +532,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); /* See comment on ZIL traversal in dsl_scan_visitds. */ - if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { + if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c Tue Jan 20 20:10:03 2015 (r277427) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c Tue Jan 20 20:11:30 2015 (r277428) @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -402,8 +403,9 @@ static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + dnode_t *dn; + dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(dn->dn_objset)); dn->dn_moved = 0; @@ -440,13 +442,31 @@ dnode_create(objset_t *os, dnode_phys_t ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); - list_insert_head(&os->os_dnodes, dn); + if (dnh->dnh_dnode != NULL) { + /* Lost the allocation race. */ + mutex_exit(&os->os_lock); + kmem_cache_free(dnode_cache, dn); + return (dnh->dnh_dnode); + } + + /* + * Exclude special dnodes from os_dnodes so an empty os_dnodes + * signifies that the special dnodes have no references from + * their children (the entries in os_dnodes). This allows + * dnode_destroy() to easily determine if the last child has + * been removed and then complete eviction of the objset. + */ + if (!DMU_OBJECT_IS_SPECIAL(object)) + list_insert_head(&os->os_dnodes, dn); membar_producer(); + /* - * Everything else must be valid before assigning dn_objset makes the - * dnode eligible for dnode_move(). + * Everything else must be valid before assigning dn_objset + * makes the dnode eligible for dnode_move(). */ dn->dn_objset = os; + + dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); @@ -460,12 +480,18 @@ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; + boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); - list_remove(&os->os_dnodes, dn); + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + list_remove(&os->os_dnodes, dn); + complete_os_eviction = + list_is_empty(&os->os_dnodes) && + list_link_active(&os->os_evicting_node); + } mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ @@ -500,6 +526,9 @@ dnode_destroy(dnode_t *dn) dmu_zfetch_rele(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); + + if (complete_os_eviction) + dmu_objset_evict_done(os); } void @@ -966,33 +995,32 @@ dnode_special_close(dnode_handle_t *dnh) */ while (refcount_count(&dn->dn_holds) > 0) delay(1); + ASSERT(dn->dn_dbuf == NULL || + dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } -dnode_t * +void dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); - dnh->dnh_dnode = dn; + dnode_t *dn; + + dn = dnode_create(os, dnp, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); - return (dn); } static void -dnode_buf_pageout(dmu_buf_t *db, void *arg) +dnode_buf_pageout(void *dbu) { - dnode_children_t *children_dnodes = arg; + dnode_children_t *children_dnodes = dbu; int i; - int epb = db->db_size >> DNODE_SHIFT; - ASSERT(epb == children_dnodes->dnc_count); - - for (i = 0; i < epb; i++) { + for (i = 0; i < children_dnodes->dnc_count; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; @@ -1022,7 +1050,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *a dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + - epb * sizeof (dnode_handle_t)); + children_dnodes->dnc_count * sizeof (dnode_handle_t)); } /* @@ -1106,16 +1134,17 @@ dnode_hold_impl(objset_t *os, uint64_t o if (children_dnodes == NULL) { int i; dnode_children_t *winner; - children_dnodes = kmem_alloc(sizeof (dnode_children_t) + + children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); children_dnodes->dnc_count = epb; dnh = &children_dnodes->dnc_children[0]; for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); - dnh[i].dnh_dnode = NULL; } - if (winner = dmu_buf_set_user(&db->db, children_dnodes, - dnode_buf_pageout)) { + dmu_buf_init_user(&children_dnodes->dnc_dbu, + dnode_buf_pageout, NULL); + winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); + if (winner != NULL) { for (i = 0; i < epb; i++) { zrl_destroy(&dnh[i].dnh_zrlock); @@ -1130,17 +1159,11 @@ dnode_hold_impl(objset_t *os, uint64_t o dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); - if ((dn = dnh->dnh_dnode) == NULL) { + dn = dnh->dnh_dnode; + if (dn == NULL) { dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; - dnode_t *winner; dn = dnode_create(os, phys, db, object, dnh); - winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); - if (winner != NULL) { - zrl_add(&dnh->dnh_zrlock); - dnode_destroy(dn); /* implicit zrl_remove() */ - dn = winner; - } } mutex_enter(&dn->dn_mtx); @@ -1154,10 +1177,10 @@ dnode_hold_impl(objset_t *os, uint64_t o dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } - mutex_exit(&dn->dn_mtx); - if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); + mutex_exit(&dn->dn_mtx); + /* Now we can rely on the hold to prevent the dnode from moving. */ zrl_remove(&dnh->dnh_zrlock); Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c Tue Jan 20 20:10:03 2015 (r277427) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c Tue Jan 20 20:11:30 2015 (r277428) @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -396,49 +397,37 @@ dnode_sync_free_range(void *arg, uint64_ void dnode_evict_dbufs(dnode_t *dn) { - int progress; - int pass = 0; + dmu_buf_impl_t db_marker; + dmu_buf_impl_t *db, *db_next; + + mutex_enter(&dn->dn_dbufs_mtx); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { - do { - dmu_buf_impl_t *db, *db_next; - int evicting = FALSE; - - progress = FALSE; - mutex_enter(&dn->dn_dbufs_mtx); - for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { - db_next = AVL_NEXT(&dn->dn_dbufs, db); #ifdef DEBUG - DB_DNODE_ENTER(db); - ASSERT3P(DB_DNODE(db), ==, dn); - DB_DNODE_EXIT(db); + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); #endif /* DEBUG */ - mutex_enter(&db->db_mtx); - if (db->db_state == DB_EVICTING) { - progress = TRUE; - evicting = TRUE; - mutex_exit(&db->db_mtx); - } else if (refcount_is_zero(&db->db_holds)) { - progress = TRUE; - dbuf_clear(db); /* exits db_mtx for us */ - } else { - mutex_exit(&db->db_mtx); - } + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING && + refcount_is_zero(&db->db_holds)) { + db_marker.db_level = db->db_level; + db_marker.db_blkid = db->db_blkid; + db_marker.db_state = DB_SEARCH; + avl_insert_here(&dn->dn_dbufs, &db_marker, db, + AVL_BEFORE); + + dbuf_clear(db); + db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); + avl_remove(&dn->dn_dbufs, &db_marker); + } else { + mutex_exit(&db->db_mtx); + db_next = AVL_NEXT(&dn->dn_dbufs, db); } - /* - * NB: we need to drop dn_dbufs_mtx between passes so - * that any DB_EVICTING dbufs can make progress. - * Ideally, we would have some cv we could wait on, but - * since we don't, just wait a bit to give the other - * thread a chance to run. - */ - mutex_exit(&dn->dn_dbufs_mtx); - if (evicting) - delay(1); - pass++; - ASSERT(pass < 100); /* sanity check */ - } while (progress); + } + mutex_exit(&dn->dn_dbufs_mtx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { @@ -497,7 +486,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); ASSERT(avl_is_empty(&dn->dn_dbufs)); - ASSERT3P(dn->dn_bonus, ==, NULL); /* * XXX - It would be nice to assert this, but we may still Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_bookmark.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_bookmark.c Tue Jan 20 20:10:03 2015 (r277427) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_bookmark.c Tue Jan 20 20:11:30 2015 (r277428) @@ -120,7 +120,7 @@ dsl_bookmark_create_check_impl(dsl_datas int error; zfs_bookmark_phys_t bmark_phys; - if (!dsl_dataset_is_snapshot(snapds)) + if (!snapds->ds_is_snapshot) return (SET_ERROR(EINVAL)); error = dsl_bookmark_hold_ds(dp, bookmark_name, Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c Tue Jan 20 20:10:03 2015 (r277427) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c Tue Jan 20 20:11:30 2015 (r277428) @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -71,7 +72,6 @@ int zfs_max_recordsize = 1 * 1024 * 1024 #define DS_REF_MAX (1ULL << 62) extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); -extern inline boolean_t dsl_dataset_is_snapshot(dsl_dataset_t *ds); /* * Figure out how much of this delta should be propogated to the dsl_dir @@ -155,7 +155,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); - ASSERT(!dsl_dataset_is_snapshot(ds)); + ASSERT(!ds->ds_is_snapshot); dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { @@ -253,14 +253,15 @@ dsl_dataset_block_freeable(dsl_dataset_t return (B_TRUE); } -/* ARGSUSED */ static void -dsl_dataset_evict(dmu_buf_t *db, void *dsv) +dsl_dataset_evict(void *dbu) { - dsl_dataset_t *ds = dsv; + dsl_dataset_t *ds = dbu; ASSERT(ds->ds_owner == NULL); + ds->ds_dbuf = NULL; + unique_remove(ds->ds_fsid_guid); if (ds->ds_objset != NULL) @@ -272,10 +273,10 @@ dsl_dataset_evict(dmu_buf_t *db, void *d } bplist_destroy(&ds->ds_pending_deadlist); - if (dsl_dataset_phys(ds)->ds_deadlist_obj != 0) + if (ds->ds_deadlist.dl_os != NULL) dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_dir) - dsl_dir_rele(ds->ds_dir, ds); + dsl_dir_async_rele(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); @@ -389,6 +390,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uin ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; + ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); @@ -427,7 +429,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uin return (err); } - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { ds->ds_snapname[0] = '\0'; if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { err = dsl_dataset_hold_obj(dp, @@ -454,7 +456,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uin } } - if (err == 0 && !dsl_dataset_is_snapshot(ds)) { + if (err == 0 && !ds->ds_is_snapshot) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &ds->ds_reserved); @@ -467,8 +469,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uin ds->ds_reserved = ds->ds_quota = 0; } - if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds, - dsl_dataset_evict)) != NULL) { + dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf); + if (err == 0) + winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); + + if (err != 0 || winner != NULL) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) @@ -848,7 +853,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; - ASSERT(!dsl_dataset_is_snapshot(ds)); + ASSERT(!ds->ds_is_snapshot); if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; @@ -1591,7 +1596,7 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvl dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, dsl_dataset_phys(ds)->ds_uncompressed_bytes); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dsl_dataset_phys(ds)->ds_unique_bytes); @@ -1659,7 +1664,7 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT; stat->dds_guid = dsl_dataset_phys(ds)->ds_guid; stat->dds_origin[0] = '\0'; - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = dsl_dataset_phys(ds)->ds_num_children - 1; @@ -1919,7 +1924,7 @@ dsl_dataset_rollback_check(void *arg, dm return (error); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***