From owner-svn-src-stable@FreeBSD.ORG Tue Apr 30 10:05:49 2013
Message-Id: <201304301005.r3UA5mbA069223@svn.freebsd.org>
From: Martin Matuska <mm@FreeBSD.org>
Date: Tue, 30 Apr 2013 10:05:48 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
	svn-src-stable@freebsd.org, svn-src-stable-9@freebsd.org
Subject: svn commit: r250098 - in
	stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
X-SVN-Group: stable-9

Author: mm
Date: Tue Apr 30 10:05:47 2013
New Revision: 250098
URL: http://svnweb.freebsd.org/changeset/base/250098

Log:
  MFC r249858:

  Merge vendor bugfix for a possible deadlock related to async destroy
  and improve write performance by introducing a new lock protecting
  tx_open_txg.
  Illumos ZFS issues:
    3642 dsl_scan_active() should not issue I/O to determine if async
         destroying is active
    3643 txg_delay should not hold the tc_lock

Modified:
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c

Directory Properties:
  stable/9/sys/   (props changed)
  stable/9/sys/cddl/contrib/opensolaris/   (props changed)

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c	Tue Apr 30 08:33:38 2013	(r250097)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c	Tue Apr 30 10:05:47 2013	(r250098)
@@ -753,12 +753,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t
 	zil_destroy_sync(dmu_objset_zil(os), tx);
 
 	if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
+		dsl_scan_t *scn = dp->dp_scan;
+
 		spa_feature_incr(dp->dp_spa, async_destroy, tx);
 		dp->dp_bptree_obj = bptree_alloc(mos, tx);
 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 		    &dp->dp_bptree_obj, tx));
+		ASSERT(!scn->scn_async_destroying);
+		scn->scn_async_destroying = B_TRUE;
 	}
 
 	used = ds->ds_dir->dd_phys->dd_used_bytes;

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	Tue Apr 30 08:33:38 2013	(r250097)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	Tue Apr 30 10:05:47 2013	(r250098)
@@ -125,6 +125,15 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t t
 	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
 	scn->scn_dp = dp;
 
+	/*
+	 * It's possible that we're resuming a scan after a reboot so
+	 * make sure that the scan_async_destroying flag is initialized
+	 * appropriately.
+	 */
+	ASSERT(!scn->scn_async_destroying);
+	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
+	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
+
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    "scrub_func", sizeof (uint64_t), 1, &f);
 	if (err == 0) {

@@ -1374,13 +1383,10 @@ dsl_scan_active(dsl_scan_t *scn)
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
 
-	if (scn->scn_phys.scn_state == DSS_SCANNING)
+	if (scn->scn_phys.scn_state == DSS_SCANNING ||
+	    scn->scn_async_destroying)
 		return (B_TRUE);
 
-	if (spa_feature_is_active(spa,
-	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
-		return (B_TRUE);
-	}
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
 		    &used, &comp, &uncomp);

@@ -1436,6 +1442,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *
 
 	if (err == 0 && spa_feature_is_active(spa,
 	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+		ASSERT(scn->scn_async_destroying);
 		scn->scn_is_bptree = B_TRUE;
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);

@@ -1456,6 +1463,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *
 			VERIFY0(bptree_free(dp->dp_meta_objset,
 			    dp->dp_bptree_obj, tx));
 			dp->dp_bptree_obj = 0;
+			scn->scn_async_destroying = B_FALSE;
 		}
 	}
 	if (scn->scn_visited_this_txg) {

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h	Tue Apr 30 08:33:38 2013	(r250097)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h	Tue Apr 30 10:05:47 2013	(r250098)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_SCAN_H

@@ -82,6 +82,7 @@ typedef struct dsl_scan {
 
 	/* for freeing blocks */
 	boolean_t scn_is_bptree;
+	boolean_t scn_async_destroying;
 
 	/* for debugging / information */
 	uint64_t scn_visited_this_txg;

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h	Tue Apr 30 08:33:38 2013	(r250097)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h	Tue Apr 30 10:05:47 2013	(r250098)
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_TXG_IMPL_H
 #define	_SYS_TXG_IMPL_H

@@ -33,14 +37,55 @@
 extern "C" {
 #endif
 
+/*
+ * The tx_cpu structure is a per-cpu structure that is used to track
+ * the number of active transaction holds (tc_count). As transactions
+ * are assigned into a transaction group the appropriate tc_count is
+ * incremented to indicate that there are pending changes that have yet
+ * to quiesce. Consumers eventually call txg_rele_to_sync() to decrement
+ * the tc_count. A transaction group is not considered quiesced until all
+ * tx_cpu structures have reached a tc_count of zero.
+ *
+ * This structure is a per-cpu structure by design. Updates to this structure
+ * are frequent and concurrent. Having a single structure would result in
+ * heavy lock contention so a per-cpu design was implemented. With the fanned
+ * out mutex design, consumers only need to lock the mutex associated with
+ * the thread's cpu.
+ *
+ * The tx_cpu contains two locks, the tc_lock and tc_open_lock.
+ * The tc_lock is used to protect all members of the tx_cpu structure with
+ * the exception of the tc_open_lock. This lock should only be held for a
+ * short period of time, typically when updating the value of tc_count.
+ *
+ * The tc_open_lock protects the tx_open_txg member of the tx_state structure.
+ * This lock is used to ensure that transactions are only assigned into
+ * the current open transaction group. In order to move the current open
+ * transaction group to the quiesce phase, the txg_quiesce thread must
+ * grab all tc_open_locks, increment the tx_open_txg, and drop the locks.
+ * The tc_open_lock is held until the transaction is assigned into the
+ * transaction group. Typically, this is a short operation but if throttling
+ * is occurring it may be held for longer periods of time.
+ */
 struct tx_cpu {
-	kmutex_t	tc_lock;
+	kmutex_t	tc_open_lock;	/* protects tx_open_txg */
+	kmutex_t	tc_lock;	/* protects the rest of this struct */
 	kcondvar_t	tc_cv[TXG_SIZE];
 	uint64_t	tc_count[TXG_SIZE];
 	list_t		tc_callbacks[TXG_SIZE];	/* commit cb list */
-	char		tc_pad[16];
+	char		tc_pad[8];		/* pad to fill 3 cache lines */
 };

+/*
+ * The tx_state structure maintains the state information about the different
+ * stages of the pool's transaction groups. A per pool tx_state structure
+ * is used to track this information. The tx_state structure also points to
+ * an array of tx_cpu structures (described above). Although the tx_sync_lock
+ * is used to protect the members of this structure, it is not used to
+ * protect the tx_open_txg. Instead a special lock in the tx_cpu structure
+ * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock.
+ * Any thread wishing to update tx_open_txg must grab the tc_open_lock on
+ * every cpu (see txg_quiesce()).
+ */
 typedef struct tx_state {
 	tx_cpu_t	*tx_cpu;	/* protects right to enter txg */
 	kmutex_t	tx_sync_lock;	/* protects tx_state_t */

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c	Tue Apr 30 08:33:38 2013	(r250097)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c	Tue Apr 30 10:05:47 2013	(r250098)
@@ -132,6 +132,8 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
 		int i;
 
 		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
+		    NULL);
 		for (i = 0; i < TXG_SIZE; i++) {
 			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
 			    NULL);

@@ -174,6 +176,7 @@ txg_fini(dsl_pool_t *dp)
 	for (c = 0; c < max_ncpus; c++) {
 		int i;
 
+		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
 		mutex_destroy(&tx->tx_cpu[c].tc_lock);
 		for (i = 0; i < TXG_SIZE; i++) {
 			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);

@@ -297,10 +300,12 @@ txg_hold_open(dsl_pool_t *dp, txg_handle
 	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
 	uint64_t txg;
 
-	mutex_enter(&tc->tc_lock);
-
+	mutex_enter(&tc->tc_open_lock);
 	txg = tx->tx_open_txg;
+
+	mutex_enter(&tc->tc_lock);
 	tc->tc_count[txg & TXG_MASK]++;
+	mutex_exit(&tc->tc_lock);
 
 	th->th_cpu = tc;
 	th->th_txg = txg;

@@ -313,7 +318,8 @@ txg_rele_to_quiesce(txg_handle_t *th)
 {
 	tx_cpu_t *tc = th->th_cpu;
 
-	mutex_exit(&tc->tc_lock);
+	ASSERT(!MUTEX_HELD(&tc->tc_lock));
+	mutex_exit(&tc->tc_open_lock);
 }
 
 void

@@ -350,10 +356,10 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg
 	int c;
 
 	/*
-	 * Grab all tx_cpu locks so nobody else can get into this txg.
+	 * Grab all tc_open_locks so nobody else can get into this txg.
 	 */
 	for (c = 0; c < max_ncpus; c++)
-		mutex_enter(&tx->tx_cpu[c].tc_lock);
+		mutex_enter(&tx->tx_cpu[c].tc_open_lock);
 
 	ASSERT(txg == tx->tx_open_txg);
 	tx->tx_open_txg++;

@@ -363,7 +369,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg
 	 * enter the next transaction group.
 	 */
 	for (c = 0; c < max_ncpus; c++)
-		mutex_exit(&tx->tx_cpu[c].tc_lock);
+		mutex_exit(&tx->tx_cpu[c].tc_open_lock);
 
 	/*
 	 * Quiesce the transaction group by waiting for everyone to txg_exit().
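
A note for readers following the change: the 3642 fix is a caching
pattern. dsl_scan_active() used to call spa_feature_is_active(), which
can issue I/O, to decide whether an async destroy was in flight; the
patch instead keeps that answer in the in-memory scn_async_destroying
flag and updates it at the three points where the state can change
(pool init, destroy start, destroy completion). Below is a minimal
sketch of the pattern in plain C; scan_state_t and
feature_is_active_expensive() are hypothetical stand-ins, not the real
ZFS identifiers.

#include <stdbool.h>

typedef struct scan_state {
	bool	scn_async_destroying;	/* cached; reading it costs no I/O */
} scan_state_t;

/* Hypothetical stand-in for the on-disk feature check that costs I/O. */
static bool
feature_is_active_expensive(void)
{
	return (false);	/* pretend we consulted on-disk state here */
}

/* At init (e.g. after a reboot), seed the cache from disk exactly once. */
static void
scan_init(scan_state_t *scn)
{
	scn->scn_async_destroying = feature_is_active_expensive();
}

/* When an async destroy begins (cf. dsl_destroy_head_sync_impl). */
static void
async_destroy_begin(scan_state_t *scn)
{
	scn->scn_async_destroying = true;
}

/* When the last queued blocks are freed (cf. dsl_scan_sync). */
static void
async_destroy_done(scan_state_t *scn)
{
	scn->scn_async_destroying = false;
}

/* The hot-path query (cf. dsl_scan_active) is now a plain memory read. */
static bool
scan_active(const scan_state_t *scn)
{
	return (scn->scn_async_destroying);
}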
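
The 3643 half of the change is the lock split documented in the new
txg_impl.h comment: the long-held per-cpu lock taken while a
transaction is being assigned (tc_open_lock) is separated from the
short-held lock protecting the hold counts (tc_lock), so a writer
stalled during assignment no longer blocks count updates or releases on
the same cpu. The following is a minimal user-space sketch of that
two-lock protocol under stated assumptions: pthreads stand in for
kernel kmutex_t, and all names (txc_cpu_t, hold_open, NCPUS) are
illustrative, not the ZFS identifiers.

#include <pthread.h>
#include <stdint.h>

#define	NCPUS		4
#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)

typedef struct txc_cpu {
	pthread_mutex_t	txc_open_lock;	/* may be held while throttled */
	pthread_mutex_t	txc_lock;	/* short-held; protects txc_count */
	uint64_t	txc_count[TXG_SIZE];
} txc_cpu_t;

static txc_cpu_t txc[NCPUS];
static uint64_t tx_open_txg = 1;

static void
txc_init(void)
{
	for (int c = 0; c < NCPUS; c++) {
		pthread_mutex_init(&txc[c].txc_open_lock, NULL);
		pthread_mutex_init(&txc[c].txc_lock, NULL);
	}
}

/*
 * Open-phase hold (cf. txg_hold_open): pin the open txg by taking this
 * cpu's open lock, then bump the hold count under the short-held lock.
 * Only the open lock stays held until rele_to_quiesce().
 */
static uint64_t
hold_open(int cpu)
{
	txc_cpu_t *tc = &txc[cpu];

	pthread_mutex_lock(&tc->txc_open_lock);
	uint64_t txg = tx_open_txg;

	pthread_mutex_lock(&tc->txc_lock);
	tc->txc_count[txg & TXG_MASK]++;
	pthread_mutex_unlock(&tc->txc_lock);

	return (txg);
}

/* cf. txg_rele_to_quiesce: only the open lock is still held here. */
static void
rele_to_quiesce(int cpu)
{
	pthread_mutex_unlock(&txc[cpu].txc_open_lock);
}

/*
 * cf. txg_quiesce: advancing tx_open_txg requires every cpu's open
 * lock, but never the short-held count locks, so hold-count updates by
 * other threads proceed unimpeded while the txg is being advanced.
 */
static void
advance_open_txg(void)
{
	for (int c = 0; c < NCPUS; c++)
		pthread_mutex_lock(&txc[c].txc_open_lock);
	tx_open_txg++;
	for (int c = 0; c < NCPUS; c++)
		pthread_mutex_unlock(&txc[c].txc_open_lock);
}

The payoff mirrors the commit log: in the old code the single tc_lock
was held from txg_hold_open() until the transaction was assigned, so a
writer sleeping in the delay/throttle path stalled everyone needing
that lock; after the split, only the dedicated open lock is held that
long.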