From owner-svn-src-all@FreeBSD.ORG Thu Jan 16 15:57:41 2014
Message-Id: <201401161557.s0GFvdOZ069652@svn.freebsd.org>
From: Andriy Gapon <avg@svn.freebsd.org>
Date: Thu, 16 Jan 2014 15:57:39 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
	svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org
Subject: svn commit: r260763 - in stable/10: cddl/contrib/opensolaris/cmd/ztest
	cddl/contrib/opensolaris/lib/libzpool/common/sys
	sys/cddl/compat/opensolaris/sys
	sys/cddl/contrib/opensolaris/uts/common/fs/zf...
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Author: avg
Date: Thu Jan 16 15:57:39 2014
New Revision: 260763
URL: http://svnweb.freebsd.org/changeset/base/260763

Log:
  MFC r258632,258704: MFV r255255:
  4045 zfs write throttle & i/o scheduler performance work

  Sponsored by:	HybridCluster [merge]

Added:
  stable/10/sys/cddl/compat/opensolaris/sys/disp.h
     - copied unchanged from r258632, head/sys/cddl/compat/opensolaris/sys/disp.h
Modified:
  stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  stable/10/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Thu Jan 16 15:47:09 2014	(r260762)
+++ stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Thu Jan 16 15:57:39 2014	(r260763)
@@ -186,7 +186,7 @@ static const ztest_shared_opts_t ztest_o
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
-extern uint64_t zfs_deadman_synctime;
+extern uint64_t zfs_deadman_synctime_ms;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -5328,10 +5328,10 @@ ztest_deadman_thread(void *arg)
 	hrtime_t delta, total = 0;
 
 	for (;;) {
-		delta = (zs->zs_thread_stop - zs->zs_thread_start) /
-		    NANOSEC + zfs_deadman_synctime;
+		delta = zs->zs_thread_stop - zs->zs_thread_start +
+		    MSEC2NSEC(zfs_deadman_synctime_ms);
 
-		(void) poll(NULL, 0, (int)(1000 * delta));
+		(void) poll(NULL, 0, (int)NSEC2MSEC(delta));
 
 		/*
		 * If the pool is suspended then fail immediately. Otherwise,
@@ -5342,12 +5342,12 @@ ztest_deadman_thread(void *arg)
 		if (spa_suspended(spa)) {
 			fatal(0, "aborting test after %llu seconds because "
 			    "pool has transitioned to a suspended state.",
-			    zfs_deadman_synctime);
+			    zfs_deadman_synctime_ms / 1000);
 			return (NULL);
 		}
 		vdev_deadman(spa->spa_root_vdev);
 
-		total += zfs_deadman_synctime;
+		total += zfs_deadman_synctime_ms/1000;
 		(void) printf("ztest has been running for %lld seconds\n",
 		    total);
 	}
@@ -6080,7 +6080,7 @@ main(int argc, char **argv)
 	(void) setvbuf(stdout, NULL, _IOLBF, 0);
 
 	dprintf_setup(&argc, argv);
-	zfs_deadman_synctime = 300;
+	zfs_deadman_synctime_ms = 300000;
 
 	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
 	ASSERT3S(ztest_fd_rand, >=, 0);
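A note on the unit change above: the deadman default moves from seconds
(zfs_deadman_synctime = 300) to milliseconds (zfs_deadman_synctime_ms =
300000), and the thread now keeps its arithmetic in nanoseconds until the
poll() call. A minimal standalone sketch of that arithmetic, with the
conversion macros copied in so it compiles on its own (the real definitions
live in the OpenSolaris compat sys/time.h):

    #include <stdio.h>
    #include <stdint.h>

    typedef int64_t hrtime_t;

    #define MILLISEC    1000
    #define NANOSEC     1000000000

    /* Same shape as the illumos sys/time.h helpers used in the diff. */
    #define MSEC2NSEC(m)    ((hrtime_t)(m) * (NANOSEC / MILLISEC))
    #define NSEC2MSEC(n)    ((n) / (NANOSEC / MILLISEC))

    int
    main(void)
    {
        hrtime_t synctime_ms = 300000;  /* new default set in main() above */
        hrtime_t ns = MSEC2NSEC(synctime_ms);

        /* 300000 ms -> 300000000000 ns -> back to 300000 ms (300 s) */
        printf("%lld ns = %lld ms = %lld s\n", (long long)ns,
            (long long)NSEC2MSEC(ns), (long long)(synctime_ms / 1000));
        return (0);
    }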
Modified: stable/10/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
==============================================================================
--- stable/10/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h	Thu Jan 16 15:47:09 2014	(r260762)
+++ stable/10/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h	Thu Jan 16 15:57:39 2014	(r260763)
@@ -65,6 +65,7 @@ extern "C" {
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -204,6 +205,8 @@ extern int aok;
  */
 #define	curthread	((void *)(uintptr_t)thr_self())
 
+#define	kpreempt(x)	sched_yield()
+
 typedef struct kthread kthread_t;
 
 #define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\

Copied: stable/10/sys/cddl/compat/opensolaris/sys/disp.h (from r258632, head/sys/cddl/compat/opensolaris/sys/disp.h)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ stable/10/sys/cddl/compat/opensolaris/sys/disp.h	Thu Jan 16 15:57:39 2014	(r260763, copy of r258632, head/sys/cddl/compat/opensolaris/sys/disp.h)
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2013 Andriy Gapon
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_DISP_H_
+#define	_OPENSOLARIS_SYS_DISP_H_
+
+#ifdef _KERNEL
+
+#include
+
+#define	kpreempt(x)	kern_yield(PRI_USER)
+
+#endif /* _KERNEL */
+
+#endif /* _OPENSOLARIS_SYS_DISP_H_ */
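The point of the two kpreempt() shims just added: common ZFS code can now
call kpreempt() to yield the CPU inside long loops, and the call maps to
kern_yield(PRI_USER) in the FreeBSD kernel and to sched_yield() in userland
libzpool builds. A userland-only sketch of how the shim is used (the
KPREEMPT_SYNC value below is a placeholder; the real constant comes from the
illumos headers, and the macro ignores its argument anyway):

    #include <sched.h>
    #include <stdio.h>

    /* Userland shim, as in zfs_context.h above; the argument is ignored. */
    #define KPREEMPT_SYNC   (-1)
    #define kpreempt(x)     sched_yield()

    int
    main(void)
    {
        /* A long loop can periodically offer the CPU to other threads. */
        for (int i = 0; i < 1000; i++) {
            if (i % 100 == 0)
                kpreempt(KPREEMPT_SYNC);
        }
        printf("done\n");
        return (0);
    }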
Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Thu Jan 16 15:47:09 2014	(r260762)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Thu Jan 16 15:57:39 2014	(r260763)
@@ -127,6 +127,7 @@
 #include
 #include
 #include
+#include
 #ifdef _KERNEL
 #include
 #endif
@@ -150,10 +151,6 @@ static kmutex_t		arc_reclaim_thr_lock;
 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
 static uint8_t		arc_thread_exit;
 
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
 #define	ARC_REDUCE_DNLC_PERCENT	3
 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 
@@ -162,6 +159,12 @@ typedef enum arc_reclaim_strategy {
 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
 } arc_reclaim_strategy_t;
 
+/*
+ * The number of iterations through arc_evict_*() before we
+ * drop & reacquire the lock.
+ */
+int arc_evict_iterations = 100;
+
 /* number of seconds before growing cache again */
 static int		arc_grow_retry = 60;
 
@@ -177,6 +180,11 @@ static int		arc_shrink_shift = 5;
  */
 static int		arc_min_prefetch_lifespan;
 
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
 static int arc_dead;
 
 extern int zfs_prefetch_disable;
@@ -526,6 +534,7 @@ typedef struct arc_write_callback arc_wr
 struct arc_write_callback {
 	void		*awcb_private;
 	arc_done_func_t	*awcb_ready;
+	arc_done_func_t	*awcb_physdone;
 	arc_done_func_t	*awcb_done;
 	arc_buf_t	*awcb_buf;
 };
@@ -1312,7 +1321,7 @@ arc_change_state(arc_state_t *new_state,
 	kmutex_t *lock;
 
 	ASSERT(MUTEX_HELD(hash_lock));
-	ASSERT(new_state != old_state);
+	ASSERT3P(new_state, !=, old_state);
 	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
 	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
 	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
@@ -1937,8 +1946,10 @@ arc_evict(arc_state_t *state, uint64_t s
 	kmutex_t *hash_lock;
 	boolean_t have_lock;
 	void *stolen = NULL;
+	arc_buf_hdr_t marker = { 0 };
+	int count = 0;
 	static int evict_metadata_offset, evict_data_offset;
-	int i, idx, offset, list_count, count;
+	int i, idx, offset, list_count, lists;
 
 	ASSERT(state == arc_mru || state == arc_mfu);
 
@@ -1958,7 +1969,7 @@ arc_evict(arc_state_t *state, uint64_t s
 		idx = evict_data_offset;
 	}
 	bytes_remaining = evicted_state->arcs_lsize[type];
-	count = 0;
+	lists = 0;
 
evict_start:
 	list = &list_start[idx];
@@ -1985,6 +1996,33 @@ evict_start:
 		if (recycle && ab->b_size != bytes &&
 		    ab_prev && ab_prev->b_size == bytes)
 			continue;
+
+		/* ignore markers */
+		if (ab->b_spa == 0)
+			continue;
+
+		/*
+		 * It may take a long time to evict all the bufs requested.
+		 * To avoid blocking all arc activity, periodically drop
+		 * the arcs_mtx and give other threads a chance to run
+		 * before reacquiring the lock.
+		 *
+		 * If we are looking for a buffer to recycle, we are in
+		 * the hot code path, so don't sleep.
+		 */
+		if (!recycle && count++ > arc_evict_iterations) {
+			list_insert_after(list, ab, &marker);
+			mutex_exit(evicted_lock);
+			mutex_exit(lock);
+			kpreempt(KPREEMPT_SYNC);
+			mutex_enter(lock);
+			mutex_enter(evicted_lock);
+			ab_prev = list_prev(list, &marker);
+			list_remove(list, &marker);
+			count = 0;
+			continue;
+		}
+
 		hash_lock = HDR_LOCK(ab);
 		have_lock = MUTEX_HELD(hash_lock);
 		if (have_lock || mutex_tryenter(hash_lock)) {
@@ -2051,7 +2089,7 @@ evict_start:
 			mutex_exit(evicted_lock);
 			mutex_exit(lock);
 			idx  = ((idx + 1) & (list_count - 1));
-			count++;
+			lists++;
 			goto evict_start;
 		}
 	} else {
@@ -2063,10 +2101,10 @@ evict_start:
 		mutex_exit(lock);
 		idx  = ((idx + 1) & (list_count - 1));
-		count++;
+		lists++;
 
 		if (bytes_evicted < bytes) {
-			if (count < list_count)
+			if (lists < list_count)
 				goto evict_start;
 			else
 				dprintf("only evicted %lld bytes from %x",
@@ -2084,28 +2122,14 @@ evict_start:
 	ARCSTAT_INCR(arcstat_mutex_miss, missed);
 
 	/*
-	 * We have just evicted some data into the ghost state, make
-	 * sure we also adjust the ghost state size if necessary.
+	 * Note: we have just evicted some data into the ghost state,
+	 * potentially putting the ghost size over the desired size.  Rather
+	 * that evicting from the ghost list in this hot code path, leave
+	 * this chore to the arc_reclaim_thread().
 	 */
-	if (arc_no_grow &&
-	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
-		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
-		    arc_mru_ghost->arcs_size - arc_c;
-
-		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
-			int64_t todelete =
-			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-			arc_evict_ghost(arc_mru_ghost, 0, todelete);
-		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
-			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
-			    arc_mru_ghost->arcs_size +
-			    arc_mfu_ghost->arcs_size - arc_c);
-			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
-		}
-	}
+
 	if (stolen)
 		ARCSTAT_BUMP(arcstat_stolen);
-
 	return (stolen);
 }
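The hunk above is the heart of the arc_evict() change: every
arc_evict_iterations buffers, the walker parks a marker header in the list
(markers are recognized by b_spa == 0), drops the list locks, yields, and
resumes from wherever the marker ended up. A self-contained sketch of the
same pattern on a toy doubly linked list with a pthread mutex (all names and
the batch constant are illustrative, not from the ZFS code):

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    /* Toy node: spa == 0 marks a placeholder, as arc_evict() does above. */
    struct node {
        struct node *prev, *next;
        unsigned long spa;
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    static void
    insert_after(struct node *pos, struct node *n)
    {
        n->prev = pos;
        n->next = pos->next;
        pos->next->prev = n;
        pos->next = n;
    }

    static void
    remove_node(struct node *n)
    {
        n->prev->next = n->next;
        n->next->prev = n->prev;
    }

    /* Walk tail-to-head, dropping the lock every `batch` real nodes. */
    static void
    walk_with_marker(struct node *head, int batch)
    {
        struct node marker = { .spa = 0 };
        struct node *n;
        int count = 0;

        pthread_mutex_lock(&list_lock);
        for (n = head->prev; n != head; ) {
            struct node *step = n->prev;

            if (n->spa == 0) {              /* skip markers */
                n = step;
                continue;
            }
            if (count++ > batch) {
                insert_after(n, &marker);   /* hold our place */
                pthread_mutex_unlock(&list_lock);
                sched_yield();              /* let other threads run */
                pthread_mutex_lock(&list_lock);
                n = marker.prev;    /* may differ if n was removed */
                remove_node(&marker);
                count = 0;
                continue;           /* re-examine from the marker */
            }
            /* ... examine/evict n here ... */
            n = step;
        }
        pthread_mutex_unlock(&list_lock);
    }

    int
    main(void)
    {
        struct node head = { &head, &head, 0 }, nodes[10];

        for (int i = 0; i < 10; i++) {
            nodes[i].spa = 1;
            insert_after(&head, &nodes[i]);
        }
        walk_with_marker(&head, 3);
        printf("walk complete\n");
        return (0);
    }

The marker keeps the iteration position stable while the lock is dropped:
neighbors may be removed in the meantime, but the marker's prev pointer is
always a valid resume point once the lock is reacquired.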
@@ -2122,9 +2146,10 @@ arc_evict_ghost(arc_state_t *state, uint
 	kmutex_t *hash_lock, *lock;
 	uint64_t bytes_deleted = 0;
 	uint64_t bufs_skipped = 0;
+	int count = 0;
 	static int evict_offset;
 	int list_count, idx = evict_offset;
-	int offset, count = 0;
+	int offset, lists = 0;
 
 	ASSERT(GHOST_STATE(state));
 
@@ -2142,6 +2167,8 @@ evict_start:
 	mutex_enter(lock);
 	for (ab = list_tail(list); ab; ab = ab_prev) {
 		ab_prev = list_prev(list, ab);
+		if (ab->b_type > ARC_BUFC_NUMTYPES)
+			panic("invalid ab=%p", (void *)ab);
 		if (spa && ab->b_spa != spa)
 			continue;
 
@@ -2153,6 +2180,23 @@ evict_start:
 		/* caller may be trying to modify this buffer, skip it */
 		if (MUTEX_HELD(hash_lock))
 			continue;
+
+		/*
+		 * It may take a long time to evict all the bufs requested.
+		 * To avoid blocking all arc activity, periodically drop
+		 * the arcs_mtx and give other threads a chance to run
+		 * before reacquiring the lock.
+		 */
+		if (count++ > arc_evict_iterations) {
+			list_insert_after(list, ab, &marker);
+			mutex_exit(lock);
+			kpreempt(KPREEMPT_SYNC);
+			mutex_enter(lock);
+			ab_prev = list_prev(list, &marker);
+			list_remove(list, &marker);
+			count = 0;
+			continue;
+		}
 		if (mutex_tryenter(hash_lock)) {
 			ASSERT(!HDR_IO_IN_PROGRESS(ab));
 			ASSERT(ab->b_buf == NULL);
@@ -2188,14 +2232,16 @@ evict_start:
 			mutex_enter(lock);
 			ab_prev = list_prev(list, &marker);
 			list_remove(list, &marker);
-		} else
+		} else {
 			bufs_skipped += 1;
+		}
+
 	}
 	mutex_exit(lock);
 	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
-	count++;
+	lists++;
 
-	if (count < list_count)
+	if (lists < list_count)
 		goto evict_start;
 
 	evict_offset = idx;
@@ -2203,7 +2249,7 @@ evict_start:
 	    (bytes < 0 || bytes_deleted < bytes)) {
 		list_start = &state->arcs_lists[0];
 		list_count = ARC_BUFC_NUMMETADATALISTS;
-		offset = count = 0;
+		offset = lists = 0;
 		goto evict_start;
 	}
@@ -3083,7 +3129,7 @@ arc_read_done(zio_t *zio)
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, uint32_t *arc_flags,
+    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
     const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr;
@@ -3699,6 +3745,18 @@ arc_write_ready(zio_t *zio)
 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
 }
 
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write.  See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+	arc_write_callback_t *cb = zio->io_private;
+	if (cb->awcb_physdone != NULL)
+		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
 static void
 arc_write_done(zio_t *zio)
 {
@@ -3779,8 +3837,9 @@ arc_write_done(zio_t *zio)
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
     boolean_t l2arc, boolean_t l2arc_compress,
-    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, const zbookmark_t *zb)
+    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
+    arc_done_func_t *done, void *private, zio_priority_t priority,
+    int zio_flags, const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
@@ -3797,18 +3856,20 @@ arc_write(zio_t *pio, spa_t *spa, uint64
 		hdr->b_flags |= ARC_L2COMPRESS;
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
+	callback->awcb_physdone = physdone;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
 	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
-	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
+	    priority, zio_flags, zb);
 
 	return (zio);
 }
 
 static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
 	uint64_t available_memory =
@@ -3822,7 +3883,9 @@ arc_memory_throttle(uint64_t reserve, ui
 	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
 #endif
 #endif	/* sun */
-	if (available_memory >= zfs_write_limit_max)
+
+	if (cnt.v_free_count + cnt.v_cache_count >
+	    (uint64_t)physmem * arc_lotsfree_percent / 100)
 		return (0);
 
 	if (txg > last_txg) {
@@ -3846,20 +3909,6 @@ arc_memory_throttle(uint64_t reserve, ui
 		return (SET_ERROR(EAGAIN));
 	}
 	page_load = 0;
-
-	if (arc_size > arc_c_min) {
-		uint64_t evictable_memory =
-		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
-		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
-		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
-		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
-		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
-	}
-
-	if (inflight_data > available_memory / 4) {
-		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
-		return (SET_ERROR(ERESTART));
-	}
 #endif
 	return (0);
 }
@@ -3877,15 +3926,6 @@ arc_tempreserve_space(uint64_t reserve,
 	int error;
 	uint64_t anon_size;
 
-#ifdef ZFS_DEBUG
-	/*
-	 * Once in a while, fail for no reason.  Everything should cope.
-	 */
-	if (spa_get_random(10000) == 0) {
-		dprintf("forcing random failure\n");
-		return (SET_ERROR(ERESTART));
-	}
-#endif
 	if (reserve > arc_c/4 && !arc_no_grow)
 		arc_c = MIN(arc_c_max, reserve * 4);
 	if (reserve > arc_c)
@@ -3903,7 +3943,8 @@ arc_tempreserve_space(uint64_t reserve,
 	 * in order to compress/encrypt/etc the data.  We therefore need to
 	 * make sure that there is sufficient available memory for this.
 	 */
-	if (error = arc_memory_throttle(reserve, anon_size, txg))
+	error = arc_memory_throttle(reserve, txg);
+	if (error != 0)
 		return (error);
 
 	/*
@@ -4094,11 +4135,20 @@ arc_init(void)
 	arc_dead = FALSE;
 	arc_warm = B_FALSE;
 
-	if (zfs_write_limit_max == 0)
-		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
-	else
-		zfs_write_limit_shift = 0;
-	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+	/*
+	 * Calculate maximum amount of dirty data per pool.
+	 *
+	 * If it has been set by /etc/system, take that.
+	 * Otherwise, use a percentage of physical memory defined by
+	 * zfs_dirty_data_max_percent (default 10%) with a cap at
+	 * zfs_dirty_data_max_max (default 4GB).
+	 */
+	if (zfs_dirty_data_max == 0) {
+		zfs_dirty_data_max = ptob(physmem) *
+		    zfs_dirty_data_max_percent / 100;
+		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+		    zfs_dirty_data_max_max);
+	}
 
 #ifdef _KERNEL
 	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
@@ -4177,8 +4227,6 @@ arc_fini(void)
 		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
 	}
 
-	mutex_destroy(&zfs_write_limit_lock);
-
 	buf_fini();
 
 	ASSERT(arc_loaned_bytes == 0);
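The arc_init() hunk above replaces the old write-limit sizing with the new
per-pool dirty data cap. A sketch of the same sizing rule in isolation (the
tunable values mirror the defaults named in the diff's comment; note that
arc_init() works from ptob(physmem), i.e. pages converted to bytes, while
this sketch takes bytes directly):

    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b)   ((a) < (b) ? (a) : (b))

    /* Tunables named in the comment above; values mirror the defaults. */
    static uint64_t zfs_dirty_data_max;                     /* 0 = auto-size */
    static uint64_t zfs_dirty_data_max_percent = 10;        /* % of physmem */
    static uint64_t zfs_dirty_data_max_max = 4ULL << 30;    /* 4GB cap */

    /* Auto-size the dirty data limit unless the administrator set it. */
    static void
    dirty_data_max_init(uint64_t physmem_bytes)
    {
        if (zfs_dirty_data_max == 0) {
            zfs_dirty_data_max = physmem_bytes *
                zfs_dirty_data_max_percent / 100;
            zfs_dirty_data_max = MIN(zfs_dirty_data_max,
                zfs_dirty_data_max_max);
        }
    }

    int
    main(void)
    {
        dirty_data_max_init(16ULL << 30);   /* 16GB RAM -> 1.6GB limit */
        printf("zfs_dirty_data_max = %llu bytes\n",
            (unsigned long long)zfs_dirty_data_max);
        return (0);
    }

On a machine with more than 40GB of RAM the 10% figure would exceed the 4GB
cap, and the cap wins.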
Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Thu Jan 16 15:47:09 2014	(r260762)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Thu Jan 16 15:57:39 2014	(r260763)
@@ -842,7 +842,7 @@ dbuf_free_range(dnode_t *dn, uint64_t st
 		atomic_inc_64(&zfs_free_range_recv_miss);
 	}
 
-	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+	for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
@@ -1188,6 +1188,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
+	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+		dr->dr_accounted = db->db.db_size;
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	dr->dr_next = *drp;
@@ -1271,7 +1273,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t
 		dbuf_rele(parent, FTAG);
 
 		mutex_enter(&db->db_mtx);
-		/* possible race with dbuf_undirty() */
+		/*
+		 * Since we've dropped the mutex, it's possible that
+		 * dbuf_undirty() might have changed this out from under us.
+		 */
 		if (db->db_last_dirty == dr ||
 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
 			mutex_enter(&di->dt.di.dr_mtx);
@@ -1333,7 +1338,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 
 	ASSERT(db->db.db_size != 0);
 
-	/* XXX would be nice to fix up dn_towrite_space[] */
+	/*
+	 * Any space we accounted for in dp_dirty_* will be cleaned up by
+	 * dsl_pool_sync().  This is relatively rare so the discrepancy
+	 * is not a big deal.
+	 */
 
 	*drp = dr->dr_next;
 
@@ -1513,7 +1522,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, a
 
 /*
  * "Clear" the contents of this dbuf.  This will mark the dbuf
- * EVICTING and clear *most* of its references.  Unfortunetely,
+ * EVICTING and clear *most* of its references.  Unfortunately,
  * when we are not holding the dn_dbufs_mtx, we can't clear the
  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
@@ -1700,7 +1709,7 @@ dbuf_create(dnode_t *dn, uint8_t level,
 		db->db.db_offset = 0;
 	} else {
 		int blocksize =
-		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
+		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
 		db->db.db_size = blocksize;
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
@@ -1809,7 +1818,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 }
 
 void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
 {
 	dmu_buf_impl_t *db = NULL;
 	blkptr_t *bp = NULL;
@@ -1833,8 +1842,6 @@ dbuf_prefetch(dnode_t *dn, uint64_t blki
 
 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
 		if (bp && !BP_IS_HOLE(bp)) {
-			int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
-			    ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 			zbookmark_t zb;
@@ -1843,7 +1850,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blki
 			    dn->dn_object, 0, blkid);
 
 			(void) arc_read(NULL, dn->dn_objset->os_spa,
-			    bp, NULL, NULL, priority,
+			    bp, NULL, NULL, prio,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zb);
 		}
@@ -2528,6 +2535,38 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *
 	mutex_exit(&db->db_mtx);
 }
 
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times).  This
+ * allows the DMU to monitor the progress of each logical i/o.  For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block.  There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	dmu_buf_impl_t *db = arg;
+	objset_t *os = db->db_objset;
+	dsl_pool_t *dp = dmu_objset_pool(os);
+	dbuf_dirty_record_t *dr;
+	int delta = 0;
+
+	dr = db->db_data_pending;
+	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+	/*
+	 * The callback will be called io_phys_children times.  Retire one
+	 * portion of our dirty space each time we are called.  Any rounding
+	 * error will be cleaned up by dsl_pool_sync()'s call to
+	 * dsl_pool_undirty_space().
+	 */
+	delta = dr->dr_accounted / zio->io_phys_children;
+	dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
 /* ARGSUSED */
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
@@ -2622,6 +2661,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
+
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 }
@@ -2740,8 +2780,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 		ASSERT(db->db_state != DB_NOFILL);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
-		    dbuf_write_override_ready, dbuf_write_override_done, dr,
-		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
+		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
@@ -2752,7 +2792,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    db->db_blkptr, NULL, db->db.db_size, &zp,
-		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
@@ -2760,7 +2800,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
 		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
 		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
-		    dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-		    ZIO_FLAG_MUSTSUCCEED, &zb);
+		    dbuf_write_physdone, dbuf_write_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }
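To see the dbuf_write_physdone() accounting in numbers: a dirty record that
charged dr_accounted bytes is retired in io_phys_children equal slices, one
per physical child i/o, and the integer-division remainder is left for
dsl_pool_sync() to clean up at the end of the txg. A small worked example
(values are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t dr_accounted = 131072;     /* one dirty 128K block */
        int io_phys_children = 3;           /* e.g. copies=3 */
        uint64_t undirtied = 0;

        /* One callback per physical child i/o, as described above. */
        for (int i = 0; i < io_phys_children; i++)
            undirtied += dr_accounted / io_phys_children;

        printf("retired %llu of %llu bytes; %llu left for dsl_pool_sync()\n",
            (unsigned long long)undirtied,
            (unsigned long long)dr_accounted,
            (unsigned long long)(dr_accounted - undirtied));
        return (0);
    }

Here 131072 / 3 = 43690 bytes are retired per child, 131070 in total, and
the 2-byte rounding remainder is cleaned up in syncing context.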
Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Thu Jan 16 15:47:09 2014	(r260762)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Thu Jan 16 15:57:39 2014	(r260763)
@@ -374,13 +374,11 @@ static int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
-	dsl_pool_t *dp = NULL;
 	dmu_buf_t **dbp;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio;
-	hrtime_t start;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
@@ -408,9 +406,6 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn,
 	}
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
-	if (dn->dn_objset->os_dsl_dataset)
-		dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
-	start = gethrtime();
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, offset);
 	for (i = 0; i < nblks; i++) {
@@ -434,9 +429,6 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn,
 
 	/* wait for async i/o */
 	err = zio_wait(zio);
-	/* track read overhead when we are in sync context */
-	if (dp && dsl_pool_sync_context(dp))
-		dp->dp_read_overhead += gethrtime() - start;
 	if (err) {
 		dmu_buf_rele_array(dbp, nblks, tag);
 		return (err);
@@ -518,12 +510,22 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake,
 	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 }
 
+/*
+ * Issue prefetch i/os for the given blocks.
+ *
+ * Note: The assumption is that we *know* these blocks will be needed
+ * almost immediately.  Therefore, the prefetch i/os will be issued at
+ * ZIO_PRIORITY_SYNC_READ
+ *
+ * Note: indirect blocks and other metadata will be read synchronously,
+ * causing this function to block if they are not already cached.
+ */
 void
 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 {
 	dnode_t *dn;
 	uint64_t blkid;
-	int nblks, i, err;
+	int nblks, err;
 
 	if (zfs_prefetch_disable)
 		return;
@@ -536,7 +538,7 @@ dmu_prefetch(objset_t *os, uint64_t obje
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
-		dbuf_prefetch(dn, blkid);
+		dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
 		rw_exit(&dn->dn_struct_rwlock);
 		return;
 	}
@@ -553,16 +555,16 @@ dmu_prefetch(objset_t *os, uint64_t obje
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		int blkshift = dn->dn_datablkshift;
-		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
-		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+		nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
+		    P2ALIGN(offset, 1 << blkshift)) >> blkshift;
 	} else {
 		nblks = (offset < dn->dn_datablksz);
 	}
 
 	if (nblks != 0) {
 		blkid = dbuf_whichblock(dn, offset);
-		for (i = 0; i < nblks; i++)
-			dbuf_prefetch(dn, blkid+i);
+		for (int i = 0; i < nblks; i++)
+			dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
@@ -1376,7 +1378,7 @@ dmu_sync_late_arrival(zio_t *pio, objset
 
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
-	    dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
+	    dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
@@ -1516,8 +1518,9 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 
 	zio_nowait(arc_write(pio, os->os_spa, txg,
 	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
-	    DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
-	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+	    DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
+	    NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL, &zb));
 
 	return (0);
 }
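The dmu_prefetch() block-count computation above relies on the power-of-two
helpers from sysmacros.h. A standalone sketch of the same computation, with
the macro definitions copied in so it compiles on its own (the offset, len,
and blkshift values are made up for illustration):

    #include <stdint.h>
    #include <stdio.h>

    /* Power-of-2 helpers in the same shape as the sysmacros.h ones. */
    #define P2ALIGN(x, align)   ((x) & -(align))
    #define P2ROUNDUP(x, align) (-(-(x) & -(align)))

    int
    main(void)
    {
        uint64_t offset = 100000, len = 300000;
        int blkshift = 17;                  /* 128K data blocks */

        /* Same computation as dmu_prefetch() above. */
        uint64_t nblks = (P2ROUNDUP(offset + len, 1ULL << blkshift) -
            P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;

        printf("prefetch %llu block(s)\n", (unsigned long long)nblks);
        return (0);
    }

The range [100000, 400000) touches file blocks 0 through 3 at a 128K block
size, so the sketch prints 4: the end is rounded up to a block boundary, the
start rounded down, and the difference converted to a block count.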
Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Thu Jan 16 15:47:09 2014	(r260762)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Thu Jan 16 15:57:39 2014	(r260763)
@@ -1028,7 +1028,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
 	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
 	    DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
-	    dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
+	    NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
 	    ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Thu Jan 16 15:47:09 2014	(r260762)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Thu Jan 16 15:57:39 2014	(r260763)
@@ -54,6 +54,7 @@ dmu_tx_create_dd(dsl_dir_t *dd)
 	    offsetof(dmu_tx_hold_t, txh_node));
 	list_create(&tx->tx_callbacks,
 	    sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node));
+	tx->tx_start = gethrtime();
 #ifdef ZFS_DEBUG
 	refcount_create(&tx->tx_space_written);
 	refcount_create(&tx->tx_space_freed);
@@ -597,13 +598,13 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t
 	if (txh == NULL)
 		return;
 	dn = txh->txh_dnode;
+	dmu_tx_count_dnode(txh);
 
 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 		return;
 	if (len == DMU_OBJECT_END)
 		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 
-	dmu_tx_count_dnode(txh);
 
 	/*
 	 * For i/o error checking, we read the first and last level-0
@@ -918,6 +919,161 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i
 }
 #endif
 
+/*
+ * If we can't do 10 iops, something is wrong.  Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting.  This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time.  This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ *     min_time = scale * (dirty - min) / (max - dirty)
+ *     min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent.  This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate.  The scale of the curve is defined by zfs_delay_scale.  Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ *  10ms +-------------------------------------------------------------*+
+ *       |                                                             *|
+ *   9ms +                                                             *+
+ *       |                                                             *|
+ *   8ms +                                                             *+
+ *       |                                                            * |
+ *   7ms +                                                            * +
+ *       |                                                            * |
+ *   6ms +                                                            * +
+ *       |                                                            * |
+ *   5ms +                                                           *  +
+ *       |                                                           *  |
+ *   4ms +                                                           *  +
+ *       |                                                           *  |
+ *   3ms +                                                          *   +
+ *       |                                                          *   |
+ *   2ms +                                              (midpoint) *    +
+ *       |                                                  |    **     |
+ *   1ms +                                                  v ***       +
+ *       |             zfs_delay_scale ---------->     ********         |
+ *     0 +-------------------------------------*********----------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS.  The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                             *+
+ *  10ms +                                                             *+
+ *       +                                                           ** +
+ *       |                                              (midpoint)  **  |
+ *       +                                                  |     **    +
+ *   1ms +                                                  v ****      +
+ *       +             zfs_delay_scale ---------->        *****         +
+ *       |                                             ****             |
+ *       +                                          ****                +
+ * 100us +                                        **                    +
+ *       +                                       *                      +
+ *       |                                      *                       |
+ *       +                                     *                        +
+ *  10us +                                     *                        +
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                              +
+ *       +--------------------------------------------------------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly.  The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+	dsl_pool_t *dp = tx->tx_pool;
+	uint64_t delay_min_bytes =
+	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	hrtime_t wakeup, min_tx_time, now;
+
+	if (dirty <= delay_min_bytes)
+		return;
+
+	/*
+	 * The caller has already waited until we are under the max.
+	 * We make them pass us the amount of dirty data so we don't
+	 * have to handle the case of it being >= the max, which could
+	 * cause a divide-by-zero if it's == the max.
+	 */
+	ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+	now = gethrtime();
+	min_tx_time = zfs_delay_scale *
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	if (now > tx->tx_start + min_tx_time)
+		return;
+
+	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+
+	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+	    uint64_t, min_tx_time);
+
+	mutex_enter(&dp->dp_lock);
+	wakeup = MAX(tx->tx_start + min_tx_time,
+	    dp->dp_last_wakeup + min_tx_time);
+	dp->dp_last_wakeup = wakeup;
+	mutex_exit(&dp->dp_lock);
+
+#ifdef _KERNEL
+#ifdef illumos
+	mutex_enter(&curthread->t_delay_lock);
+	while (cv_timedwait_hires(&curthread->t_delay_cv,
+	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
+	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
+		continue;
+	mutex_exit(&curthread->t_delay_lock);
+#else
+	pause_sbt("dmu_tx_delay", wakeup * SBT_1NS,
+	    zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE);
+#endif
+#else
+	hrtime_t delta = wakeup - gethrtime();
+	struct timespec ts;
+	ts.tv_sec = delta / NANOSEC;
+	ts.tv_nsec = delta % NANOSEC;
+	(void) nanosleep(&ts, NULL);
+#endif
+}
+
 static int
 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
@@ -948,6 +1104,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_
 		return (SET_ERROR(ERESTART));
 	}
 
+	if (!tx->tx_waited &&
+	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
+		tx->tx_wait_dirty = B_TRUE;
+		return (SET_ERROR(ERESTART));
+	}
+

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
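Although the diff output is truncated, the delay curve added to dmu_tx.c is
fully described by the comment above: min_time = zfs_delay_scale * (dirty -
min) / (max - dirty), capped at zfs_delay_max_ns. A sketch that tabulates
that curve (the tunable values below are assumptions based on the comment's
stated defaults, not taken from the truncated part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t hrtime_t;

    #define MSEC2NSEC(m)    ((hrtime_t)(m) * 1000000)
    #define MIN(a, b)       ((a) < (b) ? (a) : (b))

    /* Assumed tunables; the real ones live in dsl_pool.c / dmu_tx.c. */
    static uint64_t zfs_dirty_data_max = 1ULL << 30;        /* say, 1GB */
    static uint64_t zfs_delay_min_dirty_percent = 60;
    static hrtime_t zfs_delay_scale = 500000;       /* ns; 500us midpoint */
    static hrtime_t zfs_delay_max_ns_ = 0;          /* set in main() */

    /* min_time = scale * (dirty - min) / (max - dirty), then capped. */
    static hrtime_t
    tx_min_time(uint64_t dirty)
    {
        uint64_t delay_min_bytes =
            zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

        if (dirty <= delay_min_bytes)
            return (0);
        return (MIN(zfs_delay_scale * (hrtime_t)(dirty - delay_min_bytes) /
            (hrtime_t)(zfs_dirty_data_max - dirty), zfs_delay_max_ns_));
    }

    int
    main(void)
    {
        zfs_delay_max_ns_ = MSEC2NSEC(100);

        for (int pct = 55; pct <= 95; pct += 10)
            printf("%d%% dirty -> %lld ns of delay\n", pct,
                (long long)tx_min_time(zfs_dirty_data_max * pct / 100));
        return (0);
    }

With these numbers the delay is zero below 60% dirty, equals zfs_delay_scale
(500us, i.e. roughly 2000 IOPS, matching the comment) at the 80% midpoint of
the delay range, and climbs steeply toward the cap as dirty data approaches
zfs_dirty_data_max.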