Date:      Tue, 8 Mar 2016 16:11:59 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r296505 - vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/cmd/zinject
Message-ID:  <201603081611.u28GBxl2029770@repo.freebsd.org>

Author: mav
Date: Tue Mar  8 16:11:59 2016
New Revision: 296505
URL: https://svnweb.freebsd.org/changeset/base/296505

Log:
  6531 Provide mechanism to artificially limit disk performance
  
  Reviewed by: Paul Dagnelie <pcd@delphix.com>
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Reviewed by: George Wilson <george.wilson@delphix.com>
  Approved by: Dan McDonald <danmcd@omniti.com>
  Author: Prakash Surya <prakash.surya@delphix.com>
  
  illumos/illumos-gate@97e81309571898df9fdd94aab1216dfcf23e060b

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio_inject.c

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/zinject/zinject.c
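
Before the per-file diffs, a short note on the mechanism: previously a registered ZINJECT_DELAY_IO handler simply blocked the completing thread in vdev_queue_io_done() for a whole number of seconds (see the vdev_queue.c hunk below, which removes that call). With this change each handler carries a set of "lanes"; every lane remembers when it next becomes idle, and a delayed zio's completion interrupt is scheduled for a computed target time instead of stalling the thread. The companion zinject.c change listed above adds the user-facing knob; it appears to take a latency and a lane count (roughly "-D latency:lanes"), though that syntax should be checked against the zinject source. The target-time rule itself is small enough to state as a C fragment (illustrative only; the names below are not part of the patch):

#include <stdint.h>

typedef int64_t hrtime_t;	/* nanoseconds, as returned by gethrtime() */

/*
 * Illustrative only: when should an I/O routed to a given lane complete?
 * "delay_ns" plays the role of zi_record.zi_timer (added directly to
 * gethrtime() values in the patch) and "lane_idle_ns" the role of the
 * handler's zi_lanes[] entry for the chosen lane.
 */
static hrtime_t
lane_target(hrtime_t now, hrtime_t lane_idle_ns, hrtime_t delay_ns)
{
	hrtime_t idle = now + delay_ns;		/* lane is free right now */
	hrtime_t busy = lane_idle_ns + delay_ns;	/* queue behind prior I/O */

	return (idle > busy ? idle : busy);	/* MAX(idle, busy) */
}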

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h	Tue Mar  8 15:55:43 2016	(r296504)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h	Tue Mar  8 16:11:59 2016	(r296505)
@@ -292,6 +292,7 @@ typedef struct zinject_record {
 	uint32_t	zi_iotype;
 	int32_t		zi_duration;
 	uint64_t	zi_timer;
+	uint64_t	zi_nlanes;
 	uint32_t	zi_cmd;
 	uint32_t	zi_pad;
 } zinject_record_t;

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h	Tue Mar  8 15:55:43 2016	(r296504)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h	Tue Mar  8 16:11:59 2016	(r296505)
@@ -419,6 +419,7 @@ struct zio {
 
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;
+	hrtime_t	io_target_timestamp;
 	avl_node_t	io_queue_node;
 	avl_node_t	io_offset_node;
 
@@ -506,6 +507,8 @@ extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
 extern void zio_execute(zio_t *zio);
 extern void zio_interrupt(zio_t *zio);
+extern void zio_delay_init(zio_t *zio);
+extern void zio_delay_interrupt(zio_t *zio);
 
 extern zio_t *zio_walk_parents(zio_t *cio);
 extern zio_t *zio_walk_children(zio_t *pio);
@@ -567,7 +570,7 @@ extern int zio_handle_fault_injection(zi
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
-extern uint64_t zio_handle_io_delay(zio_t *zio);
+extern hrtime_t zio_handle_io_delay(zio_t *zio);
 
 /*
  * Checksum ereport functions

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c	Tue Mar  8 15:55:43 2016	(r296504)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c	Tue Mar  8 16:11:59 2016	(r296505)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
  */
@@ -691,7 +691,7 @@ vdev_disk_io_intr(buf_t *bp)
 
 	kmem_free(vb, sizeof (vdev_buf_t));
 
-	zio_interrupt(zio);
+	zio_delay_interrupt(zio);
 }
 
 static void
@@ -797,6 +797,7 @@ vdev_disk_io_start(zio_t *zio)
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	zio->io_target_timestamp = zio_handle_io_delay(zio);
 
 	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c	Tue Mar  8 15:55:43 2016	(r296504)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c	Tue Mar  8 16:11:59 2016	(r296505)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -158,7 +158,7 @@ vdev_file_io_intr(buf_t *bp)
 		zio->io_error = SET_ERROR(ENOSPC);
 
 	kmem_free(vb, sizeof (vdev_buf_t));
-	zio_interrupt(zio);
+	zio_delay_interrupt(zio);
 }
 
 static void
@@ -212,6 +212,7 @@ vdev_file_io_start(zio_t *zio)
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	zio->io_target_timestamp = zio_handle_io_delay(zio);
 
 	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c	Tue Mar  8 15:55:43 2016	(r296504)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c	Tue Mar  8 16:11:59 2016	(r296505)
@@ -729,9 +729,6 @@ vdev_queue_io_done(zio_t *zio)
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 	zio_t *nio;
 
-	if (zio_injection_enabled)
-		delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
-
 	mutex_enter(&vq->vq_lock);
 
 	vdev_queue_pending_remove(vq, zio);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c	Tue Mar  8 15:55:43 2016	(r296504)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c	Tue Mar  8 16:11:59 2016	(r296505)
@@ -1352,6 +1352,58 @@ zio_interrupt(zio_t *zio)
 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
+void
+zio_delay_interrupt(zio_t *zio)
+{
+	/*
+	 * The timeout_generic() function isn't defined in userspace, so
+	 * rather than trying to implement the function, the zio delay
+	 * functionality has been disabled for userspace builds.
+	 */
+
+#ifdef _KERNEL
+	/*
+	 * If io_target_timestamp is zero, then no delay has been registered
+	 * for this IO, thus jump to the end of this function and "skip" the
+	 * delay; issuing it directly to the zio layer.
+	 */
+	if (zio->io_target_timestamp != 0) {
+		hrtime_t now = gethrtime();
+
+		if (now >= zio->io_target_timestamp) {
+			/*
+			 * This IO has already taken longer than the target
+			 * delay to complete, so we don't want to delay it
+			 * any longer; we "miss" the delay and issue it
+			 * directly to the zio layer. This is likely due to
+			 * the target latency being set to a value less than
+			 * the underlying hardware can satisfy (e.g. delay
+			 * set to 1ms, but the disks take 10ms to complete an
+			 * IO request).
+			 */
+
+			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
+			    hrtime_t, now);
+
+			zio_interrupt(zio);
+		} else {
+			hrtime_t diff = zio->io_target_timestamp - now;
+
+			DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
+			    hrtime_t, now, hrtime_t, diff);
+
+			(void) timeout_generic(CALLOUT_NORMAL,
+			    (void (*)(void *))zio_interrupt, zio, diff, 1, 0);
+		}
+
+		return;
+	}
+#endif
+
+	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
+	zio_interrupt(zio);
+}
+
 /*
  * Execute the I/O pipeline until one of the following occurs:
  *
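
The userspace caveat at the top of zio_delay_interrupt() is why its body sits under #ifdef _KERNEL: timeout_generic() is a kernel interface with no userland counterpart here. As a rough userland analogue of the same decision logic, with hypothetical now_ns(), complete_io() and schedule_after() helpers standing in for gethrtime(), zio_interrupt() and timeout_generic() (a sketch only, not part of the patch):

#include <stdint.h>

typedef int64_t hrtime_t;	/* nanoseconds */

/* Hypothetical stand-ins for the kernel interfaces used above. */
extern hrtime_t now_ns(void);				/* ~ gethrtime()      */
extern void complete_io(void *io);			/* ~ zio_interrupt()  */
extern void schedule_after(void (*fn)(void *), void *arg,
    hrtime_t delta_ns);					/* ~ timeout_generic() */

static void
delay_interrupt(void *io, hrtime_t target_ns)
{
	if (target_ns == 0) {
		complete_io(io);	/* no delay registered: "skip" */
		return;
	}

	hrtime_t now = now_ns();

	if (now >= target_ns)
		complete_io(io);	/* already past the target: "miss" */
	else
		schedule_after(complete_io, io, target_ns - now);	/* "hit" */
}

The three branches correspond to the zio__delay__skip, zio__delay__miss and zio__delay__hit DTrace probes in the function above.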

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zio_inject.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zio_inject.c	Tue Mar  8 15:55:43 2016	(r296504)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zio_inject.c	Tue Mar  8 16:11:59 2016	(r296505)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 /*
@@ -49,15 +49,53 @@
 
 uint32_t zio_injection_enabled;
 
+/*
+ * Data describing each zinject handler registered on the system, and
+ * contains the list node linking the handler in the global zinject
+ * handler list.
+ */
 typedef struct inject_handler {
 	int			zi_id;
 	spa_t			*zi_spa;
 	zinject_record_t	zi_record;
+	uint64_t		*zi_lanes;
+	int			zi_next_lane;
 	list_node_t		zi_link;
 } inject_handler_t;
 
+/*
+ * List of all zinject handlers registered on the system, protected by
+ * the inject_lock defined below.
+ */
 static list_t inject_handlers;
+
+/*
+ * This protects insertion into, and traversal of, the inject handler
+ * list defined above; as well as the inject_delay_count. Any time a
+ * handler is inserted or removed from the list, this lock should be
+ * taken as a RW_WRITER; and any time traversal is done over the list
+ * (without modification to it) this lock should be taken as a RW_READER.
+ */
 static krwlock_t inject_lock;
+
+/*
+ * This holds the number of zinject delay handlers that have been
+ * registered on the system. It is protected by the inject_lock defined
+ * above. Thus modifications to this count must be a RW_WRITER of the
+ * inject_lock, and reads of this count must be (at least) a RW_READER
+ * of the lock.
+ */
+static int inject_delay_count = 0;
+
+/*
+ * This lock is used only in zio_handle_io_delay(), refer to the comment
+ * in that function for more details.
+ */
+static kmutex_t inject_delay_mtx;
+
+/*
+ * Used to assign unique identifying numbers to each new zinject handler.
+ */
 static int inject_next_id = 1;
 
 /*
@@ -360,32 +398,164 @@ spa_handle_ignored_writes(spa_t *spa)
 	rw_exit(&inject_lock);
 }
 
-uint64_t
+hrtime_t
 zio_handle_io_delay(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
-	inject_handler_t *handler;
-	uint64_t seconds = 0;
-
-	if (zio_injection_enabled == 0)
-		return (0);
+	inject_handler_t *min_handler = NULL;
+	hrtime_t min_target = 0;
 
 	rw_enter(&inject_lock, RW_READER);
 
-	for (handler = list_head(&inject_handlers); handler != NULL;
-	    handler = list_next(&inject_handlers, handler)) {
+	/*
+	 * inject_delay_count is a subset of zio_injection_enabled that
+	 * is only incremented for delay handlers. These checks are
+	 * mainly added to remind the reader why we're not explicitly
+	 * checking zio_injection_enabled like the other functions.
+	 */
+	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
+	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
+
+	/*
+	 * If there aren't any inject delay handlers registered, then we
+	 * can short circuit and simply return 0 here. A value of zero
+	 * informs zio_delay_interrupt() that this request should not be
+	 * delayed. This short circuit keeps us from acquiring the
+	 * inject_delay_mutex unnecessarily.
+	 */
+	if (inject_delay_count == 0) {
+		rw_exit(&inject_lock);
+		return (0);
+	}
+
+	/*
+	 * Each inject handler has a number of "lanes" associated with
+	 * it. Each lane is able to handle requests independently of one
+	 * another, and at a latency defined by the inject handler
+	 * record's zi_timer field. Thus if a handler is configured with
+	 * a single lane with a 10ms latency, it will delay requests
+	 * such that only a single request is completed every 10ms. So,
+	 * if more than one request is attempted per each 10ms interval,
+	 * the average latency of the requests will be greater than
+	 * 10ms; but if only a single request is submitted each 10ms
+	 * interval the average latency will be 10ms.
+	 *
+	 * We need to acquire this mutex to prevent multiple concurrent
+	 * threads being assigned to the same lane of a given inject
+	 * handler. The mutex allows us to perform the following two
+	 * operations atomically:
+	 *
+	 *	1. determine the minimum handler and minimum target
+	 *	   value of all the possible handlers
+	 *	2. update that minimum handler's lane array
+	 *
+	 * Without atomicity, two (or more) threads could pick the same
+	 * lane in step (1), and then conflict with each other in step
+	 * (2). This could allow a single lane handler to process
+	 * multiple requests simultaneously, which shouldn't be possible.
+	 */
+	mutex_enter(&inject_delay_mtx);
 
+	for (inject_handler_t *handler = list_head(&inject_handlers);
+	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
 		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
 			continue;
 
-		if (vd->vdev_guid == handler->zi_record.zi_guid) {
-			seconds = handler->zi_record.zi_timer;
-			break;
+		if (vd->vdev_guid != handler->zi_record.zi_guid)
+			continue;
+
+		/*
+		 * Defensive; should never happen as the array allocation
+		 * occurs prior to inserting this handler on the list.
+		 */
+		ASSERT3P(handler->zi_lanes, !=, NULL);
+
+		/*
+		 * This should never happen, the zinject command should
+		 * prevent a user from setting an IO delay with zero lanes.
+		 */
+		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+		ASSERT3U(handler->zi_record.zi_nlanes, >,
+		    handler->zi_next_lane);
+
+		/*
+		 * We want to issue this IO to the lane that will become
+		 * idle the soonest, so we compare the soonest this
+		 * specific handler can complete the IO with all other
+		 * handlers, to find the lowest value of all possible
+		 * lanes. We then use this lane to submit the request.
+		 *
+		 * Since each handler has a constant value for its
+		 * delay, we can just use the "next" lane for that
+		 * handler; as it will always be the lane with the
+		 * lowest value for that particular handler (i.e. the
+		 * lane that will become idle the soonest). This saves a
+		 * scan of each handler's lanes array.
+		 *
+		 * There's two cases to consider when determining when
+		 * this specific IO request should complete. If this
+		 * lane is idle, we want to "submit" the request now so
+		 * it will complete after zi_timer milliseconds. Thus,
+		 * we set the target to now + zi_timer.
+		 *
+		 * If the lane is busy, we want this request to complete
+		 * zi_timer milliseconds after the lane becomes idle.
+		 * Since the 'zi_lanes' array holds the time at which
+		 * each lane will become idle, we use that value to
+		 * determine when this request should complete.
+		 */
+		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
+		hrtime_t busy = handler->zi_record.zi_timer +
+		    handler->zi_lanes[handler->zi_next_lane];
+		hrtime_t target = MAX(idle, busy);
+
+		if (min_handler == NULL) {
+			min_handler = handler;
+			min_target = target;
+			continue;
 		}
 
+		ASSERT3P(min_handler, !=, NULL);
+		ASSERT3U(min_target, !=, 0);
+
+		/*
+		 * We don't yet increment the "next lane" variable since
+		 * we still might find a lower value lane in another
+		 * handler during any remaining iterations. Once we're
+		 * sure we've selected the absolute minimum, we'll claim
+		 * the lane and increment the handler's "next lane"
+		 * field below.
+		 */
+
+		if (target < min_target) {
+			min_handler = handler;
+			min_target = target;
+		}
 	}
+
+	/*
+	 * 'min_handler' will be NULL if no IO delays are registered for
+	 * this vdev, otherwise it will point to the handler containing
+	 * the lane that will become idle the soonest.
+	 */
+	if (min_handler != NULL) {
+		ASSERT3U(min_target, !=, 0);
+		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
+
+		/*
+		 * If we've used all possible lanes for this handler,
+		 * loop back and start using the first lane again;
+		 * otherwise, just increment the lane index.
+		 */
+		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
+		    min_handler->zi_record.zi_nlanes;
+	}
+
+	mutex_exit(&inject_delay_mtx);
 	rw_exit(&inject_lock);
-	return (seconds);
+
+	return (min_target);
 }
 
 /*
@@ -409,6 +579,24 @@ zio_inject_fault(char *name, int flags, 
 		if ((error = spa_reset(name)) != 0)
 			return (error);
 
+	if (record->zi_cmd == ZINJECT_DELAY_IO) {
+		/*
+		 * A value of zero for the number of lanes or for the
+		 * delay time doesn't make sense.
+		 */
+		if (record->zi_timer == 0 || record->zi_nlanes == 0)
+			return (SET_ERROR(EINVAL));
+
+		/*
+		 * The number of lanes is directly mapped to the size of
+		 * an array used by the handler. Thus, to ensure the
+		 * user doesn't trigger an allocation that's "too large"
+		 * we cap the number of lanes here.
+		 */
+		if (record->zi_nlanes >= UINT16_MAX)
+			return (SET_ERROR(EINVAL));
+	}
+
 	if (!(flags & ZINJECT_NULL)) {
 		/*
 		 * spa_inject_ref() will add an injection reference, which will
@@ -420,11 +608,34 @@ zio_inject_fault(char *name, int flags, 
 
 		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
 
+		handler->zi_spa = spa;
+		handler->zi_record = *record;
+
+		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+			handler->zi_lanes = kmem_zalloc(
+			    sizeof (*handler->zi_lanes) *
+			    handler->zi_record.zi_nlanes, KM_SLEEP);
+			handler->zi_next_lane = 0;
+		} else {
+			handler->zi_lanes = NULL;
+			handler->zi_next_lane = 0;
+		}
+
 		rw_enter(&inject_lock, RW_WRITER);
 
+		/*
+		 * We can't move this increment into the conditional
+		 * above because we need to hold the RW_WRITER lock of
+		 * inject_lock, and we don't want to hold that while
+		 * allocating the handler's zi_lanes array.
+		 */
+		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+			ASSERT3S(inject_delay_count, >=, 0);
+			inject_delay_count++;
+			ASSERT3S(inject_delay_count, >, 0);
+		}
+
 		*id = handler->zi_id = inject_next_id++;
-		handler->zi_spa = spa;
-		handler->zi_record = *record;
 		list_insert_tail(&inject_handlers, handler);
 		atomic_inc_32(&zio_injection_enabled);
 
@@ -502,9 +713,23 @@ zio_clear_fault(int id)
 		return (SET_ERROR(ENOENT));
 	}
 
+	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+		ASSERT3S(inject_delay_count, >, 0);
+		inject_delay_count--;
+		ASSERT3S(inject_delay_count, >=, 0);
+	}
+
 	list_remove(&inject_handlers, handler);
 	rw_exit(&inject_lock);
 
+	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+		ASSERT3P(handler->zi_lanes, !=, NULL);
+		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
+		    handler->zi_record.zi_nlanes);
+	} else {
+		ASSERT3P(handler->zi_lanes, ==, NULL);
+	}
+
 	spa_inject_delref(handler->zi_spa);
 	kmem_free(handler, sizeof (inject_handler_t));
 	atomic_dec_32(&zio_injection_enabled);
@@ -516,6 +741,7 @@ void
 zio_inject_init(void)
 {
 	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&inject_handlers, sizeof (inject_handler_t),
 	    offsetof(inject_handler_t, zi_link));
 }
@@ -524,5 +750,6 @@ void
 zio_inject_fini(void)
 {
 	list_destroy(&inject_handlers);
+	mutex_destroy(&inject_delay_mtx);
 	rw_destroy(&inject_lock);
 }
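
Expanding on the target-time fragment near the top of this message, the lane bookkeeping in zio_handle_io_delay() can be modelled in a few lines of self-contained C. The names below are invented for illustration; delay, nlanes, next_lane and lanes correspond to zi_record.zi_timer, zi_record.zi_nlanes, zi_next_lane and zi_lanes[] in the handler:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef int64_t hrtime_t;	/* nanoseconds */

/* Invented names: models a single ZINJECT_DELAY_IO handler. */
typedef struct delay_model {
	hrtime_t	delay;		/* ~ zi_record.zi_timer  */
	int		nlanes;		/* ~ zi_record.zi_nlanes */
	int		next_lane;	/* ~ zi_next_lane        */
	hrtime_t	*lanes;		/* ~ zi_lanes[]          */
} delay_model_t;

/* Returns the target completion time for one request arriving at "now". */
static hrtime_t
model_submit(delay_model_t *m, hrtime_t now)
{
	hrtime_t idle = now + m->delay;
	hrtime_t busy = m->lanes[m->next_lane] + m->delay;
	hrtime_t target = (idle > busy) ? idle : busy;

	m->lanes[m->next_lane] = target;			/* claim the lane */
	m->next_lane = (m->next_lane + 1) % m->nlanes;		/* round-robin */
	return (target);
}

int
main(void)
{
	/* 10ms per-lane delay, burst of four requests arriving at t = 0. */
	for (int nlanes = 1; nlanes <= 2; nlanes++) {
		delay_model_t m = {
			.delay = 10000000LL,
			.nlanes = nlanes,
			.lanes = calloc(nlanes, sizeof (hrtime_t)),
		};

		printf("%d lane(s):", nlanes);
		for (int i = 0; i < 4; i++)
			printf(" %lldms",
			    (long long)(model_submit(&m, 0) / 1000000));
		printf("\n");
		free(m.lanes);
	}
	return (0);
}

Run as-is this prints completions at 10/20/30/40 ms for one lane and 10/10/20/20 ms for two lanes, matching the behaviour described in the comments of zio_handle_io_delay(): with a single lane and a 10ms delay, only one request completes per 10ms interval.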


