Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 16 Sep 2015 22:15:51 +0000 (UTC)
From:      Warner Losh <imp@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r287876 - in projects/iosched/sys/cam: . ata scsi
Message-ID:  <201509162215.t8GMFp1b023705@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: imp
Date: Wed Sep 16 22:15:50 2015
New Revision: 287876
URL: https://svnweb.freebsd.org/changeset/base/287876

Log:
  Commit the post-BSDcan level (and a little more) iosched work.
  
  This work is described in the paper that I presented at BSDcan
  http://people.freebsd.org/~imp/bsdcan2015/iosched-v3.pdf
  section XII. Recent Changes.

Modified:
  projects/iosched/sys/cam/ata/ata_da.c
  projects/iosched/sys/cam/cam_iosched.c
  projects/iosched/sys/cam/cam_iosched.h
  projects/iosched/sys/cam/scsi/scsi_da.c

Modified: projects/iosched/sys/cam/ata/ata_da.c
==============================================================================
--- projects/iosched/sys/cam/ata/ata_da.c	Wed Sep 16 21:43:51 2015	(r287875)
+++ projects/iosched/sys/cam/ata/ata_da.c	Wed Sep 16 22:15:50 2015	(r287876)
@@ -98,11 +98,13 @@ typedef enum {
 typedef enum {
 	ADA_Q_NONE		= 0x00,
 	ADA_Q_4K		= 0x01,
+	ADA_Q_NCQ_TRIM_BROKEN	= 0x02,
 } ada_quirks;
 
 #define ADA_Q_BIT_STRING	\
 	"\020"			\
-	"\0014K"
+	"\0014K"		\
+	"\002NCQ_TRIM_BROKEN"
 
 typedef enum {
 	ADA_CCB_RAHEAD		= 0x01,
@@ -160,6 +162,8 @@ struct ada_softc {
 	int	 trim_max_ranges;
 	int	 read_ahead;
 	int	 write_cache;
+	int	 unmappedio;
+	int	 rotating;
 #ifdef ADA_TEST_FAILURE
 	int      force_read_error;
 	int      force_write_error;
@@ -173,6 +177,13 @@ struct ada_softc {
 	struct sysctl_oid	*sysctl_tree;
 	struct callout		sendordered_c;
 	struct trim_request	trim_req;
+#ifdef CAM_IO_STATS
+	struct sysctl_ctx_list	sysctl_stats_ctx;
+	struct sysctl_oid	*sysctl_stats_tree;
+	u_int	timeouts;
+	u_int	errors;
+	u_int	invalidations;
+#endif
 };
 
 struct ada_quirk_entry {
@@ -350,6 +361,38 @@ static struct ada_quirk_entry ada_quirk_
 	},
 	{
 		/*
+		 * Crucial M500 SSDs EU07 firmware
+		 * NCQ Trim works?
+		 */
+		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "EU07" },
+		/*quirks*/0
+	},
+	{
+		/*
+		 * Crucial M500 SSDs all other firmware
+		 * NCQ Trim doesn't work
+		 */
+		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "*" },
+		/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+	},
+	{
+		/*
+		 * Crucial M550 SSDs
+		 * NCQ Trim doesn't work, but only on MU01 firmware
+		 */
+		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M550*", "MU01" },
+		/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+	},
+	{
+		/*
+		 * Crucial MX100 SSDs
+		 * NCQ Trim doesn't work, but only on MU01 firmware
+		 */
+		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*MX100*", "MU01" },
+		/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+	},
+	{
+		/*
 		 * Crucial RealSSD C300 SSDs
 		 * 4k optimised
 		 */
@@ -422,6 +465,30 @@ static struct ada_quirk_entry ada_quirk_
 	},
 	{
 		/*
+		 * Micron M500 SSDs firmware EU07
+		 * NCQ Trim works?
+		 */
+		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "EU07" },
+		/*quirks*/0
+	},
+	{
+		/*
+		 * Micron M500 SSDs all other firmware
+		 * NCQ Trim doesn't work
+		 */
+		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "*" },
+		/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+	},
+	{
+		/*
+		 * Micron M5[15]0 SSDs
+		 * NCQ Trim doesn't work, but only MU01 firmware
+		 */
+		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M5[15]0*", "MU01" },
+		/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+	},
+	{
+		/*
 		 * OCZ Agility 2 SSDs
 		 * 4k optimised & trim only works in 4k requests + 4k aligned
 		 */
@@ -471,22 +538,22 @@ static struct ada_quirk_entry ada_quirk_
 	{
 		/*
 		 * Samsung 830 Series SSDs
-		 * 4k optimised
+		 * 4k optimised, NCQ TRIM broken (normal TRIM fine)
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD 830 Series*", "*" },
-		/*quirks*/ADA_Q_4K
+		/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
 	},
 	{
 		/*
 		 * Samsung 840 SSDs
-		 * 4k optimised
+		 * 4k optimised, NCQ TRIM broken (normal TRIM fine)
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 840*", "*" },
-		/*quirks*/ADA_Q_4K
+		/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
 	},
 	{
 		/*
-		 * Samsung 843T Series SSDs
+		 * Samsung PM843T Series SSDs
 		 * 4k optimised
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7WD*", "*" },
@@ -495,10 +562,10 @@ static struct ada_quirk_entry ada_quirk_
  	{
  		/*
 		 * Samsung 850 SSDs
-		 * 4k optimised
+		 * 4k optimised, NCQ TRIM broken (normal TRIM fine)
 		 */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 850*", "*" },
-		/*quirks*/ADA_Q_4K
+		/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
 	},
 	{
 		/*
@@ -782,8 +849,6 @@ adastrategy(struct bio *bp)
 	/*
 	 * Place it in the queue of disk activities for this disk
 	 */
-	if (bp->bio_cmd == BIO_DELETE) {
-	}
 	cam_iosched_queue_work(softc->cam_iosched, bp);
 
 	/*
@@ -865,7 +930,7 @@ adadump(void *arg, void *virtual, vm_off
 				    0,
 				    NULL,
 				    0,
-				    ada_default_timeout*1000);
+				    5*1000);
 
 		if (softc->flags & ADA_FLAG_CAN_48BIT)
 			ata_48bit_cmd(&ccb.ataio, ATA_FLUSHCACHE48, 0, 0, 0);
@@ -939,6 +1004,9 @@ adaoninvalidate(struct cam_periph *perip
 	 * De-register any async callbacks.
 	 */
 	xpt_register_async(0, adaasync, periph, periph->path);
+#ifdef CAM_IO_STATS
+	softc->invalidations++;
+#endif
 
 	/*
 	 * Return all queued I/O with ENXIO.
@@ -959,12 +1027,20 @@ adacleanup(struct cam_periph *periph)
 
 	cam_periph_unlock(periph);
 
+	cam_iosched_fini(softc->cam_iosched);
+
 	/*
 	 * If we can't free the sysctl tree, oh well...
 	 */
-	if ((softc->flags & ADA_FLAG_SCTX_INIT) != 0
-	    && sysctl_ctx_free(&softc->sysctl_ctx) != 0) {
-		xpt_print(periph->path, "can't remove sysctl context\n");
+	if ((softc->flags & ADA_FLAG_SCTX_INIT) != 0) {
+#ifdef CAM_IO_STATS
+		if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
+			xpt_print(periph->path,
+			    "can't remove sysctl stats context\n");
+#endif
+		if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
+			xpt_print(periph->path,
+			    "can't remove sysctl context\n");
 	}
 
 	disk_destroy(softc->disk);
@@ -977,16 +1053,9 @@ static void
 adasetdeletemethod(struct ada_softc *softc)
 {
 
-#if 0
-	/*
-	 * Don't set NCQ_DSM_TRIM method by default. It is currently
-	 * a "feature of interest" implicated in some data corruption.
-	 */
 	if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM)
 		softc->delete_method = ADA_DELETE_NCQ_DSM_TRIM;
-	else
-#endif
-	if (softc->flags & ADA_FLAG_CAN_TRIM)
+	else if (softc->flags & ADA_FLAG_CAN_TRIM)
 		softc->delete_method = ADA_DELETE_DSM_TRIM;
 	else if ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT))
 		softc->delete_method = ADA_DELETE_CFA_ERASE;
@@ -1069,7 +1138,8 @@ adaasync(void *callback_arg, u_int32_t c
 			 * the sim to do things properly. Perhaps we should look at log 13
 			 * dword 0 bit 0 and dword 1 bit 0 are set too...
 			 */
-			if ((softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 &&
+			if ((softc->quirks & ADA_Q_NCQ_TRIM_BROKEN) == 0 &&
+			    (softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 &&
 			    (cgd.ident_data.satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 &&
 			    (softc->flags & ADA_FLAG_CAN_TRIM) != 0)
 				softc->flags |= ADA_FLAG_CAN_NCQ_TRIM;
@@ -1165,6 +1235,12 @@ adasysctlinit(void *context, int pending
 	SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 		OID_AUTO, "write_cache", CTLFLAG_RW | CTLFLAG_MPSAFE,
 		&softc->write_cache, 0, "Enable disk write cache.");
+	SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+		OID_AUTO, "unmapped_io", CTLFLAG_RD | CTLFLAG_MPSAFE,
+		&softc->unmappedio, 0, "Unmapped I/O leaf");
+	SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+		OID_AUTO, "rotating", CTLFLAG_RD | CTLFLAG_MPSAFE,
+		&softc->rotating, 0, "Rotating media");
 #ifdef ADA_TEST_FAILURE
 	/*
 	 * Add a 'door bell' sysctl which allows one to set it from userland
@@ -1184,6 +1260,28 @@ adasysctlinit(void *context, int pending
 		&softc->periodic_read_error, 0,
 		"Force a read error every N reads (don't set too low).");
 #endif
+
+#ifdef CAM_IO_STATS
+	softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
+		SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
+		CTLFLAG_RD, 0, "Statistics");
+	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
+		SYSCTL_CHILDREN(softc->sysctl_stats_tree),
+		OID_AUTO, "timeouts", CTLFLAG_RD | CTLFLAG_MPSAFE,
+		&softc->timeouts, 0,
+		"Device timeouts reported by the SIM");
+	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
+		SYSCTL_CHILDREN(softc->sysctl_stats_tree),
+		OID_AUTO, "errors", CTLFLAG_RD | CTLFLAG_MPSAFE,
+		&softc->errors, 0,
+		"Transport errors reported by the SIM.");
+	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
+		SYSCTL_CHILDREN(softc->sysctl_stats_tree),
+		OID_AUTO, "pack_invalidations", CTLFLAG_RD | CTLFLAG_MPSAFE,
+		&softc->invalidations, 0,
+		"Device pack invalidations.");
+#endif
+
 	cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
 	    softc->sysctl_tree);
 
@@ -1270,7 +1368,7 @@ adaregister(struct cam_periph *periph, v
 		return(CAM_REQ_CMP_ERR);
 	}
 
-	if (cam_iosched_init(&softc->cam_iosched) != 0) {
+	if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
 		printf("adaregister: Unable to probe new device. "
 		       "Unable to allocate iosched memory\n");
 		return(CAM_REQ_CMP_ERR);
@@ -1346,8 +1444,12 @@ adaregister(struct cam_periph *periph, v
 	    "kern.cam.ada.%d.write_cache", periph->unit_number);
 	TUNABLE_INT_FETCH(announce_buf, &softc->write_cache);
 	/* Disable queue sorting for non-rotational media by default. */
-	cam_iosched_set_sort_queue(softc->cam_iosched,
-		cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING);
+	if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING) {
+		softc->rotating = 0;
+	} else {
+		softc->rotating = 1;
+	}
+	cam_iosched_set_sort_queue(softc->cam_iosched,  softc->rotating ? -1 : 0);
 	adagetparams(periph, cgd);
 	softc->disk = disk_alloc();
 	softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate;
@@ -1390,8 +1492,10 @@ adaregister(struct cam_periph *periph, v
 		softc->disk->d_delmaxsize = 256 * softc->params.secsize;
 	} else
 		softc->disk->d_delmaxsize = maxio;
-	if ((cpi.hba_misc & PIM_UNMAPPED) != 0)
+	if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
 		softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
+		softc->unmappedio = 1;
+	}
 	/*
 	 * If we can do RCVSND_FPDMA_QUEUED commands, we may be able to do
 	 * NCQ trims, if we support trims at all. We also need support from
@@ -1400,9 +1504,9 @@ adaregister(struct cam_periph *periph, v
 	 */
 	if (cpi.hba_misc & PIM_NCQ_KLUDGE)
 		softc->flags |= ADA_FLAG_PIM_CAN_NCQ_TRIM;
-	if ((softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 &&
-	    (cgd->ident_data.satacapabilities2 &
-		ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 &&
+	if ((softc->quirks & ADA_Q_NCQ_TRIM_BROKEN) == 0 &&
+	    (softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 &&
+	    (cgd->ident_data.satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 &&
 	    (softc->flags & ADA_FLAG_CAN_TRIM) != 0)
 		softc->flags |= ADA_FLAG_CAN_NCQ_TRIM;
 	strlcpy(softc->disk->d_descr, cgd->ident_data.model,
@@ -1675,8 +1779,7 @@ adastart(struct cam_periph *periph, unio
 		}
 
 		if ((bp->bio_flags & BIO_ORDERED) != 0 ||
-		    (bp->bio_cmd != BIO_DELETE &&
-		    (softc->flags & ADA_FLAG_NEED_OTAG) != 0)) {
+		    (bp->bio_cmd != BIO_DELETE && (softc->flags & ADA_FLAG_NEED_OTAG) != 0)) {
 			softc->flags &= ~ADA_FLAG_NEED_OTAG;
 			softc->flags |= ADA_FLAG_WAS_OTAG;
 			tag_code = 0;
@@ -1807,7 +1910,10 @@ adastart(struct cam_periph *periph, unio
 				ada_cfaerase(softc, bp, ataio);
 				break;
 			default:
-				panic("adastart: BIO_DELETE without method, not possible.");
+				biofinish(bp, NULL, EOPNOTSUPP);
+				xpt_release_ccb(start_ccb);
+				adaschedule(periph);
+				return;
 			}
 			start_ccb->ccb_h.ccb_state = ADA_CCB_TRIM;
 			start_ccb->ccb_h.flags |= CAM_UNLOCKED;
@@ -1893,7 +1999,7 @@ adadone(struct cam_periph *periph, union
 	case ADA_CCB_TRIM:
 	{
 		struct bio *bp;
-		int error, need_sched;
+		int error;
 
 		cam_periph_lock(periph);
 		bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
@@ -1945,7 +2051,7 @@ adadone(struct cam_periph *periph, union
 		if (softc->outstanding_cmds == 0)
 			softc->flags |= ADA_FLAG_WAS_OTAG;
 
-		need_sched = cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
+		cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
 		xpt_release_ccb(done_ccb);
 		if (state == ADA_CCB_TRIM) {
 			TAILQ_HEAD(, bio) queue;
@@ -1957,9 +2063,9 @@ adadone(struct cam_periph *periph, union
 			 * Normally, the xpt_release_ccb() above would make sure
 			 * that when we have more work to do, that work would
 			 * get kicked off. However, we specifically keep
-			 * trim running set to 0 before the call above to allow
+			 * trim_running set to 0 before the call above to allow
 			 * other I/O to progress when many BIO_DELETE requests
-			 * are pushed down. We set trim running to 0 and call
+			 * are pushed down. We set trim_running to 0 and call
 			 * daschedule again so that we don't stall if there are
 			 * no other I/Os pending apart from BIO_DELETEs.
 			 */
@@ -1977,8 +2083,7 @@ adadone(struct cam_periph *periph, union
 				biodone(bp1);
 			}
 		} else {
-			if (need_sched)
-				adaschedule(periph);
+			adaschedule(periph);
 			cam_periph_unlock(periph);
 			biodone(bp);
 		}
@@ -2070,6 +2175,31 @@ out:
 static int
 adaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
 {
+	struct ada_softc *softc;
+	struct cam_periph *periph;
+
+	periph = xpt_path_periph(ccb->ccb_h.path);
+	softc = (struct ada_softc *)periph->softc;
+
+	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
+	case CAM_CMD_TIMEOUT:
+#ifdef CAM_IO_STATS
+		softc->timeouts++;
+#endif
+		break;
+	case CAM_REQ_ABORTED:
+	case CAM_REQ_CMP_ERR:
+	case CAM_REQ_TERMIO:
+	case CAM_UNREC_HBA_ERROR:
+	case CAM_DATA_RUN_ERR:
+	case CAM_ATA_STATUS_ERROR:
+#ifdef CAM_IO_STATS
+		softc->errors++;
+#endif
+		break;
+	default:
+		break;
+	}
 
 	return(cam_periph_error(ccb, cam_flags, sense_flags, NULL));
 }

Modified: projects/iosched/sys/cam/cam_iosched.c
==============================================================================
--- projects/iosched/sys/cam/cam_iosched.c	Wed Sep 16 21:43:51 2015	(r287875)
+++ projects/iosched/sys/cam/cam_iosched.c	Wed Sep 16 22:15:50 2015	(r287876)
@@ -63,9 +63,6 @@ static MALLOC_DEFINE(M_CAMSCHED, "CAM I/
  */
 
 #ifdef CAM_NETFLIX_IOSCHED
-#define IOP_MAX_SKIP 50
-#define IOP_MAX_TRAINING 500
-#define ALPHA_BITS 14                                   /* ~32k events or about the last minute */
 
 SYSCTL_DECL(_kern_cam);
 static int do_netflix_iosched = 1;
@@ -74,20 +71,178 @@ SYSCTL_INT(_kern_cam, OID_AUTO, do_netfl
     &do_netflix_iosched, 1,
     "Enable Netflix I/O scheduler optimizations.");
 
+static int alpha_bits = 9;
+TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits);
+SYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW,
+    &alpha_bits, 1,
+    "Bits in EMA's alpha.");
+
+
+
+struct iop_stats;
+struct cam_iosched_softc;
+
 int iosched_debug = 0;
 
+typedef enum {
+	none = 0,				/* No limits */
+	queue_depth,			/* Limit how many ops we queue to SIM */
+	iops,				/* Limit # of IOPS to the drive */
+	bandwidth,			/* Limit bandwidth to the drive */
+	limiter_max
+} io_limiter;
+	
+static const char *cam_iosched_limiter_names[] =
+    { "none", "queue_depth", "iops", "bandwidth" };
+
+/*
+ * Called to initialize the bits of the iop_stats structure relevant to the
+ * limiter. Called just after the limiter is set.
+ */
+typedef int l_init_t(struct iop_stats *);
+
+/*
+ * Called every tick.
+ */
+typedef int l_tick_t(struct iop_stats *);
+
+/*
+ * Called to see if the limiter thinks this IOP can be allowed to
+ * proceed. If so, the limiter assumes that the IOP proceeded
+ * and makes any accounting of it that's needed.
+ */ 
+typedef int l_iop_t(struct iop_stats *, struct bio *);
+
+/*
+ * Called when an I/O completes so the limiter can update its
+ * accounting. Pending I/Os may complete in any order (even when
+ * sent to the hardware at the same time), so the limiter may not
+ * make any assumptions other than this I/O has completed. If it
+ * returns 1, then xpt_schedule() needs to be called again.
+ */
+typedef int l_iodone_t(struct iop_stats *, struct bio *);
+
+static l_iop_t cam_iosched_qd_iop;
+static l_iop_t cam_iosched_qd_caniop;
+static l_iodone_t cam_iosched_qd_iodone;
+
+static l_init_t cam_iosched_iops_init;
+static l_tick_t cam_iosched_iops_tick;
+static l_iop_t cam_iosched_iops_caniop;
+static l_iop_t cam_iosched_iops_iop;
+
+static l_init_t cam_iosched_bw_init;
+static l_tick_t cam_iosched_bw_tick;
+static l_iop_t cam_iosched_bw_caniop;
+static l_iop_t cam_iosched_bw_iop;
+
+struct limswitch 
+{
+	l_init_t	*l_init;
+	l_tick_t	*l_tick;
+	l_iop_t		*l_iop;
+	l_iop_t		*l_caniop;
+	l_iodone_t	*l_iodone;
+} limsw[] =
+{
+	{	/* none */
+		.l_init = NULL,
+		.l_tick = NULL,
+		.l_iop = NULL,
+		.l_iodone= NULL,
+	},
+	{	/* queue_depth */
+		.l_init = NULL,
+		.l_tick = NULL,
+		.l_caniop = cam_iosched_qd_caniop,
+		.l_iop = cam_iosched_qd_iop,
+		.l_iodone= cam_iosched_qd_iodone,
+	},
+	{	/* iops */
+		.l_init = cam_iosched_iops_init,
+		.l_tick = cam_iosched_iops_tick,
+		.l_caniop = cam_iosched_iops_caniop,
+		.l_iop = cam_iosched_iops_iop,
+		.l_iodone= NULL,
+	},
+	{	/* bandwidth */
+		.l_init = cam_iosched_bw_init,
+		.l_tick = cam_iosched_bw_tick,
+		.l_caniop = cam_iosched_bw_caniop,
+		.l_iop = cam_iosched_bw_iop,
+		.l_iodone= NULL,
+	},
+};
+
 struct iop_stats 
 {
-	sbintime_t      data[IOP_MAX_TRAINING];	/* Data for training period */
-	sbintime_t	worst;		/* estimate of worst case latency */
-	int		outliers;	/* Number of outlier latency I/Os */
-	int		skipping;	/* Skipping I/Os when < IOP_MAX_SKIP */
-	int		training;	/* Training when < IOP_MAX_TRAINING */
+	/*
+	 * sysctl state for this subnode.
+	 */
+	struct sysctl_ctx_list	sysctl_ctx;
+	struct sysctl_oid	*sysctl_tree;
+
+	/*
+	 * Information about the current rate limiters, if any
+	 */
+	io_limiter	limiter;	/* How are I/Os being limited */
+	int		min;		/* Low range of limit */
+	int		max;		/* High range of limit */
+	int		current;	/* Current rate limiter */
+	int		l_value1;	/* per-limiter scratch value 1. */
+	int		l_value2;	/* per-limiter scratch value 2. */
+	
+
+	/*
+	 * Debug information about counts of I/Os that have gone through the
+	 * scheduler.
+	 */
+	int		pending;	/* I/Os pending in the hardware */
+	int		queued;		/* number currently in the queue */
+	int		total;		/* Total for all time -- wraps */
+	int		in;		/* number queued all time -- wraps */
+	int		out;		/* number completed all time -- wraps */
+	
+	/*
+	 * Statistics on different bits of the process.
+	 */
 		/* Exp Moving Average, alpha = 1 / (1 << alpha_bits) */
 	sbintime_t      ema;
 	sbintime_t      emss;		/* Exp Moving sum of the squares */
 	sbintime_t      sd;		/* Last computed sd */
+
+	struct cam_iosched_softc *softc;
 };
+
+
+typedef enum {
+	set_max = 0,			/* current = max */
+	read_latency,			/* Steer read latency by throttling writes */
+	cl_max				/* Keep last */
+} control_type;
+
+static const char *cam_iosched_control_type_names[] =
+    { "set_max", "read_latency" };
+
+struct control_loop
+{
+	/*
+	 * sysctl state for this subnode.
+	 */
+	struct sysctl_ctx_list	sysctl_ctx;
+	struct sysctl_oid	*sysctl_tree;
+
+	sbintime_t	next_steer;		/* Time of next steer */
+	sbintime_t	steer_interval;		/* How often do we steer? */
+	sbintime_t	lolat;
+	sbintime_t	hilat;
+	int		alpha;
+	control_type	type;			/* What type of control? */
+	int		last_count;		/* Last I/O count */
+
+	struct cam_iosched_softc *softc;
+};
+
 #endif
 
 struct cam_iosched_softc
@@ -98,36 +253,375 @@ struct cam_iosched_softc
 	uint32_t	flags;
 	int		sort_io_queue;
 #ifdef CAM_NETFLIX_IOSCHED
-	/* Number of pending transactions */
-	int		pending_reads;
-	int		pending_writes;
-	/* Have at least this many transactions in progress, if possible */
-	int		min_reads;
-	int		min_writes;
-	/* Maximum number of each type of transaction in progress */
-	int		max_reads;
-	int		max_writes;
-	
-	int		trims;
-	int		reads;
-	int		writes;
-	int		queued_reads;
-	int		queued_writes;
-	int		in_reads;
-	int		in_writes;
-	int		out_reads;
-	int		out_writes;
-
-	int		read_bias;
-	int		current_read_bias;
+	int		read_bias;		/* Read bias setting */
+	int		current_read_bias;	/* Current read bias state */
+	int		total_ticks;
 
 	struct bio_queue_head write_queue;
 	struct iop_stats read_stats, write_stats, trim_stats;
+	struct sysctl_ctx_list	sysctl_ctx;
+	struct sysctl_oid	*sysctl_tree;
+
+	int		quanta;			/* Number of quanta per second */
+	struct callout	ticker;			/* Callout for our quota system */
+	struct cam_periph *periph;		/* cam periph associated with this device */
+	uint32_t	this_frac;		/* Fraction of a second (1024ths) for this tick */
+	sbintime_t	last_time;		/* Last time we ticked */
+	struct control_loop cl;
 #endif
 };
 
+#ifdef CAM_NETFLIX_IOSCHED
+/*
+ * helper functions to call the limsw functions.
+ */
+static int
+cam_iosched_limiter_init(struct iop_stats *ios)
+{
+	int lim = ios->limiter;
+
+	/* maybe this should be a kassert */
+	if (lim < none || lim >= limiter_max)
+		return EINVAL;
+
+	if (limsw[lim].l_init)
+		return limsw[lim].l_init(ios);
+
+	return 0;
+}
+
+static int
+cam_iosched_limiter_tick(struct iop_stats *ios)
+{
+	int lim = ios->limiter;
+
+	/* maybe this should be a kassert */
+	if (lim < none || lim >= limiter_max)
+		return EINVAL;
+
+	if (limsw[lim].l_tick)
+		return limsw[lim].l_tick(ios);
+
+	return 0;
+}
+
+static int
+cam_iosched_limiter_iop(struct iop_stats *ios, struct bio *bp)
+{
+	int lim = ios->limiter;
+
+	/* maybe this should be a kassert */
+	if (lim < none || lim >= limiter_max)
+		return EINVAL;
+
+	if (limsw[lim].l_iop)
+		return limsw[lim].l_iop(ios, bp);
+
+	return 0;
+}
+
+static int
+cam_iosched_limiter_caniop(struct iop_stats *ios, struct bio *bp)
+{
+	int lim = ios->limiter;
+
+	/* maybe this should be a kassert */
+	if (lim < none || lim >= limiter_max)
+		return EINVAL;
+
+	if (limsw[lim].l_caniop)
+		return limsw[lim].l_caniop(ios, bp);
+
+	return 0;
+}
+
+static int
+cam_iosched_limiter_iodone(struct iop_stats *ios, struct bio *bp)
+{
+	int lim = ios->limiter;
+
+	/* maybe this should be a kassert */
+	if (lim < none || lim >= limiter_max)
+		return 0;
+
+	if (limsw[lim].l_iodone)
+		return limsw[lim].l_iodone(ios, bp);
+
+	return 0;
+}
+
+/*
+ * Functions to implement the different kinds of limiters
+ */
+
+static int
+cam_iosched_qd_iop(struct iop_stats *ios, struct bio *bp)
+{
+		
+	if (ios->current <= 0 || ios->pending < ios->current)
+		return 0;
+
+	return EAGAIN;
+}
+
+static int
+cam_iosched_qd_caniop(struct iop_stats *ios, struct bio *bp)
+{
+		
+	if (ios->current <= 0 || ios->pending < ios->current)
+		return 0;
+
+	return EAGAIN;
+}
+
+static int
+cam_iosched_qd_iodone(struct iop_stats *ios, struct bio *bp)
+{
+		
+	if (ios->current <= 0 || ios->pending != ios->current)
+		return 0;
+
+	return 1;
+}
+
+static int
+cam_iosched_iops_init(struct iop_stats *ios)
+{
+
+	ios->l_value1 = ios->current / ios->softc->quanta;
+	if (ios->l_value1 <= 0)
+		ios->l_value1 = 1;
+
+	return 0;
+}
+
+static int
+cam_iosched_iops_tick(struct iop_stats *ios)
+{
+
+	ios->l_value1 = (int)((ios->current * (uint64_t)ios->softc->this_frac) >> 16);
+	if (ios->l_value1 <= 0)
+		ios->l_value1 = 1;
+
+	return 0;
+}
+
+static int
+cam_iosched_iops_caniop(struct iop_stats *ios, struct bio *bp)
+{
+
+	/*
+	 * So if we have any more IOPs left, allow it,
+	 * otherwise wait.
+	 */
+	if (ios->l_value1 <= 0)
+		return EAGAIN;
+	return 0;
+}
+
+static int
+cam_iosched_iops_iop(struct iop_stats *ios, struct bio *bp)
+{
+	int rv;
+
+	rv = cam_iosched_limiter_caniop(ios, bp);
+	if (rv == 0)
+		ios->l_value1--;
+
+	return rv;
+}
+
+static int
+cam_iosched_bw_init(struct iop_stats *ios)
+{
+
+	/* ios->current is in kB/s, so scale to bytes */
+	ios->l_value1 = ios->current * 1000 / ios->softc->quanta;
+
+	return 0;
+}
+
+static int
+cam_iosched_bw_tick(struct iop_stats *ios)
+{
+	int bw;
+
+	/*
+	 * If we're in the hole for available quota from
+	 * the last time, then add the quantum for this.
+	 * If we have any left over from last quantum,
+	 * then too bad, that's lost. Also, ios->current
+	 * is in kB/s, so scale.
+	 *
+	 * We also allow up to 4 quanta of credits to
+	 * accumulate to deal with burstiness. 4 is extremely
+	 * arbitrary.
+	 */
+	bw = (int)((ios->current * 1000ull * (uint64_t)ios->softc->this_frac) >> 16);
+	if (ios->l_value1 < bw * 4)
+		ios->l_value1 += bw;
+
+	return 0;
+}
+
+static int
+cam_iosched_bw_caniop(struct iop_stats *ios, struct bio *bp)
+{
+	/*
+	 * So if we have any more bw quota left, allow it,
+	 * otherwise wait. Not, we'll go negative and that's
+	 * OK. We'll just get a lettle less next quota.
+	 *
+	 * Note on going negative: that allows us to process
+	 * requests in order better, since we won't allow
+	 * shorter reads to get around the long one that we
+	 * don't have the quota to do just yet. It also prevents
+	 * starvation by being a little more permissive about
+	 * what we let through this quantum (to prevent the
+	 * starvation), at the cost of getting a little less
+	 * next quantum.
+	 */
+	if (ios->l_value1 <= 0)
+		return EAGAIN;
+
+
+	return 0;
+}
+
+static int
+cam_iosched_bw_iop(struct iop_stats *ios, struct bio *bp)
+{
+	int rv;
+
+	rv = cam_iosched_limiter_caniop(ios, bp);
+	if (rv == 0)
+		ios->l_value1 -= bp->bio_length;
+
+	return rv;
+}
+
+static void cam_iosched_cl_maybe_steer(struct control_loop *clp);
+
+static void
+cam_iosched_ticker(void *arg)
+{
+	struct cam_iosched_softc *isc = arg;
+	sbintime_t now, delta;
+
+	callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
+
+	now = sbinuptime();
+	delta = now - isc->last_time;
+	isc->this_frac = (uint32_t)delta >> 16;		/* Note: discards seconds -- should be 0 harmless if not */
+	isc->last_time = now;
+
+	cam_iosched_cl_maybe_steer(&isc->cl);
+
+	cam_iosched_limiter_tick(&isc->read_stats);
+	cam_iosched_limiter_tick(&isc->write_stats);
+	cam_iosched_limiter_tick(&isc->trim_stats);
+
+	cam_iosched_schedule(isc, isc->periph);
+
+	isc->total_ticks++;
+}
+
+
+static void
+cam_iosched_cl_init(struct control_loop *clp, struct cam_iosched_softc *isc)
+{
+
+	clp->next_steer = sbinuptime();
+	clp->softc = isc;
+	clp->steer_interval = SBT_1S * 5;	/* Let's start out steering every 5s */
+	clp->lolat = 5 * SBT_1MS;
+	clp->hilat = 15 * SBT_1MS;
+	clp->alpha = 20;			/* Alpha == gain. 20 = .2 */
+	clp->type = set_max;
+}
+
+static void
+cam_iosched_cl_maybe_steer(struct control_loop *clp)
+{
+	struct cam_iosched_softc *isc;
+	sbintime_t now, lat;
+	int old;
+
+	isc = clp->softc;
+	now = isc->last_time;
+	if (now < clp->next_steer)
+		return;
+
+	clp->next_steer = now + clp->steer_interval;
+	switch (clp->type) {
+	case set_max:
+		if (isc->write_stats.current != isc->write_stats.max)
+			printf("Steering write from %d kBps to %d kBps\n",
+			    isc->write_stats.current, isc->write_stats.max);
+		isc->read_stats.current = isc->read_stats.max;
+		isc->write_stats.current = isc->write_stats.max;
+		isc->trim_stats.current = isc->trim_stats.max;
+		break;
+	case read_latency:
+		old = isc->write_stats.current;
+		lat = isc->read_stats.ema;
+		/*
+		 * Simple PLL-like engine. Since we're steering to a range for
+		 * the SP (set point) that makes things a little more
+		 * complicated. In addition, we're not directly controlling our
+		 * PV (process variable), the read latency, but instead are
+		 * manipulating the write bandwidth limit for our MV
+		 * (manipulation variable), analysis of this code gets a bit
+		 * messy. Also, the MV is a very noisy control surface for read
+		 * latency since it is affected by many hidden processes inside
+		 * the device which change how responsive read latency will be
+		 * in reaction to changes in write bandwidth. Unlike the classic
+		 * boiler control PLL. this may result in over-steering while
+		 * the SSD takes its time to react to the new, lower load. This
+		 * is why we use a relatively low alpha of between .1 and .25 to
+		 * compensate for this effect. At .1, it takes ~22 steering
+		 * intervals to back off by a factor of 10. At .2 it only takes
+		 * ~10. At .25 it only takes ~8. However some preliminary data
+	 * from the SSD drives suggests a response time in 10's of
+		 * seconds before latency drops regardless of the new write
+	 * rate. Careful observation will be required to tune this
+		 * effectively.
+		 *
+		 * Also, when there's no read traffic, we jack up the write
+		 * limit too regardless of the last read latency.  10 is
+		 * somewhat arbitrary.
+		 */
+		if (lat < clp->lolat || isc->read_stats.total - clp->last_count < 10)
+			isc->write_stats.current = isc->write_stats.current *
+			    (100 + clp->alpha) / 100;	/* Scale up */
+		else if (lat > clp->hilat)
+			isc->write_stats.current = isc->write_stats.current *
+			    (100 - clp->alpha) / 100;	/* Scale down */
+		clp->last_count = isc->read_stats.total;
+
+		/*
+		 * Even if we don't steer, per se, enforce the min/max limits as
+		 * those may have changed.
+		 */
+		if (isc->write_stats.current < isc->write_stats.min)
+			isc->write_stats.current = isc->write_stats.min;
+		if (isc->write_stats.current > isc->write_stats.max)
+			isc->write_stats.current = isc->write_stats.max;
+		if (old != isc->write_stats.current)
+			printf("Steering write from %d kBps to %d kBps due to latency of %ldus\n",
+			    old, isc->write_stats.current,
+			    ((uint64_t)1000000 * (uint32_t)lat) >> 32);
+		break;
+	case cl_max:
+		break;
+	}

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201509162215.t8GMFp1b023705>