Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 5 Jul 2017 16:20:22 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r320683 - head/usr.sbin/diskinfo
Message-ID:  <201707051620.v65GKMmN085409@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Wed Jul  5 16:20:22 2017
New Revision: 320683
URL: https://svnweb.freebsd.org/changeset/base/320683

Log:
  Add naive benchmark for SSDs in ZFS SLOG role.
  
  ZFS SLOGs have a very specific access pattern with many cache flushes,
  which none of the benchmarks I know can simulate.  Since SSD vendors rarely
  specify cache flush time, this measurement can be useful to explain why
  some ZFS pools are slower than expected.  This test writes data chunks
  of different sizes, each followed by a cache flush, similar to what a ZFS
  SLOG does, and measures the average time.
  
  To illustrate, here is the result for a 6-year-old SATA Intel 710 Series SSD:
  
  Synchronous random writes:
           0.5 kbytes:    138.3 usec/IO =      3.5 Mbytes/s
             1 kbytes:    137.7 usec/IO =      7.1 Mbytes/s
             2 kbytes:    151.1 usec/IO =     12.9 Mbytes/s
             4 kbytes:    158.2 usec/IO =     24.7 Mbytes/s
             8 kbytes:    175.6 usec/IO =     44.5 Mbytes/s
            16 kbytes:    210.1 usec/IO =     74.4 Mbytes/s
            32 kbytes:    274.2 usec/IO =    114.0 Mbytes/s
            64 kbytes:    416.5 usec/IO =    150.1 Mbytes/s
           128 kbytes:    776.6 usec/IO =    161.0 Mbytes/s
           256 kbytes:   1503.1 usec/IO =    166.3 Mbytes/s
           512 kbytes:   2968.7 usec/IO =    168.4 Mbytes/s
          1024 kbytes:   5866.8 usec/IO =    170.5 Mbytes/s
          2048 kbytes:  11696.6 usec/IO =    171.0 Mbytes/s
          4096 kbytes:  23329.6 usec/IO =    171.5 Mbytes/s
          8192 kbytes:  46779.5 usec/IO =    171.0 Mbytes/s
  
  and for a much newer and supposedly much faster NVMe Samsung 950 PRO SSD:
  
  Synchronous random writes:
           0.5 kbytes:   2092.9 usec/IO =      0.2 Mbytes/s
             1 kbytes:   2013.1 usec/IO =      0.5 Mbytes/s
             2 kbytes:   2014.8 usec/IO =      1.0 Mbytes/s
             4 kbytes:   2090.7 usec/IO =      1.9 Mbytes/s
             8 kbytes:   2044.5 usec/IO =      3.8 Mbytes/s
            16 kbytes:   2084.8 usec/IO =      7.5 Mbytes/s
            32 kbytes:   2137.1 usec/IO =     14.6 Mbytes/s
            64 kbytes:   2173.4 usec/IO =     28.8 Mbytes/s
           128 kbytes:   2923.9 usec/IO =     42.8 Mbytes/s
           256 kbytes:   3085.3 usec/IO =     81.0 Mbytes/s
           512 kbytes:   3112.2 usec/IO =    160.7 Mbytes/s
          1024 kbytes:   2430.6 usec/IO =    411.4 Mbytes/s
          2048 kbytes:   3788.9 usec/IO =    527.9 Mbytes/s
          4096 kbytes:   6198.0 usec/IO =    645.4 Mbytes/s
          8192 kbytes:  10764.9 usec/IO =    743.2 Mbytes/s
  
  While the first one obviously has maximal throughput limitations, the
  second one has such high cache flush latency (about 2 milliseconds) that
  it is almost useless in the SLOG role, despite its good throughput
  numbers.  Power loss protection is out of scope of this test, but I
  suspect it may be related.
  
  MFC after:	2 weeks
  Sponsored by:	iXsystems, Inc.

Modified:
  head/usr.sbin/diskinfo/diskinfo.8
  head/usr.sbin/diskinfo/diskinfo.c

Modified: head/usr.sbin/diskinfo/diskinfo.8
==============================================================================
--- head/usr.sbin/diskinfo/diskinfo.8	Wed Jul  5 16:10:30 2017	(r320682)
+++ head/usr.sbin/diskinfo/diskinfo.8	Wed Jul  5 16:20:22 2017	(r320683)
@@ -1,5 +1,6 @@
 .\"
 .\" Copyright (c) 2003 Poul-Henning Kamp
+.\" Copyright (c) 2017 Alexander Motin <mav@FreeBSD.org>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
@@ -28,7 +29,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd July 1, 2017
+.Dd July 4, 2017
 .Dt DISKINFO 8
 .Os
 .Sh NAME
@@ -36,7 +37,7 @@
 .Nd get information about disk device
 .Sh SYNOPSIS
 .Nm
-.Op Fl citv
+.Op Fl citSvw
 .Ar disk ...
 .Nm
 .Op Fl p
@@ -64,9 +65,16 @@ This is a string that identifies the physical path to 
 storage enclosure.
 .It Fl s
 Return the disk serial number
+.It Fl S
+Perform synchronous random write test (ZFS SLOG test),
+measuring time required to write data blocks of different size and
+flush disk cache.
+Blocks of more than 128KB are written with multiple parallel operations.
 .It Fl t
 Perform a simple and rather naive benchmark of the disks seek
 and transfer performance.
+.It Fl w
+Allow disruptive write tests.
 .El
 .Pp
 If given no arguments, the output will be a single line per specified device

Modified: head/usr.sbin/diskinfo/diskinfo.c
==============================================================================
--- head/usr.sbin/diskinfo/diskinfo.c	Wed Jul  5 16:10:30 2017	(r320682)
+++ head/usr.sbin/diskinfo/diskinfo.c	Wed Jul  5 16:20:22 2017	(r320683)
@@ -1,6 +1,7 @@
 /*-
  * Copyright (c) 2003 Poul-Henning Kamp
  * Copyright (c) 2015 Spectra Logic Corporation
+ * Copyright (c) 2017 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -40,6 +41,7 @@
 #include <libutil.h>
 #include <paths.h>
 #include <err.h>
+#include <sysexits.h>
 #include <sys/aio.h>
 #include <sys/disk.h>
 #include <sys/param.h>
@@ -51,15 +53,16 @@
 static void
 usage(void)
 {
-	fprintf(stderr, "usage: diskinfo [-citv] disk ...\n");
+	fprintf(stderr, "usage: diskinfo [-cipsStvw] disk ...\n");
 	exit (1);
 }
 
-static int opt_c, opt_i, opt_p, opt_s, opt_t, opt_v;
+static int opt_c, opt_i, opt_p, opt_s, opt_S, opt_t, opt_v, opt_w;
 
 static void speeddisk(int fd, off_t mediasize, u_int sectorsize);
 static void commandtime(int fd, off_t mediasize, u_int sectorsize);
 static void iopsbench(int fd, off_t mediasize, u_int sectorsize);
+static void slogbench(int fd, int isreg, off_t mediasize, u_int sectorsize);
 static int zonecheck(int fd, uint32_t *zone_mode, char *zone_str,
 		     size_t zone_str_len);
 
@@ -71,10 +74,10 @@ main(int argc, char **argv)
 	char buf[BUFSIZ], ident[DISK_IDENT_SIZE], physpath[MAXPATHLEN];
 	char zone_desc[64];
 	off_t	mediasize, stripesize, stripeoffset;
-	u_int	sectorsize, fwsectors, fwheads, zoned = 0;
+	u_int	sectorsize, fwsectors, fwheads, zoned = 0, isreg;
 	uint32_t zone_mode;
 
-	while ((ch = getopt(argc, argv, "cipstv")) != -1) {
+	while ((ch = getopt(argc, argv, "cipsStvw")) != -1) {
 		switch (ch) {
 		case 'c':
 			opt_c = 1;
@@ -90,6 +93,10 @@ main(int argc, char **argv)
 		case 's':
 			opt_s = 1;
 			break;
+		case 'S':
+			opt_S = 1;
+			opt_v = 1;
+			break;
 		case 't':
 			opt_t = 1;
 			opt_v = 1;
@@ -97,6 +104,9 @@ main(int argc, char **argv)
 		case 'v':
 			opt_v = 1;
 			break;
+		case 'w':
+			opt_w = 1;
+			break;
 		default:
 			usage();
 		}
@@ -112,8 +122,13 @@ main(int argc, char **argv)
 		usage();
 	}
 
+	if (opt_S && !opt_w) {
+		warnx("-S require also -w");
+		usage();
+	}
+
 	for (i = 0; i < argc; i++) {
-		fd = open(argv[i], O_RDONLY | O_DIRECT);
+		fd = open(argv[i], (opt_w ? O_RDWR : O_RDONLY) | O_DIRECT);
 		if (fd < 0 && errno == ENOENT && *argv[i] != '/') {
 			snprintf(buf, BUFSIZ, "%s%s", _PATH_DEV, argv[i]);
 			fd = open(buf, O_RDONLY);
@@ -128,7 +143,8 @@ main(int argc, char **argv)
 			exitval = 1;
 			goto out;
 		}
-		if (S_ISREG(sb.st_mode)) {
+		isreg = S_ISREG(sb.st_mode);
+		if (isreg) {
 			mediasize = sb.st_size;
 			sectorsize = S_BLKSIZE;
 			fwsectors = 0;
@@ -228,16 +244,18 @@ main(int argc, char **argv)
 			speeddisk(fd, mediasize, sectorsize);
 		if (opt_i)
 			iopsbench(fd, mediasize, sectorsize);
+		if (opt_S)
+			slogbench(fd, isreg, mediasize, sectorsize);
 out:
 		close(fd);
 	}
 	exit (exitval);
 }
 
+#define MAXTX (8*1024*1024)
+#define MEGATX (1024*1024)
+static uint8_t buf[MAXTX];
 
-static char sector[65536];
-static char mega[1024 * 1024];
-
 static void
 rdsect(int fd, off_t blockno, u_int sectorsize)
 {
@@ -245,7 +263,7 @@ rdsect(int fd, off_t blockno, u_int sectorsize)
 
 	if (lseek(fd, (off_t)blockno * sectorsize, SEEK_SET) == -1)
 		err(1, "lseek");
-	error = read(fd, sector, sectorsize);
+	error = read(fd, buf, sectorsize);
 	if (error == -1)
 		err(1, "read");
 	if (error != (int)sectorsize)
@@ -257,10 +275,10 @@ rdmega(int fd)
 {
 	int error;
 
-	error = read(fd, mega, sizeof(mega));
+	error = read(fd, buf, MEGATX);
 	if (error == -1)
 		err(1, "read");
-	if (error != sizeof(mega))
+	if (error != MEGATX)
 		errx(1, "disk too small for test.");
 }
 
@@ -321,6 +339,16 @@ TI(double count)
 }
 
 static void
+TS(u_int size, int count)	/* print mean usec/IO and Mbytes/s for count IOs of size bytes */
+{
+	double dt;	/* delta_t() result; scaled below as seconds */
+
+	dt = delta_t();	/* byte total below is computed in double: size * count can exceed UINT_MAX */
+	printf("%8.1f usec/IO = %8.1f Mbytes/s\n",
+	    dt * 1000000.0 / count, (double)size * count / dt / (1024 * 1024));
+}
+
+static void
 speeddisk(int fd, off_t mediasize, u_int sectorsize)
 {
 	int bulk, i;
@@ -555,6 +583,69 @@ iopsbench(int fd, off_t mediasize, u_int sectorsize)
 	iops(fd, mediasize, 128 * 1024);
 
 	printf("\n");
+}
+
+#define MAXIO (128*1024)
+#define MAXIOS (MAXTX / MAXIO)
+
+static void
+parwrite(int fd, size_t size, off_t off)	/* write size bytes from buf[] to fd at byte offset off */
+{
+	struct aiocb aios[MAXIOS];	/* one control block per queued chunk */
+	off_t o;	/* bytes already handed off; indexes buf[] and is added to off */
+	size_t s;	/* length of the chunk being queued */
+	int n, error;	/* n counts AIO requests submitted so far */
+	struct aiocb *aiop;
+
+	for (n = 0, o = 0; size > MAXIO; n++, size -= s, o += s) {	/* queue MAXIO-sized chunks while more than MAXIO remains */
+		s = (size >= MAXIO) ? MAXIO : size;
+		aiop = &aios[n];
+		bzero(aiop, sizeof(*aiop));
+		aiop->aio_buf = &buf[o];
+		aiop->aio_fildes = fd;
+		aiop->aio_offset = off + o;
+		aiop->aio_nbytes = s;
+		error = aio_write(aiop);
+		if (error != 0)
+			err(EX_IOERR, "AIO write submit error");
+	}
+	error = pwrite(fd, &buf[o], size, off + o);	/* remaining tail (at most MAXIO bytes) is written synchronously */
+	if (error < 0)
+		err(EX_IOERR, "Sync write error");
+	for (; n > 0; n--) {	/* collect one completion per request submitted above */
+		error = aio_waitcomplete(&aiop, NULL);
+		if (error < 0)
+			err(EX_IOERR, "AIO write wait error");
+	}
+}
+
+static void
+slogbench(int fd, int isreg, off_t mediasize, u_int sectorsize)
+{	/* time write+flush cycles at random offsets for doubling transfer sizes */
+	off_t off;	/* random block index, in units of size */
+	u_int size;	/* current transfer size in bytes */
+	int error, n, N;	/* N is the total number of IOs done for this size */
+
+	printf("Synchronous random writes:\n");
+	for (size = sectorsize; size <= MAXTX && size <= mediasize; size *= 2) {
+		printf("\t%4.4g kbytes: ", (double)size / 1024);
+		N = 0;
+		T0();
+		do {	/* run batches of 250 IOs until delta_t() reports >= 1.0 */
+			for (n = 0; n < 250; n++) {
+				off = random() % (mediasize / size);	/* divisor >= 1: loop stops once size > mediasize */
+				parwrite(fd, size, off * size);
+				if (isreg)	/* regular file: fsync(); otherwise DIOCGFLUSH ioctl */
+					error = fsync(fd);
+				else
+					error = ioctl(fd, DIOCGFLUSH);
+				if (error < 0)
+					err(EX_IOERR, "Flush error");
+			}
+			N += 250;
+		} while (delta_t() < 1.0);
+		TS(size, N);
+	}
 }
 
 static int



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201707051620.v65GKMmN085409>