Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 7 May 2013 08:57:34 GMT
From:      Adam Nowacki <nowak@tepeserwery.pl>
To:        freebsd-gnats-submit@FreeBSD.org
Subject:   kern/178387: [zfs] [patch] sparse files performance improvements
Message-ID:  <201305070857.r478vYev094211@oldred.FreeBSD.org>
Resent-Message-ID: <201305070900.r479009i089272@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help

>Number:         178387
>Category:       kern
>Synopsis:       [zfs] [patch] sparse files performance improvements
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Tue May 07 09:00:00 UTC 2013
>Closed-Date:
>Last-Modified:
>Originator:     Adam Nowacki
>Release:        9.1-STABLE
>Organization:
>Environment:
>Description:
Sparse files in ZFS are slow. The default behavior is to allocate a temporary dbuf, zero it, and destroy it on return. There is no caching. Bandwidth is limited by the CPU time spent bzeroing temporary dbufs.

The attached patch avoids creating temporary dbufs.

4k reads are important for mmapped files.

without the patch:

zfs recordsize=128k, 4k reads:
104857600 bytes transferred in 1.178753 secs (88956388 bytes/sec)

zfs recordsize=8M, 4k reads:
8388608 bytes transferred in 9.134382 secs (918355 bytes/sec)

with the patch:

zfs recordsize=128k, 4k reads: (13x improvement)
1048576000 bytes transferred in 0.904384 secs (1159436525 bytes/sec)

zfs recordsize=8M, 4k reads: (over 1000x improvement)
1048576000 bytes transferred in 0.909877 secs (1152437009 bytes/sec)

>How-To-Repeat:
# truncate -s 1000m testfile
# time dd if=testfile of=/dev/null bs=4k

>Fix:


Patch attached with submission follows:

Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 250290)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(working copy)
@@ -591,6 +591,7 @@
  */
 #define	DMU_READ_PREFETCH	0 /* prefetch */
 #define	DMU_READ_NO_PREFETCH	1 /* don't prefetch */
+#define	DMU_READ_FAIL_SPARSE	2
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	void *buf, uint32_t flags);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	(revision 250290)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	(working copy)
@@ -412,12 +412,19 @@
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, offset);
 	for (i = 0; i < nblks; i++) {
-		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
-		if (db == NULL) {
-			rw_exit(&dn->dn_struct_rwlock);
-			dmu_buf_rele_array(dbp, nblks, tag);
-			zio_nowait(zio);
-			return (SET_ERROR(EIO));
+		dmu_buf_impl_t *db;
+		int dberr = dbuf_hold_impl(dn, 0, blkid + i,
+		    flags & DMU_READ_FAIL_SPARSE ? TRUE : FALSE, tag, &db);
+		if (dberr) {
+			if (dberr == ENOENT && flags & DMU_READ_FAIL_SPARSE && read) {
+				dbp[i] = NULL;
+				continue;
+			} else {
+				rw_exit(&dn->dn_struct_rwlock);
+				dmu_buf_rele_array(dbp, nblks, tag);
+				zio_nowait(zio);
+				return (SET_ERROR(EIO));
+			}
 		}
 		/* initiate async i/o */
 		if (read)
@@ -442,7 +449,7 @@
 
 	/* wait for other io to complete */
 	if (read) {
-		for (i = 0; i < nblks; i++) {
+		for (i = 0; i < nblks; i++) if (dbp[i]) {
 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 			mutex_enter(&db->db_mtx);
 			while (db->db_state == DB_READ ||
@@ -755,6 +762,7 @@
 
 	while (size > 0) {
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+		uint64_t end_size = size - mylen;
 		int i;
 
 		/*
@@ -762,17 +770,22 @@
 		 * to be reading in parallel.
 		 */
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
-		    TRUE, FTAG, &numbufs, &dbp, flags);
+		    TRUE, FTAG, &numbufs, &dbp, flags | DMU_READ_FAIL_SPARSE);
 		if (err)
 			break;
 
 		for (i = 0; i < numbufs; i++) {
-			int tocpy;
-			int bufoff;
+			uint64_t bufoff, tocpy;
 			dmu_buf_t *db = dbp[i];
-
-			ASSERT(size > 0);
-
+			if (!db)
+				continue;
+			if (offset < db->db_offset) {
+				uint64_t len = db->db_offset - offset;
+				bzero(buf, len);
+				buf = (char *)buf + len;
+				offset += len;
+				size -= len;
+			}
 			bufoff = offset - db->db_offset;
 			tocpy = (int)MIN(db->db_size - bufoff, size);
 
@@ -782,6 +795,14 @@
 			size -= tocpy;
 			buf = (char *)buf + tocpy;
 		}
+		if (size > end_size) {
+			uint64_t len = size - end_size;
+			bzero(buf, len);
+			buf = (char *)buf + len;
+			offset += len;
+			size -= len;
+		}
+
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 	dnode_rele(dn, FTAG);
@@ -981,54 +1002,76 @@
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
 	xuio_t *xuio = NULL;
+	dnode_t *dn;
 
-	/*
-	 * NB: we could do this block-at-a-time, but it's nice
-	 * to be reading in parallel.
-	 */
-	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
-	    &numbufs, &dbp);
-	if (err)
-		return (err);
+	ASSERT(size > 0);
 
 #ifdef UIO_XUIO
 	if (uio->uio_extflg == UIO_XUIO)
 		xuio = (xuio_t *)uio;
 #endif
 
-	for (i = 0; i < numbufs; i++) {
-		int tocpy;
-		int bufoff;
-		dmu_buf_t *db = dbp[i];
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
 
-		ASSERT(size > 0);
+	/*
+	 * NB: we could do this block-at-a-time, but it's nice
+	 * to be reading in parallel.
+	 */
+	/* XXX - optimize sparse reads for xuio too? */
+	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, TRUE,
+	    FTAG, &numbufs, &dbp,
+	    DMU_READ_PREFETCH | (xuio ? 0 : DMU_READ_FAIL_SPARSE));
+	if (err) {
+		dnode_rele(dn, FTAG);
+		return (err);
+	}
 
-		bufoff = uio->uio_loffset - db->db_offset;
-		tocpy = (int)MIN(db->db_size - bufoff, size);
-
-		if (xuio) {
+	if (xuio) {
+		for (i = 0; i < numbufs; i++) {
+			dmu_buf_t *db = dbp[i];
+			int bufoff = uio->uio_loffset - db->db_offset;
+			int tocpy = MIN(db->db_size - bufoff, size);
 			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 			arc_buf_t *dbuf_abuf = dbi->db_buf;
 			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
 			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
-			if (!err) {
-				uio->uio_resid -= tocpy;
-				uio->uio_loffset += tocpy;
-			}
+			if (err)
+				break;
+			uio->uio_resid -= tocpy;
+			uio->uio_loffset += tocpy;
+			size -= tocpy;
 
 			if (abuf == dbuf_abuf)
 				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
 			else
 				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
-		} else {
-			err = uiomove((char *)db->db_data + bufoff, tocpy,
+		}
+	} else {
+		for (i = 0; i < numbufs; i++) {
+			dmu_buf_t *db = dbp[i];
+			if (!db)
+				continue;
+			if (uio->uio_loffset < db->db_offset) {
+				int len = db->db_offset - uio->uio_loffset;
+				err = uiozero(len, uio);
+				if (err)
+					break;
+				size -= len;
+			}
+			int bufoff = uio->uio_loffset - db->db_offset;
+			int tocpy = MIN(db->db_size - bufoff, size);
+			err = uiomove((char *)db->db_data + bufoff, tocpy,
 			    UIO_READ, uio);
+			if (err)
+				break;
+			size -= tocpy;
 		}
-		if (err)
-			break;
-
-		size -= tocpy;
+		if (size)
+			err = uiozero(size, uio);
 	}
+	dnode_rele(dn, FTAG);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (err);
Index: sys/sys/uio.h
===================================================================
--- sys/sys/uio.h	(revision 250290)
+++ sys/sys/uio.h	(working copy)
@@ -102,6 +102,8 @@
 	    struct uio *uio);
 int	uiomove_nofault(void *cp, int n, struct uio *uio);
 int	uiomoveco(void *cp, int n, struct uio *uio, int disposable);
+int	uiozero(int len, struct uio *uio);
+int	uiozero_nofault(int len, struct uio *uio);
 
 #else /* !_KERNEL */
 
Index: sys/kern/subr_uio.c
===================================================================
--- sys/kern/subr_uio.c	(revision 250290)
+++ sys/kern/subr_uio.c	(working copy)
@@ -65,7 +65,9 @@
 	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
 
 static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault);
+static int uiozero_faultflag(int len, struct uio *uio, int nofault);
 
+
 #ifdef ZERO_COPY_SOCKETS
 /* Declared in uipc_socket.c */
 extern int so_zero_copy_receive;
@@ -261,6 +263,32 @@
 	return (uiomove((char *)buf + offset, n, uio));
 }
 
+int
+uiozero(int len, struct uio *uio)
+{
+	return (uiozero_faultflag(len, uio, 0));
+}
+
+int
+uiozero_nofault(int len, struct uio *uio)
+{
+	return (uiozero_faultflag(len, uio, 1));
+}
+
+static int
+uiozero_faultflag(int len, struct uio *uio, int nofault)
+{
+	int bs, error;
+	for (; len > 0; len -= bs) {
+		bs = MIN(len, ZERO_REGION_SIZE);
+		error = uiomove_faultflag(__DECONST(void *, zero_region),
+			bs, uio, nofault);
+		if (error)
+			return (error);
+	}
+	return 0;
+}
+
 #ifdef ZERO_COPY_SOCKETS
 /*
  * Experimental support for zero-copy I/O


>Release-Note:
>Audit-Trail:
>Unformatted:



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201305070857.r478vYev094211>