Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 22 Dec 2014 20:58:52 +0000 (UTC)
From:      Xin LI <delphij@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org
Subject:   svn commit: r276081 - in stable/10: cddl/contrib/opensolaris/cmd/zdb cddl/contrib/opensolaris/cmd/zfs cddl/contrib/opensolaris/cmd/zpool cddl/contrib/opensolaris/cmd/zstreamdump cddl/contrib/openso...
Message-ID:  <201412222058.sBMKwqfr055298@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: delphij
Date: Mon Dec 22 20:58:51 2014
New Revision: 276081
URL: https://svnweb.freebsd.org/changeset/base/276081

Log:
  MFC r274337,r274673,274681,r275515:
  
  ZFS large block support.  The default recordsize remains at 128KB.
  
  A new tunable/sysctl variable, vfs.zfs.max_recordsize is added to
  allow adjusting the permitted maximum record size, or
  zfs_max_recordsize, with a default of 1MB.  ZFS will not allow
  setting recordsize greater than zfs_max_recordsize as a safety
  belt, because larger recordsize means greater read and write
  latency and more memory usage.
  
  Please note that booting from datasets that have recordsize greater
  than 128KB is not supported (but it's Okay to enable the feature on
  the pool).
  
  Limited safety belt is provided for mounted root filesystem but use
  caution when using a larger value.
  
  Illumos issue:
      5027 zfs large block support

Modified:
  stable/10/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs.8
  stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
  stable/10/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
  stable/10/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
  stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
  stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
  stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
  stable/10/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
  stable/10/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
  stable/10/sys/boot/zfs/zfsimpl.c
  stable/10/sys/cddl/boot/zfs/zfsimpl.h
  stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
  stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
  stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
  stable/10/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -2147,6 +2147,8 @@ dump_label(const char *dev)
 	(void) close(fd);
 }
 
+static uint64_t num_large_blocks;
+
 /*ARGSUSED*/
 static int
 dump_one_dir(const char *dsname, void *arg)
@@ -2159,6 +2161,8 @@ dump_one_dir(const char *dsname, void *a
 		(void) printf("Could not open %s, error %d\n", dsname, error);
 		return (0);
 	}
+	if (dmu_objset_ds(os)->ds_large_blocks)
+		num_large_blocks++;
 	dump_dir(os);
 	dmu_objset_disown(os, FTAG);
 	fuid_table_destroy();
@@ -2169,7 +2173,7 @@ dump_one_dir(const char *dsname, void *a
 /*
  * Block statistics.
  */
-#define	PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
+#define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
 	uint64_t zb_asize;
 	uint64_t zb_lsize;
@@ -2234,7 +2238,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *
 		zb->zb_lsize += BP_GET_LSIZE(bp);
 		zb->zb_psize += BP_GET_PSIZE(bp);
 		zb->zb_count++;
-		zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
+
+		/*
+		 * The histogram is only big enough to record blocks up to
+		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+		 * "other", bucket.
+		 */
+		int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+		zb->zb_psize_histogram[idx]++;
 
 		zb->zb_gangs += BP_COUNT_GANG(bp);
 
@@ -2946,6 +2958,7 @@ dump_zpool(spa_t *spa)
 		dump_metaslab_groups(spa);
 
 	if (dump_opt['d'] || dump_opt['i']) {
+		uint64_t refcount;
 		dump_dir(dp->dp_meta_objset);
 		if (dump_opt['d'] >= 3) {
 			dump_bpobj(&spa->spa_deferred_bpobj,
@@ -2965,8 +2978,21 @@ dump_zpool(spa_t *spa)
 		}
 		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+		(void) feature_get_refcount(spa,
+		    &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount);
+		if (num_large_blocks != refcount) {
+			(void) printf("large_blocks feature refcount mismatch: "
+			    "expected %lld != actual %lld\n",
+			    (longlong_t)num_large_blocks,
+			    (longlong_t)refcount);
+			rc = 2;
+		} else {
+			(void) printf("Verified large_blocks feature refcount "
+			    "is correct (%llu)\n", (longlong_t)refcount);
+		}
 	}
-	if (dump_opt['b'] || dump_opt['c'])
+	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
 		rc = dump_block_stats(spa);
 
 	if (rc == 0)

Modified: stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs.8
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs.8	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs.8	Mon Dec 22 20:58:51 2014	(r276081)
@@ -30,7 +30,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd June 30, 2014
+.Dd November 10, 2014
 .Dt ZFS 8
 .Os
 .Sh NAME
@@ -179,12 +179,12 @@
 .Ar bookmark
 .Nm
 .Cm send
-.Op Fl DnPpRve
+.Op Fl DnPpRveL
 .Op Fl i Ar snapshot | Fl I Ar snapshot
 .Ar snapshot
 .Nm
 .Cm send
-.Op Fl e
+.Op Fl eL
 .Op Fl i Ar snapshot Ns | Ns bookmark
 .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
 .Nm
@@ -1187,6 +1187,12 @@ systems is strongly discouraged, and may
 .Pp
 The size specified must be a power of two greater than or equal to 512 and less
 than or equal to 128 Kbytes.
+If the
+.Sy large_blocks
+feature is enabled on the pool, the size may be up to 1 Mbyte.
+See
+.Xr zpool-features 7
+for details on ZFS feature flags.
 .Pp
 Changing the file system's
 .Sy recordsize
@@ -2477,7 +2483,7 @@ feature.
 .It Xo
 .Nm
 .Cm send
-.Op Fl DnPpRve
+.Op Fl DnPpRveL
 .Op Fl i Ar snapshot | Fl I Ar snapshot
 .Ar snapshot
 .Xc
@@ -2549,6 +2555,22 @@ be used regardless of the dataset's
 property, but performance will be much better if the filesystem uses a
 dedup-capable checksum (eg.
 .Sy sha256 ) .
+.It Fl L
+Generate a stream which may contain blocks larger than 128KB.
+This flag
+has no effect if the
+.Sy large_blocks
+pool feature is disabled, or if the
+.Sy recordsize
+property of this filesystem has never been set above 128KB.
+The receiving system must have the
+.Sy large_blocks
+pool feature enabled as well.
+See
+.Xr zpool-features 7
+for details on ZFS feature flags and the
+.Sy large_blocks
+feature.
 .It Fl e
 Generate a more compact stream by using WRITE_EMBEDDED records for blocks
 which are stored more compactly on disk by the
@@ -2596,7 +2618,7 @@ on future versions of
 .It Xo
 .Nm
 .Cm send
-.Op Fl e
+.Op Fl eL
 .Op Fl i Ar snapshot Ns | Ns Ar bookmark
 .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
 .Xc
@@ -2622,6 +2644,22 @@ specified as the last component of the n
 If the incremental target is a clone, the incremental source can
 be the origin snapshot, or an earlier snapshot in the origin's filesystem,
 or the origin's origin, etc.
+.It Fl L
+Generate a stream which may contain blocks larger than 128KB.
+This flag
+has no effect if the
+.Sy large_blocks
+pool feature is disabled, or if the
+.Sy recordsize
+property of this filesystem has never been set above 128KB.
+The receiving system must have the
+.Sy large_blocks
+pool feature enabled as well.
+See
+.Xr zpool-features 7
+for details on ZFS feature flags and the
+.Sy large_blocks
+feature.
 .It Fl e
 Generate a more compact stream by using WRITE_EMBEDDED records for blocks
 which are stored more compactly on disk by the

Modified: stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -274,9 +274,9 @@ get_usage(zfs_help_t idx)
 	case HELP_ROLLBACK:
 		return (gettext("\trollback [-rRf] <snapshot>\n"));
 	case HELP_SEND:
-		return (gettext("\tsend [-DnPpRve] [-[iI] snapshot] "
+		return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
 		    "<snapshot>\n"
-		    "\tsend [-e] [-i snapshot|bookmark] "
+		    "\tsend [-Le] [-i snapshot|bookmark] "
 		    "<filesystem|volume|snapshot>\n"));
 	case HELP_SET:
 		return (gettext("\tset <property=value> "
@@ -3709,7 +3709,7 @@ zfs_do_send(int argc, char **argv)
 	boolean_t extraverbose = B_FALSE;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
+	while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
 		switch (c) {
 		case 'i':
 			if (fromname)
@@ -3744,6 +3744,9 @@ zfs_do_send(int argc, char **argv)
 		case 'n':
 			flags.dryrun = B_TRUE;
 			break;
+		case 'L':
+			flags.largeblock = B_TRUE;
+			break;
 		case 'e':
 			flags.embed_data = B_TRUE;
 			break;
@@ -3800,6 +3803,8 @@ zfs_do_send(int argc, char **argv)
 		if (zhp == NULL)
 			return (1);
 
+		if (flags.largeblock)
+			lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
 		if (flags.embed_data)
 			lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
 

Modified: stable/10/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7	Mon Dec 22 20:58:51 2014	(r276081)
@@ -23,7 +23,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd July 1, 2014
+.Dd November 10, 2014
 .Dt ZPOOL-FEATURES 7
 .Os
 .Sh NAME
@@ -427,6 +427,33 @@ This feature becomes
 as soon as it is enabled and will
 never return to being
 .Sy enabled .
+.It Sy large_blocks
+.Bl -column "READ\-ONLY COMPATIBLE" "org.open-zfs:large_block"
+.It GUID Ta org.open-zfs:large_block
+.It READ\-ONLY COMPATIBLE Ta no
+.It DEPENDENCIES Ta extensible_dataset
+.El
+.Pp
+The
+.Sy large_block
+feature allows the record size on a dataset to be
+set larger than 128KB.
+.Pp
+This feature becomes
+.Sy active
+once a
+.Sy recordsize
+property has been set larger than 128KB, and will return to being 
+.Sy enabled
+once all filesystems that have ever had their recordsize larger than 128KB
+are destroyed.
+.Pp
+Please note that booting from datasets that have recordsize greater than
+128KB is
+.Em NOT
+supported by the
+.Fx
+boot loader.
 .El
 .Sh SEE ALSO
 .Xr zpool 8

Modified: stable/10/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -54,7 +54,6 @@ uint64_t total_stream_len = 0;
 FILE *send_stream = 0;
 boolean_t do_byteswap = B_FALSE;
 boolean_t do_cksum = B_TRUE;
-#define	INITIAL_BUFLEN (1<<20)
 
 static void
 usage(void)
@@ -67,6 +66,18 @@ usage(void)
 	exit(1);
 }
 
+static void *
+safe_malloc(size_t size)
+{
+	void *rv = malloc(size);
+	if (rv == NULL) {
+		(void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n",
+		    size);
+		abort();
+	}
+	return (rv);
+}
+
 /*
  * ssread - send stream read.
  *
@@ -158,7 +169,7 @@ print_block(char *buf, int length)
 int
 main(int argc, char *argv[])
 {
-	char *buf = malloc(INITIAL_BUFLEN);
+	char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
 	uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
 	uint64_t total_records = 0;
 	dmu_replay_record_t thedrr;
@@ -307,9 +318,9 @@ main(int argc, char *argv[])
 				nvlist_t *nv;
 				int sz = drr->drr_payloadlen;
 
-				if (sz > INITIAL_BUFLEN) {
+				if (sz > SPA_MAXBLOCKSIZE) {
 					free(buf);
-					buf = malloc(sz);
+					buf = safe_malloc(sz);
 				}
 				(void) ssread(buf, sz, &zc);
 				if (ferror(send_stream))

Modified: stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -987,9 +987,15 @@ ztest_spa_get_ashift() {
 static int
 ztest_random_blocksize(void)
 {
-	// Choose a block size >= the ashift.
-	uint64_t block_shift =
-	    ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
+	uint64_t block_shift;
+	/*
+	 * Choose a block size >= the ashift.
+	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
+	 */
+	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
+	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
+		maxbs = 20;
+	block_shift = ztest_random(maxbs - ztest_spa_get_ashift() + 1);
 	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
 }
 
@@ -4789,7 +4795,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint6
 	char path0[MAXPATHLEN];
 	char pathrand[MAXPATHLEN];
 	size_t fsize;
-	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
+	int bshift = SPA_OLD_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
 	int iters = 1000;
 	int maxfaults;
 	int mirror_save;

Modified: stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
==============================================================================
--- stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h	Mon Dec 22 20:58:51 2014	(r276081)
@@ -609,6 +609,9 @@ typedef struct sendflags {
 	/* show progress (ie. -v) */
 	boolean_t progress;
 
+	/* large blocks (>128K) are permitted */
+	boolean_t largeblock;
+
 	/* WRITE_EMBEDDED records of type DATA are permitted */
 	boolean_t embed_data;
 } sendflags_t;

Modified: stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -1080,21 +1080,36 @@ zfs_valid_proplist(libzfs_handle_t *hdl,
 			break;
 		}
 
-		case ZFS_PROP_RECORDSIZE:
 		case ZFS_PROP_VOLBLOCKSIZE:
-			/* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */
+		case ZFS_PROP_RECORDSIZE:
+		{
+			int maxbs = SPA_MAXBLOCKSIZE;
+			if (zhp != NULL) {
+				maxbs = zpool_get_prop_int(zhp->zpool_hdl,
+				    ZPOOL_PROP_MAXBLOCKSIZE, NULL);
+			}
+			/*
+			 * Volumes are limited to a volblocksize of 128KB,
+			 * because they typically service workloads with
+			 * small random writes, which incur a large performance
+			 * penalty with large blocks.
+			 */
+			if (prop == ZFS_PROP_VOLBLOCKSIZE)
+				maxbs = SPA_OLD_MAXBLOCKSIZE;
+			/*
+			 * The value must be a power of two between
+			 * SPA_MINBLOCKSIZE and maxbs.
+			 */
 			if (intval < SPA_MINBLOCKSIZE ||
-			    intval > SPA_MAXBLOCKSIZE || !ISP2(intval)) {
+			    intval > maxbs || !ISP2(intval)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "'%s' must be power of 2 from %u "
-				    "to %uk"), propname,
-				    (uint_t)SPA_MINBLOCKSIZE,
-				    (uint_t)SPA_MAXBLOCKSIZE >> 10);
+				    "'%s' must be power of 2 from 512B "
+				    "to %uKB"), propname, maxbs >> 10);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
-
+		}
 		case ZFS_PROP_MLSLABEL:
 		{
 #ifdef sun
@@ -1465,7 +1480,9 @@ zfs_setprop_error(libzfs_handle_t *hdl, 
 		break;
 
 	case ERANGE:
-		if (prop == ZFS_PROP_COMPRESSION) {
+	case EDOM:
+		if (prop == ZFS_PROP_COMPRESSION ||
+		    prop == ZFS_PROP_RECORDSIZE) {
 			(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "property setting is not allowed on "
 			    "bootable datasets"));
@@ -3191,9 +3208,7 @@ zfs_create(libzfs_handle_t *hdl, const c
 		case EDOM:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume block size must be power of 2 from "
-			    "%u to %uk"),
-			    (uint_t)SPA_MINBLOCKSIZE,
-			    (uint_t)SPA_MAXBLOCKSIZE >> 10);
+			    "512B to 128KB"));
 
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 

Modified: stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -215,7 +215,7 @@ static void *
 cksummer(void *arg)
 {
 	dedup_arg_t *dda = arg;
-	char *buf = malloc(1<<20);
+	char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
 	dmu_replay_record_t thedrr;
 	dmu_replay_record_t *drr = &thedrr;
 	struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
@@ -280,9 +280,9 @@ cksummer(void *arg)
 			    DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
 				int sz = drr->drr_payloadlen;
 
-				if (sz > 1<<20) {
-					free(buf);
-					buf = malloc(sz);
+				if (sz > SPA_MAXBLOCKSIZE) {
+					buf = zfs_realloc(dda->dedup_hdl, buf,
+					    SPA_MAXBLOCKSIZE, sz);
 				}
 				(void) ssread(buf, sz, ofp);
 				if (ferror(stdin))
@@ -815,7 +815,7 @@ typedef struct send_dump_data {
 	char prevsnap[ZFS_MAXNAMELEN];
 	uint64_t prevsnap_obj;
 	boolean_t seenfrom, seento, replicate, doall, fromorigin;
-	boolean_t verbose, dryrun, parsable, progress, embed_data;
+	boolean_t verbose, dryrun, parsable, progress, embed_data, large_block;
 	int outfd;
 	boolean_t err;
 	nvlist_t *fss;
@@ -1163,6 +1163,8 @@ dump_snapshot(zfs_handle_t *zhp, void *a
 		}
 
 		enum lzc_send_flags flags = 0;
+		if (sdd->large_block)
+			flags |= LZC_SEND_FLAG_LARGE_BLOCK;
 		if (sdd->embed_data)
 			flags |= LZC_SEND_FLAG_EMBED_DATA;
 
@@ -1511,6 +1513,7 @@ zfs_send(zfs_handle_t *zhp, const char *
 	sdd.parsable = flags->parsable;
 	sdd.progress = flags->progress;
 	sdd.dryrun = flags->dryrun;
+	sdd.large_block = flags->largeblock;
 	sdd.embed_data = flags->embed_data;
 	sdd.filter_cb = filter_func;
 	sdd.filter_cb_arg = cb_arg;
@@ -2545,7 +2548,7 @@ static int
 recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
 {
 	dmu_replay_record_t *drr;
-	void *buf = malloc(1<<20);
+	void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE);
 	char errbuf[1024];
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,

Modified: stable/10/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -502,6 +502,10 @@ lzc_get_holds(const char *snapname, nvli
  *
  * "fd" is the file descriptor to write the send stream to.
  *
+ * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
+ * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
+ * records with drr_blksz > 128K.
+ *
  * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
  * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
  * which the receiving system must support (as indicated by support
@@ -518,6 +522,8 @@ lzc_send(const char *snapname, const cha
 	fnvlist_add_int32(args, "fd", fd);
 	if (from != NULL)
 		fnvlist_add_string(args, "fromsnap", from);
+	if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
+		fnvlist_add_boolean(args, "largeblockok");
 	if (flags & LZC_SEND_FLAG_EMBED_DATA)
 		fnvlist_add_boolean(args, "embedok");
 	err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);

Modified: stable/10/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
==============================================================================
--- stable/10/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h	Mon Dec 22 20:58:51 2014	(r276081)
@@ -54,7 +54,8 @@ int lzc_release(nvlist_t *, nvlist_t **)
 int lzc_get_holds(const char *, nvlist_t **);
 
 enum lzc_send_flags {
-	LZC_SEND_FLAG_EMBED_DATA = 1 << 0
+	LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
+	LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
 };
 
 int lzc_send(const char *, const char *, int, enum lzc_send_flags);

Modified: stable/10/sys/boot/zfs/zfsimpl.c
==============================================================================
--- stable/10/sys/boot/zfs/zfsimpl.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/boot/zfs/zfsimpl.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -57,6 +57,7 @@ static const char *features_for_read[] =
 	"com.delphix:hole_birth",
 	"com.delphix:extensible_dataset",
 	"com.delphix:embedded_data",
+	"org.open-zfs:large_blocks",
 	NULL
 };
 
@@ -1222,6 +1223,11 @@ dnode_read(const spa_t *spa, const dnode
 	int nlevels = dnode->dn_nlevels;
 	int i, rc;
 
+	if (bsize > SPA_MAXBLOCKSIZE) {
+		printf("ZFS: I/O error - blocks larger than 128K are not supported\n");
+		return (EIO);
+	}
+
 	/*
 	 * Note: bsize may not be a power of two here so we need to do an
 	 * actual divide rather than a bitshift.

Modified: stable/10/sys/cddl/boot/zfs/zfsimpl.h
==============================================================================
--- stable/10/sys/cddl/boot/zfs/zfsimpl.h	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/boot/zfs/zfsimpl.h	Mon Dec 22 20:58:51 2014	(r276081)
@@ -113,17 +113,14 @@
 #define	BSWAP_64(x)	((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
 
 /*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * Note: the boot loader can't actually read blocks larger than 128KB,
+ * due to lack of memory.  Therefore its SPA_MAXBLOCKSIZE is still 128KB.
  */
 #define	SPA_MINBLOCKSHIFT	9
 #define	SPA_MAXBLOCKSHIFT	17
 #define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
 #define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)
 
-#define	SPA_BLOCKSIZES		(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
 /*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
  * The ASIZE encoding should be at least 64 times larger (6 more bits)

Modified: stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -56,7 +56,8 @@ valid_char(char c, boolean_t after_colon
 {
 	return ((c >= 'a' && c <= 'z') ||
 	    (c >= '0' && c <= '9') ||
-	    c == (after_colon ? '_' : '.'));
+	    (after_colon && c == '_') ||
+	    (!after_colon && (c == '.' || c == '-')));
 }
 
 /*
@@ -220,4 +221,13 @@ zpool_feature_init(void)
 	    "com.delphix:embedded_data", "embedded_data",
 	    "Blocks which compress very well use even less space.",
 	    B_FALSE, B_TRUE, B_TRUE, NULL);
+
+	static const spa_feature_t large_blocks_deps[] = {
+		SPA_FEATURE_EXTENSIBLE_DATASET,
+		SPA_FEATURE_NONE
+	};
+	zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+	    "org.open-zfs:large_blocks", "large_blocks",
+	    "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE,
+	    large_blocks_deps);
 }

Modified: stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h	Mon Dec 22 20:58:51 2014	(r276081)
@@ -50,6 +50,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_EMBEDDED_DATA,
 	SPA_FEATURE_BOOKMARKS,
 	SPA_FEATURE_FS_SS_LIMIT,
+	SPA_FEATURE_LARGE_BLOCKS,
 	SPA_FEATURES
 } spa_feature_t;
 

Modified: stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -409,8 +409,8 @@ zfs_prop_init(void)
 
 	/* inherit number properties */
 	zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
-	    SPA_MAXBLOCKSIZE, PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
+	    SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+	    ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
 
 	/* hidden properties */
 	zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,

Modified: stable/10/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -127,6 +127,8 @@ zpool_prop_init(void)
 	/* hidden properties */
 	zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+	zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
 }
 
 /*

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -43,7 +43,7 @@ bpobj_alloc_empty(objset_t *os, int bloc
 		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 			ASSERT0(dp->dp_empty_bpobj);
 			dp->dp_empty_bpobj =
-			    bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
 			VERIFY(zap_add(os,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
@@ -396,7 +396,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint6
 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 	if (bpo->bpo_phys->bpo_subobjs == 0) {
 		bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
-		    DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+		    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+		    DMU_OT_NONE, 0, tx);
 	}
 
 	dmu_object_info_t doi;

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -65,7 +65,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx)
 	bptree_phys_t *bt;
 
 	obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
-	    SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
 	    sizeof (bptree_phys_t), tx);
 
 	/*

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -2022,10 +2022,8 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake,
 		return (SET_ERROR(ENOTSUP));
 	if (blksz == 0)
 		blksz = SPA_MINBLOCKSIZE;
-	if (blksz > SPA_MAXBLOCKSIZE)
-		blksz = SPA_MAXBLOCKSIZE;
-	else
-		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -255,6 +255,14 @@ logbias_changed_cb(void *arg, uint64_t n
 		zil_set_logbias(os->os_zil, newval);
 }
 
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+	objset_t *os = arg;
+
+	os->os_recordsize = newval;
+}
+
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
@@ -384,6 +392,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat
 				    ZFS_PROP_REDUNDANT_METADATA),
 				    redundant_metadata_changed_cb, os);
 			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+				    recordsize_changed_cb, os);
+			}
 		}
 		if (err != 0) {
 			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -642,6 +655,9 @@ dmu_objset_evict(objset_t *os)
 			VERIFY0(dsl_prop_unregister(ds,
 			    zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
 			    redundant_metadata_changed_cb, os));
+			VERIFY0(dsl_prop_unregister(ds,
+			    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+			    recordsize_changed_cb, os));
 		}
 		VERIFY0(dsl_prop_unregister(ds,
 		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -227,11 +227,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_objec
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
 	drrw->drr_toguid = dsp->dsa_toguid;
-	if (BP_IS_EMBEDDED(bp)) {
+	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
 		/*
-		 * There's no pre-computed checksum of embedded BP's, so
-		 * (like fletcher4-checkummed blocks) userland will have
-		 * to compute a dedup-capable checksum itself.
+		 * There's no pre-computed checksum for partial-block
+		 * writes or embedded BP's, so (like
+		 * fletcher4-checkummed blocks) userland will have to
+		 * compute a dedup-capable checksum itself.
 		 */
 		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
 	} else {
@@ -393,6 +394,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t 
 	drro->drr_compress = dnp->dn_compress;
 	drro->drr_toguid = dsp->dsa_toguid;
 
+	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
 	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 		return (SET_ERROR(EINTR));
 
@@ -512,6 +517,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, co
 		uint32_t aflags = ARC_WAIT;
 		arc_buf_t *abuf;
 		int blksz = BP_GET_LSIZE(bp);
+		uint64_t offset;
 
 		ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 		ASSERT0(zb->zb_level);
@@ -532,8 +538,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, co
 			}
 		}
 
-		err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
-		    blksz, bp, abuf->b_data);
+		offset = zb->zb_blkid * blksz;
+
+		if (!(dsp->dsa_featureflags &
+		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+		    blksz > SPA_OLD_MAXBLOCKSIZE) {
+			char *buf = abuf->b_data;
+			while (blksz > 0 && err == 0) {
+				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+				err = dump_write(dsp, type, zb->zb_object,
+				    offset, n, NULL, buf);
+				offset += n;
+				buf += n;
+				blksz -= n;
+			}
+		} else {
+			err = dump_write(dsp, type, zb->zb_object,
+			    offset, blksz, bp, abuf->b_data);
+		}
 		(void) arc_buf_remove_ref(abuf, &abuf);
 	}
 
@@ -548,9 +570,9 @@ static int
 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
     zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
 #ifdef illumos
-    int outfd, vnode_t *vp, offset_t *off)
+    boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
 #else
-    int outfd, struct file *fp, offset_t *off)
+    boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off)
 #endif
 {
 	objset_t *os;
@@ -586,6 +608,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp,
 	}
 #endif
 
+	if (large_block_ok && ds->ds_large_blocks)
+		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
 	if (embedok &&
 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
 		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -682,10 +706,11 @@ out:
 
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
-    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
+    int outfd, vnode_t *vp, offset_t *off)
 #else
-    boolean_t embedok, int outfd, struct file *fp, offset_t *off)
+    int outfd, struct file *fp, offset_t *off)
 #endif
 {
 	dsl_pool_t *dp;
@@ -720,18 +745,19 @@ dmu_send_obj(const char *pool, uint64_t 
 		zb.zbm_guid = fromds->ds_phys->ds_guid;
 		is_clone = (fromds->ds_dir != ds->ds_dir);
 		dsl_dataset_rele(fromds, FTAG);
-		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-		    outfd, fp, off);
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, outfd, fp, off);
 	} else {
-		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-		    outfd, fp, off);
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok, outfd, fp, off);
 	}
 	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+dmu_send(const char *tosnap, const char *fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
     int outfd, vnode_t *vp, offset_t *off)
 #else
@@ -802,11 +828,11 @@ dmu_send(const char *tosnap, const char 
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
-		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-		    outfd, fp, off);
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, outfd, fp, off);
 	} else {
-		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-		    outfd, fp, off);
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok, outfd, fp, off);
 	}
 	if (owned)
 		dsl_dataset_disown(ds, FTAG);
@@ -1006,6 +1032,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
 		return (SET_ERROR(ENOTSUP));
 
+	/*
+	 * The receiving code doesn't know how to translate large blocks
+	 * to smaller ones, so the pool must have the LARGE_BLOCKS
+	 * feature enabled if the stream has LARGE_BLOCKS.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+		return (SET_ERROR(ENOTSUP));
+
 	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
 	if (error == 0) {
 		/* target fs already exists; recv into temp clone */
@@ -1131,6 +1166,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t 
 	}
 	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
 
+	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+	    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    !newds->ds_large_blocks) {
+		dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+		newds->ds_large_blocks = B_TRUE;
+	}
+
 	dmu_buf_will_dirty(newds->ds_dbuf, tx);
 	newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 
@@ -1283,6 +1325,7 @@ restore_read(struct restorearg *ra, int 
 
 	/* some things will require 8-byte alignment, so everything must */
 	ASSERT0(len % 8);
+	ASSERT3U(len, <=, ra->bufsize);
 
 	while (done < len) {
 		ssize_t resid;
@@ -1420,7 +1463,7 @@ restore_object(struct restorearg *ra, ob
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
-	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
 		return (SET_ERROR(EINVAL));
 	}
@@ -1693,7 +1736,7 @@ restore_spill(struct restorearg *ra, obj
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
-	    drrs->drr_length > SPA_MAXBLOCKSIZE)
+	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
 		return (SET_ERROR(EINVAL));
 
 	data = restore_read(ra, drrs->drr_length, NULL);
@@ -1781,7 +1824,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, 
 	ra.td = curthread;
 	ra.fp = fp;
 	ra.voff = *voffp;
-	ra.bufsize = 1<<20;
+	ra.bufsize = SPA_MAXBLOCKSIZE;
 	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
 
 	/* these were verified in dmu_recv_begin */

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -224,7 +224,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 		return;
 
 	min_bs = SPA_MINBLOCKSHIFT;
-	max_bs = SPA_MAXBLOCKSHIFT;
+	max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
 	min_ibs = DN_MIN_INDBLKSHIFT;
 	max_ibs = DN_MAX_INDBLKSHIFT;
 
@@ -293,6 +293,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 			 */
 			ASSERT(dn->dn_datablkshift != 0);
 			min_bs = max_bs = dn->dn_datablkshift;
+		} else {
+			/*
+			 * The blocksize can increase up to the recordsize,
+			 * or if it is already more than the recordsize,
+			 * up to the next power of 2.
+			 */
+			min_bs = highbit64(dn->dn_datablksz - 1);
+			max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
 		}
 
 		/*
@@ -751,11 +759,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o
 		bp = &dn->dn_phys->dn_blkptr[0];
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 		    bp, bp->blk_birth))
-			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+			txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
 		else
-			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+			txh->txh_space_towrite += MZAP_MAX_BLKSZ;
 		if (!BP_IS_HOLE(bp))
-			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+			txh->txh_space_tounref += MZAP_MAX_BLKSZ;
 		return;
 	}
 
@@ -1549,18 +1557,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t
 
 	/* If blkptr doesn't exist then add space to towrite */
 	if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
-		txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+		txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
 	} else {
 		blkptr_t *bp;
 
 		bp = &dn->dn_phys->dn_spill;
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 		    bp, bp->blk_birth))
-			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+			txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
 		else
-			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+			txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
 		if (!BP_IS_HOLE(bp))
-			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+			txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
 	}
 }
 

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	Mon Dec 22 20:53:45 2014	(r276080)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	Mon Dec 22 20:58:51 2014	(r276081)
@@ -513,10 +513,10 @@ dnode_allocate(dnode_t *dn, dmu_object_t
 {
 	int i;
 
+	ASSERT3U(blocksize, <=,
+	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	if (blocksize == 0)
 		blocksize = 1 << zfs_default_bs;
-	else if (blocksize > SPA_MAXBLOCKSIZE)
-		blocksize = SPA_MAXBLOCKSIZE;
 	else
 		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
@@ -597,7 +597,8 @@ dnode_reallocate(dnode_t *dn, dmu_object
 	int nblkptr;
 
 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201412222058.sBMKwqfr055298>