Skip site navigation (1) | Skip section navigation (2)
Date:      Tue, 26 Jan 2016 13:09:16 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r294814 - in vendor-sys/illumos/dist/uts/common/fs/zfs: . sys
Message-ID:  <201601261309.u0QD9GYB021701@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Tue Jan 26 13:09:16 2016
New Revision: 294814
URL: https://svnweb.freebsd.org/changeset/base/294814

Log:
  6393 zfs receive a full send as a clone
  
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Reviewed by: Prakash Surya <prakash.surya@delphix.com>
  Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
  Approved by: Dan McDonald <danmcd@omniti.com>
  Author: Paul Dagnelie <pcd@delphix.com>
  
  illumos/illumos-gate@68ecb2ec930c4b0f00acaf8e0abb2b19c4b8b76f

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c	Tue Jan 26 13:03:01 2016	(r294813)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c	Tue Jan 26 13:09:16 2016	(r294814)
@@ -137,6 +137,14 @@ dump_record(dmu_sendarg_t *dsp, void *pa
 	return (0);
 }
 
+/*
+ * Fill in the drr_free struct, or perform aggregation if the previous record is
+ * also a free record, and the two are adjacent.
+ *
+ * Note that we send free records even for a full send, because we want to be
+ * able to receive a full send as a clone, which requires a list of all the free
+ * and freeobject records that were generated on the source.
+ */
 static int
 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
     uint64_t length)
@@ -160,15 +168,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t o
 	    (object == dsp->dsa_last_data_object &&
 	    offset > dsp->dsa_last_data_offset));
 
-	/*
-	 * If we are doing a non-incremental send, then there can't
-	 * be any data in the dataset we're receiving into.  Therefore
-	 * a free record would simply be a no-op.  Save space by not
-	 * sending it to begin with.
-	 */
-	if (!dsp->dsa_incremental)
-		return (0);
-
 	if (length != -1ULL && offset + length < offset)
 		length = -1ULL;
 
@@ -347,10 +346,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uin
 {
 	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
 
-	/* See comment in dump_free(). */
-	if (!dsp->dsa_incremental)
-		return (0);
-
 	/*
 	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
 	 * push it out, since free block aggregation can only be done for
@@ -750,6 +745,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp,
 	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+	drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
 
 	if (ancestor_zb != NULL) {
 		drr->drr_u.drr_begin.drr_fromguid =
@@ -772,7 +768,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp,
 	dsp->dsa_off = off;
 	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	dsp->dsa_pending_op = PENDING_NONE;
-	dsp->dsa_incremental = (ancestor_zb != NULL);
 	dsp->dsa_featureflags = featureflags;
 	dsp->dsa_resume_object = resumeobj;
 	dsp->dsa_resume_offset = resumeoff;
@@ -1286,7 +1281,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t
 		/* target fs already exists; recv into temp clone */
 
 		/* Can't recv a clone into an existing fs */
-		if (flags & DRR_FLAG_CLONE) {
+		if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
@@ -1305,6 +1300,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t
 		    drba->drba_origin))
 			return (SET_ERROR(ENOENT));
 
+		/*
+		 * If we're receiving a full send as a clone, and it doesn't
+		 * contain all the necessary free records and freeobject
+		 * records, reject it.
+		 */
+		if (fromguid == 0 && drba->drba_origin &&
+		    !(flags & DRR_FLAG_FREERECORDS))
+			return (SET_ERROR(EINVAL));
+
 		/* Open the parent of tofs */
 		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
 		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
@@ -1344,7 +1348,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
-			if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
+			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
+			    fromguid != 0) {
 				dsl_dataset_rele(origin, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(ENODEV));
@@ -1674,6 +1679,20 @@ struct receive_writer_arg {
 	uint64_t bytes_read; /* bytes read when current record created */
 };
 
+struct objlist {
+	list_t list; /* List of struct receive_objnode. */
+	/*
+	 * Last object looked up. Used to assert that objects are being looked
+	 * up in ascending order.
+	 */
+	uint64_t last_lookup;
+};
+
+struct receive_objnode {
+	list_node_t node;
+	uint64_t object;
+};
+
 struct receive_arg  {
 	objset_t *os;
 	vnode_t *vp; /* The vnode to read the stream from */
@@ -1691,12 +1710,7 @@ struct receive_arg  {
 	int err;
 	boolean_t byteswap;
 	/* Sorted list of objects not to issue prefetches for. */
-	list_t ignore_obj_list;
-};
-
-struct receive_ign_obj_node {
-	list_node_t node;
-	uint64_t object;
+	struct objlist ignore_objlist;
 };
 
 typedef struct guid_map_entry {
@@ -2008,13 +2022,14 @@ receive_freeobjects(struct receive_write
     struct drr_freeobjects *drrfo)
 {
 	uint64_t obj;
+	int next_err = 0;
 
 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
 		return (SET_ERROR(EINVAL));
 
 	for (obj = drrfo->drr_firstobj;
-	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
-	    (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
+	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
+	    next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
 		int err;
 
 		if (dmu_object_info(rwa->os, obj, NULL) != 0)
@@ -2024,7 +2039,8 @@ receive_freeobjects(struct receive_write
 		if (err != 0)
 			return (err);
 	}
-
+	if (next_err != ESRCH)
+		return (next_err);
 	return (0);
 }
 
@@ -2354,6 +2370,66 @@ receive_read_payload_and_next_header(str
 	return (0);
 }
 
+static void
+objlist_create(struct objlist *list)
+{
+	list_create(&list->list, sizeof (struct receive_objnode),
+	    offsetof(struct receive_objnode, node));
+	list->last_lookup = 0;
+}
+
+static void
+objlist_destroy(struct objlist *list)
+{
+	for (struct receive_objnode *n = list_remove_head(&list->list);
+	    n != NULL; n = list_remove_head(&list->list)) {
+		kmem_free(n, sizeof (*n));
+	}
+	list_destroy(&list->list);
+}
+
+/*
+ * This function looks through the objlist to see if the specified object number
+ * is contained in the objlist.  In the process, it will remove all object
+ * numbers in the list that are smaller than the specified object number.  Thus,
+ * any lookup of an object number smaller than a previously looked up object
+ * number will always return false; therefore, all lookups should be done in
+ * ascending order.
+ */
+static boolean_t
+objlist_exists(struct objlist *list, uint64_t object)
+{
+	struct receive_objnode *node = list_head(&list->list);
+	ASSERT3U(object, >=, list->last_lookup);
+	list->last_lookup = object;
+	while (node != NULL && node->object < object) {
+		VERIFY3P(node, ==, list_remove_head(&list->list));
+		kmem_free(node, sizeof (*node));
+		node = list_head(&list->list);
+	}
+	return (node != NULL && node->object == object);
+}
+
+/*
+ * The objlist is a list of object numbers stored in ascending order.  However,
+ * the insertion of new object numbers does not seek out the correct location to
+ * store a new object number; instead, it appends it to the list for simplicity.
+ * Thus, any users must take care to only insert new object numbers in ascending
+ * order.
+ */
+static void
+objlist_insert(struct objlist *list, uint64_t object)
+{
+	struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+	node->object = object;
+#ifdef ZFS_DEBUG
+	struct receive_objnode *last_object = list_tail(&list->list);
+	uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
+	ASSERT3U(node->object, >, last_objnum);
+#endif
+	list_insert_tail(&list->list, node);
+}
+
 /*
  * Issue the prefetch reads for any necessary indirect blocks.
  *
@@ -2376,13 +2452,7 @@ static void
 receive_read_prefetch(struct receive_arg *ra,
     uint64_t object, uint64_t offset, uint64_t length)
 {
-	struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
-	while (node != NULL && node->object < object) {
-		VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
-		kmem_free(node, sizeof (*node));
-		node = list_head(&ra->ignore_obj_list);
-	}
-	if (node == NULL || node->object > object) {
+	if (!objlist_exists(&ra->ignore_objlist, object)) {
 		dmu_prefetch(ra->os, object, 1, offset, length,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
@@ -2415,18 +2485,7 @@ receive_read_record(struct receive_arg *
 		 */
 		if (err == ENOENT ||
 		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
-			struct receive_ign_obj_node *node =
-			    kmem_zalloc(sizeof (*node),
-			    KM_SLEEP);
-			node->object = drro->drr_object;
-#ifdef ZFS_DEBUG
-			struct receive_ign_obj_node *last_object =
-			    list_tail(&ra->ignore_obj_list);
-			uint64_t last_objnum = (last_object != NULL ?
-			    last_object->object : 0);
-			ASSERT3U(node->object, >, last_objnum);
-#endif
-			list_insert_tail(&ra->ignore_obj_list, node);
+			objlist_insert(&ra->ignore_objlist, drro->drr_object);
 			err = 0;
 		}
 		return (err);
@@ -2643,7 +2702,6 @@ resume_check(struct receive_arg *ra, nvl
 	return (0);
 }
 
-
 /*
  * Read in the stream's records, one by one, and apply them to the pool.  There
  * are two threads involved; the thread that calls this function will spin up a
@@ -2677,8 +2735,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, 
 		    sizeof (ra.bytes_read), 1, &ra.bytes_read);
 	}
 
-	list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
-	    offsetof(struct receive_ign_obj_node, node));
+	objlist_create(&ra.ignore_objlist);
 
 	/* these were verified in dmu_recv_begin */
 	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@@ -2832,12 +2889,7 @@ out:
 	}
 
 	*voffp = ra.voff;
-	for (struct receive_ign_obj_node *n =
-	    list_remove_head(&ra.ignore_obj_list); n != NULL;
-	    n = list_remove_head(&ra.ignore_obj_list)) {
-		kmem_free(n, sizeof (*n));
-	}
-	list_destroy(&ra.ignore_obj_list);
+	objlist_destroy(&ra.ignore_objlist);
 	return (err);
 }
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_impl.h	Tue Jan 26 13:03:01 2016	(r294813)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_impl.h	Tue Jan 26 13:09:16 2016	(r294814)
@@ -24,7 +24,7 @@
  */
 /*
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DMU_IMPL_H
@@ -293,7 +293,6 @@ typedef struct dmu_sendarg {
 	uint64_t dsa_toguid;
 	int dsa_err;
 	dmu_pendop_t dsa_pending_op;
-	boolean_t dsa_incremental;
 	uint64_t dsa_featureflags;
 	uint64_t dsa_last_data_object;
 	uint64_t dsa_last_data_offset;

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h	Tue Jan 26 13:03:01 2016	(r294813)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h	Tue Jan 26 13:09:16 2016	(r294814)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_ZFS_IOCTL_H
@@ -126,6 +126,16 @@ typedef enum dmu_send_resume_token_versi
 
 #define	DRR_FLAG_CLONE		(1<<0)
 #define	DRR_FLAG_CI_DATA	(1<<1)
+/*
+ * This send stream, if it is a full send, includes the FREE and FREEOBJECT
+ * records that are created by the sending process.  This means that the send
+ * stream can be received as a clone, even though it is not an incremental.
+ * This is not implemented as a feature flag, because the receiving side does
+ * not need to have implemented it to receive this stream; it is fully backwards
+ * compatible.  We need a flag, though, because full send streams without it
+ * cannot necessarily be received as a clone correctly.
+ */
+#define	DRR_FLAG_FREERECORDS	(1<<2)
 
 /*
  * flags in the drr_checksumflags field in the DRR_WRITE and



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201601261309.u0QD9GYB021701>