Date:      Thu, 22 Feb 2018 03:52:04 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r329803 - vendor-sys/illumos/dist/uts/common/fs/zfs
Message-ID:  <201802220352.w1M3q4kN011959@repo.freebsd.org>

Author: mav
Date: Thu Feb 22 03:52:03 2018
New Revision: 329803
URL: https://svnweb.freebsd.org/changeset/base/329803

Log:
  9080 recursive enter of vdev_indirect_rwlock from vdev_indirect_remap()
  
  illumos/illumos-gate@bdfded42e66b9fc1395ff2401aa2952f7c44ae34
  
  A scenario came up where a callback executed by vdev_indirect_remap() on a
  vdev calls vdev_indirect_remap() on the same vdev and tries to reacquire the
  vdev_indirect_rwlock that was already acquired by the first call to
  vdev_indirect_remap(). The specific scenario is that we want to remap a
  block pointer that is snapshotted but whose dataset's remap_deadlist is not
  cached. So in order to add it, we issue a read through vdev_indirect_remap()
  on the same vdev, which brings up the aforementioned re-entry (sketched
  below the log message).
  
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Reviewed by: George Wilson <george.wilson@delphix.com>
  Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
  Author: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
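
A minimal C sketch of the problematic call shape (illustration only; this
snippet is not part of the commit, though the identifiers mirror those in
vdev_indirect.c):

    /*
     * Hypothetical reproduction of the re-entry. rw_enter() on an
     * illumos kernel rwlock is not recursive, so the inner reader
     * acquisition can deadlock (e.g. if a writer queues between the
     * two entries) or trip a DEBUG assertion.
     */
    static void
    remap_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
        uint64_t asize, void *arg)
    {
        /* A read issued for an uncached remap_deadlist lands here... */
        vdev_indirect_remap(vd, offset, asize, remap_cb, arg);
        /*
         * ...re-entering vd->vdev_indirect_rwlock, which, before this
         * fix, the outer call still held as reader across the callback.
         */
    }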

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c	Thu Feb 22 03:49:06 2018	(r329802)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c	Thu Feb 22 03:52:03 2018	(r329803)
@@ -14,7 +14,7 @@
  */
 
 /*
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -852,6 +852,57 @@ rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, 
 }
 
 /*
+ * Given an indirect vdev and an extent on that vdev, this function duplicates
+ * the physical entries of the indirect mapping that correspond to the extent
+ * to a new array and returns a pointer to it. In addition, copied_entries
+ * is populated with the number of mapping entries that were duplicated.
+ *
+ * Note that the function assumes that the caller holds vdev_indirect_rwlock.
+ * This ensures that the mapping won't change due to condensing as we
+ * copy over its contents.
+ *
+ * Finally, since we are doing an allocation, it is up to the caller to
+ * free the array allocated in this function.
+ */
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
+    uint64_t asize, uint64_t *copied_entries)
+{
+	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+	uint64_t entries = 0;
+
+	ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
+
+	vdev_indirect_mapping_entry_phys_t *first_mapping =
+	    vdev_indirect_mapping_entry_for_offset(vim, offset);
+	ASSERT3P(first_mapping, !=, NULL);
+
+	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
+	while (asize > 0) {
+		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+
+		ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
+		ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
+
+		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
+		uint64_t inner_size = MIN(asize, size - inner_offset);
+
+		offset += inner_size;
+		asize -= inner_size;
+		entries++;
+		m++;
+	}
+
+	size_t copy_length = entries * sizeof (*first_mapping);
+	duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
+	bcopy(first_mapping, duplicate_mappings, copy_length);
+	*copied_entries = entries;
+
+	return (duplicate_mappings);
+}
+
+/*
  * Goes through the relevant indirect mappings until it hits a concrete vdev
  * and issues the callback. On the way to the concrete vdev, if any other
  * indirect vdevs are encountered, then the callback will also be called on
@@ -891,24 +942,42 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint6
 	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
 	    rs != NULL; rs = list_remove_head(&stack)) {
 		vdev_t *v = rs->rs_vd;
+		uint64_t num_entries = 0;
 
+		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+		ASSERT(rs->rs_asize > 0);
+
 		/*
-		 * Note: this can be called from open context
-		 * (eg. zio_read()), so we need the rwlock to prevent
-		 * the mapping from being changed by condensing.
+		 * Note: As this function can be called from open context
+		 * (e.g. zio_read()), we need the following rwlock to
+		 * prevent the mapping from being changed by condensing.
+		 *
+		 * So we grab the lock and we make a copy of the entries
+		 * that are relevant to the extent that we are working on.
+		 * Once that is done, we drop the lock and iterate over
+		 * our copy of the mapping. Once we are done with
+		 * the remap segment and we free it, we also free our copy
+		 * of the indirect mapping entries that are relevant to it.
+		 *
+		 * This way we don't need to wait until the function is
+		 * finished with a segment, to condense it. In addition, we
+		 * don't need a recursive rwlock for the case that a call to
+		 * vdev_indirect_remap() needs to call itself (through the
+		 * codepath of its callback) for the same vdev in the middle
+		 * of its execution.
 		 */
 		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
 		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
 		ASSERT3P(vim, !=, NULL);
 
-		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
-		ASSERT(rs->rs_asize > 0);
-
 		vdev_indirect_mapping_entry_phys_t *mapping =
-		    vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset);
+		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
+		    rs->rs_offset, rs->rs_asize, &num_entries);
 		ASSERT3P(mapping, !=, NULL);
+		ASSERT3U(num_entries, >, 0);
+		rw_exit(&v->vdev_indirect_rwlock);
 
-		while (rs->rs_asize > 0) {
+		for (uint64_t i = 0; i < num_entries; i++) {
 			/*
 			 * Note: the vdev_indirect_mapping can not change
 			 * while we are running.  It only changes while the
@@ -917,20 +986,23 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint6
 			 * function is only called for frees, which also only
 			 * happen from syncing context.
 			 */
+			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
 
-			uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
-			uint64_t dst_offset =
-			    DVA_GET_OFFSET(&mapping->vimep_dst);
-			uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst);
+			ASSERT3P(m, !=, NULL);
+			ASSERT3U(rs->rs_asize, >, 0);
 
+			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
+			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
+
 			ASSERT3U(rs->rs_offset, >=,
-			    DVA_MAPPING_GET_SRC_OFFSET(mapping));
+			    DVA_MAPPING_GET_SRC_OFFSET(m));
 			ASSERT3U(rs->rs_offset, <,
-			    DVA_MAPPING_GET_SRC_OFFSET(mapping) + size);
+			    DVA_MAPPING_GET_SRC_OFFSET(m) + size);
 			ASSERT3U(dst_vdev, !=, v->vdev_id);
 
 			uint64_t inner_offset = rs->rs_offset -
-			    DVA_MAPPING_GET_SRC_OFFSET(mapping);
+			    DVA_MAPPING_GET_SRC_OFFSET(m);
 			uint64_t inner_size =
 			    MIN(rs->rs_asize, size - inner_offset);
 
@@ -971,10 +1043,10 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint6
 			rs->rs_offset += inner_size;
 			rs->rs_asize -= inner_size;
 			rs->rs_split_offset += inner_size;
-			mapping++;
 		}
+		VERIFY0(rs->rs_asize);
 
-		rw_exit(&v->vdev_indirect_rwlock);
+		kmem_free(mapping, num_entries * sizeof (*mapping));
 		kmem_free(rs, sizeof (remap_segment_t));
 	}
 	list_destroy(&stack);
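
The fix reduces to a snapshot-under-lock idiom: copy the relevant mapping
entries while holding the reader lock, drop the lock, then iterate over the
private copy. A self-contained userland analogy using POSIX rwlocks (all
names hypothetical; a sketch of the idiom, not the kernel code):

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    /* Stand-in for the indirect mapping's entry array. */
    typedef struct { unsigned long src, dst, size; } entry_t;

    static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
    static entry_t map[1024];          /* protected by map_lock */

    static void
    process_range(size_t first, size_t n,
        void (*cb)(const entry_t *, void *), void *arg)
    {
        /* Snapshot the relevant entries under the reader lock. */
        pthread_rwlock_rdlock(&map_lock);
        entry_t *copy = malloc(n * sizeof (entry_t));
        memcpy(copy, &map[first], n * sizeof (entry_t));
        pthread_rwlock_unlock(&map_lock);

        /*
         * No lock is held here: the callback may recurse into
         * process_range(), and a writer (the condenser, in the ZFS
         * case) is free to modify the live table meanwhile.
         */
        for (size_t i = 0; i < n; i++)
            cb(&copy[i], arg);

        free(copy);
    }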


