Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 12 Jul 2016 11:18:25 +0000 (UTC)
From:      Andriy Gapon <avg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r302641 - in vendor-sys/illumos/dist/uts/common/fs/zfs: . sys
Message-ID:  <201607121118.u6CBIPSR089241@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: avg
Date: Tue Jul 12 11:18:25 2016
New Revision: 302641
URL: https://svnweb.freebsd.org/changeset/base/302641

Log:
  6844 dnode_next_offset can detect fictional holes
  
  illumos/illumos-gate@11ceac77ea8034bf2fe9bdd6d314f5d1e5ceeba3
  https://github.com/illumos/illumos-gate/commit/11ceac77ea8034bf2fe9bdd6d314f5d1e5ceeba3
  
  https://www.illumos.org/issues/6844
    dnode_next_offset is used in a variety of places to iterate over the holes or
    allocated blocks in a dnode. It operates under the premise that it can iterate
    over the blockpointers of a dnode in open context while holding only the
    dn_struct_rwlock as reader. Unfortunately, this premise does not hold.
    When we create the zio for a dbuf, we pass in the actual block pointer in the
    indirect block above that dbuf. When we later zero the bp in
    zio_write_compress, we are directly modifying the bp. The state of the bp is
    now inconsistent from the perspective of dnode_next_offset: the bp will appear
    to be a hole until zio_dva_allocate finally finishes filling it in. In the
    meantime, dnode_next_offset can detect a hole in the dnode when none exists.
    I was able to experimentally demonstrate this behavior with the following
    setup:
    1. Create a file with 1 million dbufs.
    2. Create a thread that randomly dirties L2 blocks by writing to the first L0
    block under them.
    3. Observe dnode_next_offset, waiting for it to skip over a hole in the middle
    of a file.
    4. Do dnode_next_offset in a loop until we skip over such a non-existent hole.
    The fix is to ensure that it is valid to iterate over the indirect blocks in a
    dnode while holding the dn_struct_rwlock by passing the zio a copy of the BP
    and updating the actual BP in dbuf_write_ready while holding the lock.
  
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Reviewed by: George Wilson <george.wilson@delphix.com>
  Reviewed by: Boris Protopopov <bprotopopov@hotmail.com>
  Approved by: Dan McDonald <danmcd@omniti.com>
  Author: Alex Reece <alex@delphix.com>

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Tue Jul 12 11:16:43 2016	(r302640)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Tue Jul 12 11:18:25 2016	(r302641)
@@ -2879,7 +2879,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *
 	uint64_t fill = 0;
 	int i;
 
-	ASSERT3P(db->db_blkptr, ==, bp);
+	ASSERT3P(db->db_blkptr, !=, NULL);
+	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
@@ -2901,7 +2902,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
-		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+		ASSERT(!(BP_IS_HOLE(bp)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 	}
 #endif
@@ -2942,6 +2943,10 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *
 		bp->blk_fill = fill;
 
 	mutex_exit(&db->db_mtx);
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	*db->db_blkptr = *bp;
+	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
@@ -3120,6 +3125,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 	zio_t *zio;
 	int wp_flag = 0;
 
+	ASSERT(dmu_tx_is_syncing(tx));
+
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	os = dn->dn_objset;
@@ -3178,6 +3185,14 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 	DB_DNODE_EXIT(db);
 
+	/*
+	 * We copy the blkptr now (rather than when we instantiate the dirty
+	 * record), because its value can change between open context and
+	 * syncing context. We do not need to hold dn_struct_rwlock to read
+	 * db_blkptr because we are in syncing context.
+	 */
+	dr->dr_bp_copy = *db->db_blkptr;
+
 	if (db->db_level == 0 &&
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
@@ -3187,7 +3202,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 		void *contents = (data != NULL) ? data->b_data : NULL;
 
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
-		    db->db_blkptr, contents, db->db.db_size, &zp,
+		    &dr->dr_bp_copy, contents, db->db.db_size, &zp,
 		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		mutex_enter(&db->db_mtx);
@@ -3199,14 +3214,14 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
-		    db->db_blkptr, NULL, db->db.db_size, &zp,
+		    &dr->dr_bp_copy, NULL, db->db.db_size, &zp,
 		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		ASSERT(arc_released(data));
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
-		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
+		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
 		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
 		    dbuf_write_physdone, dbuf_write_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h	Tue Jul 12 11:16:43 2016	(r302640)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h	Tue Jul 12 11:18:25 2016	(r302641)
@@ -121,6 +121,9 @@ typedef struct dbuf_dirty_record {
 	/* How much space was changed to dsl_pool_dirty_space() for this? */
 	unsigned int dr_accounted;
 
+	/* A copy of the bp that points to us */
+	blkptr_t dr_bp_copy;
+
 	union dirty_types {
 		struct dirty_indirect {
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201607121118.u6CBIPSR089241>