Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 22 Sep 2009 03:10:04 +0000 (UTC)
From:      Kip Macy <kmacy@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r197398 - in user/kmacy/releng_8_fcs: cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Message-ID:  <200909220310.n8M3A4vX007682@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kmacy
Date: Tue Sep 22 03:10:04 2009
New Revision: 197398
URL: http://svn.freebsd.org/changeset/base/197398

Log:
   - make tx type operation dependent rather than vfs state dependent
   - simplify ZIL replay handing
   - add dmu_read_flags to allow explicit disabling of prefetch
  
  -  remove assert that doesn't apply to freebsd
  -  update ztest for new zil_replay

Modified:
  user/kmacy/releng_8_fcs/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
  user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c

Modified: user/kmacy/releng_8_fcs/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- user/kmacy/releng_8_fcs/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -1304,7 +1304,7 @@ ztest_dmu_objset_create_destroy(ztest_ar
 	if (ztest_random(2) == 0 &&
 	    dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) {
 		zr.zr_os = os;
-		zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector, NULL);
+		zil_replay(os, &zr, ztest_replay_vector);
 		dmu_objset_close(os);
 	}
 
@@ -3321,8 +3321,7 @@ ztest_run(char *pool)
 			if (test_future)
 				ztest_dmu_check_future_leak(&za[t]);
 			zr.zr_os = za[d].za_os;
-			zil_replay(zr.zr_os, &zr, &zr.zr_assign,
-			    ztest_replay_vector, NULL);
+			zil_replay(zr.zr_os, &zr, ztest_replay_vector);
 			za[d].za_zilog = zil_open(za[d].za_os, NULL);
 		}
 

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -177,22 +177,22 @@ dmu_bonus_hold(objset_t *os, uint64_t ob
  * whose dnodes are in the same block.
  */
 static int
-dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
-    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
 	dsl_pool_t *dp = NULL;
 	dmu_buf_t **dbp;
 	uint64_t blkid, nblks, i;
-	uint32_t flags;
+	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio;
 	hrtime_t start;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
-	flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
-	if (length > zfetch_array_rd_sz)
-		flags |= DB_RF_NOPREFETCH;
+	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+	if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
+		dbuf_flags |= DB_RF_NOPREFETCH;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
@@ -230,7 +230,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn,
 		/* initiate async i/o */
 		if (read) {
 			rw_exit(&dn->dn_struct_rwlock);
-			(void) dbuf_read(db, zio, flags);
+			(void) dbuf_read(db, zio, dbuf_flags);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		}
 		dbp[i] = &db->db;
@@ -282,7 +282,7 @@ dmu_buf_hold_array(objset_t *os, uint64_
 		return (err);
 
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
-	    numbufsp, dbpp);
+	    numbufsp, dbpp, DMU_READ_PREFETCH);
 
 	dnode_rele(dn, FTAG);
 
@@ -297,7 +297,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *d
 	int err;
 
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
-	    numbufsp, dbpp);
+	    numbufsp, dbpp, DMU_READ_PREFETCH);
 
 	return (err);
 }
@@ -536,8 +536,8 @@ dmu_free_range(objset_t *os, uint64_t ob
 }
 
 int
-dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
-    void *buf)
+dmu_read_flags(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    void *buf, uint32_t flags)
 {
 	dnode_t *dn;
 	dmu_buf_t **dbp;
@@ -567,7 +567,7 @@ dmu_read(objset_t *os, uint64_t object, 
 		 * to be reading in parallel.
 		 */
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
-		    TRUE, FTAG, &numbufs, &dbp);
+		    TRUE, FTAG, &numbufs, &dbp, flags);
 		if (err)
 			break;
 
@@ -593,6 +593,13 @@ dmu_read(objset_t *os, uint64_t object, 
 	return (err);
 }
 
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    void *buf)
+{
+	return dmu_read_flags(os, object, offset, size, buf, 0);
+}
+
 void
 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	Tue Sep 22 03:10:04 2009	(r197398)
@@ -447,8 +447,12 @@ int dmu_free_object(objset_t *os, uint64
  * Canfail routines will return 0 on success, or an errno if there is a
  * nonrecoverable I/O error.
  */
+#define	DMU_READ_PREFETCH	0 /* prefetch */
+#define	DMU_READ_NO_PREFETCH	1 /* don't prefetch */
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	void *buf);
+int dmu_read_flags(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, void *buf, uint32_t flags);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
@@ -456,7 +460,10 @@ int dmu_write_uio(objset_t *os, uint64_t
     dmu_tx_t *tx);
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
-
+struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
+void dmu_return_arcbuf(struct arc_buf *buf);
+void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
+    dmu_tx_t *tx);	
 extern int zfs_prefetch_disable;
 
 /*

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h	Tue Sep 22 03:10:04 2009	(r197398)
@@ -47,7 +47,6 @@ struct zfsvfs {
 	uint64_t	z_root;		/* id of root znode */
 	uint64_t	z_unlinkedobj;	/* id of unlinked zapobj */
 	uint64_t	z_max_blksz;	/* maximum block size for files */
-	uint64_t	z_assign;	/* TXG_NOWAIT or set by zil_replay() */
 	uint64_t	z_fuid_obj;	/* fuid table object number */
 	uint64_t	z_fuid_size;	/* fuid table size */
 	avl_tree_t	z_fuid_idx;	/* fuid tree keyed by index */
@@ -72,6 +71,7 @@ struct zfsvfs {
 	boolean_t	z_issnap;	/* true if this is a snapshot */
 	boolean_t	z_vscan;	/* virus scan on/off */
 	boolean_t	z_use_fuids;	/* version allows fuids */
+	boolean_t	z_replay;	/* set during ZIL replay */
 	kmutex_t	z_online_recv_lock; /* recv in prog grabs as WRITER */
 	uint64_t	z_version;	/* ZPL version */
 #define	ZFS_OBJ_MTX_SZ	64

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	Tue Sep 22 03:10:04 2009	(r197398)
@@ -335,7 +335,6 @@ typedef void zil_parse_blk_func_t(zilog_
 typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
     uint64_t txg);
 typedef int zil_replay_func_t();
-typedef void zil_replay_cleaner_t();
 typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
 
 extern uint64_t	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
@@ -350,9 +349,8 @@ extern void	zil_free(zilog_t *zilog);
 extern zilog_t	*zil_open(objset_t *os, zil_get_data_t *get_data);
 extern void	zil_close(zilog_t *zilog);
 
-extern void	zil_replay(objset_t *os, void *arg, uint64_t *txgp,
-    zil_replay_func_t *replay_func[TX_MAX_TYPE],
-    zil_replay_cleaner_t *replay_cleaner);
+extern void	zil_replay(objset_t *os, void *arg, 
+    zil_replay_func_t *replay_func[TX_MAX_TYPE]);
 extern void	zil_destroy(zilog_t *zilog, boolean_t keep_first);
 extern void	zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
 

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	Tue Sep 22 03:10:04 2009	(r197398)
@@ -74,13 +74,14 @@ struct zilog {
 	uint64_t	zl_commit_seq;	/* committed upto this number */
 	uint64_t	zl_lr_seq;	/* log record sequence number */
 	uint64_t	zl_destroy_txg;	/* txg of last zil_destroy() */
-	uint64_t	zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
+	uint64_t	zl_replayed_seq[TXG_SIZE]; /* seq of last replayed rec */
+	uint64_t	zl_replaying_seq; /* current replay seq number */
 	uint32_t	zl_suspend;	/* log suspend count */
 	kcondvar_t	zl_cv_writer;	/* log writer thread completion */
 	kcondvar_t	zl_cv_suspend;	/* log suspend completion */
 	uint8_t		zl_suspending;	/* log is currently suspending */
 	uint8_t		zl_keep_first;	/* keep first log block in destroy */
-	uint8_t		zl_stop_replay;	/* don't replay any further */
+	uint8_t		zl_replay;	/* don't replay any further */
 	uint8_t		zl_stop_sync;	/* for debugging */
 	uint8_t		zl_writer;	/* boolean: write setup in progress */
 	uint8_t		zl_log_error;	/* boolean: log write error */
@@ -102,6 +103,9 @@ typedef struct zil_dva_node {
 	avl_node_t	zn_node;
 } zil_dva_node_t;
 
+#define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
+    sizeof (lr_write_t))
+
 #ifdef	__cplusplus
 }
 #endif

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -2137,12 +2137,12 @@ top:
 		}
 	}
 
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		mutex_exit(&zp->z_acl_lock);
 		mutex_exit(&zp->z_lock);
 
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -2197,7 +2197,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t
 
 	*check_privs = B_TRUE;
 
-	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
+	if (zfsvfs->z_replay) {
 		*working_mode = 0;
 		return (0);
 	}

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -564,24 +564,6 @@ zfs_rmnode(znode_t *zp)
 	ASSERT(zp->z_phys->zp_links == 0);
 
 	/*
-	 * If this is a ZIL replay then leave the object in the unlinked set.
-	 * Otherwise we can get a deadlock, because the delete can be
-	 * quite large and span multiple tx's and txgs, but each replay
-	 * creates a tx to atomically run the replay function and mark the
-	 * replay record as complete. We deadlock trying to start a tx in
-	 * a new txg to further the deletion but can't because the replay
-	 * tx hasn't finished.
-	 *
-	 * We actually delete the object if we get a failure to create an
-	 * object in zil_replay_log_record(), or after calling zil_replay().
-	 */
-	if (zfsvfs->z_assign >= TXG_INITIAL) {
-		zfs_znode_dmu_fini(zp);
-		zfs_znode_free(zp);
-		return;
-	}
-
-	/*
 	 * If this is an attribute directory, purge its contents.
 	 */
 	if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
@@ -855,9 +837,9 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *
 			    FUID_SIZE_ESTIMATE(zfsvfs));
 		}
 	}
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+		if (error == ERESTART)
 			dmu_tx_wait(tx);
 		dmu_tx_abort(tx);
 		return (error);
@@ -944,7 +926,7 @@ top:
 	error = zfs_make_xattrdir(zp, &va, xvpp, cr);
 	zfs_dirent_unlock(dl);
 
-	if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+	if (error == ERESTART) {
 		/* NB: we already did dmu_tx_wait() if necessary */
 		goto top;
 	}
@@ -975,7 +957,7 @@ zfs_sticky_remove_access(znode_t *zdp, z
 	uid_t		fowner;
 	zfsvfs_t	*zfsvfs = zdp->z_zfsvfs;
 
-	if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL)	/* ZIL replay */
+	if (zdp->z_zfsvfs->z_replay)
 		return (0);
 
 	if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -525,7 +525,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64
 	uint32_t rid;
 	idmap_stat status;
 	uint64_t idx;
-	boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL);
 	zfs_fuid_t *zfuid = NULL;
 	zfs_fuid_info_t *fuidp;
 
@@ -540,7 +539,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64
 	if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
 		return (id);
 
-	if (is_replay) {
+	if (zfsvfs->z_replay) {
 		fuidp = zfsvfs->z_fuid_replay;
 
 		/*
@@ -594,7 +593,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64
 
 	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
 
-	if (!is_replay)
+	if (!zfsvfs->z_replay)
 		zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
 	else if (zfuid != NULL) {
 		list_remove(&fuidp->z_fuids, zfuid);

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -42,6 +42,17 @@
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/zfs_fuid.h>
+#include <sys/dsl_dataset.h>
+
+#define	ZFS_HANDLE_REPLAY(zilog, tx) \
+	if (zilog->zl_replay) { \
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \
+		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \
+		    zilog->zl_replaying_seq; \
+		return; \
+	}
+
+
 
 /*
  * All the functions in this file are used to construct the log entries
@@ -236,6 +247,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t 
 	if (zilog == NULL)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	/*
 	 * If we have FUIDs present then add in space for
 	 * domains and ACE fuid's if any.
@@ -339,6 +352,8 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t 
 	if (zilog == NULL)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+	
 	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
 	lr = (lr_remove_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
@@ -363,6 +378,8 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *t
 	if (zilog == NULL)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
 	lr = (lr_link_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
@@ -390,6 +407,8 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t
 	if (zilog == NULL)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
 	lr = (lr_create_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
@@ -424,6 +443,8 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t 
 	if (zilog == NULL)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
 	lr = (lr_rename_t *)&itx->itx_lr;
 	lr->lr_sdoid = sdzp->z_id;
@@ -456,6 +477,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *
 	if (zilog == NULL || zp->z_unlinked)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	/*
 	 * Writes are handled in three different ways:
 	 *
@@ -554,6 +577,8 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_
 	if (zilog == NULL || zp->z_unlinked)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+	
 	itx = zil_itx_create(txtype, sizeof (*lr));
 	lr = (lr_truncate_t *)&itx->itx_lr;
 	lr->lr_foid = zp->z_id;
@@ -583,6 +608,8 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t
 	if (zilog == NULL || zp->z_unlinked)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	/*
 	 * If XVATTR set, then log record size needs to allow
 	 * for lr_attr_t + xvattr mask, mapsize and create time
@@ -649,6 +676,8 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx
 	if (zilog == NULL || zp->z_unlinked)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
 	    TX_ACL_V0 : TX_ACL;
 

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -499,6 +499,13 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t
 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
 
+	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+	if (zil_disable) {
+		zil_destroy(zfsvfs->z_log, 0);
+		zfsvfs->z_log = NULL;
+	}
+
+	
 	/*
 	 * If we are not mounting (ie: online recv), then we don't
 	 * have to worry about replaying the log as we blocked all
@@ -512,21 +519,27 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t
 		 * allow replays to succeed.
 		 */
 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
-		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
-
-		/*
-		 * Parse and replay the intent log.
-		 */
-		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
-		    zfs_replay_vector, zfs_unlinked_drain);
-
-		zfs_unlinked_drain(zfsvfs);
+		if (readonly != 0)
+			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+		else
+			zfs_unlinked_drain(zfsvfs);
+		
+		if (zfsvfs->z_log) {
+			
+			/*
+			 * Parse and replay the intent log.
+			 * Because of ziltest, this must be done after
+			 * zfs_unlinked_drain().  (Further note: ziltest
+			 * doesn't use readonly mounts, where
+			 */
+			zfsvfs->z_replay = B_TRUE;
+			zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
+			zfsvfs->z_replay = B_FALSE;
+		}
+		
 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
 	}
 
-	if (!zil_disable)
-		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
-
 	return (0);
 }
 
@@ -562,7 +575,6 @@ zfs_domount(vfs_t *vfsp, char *osname)
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 	zfsvfs->z_vfs = vfsp;
 	zfsvfs->z_parent = zfsvfs;
-	zfsvfs->z_assign = TXG_NOWAIT;
 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -132,12 +132,12 @@
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
- *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
+ *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
- *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ *		if (error == ERESTART) {
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
@@ -793,10 +793,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int i
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_bonus(tx, zp->z_id);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
-		error = dmu_tx_assign(tx, zfsvfs->z_assign);
+		error = dmu_tx_assign(tx, TXG_NOWAIT);
 		if (error) {
-			if (error == ERESTART &&
-			    zfsvfs->z_assign == TXG_NOWAIT) {
+			if (error == ERESTART) {
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				continue;
@@ -906,7 +905,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int i
 	 * If we're in replay mode, or we made no progress, return error.
 	 * Otherwise, it's at least a partial write, so it's successful.
 	 */
-	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
+	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
@@ -1397,11 +1396,10 @@ top:
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, SPA_MAXBLOCKSIZE);
 		}
-		error = dmu_tx_assign(tx, zfsvfs->z_assign);
+		error = dmu_tx_assign(tx, TXG_NOWAIT);
 		if (error) {
 			zfs_dirent_unlock(dl);
-			if (error == ERESTART &&
-			    zfsvfs->z_assign == TXG_NOWAIT) {
+			if (error == ERESTART) {
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto top;
@@ -1610,11 +1608,11 @@ top:
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -1812,10 +1810,10 @@ top:
 	if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, SPA_MAXBLOCKSIZE);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -1942,13 +1940,13 @@ top:
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -2843,7 +2841,7 @@ top:
 		dmu_tx_hold_bonus(tx, attrzp->z_id);
 	}
 
-	err = dmu_tx_assign(tx, zfsvfs->z_assign);
+	err = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (err) {
 		if (attrzp)
 			VN_RELE(ZTOV(attrzp));
@@ -2853,7 +2851,7 @@ top:
 			aclp = NULL;
 		}
 
-		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (err == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -3298,7 +3296,7 @@ top:
 	if (tzp)
 		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
@@ -3307,7 +3305,7 @@ top:
 		VN_RELE(ZTOV(szp));
 		if (tzp)
 			VN_RELE(ZTOV(tzp));
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -3444,10 +3442,10 @@ top:
 			    FUID_SIZE_ESTIMATE(zfsvfs));
 		}
 	}
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -3663,10 +3661,10 @@ top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, szp->z_id);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -658,7 +658,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, d
 
 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
 
-	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
+	if (zfsvfs->z_replay) {
 		obj = vap->va_nodeid;
 		flag |= IS_REPLAY;
 		now = vap->va_ctime;		/* see zfs_replay_create() */
@@ -1196,9 +1196,9 @@ top:
 		newblksz = 0;
 	}
 
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -1318,9 +1318,9 @@ zfs_trunc(znode_t *zp, uint64_t end)
 top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -1397,9 +1397,9 @@ zfs_freesp(znode_t *zp, uint64_t off, ui
 log:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto log;
@@ -1505,7 +1505,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, 
 	bzero(&zfsvfs, sizeof (zfsvfs_t));
 
 	zfsvfs.z_os = os;
-	zfsvfs.z_assign = TXG_NOWAIT;
 	zfsvfs.z_parent = &zfsvfs;
 	zfsvfs.z_version = version;
 	zfsvfs.z_use_fuids = USE_FUIDS(version, os);

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -1221,7 +1221,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
-	zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+	zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
 
 	if (zilog->zl_destroy_txg == txg) {
 		blkptr_t blk = zh->zh_log;
@@ -1230,7 +1230,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 		ASSERT(spa_sync_pass(spa) == 1);
 
 		bzero(zh, sizeof (zil_header_t));
-		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
+		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
 
 		if (zilog->zl_keep_first) {
 			/*
@@ -1467,9 +1467,7 @@ zil_resume(zilog_t *zilog)
 typedef struct zil_replay_arg {
 	objset_t	*zr_os;
 	zil_replay_func_t **zr_replay;
-	zil_replay_cleaner_t *zr_replay_cleaner;
 	void		*zr_arg;
-	uint64_t	*zr_txgp;
 	boolean_t	zr_byteswap;
 	char		*zr_lrbuf;
 } zil_replay_arg_t;
@@ -1484,7 +1482,7 @@ zil_replay_log_record(zilog_t *zilog, lr
 	char *name;
 	int pass, error, sunk;
 
-	if (zilog->zl_stop_replay)
+	if (!zilog->zl_replay)			/* giving up */
 		return;
 
 	if (lr->lrc_txg < claim_txg)		/* already committed */
@@ -1548,44 +1546,14 @@ zil_replay_log_record(zilog_t *zilog, lr
 	/*
 	 * We must now do two things atomically: replay this log record,
 	 * and update the log header to reflect the fact that we did so.
-	 * We use the DMU's ability to assign into a specific txg to do this.
+	 * At the end of each replay function the sequence number
+	 * is updated if we are in replay mode.
 	 */
-	for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
-		uint64_t replay_txg;
-		dmu_tx_t *replay_tx;
-
-		replay_tx = dmu_tx_create(zr->zr_os);
-		error = dmu_tx_assign(replay_tx, TXG_WAIT);
-		if (error) {
-			dmu_tx_abort(replay_tx);
-			break;
-		}
-
-		replay_txg = dmu_tx_get_txg(replay_tx);
-
-		if (txtype == 0 || txtype >= TX_MAX_TYPE) {
-			error = EINVAL;
-		} else {
-			/*
-			 * On the first pass, arrange for the replay vector
-			 * to fail its dmu_tx_assign().  That's the only way
-			 * to ensure that those code paths remain well tested.
-			 *
-			 * Only byteswap (if needed) on the 1st pass.
-			 */
-			*zr->zr_txgp = replay_txg - (pass == 1);
-			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
-			    zr->zr_byteswap && pass == 1);
-			*zr->zr_txgp = TXG_NOWAIT;
-		}
-
-		if (error == 0) {
-			dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
-			zilog->zl_replay_seq[replay_txg & TXG_MASK] =
-			    lr->lrc_seq;
-		}
-
-		dmu_tx_commit(replay_tx);
+	for (pass = 1; pass <= 2; pass++) {
+		zilog->zl_replaying_seq = lr->lrc_seq;
+		/* Only byteswap (if needed) on the 1st pass.  */
+		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
+		    zr->zr_byteswap && pass == 1);
 
 		if (!error)
 			return;
@@ -1593,36 +1561,22 @@ zil_replay_log_record(zilog_t *zilog, lr
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
-		 * EEXIST. So if we receive any error other than ERESTART
-		 * we try syncing out any removes then retrying the
-		 * transaction.
+		 * EEXIST. So if we receive any error we try syncing out
+		 * any removes then retry the transaction.
 		 */
-		if (error != ERESTART && !sunk) {
-			if (zr->zr_replay_cleaner)
-				zr->zr_replay_cleaner(zr->zr_arg);
+		if (pass == 1)
 			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
-			sunk = B_TRUE;
-			continue; /* retry */
-		}
-
-		if (error != ERESTART)
-			break;
-
-		if (pass != 1)
-			txg_wait_open(spa_get_dsl(zilog->zl_spa),
-			    replay_txg + 1);
-
-		dprintf("pass %d, retrying\n", pass);
 	}
-
-	ASSERT(error && error != ERESTART);
+bad:
+	
+	ASSERT(error);
 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 	dmu_objset_name(zr->zr_os, name);
 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
 	    "dataset %s, seq 0x%llx, txtype %llu %s\n",
 	    error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
 	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
-	zilog->zl_stop_replay = 1;
+	zilog->zl_replay = B_FALSE;
 	kmem_free(name, MAXNAMELEN);
 }
 
@@ -1637,9 +1591,8 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *
  * If this dataset has a non-empty intent log, replay it and destroy it.
  */
 void
-zil_replay(objset_t *os, void *arg, uint64_t *txgp,
-	zil_replay_func_t *replay_func[TX_MAX_TYPE],
-	zil_replay_cleaner_t *replay_cleaner)
+zil_replay(objset_t *os, void *arg,
+    zil_replay_func_t *replay_func[TX_MAX_TYPE])
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *zh = zilog->zl_header;
@@ -1653,9 +1606,7 @@ zil_replay(objset_t *os, void *arg, uint
 
 	zr.zr_os = os;
 	zr.zr_replay = replay_func;
-	zr.zr_replay_cleaner = replay_cleaner;
 	zr.zr_arg = arg;
-	zr.zr_txgp = txgp;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
@@ -1664,7 +1615,7 @@ zil_replay(objset_t *os, void *arg, uint
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
-	zilog->zl_stop_replay = 0;
+	zilog->zl_replay = B_TRUE;
 	zilog->zl_replay_time = LBOLT;
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
@@ -1673,6 +1624,7 @@ zil_replay(objset_t *os, void *arg, uint
 
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+	zilog->zl_replay = B_FALSE;
 	//printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name);
 }
 

Modified: user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
==============================================================================
--- user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	Mon Sep 21 23:58:29 2009	(r197397)
+++ user/kmacy/releng_8_fcs/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	Tue Sep 22 03:10:04 2009	(r197398)
@@ -73,6 +73,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/zvol.h>
 #include <geom/geom.h>
+#include <sys/zil_impl.h>
 
 #include "zfs_namecheck.h"
 
@@ -138,6 +139,7 @@ typedef struct zvol_state {
 #define	ZVOL_RDONLY	0x1
 #define	ZVOL_DUMPIFIED	0x2
 #define	ZVOL_EXCL	0x4
+#define	ZVOL_WCE	0x8
 
 /*
  * zvol maximum transfer in one DMU tx.
@@ -294,28 +296,72 @@ zvol_access(struct g_provider *pp, int a
 ssize_t zvol_immediate_write_sz = 32768;
 
 static void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
+	boolean_t sync)
 {
 	uint32_t blocksize = zv->zv_volblocksize;
-	lr_write_t *lr;
+	zilog_t *zilog = zv->zv_zilog;
+	boolean_t slogging;
 
-	while (len) {
-		ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
-		itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+	if (zil_disable)
+		return;
 
-		itx->itx_wr_state =
-		    len > zvol_immediate_write_sz ?  WR_INDIRECT : WR_NEED_COPY;
-		itx->itx_private = zv;
+	if (zilog->zl_replay) {
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+		    zilog->zl_replaying_seq;
+		return;
+	}
+	slogging = spa_has_slogs(zilog->zl_spa);
+
+	while (resid) {
+		ssize_t len;
+		itx_t *itx;
+		lr_write_t *lr;
+		itx_wr_state_t write_state;
+
+		/*
+		 * Unlike zfs_log_write() we can be called with
+		 * upto DMU_MAX_ACCESS/2 (5MB) writes.
+		 */
+		if (blocksize > zvol_immediate_write_sz && !slogging &&
+		    resid >= blocksize && off % blocksize == 0) {
+			write_state = WR_INDIRECT; /* uses dmu_sync */
+			len = blocksize;
+		} else if (sync) {
+			write_state = WR_COPIED;
+			len = MIN(ZIL_MAX_LOG_DATA, resid);
+		} else {
+			write_state = WR_NEED_COPY;
+			len = MIN(ZIL_MAX_LOG_DATA, resid);
+		}
+
+		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
+		    (write_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
+		if (write_state == WR_COPIED && dmu_read_flags(zv->zv_objset,
+		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
+			kmem_free(itx, offsetof(itx_t, itx_lr) +
+			    itx->itx_lr.lrc_reclen);
+			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+			lr = (lr_write_t *)&itx->itx_lr;
+			write_state = WR_NEED_COPY;
+		}
+
+		itx->itx_wr_state = write_state;
+		if (write_state == WR_NEED_COPY)
+			itx->itx_sod += len;
+		itx->itx_private = zv;
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = off;
-		lr->lr_length = nbytes;
+		lr->lr_length = len;
 		lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
 		BP_ZERO(&lr->lr_blkptr);
 
 		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
-		len -= nbytes;
-		off += nbytes;
+
+		off += len;
+		resid -= len;
 	}
 }
 
@@ -353,6 +399,7 @@ zvol_serve_one(zvol_state_t *zv, struct 
 	rl_t *rl;
 	int error = 0;
 	boolean_t reading;
+	boolean_t sync;
 
 	off = bp->bio_offset;
 	volsize = zv->zv_volsize;
@@ -365,12 +412,15 @@ zvol_serve_one(zvol_state_t *zv, struct 
 
 	error = 0;
 
+
+	reading = (bp->bio_cmd == BIO_READ);
+	sync =  /* !(bp->b_flags & B_ASYNC) &&  !is_dump && */ !reading &&
+	    !(zv->zv_flags & ZVOL_WCE) && !zil_disable;	
 	/*
 	 * There must be no buffer changes when doing a dmu_sync() because
 	 * we can't change the data whilst calculating the checksum.
 	 * A better approach than a per zvol rwlock would be to lock ranges.
 	 */
-	reading = (bp->bio_cmd == BIO_READ);
 	rl = zfs_range_lock(&zv->zv_znode, off, resid,
 	    reading ? RL_READER : RL_WRITER);
 
@@ -391,7 +441,7 @@ zvol_serve_one(zvol_state_t *zv, struct 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200909220310.n8M3A4vX007682>