Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 17 Nov 2016 22:22:00 +0000
From:      Steven Hartland <steven.hartland@multiplay.co.uk>
To:        Alexander Motin <mav@FreeBSD.org>, src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   Re: svn commit: r308782 - in head: cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Message-ID:  <7675c768-c1b1-108b-b651-28b6184f8176@multiplay.co.uk>
In-Reply-To: <725f293e-3d3b-1248-4900-acee5659d382@FreeBSD.org>
References:  <201611172101.uAHL1R2O009871@repo.freebsd.org> <79c521bc-3186-10a5-d553-12072523f439@multiplay.co.uk> <725f293e-3d3b-1248-4900-acee5659d382@FreeBSD.org>

next in thread | previous in thread | raw e-mail | index | archive | help
Thanks, looks like the PR needs a rebase before it can be merged.

On 17/11/2016 22:11, Alexander Motin wrote:
> It is in OpenZFS review queue now:
> https://github.com/openzfs/openzfs/pull/219  Welcome to comment there to
> speed up the process.
>
> On 17.11.2016 13:43, Steven Hartland wrote:
>> Is this something that should be upstreamed?
>>
>> On 17/11/2016 21:01, Alexander Motin wrote:
>>> Author: mav
>>> Date: Thu Nov 17 21:01:27 2016
>>> New Revision: 308782
>>> URL: https://svnweb.freebsd.org/changeset/base/308782
>>>
>>> Log:
>>>    After some ZIL changes 6 years ago zil_slog_limit got partially broken
>>>    due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
>>>    Actually because of other changes about that time zl_itx_list_sz is not
>>>    really required to implement the functionality, so this patch removes
>>>    some unneeded broken code and variables.
>>>    
>>>    Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
>>>    single heavy logger, that increased latency for other (more latency critical)
>>>    loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
>>>    huge latency increase for heavy writers, this implementation caused double
>>>    write of all data, since the log records were explicitly prepared for SLOG.
>>>    Since we now have I/O scheduler, I've found it can be much more efficient
>>>    to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
>>>    to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
>>>    
>>>    Existing ZIL implementation had problem with space efficiency when it
>>>    has to write large chunks of data into log blocks of limited size. In some
>>>    cases efficiency dropped to almost as low as 50%. In case of ZIL stored on
>>>    spinning rust, that also reduced log write speed in half, since head had to
>>>    uselessly fly over allocated but not written areas. This change improves
>>>    the situation by offloading problematic operations from z*_log_write() to
>>>    zil_lwb_commit(), which knows real situation of log blocks allocation and
>>>    can split large requests into pieces much more efficiently. Also as side
>>>    effect it removes one of two data copy operations done by ZIL code in WR_COPIED
>>>    case.
>>>    
>>>    While there, untangle and unify code of z*_log_write() functions.
>>>    Also zfs_log_write(), like zvol_log_write(), can now handle writes crossing
>>>    block boundary, that may also improve efficiency if ZPL is made to do that.
>>>    
>>>    Sponsored by:	iXsystems, Inc.
>>>
>>> Modified:
>>>    head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
>>>    head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
>>>    head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
>>>    head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
>>>    head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
>>>    head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
>>>    head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
>>>    head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
>>>
>>> Modified: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
>>> ==============================================================================
>>> --- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Thu Nov 17 20:44:51 2016	(r308781)
>>> +++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Thu Nov 17 21:01:27 2016	(r308782)
>>> @@ -1371,7 +1371,6 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t
>>>   	itx->itx_private = zd;
>>>   	itx->itx_wr_state = write_state;
>>>   	itx->itx_sync = (ztest_random(8) == 0);
>>> -	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
>>>   
>>>   	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
>>>   	    sizeof (*lr) - sizeof (lr_t));
>>>
>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
>>> ==============================================================================
>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	Thu Nov 17 20:44:51 2016	(r308781)
>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	Thu Nov 17 21:01:27 2016	(r308782)
>>> @@ -369,7 +369,6 @@ typedef struct itx {
>>>   	void		*itx_private;	/* type-specific opaque data */
>>>   	itx_wr_state_t	itx_wr_state;	/* write state */
>>>   	uint8_t		itx_sync;	/* synchronous transaction */
>>> -	uint64_t	itx_sod;	/* record size on disk */
>>>   	uint64_t	itx_oid;	/* object id */
>>>   	lr_t		itx_lr;		/* common part of log record */
>>>   	/* followed by type-specific part of lr_xx_t and its immediate data */
>>>
>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
>>> ==============================================================================
>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	Thu Nov 17 20:44:51 2016	(r308781)
>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	Thu Nov 17 21:01:27 2016	(r308782)
>>> @@ -42,6 +42,7 @@ extern "C" {
>>>   typedef struct lwb {
>>>   	zilog_t		*lwb_zilog;	/* back pointer to log struct */
>>>   	blkptr_t	lwb_blk;	/* on disk address of this log blk */
>>> +	boolean_t	lwb_slog;	/* lwb_blk is on SLOG device */
>>>   	int		lwb_nused;	/* # used bytes in buffer */
>>>   	int		lwb_sz;		/* size of block and buffer */
>>>   	char		*lwb_buf;	/* log write buffer */
>>> @@ -62,7 +63,6 @@ typedef struct itxs {
>>>   typedef struct itxg {
>>>   	kmutex_t	itxg_lock;	/* lock for this structure */
>>>   	uint64_t	itxg_txg;	/* txg for this chain */
>>> -	uint64_t	itxg_sod;	/* total size on disk for this txg */
>>>   	itxs_t		*itxg_itxs;	/* sync and async itxs */
>>>   } itxg_t;
>>>   
>>> @@ -120,7 +120,6 @@ struct zilog {
>>>   	kcondvar_t	zl_cv_batch[2];	/* batch condition variables */
>>>   	itxg_t		zl_itxg[TXG_SIZE]; /* intent log txg chains */
>>>   	list_t		zl_itx_commit_list; /* itx list to be committed */
>>> -	uint64_t	zl_itx_list_sz;	/* total size of records on list */
>>>   	uint64_t	zl_cur_used;	/* current commit log size used */
>>>   	list_t		zl_lwb_list;	/* in-flight log write list */
>>>   	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
>>> @@ -142,6 +141,8 @@ typedef struct zil_bp_node {
>>>   
>>>   #define	ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
>>>       sizeof (lr_write_t))
>>> +#define	ZIL_MAX_COPIED_DATA \
>>> +    ((SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
>>>   
>>>   #ifdef	__cplusplus
>>>   }
>>>
>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
>>> ==============================================================================
>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	Thu Nov 17 20:44:51 2016	(r308781)
>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	Thu Nov 17 21:01:27 2016	(r308782)
>>> @@ -547,7 +547,7 @@ extern zio_t *zio_free_sync(zio_t *pio,
>>>       const blkptr_t *bp, uint64_t size, enum zio_flag flags);
>>>   
>>>   extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
>>> -    blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
>>> +    blkptr_t *old_bp, uint64_t size, boolean_t *slog);
>>>   extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
>>>   extern void zio_flush(zio_t *zio, vdev_t *vd);
>>>   extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
>>>
>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
>>> ==============================================================================
>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	Thu Nov 17 20:44:51 2016	(r308781)
>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	Thu Nov 17 21:01:27 2016	(r308782)
>>> @@ -464,20 +464,17 @@ void
>>>   zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
>>>       znode_t *zp, offset_t off, ssize_t resid, int ioflag)
>>>   {
>>> +	uint32_t blocksize = zp->z_blksz;
>>>   	itx_wr_state_t write_state;
>>> -	boolean_t slogging;
>>>   	uintptr_t fsync_cnt;
>>> -	ssize_t immediate_write_sz;
>>>   
>>>   	if (zil_replaying(zilog, tx) || zp->z_unlinked)
>>>   		return;
>>>   
>>> -	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>>> -	    ? 0 : zfs_immediate_write_sz;
>>> -
>>> -	slogging = spa_has_slogs(zilog->zl_spa) &&
>>> -	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
>>> -	if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
>>> +	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>>> +		write_state = WR_INDIRECT;
>>> +	else if (!spa_has_slogs(zilog->zl_spa) &&
>>> +	    resid >= zfs_immediate_write_sz)
>>>   		write_state = WR_INDIRECT;
>>>   	else if (ioflag & (FSYNC | FDSYNC))
>>>   		write_state = WR_COPIED;
>>> @@ -491,30 +488,26 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *
>>>   	while (resid) {
>>>   		itx_t *itx;
>>>   		lr_write_t *lr;
>>> -		ssize_t len;
>>> +		itx_wr_state_t wr_state = write_state;
>>> +		ssize_t len = resid;
>>>   
>>> -		/*
>>> -		 * If the write would overflow the largest block then split it.
>>> -		 */
>>> -		if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
>>> -			len = SPA_OLD_MAXBLOCKSIZE >> 1;
>>> -		else
>>> -			len = resid;
>>> +		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
>>> +			wr_state = WR_NEED_COPY;
>>> +		else if (wr_state == WR_INDIRECT)
>>> +			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
>>>   
>>>   		itx = zil_itx_create(txtype, sizeof (*lr) +
>>> -		    (write_state == WR_COPIED ? len : 0));
>>> +		    (wr_state == WR_COPIED ? len : 0));
>>>   		lr = (lr_write_t *)&itx->itx_lr;
>>> -		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
>>> +		if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
>>>   		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
>>>   			zil_itx_destroy(itx);
>>>   			itx = zil_itx_create(txtype, sizeof (*lr));
>>>   			lr = (lr_write_t *)&itx->itx_lr;
>>> -			write_state = WR_NEED_COPY;
>>> +			wr_state = WR_NEED_COPY;
>>>   		}
>>>   
>>> -		itx->itx_wr_state = write_state;
>>> -		if (write_state == WR_NEED_COPY)
>>> -			itx->itx_sod += len;
>>> +		itx->itx_wr_state = wr_state;
>>>   		lr->lr_foid = zp->z_id;
>>>   		lr->lr_offset = off;
>>>   		lr->lr_length = len;
>>>
>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
>>> ==============================================================================
>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	Thu Nov 17 20:44:51 2016	(r308781)
>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	Thu Nov 17 21:01:27 2016	(r308782)
>>> @@ -88,6 +88,15 @@ SYSCTL_DECL(_vfs_zfs_trim);
>>>   SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
>>>       "Enable ZFS TRIM");
>>>   
>>> +/*
>>> + * Limit SLOG write size per commit executed with synchronous priority.
>>> + * Any writes above that executed with lower (asynchronous) priority to
>>> + * limit potential SLOG device abuse by single active ZIL writer.
>>> + */
>>> +uint64_t zil_slog_limit = 768 * 1024;
>>> +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
>>> +    &zil_slog_limit, 0, "Maximal SLOG commit size with sync priority");
>>> +
>>>   static kmem_cache_t *zil_lwb_cache;
>>>   
>>>   #define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
>>> @@ -447,13 +456,14 @@ zil_free_log_record(zilog_t *zilog, lr_t
>>>   }
>>>   
>>>   static lwb_t *
>>> -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
>>> +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
>>>   {
>>>   	lwb_t *lwb;
>>>   
>>>   	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
>>>   	lwb->lwb_zilog = zilog;
>>>   	lwb->lwb_blk = *bp;
>>> +	lwb->lwb_slog = slog;
>>>   	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
>>>   	lwb->lwb_max_txg = txg;
>>>   	lwb->lwb_zio = NULL;
>>> @@ -516,6 +526,7 @@ zil_create(zilog_t *zilog)
>>>   	dmu_tx_t *tx = NULL;
>>>   	blkptr_t blk;
>>>   	int error = 0;
>>> +	boolean_t slog = FALSE;
>>>   
>>>   	/*
>>>   	 * Wait for any previous destroy to complete.
>>> @@ -544,7 +555,7 @@ zil_create(zilog_t *zilog)
>>>   		}
>>>   
>>>   		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
>>> -		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
>>> +		    ZIL_MIN_BLKSZ, &slog);
>>>   
>>>   		if (error == 0)
>>>   			zil_init_log_chain(zilog, &blk);
>>> @@ -554,7 +565,7 @@ zil_create(zilog_t *zilog)
>>>   	 * Allocate a log write buffer (lwb) for the first log block.
>>>   	 */
>>>   	if (error == 0)
>>> -		lwb = zil_alloc_lwb(zilog, &blk, txg);
>>> +		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
>>>   
>>>   	/*
>>>   	 * If we just allocated the first log block, commit our transaction
>>> @@ -885,6 +896,7 @@ static void
>>>   zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
>>>   {
>>>   	zbookmark_phys_t zb;
>>> +	zio_priority_t prio;
>>>   
>>>   	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
>>>   	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
>>> @@ -895,9 +907,13 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t
>>>   		    ZIO_FLAG_CANFAIL);
>>>   	}
>>>   	if (lwb->lwb_zio == NULL) {
>>> +		if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
>>> +			prio = ZIO_PRIORITY_SYNC_WRITE;
>>> +		else
>>> +			prio = ZIO_PRIORITY_ASYNC_WRITE;
>>>   		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
>>>   		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
>>> -		    zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
>>> +		    zil_lwb_write_done, lwb, prio,
>>>   		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
>>>   	}
>>>   }
>>> @@ -917,18 +933,6 @@ uint64_t zil_block_buckets[] = {
>>>   };
>>>   
>>>   /*
>>> - * Use the slog as long as the logbias is 'latency' and the current commit size
>>> - * is less than the limit or the total list size is less than 2X the limit.
>>> - * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
>>> - */
>>> -uint64_t zil_slog_limit = 1024 * 1024;
>>> -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
>>> -    &zil_slog_limit, 0, "Maximal commit size to use SLOG");
>>> -#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
>>> -	(((zilog)->zl_cur_used < zil_slog_limit) || \
>>> -	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
>>> -
>>> -/*
>>>    * Start a log block write and advance to the next log block.
>>>    * Calls are serialized.
>>>    */
>>> @@ -943,6 +947,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>>>   	uint64_t txg;
>>>   	uint64_t zil_blksz, wsz;
>>>   	int i, error;
>>> +	boolean_t slog;
>>>   
>>>   	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
>>>   		zilc = (zil_chain_t *)lwb->lwb_buf;
>>> @@ -999,8 +1004,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>>>   
>>>   	BP_ZERO(bp);
>>>   	/* pass the old blkptr in order to spread log blocks across devs */
>>> -	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
>>> -	    USE_SLOG(zilog));
>>> +	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
>>>   	if (error == 0) {
>>>   		ASSERT3U(bp->blk_birth, ==, txg);
>>>   		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
>>> @@ -1009,7 +1013,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>>>   		/*
>>>   		 * Allocate a new log write buffer (lwb).
>>>   		 */
>>> -		nlwb = zil_alloc_lwb(zilog, bp, txg);
>>> +		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
>>>   
>>>   		/* Record the block for later vdev flushing */
>>>   		zil_add_block(zilog, &lwb->lwb_blk);
>>> @@ -1046,12 +1050,13 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>>>   static lwb_t *
>>>   zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
>>>   {
>>> -	lr_t *lrc = &itx->itx_lr; /* common log record */
>>> -	lr_write_t *lrw = (lr_write_t *)lrc;
>>> +	lr_t *lrcb, *lrc = &itx->itx_lr; /* common log record */
>>> +	lr_write_t *lrwb, *lrw = (lr_write_t *)lrc;
>>>   	char *lr_buf;
>>>   	uint64_t txg = lrc->lrc_txg;
>>>   	uint64_t reclen = lrc->lrc_reclen;
>>>   	uint64_t dlen = 0;
>>> +	uint64_t dnow, lwb_sp;
>>>   
>>>   	if (lwb == NULL)
>>>   		return (NULL);
>>> @@ -1068,25 +1073,30 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
>>>   
>>>   	zil_lwb_write_init(zilog, lwb);
>>>   
>>> +cont:
>>>   	/*
>>>   	 * If this record won't fit in the current log block, start a new one.
>>> +	 * For WR_NEED_COPY optimize layout for minimal number of chunks, but
>>> +	 * try to keep wasted space withing reasonable range (12%).
>>>   	 */
>>> -	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
>>> +	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
>>> +	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
>>> +	    lwb_sp < ZIL_MAX_LOG_DATA / 8 && (dlen % ZIL_MAX_LOG_DATA == 0 ||
>>> +	    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
>>>   		lwb = zil_lwb_write_start(zilog, lwb);
>>>   		if (lwb == NULL)
>>>   			return (NULL);
>>>   		zil_lwb_write_init(zilog, lwb);
>>>   		ASSERT(LWB_EMPTY(lwb));
>>> -		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
>>> -			txg_wait_synced(zilog->zl_dmu_pool, txg);
>>> -			return (lwb);
>>> -		}
>>> +		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
>>> +		ASSERT3U(reclen + MIN(dlen, sizeof(uint64_t)), <=, lwb_sp);
>>>   	}
>>>   
>>> +	dnow = MIN(dlen, lwb_sp - reclen);
>>>   	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
>>>   	bcopy(lrc, lr_buf, reclen);
>>> -	lrc = (lr_t *)lr_buf;
>>> -	lrw = (lr_write_t *)lrc;
>>> +	lrcb = (lr_t *)lr_buf;
>>> +	lrwb = (lr_write_t *)lrcb;
>>>   
>>>   	/*
>>>   	 * If it's a write, fetch the data or get its blkptr as appropriate.
>>> @@ -1098,16 +1108,19 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
>>>   			char *dbuf;
>>>   			int error;
>>>   
>>> -			if (dlen) {
>>> -				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
>>> +			if (itx->itx_wr_state == WR_NEED_COPY) {
>>>   				dbuf = lr_buf + reclen;
>>> -				lrw->lr_common.lrc_reclen += dlen;
>>> +				lrcb->lrc_reclen += dnow;
>>> +				if (lrwb->lr_length > dnow)
>>> +					lrwb->lr_length = dnow;
>>> +				lrw->lr_offset += dnow;
>>> +				lrw->lr_length -= dnow;
>>>   			} else {
>>>   				ASSERT(itx->itx_wr_state == WR_INDIRECT);
>>>   				dbuf = NULL;
>>>   			}
>>>   			error = zilog->zl_get_data(
>>> -			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
>>> +			    itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
>>>   			if (error == EIO) {
>>>   				txg_wait_synced(zilog->zl_dmu_pool, txg);
>>>   				return (lwb);
>>> @@ -1126,12 +1139,18 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
>>>   	 * equal to the itx sequence number because not all transactions
>>>   	 * are synchronous, and sometimes spa_sync() gets there first.
>>>   	 */
>>> -	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
>>> -	lwb->lwb_nused += reclen + dlen;
>>> +	lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
>>> +	lwb->lwb_nused += reclen + dnow;
>>>   	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
>>>   	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
>>>   	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
>>>   
>>> +	dlen -= dnow;
>>> +	if (dlen > 0) {
>>> +		zilog->zl_cur_used += reclen;
>>> +		goto cont;
>>> +	}
>>> +
>>>   	return (lwb);
>>>   }
>>>   
>>> @@ -1145,7 +1164,6 @@ zil_itx_create(uint64_t txtype, size_t l
>>>   	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
>>>   	itx->itx_lr.lrc_txtype = txtype;
>>>   	itx->itx_lr.lrc_reclen = lrsize;
>>> -	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
>>>   	itx->itx_lr.lrc_seq = 0;	/* defensive */
>>>   	itx->itx_sync = B_TRUE;		/* default is synchronous */
>>>   
>>> @@ -1294,11 +1312,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *it
>>>   			 * this itxg. Save the itxs for release below.
>>>   			 * This should be rare.
>>>   			 */
>>> -			atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
>>> -			itxg->itxg_sod = 0;
>>>   			clean = itxg->itxg_itxs;
>>>   		}
>>> -		ASSERT(itxg->itxg_sod == 0);
>>>   		itxg->itxg_txg = txg;
>>>   		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
>>>   
>>> @@ -1310,8 +1325,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *it
>>>   	}
>>>   	if (itx->itx_sync) {
>>>   		list_insert_tail(&itxs->i_sync_list, itx);
>>> -		atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
>>> -		itxg->itxg_sod += itx->itx_sod;
>>>   	} else {
>>>   		avl_tree_t *t = &itxs->i_async_tree;
>>>   		uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
>>> @@ -1359,8 +1372,6 @@ zil_clean(zilog_t *zilog, uint64_t synce
>>>   	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
>>>   	ASSERT(itxg->itxg_txg != 0);
>>>   	ASSERT(zilog->zl_clean_taskq != NULL);
>>> -	atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
>>> -	itxg->itxg_sod = 0;
>>>   	clean_me = itxg->itxg_itxs;
>>>   	itxg->itxg_itxs = NULL;
>>>   	itxg->itxg_txg = 0;
>>> @@ -1384,7 +1395,6 @@ zil_get_commit_list(zilog_t *zilog)
>>>   {
>>>   	uint64_t otxg, txg;
>>>   	list_t *commit_list = &zilog->zl_itx_commit_list;
>>> -	uint64_t push_sod = 0;
>>>   
>>>   	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
>>>   		otxg = ZILTEST_TXG;
>>> @@ -1401,12 +1411,9 @@ zil_get_commit_list(zilog_t *zilog)
>>>   		}
>>>   
>>>   		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
>>> -		push_sod += itxg->itxg_sod;
>>> -		itxg->itxg_sod = 0;
>>>   
>>>   		mutex_exit(&itxg->itxg_lock);
>>>   	}
>>> -	atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
>>>   }
>>>   
>>>   /*
>>>
>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
>>> ==============================================================================
>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Thu Nov 17 20:44:51 2016	(r308781)
>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Thu Nov 17 21:01:27 2016	(r308782)
>>> @@ -2908,20 +2908,21 @@ zio_dva_unallocate(zio_t *zio, zio_gang_
>>>    */
>>>   int
>>>   zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
>>> -    uint64_t size, boolean_t use_slog)
>>> +    uint64_t size, boolean_t *slog)
>>>   {
>>>   	int error = 1;
>>>   
>>>   	ASSERT(txg > spa_syncing_txg(spa));
>>>   
>>> -	if (use_slog) {
>>> -		error = metaslab_alloc(spa, spa_log_class(spa), size,
>>> -		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
>>> -	}
>>> -
>>> -	if (error) {
>>> +	error = metaslab_alloc(spa, spa_log_class(spa), size,
>>> +	    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
>>> +	if (error == 0) {
>>> +		*slog = TRUE;
>>> +	} else {
>>>   		error = metaslab_alloc(spa, spa_normal_class(spa), size,
>>>   		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
>>> +		if (error == 0)
>>> +			*slog = FALSE;
>>>   	}
>>>   
>>>   	if (error == 0) {
>>>
>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
>>> ==============================================================================
>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	Thu Nov 17 20:44:51 2016	(r308781)
>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	Thu Nov 17 21:01:27 2016	(r308782)
>>> @@ -1387,54 +1387,44 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_
>>>   {
>>>   	uint32_t blocksize = zv->zv_volblocksize;
>>>   	zilog_t *zilog = zv->zv_zilog;
>>> -	boolean_t slogging;
>>> -	ssize_t immediate_write_sz;
>>> +	itx_wr_state_t write_state;
>>>   
>>>   	if (zil_replaying(zilog, tx))
>>>   		return;
>>>   
>>> -	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>>> -	    ? 0 : zvol_immediate_write_sz;
>>> -
>>> -	slogging = spa_has_slogs(zilog->zl_spa) &&
>>> -	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
>>> +	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>>> +		write_state = WR_INDIRECT;
>>> +	else if (!spa_has_slogs(zilog->zl_spa) &&
>>> +	    resid >= blocksize && blocksize > zvol_immediate_write_sz)
>>> +		write_state = WR_INDIRECT;
>>> +	else if (sync)
>>> +		write_state = WR_COPIED;
>>> +	else
>>> +		write_state = WR_NEED_COPY;
>>>   
>>>   	while (resid) {
>>>   		itx_t *itx;
>>>   		lr_write_t *lr;
>>> -		ssize_t len;
>>> -		itx_wr_state_t write_state;
>>> +		itx_wr_state_t wr_state = write_state;
>>> +		ssize_t len = resid;
>>>   
>>> -		/*
>>> -		 * Unlike zfs_log_write() we can be called with
>>> -		 * upto DMU_MAX_ACCESS/2 (5MB) writes.
>>> -		 */
>>> -		if (blocksize > immediate_write_sz && !slogging &&
>>> -		    resid >= blocksize && off % blocksize == 0) {
>>> -			write_state = WR_INDIRECT; /* uses dmu_sync */
>>> -			len = blocksize;
>>> -		} else if (sync) {
>>> -			write_state = WR_COPIED;
>>> -			len = MIN(ZIL_MAX_LOG_DATA, resid);
>>> -		} else {
>>> -			write_state = WR_NEED_COPY;
>>> -			len = MIN(ZIL_MAX_LOG_DATA, resid);
>>> -		}
>>> +		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
>>> +			wr_state = WR_NEED_COPY;
>>> +		else if (wr_state == WR_INDIRECT)
>>> +			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
>>>   
>>>   		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
>>> -		    (write_state == WR_COPIED ? len : 0));
>>> +		    (wr_state == WR_COPIED ? len : 0));
>>>   		lr = (lr_write_t *)&itx->itx_lr;
>>> -		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
>>> +		if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
>>>   		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
>>>   			zil_itx_destroy(itx);
>>>   			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
>>>   			lr = (lr_write_t *)&itx->itx_lr;
>>> -			write_state = WR_NEED_COPY;
>>> +			wr_state = WR_NEED_COPY;
>>>   		}
>>>   
>>> -		itx->itx_wr_state = write_state;
>>> -		if (write_state == WR_NEED_COPY)
>>> -			itx->itx_sod += len;
>>> +		itx->itx_wr_state = wr_state;
>>>   		lr->lr_foid = ZVOL_OBJ;
>>>   		lr->lr_offset = off;
>>>   		lr->lr_length = len;
>>>




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?7675c768-c1b1-108b-b651-28b6184f8176>