Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 6 Aug 2018 21:09:11 +0000 (UTC)
From:      Kirk McKusick <mckusick@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r337396 - in head/sys/ufs: ffs ufs
Message-ID:  <201808062109.w76L9B9h021616@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mckusick
Date: Mon Aug  6 21:09:11 2018
New Revision: 337396
URL: https://svnweb.freebsd.org/changeset/base/337396

Log:
  Put in place the framework for consolidating contiguous blocks into
  a smaller number of larger TRIM requests. The hope had been to have
  the full TRIM consolidation in place for 12.0, but the algorithms
  are still under development and need further testing. With this
  framework in place it will be possible to easily add TRIM consolidation
  once the optimal strategy has been found.
  
  The only functional change with this patch is the elimination of TRIM
  requests for blocks that are freed before they are likely to have
  been written.
  
  Reviewed by: kib
  Discussed with: Warner Losh and Chuck Silvers
  Sponsored by: Netflix

Modified:
  head/sys/ufs/ffs/ffs_alloc.c
  head/sys/ufs/ffs/ffs_balloc.c
  head/sys/ufs/ffs/ffs_extern.h
  head/sys/ufs/ffs/ffs_inode.c
  head/sys/ufs/ffs/ffs_snapshot.c
  head/sys/ufs/ffs/ffs_softdep.c
  head/sys/ufs/ffs/ffs_vfsops.c
  head/sys/ufs/ffs/softdep.h
  head/sys/ufs/ufs/ufsmount.h

Modified: head/sys/ufs/ffs/ffs_alloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_alloc.c	Mon Aug  6 20:39:27 2018	(r337395)
+++ head/sys/ufs/ffs/ffs_alloc.c	Mon Aug  6 21:09:11 2018	(r337396)
@@ -110,8 +110,6 @@ static ufs2_daddr_t
 static void	ffs_blkfree_cg(struct ufsmount *, struct fs *,
 		    struct vnode *, ufs2_daddr_t, long, ino_t,
 		    struct workhead *);
-static void	ffs_blkfree_trim_completed(struct buf *);
-static void	ffs_blkfree_trim_task(void *ctx, int pending __unused);
 #ifdef INVARIANTS
 static int	ffs_checkblk(struct inode *, ufs2_daddr_t, long);
 #endif
@@ -395,8 +393,23 @@ retry:
 	if (bno > 0) {
 		bp->b_blkno = fsbtodb(fs, bno);
 		if (!DOINGSOFTDEP(vp))
+			/*
+			 * The usual case is that a smaller fragment that
+			 * was just allocated has been replaced with a bigger
+			 * fragment or a full-size block. If it is marked as
+			 * B_DELWRI, the current contents have not been written
+			 * to disk. It is possible that the block was written
+			 * earlier, but very uncommon. If the block has never
+			 * been written, there is no need to send a BIO_DELETE
+			 * for it when it is freed. The gain from avoiding the
+			 * TRIMs for the common case of unwritten blocks far
+			 * exceeds the cost of the write amplification for the
+			 * uncommon case of failing to send a TRIM for a block
+			 * that had been written.
+			 */
 			ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
-			    ip->i_number, vp->v_type, NULL);
+			    ip->i_number, vp->v_type, NULL,
+			    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON);
 		delta = btodb(nsize - osize);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		if (flags & IO_EXT)
@@ -521,7 +534,7 @@ ffs_reallocblks_ufs1(ap)
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp;
-	struct buf *sbp, *ebp;
+	struct buf *sbp, *ebp, *bp;
 	ufs1_daddr_t *bap, *sbap, *ebap;
 	struct cluster_save *buflist;
 	struct ufsmount *ump;
@@ -730,14 +743,29 @@ ffs_reallocblks_ufs1(ap)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
+		bp = buflist->bs_children[i];
 		if (!DOINGSOFTDEP(vp))
+			/*
+			 * The usual case is that a set of N-contiguous blocks
+			 * that was just allocated has been replaced with a
+			 * set of N+1-contiguous blocks. If they are marked as
+			 * B_DELWRI, the current contents have not been written
+			 * to disk. It is possible that the blocks were written
+			 * earlier, but very uncommon. If the blocks have never
+			 * been written, there is no need to send a BIO_DELETE
+			 * for them when they are freed. The gain from avoiding
+			 * the TRIMs for the common case of unwritten blocks
+			 * far exceeds the cost of the write amplification for
+			 * the uncommon case of failing to send a TRIM for the
+			 * blocks that had been written.
+			 */
 			ffs_blkfree(ump, fs, ump->um_devvp,
-			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-			    fs->fs_bsize, ip->i_number, vp->v_type, NULL);
-		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
+			    dbtofsb(fs, bp->b_blkno),
+			    fs->fs_bsize, ip->i_number, vp->v_type, NULL,
+			    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON);
+		bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
-		if (!ffs_checkblk(ip,
-		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
+		if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DEBUG
@@ -771,7 +799,7 @@ ffs_reallocblks_ufs2(ap)
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp;
-	struct buf *sbp, *ebp;
+	struct buf *sbp, *ebp, *bp;
 	ufs2_daddr_t *bap, *sbap, *ebap;
 	struct cluster_save *buflist;
 	struct ufsmount *ump;
@@ -978,14 +1006,29 @@ ffs_reallocblks_ufs2(ap)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
+		bp = buflist->bs_children[i];
 		if (!DOINGSOFTDEP(vp))
+			/*
+			 * The usual case is that a set of N-contiguous blocks
+			 * that was just allocated has been replaced with a
+			 * set of N+1-contiguous blocks. If they are marked as
+			 * B_DELWRI, the current contents have not been written
+			 * to disk. It is possible that the blocks were written
+			 * earlier, but very uncommon. If the blocks have never
+			 * been written, there is no need to send a BIO_DELETE
+			 * for them when they are freed. The gain from avoiding
+			 * the TRIMs for the common case of unwritten blocks
+			 * far exceeds the cost of the write amplification for
+			 * the uncommon case of failing to send a TRIM for the
+			 * blocks that had been written.
+			 */
 			ffs_blkfree(ump, fs, ump->um_devvp,
-			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-			    fs->fs_bsize, ip->i_number, vp->v_type, NULL);
-		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
+			    dbtofsb(fs, bp->b_blkno),
+			    fs->fs_bsize, ip->i_number, vp->v_type, NULL,
+			    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON);
+		bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
-		if (!ffs_checkblk(ip,
-		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
+		if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DEBUG
@@ -1823,8 +1866,7 @@ gotit:
 	/* XXX Fixme. */
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
-		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
-		    size, 0);
+		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0);
 	UFS_LOCK(ump);
 	return (blkno);
 }
@@ -2254,6 +2296,17 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
 	bdwrite(bp);
 }
 
+/*
+ * Structures and routines associated with trim management.
+ */
+MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");
+
+#define	TRIMLIST_HASH(ump, inum) \
+	(&(ump)->um_trimhash[(inum) & (ump)->um_trimlisthashsize])
+
+static void	ffs_blkfree_trim_completed(struct buf *);
+static void	ffs_blkfree_trim_task(void *ctx, int pending __unused);
+
 struct ffs_blkfree_trim_params {
 	struct task task;
 	struct ufsmount *ump;
@@ -2277,7 +2330,7 @@ ffs_blkfree_trim_task(ctx, pending)
 	    tp->inum, tp->pdephd);
 	vn_finished_secondary_write(UFSTOVFS(tp->ump));
 	atomic_add_int(&tp->ump->um_trim_inflight, -1);
-	free(tp, M_TEMP);
+	free(tp, M_TRIM);
 }
 
 static void
@@ -2287,13 +2340,13 @@ ffs_blkfree_trim_completed(bp)
 	struct ffs_blkfree_trim_params *tp;
 
 	tp = bp->b_fsprivate1;
-	free(bp, M_TEMP);
+	free(bp, M_TRIM);
 	TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
 	taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
 }
 
 void
-ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
+ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, trimtype)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
@@ -2302,6 +2355,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
 	ino_t inum;
 	enum vtype vtype;
 	struct workhead *dephd;
+	int trimtype;
 {
 	struct mount *mp;
 	struct buf *bp;
@@ -2319,10 +2373,11 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
 		return;
 	}
 	/*
-	 * Nothing to delay if TRIM is disabled, or the operation is
-	 * performed on the snapshot.
+	 * Nothing to delay if TRIM is not required for this block or TRIM
+	 * is disabled or the operation is performed on a snapshot.
 	 */
-	if (((ump->um_flags) & UM_CANDELETE) == 0 || devvp->v_type == VREG) {
+	if (trimtype == NOTRIM || ((ump->um_flags & UM_CANDELETE) == 0) ||
+	    devvp->v_type == VREG) {
 		ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
 		return;
 	}
@@ -2334,7 +2389,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
 	 * and write some new data into it.
 	 */
 	atomic_add_int(&ump->um_trim_inflight, 1);
-	tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK);
+	tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
 	tp->ump = ump;
 	tp->devvp = devvp;
 	tp->bno = bno;
@@ -2347,7 +2402,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
 	} else
 		tp->pdephd = NULL;
 
-	bp = malloc(sizeof(*bp), M_TEMP, M_WAITOK | M_ZERO);
+	bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
 	bp->b_iocmd = BIO_DELETE;
 	bp->b_iooffset = dbtob(fsbtodb(fs, bno));
 	bp->b_iodone = ffs_blkfree_trim_completed;
@@ -2824,7 +2879,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 	long blkcnt, blksize;
 	struct file *fp, *vfp;
 	cap_rights_t rights;
-	int filetype, error;
+	int filetype, trimtype, error;
 	static struct fileops *origops, bufferedops;
 
 	if (req->newlen > sizeof cmd)
@@ -2956,14 +3011,17 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 		blkno = cmd.value;
 		blkcnt = cmd.size;
 		blksize = fs->fs_frag - (blkno % fs->fs_frag);
+		trimtype = (blksize < blkcnt) ? STARTFREE : SINGLETON;
 		while (blkcnt > 0) {
 			if (blksize > blkcnt)
 				blksize = blkcnt;
 			ffs_blkfree(ump, fs, ump->um_devvp, blkno,
-			    blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL);
+			    blksize * fs->fs_fsize, UFS_ROOTINO,
+			    VDIR, NULL, trimtype);
 			blkno += blksize;
 			blkcnt -= blksize;
 			blksize = fs->fs_frag;
+			trimtype = (blksize < blkcnt) ? CONTINUEFREE : ENDFREE;
 		}
 		break;
 

Modified: head/sys/ufs/ffs/ffs_balloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_balloc.c	Mon Aug  6 20:39:27 2018	(r337395)
+++ head/sys/ufs/ffs/ffs_balloc.c	Mon Aug  6 21:09:11 2018	(r337396)
@@ -553,7 +553,7 @@ fail:
 		lbns_remfree++;
 #endif
 		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
-		    ip->i_number, vp->v_type, NULL);
+		    ip->i_number, vp->v_type, NULL, SINGLETON);
 	}
 	return (error);
 }
@@ -1147,7 +1147,7 @@ fail:
 		lbns_remfree++;
 #endif
 		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
-		    ip->i_number, vp->v_type, NULL);
+		    ip->i_number, vp->v_type, NULL, SINGLETON);
 	}
 	return (error);
 }

Modified: head/sys/ufs/ffs/ffs_extern.h
==============================================================================
--- head/sys/ufs/ffs/ffs_extern.h	Mon Aug  6 20:39:27 2018	(r337395)
+++ head/sys/ufs/ffs/ffs_extern.h	Mon Aug  6 21:09:11 2018	(r337396)
@@ -63,7 +63,7 @@ int	ffs_balloc_ufs2(struct vnode *a_vp, off_t a_starto
             struct ucred *a_cred, int a_flags, struct buf **a_bpp);
 int	ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
 void	ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
-	    ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *);
+	    ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *, int);
 ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
 ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
 int	ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
@@ -111,10 +111,27 @@ vfs_vget_t ffs_vget;
 int	ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int);
 void	process_deferred_inactive(struct mount *mp);
 
+/*
+ * Flags to ffs_vgetf
+ */
 #define	FFSV_FORCEINSMQ	0x0001
 
+/*
+ * Flags to ffs_reload
+ */
 #define	FFSR_FORCE	0x0001
 #define	FFSR_UNSUSPEND	0x0002
+
+/*
+ * Trim type to ffs_blkfree - used to help with BIO_DELETE (trim) requests
+ */
+#define	NOTRIM		1	/* never written, so don't call trim for it */
+#define	SINGLETON	2	/* only block being freed, so trim it now */
+#define	STARTFREE	3	/* beginning to free for this inum */
+#define	CONTINUEFREE	4	/* additional block free for this inum */
+#define	ENDFREE		5	/* last block to free for this inum */
+
+#define	MAXTRIMIO	1024	/* maximum expected outstanding trim requests */
 
 extern struct vop_vector ffs_vnodeops1;
 extern struct vop_vector ffs_fifoops1;

Modified: head/sys/ufs/ffs/ffs_inode.c
==============================================================================
--- head/sys/ufs/ffs/ffs_inode.c	Mon Aug  6 20:39:27 2018	(r337395)
+++ head/sys/ufs/ffs/ffs_inode.c	Mon Aug  6 21:09:11 2018	(r337396)
@@ -195,7 +195,7 @@ ffs_truncate(vp, length, flags, cred)
 	struct ufsmount *ump;
 	int softdeptrunc, journaltrunc;
 	int needextclean, extblocks;
-	int offset, size, level, nblocks;
+	int trimtype, firstfree, offset, size, level, nblocks;
 	int i, error, allerror, indiroff, waitforupdate;
 	off_t osize;
 
@@ -275,7 +275,7 @@ ffs_truncate(vp, length, flags, cred)
 					continue;
 				ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i],
 				    sblksize(fs, osize, i), ip->i_number,
-				    vp->v_type, NULL);
+				    vp->v_type, NULL, SINGLETON);
 			}
 		}
 	}
@@ -523,7 +523,7 @@ ffs_truncate(vp, length, flags, cred)
 				DIP_SET(ip, i_ib[level], 0);
 				ffs_blkfree(ump, fs, ump->um_devvp, bn,
 				    fs->fs_bsize, ip->i_number,
-				    vp->v_type, NULL);
+				    vp->v_type, NULL, SINGLETON);
 				blocksreleased += nblocks;
 			}
 		}
@@ -534,6 +534,7 @@ ffs_truncate(vp, length, flags, cred)
 	/*
 	 * All whole direct blocks or frags.
 	 */
+	firstfree = 1;
 	for (i = UFS_NDADDR - 1; i > lastblock; i--) {
 		long bsize;
 
@@ -542,8 +543,23 @@ ffs_truncate(vp, length, flags, cred)
 			continue;
 		DIP_SET(ip, i_db[i], 0);
 		bsize = blksize(fs, ip, i);
+		if (firstfree) {
+			if (i - 1 == lastblock || DIP(ip, i_db[i - 1]) == 0) {
+				trimtype = SINGLETON;
+			} else {
+				trimtype = STARTFREE;
+				firstfree = 0;
+			}
+		} else {
+			if (i - 1 == lastblock || DIP(ip, i_db[i - 1]) == 0) {
+				trimtype = ENDFREE;
+				firstfree = 1;
+			} else {
+				trimtype = CONTINUEFREE;
+			}
+		}
 		ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number,
-		    vp->v_type, NULL);
+		    vp->v_type, NULL, trimtype);
 		blocksreleased += btodb(bsize);
 	}
 	if (lastblock < 0)
@@ -575,7 +591,8 @@ ffs_truncate(vp, length, flags, cred)
 			 */
 			bn += numfrags(fs, newspace);
 			ffs_blkfree(ump, fs, ump->um_devvp, bn,
-			   oldspace - newspace, ip->i_number, vp->v_type, NULL);
+			   oldspace - newspace, ip->i_number, vp->v_type,
+			   NULL, SINGLETON);
 			blocksreleased += btodb(oldspace - newspace);
 		}
 	}
@@ -636,7 +653,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 	struct fs *fs;
 	struct vnode *vp;
 	caddr_t copy = NULL;
-	int i, nblocks, error = 0, allerror = 0;
+	int i, trimtype, nblocks, firstfree, error = 0, allerror = 0;
 	ufs2_daddr_t nb, nlbn, last;
 	ufs2_daddr_t blkcount, factor, blocksreleased = 0;
 	ufs1_daddr_t *bap1 = NULL;
@@ -719,6 +736,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 	/*
 	 * Recursively free totally unused blocks.
 	 */
+	firstfree = 1;
 	for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
 	    i--, nlbn += factor) {
 		nb = BAP(ip, i);
@@ -730,8 +748,23 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 				allerror = error;
 			blocksreleased += blkcount;
 		}
+		if (firstfree) {
+			if (i - 1 == last || BAP(ip, i - 1) == 0) {
+				trimtype = SINGLETON;
+			} else {
+				trimtype = STARTFREE;
+				firstfree = 0;
+			}
+		} else {
+			if (i - 1 == last || BAP(ip, i - 1) == 0) {
+				trimtype = ENDFREE;
+				firstfree = 1;
+			} else {
+				trimtype = CONTINUEFREE;
+			}
+		}
 		ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize,
-		    ip->i_number, vp->v_type, NULL);
+		    ip->i_number, vp->v_type, NULL, trimtype);
 		blocksreleased += nblocks;
 	}
 

Modified: head/sys/ufs/ffs/ffs_snapshot.c
==============================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c	Mon Aug  6 20:39:27 2018	(r337395)
+++ head/sys/ufs/ffs/ffs_snapshot.c	Mon Aug  6 21:09:11 2018	(r337396)
@@ -583,7 +583,7 @@ loop:
 			if (len != 0 && len < fs->fs_bsize) {
 				ffs_blkfree(ump, copy_fs, vp,
 				    DIP(xp, i_db[loc]), len, xp->i_number,
-				    xvp->v_type, NULL);
+				    xvp->v_type, NULL, SINGLETON);
 				blkno = DIP(xp, i_db[loc]);
 				DIP_SET(xp, i_db[loc], 0);
 			}
@@ -1265,7 +1265,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expung
 		if (blkno == BLK_SNAP)
 			blkno = blkstofrags(fs, lblkno);
 		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
-		    vp->v_type, NULL);
+		    vp->v_type, NULL, SINGLETON);
 	}
 	return (0);
 }
@@ -1549,7 +1549,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expung
 		if (blkno == BLK_SNAP)
 			blkno = blkstofrags(fs, lblkno);
 		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
-		    vp->v_type, NULL);
+		    vp->v_type, NULL, SINGLETON);
 	}
 	return (0);
 }

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c	Mon Aug  6 20:39:27 2018	(r337395)
+++ head/sys/ufs/ffs/ffs_softdep.c	Mon Aug  6 21:09:11 2018	(r337396)
@@ -869,7 +869,7 @@ static	void cancel_allocdirect(struct allocdirectlst *
 	    struct allocdirect *, struct freeblks *);
 static	int check_inode_unwritten(struct inodedep *);
 static	int free_inodedep(struct inodedep *);
-static	void freework_freeblock(struct freework *);
+static	void freework_freeblock(struct freework *, int);
 static	void freework_enqueue(struct freework *);
 static	int handle_workitem_freeblocks(struct freeblks *, int);
 static	int handle_complete_freeblocks(struct freeblks *, int);
@@ -884,7 +884,7 @@ static	struct allocindir *newallocindir(struct inode *
 	    ufs2_daddr_t, ufs_lbn_t);
 static	void handle_workitem_freefrag(struct freefrag *);
 static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
-	    ufs_lbn_t);
+	    ufs_lbn_t, int);
 static	void allocdirect_merge(struct allocdirectlst *,
 	    struct allocdirect *, struct allocdirect *);
 static	struct freefrag *allocindir_merge(struct allocindir *,
@@ -5289,7 +5289,22 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno,
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
 	if (oldblkno && oldblkno != newblkno)
-		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
+		/*
+		 * The usual case is that a smaller fragment that
+		 * was just allocated has been replaced with a bigger
+		 * fragment or a full-size block. If it is marked as
+		 * B_DELWRI, the current contents have not been written
+		 * to disk. It is possible that the block was written
+		 * earlier, but very uncommon. If the block has never
+		 * been written, there is no need to send a BIO_DELETE
+		 * for it when it is freed. The gain from avoiding the
+		 * TRIMs for the common case of unwritten blocks far
+		 * exceeds the cost of the write amplification for the
+		 * uncommon case of failing to send a TRIM for a block
+		 * that had been written.
+		 */
+		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
+		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON);
 	else
 		freefrag = NULL;
 
@@ -5566,11 +5581,12 @@ newjfreefrag(freefrag, ip, blkno, size, lbn)
  * Allocate a new freefrag structure.
  */
 static struct freefrag *
-newfreefrag(ip, blkno, size, lbn)
+newfreefrag(ip, blkno, size, lbn, trimtype)
 	struct inode *ip;
 	ufs2_daddr_t blkno;
 	long size;
 	ufs_lbn_t lbn;
+	int trimtype;
 {
 	struct freefrag *freefrag;
 	struct ufsmount *ump;
@@ -5591,6 +5607,7 @@ newfreefrag(ip, blkno, size, lbn)
 	freefrag->ff_vtype = ITOV(ip)->v_type;
 	freefrag->ff_blkno = blkno;
 	freefrag->ff_fragsize = size;
+	freefrag->ff_trimtype = trimtype;
 
 	if (MOUNTEDSUJ(UFSTOVFS(ump))) {
 		freefrag->ff_jdep = (struct worklist *)
@@ -5636,7 +5653,8 @@ handle_workitem_freefrag(freefrag)
 	}
 	FREE_LOCK(ump);
 	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
-	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
+	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd,
+	   freefrag->ff_trimtype);
 	ACQUIRE_LOCK(ump);
 	WORKITEM_FREE(freefrag, D_FREEFRAG);
 	FREE_LOCK(ump);
@@ -5676,7 +5694,22 @@ softdep_setup_allocext(ip, off, newblkno, oldblkno, ne
 
 	lbn = bp->b_lblkno;
 	if (oldblkno && oldblkno != newblkno)
-		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
+		/*
+		 * The usual case is that a smaller fragment that
+		 * was just allocated has been replaced with a bigger
+		 * fragment or a full-size block. If it is marked as
+		 * B_DELWRI, the current contents have not been written
+		 * to disk. It is possible that the block was written
+		 * earlier, but very uncommon. If the block has never
+		 * been written, there is no need to send a BIO_DELETE
+		 * for it when it is freed. The gain from avoiding the
+		 * TRIMs for the common case of unwritten blocks far
+		 * exceeds the cost of the write amplification for the
+		 * uncommon case of failing to send a TRIM for a block
+		 * that had been written.
+		 */
+		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
+		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON);
 	else
 		freefrag = NULL;
 
@@ -5789,7 +5822,8 @@ newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
 	struct jnewblk *jnewblk;
 
 	if (oldblkno)
-		freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn);
+		freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
+		    SINGLETON);
 	else
 		freefrag = NULL;
 	ACQUIRE_LOCK(ITOUMP(ip));
@@ -7724,8 +7758,9 @@ free_inodedep(inodedep)
  * in memory immediately.
  */
 static void
-freework_freeblock(freework)
+freework_freeblock(freework, trimtype)
 	struct freework *freework;
+	int trimtype;
 {
 	struct freeblks *freeblks;
 	struct jnewblk *jnewblk;
@@ -7779,10 +7814,10 @@ freework_freeblock(freework)
 	FREE_LOCK(ump);
 	freeblks_free(ump, freeblks, btodb(bsize));
 	CTR4(KTR_SUJ,
-	    "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
+	    "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
 	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
 	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
-	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
+	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd, trimtype);
 	ACQUIRE_LOCK(ump);
 	/*
 	 * The jnewblk will be discarded and the bits in the map never
@@ -7835,7 +7870,7 @@ handle_workitem_indirblk(freework)
 		return;
 	}
 	if (freework->fw_off == NINDIR(fs)) {
-		freework_freeblock(freework);
+		freework_freeblock(freework, SINGLETON);
 		return;
 	}
 	freework->fw_state |= INPROGRESS;
@@ -7889,16 +7924,19 @@ handle_workitem_freeblocks(freeblks, flags)
 	struct freeblks *freeblks;
 	int flags;
 {
-	struct freework *freework;
+	struct freework *freework, *prevfreework;
 	struct newblk *newblk;
 	struct allocindir *aip;
 	struct ufsmount *ump;
 	struct worklist *wk;
+	int trimtype;
 
 	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
 	    ("handle_workitem_freeblocks: Journal entries not written."));
 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 	ACQUIRE_LOCK(ump);
+	prevfreework = NULL;
+	trimtype = 0;
 	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		switch (wk->wk_type) {
@@ -7932,16 +7970,26 @@ handle_workitem_freeblocks(freeblks, flags)
 
 		case D_FREEWORK:
 			freework = WK_FREEWORK(wk);
-			if (freework->fw_lbn <= -UFS_NDADDR)
+			if (freework->fw_lbn <= -UFS_NDADDR) {
 				handle_workitem_indirblk(freework);
-			else
-				freework_freeblock(freework);
+				continue;
+			} else if (prevfreework == NULL) {
+				trimtype = SINGLETON;
+			} else if (trimtype == SINGLETON) {
+				freework_freeblock(prevfreework, STARTFREE);
+				trimtype = ENDFREE;
+			} else {
+				freework_freeblock(prevfreework, CONTINUEFREE);
+			}
+			prevfreework = freework;
 			continue;
 		default:
 			panic("handle_workitem_freeblocks: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 		}
 	}
+	if (prevfreework != NULL)
+		freework_freeblock(prevfreework, trimtype);
 	if (freeblks->fb_ref != 0) {
 		freeblks->fb_state &= ~INPROGRESS;
 		wake_worklist(&freeblks->fb_list);
@@ -8080,13 +8128,8 @@ indir_trunc(freework, dbn, lbn)
 	ufs1_daddr_t *bap1;
 	ufs2_daddr_t nb, nnb, *bap2;
 	ufs_lbn_t lbnadd, nlbn;
-	int i, nblocks, ufs1fmt;
-	int freedblocks;
-	int goingaway;
-	int freedeps;
-	int needj;
-	int level;
-	int cnt;
+	int nblocks, ufs1fmt, firstfree, trimtype, freedblocks;
+	int goingaway, freedeps, needj, level, cnt, i;
 
 	freeblks = freework->fw_freeblks;
 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
@@ -8180,6 +8223,7 @@ indir_trunc(freework, dbn, lbn)
 	 * arranges for the current level to be freed when subordinates
 	 * are free when journaling.
 	 */
+	firstfree = 1;
 	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
 		if (i != NINDIR(fs) - 1) {
 			if (ufs1fmt)
@@ -8215,11 +8259,26 @@ indir_trunc(freework, dbn, lbn)
 				freedeps++;
 			}
 			CTR3(KTR_SUJ,
-			    "indir_trunc: ino %d blkno %jd size %ld",
+			    "indir_trunc: ino %jd blkno %jd size %d",
 			    freeblks->fb_inum, nb, fs->fs_bsize);
+			if (firstfree) {
+				if (i == NINDIR(fs) - 1 || nnb == 0) {
+					trimtype = SINGLETON;
+				} else {
+					trimtype = STARTFREE;
+					firstfree = 0;
+				}
+			} else {
+				if (i == NINDIR(fs) - 1 || nnb == 0) {
+					trimtype = ENDFREE;
+					firstfree = 1;
+				} else {
+					trimtype = CONTINUEFREE;
+				}
+			}
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
 			    fs->fs_bsize, freeblks->fb_inum,
-			    freeblks->fb_vtype, &wkhd);
+			    freeblks->fb_vtype, &wkhd, trimtype);
 		}
 	}
 	if (goingaway) {
@@ -8244,7 +8303,7 @@ indir_trunc(freework, dbn, lbn)
 		if (level == 0)
 			freeblks->fb_cgwait += freedeps;
 		if (freework->fw_ref == 0)
-			freework_freeblock(freework);
+			freework_freeblock(freework, SINGLETON);
 		FREE_LOCK(ump);
 		return;
 	}
@@ -8253,10 +8312,10 @@ indir_trunc(freework, dbn, lbn)
 	 */
 	dbn = dbtofsb(fs, dbn);
 	CTR3(KTR_SUJ,
-	    "indir_trunc 2: ino %d blkno %jd size %ld",
+	    "indir_trunc 2: ino %jd blkno %jd size %d",
 	    freeblks->fb_inum, dbn, fs->fs_bsize);
 	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
-	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
+	    freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON);
 	/* Non SUJ softdep does single-threaded truncations. */
 	if (freework->fw_blkno == dbn) {
 		freework->fw_state |= ALLCOMPLETE;

Modified: head/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vfsops.c	Mon Aug  6 20:39:27 2018	(r337395)
+++ head/sys/ufs/ffs/ffs_vfsops.c	Mon Aug  6 21:09:11 2018	(r337396)
@@ -978,6 +978,8 @@ ffs_mountfs(devvp, mp, td)
 			    taskqueue_thread_enqueue, &ump->um_trim_tq);
 			taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS,
 			    "%s trim", mp->mnt_stat.f_mntonname);
+			ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM,
+			    &ump->um_trimlisthashsize);
 		}
 	}
 
@@ -1256,6 +1258,7 @@ ffs_unmount(mp, mntflags)
 			pause("ufsutr", hz);
 		taskqueue_drain_all(ump->um_trim_tq);
 		taskqueue_free(ump->um_trim_tq);
+		free (ump->um_trimhash, M_TRIM);
 	}
 	g_topology_lock();
 	if (ump->um_fsckpid > 0) {

Modified: head/sys/ufs/ffs/softdep.h
==============================================================================
--- head/sys/ufs/ffs/softdep.h	Mon Aug  6 20:39:27 2018	(r337395)
+++ head/sys/ufs/ffs/softdep.h	Mon Aug  6 21:09:11 2018	(r337396)
@@ -557,6 +557,7 @@ struct freefrag {
 	long	ff_fragsize;		/* size of fragment being deleted */
 	ino_t	ff_inum;		/* owning inode number */
 	enum	vtype ff_vtype;		/* owning inode's file type */
+	int	ff_trimtype;		/* trim status when deleted */
 };
 
 /*

Modified: head/sys/ufs/ufs/ufsmount.h
==============================================================================
--- head/sys/ufs/ufs/ufsmount.h	Mon Aug  6 20:39:27 2018	(r337395)
+++ head/sys/ufs/ufs/ufsmount.h	Mon Aug  6 21:09:11 2018	(r337396)
@@ -47,6 +47,7 @@ struct ufs_args {
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_UFSMNT);
+MALLOC_DECLARE(M_TRIM);
 #endif
 
 struct buf;
@@ -63,6 +64,7 @@ struct inodedep;
 
 TAILQ_HEAD(inodedeplst, inodedep);
 LIST_HEAD(bmsafemaphd, bmsafemap);
+LIST_HEAD(trimlist_hashhead, ffs_blkfree_trim_params);
 
 /*
  * This structure describes the UFS specific mount structure data.
@@ -101,6 +103,8 @@ struct ufsmount {
 	u_int	um_flags;			/* (i) filesystem flags */
 	u_int	um_trim_inflight;		/* (a) outstanding trim count */
 	struct	taskqueue *um_trim_tq;		/* (c) trim request queue */
+	struct	trimlist_hashhead *um_trimhash;	/* (i) trimlist hash table */
+	u_long	um_trimlisthashsize;		/* (i) trim hash table size-1 */
 						/* (c) - below function ptrs */
 	int	(*um_balloc)(struct vnode *, off_t, int, struct ucred *,
 		    int, struct buf **);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201808062109.w76L9B9h021616>