From: Kirk McKusick <mckusick@FreeBSD.org>
Date: Mon, 18 Aug 2014 22:53:48 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
	svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org
Subject: svn commit: r270157 - in stable/10/sys: kern ufs/ffs
Message-Id: <201408182253.s7IMrmSF044491@svn.freebsd.org>

Author: mckusick
Date: Mon Aug 18 22:53:48 2014
New Revision: 270157
URL: http://svnweb.freebsd.org/changeset/base/270157

Log:
  MFC of 269533 (by mckusick):

    Add support for multi-threading of soft updates. Replace the single
    soft updates thread with a thread per FFS-filesystem mount point.
    The threads are associated with the bufdaemon process.

    Reviewed by:  kib
    Tested by:    Peter Holm and Scott Long
    MFC after:    2 weeks
    Sponsored by: Netflix

  MFC of 269853 (by kib):

    Revision r269457 removed the Giant lock around the mount and unmount
    code, but r269533, which was tested before r269457 was committed,
    implicitly relied on Giant to protect the manipulations of the
    softdepmounts list. Now use the softdep global lock consistently to
    protect the list structure.

    Insert the new struct mount_softdeps into softdepmounts only after
    it is sufficiently initialized, to prevent softdep_speedup() from
    accessing bare memory. Similarly, remove the struct mount_softdeps
    of the filesystem being unmounted from the tailq before destroying
    the structure's rwlock.

    Reported and tested by: pho
    Reviewed by:  mckusick
    Sponsored by: The FreeBSD Foundation
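[Editor's note: the ordering rule in the second merged change is the usual
publish/unpublish discipline for objects on a shared list: fully construct an
object before linking it onto the list, and unlink it before tearing it down.
Below is a minimal userspace sketch of that discipline, using <sys/queue.h>
and pthreads in place of the kernel's mtx(9)/rwlock(9) primitives; the
function names here are illustrative, not the kernel's.

	#include <sys/queue.h>
	#include <pthread.h>
	#include <stdlib.h>

	struct mount_softdeps {
		pthread_rwlock_t sd_fslock;	/* per-filesystem lock */
		TAILQ_ENTRY(mount_softdeps) sd_next;
	};

	/* Global lock protecting the list, like the softdep global "lk". */
	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
	static TAILQ_HEAD(, mount_softdeps) softdepmounts =
	    TAILQ_HEAD_INITIALIZER(softdepmounts);

	/*
	 * Publish only after the structure is fully initialized, so a
	 * list walker never sees bare memory.
	 */
	static struct mount_softdeps *
	sdp_mount(void)
	{
		struct mount_softdeps *sdp;

		if ((sdp = calloc(1, sizeof(*sdp))) == NULL)
			return (NULL);
		pthread_rwlock_init(&sdp->sd_fslock, NULL); /* init first... */
		pthread_mutex_lock(&list_lock);
		TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); /* ...then publish */
		pthread_mutex_unlock(&list_lock);
		return (sdp);
	}

	/* Unpublish before tearing down, in the reverse order. */
	static void
	sdp_unmount(struct mount_softdeps *sdp)
	{
		pthread_mutex_lock(&list_lock);
		TAILQ_REMOVE(&softdepmounts, sdp, sd_next); /* unlink first... */
		pthread_mutex_unlock(&list_lock);
		pthread_rwlock_destroy(&sdp->sd_fslock); /* ...then destroy */
		free(sdp);
	}

In the commit itself the same ordering appears in softdep_mount(), which calls
rw_init() before TAILQ_INSERT_TAIL under the global lock, and in
softdep_unmount(), which does TAILQ_REMOVE under the global lock before
rw_destroy().]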
Modified:
  stable/10/sys/kern/vfs_bio.c
  stable/10/sys/ufs/ffs/ffs_softdep.c
  stable/10/sys/ufs/ffs/softdep.h
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/kern/vfs_bio.c
==============================================================================
--- stable/10/sys/kern/vfs_bio.c	Mon Aug 18 21:07:12 2014	(r270156)
+++ stable/10/sys/kern/vfs_bio.c	Mon Aug 18 22:53:48 2014	(r270157)
@@ -98,7 +98,8 @@ struct buf_ops buf_ops_bio = {
 struct buf *buf;		/* buffer header pool */
 caddr_t unmapped_buf;
 
-static struct proc *bufdaemonproc;
+/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
+struct proc *bufdaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);

Modified: stable/10/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- stable/10/sys/ufs/ffs/ffs_softdep.c	Mon Aug 18 21:07:12 2014	(r270156)
+++ stable/10/sys/ufs/ffs/ffs_softdep.c	Mon Aug 18 22:53:48 2014	(r270157)
@@ -908,9 +908,9 @@ static void add_to_worklist(struct workl
 static void wake_worklist(struct worklist *);
 static void wait_worklist(struct worklist *, char *);
 static void remove_from_worklist(struct worklist *);
-static void softdep_flush(void);
+static void softdep_flush(void *);
 static void softdep_flushjournal(struct mount *);
-static int softdep_speedup(void);
+static int softdep_speedup(struct ufsmount *);
 static void worklist_speedup(struct mount *);
 static int journal_mount(struct mount *, struct fs *, struct ucred *);
 static void journal_unmount(struct ufsmount *);
@@ -963,18 +963,21 @@ static int softdep_count_dependencies(st
 /*
  * Global lock over all of soft updates.
  */
-static struct rwlock lk;
-RW_SYSINIT(softdep_lock, &lk, "Softdep Lock");
+static struct mtx lk;
+MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
+
+#define ACQUIRE_GBLLOCK(lk)	mtx_lock(lk)
+#define FREE_GBLLOCK(lk)	mtx_unlock(lk)
+#define GBLLOCK_OWNED(lk)	mtx_assert((lk), MA_OWNED)
 
 /*
- * Allow per-filesystem soft-updates locking.
- * For now all use the same global lock defined above.
+ * Per-filesystem soft-updates locking.
  */
-#define LOCK_PTR(ump)		((ump)->um_softdep->sd_fslock)
-#define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock((ump)->um_softdep->sd_fslock)
-#define ACQUIRE_LOCK(ump)	rw_wlock((ump)->um_softdep->sd_fslock)
-#define FREE_LOCK(ump)		rw_wunlock((ump)->um_softdep->sd_fslock)
-#define LOCK_OWNED(ump)		rw_assert((ump)->um_softdep->sd_fslock, \
+#define LOCK_PTR(ump)		(&(ump)->um_softdep->sd_fslock)
+#define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock(&(ump)->um_softdep->sd_fslock)
+#define ACQUIRE_LOCK(ump)	rw_wlock(&(ump)->um_softdep->sd_fslock)
+#define FREE_LOCK(ump)		rw_wunlock(&(ump)->um_softdep->sd_fslock)
+#define LOCK_OWNED(ump)		rw_assert(&(ump)->um_softdep->sd_fslock, \
 				    RA_WLOCKED)
 
 #define	BUF_AREC(bp)		lockallowrecurse(&(bp)->b_lock)
@@ -1179,7 +1182,7 @@ workitem_free(item, type)
 	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
 	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
 	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
-	dep_current[item->wk_type]--;
+	atomic_subtract_long(&dep_current[item->wk_type], 1);
 	ump->softdep_curdeps[item->wk_type] -= 1;
 	free(item, DtoM(type));
 }
@@ -1197,11 +1200,13 @@ workitem_alloc(item, type, mp)
 	item->wk_state = 0;
 
 	ump = VFSTOUFS(mp);
-	ACQUIRE_LOCK(ump);
+	ACQUIRE_GBLLOCK(&lk);
 	dep_current[type]++;
 	if (dep_current[type] > dep_highuse[type])
 		dep_highuse[type] = dep_current[type];
 	dep_total[type]++;
+	FREE_GBLLOCK(&lk);
+	ACQUIRE_LOCK(ump);
 	ump->softdep_curdeps[type] += 1;
 	ump->softdep_deps++;
 	ump->softdep_accdeps++;
@@ -1225,11 +1230,13 @@ workitem_reassign(item, newtype)
 	KASSERT(dep_current[item->wk_type] > 0,
 	    ("workitem_reassign: %s: dep_current[%s] going negative",
 	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
-	dep_current[item->wk_type]--;
+	ACQUIRE_GBLLOCK(&lk);
 	dep_current[newtype]++;
+	dep_current[item->wk_type]--;
 	if (dep_current[newtype] > dep_highuse[newtype])
 		dep_highuse[newtype] = dep_current[newtype];
 	dep_total[newtype]++;
+	FREE_GBLLOCK(&lk);
 	item->wk_type = newtype;
 }
 
@@ -1237,13 +1244,10 @@ workitem_reassign(item, newtype)
 /*
  * Workitem queue management
  */
 static int max_softdeps;	/* maximum number of structs before slowdown */
-static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
 static int tickdelay = 2;	/* number of ticks to pause during slowdown */
 static int proc_waiting;	/* tracks whether we have a timeout posted */
 static int *stat_countp;	/* statistic to count in proc_waiting timeout */
 static struct callout softdep_callout;
-static struct mount *req_pending;
-#define ALLCLEAN ((struct mount *)-1)
 static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
 static int req_clear_remove;	/* syncer process flush some freeblks */
 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
@@ -1251,7 +1255,7 @@ static int softdep_flushcache = 0; /* Sh
 /*
  * runtime statistics
  */
-static int stat_softdep_mounts;	/* number of softdep mounted filesystems */
+static int stat_flush_threads;	/* number of softdep flushing threads */
 static int stat_worklist_push;	/* number of worklist cleanups */
 static int stat_blk_limit_push;	/* number of times block limit neared */
 static int stat_ino_limit_push;	/* number of times inode limit neared */
@@ -1282,10 +1286,8 @@ SYSCTL_INT(_debug_softdep, OID_AUTO, max
     &max_softdeps, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
     &tickdelay, 0, "");
-SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
-    &maxindirdeps, 0, "");
-SYSCTL_INT(_debug_softdep, OID_AUTO, softdep_mounts, CTLFLAG_RD,
-    &stat_softdep_mounts, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
+    &stat_flush_threads, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
     &stat_worklist_push, 0,"");
 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
@@ -1345,53 +1347,67 @@ SYSCTL_DECL(_vfs_ffs);
 static int compute_summary_at_mount = 0;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
     &compute_summary_at_mount, 0, "Recompute summary at mount");
-static struct proc *softdepproc;
-static struct kproc_desc softdep_kp = {
-	"softdepflush",
-	softdep_flush,
-	&softdepproc
-};
-SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
-    &softdep_kp);
-
+static int print_threads = 0;
+SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
+    &print_threads, 0, "Notify flusher thread start/stop");
+
+/* List of all filesystems mounted with soft updates */
+static TAILQ_HEAD(, mount_softdeps) softdepmounts;
+
+/*
+ * This function cleans the worklist for a filesystem.
+ * Each filesystem running with soft dependencies gets its own
+ * thread to run in this function. The thread is started up in
+ * softdep_mount and shutdown in softdep_unmount. They show up
+ * as part of the kernel "bufdaemon" process whose process
+ * entry is available in bufdaemonproc.
+ */
+static int searchfailed;
+extern struct proc *bufdaemonproc;
 static void
-softdep_flush(void)
+softdep_flush(addr)
+	void *addr;
 {
-	struct mount *nmp;
 	struct mount *mp;
-	struct ufsmount *ump;
 	struct thread *td;
-	int remaining;
-	int progress;
+	struct ufsmount *ump;
 
 	td = curthread;
 	td->td_pflags |= TDP_NORUNNINGBUF;
-
+	mp = (struct mount *)addr;
+	ump = VFSTOUFS(mp);
+	atomic_add_int(&stat_flush_threads, 1);
+	if (print_threads) {
+		if (stat_flush_threads == 1)
+			printf("Running %s at pid %d\n",
+			    bufdaemonproc->p_comm, bufdaemonproc->p_pid);
+		printf("Start thread %s\n", td->td_name);
+	}
 	for (;;) {
-		kproc_suspend_check(softdepproc);
-		remaining = progress = 0;
-		mtx_lock(&mountlist_mtx);
-		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
-			nmp = TAILQ_NEXT(mp, mnt_list);
-			if (MOUNTEDSOFTDEP(mp) == 0)
-				continue;
-			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
-				continue;
-			ump = VFSTOUFS(mp);
-			progress += softdep_process_worklist(mp, 0);
-			remaining += ump->softdep_on_worklist;
-			mtx_lock(&mountlist_mtx);
-			nmp = TAILQ_NEXT(mp, mnt_list);
-			vfs_unbusy(mp);
-		}
-		mtx_unlock(&mountlist_mtx);
-		if (remaining && progress)
+		while (softdep_process_worklist(mp, 0) > 0 ||
+		    (MOUNTEDSUJ(mp) &&
+		    VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
+			kthread_suspend_check();
+		ACQUIRE_LOCK(ump);
+		if ((ump->softdep_flags & FLUSH_CLEANUP) == 0)
+			msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
+			    "sdflush", hz / 2);
+		ump->softdep_flags &= ~FLUSH_CLEANUP;
+		/*
+		 * Check to see if we are done and need to exit.
+		 */
+		if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
+			FREE_LOCK(ump);
 			continue;
-		rw_wlock(&lk);
-		if (req_pending == NULL)
-			msleep(&req_pending, &lk, PVM, "sdflush", hz);
-		req_pending = NULL;
-		rw_wunlock(&lk);
+		}
+		ump->softdep_flags &= ~FLUSH_EXIT;
+		FREE_LOCK(ump);
+		wakeup(&ump->softdep_flags);
+		if (print_threads)
+			printf("Stop thread %s: searchfailed %d, did cleanups %d\n",
+			    td->td_name, searchfailed,
+			    ump->um_softdep->sd_cleanups);
+		atomic_subtract_int(&stat_flush_threads, 1);
+		kthread_exit();
+		panic("kthread_exit failed\n");
 	}
 }
 
@@ -1399,19 +1415,70 @@ static void
 worklist_speedup(mp)
 	struct mount *mp;
 {
-	rw_assert(&lk, RA_WLOCKED);
-	if (req_pending == 0) {
-		req_pending = mp;
-		wakeup(&req_pending);
+	struct ufsmount *ump;
+
+	ump = VFSTOUFS(mp);
+	LOCK_OWNED(ump);
+	if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) {
+		ump->softdep_flags |= FLUSH_CLEANUP;
+		if (ump->softdep_flushtd->td_wchan == &ump->softdep_flushtd)
+			wakeup(&ump->softdep_flushtd);
 	}
 }
 
 static int
-softdep_speedup(void)
+softdep_speedup(ump)
+	struct ufsmount *ump;
 {
+	struct ufsmount *altump;
+	struct mount_softdeps *sdp;
 
-	worklist_speedup(ALLCLEAN);
+	LOCK_OWNED(ump);
+	worklist_speedup(ump->um_mountp);
 	bd_speedup();
+	/*
+	 * If we have global shortages, then we need other
+	 * filesystems to help with the cleanup. Here we wakeup a
+	 * flusher thread for a filesystem that is over its fair
+	 * share of resources.
+	 */
+	if (req_clear_inodedeps || req_clear_remove) {
+		ACQUIRE_GBLLOCK(&lk);
+		TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
+			if ((altump = sdp->sd_ump) == ump)
+				continue;
+			if (((req_clear_inodedeps &&
+			    altump->softdep_curdeps[D_INODEDEP] >
+			    max_softdeps / stat_flush_threads) ||
+			    (req_clear_remove &&
+			    altump->softdep_curdeps[D_DIRREM] >
+			    (max_softdeps / 2) / stat_flush_threads)) &&
+			    TRY_ACQUIRE_LOCK(altump))
+				break;
+		}
+		if (sdp == NULL) {
+			searchfailed++;
+			FREE_GBLLOCK(&lk);
+		} else {
+			/*
+			 * Move to the end of the list so we pick a
+			 * different one on our next try.
+			 */
+			TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
+			TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
+			FREE_GBLLOCK(&lk);
+			if ((altump->softdep_flags &
+			    (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) {
+				altump->softdep_flags |= FLUSH_CLEANUP;
+				altump->um_softdep->sd_cleanups++;
+				if (altump->softdep_flushtd->td_wchan ==
				    &altump->softdep_flushtd) {
+					wakeup(&altump->softdep_flushtd);
+				}
+			}
+			FREE_LOCK(altump);
+		}
+	}
 	return (speedup_syncer());
 }
 
@@ -2127,9 +2194,14 @@ inodedep_lookup(mp, inum, flags, inodede
 	if ((flags & DEPALLOC) == 0)
 		return (0);
 	/*
-	 * If we are over our limit, try to improve the situation.
-	 */
-	if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0)
+	 * If the system is over its limit and our filesystem is
+	 * responsible for more than our share of that usage and
+	 * we are not in a rush, request some inodedep cleanup.
+	 */
+	while (dep_current[D_INODEDEP] > max_softdeps &&
+	    (flags & NODELAY) == 0 &&
+	    ump->softdep_curdeps[D_INODEDEP] >
+	    max_softdeps / stat_flush_threads)
 		request_cleanup(mp, FLUSH_INODES);
 	FREE_LOCK(ump);
 	inodedep = malloc(sizeof(struct inodedep),
@@ -2321,6 +2393,7 @@ void
 softdep_initialize()
 {
 
+	TAILQ_INIT(&softdepmounts);
 	max_softdeps = desiredvnodes * 4;
 
 	/* initialise bioops hack */
@@ -2379,7 +2452,8 @@ softdep_mount(devvp, mp, fs, cred)
 	ump = VFSTOUFS(mp);
 	ump->um_softdep = sdp;
 	MNT_IUNLOCK(mp);
-	LOCK_PTR(ump) = &lk;
+	rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
+	sdp->sd_ump = ump;
 	LIST_INIT(&ump->softdep_workitem_pending);
 	LIST_INIT(&ump->softdep_journal_pending);
 	TAILQ_INIT(&ump->softdep_unlinked);
@@ -2404,13 +2478,21 @@ softdep_mount(devvp, mp, fs, cred)
 	ump->indir_hash_size = i - 1;
 	for (i = 0; i <= ump->indir_hash_size; i++)
 		TAILQ_INIT(&ump->indir_hashtbl[i]);
+	ACQUIRE_GBLLOCK(&lk);
+	TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
+	FREE_GBLLOCK(&lk);
 	if ((fs->fs_flags & FS_SUJ) &&
 	    (error = journal_mount(mp, fs, cred)) != 0) {
 		printf("Failed to start journal: %d\n", error);
 		softdep_unmount(mp);
 		return (error);
 	}
-	atomic_add_int(&stat_softdep_mounts, 1);
+	/*
+	 * Start our flushing thread in the bufdaemon process.
+	 */
+	kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
+	    &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
+	    mp->mnt_stat.f_mntonname);
 	/*
 	 * When doing soft updates, the counters in the
 	 * superblock may have gotten out of sync. Recomputation
@@ -2466,7 +2548,26 @@ softdep_unmount(mp)
 		MNT_IUNLOCK(mp);
 		journal_unmount(ump);
 	}
-	atomic_subtract_int(&stat_softdep_mounts, 1);
+	/*
+	 * Shut down our flushing thread. The check for NULL is needed in
+	 * case softdep_mount errors out before the thread has been created.
+	 */
+	if (ump->softdep_flushtd != NULL) {
+		ACQUIRE_LOCK(ump);
+		ump->softdep_flags |= FLUSH_EXIT;
+		wakeup(&ump->softdep_flushtd);
+		msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
+		    "sdwait", 0);
+		KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
+		    ("Thread shutdown failed"));
+	}
+	/*
+	 * Free up our resources.
+	 */
+	ACQUIRE_GBLLOCK(&lk);
+	TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
+	FREE_GBLLOCK(&lk);
+	rw_destroy(LOCK_PTR(ump));
 	hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
 	hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
 	hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
@@ -2789,7 +2890,7 @@ journal_space(ump, thresh)
 	 */
 	limit = (max_softdeps / 10) * 9;
 	if (dep_current[D_INODEDEP] > limit &&
-	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_softdep_mounts)
+	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
 		return (0);
 	if (thresh)
 		thresh = jblocks->jb_min;
@@ -2814,7 +2915,7 @@ journal_suspend(ump)
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 		stat_journal_min++;
 		mp->mnt_kern_flag |= MNTK_SUSPEND;
-		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
+		mp->mnt_susp_owner = ump->softdep_flushtd;
 	}
 	jblocks->jb_suspended = 1;
 	MNT_IUNLOCK(mp);
@@ -2889,7 +2990,7 @@ softdep_prealloc(vp, waitok)
 	process_removes(vp);
 	process_truncates(vp);
 	if (journal_space(ump, 0) == 0) {
-		softdep_speedup();
+		softdep_speedup(ump);
 		if (journal_space(ump, 1) == 0)
 			journal_suspend(ump);
 	}
@@ -2933,10 +3034,10 @@ softdep_prelink(dvp, vp)
 	}
 	process_removes(dvp);
 	process_truncates(dvp);
-	softdep_speedup();
+	softdep_speedup(ump);
 	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
 	if (journal_space(ump, 0) == 0) {
-		softdep_speedup();
+		softdep_speedup(ump);
 		if (journal_space(ump, 1) == 0)
 			journal_suspend(ump);
 	}
@@ -3258,7 +3359,7 @@ softdep_process_journal(mp, needwk, flag
 		if (flags != MNT_WAIT)
 			break;
 		printf("softdep: Out of journal space!\n");
-		softdep_speedup();
+		softdep_speedup(ump);
 		msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
 	}
 	FREE_LOCK(ump);
@@ -3971,7 +4072,7 @@ free_freedep(freedep)
 /*
  * Allocate a new freework structure that may be a level in an indirect
  * when parent is not NULL or a top level block when it is.  The top level
- * freework structures are allocated without the soft updates lock held
+ * freework structures are allocated without the per-filesystem lock held
 * and before the freeblks is visible outside of softdep_setup_freeblocks().
 */
 static struct freework *
@@ -4040,7 +4141,7 @@ cancel_jfreeblk(freeblks, blkno)
 
 /*
 * Allocate a new jfreeblk to journal top level block pointer when truncating
- * a file.  The caller must add this to the worklist when the soft updates
+ * a file.  The caller must add this to the worklist when the per-filesystem
 * lock is held.
 */
 static struct jfreeblk *
@@ -7450,7 +7551,7 @@ softdep_freefile(pvp, ino, mode)
 		clear_unlinked_inodedep(inodedep);
 		/*
 		 * Re-acquire inodedep as we've dropped the
-		 * soft updates lock in clear_unlinked_inodedep().
+		 * per-filesystem lock in clear_unlinked_inodedep().
 		 */
 		inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
 	}
@@ -7996,10 +8097,8 @@ indir_trunc(freework, dbn, lbn)
 		 * If we're goingaway, free the indirdep.  Otherwise it will
 		 * linger until the write completes.
 		 */
-		if (goingaway) {
+		if (goingaway)
 			free_indirdep(indirdep);
-			ump->softdep_numindirdeps -= 1;
-		}
 	}
 	FREE_LOCK(ump);
 	/* Initialize pointers depending on block size. */
@@ -8171,7 +8270,7 @@ cancel_allocindir(aip, bp, freeblks, tru
 * Create the mkdir dependencies for . and .. in a new directory.  Link them
 * in to a newdirblk so any subsequent additions are tracked properly.  The
 * caller is responsible for adding the mkdir1 dependency to the journal
- * and updating id_mkdiradd.  This function returns with the soft updates
+ * and updating id_mkdiradd.  This function returns with the per-filesystem
 * lock held.
 */
 static struct mkdir *
@@ -8989,12 +9088,16 @@ newdirrem(bp, dp, ip, isrmdir, prevdirre
 		panic("newdirrem: whiteout");
 	dvp = ITOV(dp);
 	/*
-	 * If we are over our limit, try to improve the situation.
+	 * If the system is over its limit and our filesystem is
+	 * responsible for more than our share of that usage and
+	 * we are not a snapshot, request some inodedep cleanup.
 	 * Limiting the number of dirrem structures will also limit
 	 * the number of freefile and freeblks structures.
 	 */
 	ACQUIRE_LOCK(ip->i_ump);
-	if (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2)
+	while (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2 &&
+	    ip->i_ump->softdep_curdeps[D_DIRREM] >
+	    (max_softdeps / 2) / stat_flush_threads)
 		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
 	FREE_LOCK(ip->i_ump);
 	dirrem = malloc(sizeof(struct dirrem),
@@ -9945,7 +10048,7 @@ initiate_write_filepage(pagedep, bp)
 		 * Wait for all journal remove dependencies to hit the disk.
 		 * We can not allow any potentially conflicting directory adds
 		 * to be visible before removes and rollback is too difficult.
-		 * The soft updates lock may be dropped and re-acquired, however
+		 * The per-filesystem lock may be dropped and re-acquired, however
 		 * we hold the buf locked so the dependency can not go away.
 		 */
 		LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
@@ -10409,7 +10512,6 @@ cancel_indirdep(indirdep, bp, freeblks)
 		LIST_REMOVE(indirdep, ir_next);
 	}
 	indirdep->ir_state |= GOINGAWAY;
-	VFSTOUFS(indirdep->ir_list.wk_mp)->softdep_numindirdeps += 1;
 	/*
 	 * Pass in bp for blocks still have journal writes
 	 * pending so we can cancel them on their own.
@@ -10836,7 +10938,7 @@ softdep_disk_write_complete(bp)
 	ACQUIRE_LOCK(ump);
 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		WORKLIST_REMOVE(wk);
-		dep_write[wk->wk_type]++;
+		atomic_add_long(&dep_write[wk->wk_type], 1);
 		if (wk == owk)
 			panic("duplicate worklist: %p\n", wk);
 		owk = wk;
@@ -11519,7 +11621,7 @@ diradd_inode_written(dap, inodedep)
 
 /*
 * Returns true if the bmsafemap will have rollbacks when written.  Must only
- * be called with the soft updates lock and the buf lock on the cg held.
+ * be called with the per-filesystem lock and the buf lock on the cg held.
 */
 static int
 bmsafemap_backgroundwrite(bmsafemap, bp)
@@ -12943,18 +13045,42 @@ softdep_slowdown(vp)
 		if (journal_space(ump, 0) == 0)
 			jlow = 1;
 	}
+	/*
+	 * If the system is under its limits and our filesystem is
+	 * not responsible for more than our share of the usage and
+	 * we are not low on journal space, then no need to slow down.
+	 */
 	max_softdeps_hard = max_softdeps * 11 / 10;
 	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
 	    dep_current[D_INODEDEP] < max_softdeps_hard &&
-	    VFSTOUFS(vp->v_mount)->softdep_numindirdeps < maxindirdeps &&
-	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) {
+	    dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
+	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
+	    ump->softdep_curdeps[D_DIRREM] <
+	    (max_softdeps_hard / 2) / stat_flush_threads &&
+	    ump->softdep_curdeps[D_INODEDEP] <
+	    max_softdeps_hard / stat_flush_threads &&
+	    ump->softdep_curdeps[D_INDIRDEP] <
+	    (max_softdeps_hard / 1000) / stat_flush_threads &&
+	    ump->softdep_curdeps[D_FREEBLKS] <
+	    max_softdeps_hard / stat_flush_threads) {
 		FREE_LOCK(ump);
 		return (0);
 	}
-	if (VFSTOUFS(vp->v_mount)->softdep_numindirdeps >= maxindirdeps || jlow)
-		softdep_speedup();
+	/*
+	 * If the journal is low or our filesystem is over its limit
+	 * then speedup the cleanup.
+	 */
+	if (ump->softdep_curdeps[D_INDIRDEP] <
+	    (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
+		softdep_speedup(ump);
 	stat_sync_limit_hit += 1;
 	FREE_LOCK(ump);
+	/*
+	 * We only slow down the rate at which new dependencies are
+	 * generated if we are not using journaling. With journaling,
+	 * the cleanup should always be sufficient to keep things
+	 * under control.
+	 */
 	if (DOINGSUJ(vp))
 		return (0);
 	return (1);
@@ -13012,13 +13138,12 @@ softdep_request_cleanup(fs, vp, cred, re
 		return (0);
 	}
 	/*
-	 * If we are in need of resources, consider pausing for
-	 * tickdelay to give ourselves some breathing room.
+	 * If we are in need of resources, start by cleaning up
+	 * any block removals associated with our inode.
 	 */
 	ACQUIRE_LOCK(ump);
 	process_removes(vp);
 	process_truncates(vp);
-	request_cleanup(UFSTOVFS(ump), resource);
 	FREE_LOCK(ump);
 	/*
 	 * Now clean up at least as many resources as we will need.
@@ -13151,7 +13276,7 @@ request_cleanup(mp, resource)
 	 * Next, we attempt to speed up the syncer process. If that
 	 * is successful, then we allow the process to continue.
 	 */
-	if (softdep_speedup() &&
+	if (softdep_speedup(ump) &&
 	    resource != FLUSH_BLOCKS_WAIT &&
 	    resource != FLUSH_INODES_WAIT)
 		return(0);
@@ -13169,15 +13294,19 @@ request_cleanup(mp, resource)
 
 	case FLUSH_INODES:
 	case FLUSH_INODES_WAIT:
+		ACQUIRE_GBLLOCK(&lk);
 		stat_ino_limit_push += 1;
 		req_clear_inodedeps += 1;
+		FREE_GBLLOCK(&lk);
 		stat_countp = &stat_ino_limit_hit;
 		break;
 
 	case FLUSH_BLOCKS:
 	case FLUSH_BLOCKS_WAIT:
+		ACQUIRE_GBLLOCK(&lk);
 		stat_blk_limit_push += 1;
 		req_clear_remove += 1;
+		FREE_GBLLOCK(&lk);
 		stat_countp = &stat_blk_limit_hit;
 		break;
 
@@ -13188,6 +13317,8 @@ request_cleanup(mp, resource)
 	 * Hopefully the syncer daemon will catch up and awaken us.
 	 * We wait at most tickdelay before proceeding in any case.
 	 */
+	ACQUIRE_GBLLOCK(&lk);
+	FREE_LOCK(ump);
 	proc_waiting += 1;
 	if (callout_pending(&softdep_callout) == FALSE)
 		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
@@ -13195,6 +13326,8 @@ request_cleanup(mp, resource)
 
 	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
 	proc_waiting -= 1;
+	FREE_GBLLOCK(&lk);
+	ACQUIRE_LOCK(ump);
 	return (1);
 }
 
@@ -13208,16 +13341,13 @@ pause_timer(arg)
 	void *arg;
 {
 
-	rw_assert(&lk, RA_WLOCKED);
+	GBLLOCK_OWNED(&lk);
 	/*
 	 * The callout_ API has acquired mtx and will hold it around this
 	 * function call.
 	 */
-	*stat_countp += 1;
-	wakeup_one(&proc_waiting);
-	if (proc_waiting > 0)
-		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
-		    pause_timer, 0);
+	*stat_countp += proc_waiting;
+	wakeup(&proc_waiting);
 }
 
 /*
@@ -13228,7 +13358,6 @@ check_clear_deps(mp)
 	struct mount *mp;
 {
 
-	rw_assert(&lk, RA_WLOCKED);
 	/*
 	 * If we are suspended, it may be because of our using
 	 * too many inodedeps, so help clear them out.
@@ -13238,16 +13367,22 @@ check_clear_deps(mp)
 	/*
 	 * General requests for cleanup of backed up dependencies
 	 */
+	ACQUIRE_GBLLOCK(&lk);
 	if (req_clear_inodedeps) {
 		req_clear_inodedeps -= 1;
+		FREE_GBLLOCK(&lk);
 		clear_inodedeps(mp);
-		wakeup_one(&proc_waiting);
+		ACQUIRE_GBLLOCK(&lk);
+		wakeup(&proc_waiting);
 	}
 	if (req_clear_remove) {
 		req_clear_remove -= 1;
+		FREE_GBLLOCK(&lk);
 		clear_remove(mp);
-		wakeup_one(&proc_waiting);
+		ACQUIRE_GBLLOCK(&lk);
+		wakeup(&proc_waiting);
 	}
+	FREE_GBLLOCK(&lk);
 }
 
 /*

Modified: stable/10/sys/ufs/ffs/softdep.h
==============================================================================
--- stable/10/sys/ufs/ffs/softdep.h	Mon Aug 18 21:07:12 2014	(r270156)
+++ stable/10/sys/ufs/ffs/softdep.h	Mon Aug 18 22:53:48 2014	(r270157)
@@ -1025,7 +1025,7 @@ TAILQ_HEAD(indir_hashhead, freework);
 * Allocated at mount and freed at unmount.
 */
 struct mount_softdeps {
-	struct rwlock *sd_fslock;		/* softdep lock */
+	struct rwlock sd_fslock;		/* softdep lock */
 	struct workhead sd_workitem_pending;	/* softdep work queue */
 	struct worklist *sd_worklist_tail;	/* Tail pointer for above */
 	struct workhead sd_journal_pending;	/* journal work queue */
@@ -1046,15 +1046,24 @@ struct mount_softdeps {
 	u_long	sd_bmhashsize;			/* bmsafemap hash table size-1*/
 	struct indir_hashhead *sd_indirhash;	/* indir hash table */
 	u_long	sd_indirhashsize;		/* indir hash table size-1 */
-	long	sd_numindirdeps;		/* outstanding indirdeps */
 	int	sd_on_journal;			/* Items on the journal list */
 	int	sd_on_worklist;			/* Items on the worklist */
 	int	sd_deps;			/* Total dependency count */
 	int	sd_accdeps;			/* accumulated dep count */
 	int	sd_req;				/* Wakeup when deps hits 0. */
+	int	sd_flags;			/* comm with flushing thread */
+	int	sd_cleanups;			/* Calls to cleanup */
+	struct	thread *sd_flushtd;		/* thread handling flushing */
+	TAILQ_ENTRY(mount_softdeps) sd_next;	/* List of softdep filesystems */
+	struct	ufsmount *sd_ump;		/* our ufsmount structure */
 	u_long	sd_curdeps[D_LAST + 1];		/* count of current deps */
 };
 
 /*
+ * Flags for communicating with the syncer thread.
+ */
+#define FLUSH_EXIT	0x0001	/* time to exit */
+#define FLUSH_CLEANUP	0x0002	/* need to clear out softdep structures */
+
+/*
 * Keep the old names from when these were in the ufsmount structure.
 */
 #define softdep_workitem_pending	um_softdep->sd_workitem_pending
@@ -1077,10 +1086,11 @@ struct mount_softdeps {
 #define bmsafemap_hash_size	um_softdep->sd_bmhashsize
 #define indir_hashtbl		um_softdep->sd_indirhash
 #define indir_hash_size	um_softdep->sd_indirhashsize
-#define softdep_numindirdeps	um_softdep->sd_numindirdeps
 #define softdep_on_journal	um_softdep->sd_on_journal
 #define softdep_on_worklist	um_softdep->sd_on_worklist
 #define softdep_deps		um_softdep->sd_deps
 #define softdep_accdeps	um_softdep->sd_accdeps
 #define softdep_req		um_softdep->sd_req
+#define softdep_flags		um_softdep->sd_flags
+#define softdep_flushtd	um_softdep->sd_flushtd
#define softdep_curdeps	um_softdep->sd_curdeps
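
[Editor's note: the FLUSH_EXIT/FLUSH_CLEANUP flags defined above drive a
flag-plus-wakeup handshake: softdep_unmount() sets FLUSH_EXIT, wakes the
flusher, and sleeps until the thread clears the flag to acknowledge, while
softdep_flush() clears the flag, wakes the waiter, and exits. Here is a rough
userspace analogue of that handshake, with a pthread condition variable
standing in for the kernel's msleep()/wakeup(); all names are illustrative.

	#include <pthread.h>

	#define FLUSH_EXIT	0x0001	/* time to exit */
	#define FLUSH_CLEANUP	0x0002	/* need to clear out structures */

	struct flusher {
		pthread_mutex_t	lock;	/* stands in for the per-fs lock */
		pthread_cond_t	cv;	/* stands in for msleep()/wakeup() */
		int		flags;
	};
	/* Initialize statically, e.g.:
	 * struct flusher f =
	 *     { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 }; */

	/* Worker loop: do cleanups until asked to exit, then acknowledge. */
	static void *
	flusher_thread(void *arg)
	{
		struct flusher *f = arg;

		pthread_mutex_lock(&f->lock);
		for (;;) {
			while ((f->flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
				pthread_cond_wait(&f->cv, &f->lock);
			f->flags &= ~FLUSH_CLEANUP;
			if (f->flags & FLUSH_EXIT) {
				f->flags &= ~FLUSH_EXIT;	 /* acknowledge... */
				pthread_cond_broadcast(&f->cv);	 /* ...and wake waiter */
				break;
			}
			/* ... process the worklist here ... */
		}
		pthread_mutex_unlock(&f->lock);
		return (NULL);
	}

	/* Unmount side: request exit and wait for the acknowledgement. */
	static void
	flusher_shutdown(struct flusher *f)
	{
		pthread_mutex_lock(&f->lock);
		f->flags |= FLUSH_EXIT;
		pthread_cond_broadcast(&f->cv);
		while (f->flags & FLUSH_EXIT)
			pthread_cond_wait(&f->cv, &f->lock);
		pthread_mutex_unlock(&f->lock);
	}

The kernel version gets the same effect with sleep channels: the requester
sleeps on &ump->softdep_flags and the flusher thread sleeps on
&ump->softdep_flushtd, each side waking the other's channel.]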