Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 27 May 2015 09:20:43 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r283600 - in head/sys: fs/nfsserver kern sys ufs/ffs
Message-ID:  <201505270920.t4R9Kh72062401@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kib
Date: Wed May 27 09:20:42 2015
New Revision: 283600
URL: https://svnweb.freebsd.org/changeset/base/283600

Log:
  Currently, softupdate code detects overstepping on the workitems
  limits in the code which is deep in the call stack, and owns several
  critical system resources, like vnode locks.  Attempt to wait while
  the per-mount softupdate thread cleans up the backlog may deadlock,
  because the thread might need to lock the same vnode which is owned by
  the waiting thread.
  
  Instead of synchronously waiting for the worker, perform the worker'
  tickle and pause until the backlog is cleaned, at the safe point
  during return from kernel to usermode.  A new ast request to call
  softdep_ast_cleanup() is created, the SU code now only checks the size
  of queue and schedules ast.
  
  There is no ast delivery for the kernel threads, so they are exempted
  from the mechanism, except NFS daemon threads.  NFS server loop
  explicitely checks for the request, and informs the schedule_cleanup()
  that it is capable of handling the requests by the process P2_AST_SU
  flag.  This is needed because nfsd may be the sole cause of the SU
  workqueue overflow.  But, to not cause nsfd to spawn additional
  threads just because we slow down existing workers, only tickle su
  threads, without waiting for the backlog cleanup.
  
  Reviewed by:	jhb, mckusick
  Tested by:	pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks

Modified:
  head/sys/fs/nfsserver/nfs_nfsdkrpc.c
  head/sys/kern/kern_exit.c
  head/sys/kern/subr_trap.c
  head/sys/sys/proc.h
  head/sys/sys/systm.h
  head/sys/ufs/ffs/ffs_softdep.c

Modified: head/sys/fs/nfsserver/nfs_nfsdkrpc.c
==============================================================================
--- head/sys/fs/nfsserver/nfs_nfsdkrpc.c	Wed May 27 08:57:48 2015	(r283599)
+++ head/sys/fs/nfsserver/nfs_nfsdkrpc.c	Wed May 27 09:20:42 2015	(r283600)
@@ -294,6 +294,8 @@ nfssvc_program(struct svc_req *rqst, SVC
 	svc_freereq(rqst);
 
 out:
+	if (softdep_ast_cleanup != NULL)
+		softdep_ast_cleanup();
 	NFSEXITCODE(0);
 }
 
@@ -464,6 +466,7 @@ int
 nfsrvd_nfsd(struct thread *td, struct nfsd_nfsd_args *args)
 {
 	char principal[MAXHOSTNAMELEN + 5];
+	struct proc *p;
 	int error = 0;
 	bool_t ret2, ret3, ret4;
 
@@ -481,6 +484,10 @@ nfsrvd_nfsd(struct thread *td, struct nf
 	 */
 	NFSD_LOCK();
 	if (newnfs_numnfsd == 0) {
+		p = td->td_proc;
+		PROC_LOCK(p);
+		p->p_flag2 |= P2_AST_SU;
+		PROC_UNLOCK(p);
 		newnfs_numnfsd++;
 
 		NFSD_UNLOCK();
@@ -512,6 +519,9 @@ nfsrvd_nfsd(struct thread *td, struct nf
 		NFSD_LOCK();
 		newnfs_numnfsd--;
 		nfsrvd_init(1);
+		PROC_LOCK(p);
+		p->p_flag2 &= ~P2_AST_SU;
+		PROC_UNLOCK(p);
 	}
 	NFSD_UNLOCK();
 

Modified: head/sys/kern/kern_exit.c
==============================================================================
--- head/sys/kern/kern_exit.c	Wed May 27 08:57:48 2015	(r283599)
+++ head/sys/kern/kern_exit.c	Wed May 27 09:20:42 2015	(r283600)
@@ -205,6 +205,12 @@ exit1(struct thread *td, int rv)
 	}
 
 	/*
+	 * Deref SU mp, since the thread does not return to userspace.
+	 */
+	if (softdep_ast_cleanup != NULL)
+		softdep_ast_cleanup();
+
+	/*
 	 * MUST abort all other threads before proceeding past here.
 	 */
 	PROC_LOCK(p);

Modified: head/sys/kern/subr_trap.c
==============================================================================
--- head/sys/kern/subr_trap.c	Wed May 27 08:57:48 2015	(r283599)
+++ head/sys/kern/subr_trap.c	Wed May 27 09:20:42 2015	(r283600)
@@ -86,6 +86,8 @@ __FBSDID("$FreeBSD$");
 
 #include <security/mac/mac_framework.h>
 
+void (*softdep_ast_cleanup)(void);
+
 /*
  * Define the code needed before returning to user mode, for trap and
  * syscall.
@@ -114,6 +116,9 @@ userret(struct thread *td, struct trapfr
 #ifdef KTRACE
 	KTRUSERRET(td);
 #endif
+	if (softdep_ast_cleanup != NULL)
+		softdep_ast_cleanup();
+
 	/*
 	 * If this thread tickled GEOM, we need to wait for the giggling to
 	 * stop before we return to userland
@@ -157,6 +162,8 @@ userret(struct thread *td, struct trapfr
 	    ("userret: Returning while holding vnode reservation"));
 	KASSERT((td->td_flags & TDF_SBDRY) == 0,
 	    ("userret: Returning with stop signals deferred"));
+	KASSERT(td->td_su == NULL,
+	    ("userret: Returning with SU cleanup request not handled"));
 #ifdef VIMAGE
 	/* Unfortunately td_vnet_lpush needs VNET_DEBUG. */
 	VNET_ASSERT(curvnet == NULL,

Modified: head/sys/sys/proc.h
==============================================================================
--- head/sys/sys/proc.h	Wed May 27 08:57:48 2015	(r283599)
+++ head/sys/sys/proc.h	Wed May 27 09:20:42 2015	(r283600)
@@ -277,6 +277,7 @@ struct thread {
 	u_int		td_vp_reserv;	/* (k) Count of reserved vnodes. */
 	int		td_no_sleeping;	/* (k) Sleeping disabled count. */
 	int		td_dom_rr_idx;	/* (k) RR Numa domain selection. */
+	void		*td_su;		/* (k) FFS SU private */
 #define	td_endzero td_sigmask
 
 /* Copied during fork1() or create_thread(). */
@@ -677,6 +678,7 @@ struct proc {
 #define	P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */
 #define	P2_NOTRACE	0x00000002	/* No ptrace(2) attach or coredumps. */
 #define	P2_NOTRACE_EXEC 0x00000004	/* Keep P2_NOPTRACE on exec(2). */
+#define	P2_AST_SU	0x00000008	/* Handles SU ast for kthreads. */
 
 /* Flags protected by proctree_lock, kept in p_treeflags. */
 #define	P_TREE_ORPHANED		0x00000001	/* Reparented, on orphan list */

Modified: head/sys/sys/systm.h
==============================================================================
--- head/sys/sys/systm.h	Wed May 27 08:57:48 2015	(r283599)
+++ head/sys/sys/systm.h	Wed May 27 09:20:42 2015	(r283600)
@@ -431,4 +431,6 @@ void free_unr(struct unrhdr *uh, u_int i
 
 void	intr_prof_stack_use(struct thread *td, struct trapframe *frame);
 
+extern void (*softdep_ast_cleanup)(void);
+
 #endif /* !_SYS_SYSTM_H_ */

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c	Wed May 27 08:57:48 2015	(r283599)
+++ head/sys/ufs/ffs/ffs_softdep.c	Wed May 27 09:20:42 2015	(r283600)
@@ -900,6 +900,8 @@ static	int pagedep_find(struct pagedep_h
 	    struct pagedep **);
 static	void pause_timer(void *);
 static	int request_cleanup(struct mount *, int);
+static	void schedule_cleanup(struct mount *);
+static void softdep_ast_cleanup_proc(void);
 static	int process_worklist_item(struct mount *, int, int);
 static	void process_removes(struct vnode *);
 static	void process_truncates(struct vnode *);
@@ -921,6 +923,8 @@ static	int journal_unsuspend(struct ufsm
 static	void softdep_prelink(struct vnode *, struct vnode *);
 static	void add_to_journal(struct worklist *);
 static	void remove_from_journal(struct worklist *);
+static	bool softdep_excess_inodes(struct ufsmount *);
+static	bool softdep_excess_dirrem(struct ufsmount *);
 static	void softdep_process_journal(struct mount *, struct worklist *, int);
 static	struct jremref *newjremref(struct dirrem *, struct inode *,
 	    struct inode *ip, off_t, nlink_t);
@@ -2209,12 +2213,10 @@ inodedep_lookup(mp, inum, flags, inodede
 	 * responsible for more than our share of that usage and
 	 * we are not in a rush, request some inodedep cleanup.
 	 */
-	while (dep_current[D_INODEDEP] > max_softdeps &&
-	    (flags & NODELAY) == 0 &&
-	    ump->softdep_curdeps[D_INODEDEP] >
-	    max_softdeps / stat_flush_threads)
-		request_cleanup(mp, FLUSH_INODES);
-	FREE_LOCK(ump);
+	if (softdep_excess_inodes(ump))
+		schedule_cleanup(mp);
+	else
+		FREE_LOCK(ump);
 	inodedep = malloc(sizeof(struct inodedep),
 		M_INODEDEP, M_SOFTDEP_FLAGS);
 	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
@@ -2412,6 +2414,7 @@ softdep_initialize()
 	bioops.io_complete = softdep_disk_write_complete;
 	bioops.io_deallocate = softdep_deallocate_dependencies;
 	bioops.io_countdeps = softdep_count_dependencies;
+	softdep_ast_cleanup = softdep_ast_cleanup_proc;
 
 	/* Initialize the callout with an mtx. */
 	callout_init_mtx(&softdep_callout, &lk, 0);
@@ -2430,6 +2433,7 @@ softdep_uninitialize()
 	bioops.io_complete = NULL;
 	bioops.io_deallocate = NULL;
 	bioops.io_countdeps = NULL;
+	softdep_ast_cleanup = NULL;
 
 	callout_drain(&softdep_callout);
 }
@@ -9126,13 +9130,12 @@ newdirrem(bp, dp, ip, isrmdir, prevdirre
 	 * the number of freefile and freeblks structures.
 	 */
 	ACQUIRE_LOCK(ip->i_ump);
-	while (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2 &&
-	    ip->i_ump->softdep_curdeps[D_DIRREM] >
-	    (max_softdeps / 2) / stat_flush_threads)
-		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
-	FREE_LOCK(ip->i_ump);
-	dirrem = malloc(sizeof(struct dirrem),
-		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
+	if (!IS_SNAPSHOT(ip) && softdep_excess_dirrem(ip->i_ump))
+		schedule_cleanup(ITOV(dp)->v_mount);
+	else
+		FREE_LOCK(ip->i_ump);
+	dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
+	    M_ZERO);
 	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
 	LIST_INIT(&dirrem->dm_jremrefhd);
 	LIST_INIT(&dirrem->dm_jwork);
@@ -13269,6 +13272,92 @@ retry:
 	return (1);
 }
 
+static bool
+softdep_excess_inodes(struct ufsmount *ump)
+{
+
+	return (dep_current[D_INODEDEP] > max_softdeps &&
+	    ump->softdep_curdeps[D_INODEDEP] > max_softdeps /
+	    stat_flush_threads);
+}
+
+static bool
+softdep_excess_dirrem(struct ufsmount *ump)
+{
+
+	return (dep_current[D_DIRREM] > max_softdeps / 2 &&
+	    ump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) /
+	    stat_flush_threads);
+}
+
+static void
+schedule_cleanup(struct mount *mp)
+{
+	struct ufsmount *ump;
+	struct thread *td;
+
+	ump = VFSTOUFS(mp);
+	LOCK_OWNED(ump);
+	FREE_LOCK(ump);
+	td = curthread;
+	if ((td->td_pflags & TDP_KTHREAD) != 0 &&
+	    (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
+		/*
+		 * No ast is delivered to kernel threads, so nobody
+		 * would deref the mp.  Some kernel threads
+		 * explicitely check for AST, e.g. NFS daemon does
+		 * this in the serving loop.
+		 */
+		return;
+	}
+	if (td->td_su != NULL)
+		vfs_rel(td->td_su);
+	vfs_ref(mp);
+	td->td_su = mp;
+	thread_lock(td);
+	td->td_flags |= TDF_ASTPENDING;
+	thread_unlock(td);
+}
+
+static void
+softdep_ast_cleanup_proc(void)
+{
+	struct thread *td;
+	struct mount *mp;
+	struct ufsmount *ump;
+	int error;
+	bool req;
+
+	td = curthread;
+	mp = td->td_su;
+	if (mp == NULL)
+		return;
+	td->td_su = NULL;
+	error = vfs_busy(mp, MBF_NOWAIT);
+	vfs_rel(mp);
+	if (error != 0)
+		return;
+	if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
+		ump = VFSTOUFS(mp);
+		for (;;) {
+			req = false;
+			ACQUIRE_LOCK(ump);
+			if (softdep_excess_inodes(ump)) {
+				req = true;
+				request_cleanup(mp, FLUSH_INODES);
+			}
+			if (softdep_excess_dirrem(ump)) {
+				req = true;
+				request_cleanup(mp, FLUSH_BLOCKS);
+			}
+			FREE_LOCK(ump);
+			if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
+				break;
+		}
+	}
+	vfs_unbusy(mp);
+}
+
 /*
  * If memory utilization has gotten too high, deliberately slow things
  * down and speed up the I/O processing.
@@ -13355,7 +13444,8 @@ request_cleanup(mp, resource)
 		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
 		    pause_timer, 0);
 
-	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
+	if ((td->td_pflags & TDP_KTHREAD) == 0)
+		msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
 	proc_waiting -= 1;
 	FREE_GBLLOCK(&lk);
 	ACQUIRE_LOCK(ump);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201505270920.t4R9Kh72062401>