Date:      Wed, 11 Nov 2015 01:54:59 +0700
From:      Eugene Grosbein <eugen@grosbein.net>
To:        Adrian Chadd <adrian@freebsd.org>
Cc:        "freebsd-mips@freebsd.org" <freebsd-mips@freebsd.org>
Subject:   Re: USB-related problem
Message-ID:  <56423D82.5030203@grosbein.net>
In-Reply-To: <564231DE.7090308@grosbein.net>
References:  <56348063.3090508@grosbein.net> <563707A0.3040700@grosbein.net> <CANCZdfrB0hkjmTU-9NzimBv=X_h-=bF0D2azBxg=9B=kmitv7Q@mail.gmail.com> <56370E1D.3040801@grosbein.net> <CAJ-Vmo=0vOAq8db_GeLWmdXr7xJdzUh44ZZJrQ9vVdpvzT9hiQ@mail.gmail.com> <563F5630.2000407@grosbein.net> <563F6F6F.1010807@grosbein.net> <CAJ-Vmo=fPSi7yZO5Xjodg8HPtTLd44Y9Y_8qg4EgTGwEpHO10A@mail.gmail.com> <563F91A8.9080702@grosbein.net> <CAJ-VmomUvoUerMS20qQsQujcjULVA=_jaLp9Mh3fU1fEpdwzZA@mail.gmail.com> <CAJ-Vmo=eUUZ928KgQbyOi8EdDFSmxhjvDOyAMvfXsqwDbO96ng@mail.gmail.com> <5640C0FD.2040803@grosbein.net> <CAJ-Vmo=6mztfvvBd91LPO5H418K8vW=+Lk=6V5Z_y5DHu7v7HA@mail.gmail.com> <5640F315.5020303@grosbein.net> <CAJ-VmokpWM=d+tEFv8a8eU91UimVZ9W8da2QkKCTDjd+2ZM_LQ@mail.gmail.com> <56410214.3070901@grosbein.net> <CAJ-Vmo=QUUkTfQ7pvr_V+CQ8zQWOoqp3H8hD9LUR8C5U-5N=Ag@mail.gmail.com> <CAJ-VmomA27NcNCjx00DrZzFOU9bGYucUhbPXodH2uvNd8eJ3wg@mail.gmail.com> <564231DE.7090308@grosbein.net>

On 11.11.2015 01:05, Eugene Grosbein wrote:
> On 10.11.2015 04:43, Adrian Chadd wrote:
>> Hi,
>>
>> How much C do you know? Are you able to add some debugging to
>> sched_sync() (which IIRC is the syncer thread) and work out where it's
>> getting stuck and spinning?
> 
> I did. I've added a new global kernel symbol, the variable schedstat,
> so I can check its value using ddb. It always equals 5
> when the deadlock occurs, revealing the loop.

I've dug into it a bit more. To make doubly sure it really loops,
I added a loop counter "schedloop", hoping to see some insanely large value.
But it does not loop that way: schedloop is always 1.

Instead, it just sits inside sync_vnode(). So I added another variable,
"svnodestate", to trace sync_vnode(), and it is stuck at 9.
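
For reference, I read these counters from ddb after breaking into the
debugger; a session looks roughly like this (output typed from memory,
format may differ slightly):

	db> x/d schedstate
	schedstate:	5
	db> x/d svnodestate
	svnodestate:	9
	db> x/d schedloop
	schedloop:	1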

That is, VOP_FSYNC(vp, MNT_LAZY, td) does not return:

--- sys/kern/vfs_subr.c.orig	2015-10-08 23:33:53.935440000 +0700
+++ /sys/kern/vfs_subr.c	2015-11-11 01:39:09.450390000 +0700
@@ -1738,217 +1738,272 @@ sysctl_vfs_worklist_len(SYSCTL_HANDLER_A
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 
 static struct proc *updateproc;
 static void sched_sync(void);
 static struct kproc_desc up_kp = {
 	"syncer",
 	sched_sync,
 	&updateproc
 };
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
 
+volatile unsigned svnodestate = 0;
+
 static int
 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 
+	svnodestate = 0;
 	*bo = LIST_FIRST(slp);
 	if (*bo == NULL)
 		return (0);
+
+	svnodestate = 1;
 	vp = (*bo)->__bo_vnode;	/* XXX */
 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
 		return (1);
 	/*
 	 * We use vhold in case the vnode does not
 	 * successfully sync.  vhold prevents the vnode from
 	 * going away when we unlock the sync_mtx so that
 	 * we can acquire the vnode interlock.
 	 */
+	svnodestate = 2;
 	vholdl(vp);
+	svnodestate = 3;
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
+	svnodestate = 4;
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+		svnodestate = 5;
 		vdrop(vp);
+		svnodestate = 6;
 		mtx_lock(&sync_mtx);
+		svnodestate = 7;
 		return (*bo == LIST_FIRST(slp));
 	}
+	svnodestate = 8;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	svnodestate = 9;
 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
+	svnodestate = 10;
 	VOP_UNLOCK(vp, 0);
+	svnodestate = 11;
 	vn_finished_write(mp);
+	svnodestate = 12;
 	BO_LOCK(*bo);
+	svnodestate = 13;
 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
 		/*
 		 * Put us back on the worklist.  The worklist
 		 * routine will remove us from our current
 		 * position and then add us back in at a later
 		 * position.
 		 */
+		svnodestate = 14;
 		vn_syncer_add_to_worklist(*bo, syncdelay);
 	}
+	svnodestate = 15;
 	BO_UNLOCK(*bo);
+	svnodestate = 16;
 	vdrop(vp);
+	svnodestate = 17;
 	mtx_lock(&sync_mtx);
+	svnodestate = 18;
 	return (0);
 }
 
 static int first_printf = 1;
 
+volatile unsigned schedstate = 0;
+volatile unsigned schedloop = 0;
+
 /*
  * System filesystem synchronizer daemon.
  */
 static void
 sched_sync(void)
 {
 	struct synclist *next, *slp;
 	struct bufobj *bo;
 	long starttime;
 	struct thread *td = curthread;
 	int last_work_seen;
 	int net_worklist_len;
 	int syncer_final_iter;
 	int error;
 
 	last_work_seen = 0;
 	syncer_final_iter = 0;
 	syncer_state = SYNCER_RUNNING;
 	starttime = time_uptime;
 	td->td_pflags |= TDP_NORUNNINGBUF;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);
 
+	schedstate = 0;
 	mtx_lock(&sync_mtx);
 	for (;;) {
+		schedstate = 1;
+
 		if (syncer_state == SYNCER_FINAL_DELAY &&
 		    syncer_final_iter == 0) {
 			mtx_unlock(&sync_mtx);
 			kproc_suspend_check(td->td_proc);
 			mtx_lock(&sync_mtx);
 		}
+		schedstate = 2;
+
 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
 		if (syncer_state != SYNCER_RUNNING &&
 		    starttime != time_uptime) {
 			if (first_printf) {
 				printf("\nSyncing disks, vnodes remaining...");
 				first_printf = 0;
 			}
 			printf("%d ", net_worklist_len);
 		}
 		starttime = time_uptime;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 *
 		 * Skip over empty worklist slots when shutting down.
 		 */
 		do {
+			schedstate = 3;
 			slp = &syncer_workitem_pending[syncer_delayno];
 			syncer_delayno += 1;
 			if (syncer_delayno == syncer_maxdelay)
 				syncer_delayno = 0;
 			next = &syncer_workitem_pending[syncer_delayno];
 			/*
 			 * If the worklist has wrapped since the
 			 * it was emptied of all but syncer vnodes,
 			 * switch to the FINAL_DELAY state and run
 			 * for one more second.
 			 */
 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
 			    net_worklist_len == 0 &&
 			    last_work_seen == syncer_delayno) {
 				syncer_state = SYNCER_FINAL_DELAY;
 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 			}
 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 		    syncer_worklist_len > 0);
 
+		schedstate = 4;
+
 		/*
 		 * Keep track of the last time there was anything
 		 * on the worklist other than syncer vnodes.
 		 * Return to the SHUTTING_DOWN state if any
 		 * new work appears.
 		 */
 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 			last_work_seen = syncer_delayno;
 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 			syncer_state = SYNCER_SHUTTING_DOWN;
+		schedloop = 0;
 		while (!LIST_EMPTY(slp)) {
+			schedstate = 5;
+			schedloop++;
+
 			error = sync_vnode(slp, &bo, td);
 			if (error == 1) {
 				LIST_REMOVE(bo, bo_synclist);
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
 				continue;
 			}
+			schedstate = 6;
 
 			if (first_printf == 0) {
+				schedstate = 7;
 				/*
 				 * Drop the sync mutex, because some watchdog
 				 * drivers need to sleep while patting
 				 */
 				mtx_unlock(&sync_mtx);
 				wdog_kern_pat(WD_LASTVAL);
 				mtx_lock(&sync_mtx);
 			}
 
+			schedstate = 8;
 		}
+		schedstate = 9;
+
 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 			syncer_final_iter--;
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
 		 * value of N tells the filesystem syncer to process the next
 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 		 * is used by the soft update code to speed up the filesystem
 		 * syncer process when the incore state is getting so far
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
 		if (rushjob > 0) {
 			rushjob -= 1;
 			continue;
 		}
 		/*
 		 * Just sleep for a short period of time between
 		 * iterations when shutting down to allow some I/O
 		 * to happen.
 		 *
 		 * If it has taken us less than a second to process the
 		 * current work, then wait. Otherwise start right over
 		 * again. We can still lose time if any single round
 		 * takes more than two seconds, but it does not really
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
+
+		schedstate = 10;
+
 		if (syncer_state != SYNCER_RUNNING ||
 		    time_uptime == starttime) {
+			schedstate = 11;
 			thread_lock(td);
 			sched_prio(td, PPAUSE);
 			thread_unlock(td);
 		}
-		if (syncer_state != SYNCER_RUNNING)
+		schedstate = 12;
+
+		if (syncer_state != SYNCER_RUNNING) {
+			schedstate = 13;
+
 			cv_timedwait(&sync_wakeup, &sync_mtx,
 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
-		else if (time_uptime == starttime)
+		}
+		else if (time_uptime == starttime) {
+			schedstate = 14;
 			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
+		}
+		schedstate = 15;
 	}
+	schedstate = 16;
 }
 
 /*
  * Request the syncer daemon to speed up its work.
  * We never push it to speed up more than half of its
  * normal turn time, otherwise it could take over the cpu.
  */
 int
 speedup_syncer(void)
 {
 	int ret = 0;
 
 	mtx_lock(&sync_mtx);
 	if (rushjob < syncdelay / 2) {
 		rushjob += 1;



