Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 20 Oct 2004 17:18:23 -0400
From:      John Baldwin <jhb@FreeBSD.org>
To:        Daniel Eischen <deischen@FreeBSD.org>
Cc:        threads@FreeBSD.org
Subject:   Infinite loop bug in libc_r on 4.x with condition variables and signals
Message-ID:  <200410201718.23862.jhb@FreeBSD.org>

next in thread | raw e-mail | index | archive | help
We are trying to run mono on 4.x and are having problems with the process 
getting stuck spinning in an infinite loop.  After some debugging, we 
determined that the problem is that the condition variable thread queues are 
getting corrupted due to threads being added to a queue while they are 
already queued on another queue.  For example, if a thread is somehow on c1's 
queue but runs and blocks on c2, later when c1 tries to do a broadcast, it 
tries to remove all the waiters to wake them up doing something like:

	while ((head = TAILQ_FIRST(&c1->c_queue)) != NULL) {
	}

The problem is that since the thread was last added to c2's queue, his 
tqe_prev pointer in his sqe TAILQ_ENTRY points to an item on c2's list, and 
thus the c_queue.tqe_next pointer doesn't get updated by TAILQ_REMOVE, so the 
thread just "sticks" on c1's head pointer and it spins forever.

We seemed to have tracked this down to some sort of bug related to signals and 
condition variables.  It seems that we try to go handle a signal while we are 
on a condition variable queue, but not in PS_COND_WAIT, so 
_cond_wait_backout() is not called to remove the thread from the queue.  I 
tried deferring signals around the cond queue manipulations in cond_wait() 
and cond_timedwait() but we are still seeing the problem.  The patches we 
currently are using (including debug cruft) are below.  Right now we see the 
assertion in _thread_sig_wrapper() firing, but if I remove that, one of the 
assertions in the condition variable code that check for threads not being on 
the right condition variable queue trigger instead.  Does anyone have any 
other ideas of how a thread could catch a signal while PS_RUNNING and on a 
condition variable queue?  (I'm also worried that the wait() functions assume 
that if the thread is interrupted, its always not on the queue, but that 
doesn't seem to be the case for pthread_cancel() for example.)

Index: Makefile
===================================================================
RCS file: /usr/cvs/src/lib/libc_r/Makefile,v
retrieving revision 1.24.2.7
diff -u -r1.24.2.7 Makefile
--- Makefile	22 Oct 2002 14:44:02 -0000	1.24.2.7
+++ Makefile	14 Oct 2004 23:33:42 -0000
@@ -14,7 +14,7 @@
 
 # Uncomment this if you want libc_r to contain debug information for
 # thread locking.
-CFLAGS+=-D_LOCK_DEBUG
+CFLAGS+=-D_LOCK_DEBUG -ggdb
 
 # enable extra internal consistancy checks
 CFLAGS+=-D_PTHREADS_INVARIANTS
Index: uthread/pthread_private.h
===================================================================
RCS file: /usr/cvs/src/lib/libc_r/uthread/pthread_private.h,v
retrieving revision 1.36.2.21
diff -u -r1.36.2.21 pthread_private.h
--- uthread/pthread_private.h	22 Oct 2002 14:44:02 -0000	1.36.2.21
+++ uthread/pthread_private.h	14 Oct 2004 22:24:00 -0000
@@ -744,6 +744,7 @@
 	 */
 	TAILQ_ENTRY(pthread)	pqe;	/* priority queue link */
 	TAILQ_ENTRY(pthread)	sqe;	/* synchronization queue link */
+	TAILQ_ENTRY(pthread)	cqe;	/* condition variable queue link */
 	TAILQ_ENTRY(pthread)	qe;	/* all other queues link */
 
 	/* Wait data. */
Index: uthread/uthread_cond.c
===================================================================
RCS file: /usr/cvs/src/lib/libc_r/uthread/uthread_cond.c,v
retrieving revision 1.22.2.8
diff -u -r1.22.2.8 uthread_cond.c
--- uthread/uthread_cond.c	22 Oct 2002 14:44:02 -0000	1.22.2.8
+++ uthread/uthread_cond.c	15 Oct 2004 21:55:48 -0000
@@ -37,6 +37,8 @@
 #include <pthread.h>
 #include "pthread_private.h"
 
+#define	DEFER
+
 /*
  * Prototypes
  */
@@ -195,6 +197,14 @@
 	 * signal handler.
 	 */
 	do {
+#ifdef DEFER
+		/*
+		 * Defer signals to protect the scheduling queues
+		 * from access by the signal handler:
+		 */
+		_thread_kern_sig_defer();
+#endif
+
 		/* Lock the condition variable structure: */
 		_SPINLOCK(&(*cond)->lock);
 
@@ -270,6 +280,7 @@
 					 * after handling a signal.
 					 */
 					if (interrupted != 0) {
+						PTHREAD_ASSERT_NOT_IN_SYNCQ(curthread);
 						/*
 						 * Lock the mutex and ignore any
 						 * errors.  Note that even
@@ -314,6 +325,13 @@
 
 		if ((interrupted != 0) && (curthread->continuation != NULL))
 			curthread->continuation((void *) curthread);
+#ifdef DEFER
+		/*
+		 * Undefer and handle pending signals, yielding if
+		 * necessary:
+		 */
+		_thread_kern_sig_undefer();
+#endif
 	} while ((done == 0) && (rval == 0));
 
 	_thread_leave_cancellation_point();
@@ -354,6 +372,13 @@
 	 * signal handler.
 	 */
 	do {
+#ifdef DEFER
+		/*
+		 * Defer signals to protect the scheduling queues
+		 * from access by the signal handler:
+		 */
+		_thread_kern_sig_defer();
+#endif
 		/* Lock the condition variable structure: */
 		_SPINLOCK(&(*cond)->lock);
 
@@ -431,6 +456,7 @@
 					 * after handling a signal.
 					 */
 					if (interrupted != 0) {
+						PTHREAD_ASSERT_NOT_IN_SYNCQ(curthread);
 						/*
 						 * Lock the mutex and ignore any
 						 * errors.  Note that even
@@ -484,6 +510,13 @@
 
 		if ((interrupted != 0) && (curthread->continuation != NULL))
 			curthread->continuation((void *) curthread);
+#ifdef DEFER
+		/*
+		 * Undefer and handle pending signals, yielding if
+		 * necessary:
+		 */
+		_thread_kern_sig_undefer();
+#endif
 	} while ((done == 0) && (rval == 0));
 
 	_thread_leave_cancellation_point();
@@ -671,8 +704,12 @@
 	pthread_t pthread;
 
 	while ((pthread = TAILQ_FIRST(&cond->c_queue)) != NULL) {
-		TAILQ_REMOVE(&cond->c_queue, pthread, sqe);
+		PTHREAD_ASSERT(pthread->data.cond == cond, "cond_queue_deq: mismatched 
condition variables");
+		PTHREAD_ASSERT(pthread->cqe.tqe_prev == &TAILQ_FIRST(&cond->c_queue), 
"cond_queue_deq: elem doesn't match");
+		PTHREAD_ASSERT(pthread->flags & PTHREAD_FLAGS_IN_CONDQ, "cond_queue_deq: 
condq flag not set");
+		TAILQ_REMOVE(&cond->c_queue, pthread, cqe);
 		pthread->flags &= ~PTHREAD_FLAGS_IN_CONDQ;
+		pthread->data.cond = NULL;
 		if ((pthread->timeout == 0) && (pthread->interrupted == 0))
 			/*
 			 * Only exit the loop when we find a thread
@@ -693,6 +730,9 @@
 static inline void
 cond_queue_remove(pthread_cond_t cond, pthread_t pthread)
 {
+	pthread_t foo;
+	int found;
+
 	/*
 	 * Because pthread_cond_timedwait() can timeout as well
 	 * as be signaled by another thread, it is necessary to
@@ -700,8 +740,28 @@
 	 * it isn't in the queue.
 	 */
 	if (pthread->flags & PTHREAD_FLAGS_IN_CONDQ) {
-		TAILQ_REMOVE(&cond->c_queue, pthread, sqe);
+		PTHREAD_ASSERT(pthread->data.cond == cond,
+		    "cond_queue_remove: mismatched condition variables");
+		found = 0;
+		TAILQ_FOREACH(foo, &cond->c_queue, cqe)
+			if (foo == pthread)
+				found++;
+		PTHREAD_ASSERT(found != 0, "thread not on queue");
+		PTHREAD_ASSERT(found <= 1, "thread on queue more than once");
+
+		if (TAILQ_FIRST(&cond->c_queue) == pthread)
+			PTHREAD_ASSERT(pthread->cqe.tqe_prev ==
+			    &TAILQ_FIRST(&cond->c_queue),
+			    "cond_queue_remove: elem doesn't match");
+		else
+			TAILQ_FOREACH(foo, &cond->c_queue, cqe)
+				if (TAILQ_NEXT(foo, cqe) == pthread)
+				PTHREAD_ASSERT(pthread->cqe.tqe_prev ==
+				    &foo->cqe.tqe_next,
+				    "cond_queue_remove: elem doesn't match");
+		TAILQ_REMOVE(&cond->c_queue, pthread, cqe);
 		pthread->flags &= ~PTHREAD_FLAGS_IN_CONDQ;
+		pthread->data.cond = NULL;
 	}
 }
 
@@ -713,21 +773,25 @@
 cond_queue_enq(pthread_cond_t cond, pthread_t pthread)
 {
 	pthread_t tid = TAILQ_LAST(&cond->c_queue, cond_head);
+	pthread_t foo;
 
 	PTHREAD_ASSERT_NOT_IN_SYNCQ(pthread);
 
+	TAILQ_FOREACH(foo, &cond->c_queue, cqe)
+		PTHREAD_ASSERT(pthread != foo, "thread already on queue");
+
 	/*
 	 * For the common case of all threads having equal priority,
 	 * we perform a quick check against the priority of the thread
 	 * at the tail of the queue.
 	 */
 	if ((tid == NULL) || (pthread->active_priority <= tid->active_priority))
-		TAILQ_INSERT_TAIL(&cond->c_queue, pthread, sqe);
+		TAILQ_INSERT_TAIL(&cond->c_queue, pthread, cqe);
 	else {
 		tid = TAILQ_FIRST(&cond->c_queue);
 		while (pthread->active_priority <= tid->active_priority)
-			tid = TAILQ_NEXT(tid, sqe);
-		TAILQ_INSERT_BEFORE(tid, pthread, sqe);
+			tid = TAILQ_NEXT(tid, cqe);
+		TAILQ_INSERT_BEFORE(tid, pthread, cqe);
 	}
 	pthread->flags |= PTHREAD_FLAGS_IN_CONDQ;
 	pthread->data.cond = cond;
Index: uthread/uthread_mutex.c
===================================================================
RCS file: /usr/cvs/src/lib/libc_r/uthread/uthread_mutex.c,v
retrieving revision 1.20.2.8
diff -u -r1.20.2.8 uthread_mutex.c
--- uthread/uthread_mutex.c	22 Oct 2002 14:44:03 -0000	1.20.2.8
+++ uthread/uthread_mutex.c	20 Oct 2004 20:18:41 -0000
@@ -59,6 +59,8 @@
 #define _MUTEX_ASSERT_NOT_OWNED(m)
 #endif
 
+#define DEFER
+
 /*
  * Prototypes
  */
@@ -748,6 +750,11 @@
 	struct pthread	*curthread = _get_curthread();
 	int	ret = 0;
 
+#ifdef DEFER
+	if (add_reference)
+		PTHREAD_ASSERT(curthread->sig_defer_count == 1,
+		    "lost defer count start");
+#endif
 	if (mutex == NULL || *mutex == NULL) {
 		ret = EINVAL;
 	} else {
@@ -755,6 +762,9 @@
 		 * Defer signals to protect the scheduling queues from
 		 * access by the signal handler:
 		 */
+#ifdef DEFER
+		if (!add_reference)
+#endif
 		_thread_kern_sig_defer();
 
 		/* Lock the mutex structure: */
@@ -1064,8 +1074,16 @@
 		 * Undefer and handle pending signals, yielding if
 		 * necessary:
 		 */
+#ifdef DEFER
+		if (!add_reference)
+#endif
 		_thread_kern_sig_undefer();
 	}
+#ifdef DEFER
+	if (add_reference)
+		PTHREAD_ASSERT(curthread->sig_defer_count == 1,
+		    "lost defer count finish");
+#endif
 
 	/* Return the completion status: */
 	return (ret);
Index: uthread/uthread_sig.c
===================================================================
RCS file: /usr/cvs/src/lib/libc_r/uthread/uthread_sig.c,v
retrieving revision 1.25.2.13
diff -u -r1.25.2.13 uthread_sig.c
--- uthread/uthread_sig.c	22 Oct 2002 14:44:03 -0000	1.25.2.13
+++ uthread/uthread_sig.c	15 Oct 2004 22:29:18 -0000
@@ -1007,6 +1007,10 @@
 			break;
 		}
 	}
+#if 1
+	PTHREAD_ASSERT((thread->flags & PTHREAD_FLAGS_IN_CONDQ) == 0,
+	    "still on cond queue");
+#endif
 
 	/* Unblock the signal in case we don't return from the handler: */
 	_thread_sigq[psf->signo - 1].blocked = 0;

-- 
John Baldwin <jhb@FreeBSD.org>  <><  http://www.FreeBSD.org/~jhb/
"Power Users Use the Power to Serve"  =  http://www.FreeBSD.org



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200410201718.23862.jhb>