From owner-freebsd-bugs Thu May 13 12:30:18 1999 Delivered-To: freebsd-bugs@freebsd.org Received: from freefall.freebsd.org (freefall.FreeBSD.ORG [204.216.27.21]) by hub.freebsd.org (Postfix) with ESMTP id 61464155DE for ; Thu, 13 May 1999 12:30:02 -0700 (PDT) (envelope-from gnats@FreeBSD.org) Received: (from gnats@localhost) by freefall.freebsd.org (8.9.3/8.9.2) id MAA50570; Thu, 13 May 1999 12:30:02 -0700 (PDT) (envelope-from gnats@FreeBSD.org) Received: from midten.fast.no (midten.fast.no [195.139.251.11]) by hub.freebsd.org (Postfix) with ESMTP id E0E6A14BDE for ; Thu, 13 May 1999 12:22:51 -0700 (PDT) (envelope-from tegge@not.fast.no) Received: from not.fast.no (IDENT:tegge@not.fast.no [195.139.251.12]) by midten.fast.no (8.9.1/8.9.1) with ESMTP id VAA75174 for ; Thu, 13 May 1999 21:22:51 +0200 (CEST) Received: (from tegge@localhost) by not.fast.no (8.9.3/8.8.8) id VAA59119; Thu, 13 May 1999 21:22:51 +0200 (CEST) (envelope-from tegge@not.fast.no) Message-Id: <199905131922.VAA59119@not.fast.no> Date: Thu, 13 May 1999 21:22:51 +0200 (CEST) From: Tor Egge Reply-To: tegge@not.fast.no To: FreeBSD-gnats-submit@freebsd.org X-Send-Pr-Version: 3.2 Subject: bin/11696: Signal handling is broken in libc_r. Sender: owner-freebsd-bugs@FreeBSD.ORG Precedence: bulk X-Loop: FreeBSD.org >Number: 11696 >Category: bin >Synopsis: Signal handling is broken in libc_r. >Confidential: no >Severity: serious >Priority: medium >Responsible: freebsd-bugs >State: open >Quarter: >Keywords: >Date-Required: >Class: sw-bug >Submitter-Id: current-users >Arrival-Date: Thu May 13 12:30:01 PDT 1999 >Closed-Date: >Last-Modified: >Originator: Tor Egge >Release: FreeBSD 3.2-BETA i386 >Organization: Fast Search & Transfer ASA >Environment: FreeBSD response.fast.no 3.2-BETA FreeBSD 3.2-BETA #1: Wed May 12 21:58:39 CEST 1999 root@response.fast.no:/usr/src/sys/compile/INDEX_SMP_SERIAL_DDB_NOIPFW i386 >Description: Signal handling is broken in libc_r. - Calls to PTHREAD_NEW_STATE() are not protected by _thread_kern_sched_defer(). This is not signal safe, since the queues might be changed by the signal handler. - TAILQ_FOREACH() is used for traveral over _waitingq, where the current element is removed from the _waitingq queue and added to a priority queue inside the loop. This breaks traversal of _waitingq. - The current thread state is modified without protection by _thread_kern_in_sched. This is not signal safe, since any signal making the thread runnable again will cause an incorrect call to PTHREAD_WAITQ_REMOVE(), then a call to PTHREAD_PRIOQ_INSERT_TAIL(). Later on, _thread_kern_sched() might call PTHREAD_PRIOQ_INSERT_HEAD() or PTHREAD_PRIOQ_INSERT_TAIL() with the current thread as argument. - During signal handling, thread_run is temporarily set to the thread receiving a signal without taking into account that the interrupted thread might have been inside a critical region modifying a priority or waiting queue. - Not taking into account that a signal might have made a thread runnable before or after the scan of waiting threads in _thread_kern_select(). This might cause the process to hang in select(), waiting for something to happen, while a thread is marked as runnable. >How-To-Repeat: Go to a directory with more than 10 GB free space. Compile and run the appended program. It causes some signals to be sent to the running process, due to emulating pread() by using aio_read(). cc -pthread -D_THREAD_SAFE -D_PTHREADS -O2 -static -g -o aiotest aiotest.c #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct myaio { struct aiocb cb; struct { int busy; pthread_mutex_t mutex; pthread_cond_t cond; } cond; struct myaio *next; struct myaio *prev; ssize_t retval; size_t reterrno; time_t started; }; static struct myaio *activeaios; static struct myaio *freeaios; static int freecnt; static pthread_mutex_t aiomutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t aiocond = PTHREAD_COND_INITIALIZER; static int aiostartcnt; static int aioendcnt; static volatile sig_atomic_t gotusr1; static pthread_once_t aiothread_once = PTHREAD_ONCE_INIT; static pthread_t aiothread; static int aiothread_running; static void runaiothread(void); ssize_t pread(const int fd, void *buf, const size_t buflen, const off_t off) { struct myaio *aio; int ret; size_t retval; int reterrno; pthread_mutex_lock(&aiomutex); if (freeaios != NULL) { assert(freecnt > 0); freecnt--; aio = freeaios; freeaios = aio->next; aio->next = NULL; aio->prev = NULL; } else { assert(freecnt == 0); pthread_once(&aiothread_once, runaiothread); while (aiothread_running == 0) pthread_cond_wait(&aiocond, &aiomutex); pthread_mutex_unlock(&aiomutex); aio = (struct myaio *) malloc(sizeof(struct myaio)); memset(aio, 0, sizeof(struct myaio)); pthread_mutex_init(&aio->cond.mutex, NULL); pthread_cond_init(&aio->cond.cond, NULL); aio->next = NULL; aio->prev = NULL; pthread_mutex_lock(&aiomutex); } assert(aio->cond.busy == 0); aio->cond.busy = 1; aio->cb.aio_fildes = fd; aio->cb.aio_offset = off; aio->cb.aio_buf = buf; aio->cb.aio_nbytes = buflen; aio->cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL; aio->cb.aio_sigevent.sigev_signo = SIGUSR1; aio->cb.aio_sigevent.sigev_value.sival_ptr = &aio->cb; aio->cb.aio_lio_opcode = LIO_READ; aio->cb.aio_reqprio = 0; aio->retval = 0; aio->started = time(0); aio->prev = NULL; aio->next = activeaios; if (activeaios != NULL) activeaios->prev = aio; activeaios = aio; aiostartcnt++; ret = aio_read(&aio->cb); pthread_mutex_unlock(&aiomutex); assert(ret == 0); pthread_mutex_lock(&aio->cond.mutex); while (aio->cond.busy != 0) { pthread_cond_wait(&aio->cond.cond, &aio->cond.mutex); } pthread_mutex_unlock(&aio->cond.mutex); retval = aio->retval; reterrno = aio->reterrno; #if 0 assert((size_t) aio->retval == buflen); #endif pthread_mutex_lock(&aiomutex); assert(aio->next == NULL); assert(aio->prev == NULL); assert(aio != activeaios); assert(aio != freeaios); aio->next = freeaios; aio->prev = NULL; freeaios = aio; freecnt++; pthread_mutex_unlock(&aiomutex); errno = reterrno; return retval; } static void usr1handler(int sig) { (void) sig; gotusr1 = 1; } void processusr1(void) { struct myaio *aio, *naio; int reterrno; int now; int qpos; pthread_mutex_lock(&aiomutex); now = time(0); qpos = 0; for (aio = activeaios; aio != NULL; aio =naio, qpos++) { naio = aio->next; if (now - aio->started > 15) { printf("ERROR: aio used more than %d seconds: cb=%p, buflen=%u" ", qpos=%d %s, aiocnt=%d,%d\n", (int) (now - aio->started - 1), (void *) &aio->cb, aio->cb.aio_nbytes, qpos, aio->next == NULL ? "" : "(more elements)", aiostartcnt, aioendcnt); } reterrno = aio_error(&aio->cb); if (reterrno == EINPROGRESS) continue; else if (reterrno < 0) { assert(errno == EINVAL); assert(now - aio->started < 10); } else { aioendcnt++; assert(aio->prev != NULL || aio == activeaios); aio->retval = aio_return(&aio->cb); aio->reterrno = reterrno; if (aio->next != NULL) aio->next->prev = aio->prev; if (aio->prev != NULL) aio->prev->next = aio->next; if (aio == activeaios) activeaios = aio->next; aio->prev = NULL; aio->next = NULL; pthread_mutex_lock(&aio->cond.mutex); aio->cond.busy = 0; pthread_cond_signal(&aio->cond.cond); pthread_mutex_unlock(&aio->cond.mutex); } } pthread_mutex_unlock(&aiomutex); } void *aiothreadmeat(void *dummy) { sigset_t sigs_to_block; struct sigaction act; act.sa_handler=usr1handler; sigemptyset(&act.sa_mask); act.sa_flags=0; sigaction(SIGUSR1,&act,NULL); sigemptyset(&sigs_to_block); sigaddset(&sigs_to_block, SIGUSR1); pthread_sigmask(SIG_UNBLOCK, &sigs_to_block, NULL); pthread_mutex_lock(&aiomutex); aiothread_running = 1; pthread_cond_broadcast(&aiocond); pthread_mutex_unlock(&aiomutex); gotusr1 = 1; while (1) { if (gotusr1 != 0) { gotusr1 = 0; processusr1(); } sleep(1); gotusr1 = 1; } abort(); } static void runaiothread(void) { pthread_create(&aiothread, NULL, aiothreadmeat, NULL); } /* 10000 MB test file */ #define FILESIZE 10000 static off_t filesize; int writefile(void) { char *buf; size_t buflen; int fd; int count; ssize_t wgot; struct stat stbuf; buflen = 1024 * 1024; buf = malloc(buflen); assert(buf != NULL); filesize = (off_t) FILESIZE * (off_t) buflen; fd = open("largefile", O_RDWR | O_CREAT, 0666); assert(fd >= 0); fstat(fd, &stbuf); if (stbuf.st_size < filesize) { for (count = 0; count < FILESIZE; count++) { wgot = write(fd, buf, buflen); assert(wgot == buflen); } } return fd; } static pthread_mutex_t cntmutex = PTHREAD_MUTEX_INITIALIZER; static int startreadcnt; static int donereadcnt; void *readthread(void *data) { int fd; size_t buflen; char *buf; ssize_t rgot; off_t loc; sigset_t sigs_to_block; fd = (int) data; buflen = 1024; buf = malloc(buflen); assert(buf != NULL); sigemptyset(&sigs_to_block); sigaddset(&sigs_to_block, SIGUSR1); pthread_sigmask(SIG_BLOCK, &sigs_to_block, NULL); sleep(1); while (1) { loc = (off_t) (random() % (FILESIZE * 1024)) * (off_t) 1024; pthread_mutex_lock(&cntmutex); startreadcnt++; pthread_mutex_unlock(&cntmutex); rgot = pread(fd, buf, buflen, loc); if (rgot != buflen) { printf("rgot=%d, buflen=%d, loc=%qd, startreadcnt=%d,%d\n", rgot, buflen, loc, startreadcnt, donereadcnt); } assert(rgot == buflen); pthread_mutex_lock(&cntmutex); donereadcnt++; pthread_mutex_unlock(&cntmutex); } return NULL; } int main(int argc, char **argv) { int fd; int cnt; pthread_t curthread; int startcntcopy, donecntcopy; sigset_t sigs_to_block; struct timeval stime; struct timeval now; struct timeval report; struct timeval delta; double fdelta; double rate; struct timeval tvsel; fd = writefile(); sigemptyset(&sigs_to_block); sigaddset(&sigs_to_block, SIGUSR1); pthread_sigmask(SIG_BLOCK, &sigs_to_block, NULL); srandom(time(NULL)); gettimeofday(&stime, NULL); report = stime; report.tv_sec++; for (cnt = 0; cnt < 250; cnt++) { pthread_create(&curthread, NULL, readthread, (void *) fd); } while (1) { #if 0 sleep(1); /* XXX: Does not work */ #else gettimeofday(&now, NULL); if (now.tv_sec < report.tv_sec || (now.tv_sec == report.tv_sec && now.tv_usec < report.tv_usec)) { if (report.tv_usec >= now.tv_usec) { tvsel.tv_sec = report.tv_sec - now.tv_sec; tvsel.tv_usec = report.tv_usec - now.tv_usec; } else { tvsel.tv_sec = report.tv_sec -now.tv_sec - 1; tvsel.tv_usec = report.tv_usec + 1000000 - now.tv_usec; } select(1, NULL, NULL, NULL, &tvsel); continue; } report.tv_sec++; #endif gettimeofday(&now, NULL); if (now.tv_usec >= stime.tv_usec) { delta.tv_sec = now.tv_sec - stime.tv_sec; delta.tv_usec = now.tv_usec - stime.tv_usec; } else { delta.tv_sec = now.tv_sec - stime.tv_sec - 1; delta.tv_usec = now.tv_usec + 1000000 - stime.tv_usec; } fdelta = delta.tv_sec + ((double) delta.tv_usec) / 1000000.0; pthread_mutex_lock(&cntmutex); startcntcopy = startreadcnt; donecntcopy = donereadcnt; pthread_mutex_unlock(&cntmutex); rate = (double) donecntcopy / (double) fdelta; printf("%d(+%d) read operations time=%6.3f, rate=%6.3f\n", donecntcopy, startcntcopy - donecntcopy, fdelta, rate); fflush(stdout); } } >Fix: Index: uthread/uthread_cond.c =================================================================== RCS file: /home/ncvs/src/lib/libc_r/uthread/uthread_cond.c,v retrieving revision 1.12.2.1 diff -u -r1.12.2.1 uthread_cond.c --- uthread_cond.c 1999/05/09 07:48:26 1.12.2.1 +++ uthread_cond.c 1999/05/12 21:35:06 @@ -356,6 +356,16 @@ if (cond == NULL || *cond == NULL) rval = EINVAL; else { + /* + * Guard against preemption by a scheduling signal. + * A change of thread state modifies the waiting + * and priority queues. In addition, we must assure + * that all threads currently waiting on the condition + * variable are signaled and are not timedout by a + * scheduling signal that causes a preemption. + */ + _thread_kern_sched_defer(); + /* Lock the condition variable structure: */ _SPINLOCK(&(*cond)->lock); @@ -390,6 +400,10 @@ /* Unlock the condition variable structure: */ _SPINUNLOCK(&(*cond)->lock); + + /* Reenable preemption and yield if necessary. + */ + _thread_kern_sched_undefer(); } /* Return the completion status: */ Index: uthread/uthread_fd.c =================================================================== RCS file: /home/ncvs/src/lib/libc_r/uthread/uthread_fd.c,v retrieving revision 1.9.2.1 diff -u -r1.9.2.1 uthread_fd.c --- uthread_fd.c 1999/05/09 07:48:33 1.9.2.1 +++ uthread_fd.c 1999/05/12 21:36:07 @@ -169,6 +169,17 @@ */ if ((ret = _thread_fd_table_init(fd)) == 0) { /* + * Guard against being preempted by a scheduling signal. + * To support priority inheritence mutexes, we need to + * maintain lists of mutex ownerships for each thread as + * well as lists of waiting threads for each mutex. In + * order to propagate priorities we need to atomically + * walk these lists and cannot rely on a single mutex + * lock to provide protection against modification. + */ + _thread_kern_sched_defer(); + + /* * Lock the file descriptor table entry to prevent * other threads for clashing with the current * thread's accesses: @@ -254,6 +265,12 @@ /* Unlock the file descriptor table entry: */ _SPINUNLOCK(&_thread_fd_table[fd]->lock); + + /* + * Renable preemption and yield if a scheduling signal + * arrived while in the critical region: + */ + _thread_kern_sched_undefer(); } /* Nothing to return. */ @@ -440,6 +457,17 @@ */ if ((ret = _thread_fd_table_init(fd)) == 0) { /* + * Guard against being preempted by a scheduling signal. + * To support priority inheritence mutexes, we need to + * maintain lists of mutex ownerships for each thread as + * well as lists of waiting threads for each mutex. In + * order to propagate priorities we need to atomically + * walk these lists and cannot rely on a single mutex + * lock to provide protection against modification. + */ + _thread_kern_sched_defer(); + + /* * Lock the file descriptor table entry to prevent * other threads for clashing with the current * thread's accesses: @@ -525,6 +553,12 @@ /* Unlock the file descriptor table entry: */ _SPINUNLOCK(&_thread_fd_table[fd]->lock); + + /* + * Renable preemption and yield if a scheduling signal + * arrived while in the critical region: + */ + _thread_kern_sched_undefer(); } /* Nothing to return. */ Index: uthread/uthread_kern.c =================================================================== RCS file: /home/ncvs/src/lib/libc_r/uthread/uthread_kern.c,v retrieving revision 1.15.2.2 diff -u -r1.15.2.2 uthread_kern.c --- uthread_kern.c 1999/05/09 07:48:40 1.15.2.2 +++ uthread_kern.c 1999/05/13 17:04:45 @@ -49,6 +49,8 @@ #include #include "pthread_private.h" +int _thread_kern_new_state = 0; + /* Static function prototype definitions: */ static void _thread_kern_select(int wait_reqd); @@ -65,6 +67,7 @@ pthread_t pthread; pthread_t pthread_h = NULL; pthread_t last_thread = NULL; + pthread_t next_pthread; struct itimerval itimer; struct timespec ts; struct timespec ts1; @@ -159,7 +162,9 @@ * timeout value for threads waiting for a time. */ _waitingq_check_reqd = 0; /* reset flag before loop */ - TAILQ_FOREACH(pthread, &_waitingq, pqe) { + for (pthread = TAILQ_FIRST(&_waitingq); + pthread != NULL; pthread = next_pthread) { + next_pthread = TAILQ_NEXT(pthread, pqe); /* Check if this thread is ready: */ if (pthread->state == PS_RUNNING) { PTHREAD_WAITQ_REMOVE(pthread); @@ -462,6 +467,13 @@ void _thread_kern_sched_state(enum pthread_state state, char *fname, int lineno) { + /* + * Flag the pthread kernel as executing scheduler code + * to avoid a scheduler signal from interrupting this + * execution and calling the scheduler again. + */ + _thread_kern_in_sched = 1; + /* Change the state of the current thread: */ _thread_run->state = state; _thread_run->fname = fname; @@ -476,6 +488,13 @@ _thread_kern_sched_state_unlock(enum pthread_state state, spinlock_t *lock, char *fname, int lineno) { + /* + * Flag the pthread kernel as executing scheduler code + * to avoid a scheduler signal from interrupting this + * execution and calling the scheduler again. + */ + _thread_kern_in_sched = 1; + /* Change the state of the current thread: */ _thread_run->state = state; _thread_run->fname = fname; @@ -502,6 +521,7 @@ int nfds = -1; int settimeout; pthread_t pthread; + pthread_t next_pthread; ssize_t num; struct timespec ts; struct timespec ts1; @@ -536,7 +556,9 @@ * or times: */ _waitingq_check_reqd = 0; /* reset flag before loop */ - TAILQ_FOREACH (pthread, &_waitingq, pqe) { + for (pthread = TAILQ_FIRST(&_waitingq); + pthread != NULL; pthread = next_pthread) { + next_pthread = TAILQ_NEXT(pthread, pqe); /* Assume that this state does not time out: */ settimeout = 0; @@ -569,6 +591,7 @@ */ PTHREAD_WAITQ_REMOVE(pthread); PTHREAD_PRIOQ_INSERT_TAIL(pthread); + wait_reqd = 0; break; /* File descriptor read wait: */ @@ -809,6 +832,13 @@ */ _thread_kern_in_select = 1; + /* Just poll if a thread has been made runnable by a signal */ + if (_waitingq_check_reqd != 0) { + tv.tv_sec = 0; + tv.tv_usec = 0; + p_tv = &tv; + } + /* * Wait for a file descriptor to be ready for read, write, or * an exception, or a timeout to occur: @@ -877,7 +907,10 @@ * descriptors that are flagged as available by the * _select syscall: */ - TAILQ_FOREACH (pthread, &_waitingq, pqe) { + for (pthread = TAILQ_FIRST(&_waitingq); + pthread != NULL; + pthread = next_pthread) { + next_pthread = TAILQ_NEXT(pthread, pqe); /* Process according to thread state: */ switch (pthread->state) { /* @@ -1229,6 +1262,7 @@ _thread_kern_sched_undefer(void) { pthread_t pthread; + pthread_t next_pthread; int need_resched = 0; /* @@ -1244,7 +1278,9 @@ /* Clear the flag before checking the waiting queue: */ _waitingq_check_reqd = 0; - TAILQ_FOREACH(pthread, &_waitingq, pqe) { + for (pthread = TAILQ_FIRST(&_waitingq); + pthread != NULL; pthread = next_pthread) { + next_pthread = TAILQ_NEXT(pthread, pqe); if (pthread->state == PS_RUNNING) { PTHREAD_WAITQ_REMOVE(pthread); PTHREAD_PRIOQ_INSERT_TAIL(pthread); Index: uthread/uthread_sig.c =================================================================== RCS file: /home/ncvs/src/lib/libc_r/uthread/uthread_sig.c,v retrieving revision 1.15.2.1 diff -u -r1.15.2.1 uthread_sig.c --- uthread_sig.c 1999/05/09 07:48:49 1.15.2.1 +++ uthread_sig.c 1999/05/13 17:05:04 @@ -89,6 +89,7 @@ char c; int i; pthread_t pthread; + pthread_t next_pthread; /* * Check if the pthread kernel has unblocked signals (or is about to) @@ -196,20 +197,48 @@ * if there are multiple waiters, we'll give it to the * first one we find. */ - TAILQ_FOREACH(pthread, &_waitingq, pqe) { - if ((pthread->state == PS_SIGWAIT) && - sigismember(pthread->data.sigwait, sig)) { - /* Change the state of the thread to run: */ - PTHREAD_SIG_NEW_STATE(pthread,PS_RUNNING); - - /* Return the signal number: */ - pthread->signo = sig; - - /* - * Do not attempt to deliver this signal - * to other threads. - */ - return; + if (_thread_run->sched_defer_count != 0 || + _thread_kern_in_sched != 0) { + /* + * Waiting list might be inconsistent, thus we + * have to scan the whole thread list. + */ + for (pthread = _thread_link_list; + pthread != NULL; + pthread = pthread->nxt) { + if ((pthread->state == PS_SIGWAIT) && + sigismember(pthread->data.sigwait, sig)) { + /* Change the state of the thread to run: */ + PTHREAD_SIG_NEW_STATE(pthread,PS_RUNNING); + + /* Return the signal number: */ + pthread->signo = sig; + + /* + * Do not attempt to deliver this signal + * to other threads. + */ + return; + } + } + } else { + for (pthread = TAILQ_FIRST(&_waitingq); + pthread != NULL; pthread = next_pthread) { + next_pthread = TAILQ_NEXT(pthread, pqe); + if ((pthread->state == PS_SIGWAIT) && + sigismember(pthread->data.sigwait, sig)) { + /* Change the state of the thread to run: */ + PTHREAD_SIG_NEW_STATE(pthread,PS_RUNNING); + + /* Return the signal number: */ + pthread->signo = sig; + + /* + * Do not attempt to deliver this signal + * to other threads. + */ + return; + } } } @@ -223,6 +252,10 @@ pthread = pthread->nxt) { pthread_t pthread_saved = _thread_run; + /* Current thread inside critical region ? */ + if (_thread_run->sched_defer_count > 0) + pthread->sched_defer_count++; + _thread_run = pthread; _thread_signal(pthread,sig); @@ -232,6 +265,10 @@ */ _dispatch_signals(); _thread_run = pthread_saved; + + /* Current thread inside critical region ? */ + if (_thread_run->sched_defer_count > 0) + pthread->sched_defer_count--; } } >Release-Note: >Audit-Trail: >Unformatted: To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-bugs" in the body of the message