Date: Wed, 21 Apr 2004 12:34:35 -0400 From: "Brian F. Feldman" <green@FreeBSD.org> To: Eivind Eklund <eivind@FreeBSD.org> Cc: arch@FreeBSD.org Subject: Re: stable kqueue locking up and running on SMP Message-ID: <200404211634.i3LGYZsr046745@green.homeunix.org> In-Reply-To: Message from Eivind Eklund <eivind@FreeBSD.org> of "Wed, 21 Apr 2004 09:13:33 -0000." <20040421091333.GA4518@FreeBSD.org>
next in thread | previous in thread | raw e-mail | index | archive | help
Eivind Eklund <eivind@FreeBSD.org> wrote: > On Wed, Apr 21, 2004 at 12:39:52AM -0400, Brian Fundakowski Feldman wrote: > > what is expected. The only missing feature is the ill-conceived NOTE_TRACK. > > I do not think that returning for that one note type EINVAL is a deal-breaker > > when trying to make kqueue not suck for 5.3; it is far more useful as a > > novelty than utility, especially considering I've never seen mention of it > > in actual software that uses kqueue. > > There are no interesting references to it in Google - the only > indication that anybody has ever used it is a bugfix: > > https://citadelle.intrinsec.com/mailing/current/HTML/ml_openbsd-bugs/1801.html That's good news, then. It seems my IP changed, so until the DNS updates, here are the contents of the latest patch: Index: sys/cam/scsi/scsi_target.c =================================================================== RCS file: /usr/ncvs/src/sys/cam/scsi/scsi_target.c,v retrieving revision 1.60 diff -u -r1.60 scsi_target.c --- sys/cam/scsi/scsi_target.c 21 Feb 2004 21:10:39 -0000 1.60 +++ sys/cam/scsi/scsi_target.c 21 Apr 2004 03:24:44 -0000 @@ -336,9 +336,7 @@ softc = (struct targ_softc *)dev->si_drv1; kn->kn_hook = (caddr_t)softc; kn->kn_fop = &targread_filtops; - TARG_LOCK(softc); - SLIST_INSERT_HEAD(&softc->read_select.si_note, kn, kn_selnext); - TARG_UNLOCK(softc); + KLIST_INSERT(&softc->read_select.si_note, kn); return (0); } @@ -348,9 +346,7 @@ struct targ_softc *softc; softc = (struct targ_softc *)kn->kn_hook; - TARG_LOCK(softc); - SLIST_REMOVE(&softc->read_select.si_note, kn, knote, kn_selnext); - TARG_UNLOCK(softc); + KLIST_REMOVE(&softc->read_select.si_note, kn); } /* Notify the user's kqueue when the user queue or abort queue gets a CCB */ Index: sys/fs/fifofs/fifo_vnops.c =================================================================== RCS file: /usr/ncvs/src/sys/fs/fifofs/fifo_vnops.c,v retrieving revision 1.93 diff -u -r1.93 fifo_vnops.c --- sys/fs/fifofs/fifo_vnops.c 7 Apr 2004 20:45:59 -0000 1.93 +++ sys/fs/fifofs/fifo_vnops.c 21 Apr 2004 03:30:46 -0000 @@ -432,7 +432,7 @@ ap->a_kn->kn_hook = (caddr_t)so; - SLIST_INSERT_HEAD(&sb->sb_sel.si_note, ap->a_kn, kn_selnext); + KLIST_INSERT(&sb->sb_sel.si_note, ap->a_kn); sb->sb_flags |= SB_KNOTE; return (0); @@ -443,9 +443,7 @@ { struct socket *so = (struct socket *)kn->kn_hook; - SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); - if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) - so->so_rcv.sb_flags &= ~SB_KNOTE; + KLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn); } static int @@ -467,9 +465,7 @@ { struct socket *so = (struct socket *)kn->kn_hook; - SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); - if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) - so->so_snd.sb_flags &= ~SB_KNOTE; + KLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn); } static int Index: sys/gnu/ext2fs/ext2_vnops.c =================================================================== RCS file: /usr/ncvs/src/sys/gnu/ext2fs/ext2_vnops.c,v retrieving revision 1.83 diff -u -r1.83 ext2_vnops.c --- sys/gnu/ext2fs/ext2_vnops.c 7 Apr 2004 20:46:03 -0000 1.83 +++ sys/gnu/ext2fs/ext2_vnops.c 21 Apr 2004 03:33:27 -0000 @@ -1894,9 +1894,7 @@ if (vp->v_pollinfo == NULL) v_addpollinfo(vp); - mtx_lock(&vp->v_pollinfo->vpi_lock); - SLIST_INSERT_HEAD(&vp->v_pollinfo->vpi_selinfo.si_note, kn, kn_selnext); - mtx_unlock(&vp->v_pollinfo->vpi_lock); + KLIST_INSERT(&vp->v_pollinfo->vpi_selinfo.si_note, kn); return (0); } @@ -1907,10 +1905,7 @@ struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Mising v_pollinfo")); - mtx_lock(&vp->v_pollinfo->vpi_lock); - SLIST_REMOVE(&vp->v_pollinfo->vpi_selinfo.si_note, - kn, knote, kn_selnext); - mtx_unlock(&vp->v_pollinfo->vpi_lock); + KLIST_REMOVE(&vp->v_pollinfo->vpi_selinfo.si_note, kn); } /*ARGSUSED*/ Index: sys/kern/kern_event.c =================================================================== RCS file: /usr/ncvs/src/sys/kern/kern_event.c,v retrieving revision 1.68 diff -u -r1.68 kern_event.c --- sys/kern/kern_event.c 7 Apr 2004 05:59:57 -0000 1.68 +++ sys/kern/kern_event.c 21 Apr 2004 03:52:37 -0000 @@ -55,9 +55,9 @@ MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); -static int kqueue_scan(struct file *fp, int maxevents, - struct kevent *ulistp, const struct timespec *timeout, - struct thread *td); +static int kqueue_scan(struct file *fp, struct kevent *kq_kev, + int maxevents, struct kevent *ulistp, + const struct timespec *timeout, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static fo_rdwr_t kqueue_read; @@ -78,16 +78,17 @@ .fo_close = kqueue_close, }; +static void knote_unlocked(struct klist *list, long hint); static void knote_attach(struct knote *kn, struct filedesc *fdp); -static void knote_drop(struct knote *kn, struct thread *td); +static void knote_drop(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(void); static void knote_free(struct knote *kn); - static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); + static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); @@ -106,6 +107,7 @@ static struct filterops timer_filtops = { 0, filt_timerattach, filt_timerdetach, filt_timer }; +static struct mtx kq_Giant; static uma_zone_t knote_zone; static int kq_ncallouts = 0; static int kq_calloutmax = (4 * 1024); @@ -154,6 +156,10 @@ return (fo_kqfilter(kn->kn_fp, kn)); } +/* + * We allow kqueues to be on kqueues. This is unsafe for any locking + * more complicated than kq_Giant unless we add kqueue "holding." + */ /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) @@ -164,7 +170,7 @@ return (1); kn->kn_fop = &kqread_filtops; - SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext); + KLIST_INSERT(&kq->kq_sel.si_note, kn); return (0); } @@ -173,7 +179,7 @@ { struct kqueue *kq = kn->kn_fp->f_data; - SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext); + KLIST_REMOVE(&kq->kq_sel.si_note, kn); } /*ARGSUSED*/ @@ -193,6 +199,19 @@ int immediate; int error; + /* + * Here be dragons. We are already called with various locks held + * once we get to filt_proc() and it is essentially impossible + * to take our unknown process-related locks and knote locks and + * then both allocate the necessary memory for the child knotes + * and lock the various struct filedescs in order to attach them + * correctly. In order to allow this sort of (bad) behavior, + * we would want to do something like moving all of the knotes + * out of the process filedescs and into a global knote tree, + * and even then, memory allocation is problematic. + */ + if (kn->kn_sfflags & NOTE_TRACK) + return (EINVAL); immediate = 0; p = pfind(kn->kn_id); if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { @@ -219,15 +238,15 @@ } if (immediate == 0) - SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); + KLIST_INSERT(&p->p_klist, kn); /* * Immediately activate any exit notes if the target process is a * zombie. This is necessary to handle the case where the target * process, e.g. a child, dies before the kevent is registered. */ - if (immediate && filt_proc(kn, NOTE_EXIT)) - KNOTE_ACTIVATE(kn); + if (immediate) + (void)filt_proc(kn, NOTE_EXIT); PROC_UNLOCK(p); @@ -250,9 +269,7 @@ if (kn->kn_status & KN_DETACHED) return; - PROC_LOCK(p); - SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); - PROC_UNLOCK(p); + KLIST_REMOVE(&p->p_klist, kn); } static int @@ -280,6 +297,7 @@ return (1); } +#ifdef notyet /* * process forked, and user wants to track the new process, * so attach a new knote to it, and immediately report an @@ -302,6 +320,7 @@ if (error) kn->kn_fflags |= NOTE_TRACKERR; } +#endif return (kn->kn_fflags != 0); } @@ -314,6 +333,7 @@ struct timeval tv; int tticks; + mtx_lock(&kq_Giant); kn->kn_data++; KNOTE_ACTIVATE(kn); @@ -324,6 +344,7 @@ calloutp = (struct callout *)kn->kn_hook; callout_reset(calloutp, tticks, filt_timerexpire, kn); } + mtx_unlock(&kq_Giant); } /* @@ -383,13 +404,13 @@ struct file *fp; int fd, error; - mtx_lock(&Giant); fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) - goto done2; + return (error); /* An extra reference on `nfp' has been held for us by falloc(). */ kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); + mtx_lock(&kq_Giant); TAILQ_INIT(&kq->kq_head); FILE_LOCK(fp); fp->f_flag = FREAD | FWRITE; @@ -404,8 +425,7 @@ fdp->fd_knlistsize = 0; /* this process has a kq */ FILEDESC_UNLOCK(fdp); kq->kq_fdp = fdp; -done2: - mtx_unlock(&Giant); + mtx_unlock(&kq_Giant); return (error); } @@ -425,7 +445,7 @@ int kevent(struct thread *td, struct kevent_args *uap) { - struct kevent *kevp; + struct kevent *kevp, kq_kev[KQ_NEVENTS]; struct kqueue *kq; struct file *fp; struct timespec ts; @@ -440,22 +460,21 @@ if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) - goto done_nogiant; + goto done; uap->timeout = &ts; } - mtx_lock(&Giant); kq = fp->f_data; nerrors = 0; while (uap->nchanges > 0) { n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges; - error = copyin(uap->changelist, kq->kq_kev, + error = copyin(uap->changelist, kq_kev, n * sizeof(struct kevent)); if (error) goto done; for (i = 0; i < n; i++) { - kevp = &kq->kq_kev[i]; + kevp = &kq_kev[i]; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td); if (error) { @@ -482,10 +501,9 @@ goto done; } - error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td); + error = kqueue_scan(fp, kq_kev, uap->nevents, uap->eventlist, + uap->timeout, td); done: - mtx_unlock(&Giant); -done_nogiant: if (fp != NULL) fdrop(fp, td); return (error); @@ -495,6 +513,7 @@ kqueue_add_filteropts(int filt, struct filterops *filtops) { + mtx_lock(&kq_Giant); if (filt > 0) panic("filt(%d) > 0", filt); if (filt + EVFILT_SYSCOUNT < 0) @@ -503,6 +522,7 @@ if (sysfilt_ops[~filt] != &null_filtops) panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); sysfilt_ops[~filt] = filtops; + mtx_unlock(&kq_Giant); return (0); } @@ -510,6 +530,7 @@ kqueue_del_filteropts(int filt) { + mtx_lock(&kq_Giant); if (filt > 0) panic("filt(%d) > 0", filt); if (filt + EVFILT_SYSCOUNT < 0) @@ -518,6 +539,7 @@ if (sysfilt_ops[~filt] == &null_filtops) panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); sysfilt_ops[~filt] = &null_filtops; + mtx_unlock(&kq_Giant); return (0); } @@ -528,7 +550,7 @@ struct filterops *fops; struct file *fp = NULL; struct knote *kn = NULL; - int s, error = 0; + int error = 0; if (kev->filter < 0) { if (kev->filter + EVFILT_SYSCOUNT < 0) @@ -537,19 +559,20 @@ } else { /* * XXX - * filter attach routine is responsible for insuring that + * filter attach routine is responsible for ensuring that * the identifier can be attached to it. */ - printf("unknown filter: %d\n", kev->filter); return (EINVAL); } + mtx_lock(&kq_Giant); FILEDESC_LOCK(fdp); if (fops->f_isfd) { /* validate descriptor */ if ((u_int)kev->ident >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[kev->ident]) == NULL) { FILEDESC_UNLOCK(fdp); + mtx_unlock(&kq_Giant); return (EBADF); } fhold(fp); @@ -586,6 +609,7 @@ if (kev->flags & EV_ADD) { if (kn == NULL) { + mtx_unlock(&kq_Giant); kn = knote_alloc(); if (kn == NULL) { error = ENOMEM; @@ -599,19 +623,21 @@ * apply reference count to knote structure, and * do not release it at the end of this routine. */ - fp = NULL; - kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; + kn->kn_status = 0; - knote_attach(kn, fdp); - if ((error = fops->f_attach(kn)) != 0) { - knote_drop(kn, td); + error = fops->f_attach(kn); + if (error) { + knote_free(kn); goto done; } + fp = NULL; + knote_attach(kn, fdp); + mtx_lock(&kq_Giant); } else { /* * The user may change some filter values after the @@ -623,33 +649,27 @@ kn->kn_kevent.udata = kev->udata; } - s = splhigh(); - if (kn->kn_fop->f_event(kn, 0)) + if (kn->kn_status & KN_DETACHED || kn->kn_fop->f_event(kn, 0)) KNOTE_ACTIVATE(kn); - splx(s); } else if (kev->flags & EV_DELETE) { - kn->kn_fop->f_detach(kn); knote_drop(kn, td); goto done; } if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0)) { - s = splhigh(); kn->kn_status |= KN_DISABLED; - splx(s); } if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { - s = splhigh(); kn->kn_status &= ~KN_DISABLED; if ((kn->kn_status & KN_ACTIVE) && ((kn->kn_status & KN_QUEUED) == 0)) knote_enqueue(kn); - splx(s); } + mtx_unlock(&kq_Giant); done: if (fp != NULL) fdrop(fp, td); @@ -657,27 +677,27 @@ } static int -kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, - const struct timespec *tsp, struct thread *td) +kqueue_scan(struct file *fp, struct kevent *kq_kev, int maxevents, + struct kevent *ulistp, const struct timespec *tsp, struct thread *td) { struct kqueue *kq; struct kevent *kevp; struct timeval atv, rtv, ttv; - struct knote *kn, marker; - int s, count, timeout, nkev = 0, error = 0; + struct knote *kn, *marker; + int count, timeout, nkev = 0, error = 0; FILE_LOCK_ASSERT(fp, MA_NOTOWNED); kq = fp->f_data; count = maxevents; if (count == 0) - goto done; + goto done_nokev; if (tsp != NULL) { TIMESPEC_TO_TIMEVAL(&atv, tsp); if (itimerfix(&atv)) { error = EINVAL; - goto done; + goto done_nokev; } if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) timeout = -1; @@ -691,6 +711,18 @@ atv.tv_usec = 0; timeout = 0; } + /* + * Attach a marker knote to signify the end of the kqueue. + * The (per-thread) markers have kn_filters of 0, signifying + * that they were contrived by kqueue_scan() and that they + * could not have been created with kqueue_register(). + */ + marker = knote_alloc(); + if (marker == NULL) { + error = ENOMEM; + goto done_nokev; + } + mtx_lock(&kq_Giant); goto start; retry: @@ -705,16 +737,15 @@ } start: - kevp = kq->kq_kev; - s = splhigh(); + kevp = kq_kev; if (kq->kq_count == 0) { if (timeout < 0) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; - error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout); + error = msleep(kq, &kq_Giant, PSOCK | PCATCH, + "kqread", timeout); } - splx(s); if (error == 0) goto retry; /* don't restart after signals... */ @@ -725,16 +756,19 @@ goto done; } - TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); + TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); while (count) { kn = TAILQ_FIRST(&kq->kq_head); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); - if (kn == &marker) { - splx(s); + if (kn == marker) { if (count == maxevents) goto retry; goto done; } + if (kn->kn_filter == 0) { /* skip other threads' markers */ + TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); + continue; + } if (kn->kn_status & KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; @@ -752,10 +786,8 @@ if (kn->kn_flags & EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; - splx(s); - kn->kn_fop->f_detach(kn); knote_drop(kn, td); - s = splhigh(); + mtx_lock(&kq_Giant); } else if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; @@ -766,23 +798,25 @@ } count--; if (nkev == KQ_NEVENTS) { - splx(s); - error = copyout(&kq->kq_kev, ulistp, + mtx_unlock(&kq_Giant); + error = copyout(kq_kev, ulistp, sizeof(struct kevent) * nkev); ulistp += nkev; nkev = 0; - kevp = kq->kq_kev; - s = splhigh(); + kevp = kq_kev; + mtx_lock(&kq_Giant); if (error) break; } } - TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); - splx(s); + TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: + knote_free(marker); + mtx_unlock(&kq_Giant); if (nkev != 0) - error = copyout(&kq->kq_kev, ulistp, + error = copyout(kq_kev, ulistp, sizeof(struct kevent) * nkev); +done_nokev: td->td_retval[0] = maxevents - count; return (error); } @@ -822,18 +856,18 @@ { struct kqueue *kq; int revents = 0; - int s = splnet(); kq = fp->f_data; if (events & (POLLIN | POLLRDNORM)) { + mtx_lock(&kq_Giant); if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); kq->kq_state |= KQ_SEL; } + mtx_unlock(&kq_Giant); } - splx(s); return (revents); } @@ -846,7 +880,9 @@ kq = fp->f_data; bzero((void *)st, sizeof(*st)); + mtx_lock(&kq_Giant); st->st_size = kq->kq_count; + mtx_unlock(&kq_Giant); st->st_blksize = sizeof(struct kevent); st->st_mode = S_IFIFO; return (0); @@ -858,46 +894,35 @@ { struct kqueue *kq = fp->f_data; struct filedesc *fdp = kq->kq_fdp; - struct knote **knp, *kn, *kn0; + struct knote *kn; int i; +again: + mtx_lock(&kq_Giant); FILEDESC_LOCK(fdp); for (i = 0; i < fdp->fd_knlistsize; i++) { - knp = &SLIST_FIRST(&fdp->fd_knlist[i]); - kn = *knp; + kn = SLIST_FIRST(&fdp->fd_knlist[i]); while (kn != NULL) { - kn0 = SLIST_NEXT(kn, kn_link); if (kq == kn->kn_kq) { - kn->kn_fop->f_detach(kn); - *knp = kn0; - FILE_LOCK(kn->kn_fp); FILEDESC_UNLOCK(fdp); - fdrop_locked(kn->kn_fp, td); - knote_free(kn); - FILEDESC_LOCK(fdp); + knote_drop(kn, td); + goto again; } else { - knp = &SLIST_NEXT(kn, kn_link); + kn = SLIST_NEXT(kn, kn_link); } - kn = kn0; } } if (fdp->fd_knhashmask != 0) { for (i = 0; i < fdp->fd_knhashmask + 1; i++) { - knp = &SLIST_FIRST(&fdp->fd_knhash[i]); - kn = *knp; + kn = SLIST_FIRST(&fdp->fd_knhash[i]); while (kn != NULL) { - kn0 = SLIST_NEXT(kn, kn_link); if (kq == kn->kn_kq) { - kn->kn_fop->f_detach(kn); - *knp = kn0; - /* XXX non-fd release of kn->kn_ptr */ FILEDESC_UNLOCK(fdp); - knote_free(kn); - FILEDESC_LOCK(fdp); + knote_drop(kn, td); + goto again; } else { - knp = &SLIST_NEXT(kn, kn_link); + kn = SLIST_NEXT(kn, kn_link); } - kn = kn0; } } } @@ -906,6 +931,7 @@ kq->kq_state &= ~KQ_SEL; selwakeuppri(&kq->kq_sel, PSOCK); } + mtx_unlock(&kq_Giant); free(kq, M_KQUEUE); fp->f_data = NULL; @@ -924,7 +950,18 @@ kq->kq_state &= ~KQ_SEL; selwakeuppri(&kq->kq_sel, PSOCK); } - KNOTE(&kq->kq_sel.si_note, 0); + knote_unlocked(&kq->kq_sel.si_note, 0); +} + +/* unlocked version for use with recursion */ +static void +knote_unlocked(struct klist *list, long hint) +{ + struct knote *kn; + + SLIST_FOREACH(kn, list, kn_selnext) + if (kn->kn_fop->f_event(kn, hint)) + KNOTE_ACTIVATE(kn); } /* @@ -933,11 +970,25 @@ void knote(struct klist *list, long hint) { - struct knote *kn; + mtx_lock(&kq_Giant); + knote_unlocked(list, hint); + mtx_unlock(&kq_Giant); +} - SLIST_FOREACH(kn, list, kn_selnext) - if (kn->kn_fop->f_event(kn, hint)) - KNOTE_ACTIVATE(kn); +void +knote_list_insert(struct klist *list, struct knote *kn) +{ + mtx_lock(&kq_Giant); + SLIST_INSERT_HEAD(list, kn, kn_selnext); + mtx_unlock(&kq_Giant); +} + +void +knote_list_remove(struct klist *list, struct knote *kn) +{ + mtx_lock(&kq_Giant); + SLIST_REMOVE(list, kn, knote, kn_selnext); + mtx_unlock(&kq_Giant); } /* @@ -948,10 +999,12 @@ { struct knote *kn; + mtx_lock(&kq_Giant); while ((kn = SLIST_FIRST(list)) != NULL) { - kn->kn_fop->f_detach(kn); knote_drop(kn, td); + mtx_lock(&kq_Giant); } + mtx_unlock(&kq_Giant); } /* @@ -976,13 +1029,16 @@ u_long tmp_knhashmask; int size; + mtx_lock(&kq_Giant); FILEDESC_LOCK(fdp); if (! kn->kn_fop->f_isfd) { if (fdp->fd_knhashmask == 0) { FILEDESC_UNLOCK(fdp); + mtx_unlock(&kq_Giant); tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask); + mtx_lock(&kq_Giant); FILEDESC_LOCK(fdp); if (fdp->fd_knhashmask == 0) { fdp->fd_knhash = tmp_knhash; @@ -1000,8 +1056,10 @@ while (size <= kn->kn_id) size += KQEXTENT; FILEDESC_UNLOCK(fdp); + mtx_unlock(&kq_Giant); MALLOC(list, struct klist *, size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); + mtx_lock(&kq_Giant); FILEDESC_LOCK(fdp); if (fdp->fd_knlistsize > kn->kn_id) { FREE(list, M_KQUEUE); @@ -1023,12 +1081,12 @@ done: FILEDESC_UNLOCK(fdp); SLIST_INSERT_HEAD(list, kn, kn_link); - kn->kn_status = 0; + mtx_unlock(&kq_Giant); } /* - * should be called at spl == 0, since we don't want to hold spl - * while calling fdrop and free. + * Release our reference to the knote and object, dropping our kqueue lock. + * The knote is always detached and freed afterward. */ static void knote_drop(struct knote *kn, struct thread *td) @@ -1041,15 +1099,15 @@ list = &fdp->fd_knlist[kn->kn_id]; else list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; - if (kn->kn_fop->f_isfd) - FILE_LOCK(kn->kn_fp); FILEDESC_UNLOCK(fdp); SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); + mtx_unlock(&kq_Giant); + kn->kn_fop->f_detach(kn); if (kn->kn_fop->f_isfd) - fdrop_locked(kn->kn_fp, td); + fdrop(kn->kn_fp, td); knote_free(kn); } @@ -1058,14 +1116,12 @@ knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; - int s = splhigh(); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; - splx(s); kqueue_wakeup(kq); } @@ -1073,14 +1129,12 @@ knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; - int s = splhigh(); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; - splx(s); } static void @@ -1088,9 +1142,10 @@ { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + mtx_init(&kq_Giant, "global kqueue lock", NULL, MTX_DEF); } -SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) +SYSINIT(knote, SI_SUB_VM_CONF, SI_ORDER_ANY, knote_init, NULL) static struct knote * knote_alloc(void) Index: sys/kern/kern_exit.c =================================================================== RCS file: /usr/ncvs/src/sys/kern/kern_exit.c,v retrieving revision 1.229 diff -u -r1.229 kern_exit.c --- sys/kern/kern_exit.c 5 Apr 2004 21:03:34 -0000 1.229 +++ sys/kern/kern_exit.c 21 Apr 2004 03:53:36 -0000 @@ -442,19 +442,19 @@ calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); mtx_unlock_spin(&sched_lock); ruadd(p->p_ru, &p->p_stats->p_cru); + mtx_unlock(&Giant); /* * Notify interested parties of our demise. */ KNOTE(&p->p_klist, NOTE_EXIT); - mtx_unlock(&Giant); /* * Just delete all entries in the p_klist. At this point we won't * report any more events, and there are nasty race conditions that - * can beat us if we don't. + * can beat us if we don't; the KNOTE(NOTE_EXIT) already detached + * them. */ - while (SLIST_FIRST(&p->p_klist)) - SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext); + knote_remove(td, &p->p_klist); /* * Notify parent that we're gone. If parent has the PS_NOCLDWAIT Index: sys/kern/kern_fork.c =================================================================== RCS file: /usr/ncvs/src/sys/kern/kern_fork.c,v retrieving revision 1.226 diff -u -r1.226 kern_fork.c --- sys/kern/kern_fork.c 5 Apr 2004 21:03:34 -0000 1.226 +++ sys/kern/kern_fork.c 20 Apr 2004 23:52:45 -0000 @@ -715,15 +715,12 @@ /* * Now can be swapped. */ - PROC_LOCK(p1); - _PRELE(p1); + PRELE(p1); /* * Tell any interested parties about the new process. */ KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); - - PROC_UNLOCK(p1); /* * Preserve synchronization semantics of vfork. If waiting for Index: sys/kern/kern_sig.c =================================================================== RCS file: /usr/ncvs/src/sys/kern/kern_sig.c,v retrieving revision 1.276 diff -u -r1.276 kern_sig.c --- sys/kern/kern_sig.c 12 Apr 2004 15:56:05 -0000 1.276 +++ sys/kern/kern_sig.c 21 Apr 2004 03:36:33 -0000 @@ -1677,7 +1677,11 @@ ps = p->p_sigacts; PROC_LOCK_ASSERT(p, MA_OWNED); + _PHOLD(p); + PROC_UNLOCK(p); KNOTE(&p->p_klist, NOTE_SIGNAL | sig); + PROC_LOCK(p); + _PRELE(p); prop = sigprop(sig); @@ -2681,9 +2685,7 @@ kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ - PROC_LOCK(p); - SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); - PROC_UNLOCK(p); + KLIST_INSERT(&p->p_klist, kn); return (0); } @@ -2693,9 +2695,7 @@ { struct proc *p = kn->kn_ptr.p_proc; - PROC_LOCK(p); - SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); - PROC_UNLOCK(p); + KLIST_REMOVE(&p->p_klist, kn); } /* Index: sys/kern/sys_pipe.c =================================================================== RCS file: /usr/ncvs/src/sys/kern/sys_pipe.c,v retrieving revision 1.171 diff -u -r1.171 sys_pipe.c --- sys/kern/sys_pipe.c 27 Mar 2004 19:50:22 -0000 1.171 +++ sys/kern/sys_pipe.c 21 Apr 2004 03:38:41 -0000 @@ -475,7 +475,6 @@ } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); - KNOTE(&cpipe->pipe_sel.si_note, 0); } /* @@ -518,7 +517,7 @@ { struct pipe *rpipe = fp->f_data; int error; - int nread = 0; + int nread = 0, doknote = 0; u_int size; PIPE_LOCK(rpipe); @@ -664,10 +663,14 @@ } } - if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) + if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) { pipeselwakeup(rpipe); + doknote = 1; + } PIPE_UNLOCK(rpipe); + if (doknote) + KNOTE(&rpipe->pipe_sel.si_note, 0); return (error); } @@ -907,7 +910,7 @@ struct thread *td; int flags; { - int error = 0; + int error = 0, doknote = 0; int orig_resid; struct pipe *wpipe, *rpipe; @@ -1142,6 +1145,7 @@ * wake up select/poll. */ pipeselwakeup(wpipe); + doknote = 1; wpipe->pipe_state |= PIPE_WANTW; error = msleep(wpipe, PIPE_MTX(rpipe), @@ -1191,10 +1195,20 @@ * We have something to offer, * wake up select/poll. */ - if (wpipe->pipe_buffer.cnt) + if (wpipe->pipe_buffer.cnt) { pipeselwakeup(wpipe); + doknote = 1; + } + if (doknote) + pipelock(wpipe, 0); PIPE_UNLOCK(rpipe); + if (doknote) { + KNOTE(&wpipe->pipe_sel.si_note, 0); + PIPE_LOCK(rpipe); + pipeunlock(wpipe); + PIPE_UNLOCK(rpipe); + } return (error); } @@ -1446,7 +1460,11 @@ ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); + pipelock(ppipe, 0); + PIPE_UNLOCK(cpipe); KNOTE(&ppipe->pipe_sel.si_note, 0); + PIPE_LOCK(cpipe); + pipeunlock(ppipe); } /* @@ -1457,6 +1475,7 @@ */ pipelock(cpipe, 0); PIPE_UNLOCK(cpipe); + KNOTE(&cpipe->pipe_sel.si_note, 0); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = 0; @@ -1502,8 +1521,8 @@ return (1); } - SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext); PIPE_UNLOCK(cpipe); + KLIST_INSERT(&cpipe->pipe_sel.si_note, kn); return (0); } @@ -1520,8 +1539,8 @@ } cpipe = cpipe->pipe_peer; } - SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext); PIPE_UNLOCK(cpipe); + KLIST_REMOVE(&cpipe->pipe_sel.si_note, kn); } /*ARGSUSED*/ Index: sys/kern/tty.c =================================================================== RCS file: /usr/ncvs/src/sys/kern/tty.c,v retrieving revision 1.210 diff -u -r1.210 tty.c --- sys/kern/tty.c 7 Apr 2004 20:46:10 -0000 1.210 +++ sys/kern/tty.c 21 Apr 2004 03:39:14 -0000 @@ -1199,7 +1199,7 @@ kn->kn_hook = (caddr_t)dev; s = spltty(); - SLIST_INSERT_HEAD(klist, kn, kn_selnext); + KLIST_INSERT(klist, kn); splx(s); return (0); @@ -1211,7 +1211,7 @@ struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; int s = spltty(); - SLIST_REMOVE(&tp->t_rsel.si_note, kn, knote, kn_selnext); + KLIST_REMOVE(&tp->t_rsel.si_note, kn); splx(s); } @@ -1234,7 +1234,7 @@ struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; int s = spltty(); - SLIST_REMOVE(&tp->t_wsel.si_note, kn, knote, kn_selnext); + KLIST_REMOVE(&tp->t_wsel.si_note, kn); splx(s); } Index: sys/kern/uipc_socket.c =================================================================== RCS file: /usr/ncvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.170 diff -u -r1.170 uipc_socket.c --- sys/kern/uipc_socket.c 9 Apr 2004 13:23:51 -0000 1.170 +++ sys/kern/uipc_socket.c 21 Apr 2004 03:32:18 -0000 @@ -1821,7 +1821,7 @@ } s = splnet(); - SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext); + KLIST_INSERT(&sb->sb_sel.si_note, kn); sb->sb_flags |= SB_KNOTE; splx(s); return (0); @@ -1833,9 +1833,7 @@ struct socket *so = kn->kn_fp->f_data; int s = splnet(); - SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); - if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) - so->so_rcv.sb_flags &= ~SB_KNOTE; + KLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn); splx(s); } @@ -1866,9 +1864,7 @@ struct socket *so = kn->kn_fp->f_data; int s = splnet(); - SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); - if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) - so->so_snd.sb_flags &= ~SB_KNOTE; + KLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn); splx(s); } Index: sys/kern/vfs_aio.c =================================================================== RCS file: /usr/ncvs/src/sys/kern/vfs_aio.c,v retrieving revision 1.169 diff -u -r1.169 vfs_aio.c --- sys/kern/vfs_aio.c 14 Mar 2004 02:06:27 -0000 1.169 +++ sys/kern/vfs_aio.c 21 Apr 2004 03:40:11 -0000 @@ -2275,7 +2275,7 @@ return (EPERM); kn->kn_flags &= ~EV_FLAG1; - SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext); + KLIST_INSERT(&aiocbe->klist, kn); return (0); } @@ -2286,7 +2286,7 @@ { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; - SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext); + KLIST_REMOVE(&aiocbe->klist, kn); } /* kqueue filter function */ Index: sys/kern/vfs_subr.c =================================================================== RCS file: /usr/ncvs/src/sys/kern/vfs_subr.c,v retrieving revision 1.490 diff -u -r1.490 vfs_subr.c --- sys/kern/vfs_subr.c 11 Apr 2004 21:09:22 -0000 1.490 +++ sys/kern/vfs_subr.c 21 Apr 2004 04:10:58 -0000 @@ -3227,8 +3227,8 @@ struct vnode *vp; { - mtx_lock(&vp->v_pollinfo->vpi_lock); VN_KNOTE(vp, NOTE_REVOKE); + mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_events) { vp->v_pollinfo->vpi_events = 0; selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO); Index: sys/net/bpf.c =================================================================== RCS file: /usr/ncvs/src/sys/net/bpf.c,v retrieving revision 1.125 diff -u -r1.125 bpf.c --- sys/net/bpf.c 7 Apr 2004 20:46:11 -0000 1.125 +++ sys/net/bpf.c 21 Apr 2004 03:40:46 -0000 @@ -525,7 +525,6 @@ pgsigio(&d->bd_sigio, d->bd_sig, 0); selwakeuppri(&d->bd_sel, PRINET); - KNOTE(&d->bd_sel.si_note, 0); } static void @@ -541,6 +540,7 @@ bpf_wakeup(d); } BPFD_UNLOCK(d); + KNOTE(&d->bd_sel.si_note, 0); } static int @@ -1088,9 +1088,7 @@ kn->kn_fop = &bpfread_filtops; kn->kn_hook = d; - BPFD_LOCK(d); - SLIST_INSERT_HEAD(&d->bd_sel.si_note, kn, kn_selnext); - BPFD_UNLOCK(d); + KLIST_INSERT(&d->bd_sel.si_note, kn); return (0); } @@ -1101,9 +1099,7 @@ { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; - BPFD_LOCK(d); - SLIST_REMOVE(&d->bd_sel.si_note, kn, knote, kn_selnext); - BPFD_UNLOCK(d); + KLIST_REMOVE(&d->bd_sel.si_note, kn); } static int @@ -1158,6 +1154,7 @@ catchpacket(d, pkt, pktlen, slen, bcopy); } BPFD_UNLOCK(d); + KNOTE(&d->bd_sel.si_note, 0); } BPFIF_UNLOCK(bp); } @@ -1220,6 +1217,7 @@ catchpacket(d, (u_char *)m, pktlen, slen, bpf_mcopy); BPFD_UNLOCK(d); + KNOTE(&d->bd_sel.si_note, 0); } BPFIF_UNLOCK(bp); } @@ -1264,6 +1262,7 @@ catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_mcopy); BPFD_UNLOCK(d); + KNOTE(&d->bd_sel.si_note, 0); } BPFIF_UNLOCK(bp); } @@ -1480,6 +1479,7 @@ while ((d = bp->bif_dlist) != NULL) { bpf_detachd(d); + KNOTE(&d->bd_sel.si_note, 0); BPFD_LOCK(d); bpf_wakeup(d); BPFD_UNLOCK(d); Index: sys/net/if.c =================================================================== RCS file: /usr/ncvs/src/sys/net/if.c,v retrieving revision 1.190 diff -u -r1.190 if.c --- sys/net/if.c 19 Apr 2004 17:28:15 -0000 1.190 +++ sys/net/if.c 21 Apr 2004 03:41:54 -0000 @@ -209,8 +209,7 @@ kn->kn_hook = (caddr_t)klist; - /* XXX locking? */ - SLIST_INSERT_HEAD(klist, kn, kn_selnext); + KLIST_INSERT(klist, kn); return (0); } @@ -222,7 +221,7 @@ if (kn->kn_status & KN_DETACHED) return; - SLIST_REMOVE(klist, kn, knote, kn_selnext); + KLIST_REMOVE(klist, kn); } static int Index: sys/sys/event.h =================================================================== RCS file: /usr/ncvs/src/sys/sys/event.h,v retrieving revision 1.22 diff -u -r1.22 event.h --- sys/sys/event.h 2 Feb 2003 19:39:51 -0000 1.22 +++ sys/sys/event.h 21 Apr 2004 03:52:24 -0000 @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: src/sys/sys/event.h,v 1.22 2003/02/02 19:39:51 nectar Exp $ + * $FreeBSD$ */ #ifndef _SYS_EVENT_H_ @@ -128,6 +128,8 @@ #endif #define KNOTE(list, hint) if ((list) != NULL) knote(list, hint) +#define KLIST_INSERT(list, kn) knote_list_insert(list, kn) +#define KLIST_REMOVE(list, kn) knote_list_remove(list, kn) /* * Flag indicating hint is a signal. Used by EVFILT_SIGNAL, and also @@ -174,6 +176,8 @@ struct proc; extern void knote(struct klist *list, long hint); +extern void knote_list_insert(struct klist *list, struct knote *kn); +extern void knote_list_remove(struct klist *list, struct knote *kn); extern void knote_remove(struct thread *p, struct klist *list); extern void knote_fdclose(struct thread *p, int fd); extern int kqueue_register(struct kqueue *kq, Index: sys/sys/eventvar.h =================================================================== RCS file: /usr/ncvs/src/sys/sys/eventvar.h,v retrieving revision 1.4 diff -u -r1.4 eventvar.h --- sys/sys/eventvar.h 18 Jul 2000 19:31:48 -0000 1.4 +++ sys/sys/eventvar.h 20 Apr 2004 23:52:45 -0000 @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: src/sys/sys/eventvar.h,v 1.4 2000/07/18 19:31:48 jlemon Exp $ + * $FreeBSD: src/sys/sys/eventvar.h,v 1.3 2000/05/26 02:06:54 jake Exp $ */ #ifndef _SYS_EVENTVAR_H_ @@ -40,7 +40,6 @@ int kq_state; #define KQ_SEL 0x01 #define KQ_SLEEP 0x02 - struct kevent kq_kev[KQ_NEVENTS]; }; #endif /* !_SYS_EVENTVAR_H_ */ Index: sys/sys/socketvar.h =================================================================== RCS file: /usr/ncvs/src/sys/sys/socketvar.h,v retrieving revision 1.111 diff -u -r1.111 socketvar.h --- sys/sys/socketvar.h 7 Apr 2004 04:19:49 -0000 1.111 +++ sys/sys/socketvar.h 21 Apr 2004 03:30:29 -0000 @@ -118,7 +118,7 @@ #define SB_UPCALL 0x20 /* someone wants an upcall */ #define SB_NOINTR 0x40 /* operations not interruptible */ #define SB_AIO 0x80 /* AIO operations queued */ -#define SB_KNOTE 0x100 /* kernel note attached */ +#define SB_KNOTE 0x100 /* kernel note ever attached? */ void (*so_upcall)(struct socket *, void *, int); void *so_upcallarg; Index: sys/ufs/ufs/ufs_vnops.c =================================================================== RCS file: /usr/ncvs/src/sys/ufs/ufs/ufs_vnops.c,v retrieving revision 1.239 diff -u -r1.239 ufs_vnops.c --- sys/ufs/ufs/ufs_vnops.c 7 Apr 2004 03:47:20 -0000 1.239 +++ sys/ufs/ufs/ufs_vnops.c 21 Apr 2004 03:43:44 -0000 @@ -2620,9 +2620,7 @@ if (vp->v_pollinfo == NULL) v_addpollinfo(vp); - mtx_lock(&vp->v_pollinfo->vpi_lock); - SLIST_INSERT_HEAD(&vp->v_pollinfo->vpi_selinfo.si_note, kn, kn_selnext); - mtx_unlock(&vp->v_pollinfo->vpi_lock); + KLIST_INSERT(&vp->v_pollinfo->vpi_selinfo.si_note, kn); return (0); } @@ -2633,10 +2631,7 @@ struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Mising v_pollinfo")); - mtx_lock(&vp->v_pollinfo->vpi_lock); - SLIST_REMOVE(&vp->v_pollinfo->vpi_selinfo.si_note, - kn, knote, kn_selnext); - mtx_unlock(&vp->v_pollinfo->vpi_lock); + KLIST_REMOVE(&vp->v_pollinfo->vpi_selinfo.si_note, kn); } /*ARGSUSED*/ -- Brian Fundakowski Feldman \'[ FreeBSD ]''''''''''\ <> green@FreeBSD.org \ The Power to Serve! \ Opinions expressed are my own. \,,,,,,,,,,,,,,,,,,,,,,\
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200404211634.i3LGYZsr046745>