Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 2 May 2014 11:49:29 -0700
From:      Adrian Chadd <adrian@freebsd.org>
To:        Roman Divacky <rdivacky@freebsd.org>
Cc:        "svn-src-head@freebsd.org" <svn-src-head@freebsd.org>, "svn-src-all@freebsd.org" <svn-src-all@freebsd.org>, "src-committers@freebsd.org" <src-committers@freebsd.org>
Subject:   Re: svn commit: r255672 - in head/sys: amd64/linux32 compat/linux conf i386/linux kern modules/linux sys
Message-ID:  <CAJ-Vmo=EWwzeLrZzrEwdyj4RRcoJM5DQE98S=essx8wT9hk%2BOQ@mail.gmail.com>
In-Reply-To: <201309181756.r8IHu4qV052882@svn.freebsd.org>
References:  <201309181756.r8IHu4qV052882@svn.freebsd.org>

next in thread | previous in thread | raw e-mail | index | archive | help
Hi,

Why not just extend the kqueue data fields to 64 bits and have the
FreeBSD API copy in only 32 bits?



-a


On 18 September 2013 10:56, Roman Divacky <rdivacky@freebsd.org> wrote:
> Author: rdivacky
> Date: Wed Sep 18 17:56:04 2013
> New Revision: 255672
> URL: http://svnweb.freebsd.org/changeset/base/255672
>
> Log:
>   Implement epoll support in the Linuxulator. This is a tiny wrapper around
>   kqueue that implements a subset of epoll functionality. The kqueue user data
>   is 32 bits on i386, which is not enough for epoll user data, so this patch
>   overrides the kqueue fileops to maintain enough space in struct file.
>
>   Initial patch developed by me in 2007 and then extended and finished
>   by Yuri Victorovich.
>
>   Approved by:    re (delphij)
>   Sponsored by:   Google Summer of Code
>   Submitted by:   Yuri Victorovich <yuri at rawbw dot com>
>   Tested by:      Yuri Victorovich <yuri at rawbw dot com>
>
> Added:
>   head/sys/compat/linux/linux_epoll.c   (contents, props changed)
>   head/sys/compat/linux/linux_epoll.h   (contents, props changed)
> Modified:
>   head/sys/amd64/linux32/linux32_dummy.c
>   head/sys/amd64/linux32/syscalls.master
>   head/sys/conf/files.amd64
>   head/sys/conf/files.i386
>   head/sys/conf/files.pc98
>   head/sys/i386/linux/linux_dummy.c
>   head/sys/i386/linux/syscalls.master
>   head/sys/kern/kern_event.c
>   head/sys/modules/linux/Makefile
>   head/sys/sys/event.h
>   head/sys/sys/file.h
>   head/sys/sys/syscallsubr.h
>
> Modified: head/sys/amd64/linux32/linux32_dummy.c
> ==============================================================================
> --- head/sys/amd64/linux32/linux32_dummy.c      Wed Sep 18 17:28:19 2013        (r255671)
> +++ head/sys/amd64/linux32/linux32_dummy.c      Wed Sep 18 17:56:04 2013        (r255672)
> @@ -70,9 +70,6 @@ DUMMY(pivot_root);
>  DUMMY(mincore);
>  DUMMY(ptrace);
>  DUMMY(lookup_dcookie);
> -DUMMY(epoll_create);
> -DUMMY(epoll_ctl);
> -DUMMY(epoll_wait);
>  DUMMY(remap_file_pages);
>  DUMMY(timer_create);
>  DUMMY(timer_settime);
> @@ -129,7 +126,6 @@ DUMMY(timerfd_gettime);
>  /* linux 2.6.27: */
>  DUMMY(signalfd4);
>  DUMMY(eventfd2);
> -DUMMY(epoll_create1);
>  DUMMY(dup3);
>  DUMMY(inotify_init1);
>  /* linux 2.6.30: */
>
> Modified: head/sys/amd64/linux32/syscalls.master
> ==============================================================================
> --- head/sys/amd64/linux32/syscalls.master      Wed Sep 18 17:28:19 2013        (r255671)
> +++ head/sys/amd64/linux32/syscalls.master      Wed Sep 18 17:56:04 2013        (r255672)
> @@ -430,9 +430,11 @@
>  251    AUE_NULL        UNIMPL
>  252    AUE_EXIT        STD     { int linux_exit_group(int error_code); }
>  253    AUE_NULL        STD     { int linux_lookup_dcookie(void); }
> -254    AUE_NULL        STD     { int linux_epoll_create(void); }
> -255    AUE_NULL        STD     { int linux_epoll_ctl(void); }
> -256    AUE_NULL        STD     { int linux_epoll_wait(void); }
> +254    AUE_NULL        STD     { int linux_epoll_create(l_int size); }
> +255    AUE_NULL        STD     { int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \
> +                                       struct linux_epoll_event *event); }
> +256    AUE_NULL        STD     { int linux_epoll_wait(l_int epfd, struct linux_epoll_event *events, \
> +                                       l_int maxevents, l_int timeout); }
>  257    AUE_NULL        STD     { int linux_remap_file_pages(void); }
>  258    AUE_NULL        STD     { int linux_set_tid_address(int *tidptr); }
>  259    AUE_NULL        STD     { int linux_timer_create(void); }
> @@ -534,7 +536,7 @@
>  ; linux 2.6.27:
>  327    AUE_NULL        STD     { int linux_signalfd4(void); }
>  328    AUE_NULL        STD     { int linux_eventfd2(void); }
> -329    AUE_NULL        STD     { int linux_epoll_create1(void); }
> +329    AUE_NULL        STD     { int linux_epoll_create1(l_int flags); }
>  330    AUE_NULL        STD     { int linux_dup3(void); }
>  331    AUE_NULL        STD     { int linux_pipe2(l_int *pipefds, l_int flags); }
>  332    AUE_NULL        STD     { int linux_inotify_init1(void); }
>
> Added: head/sys/compat/linux/linux_epoll.c
> ==============================================================================
> --- /dev/null   00:00:00 1970   (empty, because file is newly added)
> +++ head/sys/compat/linux/linux_epoll.c Wed Sep 18 17:56:04 2013        (r255672)
> @@ -0,0 +1,554 @@
> +/*-
> + * Copyright (c) 2007 Roman Divacky
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +
> +#include <sys/cdefs.h>
> +__FBSDID("$FreeBSD$");
> +
> +#include "opt_compat.h"
> +#include "opt_ktrace.h"
> +
> +#include <sys/limits.h>
> +#include <sys/param.h>
> +#include <sys/kernel.h>
> +#include <sys/capability.h>
> +#include <sys/types.h>
> +#include <sys/systm.h>
> +#include <sys/file.h>
> +#include <sys/filedesc.h>
> +#include <sys/errno.h>
> +#include <sys/event.h>
> +#include <sys/proc.h>
> +#include <sys/sysproto.h>
> +#include <sys/syscallsubr.h>
> +#include <sys/timespec.h>
> +#include <compat/linux/linux_epoll.h>
> +#include <compat/linux/linux_util.h>
> +#ifdef KTRACE
> +#include <sys/ktrace.h>
> +#endif
> +
> +#ifdef COMPAT_LINUX32
> +#include <machine/../linux32/linux.h>
> +#include <machine/../linux32/linux32_proto.h>
> +#else
> +#include <machine/../linux/linux.h>
> +#include <machine/../linux/linux_proto.h>
> +#endif
> +
> +#define ktrepoll_events(evt, count) \
> +       ktrstruct("linux_epoll_event", (evt), count * sizeof(*evt))
> +
> +/*
> + * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
> + * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
> + * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied
> + * data verbatim. Therefore on 32 bit architectures we allocate 64-bit memory
> + * block to pass user supplied data for every file descriptor.
> + */
> +typedef        uint64_t        epoll_udata_t;
> +#if defined(__i386__)
> +#define EPOLL_WIDE_USER_DATA   1
> +#else
> +#define EPOLL_WIDE_USER_DATA   0
> +#endif
> +
> +#if EPOLL_WIDE_USER_DATA
> +
> +/*
> + * An approach similar to epoll_user_data could also be used to
> + * keep track of event bits per file descriptor for all architectures.
> + * However, it isn't obvious that such tracking would be beneficial
> + * in practice.
> + */
> +
> +struct epoll_user_data {
> +       unsigned        sz;
> +       epoll_udata_t   data[1];
> +};
> +static MALLOC_DEFINE(M_LINUX_EPOLL, "epoll", "memory for epoll system");
> +#define        EPOLL_USER_DATA_SIZE(ndata) \
> +       (sizeof(struct epoll_user_data)+((ndata)-1)*sizeof(epoll_udata_t))
> +#define        EPOLL_USER_DATA_MARGIN  16
> +
> +static void epoll_init_user_data(struct thread *td, struct file *epfp);
> +static void epoll_set_user_data(struct thread *td, struct file *epfp, int fd, epoll_udata_t user_data);
> +static epoll_udata_t epoll_get_user_data(struct thread *td, struct file *epfp, int fd);
> +static fo_close_t epoll_close;
> +
> +/* overload kqueue fileops */
> +static struct fileops epollops = {
> +       .fo_read =      kqueue_read,
> +       .fo_write =     kqueue_write,
> +       .fo_truncate =  kqueue_truncate,
> +       .fo_ioctl =     kqueue_ioctl,
> +       .fo_poll =      kqueue_poll,
> +       .fo_kqfilter =  kqueue_kqfilter,
> +       .fo_stat =      kqueue_stat,
> +       .fo_close =     epoll_close,
> +       .fo_chmod =     invfo_chmod,
> +       .fo_chown =     invfo_chown,
> +       .fo_sendfile =  invfo_sendfile,
> +};
> +#endif
> +
> +static struct file* epoll_fget(struct thread *td, int epfd);
> +
> +struct epoll_copyin_args {
> +       struct kevent   *changelist;
> +};
> +
> +struct epoll_copyout_args {
> +       struct linux_epoll_event        *leventlist;
> +       int                             count;
> +       int                             error;
> +#if KTRACE || EPOLL_WIDE_USER_DATA
> +       struct thread                   *td;
> +#endif
> +#if EPOLL_WIDE_USER_DATA
> +       struct file                     *epfp;
> +#endif
> +};
> +
> +
> +/* Create a new epoll file descriptor. */
> +
> +static int
> +linux_epoll_create_common(struct thread *td)
> +{
> +       struct file *fp;
> +       int error;
> +
> +       error = kern_kqueue_locked(td, &fp);
> +#if EPOLL_WIDE_USER_DATA
> +       if (error == 0) {
> +               epoll_init_user_data(td, fp);
> +               fdrop(fp, td);
> +       }
> +#endif
> +       return (error);
> +}
> +
> +int
> +linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
> +{
> +       if (args->size <= 0)
> +               return (EINVAL);
> +       /* args->size is unused. Linux just tests it
> +        * and then forgets it as well. */
> +
> +       return (linux_epoll_create_common(td));
> +}
> +
> +int
> +linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
> +{
> +       int error;
> +
> +       error = linux_epoll_create_common(td);
> +
> +       if (!error) {
> +               if (args->flags & LINUX_EPOLL_CLOEXEC)
> +                       td->td_proc->p_fd->fd_ofiles[td->td_retval[0]].fde_flags |= UF_EXCLOSE;
> +               if (args->flags & LINUX_EPOLL_NONBLOCK)
> +                       linux_msg(td, "epoll_create1 doesn't yet support EPOLL_NONBLOCK flag\n");
> +       }
> +
> +       return (error);
> +}
> +
> +/* Structure converting function from epoll to kevent. */
> +static int
> +linux_epoll_to_kevent(struct thread *td,
> +#if EPOLL_WIDE_USER_DATA
> +       struct file *epfp,
> +#endif
> +       int fd, struct linux_epoll_event *l_event, int kev_flags, struct kevent *kevent, int *nkevents)
> +{
> +       /* flags related to how event is registered */
> +       if (l_event->events & LINUX_EPOLLONESHOT)
> +               kev_flags |= EV_ONESHOT;
> +       if (l_event->events & LINUX_EPOLLET) {
> +               kev_flags |= EV_CLEAR;
> +       }
> +
> +       /* flags related to what event is registered */
> +       if (l_event->events & LINUX_EPOLLIN ||
> +           l_event->events & LINUX_EPOLLRDNORM ||
> +           l_event->events & LINUX_EPOLLPRI ||
> +           l_event->events & LINUX_EPOLLRDHUP) {
> +               EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0,
> +                       (void*)(EPOLL_WIDE_USER_DATA ? 0 : l_event->data));
> +               ++*nkevents;
> +       }
> +       if (l_event->events & LINUX_EPOLLOUT ||
> +           l_event->events & LINUX_EPOLLWRNORM) {
> +               EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0,
> +                       (void*)(EPOLL_WIDE_USER_DATA ? 0 : l_event->data));
> +               ++*nkevents;
> +       }
> +       if (l_event->events & LINUX_EPOLLRDBAND ||
> +           l_event->events & LINUX_EPOLLWRBAND ||
> +           l_event->events & LINUX_EPOLLHUP ||
> +           l_event->events & LINUX_EPOLLMSG ||
> +           l_event->events & LINUX_EPOLLWAKEUP ||
> +           l_event->events & LINUX_EPOLLERR) {
> +               linux_msg(td, "epoll_ctl doesn't yet support some event flags supplied: 0x%x\n",
> +                       l_event->events);
> +               return (EINVAL);
> +       }
> +
> +#if EPOLL_WIDE_USER_DATA
> +       epoll_set_user_data(td, epfp, fd, l_event->data);
> +#endif
> +       return (0);
> +}
> +
> +/*
> + * Structure converting function from kevent to epoll. In a case
> + * this is called on error in registration we store the error in
> + * event->data and pick it up later in linux_epoll_ctl().
> + */
> +static void
> +linux_kevent_to_epoll(
> +#if EPOLL_WIDE_USER_DATA
> +       struct thread *td, struct file *epfp,
> +#endif
> +       struct kevent *kevent, struct linux_epoll_event *l_event)
> +{
> +       if ((kevent->flags & EV_ERROR) == 0)
> +               switch (kevent->filter) {
> +               case EVFILT_READ:
> +                       l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI;
> +               break;
> +               case EVFILT_WRITE:
> +                       l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM;
> +               break;
> +               }
> +#if EPOLL_WIDE_USER_DATA
> +       l_event->data = epoll_get_user_data(td, epfp, kevent->ident);
> +#else
> +       l_event->data = (epoll_udata_t)kevent->udata;
> +#endif
> +}
> +
> +/*
> + * Copyout callback used by kevent. This converts kevent
> + * events to epoll events and copies them back to the
> + * userspace. This is also called on error on registering
> + * of the filter.
> + */
> +static int
> +epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
> +{
> +       struct epoll_copyout_args *args;
> +       struct linux_epoll_event *eep;
> +       int error, i;
> +
> +       args = (struct epoll_copyout_args*) arg;
> +       eep = malloc(sizeof(*eep) * count, M_TEMP, M_WAITOK | M_ZERO);
> +
> +       for (i = 0; i < count; i++)
> +               linux_kevent_to_epoll(
> +#if EPOLL_WIDE_USER_DATA
> +                       args->td, args->epfp,
> +#endif
> +                       &kevp[i], &eep[i]);
> +
> +       error = copyout(eep, args->leventlist, count * sizeof(*eep));
> +       if (!error) {
> +               args->leventlist += count;
> +               args->count += count;
> +       } else if (!args->error)
> +               args->error = error;
> +
> +#ifdef KTRACE
> +       if (KTRPOINT(args->td, KTR_STRUCT))
> +               ktrepoll_events(eep, count);
> +#endif
> +
> +       free(eep, M_TEMP);
> +       return (error);
> +}
> +
> +/*
> + * Copyin callback used by kevent. This copies already
> + * converted filters from kernel memory to the kevent
> + * internal kernel memory. Hence the memcpy instead of
> + * copyin.
> + */
> +static int
> +epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
> +{
> +       struct epoll_copyin_args *args;
> +
> +       args = (struct epoll_copyin_args*) arg;
> +
> +       memcpy(kevp, args->changelist, count * sizeof(*kevp));
> +       args->changelist += count;
> +
> +       return (0);
> +}
> +
> +static int
> +ignore_enoent(int error) {
> +       if (error == ENOENT)
> +               error = 0;
> +       return (error);
> +}
> +
> +static int
> +delete_event(struct thread *td, struct file *epfp, int fd, int filter)
> +{
> +       struct epoll_copyin_args ciargs;
> +       struct kevent kev;
> +       struct kevent_copyops k_ops = { &ciargs,
> +                                       NULL,
> +                                       epoll_kev_copyin};
> +       ciargs.changelist = &kev;
> +
> +       EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);
> +       return (kern_kevent_locked(td, epfp, 1, 0, &k_ops, NULL));
> +}
> +
> +static int
> +delete_all_events(struct thread *td, struct file *epfp, int fd)
> +{
> +       /* here we ignore ENOENT, because we don't keep track of events here */
> +       int error1, error2;
> +
> +       error1 = ignore_enoent(delete_event(td, epfp, fd, EVFILT_READ));
> +       error2 = ignore_enoent(delete_event(td, epfp, fd, EVFILT_WRITE));
> +
> +       /* report any errors we got */
> +       if (error1)
> +               return (error1);
> +       if (error2)
> +               return (error2);
> +       return (0);
> +}
> +
> +/*
> + * Load epoll filter, convert it to kevent filter
> + * and load it into kevent subsystem.
> + */
> +int
> +linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
> +{
> +       struct file *epfp;
> +       struct epoll_copyin_args ciargs;
> +       struct kevent kev[2];
> +       struct kevent_copyops k_ops = { &ciargs,
> +                                       NULL,
> +                                       epoll_kev_copyin};
> +       struct linux_epoll_event le;
> +       int kev_flags;
> +       int nchanges = 0;
> +       int error;
> +
> +       if (args->epfd == args->fd)
> +               return (EINVAL);
> +
> +       if (args->op != LINUX_EPOLL_CTL_DEL) {
> +               error = copyin(args->event, &le, sizeof(le));
> +               if (error)
> +                       return (error);
> +       }
> +#ifdef DEBUG
> +       if (ldebug(epoll_ctl))
> +               printf(ARGS(epoll_ctl,"%i, %i, %i, %u"), args->epfd, args->op,
> +                       args->fd, le.events);
> +#endif
> +#ifdef KTRACE
> +       if (KTRPOINT(td, KTR_STRUCT) && args->op != LINUX_EPOLL_CTL_DEL)
> +               ktrepoll_events(&le, 1);
> +#endif
> +       epfp = epoll_fget(td, args->epfd);
> +
> +       ciargs.changelist = kev;
> +
> +       switch (args->op) {
> +       case LINUX_EPOLL_CTL_MOD:
> +                       /* we don't memorize which events were set for this FD
> +                          on this level, so just delete all we could have set:
> +                          EVFILT_READ and EVFILT_WRITE, ignoring ENOENT errors
> +                       */
> +                       error = delete_all_events(td, epfp, args->fd);
> +                       if (error)
> +                               goto leave;
> +               /* FALLTHROUGH */
> +       case LINUX_EPOLL_CTL_ADD:
> +                       kev_flags = EV_ADD | EV_ENABLE;
> +               break;
> +       case LINUX_EPOLL_CTL_DEL:
> +                       /* CTL_DEL means unregister this fd with this epoll */
> +                       error = delete_all_events(td, epfp, args->fd);
> +               goto leave;
> +       default:
> +               error = EINVAL;
> +               goto leave;
> +       }
> +
> +       error = linux_epoll_to_kevent(td,
> +#if EPOLL_WIDE_USER_DATA
> +               epfp,
> +#endif
> +               args->fd, &le, kev_flags, kev, &nchanges);
> +       if (error)
> +               goto leave;
> +
> +       error = kern_kevent_locked(td, epfp, nchanges, 0, &k_ops, NULL);
> +leave:
> +       fdrop(epfp, td);
> +       return (error);
> +}
> +
> +/*
> + * Wait for a filter to be triggered on the epoll file descriptor. */
> +int
> +linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
> +{
> +       struct file *epfp;
> +       struct timespec ts, *tsp;
> +       struct epoll_copyout_args coargs;
> +       struct kevent_copyops k_ops = { &coargs,
> +                                       epoll_kev_copyout,
> +                                       NULL};
> +       int error;
> +
> +       if (args->maxevents <= 0 || args->maxevents > LINUX_MAX_EVENTS)
> +               return (EINVAL);
> +
> +       epfp = epoll_fget(td, args->epfd);
> +
> +       coargs.leventlist = args->events;
> +       coargs.count = 0;
> +       coargs.error = 0;
> +#if defined(KTRACE) || EPOLL_WIDE_USER_DATA
> +       coargs.td = td;
> +#endif
> +#if EPOLL_WIDE_USER_DATA
> +       coargs.epfp = epfp;
> +#endif
> +
> +       if (args->timeout != -1) {
> +               if (args->timeout < 0) {
> +                       error = EINVAL;
> +                       goto leave;
> +               }
> +               /* Convert from milliseconds to timespec. */
> +               ts.tv_sec = args->timeout / 1000;
> +               ts.tv_nsec = (args->timeout % 1000) * 1000000;
> +               tsp = &ts;
> +       } else {
> +               tsp = NULL;
> +       }
> +
> +       error = kern_kevent_locked(td, epfp, 0, args->maxevents, &k_ops, tsp);
> +       if (!error && coargs.error)
> +               error = coargs.error;
> +
> +       /*
> +        * kern_kevent might return ENOMEM which is not expected from epoll_wait.
> +        * Maybe we should translate that but I don't think it matters at all.
> +        */
> +
> +       if (!error)
> +               td->td_retval[0] = coargs.count;
> +leave:
> +       fdrop(epfp, td);
> +       return (error);
> +}
> +
> +#if EPOLL_WIDE_USER_DATA
> +/*
> + * We store the user_data vector in fvn_epollpriv, a field of struct file
> + * that is unused for kqueue descriptors.
> + */
> +#define EPOLL_USER_DATA_GET(epfp) \
> +       ((struct epoll_user_data*)(epfp)->f_vnun.fvn_epollpriv)
> +#define EPOLL_USER_DATA_SET(epfp, udv) \
> +       (epfp)->f_vnun.fvn_epollpriv = (udv)
> +
> +static void
> +epoll_init_user_data(struct thread *td, struct file *epfp)
> +{
> +       struct epoll_user_data *udv;
> +
> +       /* override file ops to have our close operation */
> +       atomic_store_rel_ptr((volatile uintptr_t *)&epfp->f_ops, (uintptr_t)&epollops);
> +
> +       /* allocate epoll_user_data initially for up to 16 file descriptor values */
> +       udv = malloc(EPOLL_USER_DATA_SIZE(EPOLL_USER_DATA_MARGIN), M_LINUX_EPOLL, M_WAITOK);
> +       udv->sz = EPOLL_USER_DATA_MARGIN;
> +       EPOLL_USER_DATA_SET(epfp, udv);
> +}
> +
> +static void
> +epoll_set_user_data(struct thread *td, struct file *epfp, int fd, epoll_udata_t user_data)
> +{
> +       struct epoll_user_data *udv = EPOLL_USER_DATA_GET(epfp);
> +
> +       if (fd >= udv->sz) {
> +               udv = realloc(udv, EPOLL_USER_DATA_SIZE(fd + EPOLL_USER_DATA_MARGIN), M_LINUX_EPOLL, M_WAITOK);
> +               udv->sz = fd + EPOLL_USER_DATA_MARGIN;
> +               EPOLL_USER_DATA_SET(epfp, udv);
> +       }
> +       udv->data[fd] = user_data;
> +}
> +
> +static epoll_udata_t
> +epoll_get_user_data(struct thread *td, struct file *epfp, int fd)
> +{
> +       struct epoll_user_data *udv = EPOLL_USER_DATA_GET(epfp);
> +       if (fd >= udv->sz)
> +               panic("epoll: user data vector is too small");
> +
> +       return (udv->data[fd]);
> +}
> +
> +/*ARGSUSED*/
> +static int
> +epoll_close(struct file *epfp, struct thread *td)
> +{
> +       /* free user data vector */
> +       free(EPOLL_USER_DATA_GET(epfp), M_LINUX_EPOLL);
> +       /* over to kqueue parent */
> +       return (kqueue_close(epfp, td));
> +}
> +#endif
> +
> +static struct file*
> +epoll_fget(struct thread *td, int epfd)
> +{
> +       struct file *fp;
> +       cap_rights_t rights;
> +
> +       if (fget(td, epfd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp) != 0)
> +               panic("epoll: no file object found for kqueue descriptor");
> +
> +       return (fp);
> +}
> +
>
> Added: head/sys/compat/linux/linux_epoll.h
> ==============================================================================
> --- /dev/null   00:00:00 1970   (empty, because file is newly added)
> +++ head/sys/compat/linux/linux_epoll.h Wed Sep 18 17:56:04 2013        (r255672)
> @@ -0,0 +1,68 @@
> +/*-
> + * Copyright (c) 2007 Roman Divacky
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * $FreeBSD$
> + */
> +
> +#ifndef _LINUX_EPOLL_H_
> +#define        _LINUX_EPOLL_H_
> +
> +#ifdef __amd64__
> +#define        EPOLL_PACKED    __packed
> +#else
> +#define        EPOLL_PACKED
> +#endif
> +
> +struct linux_epoll_event {
> +       uint32_t        events;
> +       uint64_t        data;
> +} EPOLL_PACKED;
> +
> +#define        LINUX_EPOLLIN           0x001
> +#define        LINUX_EPOLLPRI          0x002
> +#define        LINUX_EPOLLOUT          0x004
> +#define        LINUX_EPOLLRDNORM       0x040
> +#define        LINUX_EPOLLRDBAND       0x080
> +#define        LINUX_EPOLLWRNORM       0x100
> +#define        LINUX_EPOLLWRBAND       0x200
> +#define        LINUX_EPOLLMSG          0x400
> +#define        LINUX_EPOLLERR          0x008
> +#define        LINUX_EPOLLHUP          0x010
> +#define        LINUX_EPOLLRDHUP        0x2000
> +#define        LINUX_EPOLLWAKEUP       1u<<29
> +#define        LINUX_EPOLLONESHOT      1u<<30
> +#define        LINUX_EPOLLET           1u<<31
> +
> +#define        LINUX_EPOLL_CTL_ADD     1
> +#define        LINUX_EPOLL_CTL_DEL     2
> +#define        LINUX_EPOLL_CTL_MOD     3
> +
> +#define        LINUX_EPOLL_CLOEXEC     02000000
> +#define        LINUX_EPOLL_NONBLOCK    00004000
> +
> +#define        LINUX_MAX_EVENTS        (INT_MAX / sizeof(struct linux_epoll_event))
> +
> +#endif /* !_LINUX_EPOLL_H_ */
> +
>
> Modified: head/sys/conf/files.amd64
> ==============================================================================
> --- head/sys/conf/files.amd64   Wed Sep 18 17:28:19 2013        (r255671)
> +++ head/sys/conf/files.amd64   Wed Sep 18 17:56:04 2013        (r255672)
> @@ -467,6 +467,7 @@ amd64/linux32/linux32_support.s     optional
>         dependency      "linux32_assym.h"
>  amd64/linux32/linux32_sysent.c optional        compat_linux32
>  amd64/linux32/linux32_sysvec.c optional        compat_linux32
> +compat/linux/linux_epoll.c     optional        compat_linux32
>  compat/linux/linux_emul.c      optional        compat_linux32
>  compat/linux/linux_file.c      optional        compat_linux32
>  compat/linux/linux_fork.c      optional        compat_linux32
>
> Modified: head/sys/conf/files.i386
> ==============================================================================
> --- head/sys/conf/files.i386    Wed Sep 18 17:28:19 2013        (r255671)
> +++ head/sys/conf/files.i386    Wed Sep 18 17:56:04 2013        (r255672)
> @@ -80,6 +80,7 @@ hptrr_lib.o                   optional        hptrr                   \
>  cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S       optional zfs compile-with "${ZFS_S}"
>  compat/linprocfs/linprocfs.c   optional linprocfs
>  compat/linsysfs/linsysfs.c     optional linsysfs
> +compat/linux/linux_epoll.c     optional compat_linux
>  compat/linux/linux_emul.c      optional compat_linux
>  compat/linux/linux_file.c      optional compat_linux
>  compat/linux/linux_fork.c      optional compat_linux
>
> Modified: head/sys/conf/files.pc98
> ==============================================================================
> --- head/sys/conf/files.pc98    Wed Sep 18 17:28:19 2013        (r255671)
> +++ head/sys/conf/files.pc98    Wed Sep 18 17:56:04 2013        (r255672)
> @@ -41,6 +41,7 @@ ukbdmap.h                     optional        ukbd_dflt_keymap        \
>  cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S       optional zfs compile-with "${ZFS_S}"
>  compat/linprocfs/linprocfs.c   optional linprocfs
>  compat/linsysfs/linsysfs.c     optional linsysfs
> +compat/linux/linux_epoll.c     optional compat_linux
>  compat/linux/linux_emul.c      optional compat_linux
>  compat/linux/linux_file.c      optional compat_linux
>  compat/linux/linux_fork.c      optional compat_linux
>
> Modified: head/sys/i386/linux/linux_dummy.c
> ==============================================================================
> --- head/sys/i386/linux/linux_dummy.c   Wed Sep 18 17:28:19 2013        (r255671)
> +++ head/sys/i386/linux/linux_dummy.c   Wed Sep 18 17:56:04 2013        (r255672)
> @@ -72,9 +72,6 @@ DUMMY(setfsgid);
>  DUMMY(pivot_root);
>  DUMMY(mincore);
>  DUMMY(lookup_dcookie);
> -DUMMY(epoll_create);
> -DUMMY(epoll_ctl);
> -DUMMY(epoll_wait);
>  DUMMY(remap_file_pages);
>  DUMMY(fstatfs64);
>  DUMMY(mbind);
> @@ -120,7 +117,6 @@ DUMMY(timerfd_gettime);
>  /* linux 2.6.27: */
>  DUMMY(signalfd4);
>  DUMMY(eventfd2);
> -DUMMY(epoll_create1);
>  DUMMY(dup3);
>  DUMMY(inotify_init1);
>  /* linux 2.6.30: */
>
> Modified: head/sys/i386/linux/syscalls.master
> ==============================================================================
> --- head/sys/i386/linux/syscalls.master Wed Sep 18 17:28:19 2013        (r255671)
> +++ head/sys/i386/linux/syscalls.master Wed Sep 18 17:56:04 2013        (r255672)
> @@ -432,9 +432,11 @@
>  251    AUE_NULL        UNIMPL
>  252    AUE_EXIT        STD     { int linux_exit_group(int error_code); }
>  253    AUE_NULL        STD     { int linux_lookup_dcookie(void); }
> -254    AUE_NULL        STD     { int linux_epoll_create(void); }
> -255    AUE_NULL        STD     { int linux_epoll_ctl(void); }
> -256    AUE_NULL        STD     { int linux_epoll_wait(void); }
> +254    AUE_NULL        STD     { int linux_epoll_create(l_int size); }
> +255    AUE_NULL        STD     { int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \
> +                                       struct linux_epoll_event *event); }
> +256    AUE_NULL        STD     { int linux_epoll_wait(l_int epfd, struct linux_epoll_event *events, \
> +                                       l_int maxevents, l_int timeout); }
>  257    AUE_NULL        STD     { int linux_remap_file_pages(void); }
>  258    AUE_NULL        STD     { int linux_set_tid_address(int *tidptr); }
>  259    AUE_NULL        STD     { int linux_timer_create(clockid_t clock_id, \
> @@ -544,7 +546,7 @@
>  ; linux 2.6.27:
>  327    AUE_NULL        STD     { int linux_signalfd4(void); }
>  328    AUE_NULL        STD     { int linux_eventfd2(void); }
> -329    AUE_NULL        STD     { int linux_epoll_create1(void); }
> +329    AUE_NULL        STD     { int linux_epoll_create1(l_int flags); }
>  330    AUE_NULL        STD     { int linux_dup3(void); }
>  331    AUE_NULL        STD     { int linux_pipe2(l_int *pipefds, l_int flags); }
>  332    AUE_NULL        STD     { int linux_inotify_init1(void); }
>
> Modified: head/sys/kern/kern_event.c
> ==============================================================================
> --- head/sys/kern/kern_event.c  Wed Sep 18 17:28:19 2013        (r255671)
> +++ head/sys/kern/kern_event.c  Wed Sep 18 17:56:04 2013        (r255672)
> @@ -107,16 +107,7 @@ static void        kqueue_wakeup(struct kqueue
>  static struct filterops *kqueue_fo_find(int filt);
>  static void    kqueue_fo_release(int filt);
>
> -static fo_rdwr_t       kqueue_read;
> -static fo_rdwr_t       kqueue_write;
> -static fo_truncate_t   kqueue_truncate;
> -static fo_ioctl_t      kqueue_ioctl;
> -static fo_poll_t       kqueue_poll;
> -static fo_kqfilter_t   kqueue_kqfilter;
> -static fo_stat_t       kqueue_stat;
> -static fo_close_t      kqueue_close;
> -
> -static struct fileops kqueueops = {
> +struct fileops kqueueops = {
>         .fo_read = kqueue_read,
>         .fo_write = kqueue_write,
>         .fo_truncate = kqueue_truncate,
> @@ -303,7 +294,7 @@ filt_fileattach(struct knote *kn)
>  }
>
>  /*ARGSUSED*/
> -static int
> +int
>  kqueue_kqfilter(struct file *fp, struct knote *kn)
>  {
>         struct kqueue *kq = kn->kn_fp->f_data;
> @@ -688,34 +679,7 @@ filt_usertouch(struct knote *kn, struct
>  int
>  sys_kqueue(struct thread *td, struct kqueue_args *uap)
>  {
> -       struct filedesc *fdp;
> -       struct kqueue *kq;
> -       struct file *fp;
> -       int fd, error;
> -
> -       fdp = td->td_proc->p_fd;
> -       error = falloc(td, &fp, &fd, 0);
> -       if (error)
> -               goto done2;
> -
> -       /* An extra reference on `fp' has been held for us by falloc(). */
> -       kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
> -       mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
> -       TAILQ_INIT(&kq->kq_head);
> -       kq->kq_fdp = fdp;
> -       knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
> -       TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
> -
> -       FILEDESC_XLOCK(fdp);
> -       TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
> -       FILEDESC_XUNLOCK(fdp);
> -
> -       finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
> -       fdrop(fp, td);
> -
> -       td->td_retval[0] = fd;
> -done2:
> -       return (error);
> +       return (kern_kqueue(td));
>  }
>
>  #ifndef _SYS_SYSPROTO_H_
> @@ -817,19 +781,75 @@ kevent_copyin(void *arg, struct kevent *
>  }
>
>  int
> +kern_kqueue(struct thread *td)
> +{
> +       struct file *fp;
> +       int error;
> +
> +       error = kern_kqueue_locked(td, &fp);
> +       if (error == 0)         /* fp is set (and referenced) only on success */
> +               fdrop(fp, td);
> +       return (error);
> +}
> +
> +int
> +kern_kqueue_locked(struct thread *td, struct file **fpp)
> +{
> +       struct filedesc *fdp;
> +       struct kqueue *kq;
> +       struct file *fp;
> +       int fd, error;
> +
> +       fdp = td->td_proc->p_fd;
> +       error = falloc(td, &fp, &fd, 0);
> +       if (error)
> +               return (error);
> +
> +       /* An extra reference on `fp' has been held for us by falloc(). */
> +       kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
> +       mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
> +       TAILQ_INIT(&kq->kq_head);
> +       kq->kq_fdp = fdp;
> +       knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
> +       TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
> +
> +       FILEDESC_XLOCK(fdp);
> +       TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
> +       FILEDESC_XUNLOCK(fdp);
> +
> +       finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
> +
> +       td->td_retval[0] = fd;
> +       *fpp = fp;
> +       return (0);
> +}
> +
> +int
>  kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
>      struct kevent_copyops *k_ops, const struct timespec *timeout)
>  {
> +       struct file *fp;
> +       cap_rights_t rights;
> +       int error;
> +
> +       if ((error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp)) != 0)
> +               return (error);
> +
> +       error = kern_kevent_locked(td, fp, nchanges, nevents, k_ops, timeout);
> +
> +       fdrop(fp, td);
> +       return (error);
> +}
> +
> +int
> +kern_kevent_locked(struct thread *td, struct file *fp, int nchanges, int nevents,
> +    struct kevent_copyops *k_ops, const struct timespec *timeout)
> +{
>         struct kevent keva[KQ_NEVENTS];
>         struct kevent *kevp, *changes;
>         struct kqueue *kq;
> -       struct file *fp;
> -       cap_rights_t rights;
>         int i, n, nerrors, error;
>
> -       error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp);
> -       if (error != 0)
> -               return (error);
>         if ((error = kqueue_acquire(fp, &kq)) != 0)
>                 goto done_norel;
>
> @@ -872,7 +892,6 @@ kern_kevent(struct thread *td, int fd, i
>  done:
>         kqueue_release(kq, 0);
>  done_norel:
> -       fdrop(fp, td);
>         return (error);
>  }
>
> @@ -1526,7 +1545,7 @@ done_nl:
>   * This could be expanded to call kqueue_scan, if desired.
>   */
>  /*ARGSUSED*/
> -static int
> +int
>  kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
>         int flags, struct thread *td)
>  {
> @@ -1534,7 +1553,7 @@ kqueue_read(struct file *fp, struct uio
>  }
>
>  /*ARGSUSED*/
> -static int
> +int
>  kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
>          int flags, struct thread *td)
>  {
> @@ -1542,7 +1561,7 @@ kqueue_write(struct file *fp, struct uio
>  }
>
>  /*ARGSUSED*/
> -static int
> +int
>  kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
>         struct thread *td)
>  {
> @@ -1551,7 +1570,7 @@ kqueue_truncate(struct file *fp, off_t l
>  }
>
>  /*ARGSUSED*/
> -static int
> +int
>  kqueue_ioctl(struct file *fp, u_long cmd, void *data,
>         struct ucred *active_cred, struct thread *td)
>  {
> @@ -1599,7 +1618,7 @@ kqueue_ioctl(struct file *fp, u_long cmd
>  }
>
>  /*ARGSUSED*/
> -static int
> +int
>  kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
>         struct thread *td)
>  {
> @@ -1626,7 +1645,7 @@ kqueue_poll(struct file *fp, int events,
>  }
>
>  /*ARGSUSED*/
> -static int
> +int
>  kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
>         struct thread *td)
>  {
> @@ -1644,7 +1663,7 @@ kqueue_stat(struct file *fp, struct stat
>  }
>
>  /*ARGSUSED*/
> -static int
> +int
>  kqueue_close(struct file *fp, struct thread *td)
>  {
>         struct kqueue *kq = fp->f_data;
>
> Modified: head/sys/modules/linux/Makefile
> ==============================================================================
> --- head/sys/modules/linux/Makefile     Wed Sep 18 17:28:19 2013        (r255671)
> +++ head/sys/modules/linux/Makefile     Wed Sep 18 17:56:04 2013        (r255672)
> @@ -9,7 +9,7 @@ CFLAGS+=-DCOMPAT_FREEBSD32 -DCOMPAT_LINU
>
>  KMOD=  linux
>  SRCS=  linux_fork.c linux${SFX}_dummy.c linux_emul.c linux_file.c \
> -       linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \
> +       linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c linux_epoll.c \
>         linux${SFX}_machdep.c linux_mib.c linux_misc.c linux_signal.c \
>         linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c \
>         linux${SFX}_sysvec.c linux_uid16.c linux_util.c linux_time.c \
>
> Modified: head/sys/sys/event.h
> ==============================================================================
> --- head/sys/sys/event.h        Wed Sep 18 17:28:19 2013        (r255671)
> +++ head/sys/sys/event.h        Wed Sep 18 17:56:04 2013        (r255672)
> @@ -236,6 +236,9 @@ struct proc;
>  struct knlist;
>  struct mtx;
>  struct rwlock;
> +struct uio;
> +struct stat;
> +struct ucred;
>
>  extern void    knote(struct knlist *list, long hint, int lockflags);
>  extern void    knote_fork(struct knlist *list, int pid);
> @@ -261,6 +264,21 @@ extern int         kqfd_register(int fd, struct
>  extern int     kqueue_add_filteropts(int filt, struct filterops *filtops);
>  extern int     kqueue_del_filteropts(int filt);
>
> +int kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
> +       int flags, struct thread *td);
> +int kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
> +        int flags, struct thread *td);
> +int kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
> +       struct thread *td);
>
> *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?CAJ-Vmo=EWwzeLrZzrEwdyj4RRcoJM5DQE98S=essx8wT9hk%2BOQ>