Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 18 Sep 2013 17:56:04 +0000 (UTC)
From:      Roman Divacky <rdivacky@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r255672 - in head/sys: amd64/linux32 compat/linux conf i386/linux kern modules/linux sys
Message-ID:  <201309181756.r8IHu4qV052882@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: rdivacky
Date: Wed Sep 18 17:56:04 2013
New Revision: 255672
URL: http://svnweb.freebsd.org/changeset/base/255672

Log:
  Implement epoll support in the Linuxulator. This is a tiny wrapper around
  kqueue that implements a subset of epoll functionality. The kqueue user data
  is only 32 bits wide on i386, which is not enough for epoll user data, so
  this patch overrides the kqueue fileops to keep enough space in struct file.
  
  Initial patch developed by me in 2007 and then extended and finished
  by Yuri Victorovich.
  
  Approved by:    re (delphij)
  Sponsored by:   Google Summer of Code
  Submitted by:   Yuri Victorovich <yuri at rawbw dot com>
  Tested by:      Yuri Victorovich <yuri at rawbw dot com>

Added:
  head/sys/compat/linux/linux_epoll.c   (contents, props changed)
  head/sys/compat/linux/linux_epoll.h   (contents, props changed)
Modified:
  head/sys/amd64/linux32/linux32_dummy.c
  head/sys/amd64/linux32/syscalls.master
  head/sys/conf/files.amd64
  head/sys/conf/files.i386
  head/sys/conf/files.pc98
  head/sys/i386/linux/linux_dummy.c
  head/sys/i386/linux/syscalls.master
  head/sys/kern/kern_event.c
  head/sys/modules/linux/Makefile
  head/sys/sys/event.h
  head/sys/sys/file.h
  head/sys/sys/syscallsubr.h

Modified: head/sys/amd64/linux32/linux32_dummy.c
==============================================================================
--- head/sys/amd64/linux32/linux32_dummy.c	Wed Sep 18 17:28:19 2013	(r255671)
+++ head/sys/amd64/linux32/linux32_dummy.c	Wed Sep 18 17:56:04 2013	(r255672)
@@ -70,9 +70,6 @@ DUMMY(pivot_root);
 DUMMY(mincore);
 DUMMY(ptrace);
 DUMMY(lookup_dcookie);
-DUMMY(epoll_create);
-DUMMY(epoll_ctl);
-DUMMY(epoll_wait);
 DUMMY(remap_file_pages);
 DUMMY(timer_create);
 DUMMY(timer_settime);
@@ -129,7 +126,6 @@ DUMMY(timerfd_gettime);
 /* linux 2.6.27: */
 DUMMY(signalfd4);
 DUMMY(eventfd2);
-DUMMY(epoll_create1);
 DUMMY(dup3);
 DUMMY(inotify_init1);
 /* linux 2.6.30: */

Modified: head/sys/amd64/linux32/syscalls.master
==============================================================================
--- head/sys/amd64/linux32/syscalls.master	Wed Sep 18 17:28:19 2013	(r255671)
+++ head/sys/amd64/linux32/syscalls.master	Wed Sep 18 17:56:04 2013	(r255672)
@@ -430,9 +430,11 @@
 251	AUE_NULL	UNIMPL
 252	AUE_EXIT	STD	{ int linux_exit_group(int error_code); }
 253	AUE_NULL	STD	{ int linux_lookup_dcookie(void); }
-254	AUE_NULL	STD	{ int linux_epoll_create(void); }
-255	AUE_NULL	STD	{ int linux_epoll_ctl(void); }
-256	AUE_NULL	STD	{ int linux_epoll_wait(void); }
+254	AUE_NULL	STD	{ int linux_epoll_create(l_int size); }
+255	AUE_NULL	STD	{ int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \
+					struct linux_epoll_event *event); }
+256	AUE_NULL	STD	{ int linux_epoll_wait(l_int epfd, struct linux_epoll_event *events, \
+					l_int maxevents, l_int timeout); }
 257	AUE_NULL	STD	{ int linux_remap_file_pages(void); }
 258	AUE_NULL	STD	{ int linux_set_tid_address(int *tidptr); }
 259	AUE_NULL	STD	{ int linux_timer_create(void); }
@@ -534,7 +536,7 @@
 ; linux 2.6.27:
 327	AUE_NULL	STD	{ int linux_signalfd4(void); }
 328	AUE_NULL	STD	{ int linux_eventfd2(void); }
-329	AUE_NULL	STD	{ int linux_epoll_create1(void); }
+329	AUE_NULL	STD	{ int linux_epoll_create1(l_int flags); }
 330	AUE_NULL	STD	{ int linux_dup3(void); }
 331	AUE_NULL	STD	{ int linux_pipe2(l_int *pipefds, l_int flags); }
 332	AUE_NULL	STD	{ int linux_inotify_init1(void); }

Added: head/sys/compat/linux/linux_epoll.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/compat/linux/linux_epoll.c	Wed Sep 18 17:56:04 2013	(r255672)
@@ -0,0 +1,554 @@
+/*-
+ * Copyright (c) 2007 Roman Divacky
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/limits.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/capability.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/errno.h>
+#include <sys/event.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/syscallsubr.h>
+#include <sys/timespec.h>
+#include <compat/linux/linux_epoll.h>
+#include <compat/linux/linux_util.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#ifdef COMPAT_LINUX32
+#include <machine/../linux32/linux.h>
+#include <machine/../linux32/linux32_proto.h>
+#else
+#include <machine/../linux/linux.h>
+#include <machine/../linux/linux_proto.h>
+#endif
+
+#define ktrepoll_events(evt, count) \
+	ktrstruct("linux_epoll_event", (evt), count * sizeof(*evt))
+
+/*
+ * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
+ * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
+ * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied
+ * data verbatim. Therefore on 32 bit architectures we allocate 64-bit memory
+ * block to pass user supplied data for every file descriptor.
+ */
+typedef	uint64_t	epoll_udata_t;
+#if defined(__i386__)
+#define EPOLL_WIDE_USER_DATA	1
+#else
+#define EPOLL_WIDE_USER_DATA	0
+#endif
+
+#if EPOLL_WIDE_USER_DATA
+
+/*
+ * Approach similar to epoll_user_data could also be used to
+ * keep track of event bits per file descriptor for all architectures.
+ * However, it isn't obvious that such tracking would be beneficial
+ * in practice.
+ */
+
+struct epoll_user_data {
+	unsigned	sz;
+	epoll_udata_t	data[1];
+};
+static MALLOC_DEFINE(M_LINUX_EPOLL, "epoll", "memory for epoll system");
+#define	EPOLL_USER_DATA_SIZE(ndata) \
+	(sizeof(struct epoll_user_data)+((ndata)-1)*sizeof(epoll_udata_t))
+#define	EPOLL_USER_DATA_MARGIN	16
+
+static void epoll_init_user_data(struct thread *td, struct file *epfp);
+static void epoll_set_user_data(struct thread *td, struct file *epfp, int fd, epoll_udata_t user_data);
+static epoll_udata_t epoll_get_user_data(struct thread *td, struct file *epfp, int fd);
+static fo_close_t epoll_close;
+
+/* overload kqueue fileops */
+static struct fileops epollops = {
+	.fo_read =	kqueue_read,
+	.fo_write =	kqueue_write,
+	.fo_truncate =	kqueue_truncate,
+	.fo_ioctl =	kqueue_ioctl,
+	.fo_poll =	kqueue_poll,
+	.fo_kqfilter =	kqueue_kqfilter,
+	.fo_stat =	kqueue_stat,
+	.fo_close =	epoll_close,
+	.fo_chmod =	invfo_chmod,
+	.fo_chown =	invfo_chown,
+	.fo_sendfile =	invfo_sendfile,
+};
+#endif
+
+static struct file* epoll_fget(struct thread *td, int epfd);
+
+struct epoll_copyin_args {
+	struct kevent	*changelist;
+};
+
+struct epoll_copyout_args {
+	struct linux_epoll_event	*leventlist;
+	int				count;
+	int				error;
+#if KTRACE || EPOLL_WIDE_USER_DATA
+	struct thread 			*td;
+#endif
+#if EPOLL_WIDE_USER_DATA
+	struct file			*epfp;
+#endif
+};
+
+
+/* Create a new epoll file descriptor. */
+
+static int
+linux_epoll_create_common(struct thread *td)
+{
+	struct file *fp;
+	int error;
+
+	error = kern_kqueue_locked(td, &fp);
+#if EPOLL_WIDE_USER_DATA
+	if (error == 0) {
+		epoll_init_user_data(td, fp);
+		fdrop(fp, td);
+	}
+#endif
+	return (error);
+}
+
+int
+linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
+{
+	if (args->size <= 0)
+		return (EINVAL);
+	/* args->size is unused. Linux just tests it
+	 * and then forgets it as well. */
+
+	return (linux_epoll_create_common(td));
+}
+
+int
+linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
+{
+	int error;
+
+	error = linux_epoll_create_common(td);
+
+	if (!error) {
+		if (args->flags & LINUX_EPOLL_CLOEXEC)
+			td->td_proc->p_fd->fd_ofiles[td->td_retval[0]].fde_flags |= UF_EXCLOSE;
+		if (args->flags & LINUX_EPOLL_NONBLOCK)
+			linux_msg(td, "epoll_create1 doesn't yet support EPOLL_NONBLOCK flag\n");
+	}
+
+	return (error);
+}
+
+/* Structure converting function from epoll to kevent. */
+static int
+linux_epoll_to_kevent(struct thread *td,
+#if EPOLL_WIDE_USER_DATA
+	struct file *epfp,
+#endif
+	int fd, struct linux_epoll_event *l_event, int kev_flags, struct kevent *kevent, int *nkevents)
+{
+	/* flags related to how event is registered */
+	if (l_event->events & LINUX_EPOLLONESHOT)
+		kev_flags |= EV_ONESHOT;
+	if (l_event->events & LINUX_EPOLLET) {
+		kev_flags |= EV_CLEAR;
+	}
+
+	/* flags related to what event is registered */
+	if (l_event->events & LINUX_EPOLLIN ||
+	    l_event->events & LINUX_EPOLLRDNORM ||
+	    l_event->events & LINUX_EPOLLPRI ||
+	    l_event->events & LINUX_EPOLLRDHUP) {
+		EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0,
+			(void*)(EPOLL_WIDE_USER_DATA ? 0 : l_event->data));
+		++*nkevents;
+	}
+	if (l_event->events & LINUX_EPOLLOUT ||
+	    l_event->events & LINUX_EPOLLWRNORM) {
+		EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0,
+			(void*)(EPOLL_WIDE_USER_DATA ? 0 : l_event->data));
+		++*nkevents;
+	}
+	if (l_event->events & LINUX_EPOLLRDBAND ||
+	    l_event->events & LINUX_EPOLLWRBAND ||
+	    l_event->events & LINUX_EPOLLHUP ||
+	    l_event->events & LINUX_EPOLLMSG ||
+	    l_event->events & LINUX_EPOLLWAKEUP ||
+	    l_event->events & LINUX_EPOLLERR) {
+		linux_msg(td, "epoll_ctl doesn't yet support some event flags supplied: 0x%x\n",
+			l_event->events);
+		return (EINVAL);
+	}
+
+#if EPOLL_WIDE_USER_DATA
+	epoll_set_user_data(td, epfp, fd, l_event->data);
+#endif
+	return (0);
+}
+
+/* 
+ * Structure converting function from kevent to epoll. In a case
+ * this is called on error in registration we store the error in
+ * event->data and pick it up later in linux_epoll_ctl().
+ */
+static void
+linux_kevent_to_epoll(
+#if EPOLL_WIDE_USER_DATA
+	struct thread *td, struct file *epfp,
+#endif
+	struct kevent *kevent, struct linux_epoll_event *l_event)
+{
+	if ((kevent->flags & EV_ERROR) == 0)
+		switch (kevent->filter) {
+		case EVFILT_READ:
+			l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI;
+		break;
+		case EVFILT_WRITE:
+			l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM;
+		break;
+		}
+#if EPOLL_WIDE_USER_DATA
+	l_event->data = epoll_get_user_data(td, epfp, kevent->ident);
+#else
+	l_event->data = (epoll_udata_t)kevent->udata;
+#endif
+}
+
+/* 
+ * Copyout callback used by kevent. This converts kevent
+ * events to epoll events and copies them back to the
+ * userspace. This is also called on error on registering
+ * of the filter.
+ */
+static int
+epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
+{
+	struct epoll_copyout_args *args;
+	struct linux_epoll_event *eep;
+	int error, i;
+
+	args = (struct epoll_copyout_args*) arg;
+	eep = malloc(sizeof(*eep) * count, M_TEMP, M_WAITOK | M_ZERO);
+
+	for (i = 0; i < count; i++)
+		linux_kevent_to_epoll(
+#if EPOLL_WIDE_USER_DATA
+			args->td, args->epfp,
+#endif
+			&kevp[i], &eep[i]);
+
+	error = copyout(eep, args->leventlist, count * sizeof(*eep));
+	if (!error) {
+		args->leventlist += count;
+		args->count += count;
+	} else if (!args->error)
+		args->error = error;
+
+#ifdef KTRACE
+	if (KTRPOINT(args->td, KTR_STRUCT))
+		ktrepoll_events(eep, count);
+#endif
+
+	free(eep, M_TEMP);
+	return (error);
+}
+
+/*
+ * Copyin callback used by kevent. This copies already
+ * converted filters from kernel memory to the kevent 
+ * internal kernel memory. Hence the memcpy instead of
+ * copyin.
+ */
+static int
+epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
+{
+	struct epoll_copyin_args *args;
+
+	args = (struct epoll_copyin_args*) arg;
+	
+	memcpy(kevp, args->changelist, count * sizeof(*kevp));
+	args->changelist += count;
+
+	return (0);
+}
+
+static int
+ignore_enoent(int error) {
+	if (error == ENOENT)
+		error = 0;
+	return (error);
+}
+
+static int
+delete_event(struct thread *td, struct file *epfp, int fd, int filter)
+{
+	struct epoll_copyin_args ciargs;
+	struct kevent kev;
+	struct kevent_copyops k_ops = { &ciargs,
+					NULL,
+					epoll_kev_copyin};
+	ciargs.changelist = &kev;
+
+	EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);
+	return (kern_kevent_locked(td, epfp, 1, 0, &k_ops, NULL));
+}
+
+static int
+delete_all_events(struct thread *td, struct file *epfp, int fd)
+{
+	/* here we ignore ENOENT, because we don't keep track of events here */
+	int error1, error2;
+
+	error1 = ignore_enoent(delete_event(td, epfp, fd, EVFILT_READ));
+	error2 = ignore_enoent(delete_event(td, epfp, fd, EVFILT_WRITE));
+
+	/* report any errors we got */
+	if (error1)
+		return (error1);
+	if (error2)
+		return (error2);
+	return (0);
+}
+
+/*
+ * Load epoll filter, convert it to kevent filter
+ * and load it into kevent subsystem.
+ */
+int
+linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
+{
+	struct file *epfp;
+	struct epoll_copyin_args ciargs;
+	struct kevent kev[2];
+	struct kevent_copyops k_ops = { &ciargs,
+					NULL,
+					epoll_kev_copyin};
+	struct linux_epoll_event le;
+	int kev_flags;
+	int nchanges = 0;
+	int error;
+
+	if (args->epfd == args->fd)
+		return (EINVAL);
+
+	if (args->op != LINUX_EPOLL_CTL_DEL) {
+		error = copyin(args->event, &le, sizeof(le));
+		if (error)
+			return (error);
+	}
+#ifdef DEBUG
+	if (ldebug(epoll_ctl))
+		printf(ARGS(epoll_ctl,"%i, %i, %i, %u"), args->epfd, args->op,
+			args->fd, le.events);
+#endif
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_STRUCT) && args->op != LINUX_EPOLL_CTL_DEL)
+		ktrepoll_events(&le, 1);
+#endif
+	epfp = epoll_fget(td, args->epfd);
+
+	ciargs.changelist = kev;
+
+	switch (args->op) {
+	case LINUX_EPOLL_CTL_MOD:
+			/* we don't memorize which events were set for this FD
+			   on this level, so just delete all we could have set:
+			   EVFILT_READ and EVFILT_WRITE, ignoring any errors
+			*/
+			error = delete_all_events(td, epfp, args->fd);
+			if (error)
+				goto leave;
+		/* FALLTHROUGH */
+	case LINUX_EPOLL_CTL_ADD:
+			kev_flags = EV_ADD | EV_ENABLE;
+		break;
+	case LINUX_EPOLL_CTL_DEL:
+			/* CTL_DEL means unregister this fd with this epoll */
+			error = delete_all_events(td, epfp, args->fd);
+		goto leave;
+	default:
+		error = EINVAL;
+		goto leave;
+	}
+
+	error = linux_epoll_to_kevent(td,
+#if EPOLL_WIDE_USER_DATA
+		epfp,
+#endif
+		args->fd, &le, kev_flags, kev, &nchanges);
+	if (error)
+		goto leave;
+
+	error = kern_kevent_locked(td, epfp, nchanges, 0, &k_ops, NULL);
+leave:
+	fdrop(epfp, td);
+	return (error);
+}
+
+/*
+ * Wait for a filter to be triggered on the epoll file descriptor. */
+int
+linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
+{
+	struct file *epfp;
+	struct timespec ts, *tsp;
+	struct epoll_copyout_args coargs;
+	struct kevent_copyops k_ops = { &coargs,
+					epoll_kev_copyout,
+					NULL};
+	int error;
+
+	if (args->maxevents <= 0 || args->maxevents > LINUX_MAX_EVENTS)
+		return (EINVAL);
+
+	epfp = epoll_fget(td, args->epfd);
+
+	coargs.leventlist = args->events;
+	coargs.count = 0;
+	coargs.error = 0;
+#if defined(KTRACE) || EPOLL_WIDE_USER_DATA
+	coargs.td = td;
+#endif
+#if EPOLL_WIDE_USER_DATA
+	coargs.epfp = epfp;
+#endif
+
+	if (args->timeout != -1) {
+		if (args->timeout < 0) {
+			error = EINVAL;
+			goto leave;
+		}
+		/* Convert from milliseconds to timespec. */
+		ts.tv_sec = args->timeout / 1000;
+		ts.tv_nsec = (args->timeout % 1000) * 1000000;
+		tsp = &ts;
+	} else {
+		tsp = NULL;
+	}
+
+	error = kern_kevent_locked(td, epfp, 0, args->maxevents, &k_ops, tsp);
+	if (!error && coargs.error)
+		error = coargs.error;
+
+	/* 
+	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
+	 * Maybe we should translate that but I don't think it matters at all.
+	 */
+
+	if (!error)
+		td->td_retval[0] = coargs.count;
+leave:
+	fdrop(epfp, td);
+	return (error);
+}
+
+#if EPOLL_WIDE_USER_DATA
+/*
+ * We store the user_data vector in fvn_epollpriv, a field of struct file
+ * that is unused for kqueue descriptors.
+ */
+#define EPOLL_USER_DATA_GET(epfp) \
+	((struct epoll_user_data*)(epfp)->f_vnun.fvn_epollpriv)
+#define EPOLL_USER_DATA_SET(epfp, udv) \
+	(epfp)->f_vnun.fvn_epollpriv = (udv)
+
+static void
+epoll_init_user_data(struct thread *td, struct file *epfp)
+{
+	struct epoll_user_data *udv;
+
+	/* override file ops to have our close operation */
+	atomic_store_rel_ptr((volatile uintptr_t *)&epfp->f_ops, (uintptr_t)&epollops);
+
+	/* allocate epoll_user_data initially for up to 16 file descriptor values */
+	udv = malloc(EPOLL_USER_DATA_SIZE(EPOLL_USER_DATA_MARGIN), M_LINUX_EPOLL, M_WAITOK);
+	udv->sz = EPOLL_USER_DATA_MARGIN;
+	EPOLL_USER_DATA_SET(epfp, udv);
+}
+
+static void
+epoll_set_user_data(struct thread *td, struct file *epfp, int fd, epoll_udata_t user_data)
+{
+	struct epoll_user_data *udv = EPOLL_USER_DATA_GET(epfp);
+
+	if (fd >= udv->sz) {
+		udv = realloc(udv, EPOLL_USER_DATA_SIZE(fd + EPOLL_USER_DATA_MARGIN), M_LINUX_EPOLL, M_WAITOK);
+		udv->sz = fd + EPOLL_USER_DATA_MARGIN;
+		EPOLL_USER_DATA_SET(epfp, udv);
+	}
+	udv->data[fd] = user_data;
+}
+
+static epoll_udata_t
+epoll_get_user_data(struct thread *td, struct file *epfp, int fd)
+{
+	struct epoll_user_data *udv = EPOLL_USER_DATA_GET(epfp);
+	if (fd >= udv->sz)
+		panic("epoll: user data vector is too small");
+
+	return (udv->data[fd]);
+}
+
+/*ARGSUSED*/
+static int
+epoll_close(struct file *epfp, struct thread *td)
+{
+	/* free user data vector */
+	free(EPOLL_USER_DATA_GET(epfp), M_LINUX_EPOLL);
+	/* over to kqueue parent */
+	return (kqueue_close(epfp, td));
+}
+#endif
+
+static struct file*
+epoll_fget(struct thread *td, int epfd)
+{
+	struct file *fp;
+	cap_rights_t rights;
+
+	if (fget(td, epfd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp) != 0)
+		panic("epoll: no file object found for kqueue descriptor");
+
+	return (fp);
+}
+

Added: head/sys/compat/linux/linux_epoll.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/compat/linux/linux_epoll.h	Wed Sep 18 17:56:04 2013	(r255672)
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2007 Roman Divacky
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LINUX_EPOLL_H_
+#define	_LINUX_EPOLL_H_
+
+#ifdef __amd64__
+#define	EPOLL_PACKED	__packed
+#else
+#define	EPOLL_PACKED
+#endif
+
+struct linux_epoll_event {
+	uint32_t	events;
+	uint64_t	data;	
+} EPOLL_PACKED;
+
+#define	LINUX_EPOLLIN		0x001
+#define	LINUX_EPOLLPRI		0x002
+#define	LINUX_EPOLLOUT		0x004
+#define	LINUX_EPOLLRDNORM	0x040
+#define	LINUX_EPOLLRDBAND	0x080
+#define	LINUX_EPOLLWRNORM	0x100
+#define	LINUX_EPOLLWRBAND	0x200
+#define	LINUX_EPOLLMSG		0x400
+#define	LINUX_EPOLLERR		0x008
+#define	LINUX_EPOLLHUP		0x010
+#define	LINUX_EPOLLRDHUP	0x2000
+#define	LINUX_EPOLLWAKEUP	1u<<29
+#define	LINUX_EPOLLONESHOT	1u<<30
+#define	LINUX_EPOLLET		1u<<31
+
+#define	LINUX_EPOLL_CTL_ADD	1
+#define	LINUX_EPOLL_CTL_DEL	2
+#define	LINUX_EPOLL_CTL_MOD	3
+
+#define	LINUX_EPOLL_CLOEXEC	02000000
+#define	LINUX_EPOLL_NONBLOCK	00004000
+
+#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct linux_epoll_event))
+
+#endif	/* !_LINUX_EPOLL_H_ */
+

Modified: head/sys/conf/files.amd64
==============================================================================
--- head/sys/conf/files.amd64	Wed Sep 18 17:28:19 2013	(r255671)
+++ head/sys/conf/files.amd64	Wed Sep 18 17:56:04 2013	(r255672)
@@ -467,6 +467,7 @@ amd64/linux32/linux32_support.s	optional
 	dependency 	"linux32_assym.h"
 amd64/linux32/linux32_sysent.c	optional	compat_linux32
 amd64/linux32/linux32_sysvec.c	optional	compat_linux32
+compat/linux/linux_epoll.c	optional	compat_linux32
 compat/linux/linux_emul.c	optional	compat_linux32
 compat/linux/linux_file.c	optional	compat_linux32
 compat/linux/linux_fork.c	optional	compat_linux32

Modified: head/sys/conf/files.i386
==============================================================================
--- head/sys/conf/files.i386	Wed Sep 18 17:28:19 2013	(r255671)
+++ head/sys/conf/files.i386	Wed Sep 18 17:56:04 2013	(r255672)
@@ -80,6 +80,7 @@ hptrr_lib.o			optional	hptrr			\
 cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S	optional zfs compile-with "${ZFS_S}"
 compat/linprocfs/linprocfs.c	optional linprocfs
 compat/linsysfs/linsysfs.c	optional linsysfs
+compat/linux/linux_epoll.c	optional compat_linux
 compat/linux/linux_emul.c	optional compat_linux
 compat/linux/linux_file.c	optional compat_linux
 compat/linux/linux_fork.c	optional compat_linux

Modified: head/sys/conf/files.pc98
==============================================================================
--- head/sys/conf/files.pc98	Wed Sep 18 17:28:19 2013	(r255671)
+++ head/sys/conf/files.pc98	Wed Sep 18 17:56:04 2013	(r255672)
@@ -41,6 +41,7 @@ ukbdmap.h			optional	ukbd_dflt_keymap	\
 cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S	optional zfs compile-with "${ZFS_S}"
 compat/linprocfs/linprocfs.c	optional linprocfs
 compat/linsysfs/linsysfs.c	optional linsysfs
+compat/linux/linux_epoll.c	optional compat_linux
 compat/linux/linux_emul.c	optional compat_linux
 compat/linux/linux_file.c	optional compat_linux
 compat/linux/linux_fork.c	optional compat_linux

Modified: head/sys/i386/linux/linux_dummy.c
==============================================================================
--- head/sys/i386/linux/linux_dummy.c	Wed Sep 18 17:28:19 2013	(r255671)
+++ head/sys/i386/linux/linux_dummy.c	Wed Sep 18 17:56:04 2013	(r255672)
@@ -72,9 +72,6 @@ DUMMY(setfsgid);
 DUMMY(pivot_root);
 DUMMY(mincore);
 DUMMY(lookup_dcookie);
-DUMMY(epoll_create);
-DUMMY(epoll_ctl);
-DUMMY(epoll_wait);
 DUMMY(remap_file_pages);
 DUMMY(fstatfs64);
 DUMMY(mbind);
@@ -120,7 +117,6 @@ DUMMY(timerfd_gettime);
 /* linux 2.6.27: */
 DUMMY(signalfd4);
 DUMMY(eventfd2);
-DUMMY(epoll_create1);
 DUMMY(dup3);
 DUMMY(inotify_init1);
 /* linux 2.6.30: */

Modified: head/sys/i386/linux/syscalls.master
==============================================================================
--- head/sys/i386/linux/syscalls.master	Wed Sep 18 17:28:19 2013	(r255671)
+++ head/sys/i386/linux/syscalls.master	Wed Sep 18 17:56:04 2013	(r255672)
@@ -432,9 +432,11 @@
 251	AUE_NULL	UNIMPL
 252	AUE_EXIT	STD	{ int linux_exit_group(int error_code); }
 253	AUE_NULL	STD	{ int linux_lookup_dcookie(void); }
-254	AUE_NULL	STD	{ int linux_epoll_create(void); }
-255	AUE_NULL	STD	{ int linux_epoll_ctl(void); }
-256	AUE_NULL	STD	{ int linux_epoll_wait(void); }
+254	AUE_NULL	STD	{ int linux_epoll_create(l_int size); }
+255	AUE_NULL	STD	{ int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \
+					struct linux_epoll_event *event); }
+256	AUE_NULL	STD	{ int linux_epoll_wait(l_int epfd, struct linux_epoll_event *events, \
+					l_int maxevents, l_int timeout); }
 257	AUE_NULL	STD	{ int linux_remap_file_pages(void); }
 258	AUE_NULL	STD	{ int linux_set_tid_address(int *tidptr); }
 259	AUE_NULL	STD	{ int linux_timer_create(clockid_t clock_id, \
@@ -544,7 +546,7 @@
 ; linux 2.6.27:
 327	AUE_NULL	STD	{ int linux_signalfd4(void); }
 328	AUE_NULL	STD	{ int linux_eventfd2(void); }
-329	AUE_NULL	STD	{ int linux_epoll_create1(void); }
+329	AUE_NULL	STD	{ int linux_epoll_create1(l_int flags); }
 330	AUE_NULL	STD	{ int linux_dup3(void); }
 331	AUE_NULL	STD	{ int linux_pipe2(l_int *pipefds, l_int flags); }
 332	AUE_NULL	STD	{ int linux_inotify_init1(void); }

Modified: head/sys/kern/kern_event.c
==============================================================================
--- head/sys/kern/kern_event.c	Wed Sep 18 17:28:19 2013	(r255671)
+++ head/sys/kern/kern_event.c	Wed Sep 18 17:56:04 2013	(r255672)
@@ -107,16 +107,7 @@ static void 	kqueue_wakeup(struct kqueue
 static struct filterops *kqueue_fo_find(int filt);
 static void	kqueue_fo_release(int filt);
 
-static fo_rdwr_t	kqueue_read;
-static fo_rdwr_t	kqueue_write;
-static fo_truncate_t	kqueue_truncate;
-static fo_ioctl_t	kqueue_ioctl;
-static fo_poll_t	kqueue_poll;
-static fo_kqfilter_t	kqueue_kqfilter;
-static fo_stat_t	kqueue_stat;
-static fo_close_t	kqueue_close;
-
-static struct fileops kqueueops = {
+struct fileops kqueueops = {
 	.fo_read = kqueue_read,
 	.fo_write = kqueue_write,
 	.fo_truncate = kqueue_truncate,
@@ -303,7 +294,7 @@ filt_fileattach(struct knote *kn)
 }
 
 /*ARGSUSED*/
-static int
+int
 kqueue_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
@@ -688,34 +679,7 @@ filt_usertouch(struct knote *kn, struct 
 int
 sys_kqueue(struct thread *td, struct kqueue_args *uap)
 {
-	struct filedesc *fdp;
-	struct kqueue *kq;
-	struct file *fp;
-	int fd, error;
-
-	fdp = td->td_proc->p_fd;
-	error = falloc(td, &fp, &fd, 0);
-	if (error)
-		goto done2;
-
-	/* An extra reference on `fp' has been held for us by falloc(). */
-	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
-	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
-	TAILQ_INIT(&kq->kq_head);
-	kq->kq_fdp = fdp;
-	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
-	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
-
-	FILEDESC_XLOCK(fdp);
-	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
-	FILEDESC_XUNLOCK(fdp);
-
-	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
-	fdrop(fp, td);
-
-	td->td_retval[0] = fd;
-done2:
-	return (error);
+	return (kern_kqueue(td));
 }
 
 #ifndef _SYS_SYSPROTO_H_
@@ -817,19 +781,75 @@ kevent_copyin(void *arg, struct kevent *
 }
 
 int
+kern_kqueue(struct thread *td)
+{
+	struct file *fp;
+	int error;
+
+	error = kern_kqueue_locked(td, &fp);
+
+	fdrop(fp, td);
+	return (error);
+}
+
+int
+kern_kqueue_locked(struct thread *td, struct file **fpp)
+{
+	struct filedesc *fdp;
+	struct kqueue *kq;
+	struct file *fp;
+	int fd, error;
+
+	fdp = td->td_proc->p_fd;
+	error = falloc(td, &fp, &fd, 0);
+	if (error)
+		return (error);
+
+	/* An extra reference on `fp' has been held for us by falloc(). */
+	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
+	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
+	TAILQ_INIT(&kq->kq_head);
+	kq->kq_fdp = fdp;
+	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
+	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
+
+	FILEDESC_XLOCK(fdp);
+	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
+	FILEDESC_XUNLOCK(fdp);
+
+	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
+
+	td->td_retval[0] = fd;
+	*fpp = fp;
+	return (0);
+}
+
+int
 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
+	struct file *fp;
+	cap_rights_t rights;
+	int error;
+
+	if ((error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp)) != 0)
+		return (error);
+
+	error = kern_kevent_locked(td, fp, nchanges, nevents, k_ops, timeout);
+
+	fdrop(fp, td);
+	return (error);
+}
+
+int
+kern_kevent_locked(struct thread *td, struct file *fp, int nchanges, int nevents,
+    struct kevent_copyops *k_ops, const struct timespec *timeout)
+{
 	struct kevent keva[KQ_NEVENTS];
 	struct kevent *kevp, *changes;
 	struct kqueue *kq;
-	struct file *fp;
-	cap_rights_t rights;
 	int i, n, nerrors, error;
 
-	error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp);
-	if (error != 0)
-		return (error);
 	if ((error = kqueue_acquire(fp, &kq)) != 0)
 		goto done_norel;
 
@@ -872,7 +892,6 @@ kern_kevent(struct thread *td, int fd, i
 done:
 	kqueue_release(kq, 0);
 done_norel:
-	fdrop(fp, td);
 	return (error);
 }
 
@@ -1526,7 +1545,7 @@ done_nl:
  * This could be expanded to call kqueue_scan, if desired.
  */
 /*ARGSUSED*/
-static int
+int
 kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
 	int flags, struct thread *td)
 {
@@ -1534,7 +1553,7 @@ kqueue_read(struct file *fp, struct uio 
 }
 
 /*ARGSUSED*/
-static int
+int
 kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
 	 int flags, struct thread *td)
 {
@@ -1542,7 +1561,7 @@ kqueue_write(struct file *fp, struct uio
 }
 
 /*ARGSUSED*/
-static int
+int
 kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
 	struct thread *td)
 {
@@ -1551,7 +1570,7 @@ kqueue_truncate(struct file *fp, off_t l
 }
 
 /*ARGSUSED*/
-static int
+int
 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
 	struct ucred *active_cred, struct thread *td)
 {
@@ -1599,7 +1618,7 @@ kqueue_ioctl(struct file *fp, u_long cmd
 }
 
 /*ARGSUSED*/
-static int
+int
 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
 	struct thread *td)
 {
@@ -1626,7 +1645,7 @@ kqueue_poll(struct file *fp, int events,
 }
 
 /*ARGSUSED*/
-static int
+int
 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
 	struct thread *td)
 {
@@ -1644,7 +1663,7 @@ kqueue_stat(struct file *fp, struct stat
 }
 
 /*ARGSUSED*/
-static int
+int
 kqueue_close(struct file *fp, struct thread *td)
 {
 	struct kqueue *kq = fp->f_data;

Modified: head/sys/modules/linux/Makefile
==============================================================================
--- head/sys/modules/linux/Makefile	Wed Sep 18 17:28:19 2013	(r255671)
+++ head/sys/modules/linux/Makefile	Wed Sep 18 17:56:04 2013	(r255672)
@@ -9,7 +9,7 @@ CFLAGS+=-DCOMPAT_FREEBSD32 -DCOMPAT_LINU
 
 KMOD=	linux
 SRCS=	linux_fork.c linux${SFX}_dummy.c linux_emul.c linux_file.c \
-	linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \
+	linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c linux_epoll.c \
 	linux${SFX}_machdep.c linux_mib.c linux_misc.c linux_signal.c \
 	linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c \
 	linux${SFX}_sysvec.c linux_uid16.c linux_util.c linux_time.c \

Modified: head/sys/sys/event.h
==============================================================================
--- head/sys/sys/event.h	Wed Sep 18 17:28:19 2013	(r255671)
+++ head/sys/sys/event.h	Wed Sep 18 17:56:04 2013	(r255672)
@@ -236,6 +236,9 @@ struct proc;
 struct knlist;
 struct mtx;
 struct rwlock;
+struct uio;
+struct stat;
+struct ucred;
 
 extern void	knote(struct knlist *list, long hint, int lockflags);
 extern void	knote_fork(struct knlist *list, int pid);
@@ -261,6 +264,21 @@ extern int 	kqfd_register(int fd, struct
 extern int	kqueue_add_filteropts(int filt, struct filterops *filtops);
 extern int	kqueue_del_filteropts(int filt);
 
+int kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+	int flags, struct thread *td);
+int kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+	 int flags, struct thread *td);
+int kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+	struct thread *td);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201309181756.r8IHu4qV052882>