| /* Copyright (c) 2015 Google Inc. | 
 |  * Barret Rhoden <brho@cs.berkeley.edu> | 
 |  * See LICENSE for details. | 
 |  * | 
 |  * Epoll, built on FD taps, CEQs, and blocking uthreads on event queues. | 
 |  * | 
 |  * TODO: There are a few incompatibilities with Linux's epoll; some are | 
 |  * artifacts of this implementation, others are deeper issues: | 
 |  * - You can't epoll on an epoll FD (or any user FD); you can only epoll on a | 
 |  * kernel FD that accepts your FD taps. | 
 |  * - There's no EPOLLONESHOT or level-triggered support. | 
 |  * - You can only tap one FD at a time, so you can't add the same FD to | 
 |  * multiple epoll sets. | 
 |  * - Closing the epoll FD is a little dangerous if there are outstanding INDIR | 
 |  * events.  This will only pop up if you're yielding cores, maybe getting | 
 |  * preempted, and are unlucky. | 
 |  * - epoll_create1 does not support CLOEXEC.  That'd need some work in glibc's | 
 |  * exec and flags in struct user_fd. | 
 |  * - EPOLL_CTL_MOD is just a DEL then an ADD.  There might be races associated | 
 |  * with that. | 
 |  * - epoll_pwait is probably racy. | 
 |  * - You can't dup an epoll fd (same as other user FDs). | 
 |  * - If you add a BSD socket FD to an epoll set, you'll get taps on both the | 
 |  * data FD and the listen FD. | 
 |  * - If you add the same BSD socket listener to multiple epoll sets, you will | 
 |  * likely fail.  This is in addition to being able to tap only one FD at a | 
 |  * time. | 
 |  * */ | 
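 |  | 
 | /* Illustrative usage sketch: the calls below behave like Linux's epoll, with | 
 |  * the restrictions noted above (notably, EPOLLET is required and the target | 
 |  * must be a kernel FD that accepts taps).  'data_fd' is a hypothetical FD. | 
 |  * | 
 |  *	struct epoll_event ev = { .events = EPOLLIN | EPOLLET, | 
 |  *	                          .data = { .fd = data_fd } }; | 
 |  *	struct epoll_event out[8]; | 
 |  *	int epfd = epoll_create1(0); | 
 |  * | 
 |  *	epoll_ctl(epfd, EPOLL_CTL_ADD, data_fd, &ev); | 
 |  *	int nr = epoll_wait(epfd, out, 8, -1); | 
 |  */ | 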
 |  | 
 | #include <sys/epoll.h> | 
 | #include <parlib/parlib.h> | 
 | #include <parlib/event.h> | 
 | #include <parlib/ceq.h> | 
 | #include <parlib/uthread.h> | 
 | #include <parlib/timing.h> | 
 | #include <parlib/slab.h> | 
 | #include <parlib/assert.h> | 
 | #include <sys/user_fd.h> | 
 | #include <sys/close_cb.h> | 
 | #include <stdio.h> | 
 | #include <errno.h> | 
 | #include <unistd.h> | 
 | #include <malloc.h> | 
 | #include <sys/queue.h> | 
 | #include <sys/plan9_helpers.h> | 
 | #include <ros/fs.h> | 
 |  | 
 | /* Sanity check, so we can ID our own FDs */ | 
 | #define EPOLL_UFD_MAGIC 		0xe9011 | 
 |  | 
 | /* Each waiter that uses a timeout will have its own structure for dealing with | 
 |  * its timeout. | 
 |  * | 
 |  * TODO: (RCU/SLAB) it's not safe to reap the objects, until we sort out | 
 |  * INDIRs and RCU-style grace periods.  Not a big deal, since the number of | 
 |  * these is the number of threads that concurrently do epoll timeouts. */ | 
 | struct ep_alarm { | 
 | 	struct event_queue		*alarm_evq; | 
 | 	struct syscall			sysc; | 
 | }; | 
 |  | 
 | static struct kmem_cache *ep_alarms_cache; | 
 |  | 
 | struct epoll_ctlr { | 
 | 	TAILQ_ENTRY(epoll_ctlr)		link; | 
 | 	struct event_queue		*ceq_evq; | 
 | 	uth_mutex_t			*mtx; | 
 | 	struct user_fd			ufd; | 
 | }; | 
 |  | 
 | TAILQ_HEAD(epoll_ctlrs, epoll_ctlr); | 
 | static struct epoll_ctlrs all_ctlrs = TAILQ_HEAD_INITIALIZER(all_ctlrs); | 
 | static uth_mutex_t *ctlrs_mtx; | 
 |  | 
 | /* There's some bookkeeping we need to maintain on every FD.  Right now, the FD | 
 |  * is the index into the CEQ event array, so we can just hook this into the user | 
 |  * data blob in the ceq_event. | 
 |  * | 
 |  * If we ever do not maintain a 1:1 mapping from FDs to CEQ IDs, we can use this | 
 |  * to track the CEQ ID and FD. */ | 
 | struct ep_fd_data { | 
 | 	struct epoll_event		ep_event; | 
 | 	int				fd; | 
 | 	int				filter; | 
 | }; | 
 |  | 
 | /* Converts epoll events to FD taps. */ | 
 | static int ep_events_to_taps(uint32_t ep_ev) | 
 | { | 
 | 	int taps = 0; | 
 |  | 
 | 	if (ep_ev & EPOLLIN) | 
 | 		taps |= FDTAP_FILT_READABLE; | 
 | 	if (ep_ev & EPOLLOUT) | 
 | 		taps |= FDTAP_FILT_WRITABLE; | 
 | 	if (ep_ev & EPOLLRDHUP) | 
 | 		taps |= FDTAP_FILT_RDHUP; | 
 | 	if (ep_ev & EPOLLPRI) | 
 | 		taps |= FDTAP_FILT_PRIORITY; | 
 | 	if (ep_ev & EPOLLERR) | 
 | 		taps |= FDTAP_FILT_ERROR; | 
 | 	if (ep_ev & EPOLLHUP) | 
 | 		taps |= FDTAP_FILT_HANGUP; | 
 | 	return taps; | 
 | } | 
 |  | 
 | /* Converts FD taps to the corresponding epoll events.  There are other taps | 
 |  * that do not make sense for epoll. */ | 
 | static uint32_t taps_to_ep_events(int taps) | 
 | { | 
 | 	uint32_t ep_ev = 0; | 
 |  | 
 | 	if (taps & FDTAP_FILT_READABLE) | 
 | 		ep_ev |= EPOLLIN; | 
 | 	if (taps & FDTAP_FILT_WRITABLE) | 
 | 		ep_ev |= EPOLLOUT; | 
 | 	if (taps & FDTAP_FILT_RDHUP) | 
 | 		ep_ev |= EPOLLRDHUP; | 
 | 	if (taps & FDTAP_FILT_PRIORITY) | 
 | 		ep_ev |= EPOLLPRI; | 
 | 	if (taps & FDTAP_FILT_ERROR) | 
 | 		ep_ev |= EPOLLERR; | 
 | 	if (taps & FDTAP_FILT_HANGUP) | 
 | 		ep_ev |= EPOLLHUP; | 
 | 	return ep_ev; | 
 | } | 
 |  | 
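 | /* Helper: largest CEQ event ID (== FD, given our 1:1 mapping) this epoll set | 
 |  * has ever seen; used as a scan bound when tearing the set down. */ | 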
 | static unsigned int ep_get_ceq_max_ever(struct epoll_ctlr *ep) | 
 | { | 
 | 	return atomic_read(&ep->ceq_evq->ev_mbox->ceq.max_event_ever); | 
 | } | 
 |  | 
 | static struct ceq_event *ep_get_ceq_ev(struct epoll_ctlr *ep, size_t idx) | 
 | { | 
 | 	if (ep->ceq_evq->ev_mbox->ceq.nr_events <= idx) | 
 | 		return 0; | 
 | 	return &ep->ceq_evq->ev_mbox->ceq.events[idx]; | 
 | } | 
 |  | 
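 | /* Returns the epoll_ctlr backing fd, or 0 if fd is not one of our epoll user | 
 |  * FDs. */ | 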
 | static struct epoll_ctlr *fd_to_cltr(int fd) | 
 | { | 
 | 	struct user_fd *ufd = ufd_lookup(fd); | 
 |  | 
 | 	if (!ufd) | 
 | 		return 0; | 
 | 	if (ufd->magic != EPOLL_UFD_MAGIC) { | 
 | 		errno = EBADF; | 
 | 		return 0; | 
 | 	} | 
 | 	return container_of(ufd, struct epoll_ctlr, ufd); | 
 | } | 
 |  | 
 | /* Event queue helpers: */ | 
 | static struct event_queue *ep_get_ceq_evq(unsigned int ceq_ring_sz) | 
 | { | 
 | 	struct event_queue *ceq_evq = get_eventq_raw(); | 
 |  | 
 | 	ceq_evq->ev_mbox->type = EV_MBOX_CEQ; | 
 | 	ceq_init(&ceq_evq->ev_mbox->ceq, CEQ_OR, NR_FILE_DESC_MAX, ceq_ring_sz); | 
 | 	ceq_evq->ev_flags = EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP; | 
 | 	evq_attach_wakeup_ctlr(ceq_evq); | 
 | 	return ceq_evq; | 
 | } | 
 |  | 
 | static struct event_queue *ep_get_alarm_evq(void) | 
 | { | 
 | 	/* Don't care about the actual message, just using it for a wakeup */ | 
 | 	struct event_queue *alarm_evq = get_eventq(EV_MBOX_BITMAP); | 
 |  | 
 | 	alarm_evq->ev_flags = EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP; | 
 | 	evq_attach_wakeup_ctlr(alarm_evq); | 
 | 	return alarm_evq; | 
 | } | 
 |  | 
 | /* Once we've closed our sources of events, we can try to clean up the event | 
 |  * queues.  These are actually dangerous, since there could be INDIRs floating | 
 |  * around for these evqs still, which are basically pointers.  We'll need to run | 
 |  * some sort of user deferred destruction. (TODO). */ | 
 | static void ep_put_ceq_evq(struct event_queue *ceq_evq) | 
 | { | 
 | #if 0 /* TODO: EVQ/INDIR Cleanup */ | 
 | 	ceq_cleanup(&ceq_evq->ev_mbox->ceq); | 
 | 	evq_remove_wakeup_ctlr(ceq_evq); | 
 | 	put_eventq_raw(ceq_evq); | 
 | #endif | 
 | } | 
 |  | 
 | static void ep_put_alarm_evq(struct event_queue *alarm_evq) | 
 | { | 
 | #if 0 /* TODO: EVQ/INDIR Cleanup */ | 
 | 	evq_remove_wakeup_ctlr(alarm_evq); | 
 | 	put_eventq(alarm_evq); | 
 | #endif | 
 | } | 
 |  | 
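 | /* user_fd close callback: removes the taps for every FD still in the set, | 
 |  * frees the per-FD bookkeeping, and unlinks and frees the controller. */ | 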
 | static void epoll_close(struct user_fd *ufd) | 
 | { | 
 | 	struct epoll_ctlr *ep = container_of(ufd, struct epoll_ctlr, ufd); | 
 | 	struct fd_tap_req *tap_reqs, *tap_req_i; | 
 | 	struct ceq_event *ceq_ev_i; | 
 | 	struct ep_fd_data *ep_fd_i; | 
 | 	int nr_tap_req = 0; | 
 | 	int nr_done = 0; | 
 | 	unsigned int max_ceq_events = ep_get_ceq_max_ever(ep); | 
 |  | 
 | 	tap_reqs = malloc(sizeof(struct fd_tap_req) * max_ceq_events); | 
 | 	memset(tap_reqs, 0, sizeof(struct fd_tap_req) * max_ceq_events); | 
 | 	/* Slightly painful, O(n) with no escape hatch */ | 
 | 	for (int i = 0; i < max_ceq_events; i++) { | 
 | 		ceq_ev_i = ep_get_ceq_ev(ep, i); | 
 | 		/* CEQ should have been big enough for our size */ | 
 | 		assert(ceq_ev_i); | 
 | 		ep_fd_i = (struct ep_fd_data*)ceq_ev_i->user_data; | 
 | 		if (!ep_fd_i) | 
 | 			continue; | 
 | 		tap_req_i = &tap_reqs[nr_tap_req++]; | 
 | 		tap_req_i->fd = i; | 
 | 		tap_req_i->cmd = FDTAP_CMD_REM; | 
 | 		free(ep_fd_i); | 
 | 	} | 
 | 	/* Requests could fail if the tapped files are already closed.  We need | 
 | 	 * to skip the failed one (the +1) and untap the rest. */ | 
 | 	do { | 
 | 		nr_done += sys_tap_fds(tap_reqs + nr_done, | 
 | 				       nr_tap_req - nr_done); | 
 | 		nr_done += 1;	/* nr_done could be more than nr_tap_req now */ | 
 | 	} while (nr_done < nr_tap_req); | 
 | 	free(tap_reqs); | 
 | 	ep_put_ceq_evq(ep->ceq_evq); | 
 | 	uth_mutex_lock(ctlrs_mtx); | 
 | 	TAILQ_REMOVE(&all_ctlrs, ep, link); | 
 | 	uth_mutex_unlock(ctlrs_mtx); | 
 | 	uth_mutex_free(ep->mtx); | 
 | 	free(ep); | 
 | } | 
 |  | 
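 | /* Initializes a zeroed epoll_ctlr: its mutex, its user_fd identity, and its | 
 |  * CEQ event queue.  The size arg is only a hint; a size of 1 (what | 
 |  * epoll_create1() passes) is bumped to a larger default. */ | 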
 | static int init_ep_ctlr(struct epoll_ctlr *ep, int size) | 
 | { | 
 | 	if (size == 1) | 
 | 		size = 128; | 
 | 	ep->mtx = uth_mutex_alloc(); | 
 | 	ep->ufd.magic = EPOLL_UFD_MAGIC; | 
 | 	ep->ufd.close = epoll_close; | 
 | 	/* Size is only a hint, used to size the CEQ's ring.  The CEQ itself | 
 | 	 * can handle as many kernel FDs as the system allows. */ | 
 | 	ep->ceq_evq = ep_get_ceq_evq(ROUNDUPPWR2(size)); | 
 | 	return 0; | 
 | } | 
 |  | 
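 | /* close_cb: invoked whenever any FD is closed, so that the FD is dropped from | 
 |  * every epoll set and its per-FD state is freed. */ | 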
 | static void epoll_fd_closed(int fd) | 
 | { | 
 | 	struct epoll_ctlr *ep; | 
 |  | 
 | 	/* Lockless peek, avoid locking for every close() */ | 
 | 	if (TAILQ_EMPTY(&all_ctlrs)) | 
 | 		return; | 
 | 	uth_mutex_lock(ctlrs_mtx); | 
 | 	TAILQ_FOREACH(ep, &all_ctlrs, link) | 
 | 		epoll_ctl(ep->ufd.fd, EPOLL_CTL_DEL, fd, 0); | 
 | 	uth_mutex_unlock(ctlrs_mtx); | 
 | } | 
 |  | 
 | static int ep_alarm_ctor(void *obj, void *priv, int flags) | 
 | { | 
 | 	struct ep_alarm *ep_a = (struct ep_alarm*)obj; | 
 |  | 
 | 	ep_a->alarm_evq = ep_get_alarm_evq(); | 
 | 	return 0; | 
 | } | 
 |  | 
 | static void ep_alarm_dtor(void *obj, void *priv) | 
 | { | 
 | 	struct ep_alarm *ep_a = (struct ep_alarm*)obj; | 
 |  | 
 | 	/* TODO: (RCU/SLAB).  Somehow the slab allocator is trying to reap our | 
 | 	 * objects.  Note that when we update userspace to use magazines, the | 
 | 	 * dtor will fire earlier (when the object is given to the slab layer). | 
 | 	 * We'll need to be careful about the final freeing of the ev_q. */ | 
 | 	panic("Epoll alarms should never be destroyed!"); | 
 | 	ep_put_alarm_evq(ep_a->alarm_evq); | 
 | } | 
 |  | 
 | static void epoll_init(void *arg) | 
 | { | 
 | 	static struct close_cb epoll_close_cb = {.func = epoll_fd_closed}; | 
 |  | 
 | 	register_close_cb(&epoll_close_cb); | 
 | 	ctlrs_mtx = uth_mutex_alloc(); | 
 | 	ep_alarms_cache = kmem_cache_create("epoll alarms", | 
 | 	                                    sizeof(struct ep_alarm), | 
 | 	                                    __alignof__(struct ep_alarm), 0, | 
 | 	                                    ep_alarm_ctor, ep_alarm_dtor, NULL); | 
 | 	assert(ep_alarms_cache); | 
 | } | 
 |  | 
 | int epoll_create(int size) | 
 | { | 
 | 	int fd; | 
 | 	struct epoll_ctlr *ep; | 
 | 	static parlib_once_t once = PARLIB_ONCE_INIT; | 
 |  | 
 | 	parlib_run_once(&once, epoll_init, NULL); | 
 | 	/* The size arg is just a hint, but Linux requires it to be positive */ | 
 | 	if (size <= 0) { | 
 | 		errno = EINVAL; | 
 | 		return -1; | 
 | 	} | 
 | 	ep = malloc(sizeof(struct epoll_ctlr)); | 
 | 	memset(ep, 0, sizeof(struct epoll_ctlr)); | 
 | 	if (init_ep_ctlr(ep, size)) { | 
 | 		free(ep); | 
 | 		return -1; | 
 | 	} | 
 | 	fd = ufd_get_fd(&ep->ufd); | 
 | 	if (fd < 0) { | 
 | 		free(ep); | 
 | 		return fd; | 
 | 	} | 
 | 	uth_mutex_lock(ctlrs_mtx); | 
 | 	TAILQ_INSERT_TAIL(&all_ctlrs, ep, link); | 
 | 	uth_mutex_unlock(ctlrs_mtx); | 
 | 	return fd; | 
 | } | 
 |  | 
 | int epoll_create1(int flags) | 
 | { | 
 | 	/* TODO: we're supposed to support CLOEXEC.  Our FD is a user_fd, so | 
 | 	 * that'd require some support in glibc's exec to close our epoll ctlr. | 
 | 	 * */ | 
 | 	return epoll_create(1); | 
 | } | 
 |  | 
 | /* Linux's epoll will check for events, even if edge-triggered, during | 
 |  * additions (and probably modifications) to the epoll set.  It's a questionable | 
 |  * policy, since it can hide user bugs. | 
 |  * | 
 |  * We can do the same, though only for EPOLLIN and EPOLLOUT for FDs that can | 
 |  * report their status via stat.  (same as select()). | 
 |  * | 
 |  * Note that this could result in spurious events, which should be fine. */ | 
 | static void fire_existing_events(int fd, int ep_events, | 
 |                                  struct event_queue *ev_q) | 
 | { | 
 | 	struct stat stat_buf[1]; | 
 | 	struct event_msg ev_msg[1]; | 
 | 	int ret; | 
 | 	int synth_ep_events = 0; | 
 |  | 
 | 	ret = fstat(fd, stat_buf); | 
 | 	assert(!ret); | 
 | 	if ((ep_events & EPOLLIN) && S_READABLE(stat_buf->st_mode)) | 
 | 		synth_ep_events |= EPOLLIN; | 
 | 	if ((ep_events & EPOLLOUT) && S_WRITABLE(stat_buf->st_mode)) | 
 | 		synth_ep_events |= EPOLLOUT; | 
 | 	if (synth_ep_events) { | 
 | 		ev_msg->ev_type = fd; | 
 | 		ev_msg->ev_arg2 = ep_events_to_taps(synth_ep_events); | 
 | 		ev_msg->ev_arg3 = 0; /* tap->data is unused for epoll. */ | 
 | 		sys_send_event(ev_q, ev_msg, vcore_id()); | 
 | 	} | 
 | } | 
 |  | 
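 | /* Registers an FD tap for fd, routed to this epoll set's CEQ evq with | 
 |  * ev_id == fd, and stashes the per-FD bookkeeping in that CEQ slot's | 
 |  * user_data.  Fails if fd is already in the set or beyond the CEQ's size. */ | 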
 | static int __epoll_ctl_add_raw(struct epoll_ctlr *ep, int fd, | 
 |                                struct epoll_event *event) | 
 | { | 
 | 	struct ceq_event *ceq_ev; | 
 | 	struct ep_fd_data *ep_fd; | 
 | 	struct fd_tap_req tap_req = {0}; | 
 | 	int ret, filter; | 
 |  | 
 | 	ceq_ev = ep_get_ceq_ev(ep, fd); | 
 | 	if (!ceq_ev) { | 
 | 		errno = ENOMEM; | 
 | 		werrstr("Epoll set cannot grow yet!"); | 
 | 		return -1; | 
 | 	} | 
 | 	ep_fd = (struct ep_fd_data*)ceq_ev->user_data; | 
 | 	if (ep_fd) { | 
 | 		errno = EEXIST; | 
 | 		return -1; | 
 | 	} | 
 | 	tap_req.fd = fd; | 
 | 	tap_req.cmd = FDTAP_CMD_ADD; | 
 | 	/* EPOLLHUP is implicitly set for all epolls. */ | 
 | 	filter = ep_events_to_taps(event->events | EPOLLHUP); | 
 | 	tap_req.filter = filter; | 
 | 	tap_req.ev_q = ep->ceq_evq; | 
 | 	tap_req.ev_id = fd;	/* using FD as the CEQ ID */ | 
 | 	ret = sys_tap_fds(&tap_req, 1); | 
 | 	if (ret != 1) | 
 | 		return -1; | 
 | 	ep_fd = malloc(sizeof(struct ep_fd_data)); | 
 | 	ep_fd->fd = fd; | 
 | 	ep_fd->filter = filter; | 
 | 	ep_fd->ep_event = *event; | 
 | 	ep_fd->ep_event.events |= EPOLLHUP; | 
 | 	ceq_ev->user_data = (uint64_t)ep_fd; | 
 | 	fire_existing_events(fd, ep_fd->ep_event.events, ep->ceq_evq); | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int __epoll_ctl_add(struct epoll_ctlr *ep, int fd, | 
 |                            struct epoll_event *event) | 
 | { | 
 | 	int ret, sock_listen_fd, sock_ctl_fd; | 
 | 	struct epoll_event listen_event; | 
 |  | 
 | 	/* Only edge-triggered is supported.  EPOLLONESHOT is rejected too; | 
 | 	 * ignoring it might work, just with spurious events firing. */ | 
 | 	if (!(event->events & EPOLLET)) { | 
 | 		errno = EPERM; | 
 | 		werrstr("Epoll level-triggered not supported"); | 
 | 		return -1; | 
 | 	} | 
 | 	if (event->events & EPOLLONESHOT) { | 
 | 		errno = EPERM; | 
 | 		werrstr("Epoll one-shot not supported"); | 
 | 		return -1; | 
 | 	} | 
 | 	/* The sockets-to-plan9 networking shims are a bit inconvenient.  The | 
 | 	 * user asked us to epoll on an FD, but that FD is actually a Qdata FD. | 
 | 	 * We might need to actually epoll on the listen_fd.  Further, we don't | 
 | 	 * know yet whether or not they want the listen FD.  They could epoll on | 
 | 	 * the socket, then listen later and want to wake up on the listen. | 
 | 	 * | 
 | 	 * So in the case we have a socket FD, we'll actually open the listen FD | 
 | 	 * regardless (glibc handles this), and we'll epoll on both FDs. | 
 | 	 * Technically, either FD could fire and they'd get an epoll event for | 
 | 	 * it, but I think socket users will use only listen or data. | 
 | 	 * | 
 | 	 * As far as tracking the FD goes for epoll_wait() reporting, if the app | 
 | 	 * wants to track the FD they think we are using, then they already | 
 | 	 * passed that in event->data. */ | 
 | 	_sock_lookup_rock_fds(fd, TRUE, &sock_listen_fd, &sock_ctl_fd); | 
 | 	if (sock_listen_fd >= 0) { | 
 | 		listen_event.events = EPOLLET | EPOLLIN | EPOLLHUP; | 
 | 		listen_event.data = event->data; | 
 | 		ret = __epoll_ctl_add_raw(ep, sock_listen_fd, &listen_event); | 
 | 		if (ret < 0) | 
 | 			return ret; | 
 | 	} | 
 | 	return __epoll_ctl_add_raw(ep, fd, event); | 
 | } | 
 |  | 
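 | /* Removes the FD tap for fd (ignoring failure, since the FD may already be | 
 |  * closed) and frees the per-FD bookkeeping stored in the CEQ slot. */ | 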
 | static int __epoll_ctl_del_raw(struct epoll_ctlr *ep, int fd, | 
 |                                struct epoll_event *event) | 
 | { | 
 | 	struct ceq_event *ceq_ev; | 
 | 	struct ep_fd_data *ep_fd; | 
 | 	struct fd_tap_req tap_req = {0}; | 
 |  | 
 | 	ceq_ev = ep_get_ceq_ev(ep, fd); | 
 | 	if (!ceq_ev) { | 
 | 		errno = ENOENT; | 
 | 		return -1; | 
 | 	} | 
 | 	ep_fd = (struct ep_fd_data*)ceq_ev->user_data; | 
 | 	if (!ep_fd) { | 
 | 		errno = ENOENT; | 
 | 		return -1; | 
 | 	} | 
 | 	assert(ep_fd->fd == fd); | 
 | 	tap_req.fd = fd; | 
 | 	tap_req.cmd = FDTAP_CMD_REM; | 
 | 	/* ignoring the return value; we could have failed to remove it if the | 
 | 	 * FD has already closed and the kernel removed the tap. */ | 
 | 	sys_tap_fds(&tap_req, 1); | 
 | 	ceq_ev->user_data = 0; | 
 | 	free(ep_fd); | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int __epoll_ctl_del(struct epoll_ctlr *ep, int fd, | 
 |                            struct epoll_event *event) | 
 | { | 
 | 	int sock_listen_fd, sock_ctl_fd; | 
 |  | 
 | 	/* If we were dealing with a socket shim FD, we tapped both the listen | 
 | 	 * and the data file and need to untap both of them. | 
 | 	 * | 
 | 	 * We could be called from a close_cb, and we already closed the listen | 
 | 	 * FD.  In that case, we don't want to try and open it.  If the listen | 
 | 	 * FD isn't open, then we know it isn't in an epoll set.  We also know | 
 | 	 * the data FD isn't epolled either, since we always epoll both FDs for | 
 | 	 * rocks. */ | 
 | 	_sock_lookup_rock_fds(fd, FALSE, &sock_listen_fd, &sock_ctl_fd); | 
 | 	if (sock_listen_fd >= 0) { | 
 | 		/* It's possible to fail here.  Even though we tapped it | 
 | 		 * already, if the deletion was triggered from close callbacks, | 
 | 		 * it's possible for the sock_listen_fd to be closed first, | 
 | 		 * which would have triggered an epoll_ctl_del.  When we get | 
 | 		 * around to closing the Rock FD, the listen FD was already | 
 | 		 * closed. */ | 
 | 		__epoll_ctl_del_raw(ep, sock_listen_fd, event); | 
 | 	} | 
 | 	return __epoll_ctl_del_raw(ep, fd, event); | 
 | } | 
 |  | 
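 | /* Adds, modifies, or removes fd in the epoll set behind epfd.  The set's | 
 |  * mutex serializes these ops against each other and against waiters reading | 
 |  * the per-FD state. */ | 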
 | int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) | 
 | { | 
 | 	int ret; | 
 | 	struct epoll_ctlr *ep = fd_to_cltr(epfd); | 
 |  | 
 | 	if (!ep) { | 
 | 		errno = EBADF;	/* or EINVAL */ | 
 | 		return -1; | 
 | 	} | 
 | 	if (fd >= USER_FD_BASE) { | 
 | 		errno = EINVAL; | 
 | 		werrstr("Epoll can't track User FDs"); | 
 | 		return -1; | 
 | 	} | 
 | 	uth_mutex_lock(ep->mtx); | 
 | 	switch (op) { | 
 | 	case (EPOLL_CTL_MOD): | 
 | 		/* In lieu of a proper MOD, just remove and readd.  The errors | 
 | 		 * might not work out well, and there could be a missed event in | 
 | 		 * the middle.  Not sure what the guarantees are, but we can | 
 | 		 * fake a poke. (TODO). */ | 
 | 		ret = __epoll_ctl_del(ep, fd, 0); | 
 | 		if (ret) | 
 | 			break; | 
 | 		ret = __epoll_ctl_add(ep, fd, event); | 
 | 		break; | 
 | 	case (EPOLL_CTL_ADD): | 
 | 		ret = __epoll_ctl_add(ep, fd, event); | 
 | 		break; | 
 | 	case (EPOLL_CTL_DEL): | 
 | 		ret = __epoll_ctl_del(ep, fd, event); | 
 | 		break; | 
 | 	default: | 
 | 		errno = EINVAL; | 
 | 		ret = -1; | 
 | 	} | 
 | 	uth_mutex_unlock(ep->mtx); | 
 | 	return ret; | 
 | } | 
 |  | 
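 | /* Translates a CEQ event message (ev_type == FD, ev_arg2 == tap filters) into | 
 |  * the user's epoll_event.  Returns FALSE if the FD is no longer in the set | 
 |  * (a stale message).  Caller holds ep->mtx. */ | 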
 | static bool get_ep_event_from_msg(struct epoll_ctlr *ep, struct event_msg *msg, | 
 |                                   struct epoll_event *ep_ev) | 
 | { | 
 | 	struct ceq_event *ceq_ev; | 
 | 	struct ep_fd_data *ep_fd; | 
 |  | 
 | 	ceq_ev = ep_get_ceq_ev(ep, msg->ev_type); | 
 | 	/* should never get a tap FD > size of the epoll set */ | 
 | 	assert(ceq_ev); | 
 | 	ep_fd = (struct ep_fd_data*)ceq_ev->user_data; | 
 | 	if (!ep_fd) { | 
 | 		/* it's possible the FD was unregistered and this was an old | 
 | 		 * event sent to this epoll set. */ | 
 | 		return FALSE; | 
 | 	} | 
 | 	ep_ev->data = ep_fd->ep_event.data; | 
 | 	/* The events field was initialized to 0 in epoll_wait() */ | 
 | 	ep_ev->events |= taps_to_ep_events(msg->ev_arg2); | 
 | 	return TRUE; | 
 | } | 
 |  | 
 | /* Helper: extracts as many epoll_events as possible from the ep. */ | 
 | static int __epoll_wait_poll(struct epoll_ctlr *ep, struct epoll_event *events, | 
 |                              int maxevents) | 
 | { | 
 | 	struct event_msg msg = {0}; | 
 | 	int nr_ret = 0; | 
 |  | 
 | 	if (maxevents <= 0) | 
 | 		return 0; | 
 | 	/* Locking to protect get_ep_event_from_msg, specifically that the ep_fd | 
 | 	 * stored at ceq_ev->user_data does not get concurrently removed and | 
 | 	 * freed. */ | 
 | 	uth_mutex_lock(ep->mtx); | 
 | 	for (int i = 0; i < maxevents; i++) { | 
 | retry: | 
 | 		if (!uth_check_evqs(&msg, NULL, 1, ep->ceq_evq)) | 
 | 			break; | 
 | 		if (!get_ep_event_from_msg(ep, &msg, &events[i])) | 
 | 			goto retry; | 
 | 		nr_ret++; | 
 | 	} | 
 | 	uth_mutex_unlock(ep->mtx); | 
 | 	return nr_ret; | 
 | } | 
 |  | 
 | /* We should be able to have multiple waiters.  ep shouldn't be closed or | 
 |  * anything, since we have the FD (that'd be bad programming on the user's | 
 |  * part).  We could have concurrent ADD/MOD/DEL operations (which lock). */ | 
 | static int __epoll_wait(struct epoll_ctlr *ep, struct epoll_event *events, | 
 |                         int maxevents, int timeout) | 
 | { | 
 | 	struct event_msg msg = {0}; | 
 | 	struct event_msg dummy_msg; | 
 | 	struct event_queue *which_evq; | 
 | 	struct ep_alarm *ep_a; | 
 | 	int nr_ret; | 
 |  | 
 | 	nr_ret = __epoll_wait_poll(ep, events, maxevents); | 
 | 	if (nr_ret) | 
 | 		return nr_ret; | 
 | 	if (timeout == 0) | 
 | 		return 0; | 
 | 	/* From here on down, we're going to block until there is some | 
 | 	 * activity. */ | 
 | 	if (timeout != -1) { | 
 | 		ep_a = kmem_cache_alloc(ep_alarms_cache, 0); | 
 | 		assert(ep_a); | 
 | 		syscall_async_evq(&ep_a->sysc, ep_a->alarm_evq, SYS_block, | 
 | 		                  (uint64_t)timeout * 1000); | 
 | 		uth_blockon_evqs(&msg, &which_evq, 2, ep->ceq_evq, | 
 | 				 ep_a->alarm_evq); | 
 | 		if (which_evq == ep_a->alarm_evq) { | 
 | 			kmem_cache_free(ep_alarms_cache, ep_a); | 
 | 			return 0; | 
 | 		} | 
 | 		/* The alarm sysc may or may not have finished yet.  This will | 
 | 		 * force it to *start* to finish iff it is still a submitted | 
 | 		 * syscall. */ | 
 | 		sys_abort_sysc(&ep_a->sysc); | 
 | 		/* But we still need to wait until the syscall completed.  Need | 
 | 		 * a dummy msg, since we don't want to clobber the real msg. */ | 
 | 		uth_blockon_evqs(&dummy_msg, 0, 1, ep_a->alarm_evq); | 
 | 		kmem_cache_free(ep_alarms_cache, ep_a); | 
 | 	} else { | 
 | 		uth_blockon_evqs(&msg, &which_evq, 1, ep->ceq_evq); | 
 | 	} | 
 | 	uth_mutex_lock(ep->mtx); | 
 | 	if (get_ep_event_from_msg(ep, &msg, &events[0])) | 
 | 		nr_ret = 1; | 
 | 	uth_mutex_unlock(ep->mtx); | 
 | 	/* We had to extract one message already as part of the blocking | 
 | 	 * process.  We might be able to get more. */ | 
 | 	nr_ret += __epoll_wait_poll(ep, events + nr_ret, maxevents - nr_ret); | 
 | 	/* This is a little nasty and hopefully a rare race.  We still might not | 
 | 	 * have a ret, but we expected to block until we had something.  We | 
 | 	 * didn't time out yet, but we spuriously woke up.  We need to try again | 
 | 	 * (ideally, we'd subtract the time left from the original timeout). */ | 
 | 	if (!nr_ret) | 
 | 		return __epoll_wait(ep, events, maxevents, timeout); | 
 | 	return nr_ret; | 
 | } | 
 |  | 
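 | /* Waits for up to maxevents events on the epoll set behind epfd, blocking for | 
 |  * at most timeout msec (-1 blocks indefinitely, 0 just polls).  events is | 
 |  * zeroed here so the helpers can OR flags into each slot. */ | 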
 | int epoll_wait(int epfd, struct epoll_event *events, int maxevents, | 
 |                int timeout) | 
 | { | 
 | 	struct epoll_ctlr *ep = fd_to_cltr(epfd); | 
 |  | 
 | 	if (!ep) { | 
 | 		errno = EBADF;	/* or EINVAL */ | 
 | 		return -1; | 
 | 	} | 
 | 	if (maxevents <= 0) { | 
 | 		errno = EINVAL; | 
 | 		return -1; | 
 | 	} | 
 | 	for (int i = 0; i < maxevents; i++) | 
 | 		events[i].events = 0; | 
 | 	return __epoll_wait(ep, events, maxevents, timeout); | 
 | } | 
 |  | 
 | int epoll_pwait(int epfd, struct epoll_event *events, int maxevents, | 
 |                 int timeout, const sigset_t *sigmask) | 
 | { | 
 | 	int ready; | 
 | 	sigset_t origmask; | 
 |  | 
 | 	/* TODO: this is probably racy */ | 
 | 	sigprocmask(SIG_SETMASK, sigmask, &origmask); | 
 | 	ready = epoll_wait(epfd, events, maxevents, timeout); | 
 | 	sigprocmask(SIG_SETMASK, &origmask, NULL); | 
 | 	return ready; | 
 | } |