user/iplib/select.c - upstream - Git at Google

 /* Copyright (c) 2016 Google Inc.
  * Barret Rhoden <brho@cs.berkeley.edu>
  * See LICENSE for details.
  *
  * select()
  *
  * Our select() is a bit rough and only works with FDs where fstat can return
  * S_READABLE or S_WRITABLE.  For the most part, this applies to qio queues,
  * which are the basis for a lot of the network stack and pipes.  FDs where
  * fstat doesn't tell us the readiness will have races.
  *
  * Under the hood, our select() is implemented with epoll (and under that, FD
  * taps).  Those can only detect edges (e.g. a socket becomes readable).
  *
  * The problem is that we want to detect a level status (e.g. socket is
  * readable) with an edge event (e.g. socket *becomes* readable).  To do this,
  * when someone initially selects, the FD gets tracked with epoll and we
  * manually poll the FDs with fstat.  Subsequent selects() will still be tracked
  * in the epoll set, but since apps can select() even on FDs they didn't drain
  * to the point of blocking, we still need to poll every FD on every select()
  * call.
  *
  * We maintain one FD set per program.  It tracks *any* FD being tracked by
  * *any* select call.  This is because you can only have one tap per FD.
  * Regardless of whether the user asked for read/write/except, the FD gets
  * watched for anything until it closes.
  *
  * One issue with the global FD set is that one thread may consume the epoll
  * events intended for another thread (or even for itself at another call
  * site!).  To get around this, only one thread is the actual epoller, and the
  * others block on a mutex.  TLS isn't an option for two reasons: not all 2LSs
  * use TLS (minor concern, maybe they should) and there are some threads who
  * make multiple select calls - we actually want per-call-site-and-thread fd
  * sets.
  *
  * Notes:
  * - pselect might be racy
  * - if the user has no read/write/except sets, we won't wait.  some users of
  *   select use it as a timer only.  if that comes up, we can expand this.
  * - if you epoll or FD tap an FD, then try to use select on it, you'll get an
  *   error (only one tap per FD).  select() only knows about the FDs in its set.
  * - if you select() on a readfd that is a disk file, it'll always say it is
  *   available for I/O.
  */

 #include <sys/select.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <unistd.h>

 #include <errno.h>
 #include <limits.h>
 #include <malloc.h>
 #include <parlib/arch/arch.h>
 #include <parlib/uthread.h>
 #include <parlib/parlib.h>
 #include <ros/common.h>
 #include <ros/fs.h>
 #include <signal.h>
 #include <stdlib.h>
 #include <sys/close_cb.h>
 #include <sys/epoll.h>
 #include <sys/fork_cb.h>

 /* FD of the epoll instance shared by every select() call in this program.
  * Created once by select_init(). */
 static int epoll_fd;
 /* Set of FDs that select() has registered with epoll_fd.  Bits are cleared by
  * the close callback (select_fd_closed) and the fork callback
  * (select_forked). */
 static fd_set all_fds;
 /* Scratch result sets for the select() call currently in progress.  They are
  * global, so select() only uses them while holding epoll_mtx. */
 static fd_set working_read_fds;
 static fd_set working_write_fds;
 static fd_set working_except_fds;
 /* Serializes select() callers; also taken whenever all_fds is modified. */
 static uth_mutex_t *epoll_mtx;

 /* Returns whether fd is a member of set.
  *
  * Tolerates a NULL set and FDs that cannot be represented in an fd_set; both
  * report "not set" instead of touching memory outside the set.  Negative FDs
  * passed by callers arrive here as huge unsigned values and are rejected by
  * the same range check. */
 static bool fd_is_set(unsigned int fd, fd_set *set)
 {
 	/* Valid fd_set indices are 0 .. FD_SETSIZE - 1, so FD_SETSIZE itself
 	 * has to be rejected as well. */
 	return fd < FD_SETSIZE && set && FD_ISSET(fd, set);
 }

 /* Close callback: stop tracking fd once it has been closed.
  *
  * Only all_fds needs updating; the epoll set drops the FD on its own when it
  * is closed. */
 static void select_fd_closed(int fd)
 {
 	/* Unlocked peek so closes of FDs we never tracked skip the mutex.  This
 	 * is slightly racy, but an FD added concurrently will get its own close
 	 * callback later, after its bit is set. */
 	if (fd_is_set(fd, &all_fds)) {
 		uth_mutex_lock(epoll_mtx);
 		FD_CLR(fd, &all_fds);
 		uth_mutex_unlock(epoll_mtx);
 	}
 }

 /* Fork callback: forget every FD select() was tracking.
  *
  * Each tracked FD is removed from the epoll set and from all_fds, so that a
  * later select() can register it again from scratch. */
 static void select_forked(void)
 {
 	struct epoll_event ep_ev;
 	int fd;

 	uth_mutex_lock(epoll_mtx);
 	for (fd = 0; fd < FD_SETSIZE; fd++) {
 		if (!fd_is_set(fd, &all_fds))
 			continue;
 		ep_ev.events = EPOLLET | EPOLLIN | EPOLLOUT | EPOLLHUP |
 			       EPOLLERR;
 		ep_ev.data.fd = fd;
 		/* The result is deliberately ignored: the underlying tap is
 		 * gone and the epoll ctlr may already be empty.  All we need is
 		 * that no epoll/tap remains, so a future CTL_ADD doesn't
 		 * fail. */
 		epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ep_ev);
 		FD_CLR(fd, &all_fds);
 	}
 	uth_mutex_unlock(epoll_mtx);
 }

 static void select_init(void *arg)
 {
 	static struct close_cb select_close_cb = {.func = select_fd_closed};
 	static struct fork_cb select_fork_cb = {.func = select_forked};

 	register_close_cb(&select_close_cb);
 	epoll_fd = epoll_create(FD_SETSIZE);
 	if (epoll_fd < 0) {
 		perror("select failed epoll_create");
 		exit(-1);
 	}
 	epoll_mtx = uth_mutex_alloc();
 	register_fork_cb(&select_fork_cb);
 }

 /* Converts a select() timeout into an epoll_wait() timeout in milliseconds,
  * rounding partial milliseconds up.
  *
  * Returns -1 ("wait forever") only for a NULL tv.  Intervals too long to fit
  * in an int are clamped to INT_MAX instead of wrapping around; select()
  * recomputes the remaining time after every wait, so a clamped wait simply
  * gets followed by another one.  A negative field is treated as an already
  * expired interval (0), so a bogus timeval can never turn into the -1 that
  * means "block forever". */
 static int select_tv_to_ep_timeout(struct timeval *tv)
 {
 	long long msec;

 	if (!tv)
 		return -1;
 	if (tv->tv_sec < 0 || tv->tv_usec < 0)
 		return 0;
 	/* Checked first so the multiplication below cannot overflow. */
 	if (tv->tv_sec > INT_MAX / 1000)
 		return INT_MAX;
 	msec = (long long)tv->tv_sec * 1000;
 	/* Round up without adding to tv_usec, which could itself overflow. */
 	msec += tv->tv_usec / 1000 + (tv->tv_usec % 1000 != 0);
 	if (msec > INT_MAX)
 		return INT_MAX;
 	return (int)msec;
 }

 /* Helper: asks the kernel (via fstat) whether FD is readable/writable right
  * now.  Apps may select() on an FD that is already actionable, without first
  * draining it to EAGAIN, so the edge-triggered epoll events alone are not
  * enough.
  *
  * Only the conditions the caller asked about are checked: readable if FD is
  * in readfds, writable if FD is in writefds.  Each condition that holds sets
  * the FD's bit in the matching global working_ set.  Returns the number of
  * bits turned on: 0, 1 (readable xor writable) or 2 (both).
  *
  * TODO: this *won't* work for disk based files.  It only works on qids that are
  * backed with qio queues or something similar, where the device has support for
  * setting DMREADABLE/DMWRITABLE. */
 static unsigned int fd_set_actionable(int fd, fd_set *readfds, fd_set *writefds)
 {
 	struct stat stat_buf;
 	bool want_read = fd_is_set(fd, readfds);
 	bool want_write = fd_is_set(fd, writefds);
 	unsigned int nr_bits = 0;
 	int ret;

 	/* Skip the stat call for FDs this caller isn't asking about; it would
 	 * either fail or describe some unrelated FD. */
 	if (!want_read && !want_write)
 		return 0;
 	ret = fstat(fd, &stat_buf);
 	assert(!ret);
 	if (want_read && S_READABLE(stat_buf.st_mode)) {
 		FD_SET(fd, &working_read_fds);
 		nr_bits++;
 	}
 	if (want_write && S_WRITABLE(stat_buf.st_mode)) {
 		FD_SET(fd, &working_write_fds);
 		nr_bits++;
 	}
 	return nr_bits;
 }

 /* Helper: if ep_result carries any of the events in ep_event_types and its FD
  * is in watched_fds, sets that FD's bit in ret_fds.  Returns 1 if a bit was
  * newly set, 0 otherwise (no matching event, FD not watched, or bit already
  * set). */
 static int extract_bits_for_events(struct epoll_event *ep_result,
                                    uint32_t ep_event_types,
                                    fd_set *watched_fds, fd_set *ret_fds)
 {
 	int fd = ep_result->data.fd;

 	if (!(ep_result->events & ep_event_types))
 		return 0;
 	if (!fd_is_set(fd, watched_fds))
 		return 0;
 	if (FD_ISSET(fd, ret_fds))
 		return 0;
 	FD_SET(fd, ret_fds);
 	return 1;
 }

 /* Waits until an FD in readfds/writefds/exceptfds is actionable or until
  * timeout expires.  Any of the sets may be NULL; a NULL timeout waits
  * indefinitely.
  *
  * Each pass of the loop:
  *  1. registers every requested FD that isn't tracked yet with the shared
  *     epoll set,
  *  2. polls every requested FD with fstat, since epoll only reports edges and
  *     the FD may already be actionable,
  *  3. if nothing is actionable yet, blocks in epoll_wait and translates the
  *     events that belong to this call into fd_set bits.
  * epoll_mtx is held for the whole pass, so no other select() caller can
  * consume our epoll events or touch the global working_ sets.
  *
  * Returns the number of bits set across the result sets, 0 on timeout (with
  * every provided set cleared), or -1 with errno set.  On error the sets are
  * left unmodified.  *timeout may be modified. */
 int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
            struct timeval *timeout)
 {
 	struct epoll_event ep_ev;
 	struct epoll_event *ep_results;
 	int ret, ep_ret, ep_timeout, saved_errno;
 	static parlib_once_t once = PARLIB_ONCE_INIT;
 	struct timeval start_tv[1], end_tv[1];

 	parlib_run_once(&once, select_init, NULL);
 	/* nfds must lie within [0, FD_SETSIZE]; anything else would index
 	 * outside the fd_sets. */
 	if (nfds < 0 || nfds > FD_SETSIZE) {
 		errno = EINVAL;
 		return -1;
 	}
 	/* A negative interval is invalid, and must not be allowed to turn into
 	 * a negative epoll timeout (which means "wait forever"). */
 	if (timeout && (timeout->tv_sec < 0 || timeout->tv_usec < 0)) {
 		errno = EINVAL;
 		return -1;
 	}
 loop:
 	if (timeout)
 		gettimeofday(start_tv, NULL);
 	ep_timeout = select_tv_to_ep_timeout(timeout);
 	uth_mutex_lock(epoll_mtx);
 	for (int i = 0; i < nfds; i++) {
 		if (!(fd_is_set(i, readfds) || fd_is_set(i, writefds) ||
 		      fd_is_set(i, exceptfds)))
 			continue;
 		if (fd_is_set(i, &all_fds))
 			continue;
 		/* FDs that we track for *any* reason with select will be
 		 * tracked for *all* reasons with epoll. */
 		ep_ev.events = EPOLLET | EPOLLIN | EPOLLOUT | EPOLLHUP |
 			       EPOLLERR;
 		ep_ev.data.fd = i;
 		if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, i, &ep_ev)) {
 			/* We might have failed because we tried to set up too
 			 * many FD tap types.  Listen FDs, for instance, can
 			 * only be tapped for READABLE and HANGUP.  Let's try
 			 * for one of those. */
 			if (errno == ENOSYS) {
 				ep_ev.events = EPOLLET | EPOLLIN | EPOLLHUP;
 				if (!epoll_ctl(epoll_fd, EPOLL_CTL_ADD, i,
 					       &ep_ev)) {
 					FD_SET(i, &all_fds);
 					continue;
 				}
 			}
 			/* Careful to unlock before calling perror.  perror
 			 * calls close, which calls our CB, which grabs the
 			 * lock.  Keep the epoll_ctl errno for the caller. */
 			saved_errno = errno;
 			uth_mutex_unlock(epoll_mtx);
 			perror("select epoll_ctl failed");
 			errno = saved_errno;
 			return -1;
 		}
 		/* Only mark the FD as tracked once epoll really has it.
 		 * Marking it earlier would make later calls skip the
 		 * registration after a failed add and then wait for events
 		 * that can never arrive. */
 		FD_SET(i, &all_fds);
 	}
 	/* We can't rely on epoll alone:
 	 * - FDs we just added may already be actionable, and we will only see
 	 *   edge-triggered changes from now on.
 	 * - It is legal to select on a readable FD that wasn't drained, and on
 	 *   a writable FD that isn't full.
 	 * - The epoll set is global, so an earlier select() (another thread, or
 	 *   this thread with a different FD set) may have consumed events that
 	 *   were meant for this call.
 	 * So poll every FD this call cares about, while holding the mutex. */
 	ret = 0;
 	FD_ZERO(&working_read_fds);
 	FD_ZERO(&working_write_fds);
 	FD_ZERO(&working_except_fds);
 	/* Note the helper sets bits in the working_ fd sets */
 	for (int i = 0; i < nfds; i++)
 		ret += fd_set_actionable(i, readfds, writefds);
 	if (ret) {
 		if (readfds)
 			*readfds = working_read_fds;
 		if (writefds)
 			*writefds = working_write_fds;
 		/* Nothing exceptional was detected by the poll, so the caller
 		 * must get an empty set back, not its own input bits. */
 		if (exceptfds)
 			*exceptfds = working_except_fds;
 		uth_mutex_unlock(epoll_mtx);
 		return ret;
 	}
 	/* Need to check for up to FD_SETSIZE - nfds isn't the size of all FDs
 	 * tracked; it's the size of only our current select call */
 	ep_results = malloc(sizeof(struct epoll_event) * FD_SETSIZE);
 	if (!ep_results) {
 		uth_mutex_unlock(epoll_mtx);
 		errno = ENOMEM;
 		return -1;
 	}
 	ep_ret = epoll_wait(epoll_fd, ep_results, FD_SETSIZE, ep_timeout);
 	if (ep_ret < 0) {
 		/* Report the failure instead of treating it as "no events" and
 		 * retrying, which would spin forever when there is no timeout.
 		 * free/unlock may clobber errno, so save it. */
 		saved_errno = errno;
 		free(ep_results);
 		uth_mutex_unlock(epoll_mtx);
 		errno = saved_errno;
 		return -1;
 	}
 	/* We need to hold the mtx during all of this processing since we're
 	 * using the global working_ fds sets.  We can't modify the
 	 * readfds/writefds/exceptfds until we're sure we are done. */
 	ret = 0;
 	/* Note that ret can be > ep_ret.  An FD that is both readable and
 	 * writable counts as one event for epoll, but as two bits for select.
 	 * */
 	for (int i = 0; i < ep_ret; i++) {
 		ret += extract_bits_for_events(&ep_results[i],
 					       EPOLLIN | EPOLLHUP,
 					       readfds, &working_read_fds);
 		ret += extract_bits_for_events(&ep_results[i],
 					       EPOLLOUT | EPOLLHUP,
 					       writefds, &working_write_fds);
 		ret += extract_bits_for_events(&ep_results[i], EPOLLERR,
 					       exceptfds, &working_except_fds);
 	}
 	free(ep_results);
 	if (ret) {
 		if (readfds)
 			*readfds = working_read_fds;
 		if (writefds)
 			*writefds = working_write_fds;
 		if (exceptfds)
 			*exceptfds = working_except_fds;
 	}
 	uth_mutex_unlock(epoll_mtx);
 	/* TODO: Consider updating timeval for non-timeouts.  It's not mandatory
 	 * (POSIX). */
 	if (ret)
 		return ret;
 	/* If we have no rets at this point, there are a few options: we could
 	 * have timed out (if requested), or we could have consumed someone
 	 * else's event.  No one could have consumed our event, since we were
 	 * the only epoller (while holding the mtx).  In the latter case, we'll
 	 * need to try again, but with an updated timeout. */
 	if (timeout) {
 		gettimeofday(end_tv, NULL);
 		timersub(end_tv, start_tv, end_tv);	/* diff in end_tv */
 		if (!timercmp(timeout, end_tv, >)) {
 			/* select timed out: every set must come back empty */
 			if (readfds)
 				FD_ZERO(readfds);
 			if (writefds)
 				FD_ZERO(writefds);
 			if (exceptfds)
 				FD_ZERO(exceptfds);
 			return 0;
 		}
 		timersub(timeout, end_tv, timeout);
 	}
 	goto loop;
 }

 /* pselect(): select() with a nanosecond-resolution timeout and a signal mask
  * that is installed for the duration of the wait.
  *
  * timeout is copied into a local timeval (rounded up to whole microseconds),
  * so the caller's timespec is never modified.  A NULL timeout waits
  * indefinitely.  Returns whatever select() returns, with select()'s errno
  * preserved. */
 int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
             const struct timespec *timeout, const sigset_t *sigmask)
 {
 	int ready, saved_errno;
 	sigset_t origmask;
 	struct timeval local_tv, *tv = NULL;

 	if (timeout) {
 		local_tv.tv_sec = timeout->tv_sec;
 		local_tv.tv_usec = DIV_ROUND_UP(timeout->tv_nsec, 1000);
 		/* Rounding up can yield exactly one full second of
 		 * microseconds; carry it so the timeval stays normalized. */
 		if (local_tv.tv_usec >= 1000000) {
 			local_tv.tv_sec++;
 			local_tv.tv_usec -= 1000000;
 		}
 		tv = &local_tv;
 	}
 	/* TODO: this is probably racy: the mask swap and the wait are not one
 	 * atomic step. */
 	sigprocmask(SIG_SETMASK, sigmask, &origmask);
 	ready = select(nfds, readfds, writefds, exceptfds, tv);
 	/* Restoring the mask must not hide select's error from the caller. */
 	saved_errno = errno;
 	sigprocmask(SIG_SETMASK, &origmask, NULL);
 	errno = saved_errno;
 	return ready;
 }
	/* Copyright (c) 2016 Google Inc.
	* Barret Rhoden <brho@cs.berkeley.edu>
	* See LICENSE for details.
	*
	* select()
	*
	* Our select() is a bit rough and only works with FDs where fstat can return
	* S_READABLE or S_WRITABLE. For the most part, this applies to qio queues,
	* which are the basis for a lot of the network stack and pipes. FDs where
	* fstat doesn't tell us the readiness will have races.
	*
	* Under the hood, our select() is implemented with epoll (and under that, FD
	* taps). Those can only detect edges (e.g. a socket becomes readable).
	*
	* The problem is that we want to detect a level status (e.g. socket is
	* readable) with an edge event (e.g. socket becomes readable). To do this,
	* when someone initially selects, the FD gets tracked with epoll and we
	* manually poll the FDs with fstat. Subsequent selects() will still be tracked
	* in the epoll set, but since apps can select() even on FDs they didn't drain
	* to the point of blocking, we still need to poll every FD on every select()
	* call.
	*
	* We maintain one FD set per program. It tracks any FD being tracked by
	* any select call. This is because you can only have one tap per FD.
	* Regardless of whether the user asked for read/write/except, the FD gets
	* watched for anything until it closes.
	*
	* One issue with the global FD set is that one thread may consume the epoll
	* events intended for another thread (or even for itself at another call
	* site!). To get around this, only one thread is the actual epoller, and the
	* others block on a mutex. TLS isn't an option for two reasons: not all 2LSs
	* use TLS (minor concern, maybe they should) and there are some threads who
	* make multiple select calls - we actually want per-call-site-and-thread fd
	* sets.
	*
	* Notes:
	* - pselect might be racy
	* - if the user has no read/write/except sets, we won't wait. some users of
	* select use it as a timer only. if that comes up, we can expand this.
	* - if you epoll or FD tap an FD, then try to use select on it, you'll get an
	* error (only one tap per FD). select() only knows about the FDs in its set.
	* - if you select() on a readfd that is a disk file, it'll always say it is
	* available for I/O.
	*/

	#include <sys/select.h>
	#include <sys/stat.h>
	#include <sys/time.h>
	#include <sys/types.h>
	#include <unistd.h>

	#include <errno.h>
	#include <malloc.h>
	#include <parlib/arch/arch.h>
	#include <parlib/uthread.h>
	#include <parlib/parlib.h>
	#include <ros/common.h>
	#include <ros/fs.h>
	#include <signal.h>
	#include <stdlib.h>
	#include <sys/close_cb.h>
	#include <sys/epoll.h>
	#include <sys/fork_cb.h>

	/* FD of the epoll instance shared by every select() call in this program.
	 * Created once by select_init(). */
	static int epoll_fd;
	/* Set of FDs that select() has registered with epoll_fd.  Bits are cleared by
	 * the close callback (select_fd_closed) and the fork callback
	 * (select_forked). */
	static fd_set all_fds;
	/* Scratch result sets for the select() call currently in progress.  They are
	 * global, so select() only uses them while holding epoll_mtx. */
	static fd_set working_read_fds;
	static fd_set working_write_fds;
	static fd_set working_except_fds;
	/* Serializes select() callers; also taken whenever all_fds is modified. */
	static uth_mutex_t *epoll_mtx;

	/* Returns whether fd is a member of set.
	 *
	 * Tolerates a NULL set and FDs that cannot be represented in an fd_set; both
	 * report "not set" instead of touching memory outside the set.  Negative FDs
	 * passed by callers arrive here as huge unsigned values and are rejected by
	 * the same range check. */
	static bool fd_is_set(unsigned int fd, fd_set *set)
	{
		/* Valid fd_set indices are 0 .. FD_SETSIZE - 1, so FD_SETSIZE
		 * itself has to be rejected as well. */
		return fd < FD_SETSIZE && set && FD_ISSET(fd, set);
	}

	/* Close callback: stop tracking fd once it has been closed.
	 *
	 * Only all_fds needs updating; the epoll set drops the FD on its own when it
	 * is closed. */
	static void select_fd_closed(int fd)
	{
		/* Unlocked peek so closes of FDs we never tracked skip the
		 * mutex.  This is slightly racy, but an FD added concurrently
		 * will get its own close callback later, after its bit is
		 * set. */
		if (fd_is_set(fd, &all_fds)) {
			uth_mutex_lock(epoll_mtx);
			FD_CLR(fd, &all_fds);
			uth_mutex_unlock(epoll_mtx);
		}
	}

	/* Fork callback: forget every FD select() was tracking.
	 *
	 * Each tracked FD is removed from the epoll set and from all_fds, so that a
	 * later select() can register it again from scratch. */
	static void select_forked(void)
	{
		struct epoll_event ep_ev;

		uth_mutex_lock(epoll_mtx);
		for (int i = 0; i < FD_SETSIZE; i++) {
			if (!fd_is_set(i, &all_fds))
				continue;
			ep_ev.events = EPOLLET | EPOLLIN | EPOLLOUT | EPOLLHUP |
				       EPOLLERR;
			ep_ev.data.fd = i;
			/* Discard error.  The underlying tap is gone, and the
			 * epoll ctlr might also have been emptied.  We just
			 * want to make sure there is no epoll/tap so that a
			 * future CTL_ADD doesn't fail. */
			epoll_ctl(epoll_fd, EPOLL_CTL_DEL, i, &ep_ev);
			FD_CLR(i, &all_fds);
		}
		uth_mutex_unlock(epoll_mtx);
	}

	static void select_init(void *arg)
	{
	static struct close_cb select_close_cb = {.func = select_fd_closed};
	static struct fork_cb select_fork_cb = {.func = select_forked};

	register_close_cb(&select_close_cb);
	epoll_fd = epoll_create(FD_SETSIZE);
	if (epoll_fd < 0) {
	perror("select failed epoll_create");
	exit(-1);
	}
	epoll_mtx = uth_mutex_alloc();
	register_fork_cb(&select_fork_cb);
	}

	/* Converts a select() timeout into an epoll_wait() timeout in milliseconds,
	 * rounding partial milliseconds up.
	 *
	 * Returns -1 ("wait forever") only for a NULL tv.  Intervals too long to fit
	 * in an int are clamped to INT_MAX instead of wrapping around; select()
	 * recomputes the remaining time after every wait, so a clamped wait simply
	 * gets followed by another one.  A negative field is treated as an already
	 * expired interval (0), so a bogus timeval can never turn into the -1 that
	 * means "block forever". */
	static int select_tv_to_ep_timeout(struct timeval *tv)
	{
		long long msec;

		if (!tv)
			return -1;
		if (tv->tv_sec < 0 || tv->tv_usec < 0)
			return 0;
		/* Checked first so the multiplication below cannot overflow. */
		if (tv->tv_sec > INT_MAX / 1000)
			return INT_MAX;
		msec = (long long)tv->tv_sec * 1000;
		/* Round up without adding to tv_usec, which could itself
		 * overflow. */
		msec += tv->tv_usec / 1000 + (tv->tv_usec % 1000 != 0);
		if (msec > INT_MAX)
			return INT_MAX;
		return (int)msec;
	}

	/* Helper: asks the kernel (via fstat) whether FD is readable/writable right
	 * now.  Apps may select() on an FD that is already actionable, without first
	 * draining it to EAGAIN, so the edge-triggered epoll events alone are not
	 * enough.
	 *
	 * Only the conditions the caller asked about are checked: readable if FD is
	 * in readfds, writable if FD is in writefds.  Each condition that holds sets
	 * the FD's bit in the matching global working_ set.  Returns the number of
	 * bits turned on: 0, 1 (readable xor writable) or 2 (both).
	 *
	 * TODO: this won't work for disk based files.  It only works on qids that are
	 * backed with qio queues or something similar, where the device has support
	 * for setting DMREADABLE/DMWRITABLE. */
	static unsigned int fd_set_actionable(int fd, fd_set *readfds, fd_set *writefds)
	{
		struct stat stat_buf;
		int ret;

		/* Avoid the stat call on FDs we're not tracking (which should
		 * trigger an error, or give us the stat for FD 0). */
		if (!(fd_is_set(fd, readfds) || fd_is_set(fd, writefds)))
			return 0;
		ret = fstat(fd, &stat_buf);
		assert(!ret);
		ret = 0;
		if (fd_is_set(fd, readfds)) {
			if (S_READABLE(stat_buf.st_mode)) {
				ret++;
				FD_SET(fd, &working_read_fds);
			}
		}
		if (fd_is_set(fd, writefds)) {
			if (S_WRITABLE(stat_buf.st_mode)) {
				ret++;
				FD_SET(fd, &working_write_fds);
			}
		}
		return ret;
	}

	/* Helper: if ep_result carries any of the events in ep_event_types and its FD
	 * is in watched_fds, sets that FD's bit in ret_fds.  Returns 1 if a bit was
	 * newly set, 0 otherwise (no matching event, FD not watched, or bit already
	 * set). */
	static int extract_bits_for_events(struct epoll_event *ep_result,
					   uint32_t ep_event_types,
					   fd_set *watched_fds, fd_set *ret_fds)
	{
		int ret = 0;
		int fd = ep_result->data.fd;

		if (ep_result->events & ep_event_types) {
			if (fd_is_set(fd, watched_fds) && !FD_ISSET(fd, ret_fds)) {
				FD_SET(fd, ret_fds);
				ret++;
			}
		}
		return ret;
	}

	/* Waits until an FD in readfds/writefds/exceptfds is actionable or until
	 * timeout expires.  Any of the sets may be NULL; a NULL timeout waits
	 * indefinitely.
	 *
	 * Each pass of the loop registers not-yet-tracked FDs with the shared epoll
	 * set, polls every requested FD with fstat (epoll only reports edges), and,
	 * if nothing is actionable yet, blocks in epoll_wait.  epoll_mtx is held for
	 * the whole pass, so no other select() caller can consume our epoll events
	 * or touch the global working_ sets.
	 *
	 * Returns the number of bits set across the result sets, 0 on timeout (with
	 * every provided set cleared), or -1 with errno set.  On error the sets are
	 * left unmodified.  *timeout may be modified. */
	int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
		   struct timeval *timeout)
	{
		struct epoll_event ep_ev;
		struct epoll_event *ep_results;
		int ret, ep_ret, ep_timeout, saved_errno;
		static parlib_once_t once = PARLIB_ONCE_INIT;
		struct timeval start_tv[1], end_tv[1];

		parlib_run_once(&once, select_init, NULL);
		/* nfds must lie within [0, FD_SETSIZE]; anything else would
		 * index outside the fd_sets. */
		if (nfds < 0 || nfds > FD_SETSIZE) {
			errno = EINVAL;
			return -1;
		}
		/* A negative interval is invalid, and must not be allowed to
		 * turn into a negative epoll timeout ("wait forever"). */
		if (timeout && (timeout->tv_sec < 0 || timeout->tv_usec < 0)) {
			errno = EINVAL;
			return -1;
		}
	loop:
		if (timeout)
			gettimeofday(start_tv, NULL);
		ep_timeout = select_tv_to_ep_timeout(timeout);
		uth_mutex_lock(epoll_mtx);
		for (int i = 0; i < nfds; i++) {
			if (!(fd_is_set(i, readfds) || fd_is_set(i, writefds) ||
			      fd_is_set(i, exceptfds)))
				continue;
			if (fd_is_set(i, &all_fds))
				continue;
			/* FDs that we track for any reason with select will be
			 * tracked for all reasons with epoll. */
			ep_ev.events = EPOLLET | EPOLLIN | EPOLLOUT | EPOLLHUP |
				       EPOLLERR;
			ep_ev.data.fd = i;
			if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, i, &ep_ev)) {
				/* We might have failed because we tried to set
				 * up too many FD tap types.  Listen FDs, for
				 * instance, can only be tapped for READABLE and
				 * HANGUP.  Let's try for one of those. */
				if (errno == ENOSYS) {
					ep_ev.events = EPOLLET | EPOLLIN |
						       EPOLLHUP;
					if (!epoll_ctl(epoll_fd, EPOLL_CTL_ADD,
						       i, &ep_ev)) {
						FD_SET(i, &all_fds);
						continue;
					}
				}
				/* Careful to unlock before calling perror.
				 * perror calls close, which calls our CB, which
				 * grabs the lock.  Keep the epoll_ctl errno for
				 * the caller. */
				saved_errno = errno;
				uth_mutex_unlock(epoll_mtx);
				perror("select epoll_ctl failed");
				errno = saved_errno;
				return -1;
			}
			/* Only mark the FD as tracked once epoll really has it;
			 * otherwise a failed add would never be retried. */
			FD_SET(i, &all_fds);
		}
		/* epoll alone is not enough: newly added FDs may already be
		 * actionable, apps may select on FDs they didn't drain, and an
		 * earlier select() may have consumed events meant for this
		 * call.  So poll every FD this call cares about, while holding
		 * the mutex. */
		ret = 0;
		FD_ZERO(&working_read_fds);
		FD_ZERO(&working_write_fds);
		FD_ZERO(&working_except_fds);
		/* Note the helper sets bits in the working_ fd sets */
		for (int i = 0; i < nfds; i++)
			ret += fd_set_actionable(i, readfds, writefds);
		if (ret) {
			if (readfds)
				*readfds = working_read_fds;
			if (writefds)
				*writefds = working_write_fds;
			/* Nothing exceptional was detected by the poll, so the
			 * caller must get an empty set back. */
			if (exceptfds)
				*exceptfds = working_except_fds;
			uth_mutex_unlock(epoll_mtx);
			return ret;
		}
		/* Need to check for up to FD_SETSIZE - nfds isn't the size of
		 * all FDs tracked; it's the size of only our current select
		 * call */
		ep_results = malloc(sizeof(struct epoll_event) * FD_SETSIZE);
		if (!ep_results) {
			uth_mutex_unlock(epoll_mtx);
			errno = ENOMEM;
			return -1;
		}
		ep_ret = epoll_wait(epoll_fd, ep_results, FD_SETSIZE,
				    ep_timeout);
		if (ep_ret < 0) {
			/* Report the failure instead of treating it as "no
			 * events" and retrying, which would spin forever when
			 * there is no timeout. */
			saved_errno = errno;
			free(ep_results);
			uth_mutex_unlock(epoll_mtx);
			errno = saved_errno;
			return -1;
		}
		/* We need to hold the mtx during all of this processing since
		 * we're using the global working_ fds sets.  We can't modify
		 * the readfds/writefds/exceptfds until we're sure we are
		 * done. */
		ret = 0;
		/* Note that ret can be > ep_ret.  An FD that is both readable
		 * and writable counts as one event for epoll, but as two bits
		 * for select. */
		for (int i = 0; i < ep_ret; i++) {
			ret += extract_bits_for_events(&ep_results[i],
						       EPOLLIN | EPOLLHUP,
						       readfds,
						       &working_read_fds);
			ret += extract_bits_for_events(&ep_results[i],
						       EPOLLOUT | EPOLLHUP,
						       writefds,
						       &working_write_fds);
			ret += extract_bits_for_events(&ep_results[i], EPOLLERR,
						       exceptfds,
						       &working_except_fds);
		}
		free(ep_results);
		if (ret) {
			if (readfds)
				*readfds = working_read_fds;
			if (writefds)
				*writefds = working_write_fds;
			if (exceptfds)
				*exceptfds = working_except_fds;
		}
		uth_mutex_unlock(epoll_mtx);
		/* TODO: Consider updating timeval for non-timeouts.  It's not
		 * mandatory (POSIX). */
		if (ret)
			return ret;
		/* No bits: either we timed out (if requested), or we consumed
		 * someone else's event.  No one could have consumed ours, since
		 * we were the only epoller (while holding the mtx).  In the
		 * latter case, try again with an updated timeout. */
		if (timeout) {
			gettimeofday(end_tv, NULL);
			timersub(end_tv, start_tv, end_tv); /* diff in end_tv */
			if (!timercmp(timeout, end_tv, >)) {
				/* select timed out: every set comes back
				 * empty */
				if (readfds)
					FD_ZERO(readfds);
				if (writefds)
					FD_ZERO(writefds);
				if (exceptfds)
					FD_ZERO(exceptfds);
				return 0;
			}
			timersub(timeout, end_tv, timeout);
		}
		goto loop;
	}

	/* pselect(): select() with a nanosecond-resolution timeout and a signal mask
	 * that is installed for the duration of the wait.
	 *
	 * timeout is copied into a local timeval (rounded up to whole microseconds),
	 * so the caller's timespec is never modified.  A NULL timeout waits
	 * indefinitely.  Returns whatever select() returns, with select()'s errno
	 * preserved. */
	int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
		    const struct timespec *timeout, const sigset_t *sigmask)
	{
		int ready, saved_errno;
		sigset_t origmask;
		struct timeval local_tv, *tv = NULL;

		if (timeout) {
			local_tv.tv_sec = timeout->tv_sec;
			local_tv.tv_usec = DIV_ROUND_UP(timeout->tv_nsec, 1000);
			/* Rounding up can yield exactly one full second of
			 * microseconds; carry it so the timeval stays
			 * normalized. */
			if (local_tv.tv_usec >= 1000000) {
				local_tv.tv_sec++;
				local_tv.tv_usec -= 1000000;
			}
			tv = &local_tv;
		}
		/* TODO: this is probably racy: the mask swap and the wait are
		 * not one atomic step. */
		sigprocmask(SIG_SETMASK, sigmask, &origmask);
		ready = select(nfds, readfds, writefds, exceptfds, tv);
		/* Restoring the mask must not hide select's error from the
		 * caller. */
		saved_errno = errno;
		sigprocmask(SIG_SETMASK, &origmask, NULL);
		errno = saved_errno;
		return ready;
	}