| /* Copyright (c) 2016 Google Inc. | 
 |  * Barret Rhoden <brho@cs.berkeley.edu> | 
 |  * See LICENSE for details. | 
 |  * | 
 |  * select() | 
 |  * | 
 |  * Our select() is a bit rough and only works with FDs where fstat can return | 
 |  * S_READABLE or S_WRITABLE.  For the most part, this applies to qio queues, | 
 |  * which are the basis for a lot of the network stack and pipes.  FDs where | 
 |  * fstat doesn't tell us the readiness will have races. | 
 |  * | 
 |  * Under the hood, our select() is implemented with epoll (and under that, FD | 
 |  * taps).  Those can only detect edges (e.g. a socket becomes readable). | 
 |  * | 
 |  * The problem is that we want to detect a level status (e.g. socket is | 
 |  * readable) with an edge event (e.g. socket *becomes* readable).  To do this, | 
 |  * when someone initially selects, the FD gets tracked with epoll and we | 
 |  * manually poll the FDs with fstat.  Subsequent selects() will still be tracked | 
 |  * in the epoll set, but since apps can select() even on FDs they didn't drain | 
 |  * to the point of blocking, we still need to poll every FD on every select() | 
 |  * call. | 
 |  * | 
 |  * We maintain one FD set per program.  It tracks *any* FD being tracked by | 
 |  * *any* select call.  This is because you can only have one tap per FD. | 
 |  * Regardless of whether the user asked for read/write/except, the FD gets | 
 |  * watched for anything until it closes. | 
 |  * | 
 |  * One issue with the global FD set is that one thread may consume the epoll | 
 |  * events intended for another thread (or even for itself at another call | 
 |  * site!).  To get around this, only one thread is the actual epoller, and the | 
 |  * others block on a mutex.  TLS isn't an option for two reasons: not all 2LSs | 
 |  * use TLS (minor concern, maybe they should) and there are some threads who | 
 |  * make multiple select calls - we actually want per-call-site-and-thread fd | 
 |  * sets. | 
 |  * | 
 |  * Notes: | 
 |  * - pselect might be racy | 
 |  * - if the user has no read/write/except sets, we won't wait.  some users of | 
 |  *   select use it as a timer only.  if that comes up, we can expand this. | 
 |  * - if you epoll or FD tap an FD, then try to use select on it, you'll get an | 
 |  *   error (only one tap per FD).  select() only knows about the FDs in its set. | 
 |  * - if you select() on a readfd that is a disk file, it'll always say it is | 
 |  *   available for I/O. | 
 |  */ | 
 |  | 
 #include <sys/select.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <unistd.h>

 #include <errno.h>
 #include <limits.h>
 #include <malloc.h>
 #include <parlib/arch/arch.h>
 #include <parlib/uthread.h>
 #include <parlib/parlib.h>
 #include <ros/common.h>
 #include <ros/fs.h>
 #include <signal.h>
 #include <stdlib.h>
 #include <sys/close_cb.h>
 #include <sys/epoll.h>
 #include <sys/fork_cb.h>
 |  | 
 | static int epoll_fd; | 
 | static fd_set all_fds; | 
 | static fd_set working_read_fds; | 
 | static fd_set working_write_fds; | 
 | static fd_set working_except_fds; | 
 | static uth_mutex_t *epoll_mtx; | 
 |  | 
 /* NULL- and range-safe wrapper around FD_ISSET.
  *
  * Returns true only if 'set' is non-NULL, 'fd' is a valid index into an fd_set
  * (0 <= fd < FD_SETSIZE), and the bit for 'fd' is set.  Callers pass signed
  * ints; a negative value converts to a huge unsigned one and is rejected by
  * the range check.
  *
  * Valid indices stop at FD_SETSIZE - 1, so the bound must be exclusive:
  * letting fd == FD_SETSIZE through would make FD_ISSET read past the end of
  * the set. */
 static bool fd_is_set(unsigned int fd, fd_set *set)
 {
 	return set && fd < FD_SETSIZE && FD_ISSET(fd, set);
 }
 |  | 
 | static void select_fd_closed(int fd) | 
 | { | 
 | 	/* Slightly racy, but anything concurrently added will be closed later, | 
 | 	 * and after it is_set. */ | 
 | 	if (!fd_is_set(fd, &all_fds)) | 
 | 		return; | 
 | 	/* We just need to stop tracking FD.  We do not need to remove it from | 
 | 	 * the epoll set, since that will happen automatically on close(). */ | 
 | 	uth_mutex_lock(epoll_mtx); | 
 | 	FD_CLR(fd, &all_fds); | 
 | 	uth_mutex_unlock(epoll_mtx); | 
 | } | 
 |  | 
 | static void select_forked(void) | 
 | { | 
 | 	struct epoll_event ep_ev; | 
 |  | 
 | 	uth_mutex_lock(epoll_mtx); | 
 | 	for (int i = 0; i < FD_SETSIZE; i++) { | 
 | 		if (fd_is_set(i, &all_fds)) { | 
 | 			ep_ev.events = EPOLLET | EPOLLIN | EPOLLOUT | EPOLLHUP | | 
 | 				       EPOLLERR; | 
 | 			ep_ev.data.fd = i; | 
 | 			/* Discard error.  The underlying tap is gone, and the | 
 | 			 * epoll ctlr might also have been emptied.  We just | 
 | 			 * want to make sure there is no epoll/tap so that a | 
 | 			 * future CTL_ADD doesn't fail. */ | 
 | 			epoll_ctl(epoll_fd, EPOLL_CTL_DEL, i, &ep_ev); | 
 | 			FD_CLR(i, &all_fds); | 
 | 		} | 
 | 	} | 
 | 	uth_mutex_unlock(epoll_mtx); | 
 | } | 
 |  | 
 | static void select_init(void *arg) | 
 | { | 
 | 	static struct close_cb select_close_cb = {.func = select_fd_closed}; | 
 | 	static struct fork_cb select_fork_cb = {.func = select_forked}; | 
 |  | 
 | 	register_close_cb(&select_close_cb); | 
 | 	epoll_fd = epoll_create(FD_SETSIZE); | 
 | 	if (epoll_fd < 0) { | 
 | 		perror("select failed epoll_create"); | 
 | 		exit(-1); | 
 | 	} | 
 | 	epoll_mtx = uth_mutex_alloc(); | 
 | 	register_fork_cb(&select_fork_cb); | 
 | } | 
 |  | 
 /* Converts a select() timeout into an epoll_wait() timeout in milliseconds.
  *
  * A NULL 'tv' means "block forever" and maps to -1.  Otherwise the result is
  * tv_sec * 1000 plus tv_usec rounded *up* to the next millisecond, so we never
  * wake before the requested time.
  *
  * The result is saturated to the range of int.  Without that, a timeout beyond
  * roughly 24.8 days would be truncated when narrowed to int and could turn
  * into a negative or tiny value.  Clamping is safe for the caller: when
  * epoll_wait times out early, select() recomputes the remaining time and waits
  * again.
  *
  * tv_usec is assumed to be a sane microsecond count; negative times are not
  * validated here and are passed through (saturated) as before. */
 static int select_tv_to_ep_timeout(struct timeval *tv)
 {
 	long long msec;

 	if (!tv)
 		return -1;
 	/* Bound tv_sec first so the multiplication below cannot overflow. */
 	if (tv->tv_sec > INT_MAX / 1000)
 		return INT_MAX;
 	if (tv->tv_sec < INT_MIN / 1000)
 		return INT_MIN;
 	msec = (long long)tv->tv_sec * 1000 + (tv->tv_usec + 999) / 1000;
 	if (msec > INT_MAX)
 		return INT_MAX;
 	if (msec < INT_MIN)
 		return INT_MIN;
 	return (int)msec;
 }
 |  | 
 | /* Helper: check with the kernel if FD is readable/writable or not.  Some apps | 
 |  * will call select() on something even if it is already actionable, and not | 
 |  * wait until they get the EAGAIN. | 
 |  * | 
 |  * This modifies the global working_ fd sets by setting bits of actionable FDs | 
 |  * and will return the number of bits turned on.  So basically, 1 for readable | 
 |  * xor writable, 2 for both. | 
 |  * | 
 |  * TODO: this *won't* work for disk based files.  It only works on qids that are | 
 |  * backed with qio queues or something similar, where the device has support for | 
 |  * setting DMREADABLE/DMWRITABLE. */ | 
 | static unsigned int fd_set_actionable(int fd, fd_set *readfds, fd_set *writefds) | 
 | { | 
 | 	struct stat stat_buf; | 
 | 	int ret; | 
 |  | 
 | 	/* Avoid the stat call on FDs we're not tracking (which should trigger | 
 | 	 * an error, or give us the stat for FD 0). */ | 
 | 	if (!(fd_is_set(fd, readfds) || fd_is_set(fd, writefds))) | 
 | 		return 0; | 
 | 	ret = fstat(fd, &stat_buf); | 
 | 	assert(!ret); | 
 | 	ret = 0; | 
 | 	if (fd_is_set(fd, readfds)) { | 
 | 		if (S_READABLE(stat_buf.st_mode)) { | 
 | 			ret++; | 
 | 			FD_SET(fd, &working_read_fds); | 
 | 		} | 
 | 	} | 
 | 	if (fd_is_set(fd, writefds)) { | 
 | 		if (S_WRITABLE(stat_buf.st_mode)) { | 
 | 			ret++; | 
 | 			FD_SET(fd, &working_write_fds); | 
 | 		} | 
 | 	} | 
 | 	return ret; | 
 | } | 
 |  | 
 /* Helper: translates one epoll result into at most one select() result bit.
  *
  * If ep_result carries any of the event types in ep_event_types, and its FD is
  * one the caller is watching (watched_fds, which may be NULL), the FD's bit is
  * set in ret_fds.  Returns 1 if a bit was newly set and 0 otherwise, so an FD
  * already present in ret_fds is not counted twice. */
 static int extract_bits_for_events(struct epoll_event *ep_result,
                                    uint32_t ep_event_types,
                                    fd_set *watched_fds, fd_set *ret_fds)
 {
 	int fd = ep_result->data.fd;

 	if (!(ep_result->events & ep_event_types))
 		return 0;
 	/* fd_is_set also range-checks fd, which makes the raw FD_ISSET and
 	 * FD_SET below safe. */
 	if (!fd_is_set(fd, watched_fds))
 		return 0;
 	if (FD_ISSET(fd, ret_fds))
 		return 0;
 	FD_SET(fd, ret_fds);
 	return 1;
 }
 |  | 
 | int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, | 
 |            struct timeval *timeout) | 
 | { | 
 | 	struct epoll_event ep_ev; | 
 | 	struct epoll_event *ep_results; | 
 | 	int ret, ep_ret, ep_timeout; | 
 | 	static parlib_once_t once = PARLIB_ONCE_INIT; | 
 | 	struct timeval start_tv[1], end_tv[1]; | 
 |  | 
 | 	parlib_run_once(&once, select_init, NULL); | 
 | 	/* good thing nfds is a signed int... */ | 
 | 	if (nfds < 0) { | 
 | 		errno = EINVAL; | 
 | 		return -1; | 
 | 	} | 
 | loop: | 
 | 	if (timeout) | 
 | 		gettimeofday(start_tv, NULL); | 
 | 	ep_timeout = select_tv_to_ep_timeout(timeout); | 
 | 	uth_mutex_lock(epoll_mtx); | 
 | 	for (int i = 0; i < nfds; i++) { | 
 | 		if ((fd_is_set(i, readfds) || fd_is_set(i, writefds) || | 
 | 		     fd_is_set(i, exceptfds)) && | 
 | 		    !fd_is_set(i, &all_fds)) { | 
 |  | 
 | 			FD_SET(i, &all_fds); | 
 | 			/* FDs that we track for *any* reason with select will | 
 | 			 * be tracked for *all* reasons with epoll. */ | 
 | 			ep_ev.events = EPOLLET | EPOLLIN | EPOLLOUT | EPOLLHUP | | 
 | 				       EPOLLERR; | 
 | 			ep_ev.data.fd = i; | 
 | 			if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, i, &ep_ev)) { | 
 | 				/* We might have failed because we tried to set | 
 | 				 * up too many FD tap types.  Listen FDs, for | 
 | 				 * instance, can only be tapped for READABLE and | 
 | 				 * HANGUP.  Let's try for one of those. */ | 
 | 				if (errno == ENOSYS) { | 
 | 					ep_ev.events = EPOLLET | EPOLLIN | | 
 | 						       EPOLLHUP; | 
 | 					if (!epoll_ctl(epoll_fd, EPOLL_CTL_ADD, | 
 | 						       i, &ep_ev)) | 
 | 						continue; | 
 | 				} | 
 | 				/* Careful to unlock before calling perror. | 
 | 				 * perror calls close, which calls our CB, which | 
 | 				 * grabs the lock. */ | 
 | 				uth_mutex_unlock(epoll_mtx); | 
 | 				perror("select epoll_ctl failed"); | 
 | 				return -1; | 
 | 			} | 
 | 		} | 
 | 	} | 
 | 	/* Since we just added some FDs to our tracking set, we don't know if | 
 | 	 * they are readable or not.  We'll only catch edge-triggered changes in | 
 | 	 * the future. | 
 | 	 * | 
 | 	 * Similarly, it is legal to select on a readable FD even if you didn't | 
 | 	 * consume all of the data yet; similarly for writers on non-full FDs. | 
 | 	 * | 
 | 	 * Additionally, since there is a global epoll set, we could have | 
 | 	 * multiple threads epolling concurrently and one thread could consume | 
 | 	 * the events that should wake another thread.  Also, keep in mind we | 
 | 	 * could also have a single thread that selects multiple times on | 
 | 	 * separate FD sets. | 
 | 	 * | 
 | 	 * Due to any of these cases, we need to check every FD this select call | 
 | 	 * cares about (i.e. in an fd_set) to see if it is actionable.  We do it | 
 | 	 * while holding the mutex to prevent other threads from consuming our | 
 | 	 * epoll events. */ | 
 | 	ret = 0; | 
 | 	FD_ZERO(&working_read_fds); | 
 | 	FD_ZERO(&working_write_fds); | 
 | 	FD_ZERO(&working_except_fds); | 
 | 	/* Note the helper sets bits in the working_ fd sets */ | 
 | 	for (int i = 0; i < nfds; i++) | 
 | 		ret += fd_set_actionable(i, readfds, writefds); | 
 | 	if (ret) { | 
 | 		if (readfds) | 
 | 			*readfds = working_read_fds; | 
 | 		if (writefds) | 
 | 			*writefds = working_write_fds; | 
 | 		uth_mutex_unlock(epoll_mtx); | 
 | 		return ret; | 
 | 	} | 
 | 	/* Need to check for up to FD_SETSIZE - nfds isn't the size of all FDs | 
 | 	 * tracked; it's the size of only our current select call */ | 
 | 	ep_results = malloc(sizeof(struct epoll_event) * FD_SETSIZE); | 
 | 	if (!ep_results) { | 
 | 		uth_mutex_unlock(epoll_mtx); | 
 | 		errno = ENOMEM; | 
 | 		return -1; | 
 | 	} | 
 | 	ep_ret = epoll_wait(epoll_fd, ep_results, FD_SETSIZE, ep_timeout); | 
 | 	/* We need to hold the mtx during all of this processing since we're | 
 | 	 * using the global working_ fds sets.  We can't modify the | 
 | 	 * readfds/writefds/exceptfds until we're sure we are done. */ | 
 | 	ret = 0; | 
 | 	/* Note that ret can be > ep_ret.  An FD that is both readable and | 
 | 	 * writable counts as one event for epoll, but as two bits for select. | 
 | 	 * */ | 
 | 	for (int i = 0; i < ep_ret; i++) { | 
 | 		ret += extract_bits_for_events(&ep_results[i], | 
 | 					       EPOLLIN | EPOLLHUP, | 
 | 		                               readfds, &working_read_fds); | 
 | 		ret += extract_bits_for_events(&ep_results[i], | 
 | 					       EPOLLOUT | EPOLLHUP, | 
 | 		                               writefds, &working_write_fds); | 
 | 		ret += extract_bits_for_events(&ep_results[i], EPOLLERR, | 
 | 		                               exceptfds, &working_except_fds); | 
 | 	} | 
 | 	free(ep_results); | 
 | 	if (ret) { | 
 | 		if (readfds) | 
 | 			*readfds = working_read_fds; | 
 | 		if (writefds) | 
 | 			*writefds = working_write_fds; | 
 | 		if (exceptfds) | 
 | 			*exceptfds = working_except_fds; | 
 | 	} | 
 | 	uth_mutex_unlock(epoll_mtx); | 
 | 	/* TODO: Consider updating timeval for non-timeouts.  It's not mandatory | 
 | 	 * (POSIX). */ | 
 | 	if (ret) | 
 | 		return ret; | 
 | 	/* If we have no rets at this point, there are a few options: we could | 
 | 	 * have timed out (if requested), or we could have consumed someone | 
 | 	 * else's event.  No one could have consumed our event, since we were | 
 | 	 * the only epoller (while holding the mtx).  In the latter case, we'll | 
 | 	 * need to try again, but with an updated timeout. */ | 
 | 	if (timeout) { | 
 | 		gettimeofday(end_tv, NULL); | 
 | 		timersub(end_tv, start_tv, end_tv);	/* diff in end_tv */ | 
 | 		if (timercmp(timeout, end_tv, >)) | 
 | 			timersub(timeout, end_tv, timeout); | 
 | 		else | 
 | 			return 0;	/* select timed out */ | 
 | 	} | 
 | 	goto loop; | 
 | } | 
 |  | 
 | int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, | 
 |             const struct timespec *timeout, const sigset_t *sigmask) | 
 | { | 
 | 	int ready; | 
 | 	sigset_t origmask; | 
 | 	struct timeval local_tv, *tv = &local_tv; | 
 |  | 
 | 	if (!timeout) { | 
 | 		tv = 0; | 
 | 	} else { | 
 | 		tv->tv_sec = timeout->tv_sec; | 
 | 		tv->tv_usec = DIV_ROUND_UP(timeout->tv_nsec, 1000); | 
 | 	} | 
 | 	/* TODO: this is probably racy */ | 
 | 	sigprocmask(SIG_SETMASK, sigmask, &origmask); | 
 | 	ready = select(nfds, readfds, writefds, exceptfds, tv); | 
 | 	sigprocmask(SIG_SETMASK, &origmask, NULL); | 
 | 	return ready; | 
 | } |