/* Copyright (c) 2016 Google Inc.
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * select()
 *
 * Our select() is super spurious and will only work with apps that use
 * non-blocking I/O.
 *
 * Under the hood, our select() is implemented with epoll (and under that, FD
 * taps).  Those can only detect edges (e.g. a socket becomes readable).
 *
 * The problem is that we want to detect a level status (e.g. socket is
 * readable) with an edge event (e.g. socket *becomes* readable).  To do this,
 * when someone initially selects, the FD gets tracked with epoll and we
 * immediately return saying the FD is ready for whatever they asked for.  This
 * is usually not true, and the application will need to poll all of its FDs
 * once after the initial select() call.  Subsequent selects() will still be
 * tracking the FD in the epoll set.  Any edge events that come after the
 * poll (which eventually returns EAGAIN) will be caught by epoll, and a
 * subsequent select will wake up (or never block in the first place) due to
 * the reception of that edge event.
 *
 * We maintain one FD set per program.  It tracks *any* FD being tracked by
 * *any* select call.  Regardless of whether the user asked for
 * read/write/except, the FD gets watched for anything until it closes.  This
 * will result in spurious wakeups.
 *
 * One issue with the global FD set is that one thread may consume the epoll
 * events intended for another thread (or even for itself at another call
 * site!).  To get around this, only one thread is the actual epoller, and the
 * others block on a mutex.  An alternative is to use a per-thread FD set,
 * using TLS, but not every 2LS uses TLS, and performance is not a concern for
 * code using select().
 *
 * Notes:
 * - pselect might be racy
 * - if the user has no read/write/except sets, we won't wait.  some users of
 *   select use it as a timer only.  if that comes up, we can expand this.
 * - if you epoll or FD tap an FD, then try to use select on it, you'll get an
 *   error (only one tap per FD).  select() only knows about the FDs in its
 *   set.
 * - if you select() on a readfd that is a disk file, it'll always say it is
 *   available for I/O.
 */
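
/* A minimal caller-side sketch of the contract described above (illustrative
 * only; sock, buf, and handle_data are hypothetical).  The FD must be
 * non-blocking, and readiness reported by select() is only a hint that may be
 * spurious, so the caller re-polls and treats EAGAIN as "try again":
 *
 *	for (;;) {
 *		fd_set rfds;
 *		ssize_t ret;
 *
 *		FD_ZERO(&rfds);
 *		FD_SET(sock, &rfds);
 *		if (select(sock + 1, &rfds, NULL, NULL, NULL) < 0)
 *			break;
 *		ret = read(sock, buf, sizeof(buf));
 *		if (ret < 0 && errno == EAGAIN)
 *			continue;	// spurious wakeup; select() again
 *		handle_data(buf, ret);
 *	}
 */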

#include <sys/select.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/stat.h>

#include <ros/common.h>
#include <parlib/uthread.h>
#include <parlib/arch/arch.h>
#include <sys/close_cb.h>
#include <sys/fork_cb.h>
#include <sys/epoll.h>
#include <malloc.h>
#include <stdlib.h>
#include <errno.h>
#include <signal.h>
#include <ros/fs.h>

/* The epoll set backing all select() calls. */
static int epoll_fd;
/* Every FD ever tracked by any select() call; protected by fdset_mtx. */
static fd_set all_fds;
static uth_mutex_t fdset_mtx;
/* Stack pointer of the most recent {thread, callsite} to select(), used to
 * tell whether another caller may have consumed our epoll events. */
static uintptr_t unique_caller;
/* Serializes the actual epoll_wait()er; other selecting threads sleep here. */
static uth_mutex_t sleep_mtx;

static bool fd_is_set(unsigned int fd, fd_set *set)
{
	/* Valid fd_set indexes are 0..FD_SETSIZE - 1. */
	if (fd >= FD_SETSIZE)
		return FALSE;
	if (!set)
		return FALSE;
	return FD_ISSET(fd, set);
}

static void select_fd_closed(int fd)
{
	/* Slightly racy, but anything concurrently added will be closed later,
	 * and after it is_set. */
	if (!fd_is_set(fd, &all_fds))
		return;
	/* We just need to stop tracking FD.  We do not need to remove it from
	 * the epoll set, since that will happen automatically on close(). */
	uth_mutex_lock(fdset_mtx);
	FD_CLR(fd, &all_fds);
	uth_mutex_unlock(fdset_mtx);
}

static void select_forked(void)
{
	struct epoll_event ep_ev;

	uth_mutex_lock(fdset_mtx);
	for (int i = 0; i < FD_SETSIZE; i++) {
		if (fd_is_set(i, &all_fds)) {
			ep_ev.events = EPOLLET | EPOLLIN | EPOLLOUT |
				       EPOLLHUP | EPOLLERR;
			ep_ev.data.fd = i;
			/* Discard error.  The underlying tap is gone, and the
			 * epoll ctlr might also have been emptied.  We just
			 * want to make sure there is no epoll/tap so that a
			 * future CTL_ADD doesn't fail. */
			epoll_ctl(epoll_fd, EPOLL_CTL_DEL, i, &ep_ev);
			FD_CLR(i, &all_fds);
		}
	}
	uth_mutex_unlock(fdset_mtx);
}

static void select_init(void)
{
	static struct close_cb select_close_cb = {.func = select_fd_closed};
	static struct fork_cb select_fork_cb = {.func = select_forked};

	register_close_cb(&select_close_cb);
	epoll_fd = epoll_create(FD_SETSIZE);
	if (epoll_fd < 0) {
		perror("select failed epoll_create");
		exit(-1);
	}
	fdset_mtx = uth_mutex_alloc();
	sleep_mtx = uth_mutex_alloc();
	register_fork_cb(&select_fork_cb);
}

static int select_tv_to_ep_timeout(struct timeval *tv)
{
	if (!tv)
		return -1;
	return tv->tv_sec * 1000 + DIV_ROUND_UP(tv->tv_usec, 1000);
}
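
/* For example (illustrative arithmetic only): a timeout of {.tv_sec = 2,
 * .tv_usec = 1} maps to 2001 ms.  Rounding the microseconds up means we never
 * time out earlier than the caller asked for, only slightly later. */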

/* Check with the kernel if FD is readable/writable or not.  Some apps will
 * call select() on something even if it is already actionable, and not wait
 * until they get the EAGAIN.
 *
 * TODO: this *won't* work for disk based files.  It only works on qids that
 * are backed with qio queues or something similar, where the device has
 * support for setting DMREADABLE/DMWRITABLE. */
static bool fd_is_actionable(int fd, fd_set *readfds, fd_set *writefds)
{
	struct stat stat_buf;
	int ret;

	/* Avoid the stat call on FDs we're not tracking (which should trigger
	 * an error, or give us the stat for FD 0). */
	if (!(fd_is_set(fd, readfds) || fd_is_set(fd, writefds)))
		return FALSE;
	ret = fstat(fd, &stat_buf);
	assert(!ret);
	return (fd_is_set(fd, readfds)  && S_READABLE(stat_buf.st_mode)) ||
	       (fd_is_set(fd, writefds) && S_WRITABLE(stat_buf.st_mode));
}
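
/* Illustrative note on the check above (no new code): a qio-backed FD
 * advertises its current readiness in st_mode, so
 *
 *	fstat(fd, &stat_buf);
 *	S_READABLE(stat_buf.st_mode);	// nonzero: a read now won't EAGAIN
 *
 * Disk-backed files don't support these status bits (see the TODO above), so
 * this fast path can't detect readiness for them. */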

int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
	   struct timeval *timeout)
{
	bool changed_set = FALSE;
	struct epoll_event ep_ev;
	struct epoll_event *ep_results;
	uintptr_t my_call_id;
	int ret;
	int ep_timeout = select_tv_to_ep_timeout(timeout);

	run_once(select_init());
	/* good thing nfds is a signed int... */
	if (nfds < 0) {
		errno = EINVAL;
		return -1;
	}
	/* It is legal to select on read even if you didn't consume all of the
	 * data in an FD; similarly for writers on non-full FDs. */
	for (int i = 0; i < nfds; i++) {
		if (fd_is_actionable(i, readfds, writefds))
			return nfds;
	}
	uth_mutex_lock(fdset_mtx);
	for (int i = 0; i < nfds; i++) {
		if ((fd_is_set(i, readfds) || fd_is_set(i, writefds) ||
		     fd_is_set(i, exceptfds)) && !fd_is_set(i, &all_fds)) {
			changed_set = TRUE;
			FD_SET(i, &all_fds);
			/* FDs that we track for *any* reason with select will
			 * be tracked for *all* reasons with epoll. */
			ep_ev.events = EPOLLET | EPOLLIN | EPOLLOUT |
				       EPOLLHUP | EPOLLERR;
			ep_ev.data.fd = i;
			if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, i, &ep_ev)) {
				/* We might have failed because we tried to set
				 * up too many FD tap types.  Listen FDs, for
				 * instance, can only be tapped for READABLE
				 * and HANGUP.  Let's try for one of those. */
				if (errno == ENOSYS) {
					ep_ev.events = EPOLLET | EPOLLIN |
						       EPOLLHUP;
					if (!epoll_ctl(epoll_fd, EPOLL_CTL_ADD,
						       i, &ep_ev))
						continue;
				}
				/* Careful to unlock before calling perror.
				 * perror calls close, which calls our CB,
				 * which grabs the lock. */
				uth_mutex_unlock(fdset_mtx);
				perror("select epoll_ctl failed");
				return -1;
			}
		}
	}
	uth_mutex_unlock(fdset_mtx);
	/* Since we just added some FDs to our tracking set, we don't know if
	 * they are readable or not.  We'll only catch edge-triggered changes
	 * in the future.  We can spuriously tell the user all FDs are ready,
	 * and next time they can block until there is edge activity. */
	if (changed_set)
		return nfds;
	/* Since there is a global epoll set, we could have multiple threads
	 * epolling at a time and one thread could consume the events that
	 * should wake another thread.  We don't know when the 'other' thread
	 * last polled, so we'll need to assume its event was consumed and just
	 * return.
	 *
	 * To make matters more confusing, we could also have a single thread
	 * that selects multiple times on separate FD sets.  So we also need to
	 * distinguish between calls and threads.
	 *
	 * If the same {thread, callsite} selects again and no one else has
	 * since selected, then we know no one consumed the events.  We'll use
	 * the stack pointer to uniquely identify the {thread, callsite} combo
	 * that recently selected.  We use a mutex so that the extra threads
	 * sleep. */
	uth_mutex_lock(sleep_mtx);
	my_call_id = get_stack_pointer();
	if (my_call_id != unique_caller) {
		/* Could thrash, if we fight with another uth for
		 * unique_caller */
		unique_caller = my_call_id;
		uth_mutex_unlock(sleep_mtx);
		return nfds;
	}
	/* Need room for up to FD_SETSIZE events: nfds isn't the number of all
	 * FDs tracked in the epoll set; it only bounds the current select
	 * call's own set. */
	ep_results = malloc(sizeof(struct epoll_event) * FD_SETSIZE);
	if (!ep_results) {
		uth_mutex_unlock(sleep_mtx);
		errno = ENOMEM;
		return -1;
	}
	/* Don't care which ones were set; we'll just tell the user they all
	 * were set.  If they can't handle that, this whole plan won't work. */
	ret = epoll_wait(epoll_fd, ep_results, FD_SETSIZE, ep_timeout);
	uth_mutex_unlock(sleep_mtx);
	free(ep_results);
	/* TODO: consider updating timeval.  It's not mandatory (POSIX). */
	if (ret == 0)	/* timeout */
		return 0;
	return nfds;
}

int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
	    const struct timespec *timeout, const sigset_t *sigmask)
{
	int ready;
	sigset_t origmask;
	struct timeval local_tv, *tv = &local_tv;

	if (!timeout) {
		tv = NULL;
	} else {
		tv->tv_sec = timeout->tv_sec;
		tv->tv_usec = DIV_ROUND_UP(timeout->tv_nsec, 1000);
	}
	/* TODO: this is probably racy: a signal that arrives after we swap the
	 * mask but before select() blocks won't wake us the way an atomic
	 * pselect() would. */
	sigprocmask(SIG_SETMASK, sigmask, &origmask);
	ready = select(nfds, readfds, writefds, exceptfds, tv);
	sigprocmask(SIG_SETMASK, &origmask, NULL);
	return ready;
}