|  | /* Copyright (c) 2015 Google Inc | 
|  | * Barret Rhoden <brho@cs.berkeley.edu> | 
|  | * See LICENSE for details. | 
|  | * | 
|  | * #eventfd device, the kernel-side implementation of man 2 eventfd. | 
|  | * | 
|  | * Unlike the Linux interface, which takes host-endian u64s, we read and write | 
|  | * strings.  It's a little slower, but it maintains the distributed-system | 
|  | * nature of Plan 9 devices. */ | 
|  |  | 
|  | #include <ns.h> | 
|  | #include <kmalloc.h> | 
|  | #include <kref.h> | 
|  | #include <atomic.h> | 
|  | #include <string.h> | 
|  | #include <stdio.h> | 
|  | #include <assert.h> | 
|  | #include <error.h> | 
|  | #include <sys/queue.h> | 
|  | #include <fdtap.h> | 
|  | #include <syscall.h> | 
|  |  | 
|  | struct dev efd_devtab; | 
|  |  | 
|  | static char *devname(void) | 
|  | { | 
|  | return efd_devtab.name; | 
|  | } | 
|  |  | 
|  | enum { | 
|  | Qdir, | 
|  | Qctl, | 
|  | Qefd, | 
|  | }; | 
|  |  | 
|  | static struct dirtab efd_dir[] = { | 
|  | {".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555}, | 
|  | {"ctl", {Qctl, 0, QTFILE}, 0, 0666}, | 
|  | {"efd", {Qefd, 0, QTFILE}, 8, 0666}, | 
|  | }; | 
|  |  | 
|  | enum { | 
|  | EFD_SEMAPHORE = 	1 << 0, | 
|  | EFD_MAX_VAL =		(unsigned long)(-2), // i.e. 0xfffffffffffffffe | 
|  | }; | 
|  |  | 
|  |  | 
|  | struct eventfd { | 
|  | int 				flags; | 
|  | atomic_t			counter; | 
|  | struct fdtap_slist		fd_taps; | 
|  | spinlock_t			tap_lock; | 
|  | struct rendez			rv_readers; | 
|  | struct rendez			rv_writers; | 
|  | struct kref			refcnt; | 
|  | }; | 
|  |  | 
|  |  | 
|  | static void efd_release(struct kref *kref) | 
|  | { | 
|  | struct eventfd *efd = container_of(kref, struct eventfd, refcnt); | 
|  |  | 
|  | /* All FDs with taps must be closed before we decreffed all the chans */ | 
|  | assert(SLIST_EMPTY(&efd->fd_taps)); | 
|  | kfree(efd); | 
|  | } | 
|  |  | 
|  | static struct chan *efd_attach(char *spec) | 
|  | { | 
|  | struct chan *c; | 
|  | struct eventfd *efd; | 
|  |  | 
|  | c = devattach(devname(), spec); | 
|  | efd = kzmalloc(sizeof(struct eventfd), MEM_WAIT); | 
|  | SLIST_INIT(&efd->fd_taps); | 
|  | spinlock_init(&efd->tap_lock); | 
|  | rendez_init(&efd->rv_readers); | 
|  | rendez_init(&efd->rv_writers); | 
|  | /* Attach and walk are the two sources of chans.  Each returns a | 
|  | * refcnt'd object, for the most part. */ | 
|  | kref_init(&efd->refcnt, efd_release, 1); | 
|  | /* nothing special in the qid to ID this eventfd.  the main thing is the | 
|  | * aux.  we could put a debugging ID in the path like pipe. */ | 
|  | mkqid(&c->qid, Qdir, 0, QTDIR); | 
|  | c->aux = efd; | 
|  | /* just to be fancy and remove a syscall, if they pass spec == "sem", | 
|  | * then we'll treat them as being in semaphore mode. */ | 
|  | if (!strcmp(spec, "sem")) | 
|  | efd->flags |= EFD_SEMAPHORE; | 
|  | return c; | 
|  | } | 
|  |  | 
|  | static struct walkqid *efd_walk(struct chan *c, struct chan *nc, char **name, | 
|  | unsigned int nname) | 
|  | { | 
|  | struct walkqid *wq; | 
|  | struct eventfd *efd = c->aux; | 
|  |  | 
|  | wq = devwalk(c, nc, name, nname, efd_dir, ARRAY_SIZE(efd_dir), devgen); | 
|  | /* Walk is a source of a distinct chan from this device.  The other | 
|  | * source is attach.  Once created, these chans will eventually be | 
|  | * closed, and when they close, they will decref their aux, efd.  All | 
|  | * chans within this *instance* of eventfd share the same efd.  Each one | 
|  | * will have one refcnt.  Each chan may also have several copies of its | 
|  | * pointer out there (e.g. FD dup), all of which have their own *chan* | 
|  | * refcnt. | 
|  | * | 
|  | * All of the above applies on successful walks that found all nname | 
|  | * parts of the path.  A mid-success is wq: we got something.  wq->clone | 
|  | * means we got to the end and the "big walk" considers this a success. | 
|  | * | 
|  | * There is a slight chance the new chan is the same as our original | 
|  | * chan (if nc == c when we're called).  In which case, there's only one | 
|  | * chan.  The number of refs on efd == the number of distinct chans | 
|  | * within this instance of #eventfd. */ | 
|  | if (wq != NULL && wq->clone != NULL && wq->clone != c) | 
|  | kref_get(&efd->refcnt, 1); | 
|  | return wq; | 
|  | } | 
|  |  | 
|  | /* In the future, we could use stat / wstat to get and set O_NONBLOCK */ | 
|  | static size_t efd_stat(struct chan *c, uint8_t *db, size_t n) | 
|  | { | 
|  | return devstat(c, db, n, efd_dir, ARRAY_SIZE(efd_dir), devgen); | 
|  | } | 
|  |  | 
|  | static struct chan *efd_open(struct chan *c, int omode) | 
|  | { | 
|  | return devopen(c, omode, efd_dir, ARRAY_SIZE(efd_dir), devgen); | 
|  | } | 
|  |  | 
|  | static void efd_close(struct chan *c) | 
|  | { | 
|  | struct eventfd *efd = c->aux; | 
|  |  | 
|  | /* Here's where we put the ref from attach and successful walks */ | 
|  | kref_put(&efd->refcnt); | 
|  | } | 
|  |  | 
|  | static void efd_fire_taps(struct eventfd *efd, int filter) | 
|  | { | 
|  | struct fd_tap *tap_i; | 
|  |  | 
|  | if (SLIST_EMPTY(&efd->fd_taps)) | 
|  | return; | 
|  | /* We're not expecting many FD taps, so it's not worth splitting readers | 
|  | * from writers or anything like that. | 
|  | * TODO: (RCU) Locking to protect the list and the tap's existence. */ | 
|  | spin_lock(&efd->tap_lock); | 
|  | SLIST_FOREACH(tap_i, &efd->fd_taps, link) | 
|  | fire_tap(tap_i, filter); | 
|  | spin_unlock(&efd->tap_lock); | 
|  | } | 
|  |  | 
|  | static int has_counts(void *arg) | 
|  | { | 
|  | struct eventfd *efd = arg; | 
|  |  | 
|  | return atomic_read(&efd->counter) != 0; | 
|  | } | 
|  |  | 
|  | /* The heart of reading an eventfd */ | 
|  | static unsigned long efd_read_efd(struct eventfd *efd, struct chan *c) | 
|  | { | 
|  | unsigned long old_count, new_count, ret; | 
|  |  | 
|  | while (1) { | 
|  | old_count = atomic_read(&efd->counter); | 
|  | if (!old_count) { | 
|  | if (c->flag & O_NONBLOCK) | 
|  | error(EAGAIN, "Would block on #%s read", | 
|  | devname()); | 
|  | rendez_sleep(&efd->rv_readers, has_counts, efd); | 
|  | } else { | 
|  | if (efd->flags & EFD_SEMAPHORE) { | 
|  | new_count = old_count - 1; | 
|  | ret = 1; | 
|  | } else { | 
|  | new_count = 0; | 
|  | ret = old_count; | 
|  | } | 
|  | if (atomic_cas(&efd->counter, old_count, new_count)) | 
|  | goto success; | 
|  | } | 
|  | } | 
|  | success: | 
|  | rendez_wakeup(&efd->rv_writers); | 
|  | efd_fire_taps(efd, FDTAP_FILT_WRITABLE); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | static size_t efd_read(struct chan *c, void *ubuf, size_t n, off64_t offset) | 
|  | { | 
|  | struct eventfd *efd = c->aux; | 
|  |  | 
|  | switch (c->qid.path) { | 
|  | case Qdir: | 
|  | return devdirread(c, ubuf, n, efd_dir, ARRAY_SIZE(efd_dir), | 
|  | devgen); | 
|  | case Qctl: | 
|  | return readnum(offset, ubuf, n, efd->flags, NUMSIZE32); | 
|  | case Qefd: | 
|  | /* ignoring the chan offset for Qefd */ | 
|  | return readnum(0, ubuf, n, efd_read_efd(efd, c), NUMSIZE64); | 
|  | default: | 
|  | panic("Bad Qid %p!", c->qid.path); | 
|  | } | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | static int has_room(void *arg) | 
|  | { | 
|  | struct eventfd *efd = arg; | 
|  | return atomic_read(&efd->counter) != EFD_MAX_VAL; | 
|  | } | 
|  |  | 
|  | /* The heart of writing an eventfd */ | 
|  | static void efd_write_efd(struct eventfd *efd, unsigned long add_to, | 
|  | struct chan *c) | 
|  | { | 
|  | unsigned long old_count, new_count; | 
|  |  | 
|  | while (1) { | 
|  | old_count = atomic_read(&efd->counter); | 
|  | new_count = old_count + add_to; | 
|  | if (new_count > EFD_MAX_VAL) { | 
|  | if (c->flag & O_NONBLOCK) | 
|  | error(EAGAIN, "Would block on #%s write", | 
|  | devname()); | 
|  | rendez_sleep(&efd->rv_writers, has_room, efd); | 
|  | } else { | 
|  | if (atomic_cas(&efd->counter, old_count, new_count)) | 
|  | goto success; | 
|  | } | 
|  | } | 
|  | success: | 
|  | rendez_wakeup(&efd->rv_readers); | 
|  | efd_fire_taps(efd, FDTAP_FILT_READABLE); | 
|  | } | 
|  |  | 
|  | static size_t efd_write(struct chan *c, void *ubuf, size_t n, off64_t offset) | 
|  | { | 
|  | struct eventfd *efd = c->aux; | 
|  | unsigned long write_val; | 
|  | char num64[NUMSIZE64]; | 
|  |  | 
|  | switch (c->qid.path) { | 
|  | case Qctl: | 
|  | /* If we want to allow runtime changing of settings, we can do | 
|  | * it here. */ | 
|  | error(EFAIL, "No #%s ctl commands supported", devname()); | 
|  | break; | 
|  | case Qefd: | 
|  | /* We want to give strtoul a null-terminated buf (can't handle | 
|  | * arbitrary user strings).  Ignoring the chan offset too. */ | 
|  | if (n > sizeof(num64)) | 
|  | error(EAGAIN, "attempted to write %d chars, max %d", n, | 
|  | sizeof(num64)); | 
|  | memcpy(num64, ubuf, n); | 
|  | num64[n] = 0;	/* enforce trailing 0 */ | 
|  | write_val = strtoul(num64, 0, 0); | 
|  | if (write_val == (unsigned long)(-1)) | 
|  | error(EFAIL, "Eventfd write must not be -1"); | 
|  | efd_write_efd(efd, write_val, c); | 
|  | break; | 
|  | default: | 
|  | panic("Bad Qid %p!", c->qid.path); | 
|  | } | 
|  | return n; | 
|  | } | 
|  |  | 
|  | static char *efd_chaninfo(struct chan *c, char *ret, size_t ret_l) | 
|  | { | 
|  | struct eventfd *efd = c->aux; | 
|  |  | 
|  | snprintf(ret, ret_l, "QID type %s, flags %p, counter %p", | 
|  | efd_dir[c->qid.path].name, efd->flags, | 
|  | atomic_read(&efd->counter)); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | static int efd_tapfd(struct chan *c, struct fd_tap *tap, int cmd) | 
|  | { | 
|  | struct eventfd *efd = c->aux; | 
|  | int ret; | 
|  |  | 
|  | /* HANGUP, ERROR, and PRIORITY will never fire, but people can ask for | 
|  | * them.  We don't actually support HANGUP, but epoll implies it. | 
|  | * Linux's eventfd cand have ERROR, so apps can ask for it.  Likewise, | 
|  | * priority is meaningless for us, but sometimes people ask for it. */ | 
|  | #define EFD_LEGAL_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE |        \ | 
|  | FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |          \ | 
|  | FDTAP_FILT_ERROR) | 
|  |  | 
|  | switch (c->qid.path) { | 
|  | case Qefd: | 
|  | if (tap->filter & ~EFD_LEGAL_TAPS) { | 
|  | set_error(ENOSYS, "Unsupported #%s tap %p, must be %p", | 
|  | devname(), tap->filter, EFD_LEGAL_TAPS); | 
|  | return -1; | 
|  | } | 
|  | spin_lock(&efd->tap_lock); | 
|  | switch (cmd) { | 
|  | case (FDTAP_CMD_ADD): | 
|  | SLIST_INSERT_HEAD(&efd->fd_taps, tap, link); | 
|  | ret = 0; | 
|  | break; | 
|  | case (FDTAP_CMD_REM): | 
|  | SLIST_REMOVE(&efd->fd_taps, tap, fd_tap, link); | 
|  | ret = 0; | 
|  | break; | 
|  | default: | 
|  | set_error(ENOSYS, "Unsupported #%s tap command %p", | 
|  | devname(), cmd); | 
|  | ret = -1; | 
|  | } | 
|  | spin_unlock(&efd->tap_lock); | 
|  | return ret; | 
|  | default: | 
|  | set_error(ENOSYS, "Can't tap #%s file type %d", devname(), | 
|  | c->qid.path); | 
|  | return -1; | 
|  | } | 
|  | } | 
|  |  | 
|  | struct dev efd_devtab __devtab = { | 
|  | .name = "eventfd", | 
|  | .reset = devreset, | 
|  | .init = devinit, | 
|  | .shutdown = devshutdown, | 
|  | .attach = efd_attach, | 
|  | .walk = efd_walk, | 
|  | .stat = efd_stat, | 
|  | .open = efd_open, | 
|  | .create = devcreate, | 
|  | .close = efd_close, | 
|  | .read = efd_read, | 
|  | .bread = devbread, | 
|  | .write = efd_write, | 
|  | .bwrite = devbwrite, | 
|  | .remove = devremove, | 
|  | .wstat = devwstat, | 
|  | .power = devpower, | 
|  | .chaninfo = efd_chaninfo, | 
|  | .tapfd = efd_tapfd, | 
|  | }; |