blob: 479666cc20907a639d5d303fdeb559777f4522a3 [file] [log] [blame]
/* Copyright (c) 2015 Google Inc
* Barret Rhoden <brho@cs.berkeley.edu>
* See LICENSE for details.
*
* #eventfd device, the kernel-side implementation of man 2 eventfd.
*
* Unlike the Linux interface, which takes host-endian u64s, we read and write
* strings. It's a little slower, but it maintains the distributed-system
* nature of Plan 9 devices. */
#include <ns.h>
#include <kmalloc.h>
#include <kref.h>
#include <atomic.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <sys/queue.h>
#include <fdtap.h>
#include <syscall.h>
/* Tentative definition: the devtab is defined with its initializer later in
 * this file, but devname() needs to refer to it before then. */
struct dev efd_devtab;

/* Returns the device name recorded in this device's devtab entry.  Used when
 * attaching and when building error messages. */
static char *devname(void)
{
	struct dev *self = &efd_devtab;

	return self->name;
}
/* Qid paths of the files inside one #eventfd instance.  The values are also
 * used as indexes into efd_dir[], so they must stay dense and start at 0. */
enum {
	Qdir = 0,
	Qctl = 1,
	Qefd = 2,
};
/* Directory table for the device: the directory itself, the ctl file, and the
 * efd file.  Each entry sits at the index given by its Qid path. */
static struct dirtab efd_dir[] = {
	[Qdir] = {".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555},
	[Qctl] = {"ctl", {Qctl, 0, QTFILE}, 0, 0666},
	[Qefd] = {"efd", {Qefd, 0, QTFILE}, 8, 0666},
};
enum {
	/* eventfd->flags bit: a read takes a single count instead of draining
	 * the whole counter. */
	EFD_SEMAPHORE = 0x1,
	/* Largest value the counter may hold: all ones except the low bit,
	 * i.e. 0xfffffffffffffffe for a 64-bit unsigned long. */
	EFD_MAX_VAL = ~1UL,
};
/* State for one instance of the #eventfd device.  It is created by attach and
 * shared, via chan->aux, by every chan within that instance. */
struct eventfd {
	/* EFD_SEMAPHORE may be set at attach time; readable through ctl */
	int flags;
	/* The eventfd count: writes add to it, reads consume from it */
	atomic_t counter;
	/* FD taps registered through tapfd; fired when the counter changes */
	struct fdtap_slist fd_taps;
	/* Held while modifying or walking fd_taps */
	spinlock_t tap_lock;
	/* Readers sleep here while the counter is zero */
	struct rendez rv_readers;
	/* Writers sleep here while their write does not fit in the counter */
	struct rendez rv_writers;
	/* One reference per distinct chan; efd_release runs on the last put */
	struct kref refcnt;
};
/* Kref release method for an eventfd: called when the last reference is put,
 * it frees the object. */
static void efd_release(struct kref *kref)
{
	struct eventfd *dying;

	dying = container_of(kref, struct eventfd, refcnt);
	/* Every FD that had a tap must already have been closed by the time
	 * the last chan dropped its reference, so no taps may remain. */
	assert(SLIST_EMPTY(&dying->fd_taps));
	kfree(dying);
}
/* Attaches to the device, creating a fresh eventfd instance.
 *
 * The returned chan refers to the instance's directory and carries the new
 * eventfd in its aux.  A spec of "sem" puts the eventfd in semaphore mode. */
static struct chan *efd_attach(char *spec)
{
	struct chan *chan = devattach(devname(), spec);
	struct eventfd *efd = kzmalloc(sizeof(*efd), MEM_WAIT);

	/* Chans come from attach and from walk.  For the most part, each one
	 * holds a ref on the eventfd; this is the attach chan's. */
	kref_init(&efd->refcnt, efd_release, 1);
	rendez_init(&efd->rv_readers);
	rendez_init(&efd->rv_writers);
	spinlock_init(&efd->tap_lock);
	SLIST_INIT(&efd->fd_taps);

	/* As a shortcut that saves a syscall, attaching with the spec "sem"
	 * selects semaphore mode right away. */
	if (strcmp(spec, "sem") == 0)
		efd->flags |= EFD_SEMAPHORE;

	/* The qid does not identify this particular eventfd; the aux pointer
	 * does.  A debugging ID could go in the path, like pipe does. */
	mkqid(&chan->qid, Qdir, 0, QTDIR);
	chan->aux = efd;
	return chan;
}
/* Walks within an #eventfd instance, taking a reference on the eventfd for
 * any new chan the walk produces.
 *
 * Walk and attach are the two sources of distinct chans for this device.
 * Every such chan is eventually closed, and close drops one reference on the
 * chan's aux (the eventfd).  All chans of one *instance* share the same
 * eventfd, one refcnt each.  Copies of a chan pointer (e.g. FD dup) are
 * covered by the *chan's* own refcnt, not by ours.
 *
 * A non-NULL wq means the walk got somewhere; wq->clone is only set when all
 * nname elements were found, which is what the "big walk" considers success.
 * If nc == c on entry, the resulting chan may be the original one, in which
 * case no new chan exists and no new reference is needed.  The invariant is
 * that the refcnt equals the number of distinct chans in this instance. */
static struct walkqid *efd_walk(struct chan *c, struct chan *nc, char **name,
                                unsigned int nname)
{
	struct eventfd *efd = c->aux;
	struct walkqid *wq = devwalk(c, nc, name, nname, efd_dir,
				     ARRAY_SIZE(efd_dir), devgen);

	/* Nothing was walked at all */
	if (wq == NULL)
		return wq;
	/* Partial walk, or the walk landed back on the original chan */
	if (wq->clone == NULL || wq->clone == c)
		return wq;
	kref_get(&efd->refcnt, 1);
	return wq;
}
/* Stats a file of the device by deferring to the generic devstat over our
 * directory table.  Possible future work: use stat / wstat to get and set
 * O_NONBLOCK. */
static size_t efd_stat(struct chan *c, uint8_t *db, size_t n)
{
	size_t stat_len;

	stat_len = devstat(c, db, n, efd_dir, ARRAY_SIZE(efd_dir), devgen);
	return stat_len;
}
/* Opens a file of the device by deferring to the generic devopen over our
 * directory table. */
static struct chan *efd_open(struct chan *c, int omode)
{
	struct chan *opened;

	opened = devopen(c, omode, efd_dir, ARRAY_SIZE(efd_dir), devgen);
	return opened;
}
/* Closes a chan of the device.  This drops the eventfd reference that the
 * chan picked up at attach or on a successful walk. */
static void efd_close(struct chan *c)
{
	struct eventfd *efd = c->aux;
	struct kref *ref = &efd->refcnt;

	kref_put(ref);
}
/* Fires every FD tap registered on efd, passing filter along to each one. */
static void efd_fire_taps(struct eventfd *efd, int filter)
{
	struct fd_tap *tap;

	/* Unlocked peek: skip taking the lock when there is nothing to fire */
	if (!SLIST_EMPTY(&efd->fd_taps)) {
		/* Few FD taps are expected, so there is no point in keeping
		 * separate reader and writer lists.
		 * TODO: (RCU) Locking to protect the list and the tap's
		 * existence. */
		spin_lock(&efd->tap_lock);
		SLIST_FOREACH(tap, &efd->fd_taps, link)
			fire_tap(tap, filter);
		spin_unlock(&efd->tap_lock);
	}
}
/* Rendez condition for readers: true once the eventfd passed in arg has a
 * non-zero counter. */
static int has_counts(void *arg)
{
	struct eventfd *waited_on = arg;

	return atomic_read(&waited_on->counter) ? 1 : 0;
}
/* The heart of reading an eventfd.
 *
 * Consumes from the counter and returns the amount consumed: 1 in semaphore
 * mode, otherwise the whole count.  While the counter is zero, the caller
 * sleeps, unless the chan is O_NONBLOCK, in which case EAGAIN is thrown.
 * After a successful read, blocked writers are woken and writable taps are
 * fired. */
static unsigned long efd_read_efd(struct eventfd *efd, struct chan *c)
{
	unsigned long seen, remaining, consumed;

	for (;;) {
		seen = atomic_read(&efd->counter);
		if (seen == 0) {
			if (c->flag & O_NONBLOCK)
				error(EAGAIN, "Would block on #%s read",
				      devname());
			rendez_sleep(&efd->rv_readers, has_counts, efd);
			/* Resample the counter after waking up */
			continue;
		}
		if (efd->flags & EFD_SEMAPHORE) {
			consumed = 1;
			remaining = seen - 1;
		} else {
			consumed = seen;
			remaining = 0;
		}
		/* Retry from the top if someone changed the counter since we
		 * sampled it. */
		if (atomic_cas(&efd->counter, seen, remaining))
			break;
	}
	rendez_wakeup(&efd->rv_writers);
	efd_fire_taps(efd, FDTAP_FILT_WRITABLE);
	return consumed;
}
/* Device read method, dispatching on the file being read.
 *
 * The directory is read through the generic devdirread.  ctl yields the
 * eventfd's flags and efd yields the value consumed from the counter, both
 * handed to readnum for formatting.  Any other qid path is a kernel bug. */
static size_t efd_read(struct chan *c, void *ubuf, size_t n, off64_t offset)
{
	struct eventfd *efd = c->aux;
	unsigned long consumed;

	switch (c->qid.path) {
	case Qdir:
		return devdirread(c, ubuf, n, efd_dir, ARRAY_SIZE(efd_dir),
				  devgen);
	case Qctl:
		return readnum(offset, ubuf, n, efd->flags, NUMSIZE32);
	case Qefd:
		/* The counter is consumed first (this may block or throw);
		 * the chan offset is deliberately ignored for this file. */
		consumed = efd_read_efd(efd, c);
		return readnum(0, ubuf, n, consumed, NUMSIZE64);
	default:
		panic("Bad Qid %p!", c->qid.path);
	}
	return -1;
}
/* What a blocked writer is waiting for: enough room in efd's counter to add
 * add_to without exceeding EFD_MAX_VAL. */
struct efd_write_wait {
	struct eventfd *efd;
	unsigned long add_to;
};

/* Rendez condition for writers.  arg is a struct efd_write_wait; returns true
 * once the pending amount fits in the counter.  Testing for the specific
 * amount (and not merely "counter != EFD_MAX_VAL") keeps a writer asleep
 * until its write can actually succeed, instead of waking immediately and
 * spinning. */
static int has_room(void *arg)
{
	struct efd_write_wait *wait = arg;
	unsigned long cur = atomic_read(&wait->efd->counter);

	return wait->add_to <= EFD_MAX_VAL - cur;
}

/* The heart of writing an eventfd.
 *
 * Adds add_to to the counter.  If that would push the counter past
 * EFD_MAX_VAL, the caller sleeps until a reader has made enough room, unless
 * the chan is O_NONBLOCK, in which case EAGAIN is thrown.  After a successful
 * write, blocked readers are woken and readable taps are fired. */
static void efd_write_efd(struct eventfd *efd, unsigned long add_to,
                          struct chan *c)
{
	struct efd_write_wait wait = { .efd = efd, .add_to = add_to };
	unsigned long old_count;

	while (1) {
		old_count = atomic_read(&efd->counter);
		/* The counter never exceeds EFD_MAX_VAL, so the subtraction
		 * cannot underflow.  Comparing this way, rather than testing
		 * old_count + add_to, catches sums that would wrap around
		 * the unsigned long and look like a small, legal value. */
		if (add_to > EFD_MAX_VAL - old_count) {
			if (c->flag & O_NONBLOCK)
				error(EAGAIN, "Would block on #%s write",
				      devname());
			/* Readers wake rv_writers after every successful
			 * read; we go around again once our amount fits. */
			rendez_sleep(&efd->rv_writers, has_room, &wait);
		} else {
			/* Retry from the top if someone changed the counter
			 * since we sampled it. */
			if (atomic_cas(&efd->counter, old_count,
				       old_count + add_to))
				goto success;
		}
	}
success:
	rendez_wakeup(&efd->rv_readers);
	efd_fire_taps(efd, FDTAP_FILT_READABLE);
}
/* Device write method, dispatching on the file being written.
 *
 * ctl accepts no commands and always throws EFAIL.  For efd, the first n
 * bytes of ubuf are parsed as a number (strtoul, base auto-detected) and
 * added to the counter, which may block or throw; see efd_write_efd().  The
 * chan offset is ignored.  Returns n on success.  Any other qid path is a
 * kernel bug. */
static size_t efd_write(struct chan *c, void *ubuf, size_t n, off64_t offset)
{
	struct eventfd *efd = c->aux;
	unsigned long write_val;
	char num64[NUMSIZE64];

	switch (c->qid.path) {
	case Qctl:
		/* If we want to allow runtime changing of settings, we can do
		 * it here. */
		error(EFAIL, "No #%s ctl commands supported", devname());
		break;
	case Qefd:
		/* strtoul needs a null-terminated buffer and can't be handed
		 * an arbitrary user string, so the input is copied into
		 * num64.  The copy needs n bytes plus one for the terminator:
		 * n == sizeof(num64) must be rejected too, or the terminator
		 * would be written one byte past the end of the buffer. */
		if (n >= sizeof(num64))
			error(EAGAIN, "attempted to write %d chars, max %d", n,
			      sizeof(num64) - 1);
		memcpy(num64, ubuf, n);
		num64[n] = 0;	/* enforce trailing 0 */
		write_val = strtoul(num64, NULL, 0);
		/* An all-ones value is never a legal amount to add */
		if (write_val == (unsigned long)(-1))
			error(EFAIL, "Eventfd write must not be -1");
		efd_write_efd(efd, write_val, c);
		break;
	default:
		panic("Bad Qid %p!", c->qid.path);
	}
	return n;
}
static char *efd_chaninfo(struct chan *c, char *ret, size_t ret_l)
{
struct eventfd *efd = c->aux;
snprintf(ret, ret_l, "QID type %s, flags %p, counter %p",
efd_dir[c->qid.path].name, efd->flags,
atomic_read(&efd->counter));
return ret;
}
/* Adds or removes an FD tap on the efd file.
 *
 * Only Qefd can be tapped, and only with filters from EFD_LEGAL_TAPS.  cmd is
 * FDTAP_CMD_ADD or FDTAP_CMD_REM.  Returns 0 on success; on failure, sets the
 * error (ENOSYS) and returns -1. */
static int efd_tapfd(struct chan *c, struct fd_tap *tap, int cmd)
{
	struct eventfd *efd = c->aux;
	int ret;

	/* HANGUP, ERROR, and PRIORITY never fire here, but callers may still
	 * request them: epoll implies HANGUP even though we don't support it,
	 * Linux's eventfd can have ERROR so apps ask for it, and PRIORITY is
	 * meaningless for us but sometimes requested anyway. */
#define EFD_LEGAL_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE |            \
			FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |              \
			FDTAP_FILT_ERROR)

	if (c->qid.path != Qefd) {
		set_error(ENOSYS, "Can't tap #%s file type %d", devname(),
			  c->qid.path);
		return -1;
	}
	if (tap->filter & ~EFD_LEGAL_TAPS) {
		set_error(ENOSYS, "Unsupported #%s tap, must be %p",
			  devname(), EFD_LEGAL_TAPS);
		return -1;
	}

	spin_lock(&efd->tap_lock);
	if (cmd == FDTAP_CMD_ADD) {
		SLIST_INSERT_HEAD(&efd->fd_taps, tap, link);
		ret = 0;
	} else if (cmd == FDTAP_CMD_REM) {
		SLIST_REMOVE(&efd->fd_taps, tap, fd_tap, link);
		ret = 0;
	} else {
		set_error(ENOSYS, "Unsupported #%s tap command %p",
			  devname(), cmd);
		ret = -1;
	}
	spin_unlock(&efd->tap_lock);
	return ret;
}
/* The device table entry for #eventfd.  Methods with eventfd-specific
 * behavior are listed first; everything else uses the generic dev* default. */
struct dev efd_devtab __devtab = {
	.name = "eventfd",

	/* eventfd-specific methods */
	.attach = efd_attach,
	.walk = efd_walk,
	.stat = efd_stat,
	.open = efd_open,
	.close = efd_close,
	.read = efd_read,
	.write = efd_write,
	.chaninfo = efd_chaninfo,
	.tapfd = efd_tapfd,

	/* generic defaults */
	.reset = devreset,
	.init = devinit,
	.shutdown = devshutdown,
	.create = devcreate,
	.bread = devbread,
	.bwrite = devbwrite,
	.remove = devremove,
	.wstat = devwstat,
	.power = devpower,
};