/* kern/drivers/dev/eventfd.c - akaros - Git at Google */

 /* Copyright (c) 2015 Google Inc
  * Barret Rhoden <brho@cs.berkeley.edu>
  * See LICENSE for details.
  *
  * #eventfd device, the kernel-side implementation of man 2 eventfd.
  *
  * Unlike the Linux interface, which takes host-endian u64s, we read and write
  * strings.  It's a little slower, but it maintains the distributed-system
  * nature of Plan 9 devices. */

 #include <ns.h>
 #include <kmalloc.h>
 #include <kref.h>
 #include <atomic.h>
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
 #include <error.h>
 #include <sys/queue.h>
 #include <fdtap.h>
 #include <syscall.h>

 /* Tentative declaration; the initialized table sits at the bottom of the file,
  * but devname() needs to reach its name field before that point. */
 struct dev efd_devtab;

 /* Returns the name recorded in this device's devtab entry. */
 static char *devname(void)
 {
 	char *name = efd_devtab.name;

 	return name;
 }

 /* Qid paths of the files this device serves.  The values are also used as
  * indices into efd_dir[], so they must follow the order of that table. */
 enum {
 	Qdir = 0,
 	Qctl = 1,
 	Qefd = 2,
 };

 /* Directory table for one #eventfd instance: the directory itself plus its two
  * files, "ctl" and "efd".  Each entry's qid path is the matching Q* constant.
  * The table is handed, together with devgen, to devwalk / devstat / devopen /
  * devdirread by the handlers below.
  * NOTE(review): the trailing two initializers look like length and permission
  * bits -- confirm against struct dirtab. */
 static struct dirtab efd_dir[] = {
 	{".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555},
 	{"ctl", {Qctl, 0, QTFILE}, 0, 0666},
 	{"efd", {Qefd, 0, QTFILE}, 8, 0666},
 };

 /* Flag bits stored in struct eventfd's flags field. */
 enum {
 	EFD_SEMAPHORE = 	1 << 0,
 };

 /* Largest value the counter may hold, i.e. 0xfffffffffffffffe with a 64-bit
  * unsigned long.  This is a macro rather than an enumerator because ISO C
  * requires enumeration constants to be representable as an int; a value this
  * large is only accepted in an enum as a compiler extension. */
 #define EFD_MAX_VAL ((unsigned long)(-2))


 /* State of one eventfd instance.  A single object is shared, through c->aux,
  * by every chan that was attached to or walked within the same instance. */
 struct eventfd {
 	/* EFD_* flag bits; EFD_SEMAPHORE is set at attach time. */
 	int 				flags;
 	/* The eventfd count; read with atomic_read, updated with atomic_cas. */
 	atomic_t			counter;
 	/* FD taps registered through efd_tapfd. */
 	struct fdtap_slist		fd_taps;
 	/* Held while walking or modifying fd_taps. */
 	spinlock_t			tap_lock;
 	/* Readers sleep here while the counter is zero. */
 	struct rendez			rv_readers;
 	/* Writers sleep here while their addition would not fit. */
 	struct rendez			rv_writers;
 	/* One reference per distinct chan; efd_release runs when it hits 0. */
 	struct kref			refcnt;
 };


 /* kref release method for struct eventfd (installed by kref_init in
  * efd_attach): frees the object. */
 static void efd_release(struct kref *kref)
 {
 	struct eventfd *dead = container_of(kref, struct eventfd, refcnt);

 	/* Every FD carrying a tap has to be closed before the chans give up
 	 * their last reference, so the tap list must already be empty. */
 	assert(SLIST_EMPTY(&dead->fd_taps));
 	kfree(dead);
 }

 /* Attach handler: builds a fresh eventfd and returns the root chan for it.
  * The eventfd hangs off the chan's aux and starts out with one reference,
  * which belongs to the returned chan.  As a shortcut that saves a syscall,
  * spec == "sem" puts the new eventfd into semaphore mode. */
 static struct chan *efd_attach(char *spec)
 {
 	struct chan *root = devattach(devname(), spec);
 	struct eventfd *fresh = kzmalloc(sizeof(struct eventfd), MEM_WAIT);

 	/* Attach and walk are the two places chans come from; each such chan
 	 * holds one reference.  This is the one for root. */
 	kref_init(&fresh->refcnt, efd_release, 1);
 	spinlock_init(&fresh->tap_lock);
 	SLIST_INIT(&fresh->fd_taps);
 	rendez_init(&fresh->rv_readers);
 	rendez_init(&fresh->rv_writers);
 	if (strcmp(spec, "sem") == 0)
 		fresh->flags |= EFD_SEMAPHORE;
 	/* The qid holds nothing that identifies this particular eventfd; the
 	 * aux pointer is what ties the chan to its instance. */
 	mkqid(&root->qid, Qdir, 0, QTDIR);
 	root->aux = fresh;
 	return root;
 }

 /* Walk handler: defers to devwalk, then accounts for any new chan it made.
  *
  * Every distinct chan of an eventfd instance shares the same efd through its
  * aux and owns exactly one reference on it, dropped again in efd_close.
  * (Copies of a chan pointer, e.g. from an FD dup, are covered by the chan's
  * own refcount instead.)  A reference is therefore taken only when the walk
  * consumed all nname elements (wq->clone is set) and the resulting chan is
  * not c itself, which happens when the caller passed nc == c. */
 static struct walkqid *efd_walk(struct chan *c, struct chan *nc, char **name,
 				unsigned int nname)
 {
 	struct eventfd *efd = c->aux;
 	struct walkqid *wq = devwalk(c, nc, name, nname, efd_dir,
 				     ARRAY_SIZE(efd_dir), devgen);

 	/* Nothing was walked at all. */
 	if (!wq)
 		return wq;
 	/* Partial walk, or the walk reused the caller's chan: no new chan. */
 	if (!wq->clone || wq->clone == c)
 		return wq;
 	kref_get(&efd->refcnt, 1);
 	return wq;
 }

 /* Stat handler: plain devstat over this device's directory table.  stat and
  * wstat could later become the way to get and set O_NONBLOCK. */
 static size_t efd_stat(struct chan *c, uint8_t *db, size_t n)
 {
 	size_t ret = devstat(c, db, n, efd_dir, ARRAY_SIZE(efd_dir), devgen);

 	return ret;
 }

 /* Open handler: plain devopen over this device's directory table. */
 static struct chan *efd_open(struct chan *c, int omode)
 {
 	struct chan *opened = devopen(c, omode, efd_dir, ARRAY_SIZE(efd_dir),
 				      devgen);

 	return opened;
 }

 /* Close handler: gives back the eventfd reference this chan has held since it
  * came out of attach or a successful walk. */
 static void efd_close(struct chan *c)
 {
 	kref_put(&((struct eventfd *)c->aux)->refcnt);
 }

 static void efd_fire_taps(struct eventfd *efd, int filter)
 {
 	struct fd_tap *tap_i;

 	if (SLIST_EMPTY(&efd->fd_taps))
 		return;
 	/* We're not expecting many FD taps, so it's not worth splitting readers
 	 * from writers or anything like that.
 	 * TODO: (RCU) Locking to protect the list and the tap's existence. */
 	spin_lock(&efd->tap_lock);
 	SLIST_FOREACH(tap_i, &efd->fd_taps, link)
 		fire_tap(tap_i, filter);
 	spin_unlock(&efd->tap_lock);
 }

 /* rendez condition for readers: nonzero once the counter is nonzero.  arg is
  * the struct eventfd. */
 static int has_counts(void *arg)
 {
 	struct eventfd *efd = arg;

 	return atomic_read(&efd->counter) ? 1 : 0;
 }

 /* Core of an eventfd read.  Waits for a nonzero counter (or calls
  * error(EAGAIN, ...) when the chan is O_NONBLOCK), then consumes from it with
  * a compare-and-swap: in semaphore mode the counter drops by one and 1 is
  * returned, otherwise the counter is reset to zero and its old value is
  * returned.  Afterwards writers are woken and WRITABLE taps are fired. */
 static unsigned long efd_read_efd(struct eventfd *efd, struct chan *c)
 {
 	unsigned long seen, next, taken;

 	for (;;) {
 		seen = atomic_read(&efd->counter);
 		if (seen == 0) {
 			if (c->flag & O_NONBLOCK)
 				error(EAGAIN, "Would block on #%s read",
 				      devname());
 			rendez_sleep(&efd->rv_readers, has_counts, efd);
 			continue;
 		}
 		if (efd->flags & EFD_SEMAPHORE) {
 			next = seen - 1;
 			taken = 1;
 		} else {
 			next = 0;
 			taken = seen;
 		}
 		/* A failed swap means the counter moved under us: start over. */
 		if (atomic_cas(&efd->counter, seen, next))
 			break;
 	}
 	rendez_wakeup(&efd->rv_writers);
 	efd_fire_taps(efd, FDTAP_FILT_WRITABLE);
 	return taken;
 }

 /* Read handler.  Qdir yields a directory listing, Qctl the flags as a number,
  * and Qefd the value consumed from the counter (the chan offset is ignored
  * there).  Any other qid path is a panic. */
 static size_t efd_read(struct chan *c, void *ubuf, size_t n, off64_t offset)
 {
 	struct eventfd *efd = c->aux;

 	if (c->qid.path == Qdir)
 		return devdirread(c, ubuf, n, efd_dir, ARRAY_SIZE(efd_dir),
 				  devgen);
 	if (c->qid.path == Qctl)
 		return readnum(offset, ubuf, n, efd->flags, NUMSIZE32);
 	if (c->qid.path == Qefd) {
 		unsigned long count = efd_read_efd(efd, c);

 		return readnum(0, ubuf, n, count, NUMSIZE64);
 	}
 	panic("Bad Qid %p!", c->qid.path);
 	return -1;
 }

 /* What a blocked writer is waiting for: enough headroom in efd's counter to
  * take add_to without passing EFD_MAX_VAL. */
 struct efd_write_wait {
 	struct eventfd			*efd;
 	unsigned long			add_to;
 };

 /* Nonzero if count + add_to stays at or below EFD_MAX_VAL.  Phrased as a
  * subtraction so that the check itself cannot wrap around. */
 static int efd_fits(unsigned long count, unsigned long add_to)
 {
 	return count <= EFD_MAX_VAL && add_to <= EFD_MAX_VAL - count;
 }

 /* rendez condition for writers: nonzero once the pending addition fits.  arg
  * is a struct efd_write_wait. */
 static int has_room(void *arg)
 {
 	struct efd_write_wait *wait = arg;

 	return efd_fits(atomic_read(&wait->efd->counter), wait->add_to);
 }

 /* The heart of writing an eventfd.  Adds add_to to the counter with a
  * compare-and-swap.  While the sum would exceed EFD_MAX_VAL, the writer either
  * calls error(EAGAIN, ...) (O_NONBLOCK chans) or sleeps on rv_writers until
  * there is room for this particular addition.  On success, readers are woken
  * and READABLE taps are fired. */
 static void efd_write_efd(struct eventfd *efd, unsigned long add_to,
 			  struct chan *c)
 {
 	struct efd_write_wait wait = {.efd = efd, .add_to = add_to};
 	unsigned long old_count;

 	while (1) {
 		old_count = atomic_read(&efd->counter);
 		/* Testing old_count + add_to directly would miss sums that wrap
 		 * past the top of unsigned long and come out small. */
 		if (!efd_fits(old_count, add_to)) {
 			if (c->flag & O_NONBLOCK)
 				error(EAGAIN, "Would block on #%s write",
 				      devname());
 			rendez_sleep(&efd->rv_writers, has_room, &wait);
 			continue;
 		}
 		if (atomic_cas(&efd->counter, old_count, old_count + add_to))
 			break;
 	}
 	rendez_wakeup(&efd->rv_readers);
 	efd_fire_taps(efd, FDTAP_FILT_READABLE);
 }

 /* Write handler.  Qctl accepts no commands.  For Qefd, the user's bytes are
  * parsed as a number with strtoul (base auto-detected) and added to the
  * counter; the chan offset is ignored.  Any other qid path is a panic.
  * Returns n. */
 static size_t efd_write(struct chan *c, void *ubuf, size_t n, off64_t offset)
 {
 	struct eventfd *efd = c->aux;
 	unsigned long write_val;
 	/* One byte larger than the longest accepted write, so the terminator
 	 * stored at num64[n] always lands inside the array. */
 	char num64[NUMSIZE64 + 1];

 	switch (c->qid.path) {
 	case Qctl:
 		/* If we want to allow runtime changing of settings, we can do
 		 * it here. */
 		error(EFAIL, "No #%s ctl commands supported", devname());
 		break;
 	case Qefd:
 		/* strtoul needs a null-terminated buffer and can't be pointed
 		 * at an arbitrary user string, so copy and terminate.  The
 		 * cast makes the argument match the %d conversion. */
 		if (n > NUMSIZE64)
 			error(EAGAIN, "attempted to write %d chars, max %d",
 			      (int)n, NUMSIZE64);
 		memcpy(num64, ubuf, n);
 		num64[n] = 0;	/* enforce trailing 0 */
 		write_val = strtoul(num64, 0, 0);
 		if (write_val == (unsigned long)(-1))
 			error(EFAIL, "Eventfd write must not be -1");
 		efd_write_efd(efd, write_val, c);
 		break;
 	default:
 		panic("Bad Qid %p!", c->qid.path);
 	}
 	return n;
 }

 static char *efd_chaninfo(struct chan *c, char *ret, size_t ret_l)
 {
 	struct eventfd *efd = c->aux;

 	snprintf(ret, ret_l, "QID type %s, flags %p, counter %p",
 		 efd_dir[c->qid.path].name, efd->flags,
 		 atomic_read(&efd->counter));
 	return ret;
 }

 /* Tap handler: adds or removes an FD tap on the eventfd.  Only the Qefd file
  * can be tapped.  Returns 0 on success; on failure sets an ENOSYS error and
  * returns -1. */
 static int efd_tapfd(struct chan *c, struct fd_tap *tap, int cmd)
 {
 	struct eventfd *efd = c->aux;
 	int ret = 0;

 	/* HANGUP, ERROR and PRIORITY never fire here, but callers may still
 	 * request them: epoll implies HANGUP, Linux's eventfd can have ERROR,
 	 * and PRIORITY, though meaningless for us, is sometimes asked for. */
 #define EFD_LEGAL_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE |        \
                         FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |          \
                         FDTAP_FILT_ERROR)

 	if (c->qid.path != Qefd) {
 		set_error(ENOSYS, "Can't tap #%s file type %d", devname(),
 		          c->qid.path);
 		return -1;
 	}
 	if (tap->filter & ~EFD_LEGAL_TAPS) {
 		set_error(ENOSYS, "Unsupported #%s tap, must be %p",
 			  devname(), EFD_LEGAL_TAPS);
 		return -1;
 	}
 	spin_lock(&efd->tap_lock);
 	if (cmd == FDTAP_CMD_ADD) {
 		SLIST_INSERT_HEAD(&efd->fd_taps, tap, link);
 	} else if (cmd == FDTAP_CMD_REM) {
 		SLIST_REMOVE(&efd->fd_taps, tap, fd_tap, link);
 	} else {
 		set_error(ENOSYS, "Unsupported #%s tap command %p",
 			  devname(), cmd);
 		ret = -1;
 	}
 	spin_unlock(&efd->tap_lock);
 	return ret;
 }

 /* Device table entry for #eventfd.  Operations with eventfd-specific
  * behavior point at the efd_* handlers above; the rest use the generic dev*
  * implementations. */
 struct dev efd_devtab __devtab = {
 	.name = "eventfd",
 	.reset = devreset,
 	.init = devinit,
 	.shutdown = devshutdown,
 	.attach = efd_attach,
 	.walk = efd_walk,
 	.stat = efd_stat,
 	.open = efd_open,
 	.create = devcreate,
 	.close = efd_close,
 	.read = efd_read,
 	.bread = devbread,
 	.write = efd_write,
 	.bwrite = devbwrite,
 	.remove = devremove,
 	.wstat = devwstat,
 	.power = devpower,
 	.chaninfo = efd_chaninfo,
 	.tapfd = efd_tapfd,
 };
	/* Copyright (c) 2015 Google Inc
	* Barret Rhoden <brho@cs.berkeley.edu>
	* See LICENSE for details.
	*
	* #eventfd device, the kernel-side implementation of man 2 eventfd.
	*
	* Unlike the Linux interface, which takes host-endian u64s, we read and write
	* strings. It's a little slower, but it maintains the distributed-system
	* nature of Plan 9 devices. */

	#include <ns.h>
	#include <kmalloc.h>
	#include <kref.h>
	#include <atomic.h>
	#include <string.h>
	#include <stdio.h>
	#include <assert.h>
	#include <error.h>
	#include <sys/queue.h>
	#include <fdtap.h>
	#include <syscall.h>

	/* Tentative declaration; the initialized table sits at the bottom of the
	 * file, but devname() needs to reach its name field before that point. */
	struct dev efd_devtab;

	/* Returns the name recorded in this device's devtab entry. */
	static char *devname(void)
	{
		char *name = efd_devtab.name;

		return name;
	}

	/* Qid paths of the files this device serves.  The values are also used as
	 * indices into efd_dir[], so they must follow the order of that table. */
	enum {
		Qdir = 0,
		Qctl = 1,
		Qefd = 2,
	};

	/* Directory table for one #eventfd instance: the directory itself plus its
	 * two files, "ctl" and "efd".  Each entry's qid path is the matching Q*
	 * constant.
	 * NOTE(review): the trailing two initializers look like length and
	 * permission bits -- confirm against struct dirtab. */
	static struct dirtab efd_dir[] = {
		{".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555},
		{"ctl", {Qctl, 0, QTFILE}, 0, 0666},
		{"efd", {Qefd, 0, QTFILE}, 8, 0666},
	};

	/* Flag bits stored in struct eventfd's flags field. */
	enum {
		EFD_SEMAPHORE = 1 << 0,
	};

	/* Largest value the counter may hold, i.e. 0xfffffffffffffffe with a
	 * 64-bit unsigned long.  This is a macro rather than an enumerator because
	 * ISO C requires enumeration constants to be representable as an int; a
	 * value this large is only accepted in an enum as a compiler extension. */
	#define EFD_MAX_VAL ((unsigned long)(-2))


	/* State of one eventfd instance.  A single object is shared, through
	 * c->aux, by every chan that was attached to or walked within the same
	 * instance. */
	struct eventfd {
	/* EFD_* flag bits; EFD_SEMAPHORE is set at attach time. */
	int flags;
	/* The eventfd count; read with atomic_read, updated with atomic_cas. */
	atomic_t counter;
	/* FD taps registered through efd_tapfd. */
	struct fdtap_slist fd_taps;
	/* Held while walking or modifying fd_taps. */
	spinlock_t tap_lock;
	/* Readers sleep here while the counter is zero. */
	struct rendez rv_readers;
	/* Writers sleep here while their addition would not fit. */
	struct rendez rv_writers;
	/* One reference per distinct chan; efd_release runs when it hits 0. */
	struct kref refcnt;
	};


	/* kref release method for struct eventfd (installed by kref_init in
	 * efd_attach): frees the object. */
	static void efd_release(struct kref *kref)
	{
		struct eventfd *dead = container_of(kref, struct eventfd, refcnt);

		/* Every FD carrying a tap has to be closed before the chans give
		 * up their last reference, so the tap list must already be
		 * empty. */
		assert(SLIST_EMPTY(&dead->fd_taps));
		kfree(dead);
	}

	/* Attach handler: builds a fresh eventfd and returns the root chan for it.
	 * The eventfd hangs off the chan's aux and starts out with one reference,
	 * which belongs to the returned chan.  As a shortcut that saves a syscall,
	 * spec == "sem" puts the new eventfd into semaphore mode. */
	static struct chan *efd_attach(char *spec)
	{
		struct chan *c;
		struct eventfd *efd;

		c = devattach(devname(), spec);
		efd = kzmalloc(sizeof(struct eventfd), MEM_WAIT);
		SLIST_INIT(&efd->fd_taps);
		spinlock_init(&efd->tap_lock);
		rendez_init(&efd->rv_readers);
		rendez_init(&efd->rv_writers);
		/* Attach and walk are the two places chans come from; each such
		 * chan holds one reference.  This is the one for c. */
		kref_init(&efd->refcnt, efd_release, 1);
		/* The qid holds nothing that identifies this particular eventfd;
		 * the aux pointer is what ties the chan to its instance. */
		mkqid(&c->qid, Qdir, 0, QTDIR);
		c->aux = efd;
		if (!strcmp(spec, "sem"))
			efd->flags |= EFD_SEMAPHORE;
		return c;
	}

	static struct walkqid efd_walk(struct chan c, struct chan nc, char *name,
	unsigned int nname)
	{
	struct walkqid *wq;
	struct eventfd *efd = c->aux;

	wq = devwalk(c, nc, name, nname, efd_dir, ARRAY_SIZE(efd_dir), devgen);
	/* Walk is a source of a distinct chan from this device. The other
	* source is attach. Once created, these chans will eventually be
	* closed, and when they close, they will decref their aux, efd. All
	* chans within this instance of eventfd share the same efd. Each one
	* will have one refcnt. Each chan may also have several copies of its
	* pointer out there (e.g. FD dup), all of which have their own chan
	* refcnt.
	*
	* All of the above applies on successful walks that found all nname
	* parts of the path. A mid-success is wq: we got something. wq->clone
	* means we got to the end and the "big walk" considers this a success.
	*
	* There is a slight chance the new chan is the same as our original
	* chan (if nc == c when we're called). In which case, there's only one
	* chan. The number of refs on efd == the number of distinct chans
	* within this instance of #eventfd. */
	if (wq != NULL && wq->clone != NULL && wq->clone != c)
	kref_get(&efd->refcnt, 1);
	return wq;
	}

	/* Stat handler: plain devstat over this device's directory table.  stat
	 * and wstat could later become the way to get and set O_NONBLOCK. */
	static size_t efd_stat(struct chan *c, uint8_t *db, size_t n)
	{
		return devstat(c, db, n, efd_dir, ARRAY_SIZE(efd_dir), devgen);
	}

	/* Open handler: plain devopen over this device's directory table. */
	static struct chan *efd_open(struct chan *c, int omode)
	{
		return devopen(c, omode, efd_dir, ARRAY_SIZE(efd_dir), devgen);
	}

	/* Close handler: gives back the eventfd reference this chan has held since
	 * it came out of attach or a successful walk. */
	static void efd_close(struct chan *c)
	{
		kref_put(&((struct eventfd *)c->aux)->refcnt);
	}

	static void efd_fire_taps(struct eventfd *efd, int filter)
	{
	struct fd_tap *tap_i;

	if (SLIST_EMPTY(&efd->fd_taps))
	return;
	/* We're not expecting many FD taps, so it's not worth splitting readers
	* from writers or anything like that.
	* TODO: (RCU) Locking to protect the list and the tap's existence. */
	spin_lock(&efd->tap_lock);
	SLIST_FOREACH(tap_i, &efd->fd_taps, link)
	fire_tap(tap_i, filter);
	spin_unlock(&efd->tap_lock);
	}

	/* rendez condition for readers: nonzero once the counter is nonzero.  arg
	 * is the struct eventfd. */
	static int has_counts(void *arg)
	{
		struct eventfd *efd = arg;

		return atomic_read(&efd->counter) ? 1 : 0;
	}

	/* The heart of reading an eventfd */
	static unsigned long efd_read_efd(struct eventfd efd, struct chan c)
	{
	unsigned long old_count, new_count, ret;

	while (1) {
	old_count = atomic_read(&efd->counter);
	if (!old_count) {
	if (c->flag & O_NONBLOCK)
	error(EAGAIN, "Would block on #%s read",
	devname());
	rendez_sleep(&efd->rv_readers, has_counts, efd);
	} else {
	if (efd->flags & EFD_SEMAPHORE) {
	new_count = old_count - 1;
	ret = 1;
	} else {
	new_count = 0;
	ret = old_count;
	}
	if (atomic_cas(&efd->counter, old_count, new_count))
	goto success;
	}
	}
	success:
	rendez_wakeup(&efd->rv_writers);
	efd_fire_taps(efd, FDTAP_FILT_WRITABLE);
	return ret;
	}

	/* Read handler.  Qdir yields a directory listing, Qctl the flags as a
	 * number, and Qefd the value consumed from the counter (the chan offset is
	 * ignored there).  Any other qid path is a panic. */
	static size_t efd_read(struct chan *c, void *ubuf, size_t n,
			       off64_t offset)
	{
		struct eventfd *efd = c->aux;

		switch (c->qid.path) {
		case Qdir:
			return devdirread(c, ubuf, n, efd_dir,
					  ARRAY_SIZE(efd_dir), devgen);
		case Qctl:
			return readnum(offset, ubuf, n, efd->flags, NUMSIZE32);
		case Qefd:
			/* ignoring the chan offset for Qefd */
			return readnum(0, ubuf, n, efd_read_efd(efd, c),
				       NUMSIZE64);
		default:
			panic("Bad Qid %p!", c->qid.path);
		}
		return -1;
	}

	/* What a blocked writer is waiting for: enough headroom in efd's counter
	 * to take add_to without passing EFD_MAX_VAL. */
	struct efd_write_wait {
		struct eventfd *efd;
		unsigned long add_to;
	};

	/* Nonzero if count + add_to stays at or below EFD_MAX_VAL.  Phrased as a
	 * subtraction so that the check itself cannot wrap around. */
	static int efd_fits(unsigned long count, unsigned long add_to)
	{
		return count <= EFD_MAX_VAL && add_to <= EFD_MAX_VAL - count;
	}

	/* rendez condition for writers: nonzero once the pending addition fits.
	 * arg is a struct efd_write_wait. */
	static int has_room(void *arg)
	{
		struct efd_write_wait *wait = arg;

		return efd_fits(atomic_read(&wait->efd->counter), wait->add_to);
	}

	/* The heart of writing an eventfd.  Adds add_to to the counter with a
	 * compare-and-swap.  While the sum would exceed EFD_MAX_VAL, the writer
	 * either calls error(EAGAIN, ...) (O_NONBLOCK chans) or sleeps on
	 * rv_writers until there is room for this particular addition.  On
	 * success, readers are woken and READABLE taps are fired. */
	static void efd_write_efd(struct eventfd *efd, unsigned long add_to,
				  struct chan *c)
	{
		struct efd_write_wait wait = {.efd = efd, .add_to = add_to};
		unsigned long old_count;

		while (1) {
			old_count = atomic_read(&efd->counter);
			/* Testing old_count + add_to directly would miss sums
			 * that wrap past the top of unsigned long and come out
			 * small. */
			if (!efd_fits(old_count, add_to)) {
				if (c->flag & O_NONBLOCK)
					error(EAGAIN, "Would block on #%s write",
					      devname());
				rendez_sleep(&efd->rv_writers, has_room, &wait);
				continue;
			}
			if (atomic_cas(&efd->counter, old_count,
				       old_count + add_to))
				break;
		}
		rendez_wakeup(&efd->rv_readers);
		efd_fire_taps(efd, FDTAP_FILT_READABLE);
	}

	/* Write handler.  Qctl accepts no commands.  For Qefd, the user's bytes
	 * are parsed as a number with strtoul (base auto-detected) and added to
	 * the counter; the chan offset is ignored.  Any other qid path is a panic.
	 * Returns n. */
	static size_t efd_write(struct chan *c, void *ubuf, size_t n,
				off64_t offset)
	{
		struct eventfd *efd = c->aux;
		unsigned long write_val;
		/* One byte larger than the longest accepted write, so the
		 * terminator stored at num64[n] always lands inside the array. */
		char num64[NUMSIZE64 + 1];

		switch (c->qid.path) {
		case Qctl:
			/* If we want to allow runtime changing of settings, we
			 * can do it here. */
			error(EFAIL, "No #%s ctl commands supported", devname());
			break;
		case Qefd:
			/* strtoul needs a null-terminated buffer and can't be
			 * pointed at an arbitrary user string, so copy and
			 * terminate.  The cast makes the argument match the %d
			 * conversion. */
			if (n > NUMSIZE64)
				error(EAGAIN,
				      "attempted to write %d chars, max %d",
				      (int)n, NUMSIZE64);
			memcpy(num64, ubuf, n);
			num64[n] = 0; /* enforce trailing 0 */
			write_val = strtoul(num64, 0, 0);
			if (write_val == (unsigned long)(-1))
				error(EFAIL, "Eventfd write must not be -1");
			efd_write_efd(efd, write_val, c);
			break;
		default:
			panic("Bad Qid %p!", c->qid.path);
		}
		return n;
	}

	static char efd_chaninfo(struct chan c, char *ret, size_t ret_l)
	{
	struct eventfd *efd = c->aux;

	snprintf(ret, ret_l, "QID type %s, flags %p, counter %p",
	efd_dir[c->qid.path].name, efd->flags,
	atomic_read(&efd->counter));
	return ret;
	}

	static int efd_tapfd(struct chan c, struct fd_tap tap, int cmd)
	{
	struct eventfd *efd = c->aux;
	int ret;

	/* HANGUP, ERROR, and PRIORITY will never fire, but people can ask for
	* them. We don't actually support HANGUP, but epoll implies it.
	* Linux's eventfd cand have ERROR, so apps can ask for it. Likewise,
	* priority is meaningless for us, but sometimes people ask for it. */
	#define EFD_LEGAL_TAPS (FDTAP_FILT_READABLE \| FDTAP_FILT_WRITABLE \| \
	FDTAP_FILT_HANGUP \| FDTAP_FILT_PRIORITY \| \
	FDTAP_FILT_ERROR)

	switch (c->qid.path) {
	case Qefd:
	if (tap->filter & ~EFD_LEGAL_TAPS) {
	set_error(ENOSYS, "Unsupported #%s tap, must be %p",
	devname(), EFD_LEGAL_TAPS);
	return -1;
	}
	spin_lock(&efd->tap_lock);
	switch (cmd) {
	case (FDTAP_CMD_ADD):
	SLIST_INSERT_HEAD(&efd->fd_taps, tap, link);
	ret = 0;
	break;
	case (FDTAP_CMD_REM):
	SLIST_REMOVE(&efd->fd_taps, tap, fd_tap, link);
	ret = 0;
	break;
	default:
	set_error(ENOSYS, "Unsupported #%s tap command %p",
	devname(), cmd);
	ret = -1;
	}
	spin_unlock(&efd->tap_lock);
	return ret;
	default:
	set_error(ENOSYS, "Can't tap #%s file type %d", devname(),
	c->qid.path);
	return -1;
	}
	}

	/* Device table entry for #eventfd.  Operations with eventfd-specific
	 * behavior point at the efd_* handlers above; the rest use the generic
	 * dev* implementations. */
	struct dev efd_devtab __devtab = {
	.name = "eventfd",
	.reset = devreset,
	.init = devinit,
	.shutdown = devshutdown,
	.attach = efd_attach,
	.walk = efd_walk,
	.stat = efd_stat,
	.open = efd_open,
	.create = devcreate,
	.close = efd_close,
	.read = efd_read,
	.bread = devbread,
	.write = efd_write,
	.bwrite = devbwrite,
	.remove = devremove,
	.wstat = devwstat,
	.power = devpower,
	.chaninfo = efd_chaninfo,
	.tapfd = efd_tapfd,
	};