| /* Copyright (c) 2015 Google Inc | 
 |  * Barret Rhoden <brho@cs.berkeley.edu> | 
 |  * See LICENSE for details. | 
 |  * | 
 |  * #eventfd device, the kernel-side implementation of man 2 eventfd. | 
 |  * | 
 |  * Unlike the Linux interface, which takes host-endian u64s, we read and write | 
 |  * strings.  It's a little slower, but it maintains the distributed-system | 
 |  * nature of Plan 9 devices. */ | 
 |  | 
 | #include <ns.h> | 
 | #include <kmalloc.h> | 
 | #include <kref.h> | 
 | #include <atomic.h> | 
 | #include <string.h> | 
 | #include <stdio.h> | 
 | #include <assert.h> | 
 | #include <error.h> | 
 | #include <sys/queue.h> | 
 | #include <fdtap.h> | 
 | #include <syscall.h> | 
 |  | 
 | struct dev efd_devtab; | 
 |  | 
 | static char *devname(void) | 
 | { | 
 | 	return efd_devtab.name; | 
 | } | 
 |  | 
 | enum { | 
 | 	Qdir, | 
 | 	Qctl, | 
 | 	Qefd, | 
 | }; | 
 |  | 
 | static struct dirtab efd_dir[] = { | 
 | 	{".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555}, | 
 | 	{"ctl", {Qctl, 0, QTFILE}, 0, 0666}, | 
 | 	{"efd", {Qefd, 0, QTFILE}, 8, 0666}, | 
 | }; | 
 |  | 
 | enum { | 
 | 	EFD_SEMAPHORE = 			1 << 0, | 
 | 	EFD_MAX_VAL =				(unsigned long)(-2), // i.e. 0xfffffffffffffffe | 
 | }; | 
 |  | 
 |  | 
 | struct eventfd { | 
 | 	int 						flags; | 
 | 	atomic_t					counter; | 
 | 	struct fdtap_slist			fd_taps; | 
 | 	spinlock_t					tap_lock; | 
 | 	struct rendez				rv_readers; | 
 | 	struct rendez				rv_writers; | 
 | 	struct kref					refcnt; | 
 | }; | 
 |  | 
 |  | 
 | static void efd_release(struct kref *kref) | 
 | { | 
 | 	struct eventfd *efd = container_of(kref, struct eventfd, refcnt); | 
 | 	/* All FDs with taps should be closed before we decreffed all the chans */ | 
 | 	assert(SLIST_EMPTY(&efd->fd_taps)); | 
 | 	kfree(efd); | 
 | } | 
 |  | 
 | static struct chan *efd_attach(char *spec) | 
 | { | 
 | 	struct chan *c; | 
 | 	struct eventfd *efd; | 
 |  | 
 | 	c = devattach(devname(), spec); | 
 | 	efd = kzmalloc(sizeof(struct eventfd), MEM_WAIT); | 
 | 	SLIST_INIT(&efd->fd_taps); | 
 | 	spinlock_init(&efd->tap_lock); | 
 | 	rendez_init(&efd->rv_readers); | 
 | 	rendez_init(&efd->rv_writers); | 
 | 	/* Attach and walk are the two sources of chans.  Each returns a refcnt'd | 
 | 	 * object, for the most part. */ | 
 | 	kref_init(&efd->refcnt, efd_release, 1); | 
 | 	/* nothing special in the qid to ID this eventfd.  the main thing is the | 
 | 	 * aux.  we could put a debugging ID in the path like pipe. */ | 
 | 	mkqid(&c->qid, Qdir, 0, QTDIR); | 
 | 	c->aux = efd; | 
 | 	/* just to be fancy and remove a syscall, if they pass spec == "sem", then | 
 | 	 * we'll treat them as being in semaphore mode. */ | 
 | 	if (!strcmp(spec, "sem")) | 
 | 		efd->flags |= EFD_SEMAPHORE; | 
 | 	return c; | 
 | } | 
 |  | 
 | static struct walkqid *efd_walk(struct chan *c, struct chan *nc, char **name, | 
 | 								unsigned int nname) | 
 | { | 
 | 	struct walkqid *wq; | 
 | 	struct eventfd *efd = c->aux; | 
 |  | 
 | 	wq = devwalk(c, nc, name, nname, efd_dir, ARRAY_SIZE(efd_dir), devgen); | 
 | 	/* Walk is a source of a distinct chan from this device.  The other source | 
 | 	 * is attach.  Once created, these chans will eventually be closed, and when | 
 | 	 * they close, they will decref their aux, efd.  All chans within this | 
 | 	 * *instance* of eventfd share the same efd.  Each one will have one refcnt. | 
 | 	 * Each chan may also have several copies of its pointer out there (e.g. FD | 
 | 	 * dup), all of which have their own *chan* refcnt. | 
 | 	 * | 
 | 	 * All of the above applies on successful walks that found all nname parts | 
 | 	 * of the path.  A mid-success is wq: we got something.  wq->clone means we | 
 | 	 * got to the end and the "big walk" considers this a success. | 
 | 	 * | 
 | 	 * There is a slight chance the new chan is the same as our original chan | 
 | 	 * (if nc == c when we're called).  In which case, there's only one chan. | 
 | 	 * The number of refs on efd == the number of distinct chans within this | 
 | 	 * instance of #eventfd. */ | 
 | 	if (wq != NULL && wq->clone != NULL && wq->clone != c) | 
 | 		kref_get(&efd->refcnt, 1); | 
 | 	return wq; | 
 | } | 
 |  | 
 | /* In the future, we could use stat / wstat to get and set O_NONBLOCK */ | 
 | static size_t efd_stat(struct chan *c, uint8_t *db, size_t n) | 
 | { | 
 | 	return devstat(c, db, n, efd_dir, ARRAY_SIZE(efd_dir), devgen); | 
 | } | 
 |  | 
 | static struct chan *efd_open(struct chan *c, int omode) | 
 | { | 
 | 	return devopen(c, omode, efd_dir, ARRAY_SIZE(efd_dir), devgen); | 
 | } | 
 |  | 
 | static void efd_close(struct chan *c) | 
 | { | 
 | 	struct eventfd *efd = c->aux; | 
 | 	/* Here's where we put the ref from attach and successful walks */ | 
 | 	kref_put(&efd->refcnt); | 
 | } | 
 |  | 
 | static void efd_fire_taps(struct eventfd *efd, int filter) | 
 | { | 
 | 	struct fd_tap *tap_i; | 
 | 	if (SLIST_EMPTY(&efd->fd_taps)) | 
 | 		return; | 
 | 	/* We're not expecting many FD taps, so it's not worth splitting readers | 
 | 	 * from writers or anything like that. | 
 | 	 * TODO: (RCU) Locking to protect the list and the tap's existence. */ | 
 | 	spin_lock(&efd->tap_lock); | 
 | 	SLIST_FOREACH(tap_i, &efd->fd_taps, link) | 
 | 		fire_tap(tap_i, filter); | 
 | 	spin_unlock(&efd->tap_lock); | 
 | } | 
 |  | 
 | static int has_counts(void *arg) | 
 | { | 
 | 	struct eventfd *efd = arg; | 
 | 	return atomic_read(&efd->counter) != 0; | 
 | } | 
 |  | 
 | /* The heart of reading an eventfd */ | 
 | static unsigned long efd_read_efd(struct eventfd *efd, struct chan *c) | 
 | { | 
 | 	unsigned long old_count, new_count, ret; | 
 | 	while (1) { | 
 | 		old_count = atomic_read(&efd->counter); | 
 | 		if (!old_count) { | 
 | 			if (c->flag & O_NONBLOCK) | 
 | 				error(EAGAIN, "Would block on #%s read", devname()); | 
 | 			rendez_sleep(&efd->rv_readers, has_counts, efd); | 
 | 		} else { | 
 | 			if (efd->flags & EFD_SEMAPHORE) { | 
 | 				new_count = old_count - 1; | 
 | 				ret = 1; | 
 | 			} else { | 
 | 				new_count = 0; | 
 | 				ret = old_count; | 
 | 			} | 
 | 			if (atomic_cas(&efd->counter, old_count, new_count)) | 
 | 				goto success; | 
 | 		} | 
 | 	} | 
 | success: | 
 | 	rendez_wakeup(&efd->rv_writers); | 
 | 	efd_fire_taps(efd, FDTAP_FILT_WRITABLE); | 
 | 	return ret; | 
 | } | 
 |  | 
 | static size_t efd_read(struct chan *c, void *ubuf, size_t n, off64_t offset) | 
 | { | 
 | 	struct eventfd *efd = c->aux; | 
 |  | 
 | 	switch (c->qid.path) { | 
 | 		case Qdir: | 
 | 			return devdirread(c, ubuf, n, efd_dir, ARRAY_SIZE(efd_dir), | 
 | 							  devgen); | 
 | 		case Qctl: | 
 | 			return readnum(offset, ubuf, n, efd->flags, NUMSIZE32); | 
 | 		case Qefd: | 
 | 			/* ignoring the chan offset for Qefd */ | 
 | 			return readnum(0, ubuf, n, efd_read_efd(efd, c), | 
 | 						   NUMSIZE64); | 
 | 		default: | 
 | 			panic("Bad Qid %p!", c->qid.path); | 
 | 	} | 
 | 	return -1; | 
 | } | 
 |  | 
 | static int has_room(void *arg) | 
 | { | 
 | 	struct eventfd *efd = arg; | 
 | 	return atomic_read(&efd->counter) != EFD_MAX_VAL; | 
 | } | 
 |  | 
 | /* The heart of writing an eventfd */ | 
 | static void efd_write_efd(struct eventfd *efd, unsigned long add_to, | 
 |                           struct chan *c) | 
 | { | 
 | 	unsigned long old_count, new_count; | 
 | 	while (1) { | 
 | 		old_count = atomic_read(&efd->counter); | 
 | 		new_count = old_count + add_to; | 
 | 		if (new_count > EFD_MAX_VAL) { | 
 | 			if (c->flag & O_NONBLOCK) | 
 | 				error(EAGAIN, "Would block on #%s write", devname()); | 
 | 			rendez_sleep(&efd->rv_writers, has_room, efd); | 
 | 		} else { | 
 | 			if (atomic_cas(&efd->counter, old_count, new_count)) | 
 | 				goto success; | 
 | 		} | 
 | 	} | 
 | success: | 
 | 	rendez_wakeup(&efd->rv_readers); | 
 | 	efd_fire_taps(efd, FDTAP_FILT_READABLE); | 
 | } | 
 |  | 
 | static size_t efd_write(struct chan *c, void *ubuf, size_t n, off64_t offset) | 
 | { | 
 | 	struct eventfd *efd = c->aux; | 
 | 	unsigned long write_val; | 
 | 	char num64[NUMSIZE64]; | 
 |  | 
 | 	switch (c->qid.path) { | 
 | 		case Qctl: | 
 | 			/* If we want to allow runtime changing of settings, we can do it | 
 | 			 * here. */ | 
 | 			error(EFAIL, "No #%s ctl commands supported", devname()); | 
 | 			break; | 
 | 		case Qefd: | 
 | 			/* We want to give strtoul a null-terminated buf (can't handle | 
 | 			 * arbitrary user strings).  Ignoring the chan offset too. */ | 
 | 			if (n > sizeof(num64)) | 
 | 				error(EAGAIN, "attempted to write %d chars, max %d", n, | 
 | 					  sizeof(num64)); | 
 | 			memcpy(num64, ubuf, n); | 
 | 			num64[n] = 0;	/* enforce trailing 0 */ | 
 | 			write_val = strtoul(num64, 0, 0); | 
 | 			if (write_val == (unsigned long)(-1)) | 
 | 				error(EFAIL, "Eventfd write must not be -1"); | 
 | 			efd_write_efd(efd, write_val, c); | 
 | 			break; | 
 | 		default: | 
 | 			panic("Bad Qid %p!", c->qid.path); | 
 | 	} | 
 | 	return n; | 
 | } | 
 |  | 
 | static char *efd_chaninfo(struct chan *c, char *ret, size_t ret_l) | 
 | { | 
 | 	struct eventfd *efd = c->aux; | 
 |  | 
 | 	snprintf(ret, ret_l, "QID type %s, flags %p, counter %p", | 
 | 	         efd_dir[c->qid.path].name, efd->flags, atomic_read(&efd->counter)); | 
 | 	return ret; | 
 | } | 
 |  | 
 | static int efd_tapfd(struct chan *c, struct fd_tap *tap, int cmd) | 
 | { | 
 | 	struct eventfd *efd = c->aux; | 
 | 	int ret; | 
 |  | 
 | 	/* HANGUP, ERROR, and PRIORITY will never fire, but people can ask for them. | 
 | 	 * We don't actually support HANGUP, but epoll implies it.  Linux's eventfd | 
 | 	 * cand have ERROR, so apps can ask for it.  Likewise, priority is | 
 | 	 * meaningless for us, but sometimes people ask for it. */ | 
 | 	#define EFD_LEGAL_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE |        \ | 
 | 	                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |          \ | 
 | 	                        FDTAP_FILT_ERROR) | 
 |  | 
 | 	switch (c->qid.path) { | 
 | 		case Qefd: | 
 | 			if (tap->filter & ~EFD_LEGAL_TAPS) { | 
 | 				set_error(ENOSYS, "Unsupported #%s tap, must be %p", devname(), | 
 | 						  EFD_LEGAL_TAPS); | 
 | 				return -1; | 
 | 			} | 
 | 			spin_lock(&efd->tap_lock); | 
 | 			switch (cmd) { | 
 | 				case (FDTAP_CMD_ADD): | 
 | 					SLIST_INSERT_HEAD(&efd->fd_taps, tap, link); | 
 | 					ret = 0; | 
 | 					break; | 
 | 				case (FDTAP_CMD_REM): | 
 | 					SLIST_REMOVE(&efd->fd_taps, tap, fd_tap, link); | 
 | 					ret = 0; | 
 | 					break; | 
 | 				default: | 
 | 					set_error(ENOSYS, "Unsupported #%s tap command %p", | 
 | 							  devname(), cmd); | 
 | 					ret = -1; | 
 | 			} | 
 | 			spin_unlock(&efd->tap_lock); | 
 | 			return ret; | 
 | 		default: | 
 | 			set_error(ENOSYS, "Can't tap #%s file type %d", devname(), | 
 | 			          c->qid.path); | 
 | 			return -1; | 
 | 	} | 
 | } | 
 |  | 
 | struct dev efd_devtab __devtab = { | 
 | 	.name = "eventfd", | 
 | 	.reset = devreset, | 
 | 	.init = devinit, | 
 | 	.shutdown = devshutdown, | 
 | 	.attach = efd_attach, | 
 | 	.walk = efd_walk, | 
 | 	.stat = efd_stat, | 
 | 	.open = efd_open, | 
 | 	.create = devcreate, | 
 | 	.close = efd_close, | 
 | 	.read = efd_read, | 
 | 	.bread = devbread, | 
 | 	.write = efd_write, | 
 | 	.bwrite = devbwrite, | 
 | 	.remove = devremove, | 
 | 	.wstat = devwstat, | 
 | 	.power = devpower, | 
 | 	.chaninfo = efd_chaninfo, | 
 | 	.tapfd = efd_tapfd, | 
 | }; |