/* Copyright (c) 2015 Google Inc.
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Coalescing Event Queue: encapsulates the essence of epoll/kqueue in shared
 * memory: a dense array of sticky status bits.
 *
 * Kernel side (producer)
 *
 * All of the printks are just us helping the user debug their CEQs. */
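/* Rough layout, as used below (the real definitions live in ceq.h): ceq->events
 * is a dense array of ceq_event entries indexed by event type, each holding a
 * coalesced value ('coalesce', OR'd or summed), the latest blob_data, and an
 * idx_posted flag.  ceq->ring is a power-of-two ring of int32_t event types,
 * bounded by prod_idx / cons_pub_idx.  The consumer side lives in userspace,
 * not in this file. */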

#include <ceq.h>
#include <process.h>
#include <stdio.h>
#include <umem.h>

static void error_addr(struct ceq *ceq, struct proc *p, void *addr)
{
	printk("[kernel] Invalid ceq (%p) bad addr %p for proc %d\n", ceq,
	       addr, p->pid);
}

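/* Monotonically raises max_event_ever to new_max.  Lock-free: a CAS loop, and
 * anyone who sees an equal or larger value just leaves. */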
static void ceq_update_max_event(struct ceq *ceq, unsigned int new_max)
{
	unsigned int old_max;

	do {
		old_max = atomic_read(&ceq->max_event_ever);
		if (new_max <= old_max)
			return;
	} while (!atomic_cas(&ceq->max_event_ever, old_max, new_max));
}

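/* Posts msg into the CEQ: coalesces ev_arg2 and stores ev_arg3 (the blob) in
 * the per-event-type entry, then, unless the entry was already flagged, marks
 * it posted and publishes its event type in the ring so the consumer knows to
 * look at it. */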
void send_ceq_msg(struct ceq *ceq, struct proc *p, struct event_msg *msg)
{
	struct ceq_event *ceq_ev;
	int32_t *ring_slot;
	unsigned long my_slot;
	int loops = 0;
	#define NR_RING_TRIES 10

	/* should have been checked by the kernel func that called us */
	assert(is_user_rwaddr(ceq, sizeof(struct ceq)));
	if (msg->ev_type >= ceq->nr_events) {
		printk("[kernel] CEQ %p too small.  Wanted %d, had %d\n", ceq,
		       msg->ev_type, ceq->nr_events);
		return;
	}
	ceq_update_max_event(ceq, msg->ev_type);
	/* ACCESS_ONCE, prevent the compiler from rereading ceq->events later,
	 * and possibly getting a new, illegal version after our check */
	ceq_ev = &(ACCESS_ONCE(ceq->events))[msg->ev_type];
	if (!is_user_rwaddr(ceq_ev, sizeof(struct ceq_event))) {
		error_addr(ceq, p, ceq_ev);
		return;
	}
	/* ideally, we'd like the blob to be posted after the coal, so that the
	 * 'reason' for the blob is present when the blob is.  but we can't
	 * guarantee that.  after we write the coal, the cons could consume
	 * that.  then the next time it looks at us, it could just see the blob
	 * - so there's no good way to keep them together.  the user will just
	 *   have to deal with it.  in that case, we might as well do it first,
	 *   to utilize the atomic op's memory barrier. */
	ceq_ev->blob_data = (uint64_t)msg->ev_arg3;
	switch (ceq->operation) {
	case (CEQ_OR):
		atomic_or(&ceq_ev->coalesce, msg->ev_arg2);
		break;
	case (CEQ_ADD):
		atomic_add(&ceq_ev->coalesce, msg->ev_arg2);
		break;
	default:
		printk("[kernel] CEQ %p invalid op %d\n", ceq, ceq->operation);
		return;
	}
	/* write before checking if we need to post (covered by the atomic) */
	if (ceq_ev->idx_posted) {
		/* our entry was updated and posted was still set: we know the
		 * consumer will still check it, so we can safely leave.  If we
		 * ever have exit codes or something from send_*_msg, then we
		 * can tell the kernel to not bother with INDIRS/IPIs/etc.  This
		 * is unnecessary now since INDIRs are throttled */
		return;
	}
	/* at this point, we need to make sure the cons looks at our entry.  it
	 * may have already done so while we were mucking around, but 'poking'
	 * them to look again can't hurt */
	ceq_ev->idx_posted = TRUE;
	/* The idx_posted write must happen before the writes that post this idx
	 * into the ring (the ring slot write and prod_idx CAS below).  cmb()
	 * stops the compiler; the atomic CAS provides the CPU mb(). */
	cmb();
	/* I considered checking the buffer for full-ness or the ceq overflow
	 * here.  Those would be reads, which would require a wrmb() right above
	 * for every ring post, all for something we check for later anyways and
	 * for something that should be rare.  In return, when we are
	 * overflowed, which should be rare if the user sizes their ring buffer
	 * appropriately, we go through a little more hassle below. */
	/* I tried doing this with fetch_and_add to avoid the while loop and
	 * picking a number of times to try.  The trick is that you need to back
	 * out, and could have multiple producers working on the same slot.
	 * Although the overflow makes it okay for the producers' idxes to be
	 * clobbered, it's not okay to have two producers on the same slot,
	 * since there'd only be one consumer.  Theoretically, you could have a
	 * producer delayed a long time that just clobbers an index at some
	 * point in the future, or leaves an index in the non-init state (-1).
	 * It's a mess. */
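	/* So instead: reserve a slot with a bounded CAS loop, and if the ring
	 * looks full or we lose the race NR_RING_TRIES times, just flag
	 * ring_overflowed and bail (presumably handled on the consumer side). */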
	do {
		cmb();	/* reread the indices */
		my_slot = atomic_read(&ceq->prod_idx);
		if (__ring_full(ceq->ring_sz, my_slot,
		                atomic_read(&ceq->cons_pub_idx))) {
			ceq->ring_overflowed = TRUE;
			return;
		}
		if (loops++ == NR_RING_TRIES) {
			ceq->ring_overflowed = TRUE;
			return;
		}
	} while (!atomic_cas(&ceq->prod_idx, my_slot, my_slot + 1));
	/* ring_slot is a user pointer, calculated by ring, my_slot, and sz */
	ring_slot = &(ACCESS_ONCE(ceq->ring))[my_slot & (ceq->ring_sz - 1)];
	if (!is_user_rwaddr(ring_slot, sizeof(int32_t))) {
		/* This is a serious user error.  We're just bailing out, and
		 * any consumers might be spinning waiting on us to produce.
		 * Probably not though, since the ring slot is bad memory. */
		error_addr(ceq, p, ring_slot);
		return;
	}
	/* At this point, we have a valid slot */
	*ring_slot = msg->ev_type;
}

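/* Debugging helper: prints the CEQ behind an EV_MBOX_CEQ event queue for the
 * given pid.  We switch_to() the proc so its user pointers are valid while we
 * read them. */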
void ceq_dumper(int pid, struct event_queue *ev_q)
{
	struct proc *p;
	uintptr_t switch_state;
	struct ceq *ceq;

	p = pid2proc(pid);
	if (!p) {
		printk("No such proc %d\n", pid);
		return;
	}
	switch_state = switch_to(p);
	if (ev_q->ev_mbox->type != EV_MBOX_CEQ) {
		printk("Not a CEQ evq (type %d)\n", ev_q->ev_mbox->type);
		goto out;
	}
	ceq = &ev_q->ev_mbox->ceq;
	printk("CEQ %p\n---------------\n"
	       "\tevents ptr %p\n"
	       "\tnr_events %d\n"
	       "\tlast_recovered %d\n"
	       "\tmax_event_ever %ld\n"
	       "\tring %p\n"
	       "\tring_sz %d\n"
	       "\toperation %d\n"
	       "\tring_overflowed %d\n"
	       "\toverflow_recovery %d\n"
	       "\tprod_idx %lu\n"
	       "\tcons_pub_idx %lu\n"
	       "\tcons_pvt_idx %lu\n"
	       "\n",
	       ceq,
	       ceq->events,
	       ceq->nr_events,
	       ceq->last_recovered,
	       atomic_read(&ceq->max_event_ever),
	       ceq->ring,
	       ceq->ring_sz,
	       ceq->operation,
	       ceq->ring_overflowed,
	       ceq->overflow_recovery,
	       atomic_read(&ceq->prod_idx),
	       atomic_read(&ceq->cons_pub_idx),
	       atomic_read(&ceq->cons_pvt_idx));
	for (int i = 0; i < atomic_read(&ceq->max_event_ever) + 1; i++)
		printk("\tEvent%3d, coal %p, blob %p, idx_posted %d, user %p\n",
		       i, atomic_read(&ceq->events[i].coalesce),
		       ceq->events[i].blob_data,
		       ceq->events[i].idx_posted,
		       ceq->events[i].user_data);
out:
	switch_back(p, switch_state);
	proc_decref(p);
}