kern/src/ceq.c - upstream - Git at Google

 /* Copyright (c) 2015 Google Inc.
  * Barret Rhoden <brho@cs.berkeley.edu>
  * See LICENSE for details.
  *
  * Coalescing Event Queue: encapuslates the essence of epoll/kqueue in shared
  * memory: a dense array of sticky status bits.
  *
  * Kernel side (producer)
  *
  * All of the printks are just us helping the user debug their CEQs. */

 #include <ceq.h>
 #include <process.h>
 #include <stdio.h>
 #include <umem.h>

 static void error_addr(struct ceq *ceq, struct proc *p, void *addr)
 {
 	printk("[kernel] Invalid ceq (%p) bad addr %p for proc %d\n", ceq,
 	       addr, p->pid);
 }

 static void ceq_update_max_event(struct ceq *ceq, unsigned int new_max)
 {
 	unsigned int old_max;

 	do {
 		old_max = atomic_read(&ceq->max_event_ever);
 		if (new_max <= old_max)
 			return;
 	} while (!atomic_cas(&ceq->max_event_ever, old_max, new_max));
 }

 void send_ceq_msg(struct ceq *ceq, struct proc *p, struct event_msg *msg)
 {
 	struct ceq_event *ceq_ev;
 	int32_t *ring_slot;
 	unsigned long my_slot;
 	int loops = 0;
 	#define NR_RING_TRIES 10

 	/* should have been checked by the kernel func that called us */
 	assert(is_user_rwaddr(ceq, sizeof(struct ceq)));
 	if (msg->ev_type >= ceq->nr_events) {
 		printk("[kernel] CEQ %p too small.  Wanted %d, had %d\n", ceq,
 		       msg->ev_type, ceq->nr_events);
 		return;
 	}
 	ceq_update_max_event(ceq, msg->ev_type);
 	/* ACCESS_ONCE, prevent the compiler from rereading ceq->events later, and
 	 * possibly getting a new, illegal version after our check */
 	ceq_ev = &(ACCESS_ONCE(ceq->events))[msg->ev_type];
 	if (!is_user_rwaddr(ceq_ev, sizeof(struct ceq_event))) {
 		error_addr(ceq, p, ceq);
 		return;
 	}
 	/* ideally, we'd like the blob to be posted after the coal, so that the
 	 * 'reason' for the blob is present when the blob is.  but we can't
 	 * guarantee that.  after we write the coal, the cons could consume that.
 	 * then the next time it looks at us, it could just see the blob - so
 	 * there's no good way to keep them together.  the user will just have to
 	 * deal with it.  in that case, we might as well do it first, to utilize the
 	 * atomic ops's memory barrier. */
 	ceq_ev->blob_data = (uint64_t)msg->ev_arg3;
 	switch (ceq->operation) {
 		case (CEQ_OR):
 			atomic_or(&ceq_ev->coalesce, msg->ev_arg2);
 			break;
 		case (CEQ_ADD):
 			atomic_add(&ceq_ev->coalesce, msg->ev_arg2);
 			break;
 		default:
 			printk("[kernel] CEQ %p invalid op %d\n", ceq, ceq->operation);
 			return;
 	}
 	/* write before checking if we need to post (covered by the atomic) */
 	if (ceq_ev->idx_posted) {
 		/* our entry was updated and posted was still set: we know the consumer
 		 * will still check it, so we can safely leave.  If we ever have exit
 		 * codes or something from send_*_msg, then we can tell the kernel to
 		 * not bother with INDIRS/IPIs/etc.  This is unnecessary now since
 		 * INDIRs are throttled */
 		return;
 	}
 	/* at this point, we need to make sure the cons looks at our entry.  it may
 	 * have already done so while we were mucking around, but 'poking' them to
 	 * look again can't hurt */
 	ceq_ev->idx_posted = TRUE;
 	/* idx_posted write happens before the writes posting it.  the following
 	 * atomic provides the cpu mb() */
 	cmb();
 	/* I considered checking the buffer for full-ness or the ceq overflow here.
 	 * Those would be reads, which would require a wrmb() right above for every
 	 * ring post, all for something we check for later anyways and for something
 	 * that should be rare.  In return, when we are overflowed, which should be
 	 * rare if the user sizes their ring buffer appropriately, we go through a
 	 * little more hassle below. */
 	/* I tried doing this with fetch_and_add to avoid the while loop and picking
 	 * a number of times to try.  The trick is that you need to back out, and
 	 * could have multiple producers working on the same slot.  Although the
 	 * overflow makes it okay for the producers idxes to be clobbered, it's not
 	 * okay to have two producers on the same slot, since there'd only be one
 	 * consumer.  Theoretically, you could have a producer delayed a long time
 	 * that just clobbers an index at some point in the future, or leaves an
 	 * index in the non-init state (-1).  It's a mess. */
 	do {
 		cmb();	/* reread the indices */
 		my_slot = atomic_read(&ceq->prod_idx);
 		if (__ring_full(ceq->ring_sz, my_slot,
 		                atomic_read(&ceq->cons_pub_idx))) {
 			ceq->ring_overflowed = TRUE;
 			return;
 		}
 		if (loops++ == NR_RING_TRIES) {
 			ceq->ring_overflowed = TRUE;
 			return;
 		}
 	} while (!atomic_cas(&ceq->prod_idx, my_slot, my_slot + 1));
 	/* ring_slot is a user pointer, calculated by ring, my_slot, and sz */
 	ring_slot = &(ACCESS_ONCE(ceq->ring))[my_slot & (ceq->ring_sz - 1)];
 	if (!is_user_rwaddr(ring_slot, sizeof(int32_t))) {
 		/* This is a serious user error.  We're just bailing out, and any
 		 * consumers might be spinning waiting on us to produce.  Probably not
 		 * though, since the ring slot is bad memory. */
 		error_addr(ceq, p, ring_slot);
 		return;
 	}
 	/* At this point, we have a valid slot */
 	*ring_slot = msg->ev_type;
 }

 void ceq_dumper(int pid, struct event_queue *ev_q)
 {
 	struct proc *p;
 	uintptr_t switch_state;
 	struct ceq *ceq;

 	p = pid2proc(pid);
 	if (!p) {
 		printk("No such proc %d\n", pid);
 		return;
 	}
 	switch_state = switch_to(p);
 	if (ev_q->ev_mbox->type != EV_MBOX_CEQ) {
 		printk("Not a CEQ evq (type %d)\n", ev_q->ev_mbox->type);
 		goto out;
 	}
 	ceq = &ev_q->ev_mbox->ceq;
 	printk("CEQ %p\n---------------\n"
 	       "\tevents ptr %p\n"
 	       "\tnr_events %d\n"
 	       "\tlast_recovered %d\n"
 	       "\tmax_event_ever %ld\n"
 	       "\tring %p\n"
 	       "\tring_sz %d\n"
 	       "\toperation %d\n"
 	       "\tring_overflowed %d\n"
 	       "\toverflow_recovery %d\n"
 	       "\tprod_idx %lu\n"
 	       "\tcons_pub_idx %lu\n"
 	       "\tcons_pvt_idx %lu\n"
 	       "\n",
 		   ceq,
 	       ceq->events,
 	       ceq->nr_events,
 	       ceq->last_recovered,
 	       atomic_read(&ceq->max_event_ever),
 	       ceq->ring,
 	       ceq->ring_sz,
 	       ceq->operation,
 	       ceq->ring_overflowed,
 	       ceq->overflow_recovery,
 	       atomic_read(&ceq->prod_idx),
 	       atomic_read(&ceq->cons_pub_idx),
 	       atomic_read(&ceq->cons_pvt_idx));
 	for (int i = 0; i < atomic_read(&ceq->max_event_ever) + 1; i++)
 		printk("\tEvent %3d, coal %p, blob %p, idx_posted %d, user %p\n", i,
 		       atomic_read(&ceq->events[i].coalesce),
 		       ceq->events[i].blob_data,
 		       ceq->events[i].idx_posted,
 		       ceq->events[i].user_data);
 out:
 	switch_back(p, switch_state);
 	proc_decref(p);
 }
	/* Copyright (c) 2015 Google Inc.
	* Barret Rhoden <brho@cs.berkeley.edu>
	* See LICENSE for details.
	*
	* Coalescing Event Queue: encapuslates the essence of epoll/kqueue in shared
	* memory: a dense array of sticky status bits.
	*
	* Kernel side (producer)
	*
	* All of the printks are just us helping the user debug their CEQs. */

	#include <ceq.h>
	#include <process.h>
	#include <stdio.h>
	#include <umem.h>

	static void error_addr(struct ceq ceq, struct proc p, void *addr)
	{
	printk("[kernel] Invalid ceq (%p) bad addr %p for proc %d\n", ceq,
	addr, p->pid);
	}

	static void ceq_update_max_event(struct ceq *ceq, unsigned int new_max)
	{
	unsigned int old_max;

	do {
	old_max = atomic_read(&ceq->max_event_ever);
	if (new_max <= old_max)
	return;
	} while (!atomic_cas(&ceq->max_event_ever, old_max, new_max));
	}

	void send_ceq_msg(struct ceq ceq, struct proc p, struct event_msg *msg)
	{
	struct ceq_event *ceq_ev;
	int32_t *ring_slot;
	unsigned long my_slot;
	int loops = 0;
	#define NR_RING_TRIES 10

	/* should have been checked by the kernel func that called us */
	assert(is_user_rwaddr(ceq, sizeof(struct ceq)));
	if (msg->ev_type >= ceq->nr_events) {
	printk("[kernel] CEQ %p too small. Wanted %d, had %d\n", ceq,
	msg->ev_type, ceq->nr_events);
	return;
	}
	ceq_update_max_event(ceq, msg->ev_type);
	/* ACCESS_ONCE, prevent the compiler from rereading ceq->events later, and
	* possibly getting a new, illegal version after our check */
	ceq_ev = &(ACCESS_ONCE(ceq->events))[msg->ev_type];
	if (!is_user_rwaddr(ceq_ev, sizeof(struct ceq_event))) {
	error_addr(ceq, p, ceq);
	return;
	}
	/* ideally, we'd like the blob to be posted after the coal, so that the
	* 'reason' for the blob is present when the blob is. but we can't
	* guarantee that. after we write the coal, the cons could consume that.
	* then the next time it looks at us, it could just see the blob - so
	* there's no good way to keep them together. the user will just have to
	* deal with it. in that case, we might as well do it first, to utilize the
	* atomic ops's memory barrier. */
	ceq_ev->blob_data = (uint64_t)msg->ev_arg3;
	switch (ceq->operation) {
	case (CEQ_OR):
	atomic_or(&ceq_ev->coalesce, msg->ev_arg2);
	break;
	case (CEQ_ADD):
	atomic_add(&ceq_ev->coalesce, msg->ev_arg2);
	break;
	default:
	printk("[kernel] CEQ %p invalid op %d\n", ceq, ceq->operation);
	return;
	}
	/* write before checking if we need to post (covered by the atomic) */
	if (ceq_ev->idx_posted) {
	/* our entry was updated and posted was still set: we know the consumer
	* will still check it, so we can safely leave. If we ever have exit
	* codes or something from send_*_msg, then we can tell the kernel to
	* not bother with INDIRS/IPIs/etc. This is unnecessary now since
	* INDIRs are throttled */
	return;
	}
	/* at this point, we need to make sure the cons looks at our entry. it may
	* have already done so while we were mucking around, but 'poking' them to
	* look again can't hurt */
	ceq_ev->idx_posted = TRUE;
	/* idx_posted write happens before the writes posting it. the following
	* atomic provides the cpu mb() */
	cmb();
	/* I considered checking the buffer for full-ness or the ceq overflow here.
	* Those would be reads, which would require a wrmb() right above for every
	* ring post, all for something we check for later anyways and for something
	* that should be rare. In return, when we are overflowed, which should be
	* rare if the user sizes their ring buffer appropriately, we go through a
	* little more hassle below. */
	/* I tried doing this with fetch_and_add to avoid the while loop and picking
	* a number of times to try. The trick is that you need to back out, and
	* could have multiple producers working on the same slot. Although the
	* overflow makes it okay for the producers idxes to be clobbered, it's not
	* okay to have two producers on the same slot, since there'd only be one
	* consumer. Theoretically, you could have a producer delayed a long time
	* that just clobbers an index at some point in the future, or leaves an
	* index in the non-init state (-1). It's a mess. */
	do {
	cmb(); /* reread the indices */
	my_slot = atomic_read(&ceq->prod_idx);
	if (__ring_full(ceq->ring_sz, my_slot,
	atomic_read(&ceq->cons_pub_idx))) {
	ceq->ring_overflowed = TRUE;
	return;
	}
	if (loops++ == NR_RING_TRIES) {
	ceq->ring_overflowed = TRUE;
	return;
	}
	} while (!atomic_cas(&ceq->prod_idx, my_slot, my_slot + 1));
	/* ring_slot is a user pointer, calculated by ring, my_slot, and sz */
	ring_slot = &(ACCESS_ONCE(ceq->ring))[my_slot & (ceq->ring_sz - 1)];
	if (!is_user_rwaddr(ring_slot, sizeof(int32_t))) {
	/* This is a serious user error. We're just bailing out, and any
	* consumers might be spinning waiting on us to produce. Probably not
	* though, since the ring slot is bad memory. */
	error_addr(ceq, p, ring_slot);
	return;
	}
	/* At this point, we have a valid slot */
	*ring_slot = msg->ev_type;
	}

	void ceq_dumper(int pid, struct event_queue *ev_q)
	{
	struct proc *p;
	uintptr_t switch_state;
	struct ceq *ceq;

	p = pid2proc(pid);
	if (!p) {
	printk("No such proc %d\n", pid);
	return;
	}
	switch_state = switch_to(p);
	if (ev_q->ev_mbox->type != EV_MBOX_CEQ) {
	printk("Not a CEQ evq (type %d)\n", ev_q->ev_mbox->type);
	goto out;
	}
	ceq = &ev_q->ev_mbox->ceq;
	printk("CEQ %p\n---------------\n"
	"\tevents ptr %p\n"
	"\tnr_events %d\n"
	"\tlast_recovered %d\n"
	"\tmax_event_ever %ld\n"
	"\tring %p\n"
	"\tring_sz %d\n"
	"\toperation %d\n"
	"\tring_overflowed %d\n"
	"\toverflow_recovery %d\n"
	"\tprod_idx %lu\n"
	"\tcons_pub_idx %lu\n"
	"\tcons_pvt_idx %lu\n"
	"\n",
	ceq,
	ceq->events,
	ceq->nr_events,
	ceq->last_recovered,
	atomic_read(&ceq->max_event_ever),
	ceq->ring,
	ceq->ring_sz,
	ceq->operation,
	ceq->ring_overflowed,
	ceq->overflow_recovery,
	atomic_read(&ceq->prod_idx),
	atomic_read(&ceq->cons_pub_idx),
	atomic_read(&ceq->cons_pvt_idx));
	for (int i = 0; i < atomic_read(&ceq->max_event_ever) + 1; i++)
	printk("\tEvent %3d, coal %p, blob %p, idx_posted %d, user %p\n", i,
	atomic_read(&ceq->events[i].coalesce),
	ceq->events[i].blob_data,
	ceq->events[i].idx_posted,
	ceq->events[i].user_data);
	out:
	switch_back(p, switch_state);
	proc_decref(p);
	}