/* Copyright (c) 2011-2014 The Regents of the University of California
* Copyright (c) 2015 Google Inc
* Barret Rhoden <brho@cs.berkeley.edu>
* See LICENSE for details.
*
* Userspace utility functions for receiving events and notifications (IPIs).
* Some are higher level than others; just use what you need. */
#include <ros/event.h>
#include <ros/procdata.h>
#include <parlib/ucq.h>
#include <parlib/evbitmap.h>
#include <parlib/ceq.h>
#include <parlib/vcore.h>
#include <stdlib.h>
#include <string.h>
#include <parlib/assert.h>
#include <parlib/stdio.h>
#include <errno.h>
#include <parlib/parlib.h>
#include <parlib/event.h>
#include <parlib/uthread.h>
#include <parlib/spinlock.h>
#include <parlib/mcs.h>
#include <parlib/poke.h>
#include <sys/queue.h>
#include <malloc.h>
/* For remote VCPD mbox event handling */
__thread bool __vc_handle_an_mbox = FALSE;
__thread uint32_t __vc_rem_vcoreid;
/********* Event_q Setup / Registration ***********/
/* Get event_qs via these interfaces, since eventually we'll want to either
* allocate from pinned memory or use some form of a slab allocator. Also,
* these stitch up the big_q so its ev_mbox points to its internal mbox. Never
* access the internal mbox directly.
*
* Raw ones need to have their mailboxes initialized. If you're making a lot of
* these and they perform their own mmaps (e.g. UCQs), you can do one big mmap
* and init the ucqs on your own, which ought to perform better.
*
* Use the 'regular' one for big_qs if you don't want to worry about the mbox
 * initialization */
struct event_queue *get_eventq_raw(void)
{
/* TODO: (PIN) should be pinned memory */
struct event_queue_big *big_q = malloc(sizeof(struct event_queue_big));
memset(big_q, 0, sizeof(struct event_queue_big));
big_q->ev_mbox = &big_q->ev_imbox;
return (struct event_queue*)big_q;
}
struct event_queue *get_eventq(int mbox_type)
{
struct event_queue *big_q = get_eventq_raw();
event_mbox_init(big_q->ev_mbox, mbox_type);
return big_q;
}
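/* Illustrative sketch: the common path is to grab a big_q whose mbox is a UCQ
 * and return it when done. Anything in between (registering it with the
 * kernel, pulling messages out of ev_q->ev_mbox) is up to the caller.
 *
 *	struct event_queue *ev_q = get_eventq(EV_MBOX_UCQ);
 *
 *	// ... register ev_q somewhere / consume from ev_q->ev_mbox ...
 *	put_eventq(ev_q);
 */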
/* Basic initialization of a single mbox. If you know the type, you can set up
* the mbox manually with possibly better performance. For instance, ucq_init()
* calls mmap internally. You could mmap a huge blob on your own and call
* ucq_raw_init (don't forget to set the mbox_type!) */
void event_mbox_init(struct event_mbox *ev_mbox, int mbox_type)
{
ev_mbox->type = mbox_type;
switch (ev_mbox->type) {
case (EV_MBOX_UCQ):
ucq_init(&ev_mbox->ucq);
break;
case (EV_MBOX_BITMAP):
evbitmap_init(&ev_mbox->evbm);
break;
case (EV_MBOX_CEQ):
ceq_init(&ev_mbox->ceq, CEQ_OR, CEQ_DEFAULT_SZ, CEQ_DEFAULT_SZ);
break;
default:
printf("Unknown mbox type %d!\n", ev_mbox->type);
break;
}
}
/* Give it up. I don't recommend calling these unless you're sure the queues
* aren't in use (unregistered, etc). (TODO: consider some checks for this) */
void put_eventq_raw(struct event_queue *ev_q)
{
/* if we use something other than malloc, we'll need to be aware that
* ev_q is actually an event_queue_big. One option is to use the flags,
* though this could be error prone. */
free(ev_q);
}
void put_eventq(struct event_queue *ev_q)
{
event_mbox_cleanup(ev_q->ev_mbox);
put_eventq_raw(ev_q);
}
void event_mbox_cleanup(struct event_mbox *ev_mbox)
{
switch (ev_mbox->type) {
case (EV_MBOX_UCQ):
ucq_free_pgs(&ev_mbox->ucq);
break;
case (EV_MBOX_BITMAP):
evbitmap_cleanup(&ev_mbox->evbm);
break;
case (EV_MBOX_CEQ):
ceq_cleanup(&ev_mbox->ceq);
break;
default:
printf("Unknown mbox type %d!\n", ev_mbox->type);
break;
}
}
/* The caller needs to point this event_q at an mbox - usually a vcpd mbox */
struct event_queue *get_eventq_slim(void)
{
/* TODO: (PIN) should be pinned memory */
struct event_queue *ev_q = malloc(sizeof(struct event_queue));
memset(ev_q, 0, sizeof(struct event_queue));
return ev_q;
}
/* Gets a small ev_q, with ev_mbox pointing to the vcpd mbox of vcoreid. If
* ev_flags has EVENT_VCORE_PRIVATE set, it'll give you the private mbox. o/w,
* you'll get the public one. */
struct event_queue *get_eventq_vcpd(uint32_t vcoreid, int ev_flags)
{
struct event_queue *ev_q = get_eventq_slim();
if (ev_flags & EVENT_VCORE_PRIVATE)
ev_q->ev_mbox = &vcpd_of(vcoreid)->ev_mbox_private;
else
ev_q->ev_mbox = &vcpd_of(vcoreid)->ev_mbox_public;
return ev_q;
}
void put_eventq_slim(struct event_queue *ev_q)
{
/* if we use something other than malloc, we'll need to be aware that
* ev_q is not an event_queue_big. */
free(ev_q);
}
void put_eventq_vcpd(struct event_queue *ev_q)
{
put_eventq_slim(ev_q);
}
/* Sets ev_q to be the receiving end for kernel event ev_type */
void register_kevent_q(struct event_queue *ev_q, unsigned int ev_type)
{
__procdata.kernel_evts[ev_type] = ev_q;
}
/* Clears the event, returning an ev_q if there was one there. You'll need to
* free it. */
struct event_queue *clear_kevent_q(unsigned int ev_type)
{
struct event_queue *ev_q = __procdata.kernel_evts[ev_type];
__procdata.kernel_evts[ev_type] = 0;
return ev_q;
}
/* Enables an IPI/event combo for ev_type, sent to vcoreid's default mbox.
 * Whether or not you get an IPI depends on ev_flags. If you want the event to
 * go to the vcore private mbox (meaning no other core should ever handle it),
 * send in EVENT_VCORE_PRIVATE with ev_flags.
*
* This is the simplest thing applications may want, and shows how you can put
* the other event functions together to get similar things done. */
void enable_kevent(unsigned int ev_type, uint32_t vcoreid, int ev_flags)
{
struct event_queue *ev_q = get_eventq_vcpd(vcoreid, ev_flags);
ev_q->ev_flags = ev_flags;
ev_q->ev_vcore = vcoreid;
ev_q->ev_handler = 0;
wmb(); /* make sure ev_q is filled out before registering */
register_kevent_q(ev_q, ev_type);
}
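/* Illustrative sketch: roughly what enable_kevent() does by hand, for an
 * application-chosen event number MY_EV_TYPE (hypothetical, must be less than
 * MAX_NR_EVENT):
 *
 *	int flags = EVENT_VCORE_PRIVATE;
 *	struct event_queue *ev_q = get_eventq_vcpd(vcore_id(), flags);
 *
 *	ev_q->ev_flags = flags;
 *	ev_q->ev_vcore = vcore_id();
 *	ev_q->ev_handler = 0;
 *	wmb();	// fill out ev_q before publishing it to the kernel
 *	register_kevent_q(ev_q, MY_EV_TYPE);
 */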
/* Stop receiving the events (one could be on the way). Caller needs to be
 * careful, since the kernel might be sending an event to the ev_q. Depending
 * on the ev_q, it may be hard to know when it is done (for instance, if all
 * syscalls you ever registered with the ev_q are done, then it would be okay
 * to free it); o/w, don't free it. */
struct event_queue *disable_kevent(unsigned int ev_type)
{
return clear_kevent_q(ev_type);
}
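/* Illustrative sketch: tearing down what the sketch above (or enable_kevent())
 * set up. Only free the ev_q once you are sure the kernel is done sending to
 * it; is_quiesced() is a hypothetical stand-in for that check.
 *
 *	struct event_queue *ev_q = disable_kevent(MY_EV_TYPE);
 *
 *	if (ev_q && is_quiesced(ev_q))
 *		put_eventq_vcpd(ev_q);
 */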
/********* Event Handling / Reception ***********/
/* Somewhat ghetto helper, for the lazy. If all you care about is an event
 * number, this will see if the event happened or not. It will try for a
 * message, but if there is none, it will check for a bit. Note that multiple
 * bit messages will turn into just one bit. */
unsigned int get_event_type(struct event_mbox *ev_mbox)
{
struct event_msg local_msg = {0};
if (extract_one_mbox_msg(ev_mbox, &local_msg))
return local_msg.ev_type;
return EV_NONE;
}
/* Attempts to register ev_q with sysc, so long as sysc is not done/progress.
* Returns true if it succeeded, and false otherwise. False means that the
* syscall is done, and does not need an event set (and should be handled
* accordingly).
*
* A copy of this is in glibc/sysdeps/akaros/syscall.c. Keep them in sync. */
bool register_evq(struct syscall *sysc, struct event_queue *ev_q)
{
int old_flags;
sysc->ev_q = ev_q;
wrmb(); /* don't let that write pass any future reads (flags) */
/* Try and set the SC_UEVENT flag (so the kernel knows to look at ev_q)
*/
do {
/* no cmb() needed, the atomic_read will reread flags */
old_flags = atomic_read(&sysc->flags);
/* Spin if the kernel is mucking with syscall flags */
while (old_flags & SC_K_LOCK)
old_flags = atomic_read(&sysc->flags);
/* If the kernel finishes while we are trying to sign up for an
* event, we need to bail out */
if (old_flags & (SC_DONE | SC_PROGRESS)) {
/* not necessary, but might help with bugs */
sysc->ev_q = 0;
return FALSE;
}
} while (!atomic_cas(&sysc->flags, old_flags, old_flags | SC_UEVENT));
return TRUE;
}
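/* Illustrative sketch: arming an event for an outstanding syscall. Assumes
 * 'sysc' is a struct syscall the kernel is servicing and 'ev_q' is an
 * event_queue already set up to alert this code.
 *
 *	if (!register_evq(sysc, ev_q)) {
 *		// No event will be sent for this syscall; per the comment
 *		// above, treat it as done and handle it directly.
 *	} else {
 *		// The kernel will send an event to ev_q when sysc is done.
 *	}
 */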
/* De-registers a syscall, so that the kernel will not send an event when it is
* done. The call could already be SC_DONE, or could even finish while we try
* to unset SC_UEVENT.
*
* There is a chance the kernel sent an event if you didn't do this in time, but
* once this returns, the kernel won't send a message.
*
* If the kernel is trying to send a message right now, this will spin (on
* SC_K_LOCK). We need to make sure we deregistered, and that if a message
* is coming, that it already was sent (and possibly overflowed), before
* returning. */
void deregister_evq(struct syscall *sysc)
{
int old_flags;
sysc->ev_q = 0;
wrmb(); /* don't let that write pass any future reads (flags) */
/* Try and unset the SC_UEVENT flag */
do {
/* no cmb() needed, the atomic_read will reread flags */
old_flags = atomic_read(&sysc->flags);
/* Spin if the kernel is mucking with syscall flags */
while (old_flags & SC_K_LOCK)
old_flags = atomic_read(&sysc->flags);
/* Note we don't care if the SC_DONE flag is getting set. We
* just need to avoid clobbering flags */
} while (!atomic_cas(&sysc->flags, old_flags, old_flags & ~SC_UEVENT));
}
/* Actual Event Handling */
/* List of handler lists, process-wide. They all must return (don't context
 * switch to a uthread) */
struct ev_handler *ev_handlers[MAX_NR_EVENT] = {0};
spinpdrlock_t ev_h_wlock = SPINPDR_INITIALIZER;
int register_ev_handler(unsigned int ev_type, handle_event_t handler,
void *data)
{
/* Nasty uthread code assumes this was malloced */
struct ev_handler *new_h = malloc(sizeof(struct ev_handler));
if (!new_h)
return -1;
new_h->func = handler;
new_h->data = data;
spin_pdr_lock(&ev_h_wlock);
new_h->next = ev_handlers[ev_type];
wmb(); /* make sure new_h is done before publishing to readers */
ev_handlers[ev_type] = new_h;
spin_pdr_unlock(&ev_h_wlock);
return 0;
}
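/* Illustrative sketch: registering a process-wide handler for an
 * application-chosen event number MY_EV_TYPE (hypothetical). Per the note
 * above ev_handlers[], handlers run in vcore context and must return.
 *
 *	static void my_handler(struct event_msg *ev_msg, unsigned int ev_type,
 *	                       void *data)
 *	{
 *		printf("got event %d\n", ev_type);
 *	}
 *
 *	register_ev_handler(MY_EV_TYPE, my_handler, NULL);
 */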
int deregister_ev_handler(unsigned int ev_type, handle_event_t handler,
void *data)
{
/* TODO: User-level RCU */
printf("Failed to dereg handler, not supported yet!\n");
return -1;
}
static void run_ev_handlers(unsigned int ev_type, struct event_msg *ev_msg)
{
struct ev_handler *handler;
/* TODO: RCU read lock */
handler = ev_handlers[ev_type];
while (handler) {
handler->func(ev_msg, ev_type, handler->data);
handler = handler->next;
}
}
/* Attempts to extract a message from an mbox, copying it into ev_msg.
* Returns TRUE on success. */
bool extract_one_mbox_msg(struct event_mbox *ev_mbox, struct event_msg *ev_msg)
{
switch (ev_mbox->type) {
case (EV_MBOX_UCQ):
return get_ucq_msg(&ev_mbox->ucq, ev_msg);
case (EV_MBOX_BITMAP):
return get_evbitmap_msg(&ev_mbox->evbm, ev_msg);
case (EV_MBOX_CEQ):
return get_ceq_msg(&ev_mbox->ceq, ev_msg);
default:
printf("Unknown mbox type %d!\n", ev_mbox->type);
return FALSE;
}
}
/* Attempts to handle a message. Returns 1 if we dequeued a msg, 0 o/w. */
int handle_one_mbox_msg(struct event_mbox *ev_mbox)
{
struct event_msg local_msg;
unsigned int ev_type;
/* extract returns TRUE on success, we return 1. */
if (!extract_one_mbox_msg(ev_mbox, &local_msg))
return 0;
ev_type = local_msg.ev_type;
assert(ev_type < MAX_NR_EVENT);
printd("[event] UCQ (mbox %08p), ev_type: %d\n", ev_mbox, ev_type);
run_ev_handlers(ev_type, &local_msg);
return 1;
}
/* Handle an mbox. This is the receive-side processing of an event_queue. It
* takes an ev_mbox, since the vcpd mbox isn't a regular ev_q. Returns 1 if we
* handled something, 0 o/w. */
int handle_mbox(struct event_mbox *ev_mbox)
{
int retval = 0;
printd("[event] handling ev_mbox %08p on vcore %d\n", ev_mbox,
vcore_id());
/* Some stack-smashing bugs cause this to fail */
assert(ev_mbox);
/* Handle all full messages, tracking if we do at least one. */
while (handle_one_mbox_msg(ev_mbox))
retval = 1;
return retval;
}
/* Empty if the mbox has no messages and the bits don't need to be checked */
bool mbox_is_empty(struct event_mbox *ev_mbox)
{
switch (ev_mbox->type) {
case (EV_MBOX_UCQ):
return ucq_is_empty(&ev_mbox->ucq);
case (EV_MBOX_BITMAP):
return evbitmap_is_empty(&ev_mbox->evbm);
case (EV_MBOX_CEQ):
return ceq_is_empty(&ev_mbox->ceq);
default:
printf("Unknown mbox type %d!\n", ev_mbox->type);
return FALSE;
}
}
/* The EV_EVENT handler - extract the ev_q from the message. */
void handle_ev_ev(struct event_msg *ev_msg, unsigned int ev_type, void *data)
{
struct event_queue *ev_q;
/* EV_EVENT can't handle not having a message / being a bit. If we got
* a bit message, it's a bug somewhere */
assert(ev_msg);
ev_q = ev_msg->ev_arg3;
/* Same deal, a null ev_q is probably a bug, or someone being a jackass
*/
assert(ev_q);
/* Clear pending, so we can start getting INDIRs and IPIs again. We
* must set this before (compared to handle_events, then set it, then
* handle again), since there is no guarantee handle_event_q() will
* return. If there is a pending preemption, the vcore quickly yields
* and will deal with the remaining events in the future - meaning it
* won't return to here. */
ev_q->ev_alert_pending = FALSE;
wmb();/* don't let the pending write pass the signaling of an ev recv */
handle_event_q(ev_q);
}
/* Handles VCPD events (public and private). The kernel always sets
* notif_pending after posting a message to either public or private mailbox.
* When this returns, as far as we are concerned, notif_pending is FALSE.
* However, a concurrent kernel writer could have reset it to true. This is
* fine; whenever we leave VC ctx we double check notif_pending. Returns 1 or 2
* if we actually handled a message, 0 o/w.
*
* WARNING: this might not return and/or current_uthread may change. */
int handle_events(uint32_t vcoreid)
{
struct preempt_data *vcpd = vcpd_of(vcoreid);
int retval = 0;
vcpd->notif_pending = FALSE;
wrmb(); /* prevent future reads from happening before notif_p write */
retval += handle_mbox(&vcpd->ev_mbox_private);
retval += handle_mbox(&vcpd->ev_mbox_public);
return retval;
}
/* Handles the events on ev_q IAW the event_handlers[]. If the ev_q is
* application specific, then this will dispatch/handle based on its flags. */
void handle_event_q(struct event_queue *ev_q)
{
printd("[event] handling ev_q %08p on vcore %d\n", ev_q, vcore_id());
/* If the program wants to handle the ev_q on its own: */
if (ev_q->ev_handler) {
/* Remember this can't block or page fault */
ev_q->ev_handler(ev_q);
return;
}
/* Raw ev_qs that haven't been connected to an mbox, user bug: */
assert(ev_q->ev_mbox);
/* The "default" ev_handler, common enough that I don't want a func ptr
*/
handle_mbox(ev_q->ev_mbox);
}
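/* Illustrative sketch: an ev_q with its own handler. The handler runs in
 * place of the default handle_mbox() path and, as noted above, must not block
 * or page fault. my_evq_handler() is hypothetical.
 *
 *	static void my_evq_handler(struct event_queue *ev_q)
 *	{
 *		handle_mbox(ev_q->ev_mbox);	// or any custom processing
 *	}
 *
 *	ev_q->ev_handler = my_evq_handler;
 */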
/* Sends the calling vcore a message to its public mbox. This is purposefully
* limited to just the calling vcore, since in future versions, we can send via
* ucqs directly (in many cases). That will require the caller to be the
* vcoreid, due to some preemption recovery issues (another ucq poller is
* waiting on us when we got preempted, and we never up nr_cons). */
void send_self_vc_msg(struct event_msg *ev_msg)
{
// TODO: try to use UCQs (requires additional support)
/* ev_type actually gets ignored currently. ev_msg is what matters if
* it is non-zero. FALSE means it's going to the public mbox */
sys_self_notify(vcore_id(), ev_msg->ev_type, ev_msg, FALSE);
}
/* Helper: makes the current core handle a remote vcore's VCPD public mbox
* events.
*
* Both cases (whether we are handling someone else's already or not) use some
* method of telling our future self what to do. When we aren't already
* handling it, we use TLS, and jump to vcore entry. When we are already
* handling, then we send a message to ourself, which we deal with when we
* handle our own events (which is later in vcore entry).
*
* We need to reset the stack and deal with it in vcore entry to avoid recursing
* deeply and running off the transition stack. (handler calling handle event).
*
* Note that we might not be the one that gets the message we send. If we pull
* a sys_change_to, someone else might be polling our public message box. All
* we're doing is making sure that we don't forget to check rem_vcoreid's mbox.
*
* Finally, note that this function might not return. However, it'll handle the
 * details related to vcpd mboxes, so you don't need to use the
 * ev_might_not_return() helpers with this. */
void handle_vcpd_mbox(uint32_t rem_vcoreid)
{
uint32_t vcoreid = vcore_id();
struct preempt_data *vcpd = vcpd_of(vcoreid);
struct event_msg local_msg = {0};
assert(vcoreid != rem_vcoreid);
/* If they are empty, then we're done */
if (mbox_is_empty(&vcpd_of(rem_vcoreid)->ev_mbox_public))
return;
if (__vc_handle_an_mbox) {
/* we might be already handling them, in which case, abort */
if (__vc_rem_vcoreid == rem_vcoreid)
return;
/* Already handling message for someone, need to send ourselves
* a message to check rem_vcoreid, which we'll process later. */
local_msg.ev_type = EV_CHECK_MSGS;
local_msg.ev_arg2 = rem_vcoreid; /* 32bit arg */
send_self_vc_msg(&local_msg);
return;
}
/* No return after here */
/* At this point, we aren't in the process of handling someone else's
* messages, so just tell our future self what to do */
__vc_handle_an_mbox = TRUE;
__vc_rem_vcoreid = rem_vcoreid;
/* Reset the stack and start over in vcore context */
set_stack_pointer((void*)vcpd->vcore_stack);
vcore_entry();
assert(0);
}
/* Handle remote vcpd public mboxes, if that's what we want to do. Call this
* from vcore entry, pairs with handle_vcpd_mbox(). */
void try_handle_remote_mbox(void)
{
if (__vc_handle_an_mbox) {
handle_mbox(&vcpd_of(__vc_rem_vcoreid)->ev_mbox_public);
/* only clear the flag when we have returned from handling
* messages. if an event handler (like preempt_recover) doesn't
* return, we'll clear this flag elsewhere. (it's actually not a
* big deal if we don't). */
cmb();
__vc_handle_an_mbox = FALSE;
}
}
/* Event handler helpers */
/* For event handlers that might not return, we need to call this before the
 * command that might not return. If we were handling a remote vcore's
 * messages, it'll send ourselves a message that we (or someone who polls us)
 * will get, so that someone finishes off that vcore's messages. It doesn't
 * matter who does, so long as someone does.
*
* This returns whether or not we were handling someone's messages. Pass the
 * return value to ev_we_returned(). */
bool ev_might_not_return(void)
{
struct event_msg local_msg = {0};
bool were_handling_remotes = FALSE;
if (__vc_handle_an_mbox) {
/* slight chance we finished with their mbox (were on the last
* one) */
if (!mbox_is_empty(&vcpd_of(__vc_rem_vcoreid)->ev_mbox_public))
{
/* But we aren't, so we'll need to send a message */
local_msg.ev_type = EV_CHECK_MSGS;
local_msg.ev_arg2 = __vc_rem_vcoreid; /* 32bit arg */
send_self_vc_msg(&local_msg);
}
/* Either way, we're not working on this one now. Note this is
* more of an optimization - it'd be harmless (I think) to poll
* another vcore's pub mbox once when we pop up in vc_entry in
* the future */
__vc_handle_an_mbox = FALSE;
return TRUE;
}
return FALSE;
}
/* Call this when you return, paired up with ev_might_not_return(). If
 * ev_might_not_return() turned off __vc_handle_an_mbox, we'll turn it back
 * on. */
void ev_we_returned(bool were_handling_remotes)
{
if (were_handling_remotes)
__vc_handle_an_mbox = TRUE;
}
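/* Illustrative sketch: bracketing a call that might not return from within an
 * event handler. maybe_no_return() is a hypothetical stand-in for e.g. a
 * preemption-recovery step.
 *
 *	bool were_handling = ev_might_not_return();
 *
 *	maybe_no_return();
 *	ev_we_returned(were_handling);
 */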
/* Debugging */
void print_ev_msg(struct event_msg *msg)
{
printf("MSG at %08p\n", msg);
printf("\ttype: %d\n", msg->ev_type);
printf("\targ1 (16): 0x%4x\n", msg->ev_arg1);
printf("\targ2 (32): 0x%8x\n", msg->ev_arg2);
printf("\targ3 (32): 0x%8x\n", msg->ev_arg3);
printf("\targ4 (64): 0x%16x\n", msg->ev_arg4);
}
/* Uthreads blocking on event queues
*
* It'd be nice to have a uthread sleep until an event queue has some activity
* (e.g. a new message). It'd also be nice to wake up early with a timer. It
* is tempting to try something like an INDIR and have one evq multiplex two
* others (the real event and an alarm). But then you can't separate the two
* streams; what if one thread sleeps on just the event at the same time? What
* if we want to support something like Go's select: a thread wants to block
* until there is some activity on some channel?
*
* Ultimately, we want to allow M uthreads to block on possibly different
* subsets of N event queues.
*
* Every uthread will have a sleep controller, and every event queue will have a
* wakeup controller. There are up to MxN linkage structures connecting these.
*
* We'll use the event_queue handler to override the default event processing.
* This means the event queues that are used for blocking uthreads can *only* be
* used for that; the regular event processing will not happen. This is mostly
* true. It is possible to extract events from an evq's mbox concurrently.
*
* I briefly considered having one global lock to protect all of the lists and
* structures. That's lousy for the obvious scalability reason, but it seemed
* like it'd make things easier, especially when I thought I needed locks in
* both the ectlr and the uctlr (in early versions, I considered having the
* handler yank itself out of the ectlr, copying a message into that struct, or
* o/w needing protection). On occasion, we run into the "I'd like to split my
* lock between two components and still somehow synchronize" issue (e.g. FD
* taps, with the FDT lock and the blocking/whatever that goes on in a device).
* Whenever that comes up, we usually can get some help from other shared memory
* techniques. For FD taps, it's the kref. For us, it's post-and-poke, though
* it didn't solve all of our problems - I use it as a tool with some basic
* shared memory signalling. */
struct evq_wait_link;
TAILQ_HEAD(wait_link_tailq, evq_wait_link);
/* Bookkeeping for the uthread sleeping on a bunch of event queues.
*
* Notes on concurrency: most fields are not protected. check_evqs is racy, and
* written to by handlers. The tailq is only used by the uthread. blocked is
* never concurrently *written*; see __uth_wakeup_poke() for details. */
struct uth_sleep_ctlr {
struct uthread *uth;
struct spin_pdr_lock in_use;
bool check_evqs;
bool blocked;
struct poke_tracker poker;
struct wait_link_tailq evqs;
};
/* Attaches to an event_queue (ev_udata), tracks the uthreads for this evq */
struct evq_wakeup_ctlr {
/* If we ever use a sync_obj, that would replace waiters. But also note
* that we want a pointer to something other than the uthread, and currently
* we also wake all threads - there's no scheduling decision. */
struct wait_link_tailq waiters;
struct spin_pdr_lock lock;
};
/* Up to MxN of these, N of them per uthread. */
struct evq_wait_link {
struct uth_sleep_ctlr *uth_ctlr;
TAILQ_ENTRY(evq_wait_link) link_uth;
struct evq_wakeup_ctlr *evq_ctlr;
TAILQ_ENTRY(evq_wait_link) link_evq;
};
/* Poke function: ensures the uth managed by uctlr wakes up. poke() ensures
* there is only one thread in this function at a time. However, it could be
* called spuriously, which is why we check 'blocked.' */
static void __uth_wakeup_poke(void *arg)
{
struct uth_sleep_ctlr *uctlr = arg;
/* There are no concurrent writes to 'blocked'. Blocked is only ever
* written when the uth sleeps and only ever cleared here. Once the uth
* writes it, it does not write it again until after we clear it.
*
* This is still racy - we could see !blocked, then blocked gets set.
* In that case, the poke failed, and that is harmless. The uth will
* see 'check_evqs', which was set before poke, which would be before
* writing blocked, and the uth checks 'check_evqs' after writing. */
if (uctlr->blocked) {
uctlr->blocked = FALSE;
cmb(); /* clear blocked before starting the uth */
uthread_runnable(uctlr->uth);
}
}
static void uth_sleep_ctlr_init(struct uth_sleep_ctlr *uctlr,
struct uthread *uth)
{
uctlr->uth = uth;
spin_pdr_init(&uctlr->in_use);
uctlr->check_evqs = FALSE;
uctlr->blocked = FALSE;
poke_init(&uctlr->poker, __uth_wakeup_poke);
TAILQ_INIT(&uctlr->evqs);
}
/* This handler runs when the ev_q is checked. Instead of doing anything with
* the ev_q, we make sure that every uthread that was waiting on us wakes up.
* The uthreads could be waiting on several evqs, so there could be multiple
* independent wake-up attempts, hence the poke. Likewise, the uthread could be
* awake when we poke. The uthread will check check_evqs after sleeping, in
* case we poke before it blocks (and the poke fails).
*
* Also, there could be concurrent callers of this handler, and other uthreads
* signing up for a wakeup. */
void evq_wakeup_handler(struct event_queue *ev_q)
{
struct evq_wakeup_ctlr *ectlr = ev_q->ev_udata;
struct evq_wait_link *i;
assert(ectlr);
spin_pdr_lock(&ectlr->lock);
/* Note we wake up all sleepers, even though only one is likely to get
* the message. See the notes in unlink_ectlr() for more info. */
TAILQ_FOREACH(i, &ectlr->waiters, link_evq) {
i->uth_ctlr->check_evqs = TRUE;
cmb(); /* order check write before poke (poke has atomic) */
poke(&i->uth_ctlr->poker, i->uth_ctlr);
}
spin_pdr_unlock(&ectlr->lock);
}
/* Helper, attaches a wakeup controller to the event queue. */
void evq_attach_wakeup_ctlr(struct event_queue *ev_q)
{
struct evq_wakeup_ctlr *ectlr = malloc(sizeof(struct evq_wakeup_ctlr));
memset(ectlr, 0, sizeof(struct evq_wakeup_ctlr));
spin_pdr_init(&ectlr->lock);
TAILQ_INIT(&ectlr->waiters);
ev_q->ev_udata = ectlr;
ev_q->ev_handler = evq_wakeup_handler;
}
void evq_remove_wakeup_ctlr(struct event_queue *ev_q)
{
free(ev_q->ev_udata);
ev_q->ev_udata = 0;
ev_q->ev_handler = 0;
}
static void link_uctlr_ectlr(struct uth_sleep_ctlr *uctlr,
struct evq_wakeup_ctlr *ectlr,
struct evq_wait_link *link)
{
/* No lock needed for the uctlr; we're the only one modifying evqs */
link->uth_ctlr = uctlr;
TAILQ_INSERT_HEAD(&uctlr->evqs, link, link_uth);
	/* Once we add ourselves to the ectlr list, we could start getting poked
*/
link->evq_ctlr = ectlr;
spin_pdr_lock(&ectlr->lock);
TAILQ_INSERT_HEAD(&ectlr->waiters, link, link_evq);
spin_pdr_unlock(&ectlr->lock);
}
/* Disconnects us from a wakeup controller.
*
* Our evq handlers wake up *all* uthreads that are waiting for activity
* (broadcast). It's a tradeoff. If the list of uthreads is long, then it is
* wasted effort. An alternative is to wake up exactly one, with slightly
* greater overheads. In the exactly-one case, multiple handlers could wake
* this uth up at once, but we can only extract one message. If we do the
 * single wake up, then when we detach from an ectlr, we need to peek in the
* mbox to see if it is not empty, and conditionally run its handler again, such
* that no uthread sits on a ectlr that has activity/pending messages (in
* essence, level triggered). */
static void unlink_ectlr(struct evq_wait_link *link)
{
struct evq_wakeup_ctlr *ectlr = link->evq_ctlr;
spin_pdr_lock(&ectlr->lock);
TAILQ_REMOVE(&ectlr->waiters, link, link_evq);
spin_pdr_unlock(&ectlr->lock);
}
/* Helper: polls all evqs once and extracts the first message available. The
* message is copied into ev_msg, and the evq with the activity is copied into
* which_evq (if it is non-zero). Returns TRUE on success. */
static bool extract_evqs_msg(struct event_queue *evqs[], size_t nr_evqs,
struct event_msg *ev_msg,
struct event_queue **which_evq)
{
struct event_queue *evq_i;
bool ret = FALSE;
/* We need to have notifs disabled when extracting messages from some
* mboxes. Many mboxes have some form of busy waiting between consumers
* (userspace). If we're just a uthread, we could wind up on a runqueue
* somewhere while someone else spins, possibly in VC ctx. */
uth_disable_notifs();
for (int i = 0; i < nr_evqs; i++) {
evq_i = evqs[i];
if (extract_one_mbox_msg(evq_i->ev_mbox, ev_msg)) {
if (which_evq)
*which_evq = evq_i;
ret = TRUE;
break;
}
}
uth_enable_notifs();
return ret;
}
/* Yield callback */
static void __uth_blockon_evq_cb(struct uthread *uth, void *arg)
{
struct uth_sleep_ctlr *uctlr = arg;
uthread_has_blocked(uth, UTH_EXT_BLK_EVENTQ);
cmb(); /* actually block before saying 'blocked' */
uctlr->blocked = TRUE; /* can be woken up now */
wrmb(); /* write 'blocked' before read 'check_evqs' */
/* If someone set check_evqs, we should wake up. We're competing with
* other wakers via poke (we may have already woken up!). */
if (uctlr->check_evqs)
poke(&uctlr->poker, uctlr);
/* Once we say we're blocked, we could be woken up (possibly by our poke
* here) and the uthread could run on another core. Holding this lock
* prevents the uthread from quickly returning and freeing the memory of
	 * uctlr before we have a chance to check_evqs or poke. */
spin_pdr_unlock(&uctlr->in_use);
}
/* Direct version, with *evqs[]. */
void uth_blockon_evqs_arr(struct event_msg *ev_msg,
struct event_queue **which_evq,
struct event_queue *evqs[], size_t nr_evqs)
{
struct uth_sleep_ctlr uctlr;
struct evq_wait_link linkage[nr_evqs];
/* Catch user mistakes. If they lack a handler, they didn't attach.
* They are probably using our evq_wakeup_handler, but they might have
* their own wrapper function. */
for (int i = 0; i < nr_evqs; i++)
assert(evqs[i]->ev_handler);
/* Check for activity on the evqs before going through the hassle of
* sleeping. ("check, signal, check again" pattern). */
if (extract_evqs_msg(evqs, nr_evqs, ev_msg, which_evq))
return;
uth_sleep_ctlr_init(&uctlr, current_uthread);
memset(linkage, 0, sizeof(struct evq_wait_link) * nr_evqs);
for (int i = 0; i < nr_evqs; i++)
link_uctlr_ectlr(&uctlr,
(struct evq_wakeup_ctlr*)evqs[i]->ev_udata,
&linkage[i]);
/* Mesa-style sleep until we get a message. Mesa helps a bit here,
* since we can just deregister from them all when we're done. o/w it
* is tempting to have us deregister from *the* one in the handler and
* extract the message there; which can be tricky and harder to reason
* about. */
while (1) {
/* We need to make sure only one 'version/ctx' of this thread is
* active at a time. Later on, we'll unlock in vcore ctx on the
* other side of a yield. We could restart from the yield,
* return, and free the uctlr before that ctx has a chance to
* finish. */
spin_pdr_lock(&uctlr.in_use);
/* We're signed up. We might already have been told to check
* the evqs, or there could be messages still sitting in the
* evqs. check_evqs is only ever cleared here, and only ever
* set in evq handlers. */
uctlr.check_evqs = FALSE;
cmb(); /* look for messages after clearing check_evqs */
if (extract_evqs_msg(evqs, nr_evqs, ev_msg, which_evq))
break;
uthread_yield(TRUE, __uth_blockon_evq_cb, &uctlr);
}
/* On the one hand, it's not necessary to unlock, since the memory will
* be freed. But we do need to go through the process to turn on notifs
* and adjust the notif_disabled_depth for the case where we don't
* yield. */
spin_pdr_unlock(&uctlr.in_use);
for (int i = 0; i < nr_evqs; i++)
unlink_ectlr(&linkage[i]);
}
/* ... are event_queue *s, nr_evqs of them. This will block until it can
* extract some message from one of evqs. The message will be placed in ev_msg,
* and the particular evq it extracted it from will be placed in which_evq, if
 * which_evq is non-zero. */
void uth_blockon_evqs(struct event_msg *ev_msg, struct event_queue **which_evq,
size_t nr_evqs, ...)
{
struct event_queue *evqs[nr_evqs];
va_list va;
va_start(va, nr_evqs);
for (int i = 0; i < nr_evqs; i++)
evqs[i] = va_arg(va, struct event_queue *);
va_end(va);
uth_blockon_evqs_arr(ev_msg, which_evq, evqs, nr_evqs);
}
/* ... are event_queue *s, nr_evqs of them. This will attempt to extract some
* message from one of evqs. The message will be placed in ev_msg, and the
* particular evq it extracted it from will be placed in which_evq. Returns
* TRUE if it extracted a message. */
bool uth_check_evqs(struct event_msg *ev_msg, struct event_queue **which_evq,
size_t nr_evqs, ...)
{
struct event_queue *evqs[nr_evqs];
va_list va;
va_start(va, nr_evqs);
for (int i = 0; i < nr_evqs; i++)
evqs[i] = va_arg(va, struct event_queue *);
va_end(va);
return extract_evqs_msg(evqs, nr_evqs, ev_msg, which_evq);
}
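/* Illustrative sketch: a uthread blocking on two event queues. Each evq needs
 * a wakeup controller (or an equivalent ev_handler) attached before anyone
 * blocks on it; evq_a and evq_b are assumed to exist already.
 *
 *	struct event_msg msg;
 *	struct event_queue *which;
 *
 *	evq_attach_wakeup_ctlr(evq_a);
 *	evq_attach_wakeup_ctlr(evq_b);
 *	uth_blockon_evqs(&msg, &which, 2, evq_a, evq_b);
 *	printf("woke with ev_type %d from evq %p\n", msg.ev_type, which);
 */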