/* Copyright (c) 2011-2014 The Regents of the University of California
 * Copyright (c) 2015 Google Inc
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Userspace utility functions for receiving events and notifications (IPIs).
 * Some are higher level than others; just use what you need. */

#include <ros/event.h>
#include <ros/procdata.h>
#include <parlib/ucq.h>
#include <parlib/evbitmap.h>
#include <parlib/ceq.h>
#include <parlib/vcore.h>
#include <stdlib.h>
#include <string.h>
#include <parlib/assert.h>
#include <parlib/stdio.h>
#include <errno.h>
#include <parlib/parlib.h>
#include <parlib/event.h>
#include <parlib/uthread.h>
#include <parlib/spinlock.h>
#include <parlib/mcs.h>
#include <parlib/poke.h>
#include <sys/queue.h>
#include <malloc.h>

/* For remote VCPD mbox event handling */
__thread bool __vc_handle_an_mbox = FALSE;
__thread uint32_t __vc_rem_vcoreid;

/********* Event_q Setup / Registration  ***********/

/* Get event_qs via these interfaces, since eventually we'll want to either
 * allocate from pinned memory or use some form of a slab allocator.  Also,
 * these stitch up the big_q so its ev_mbox points to its internal mbox.  Never
 * access the internal mbox directly.
 *
 * Raw ones need to have their mailboxes initialized.  If you're making a lot of
 * these and they perform their own mmaps (e.g. UCQs), you can do one big mmap
 * and init the ucqs on your own, which ought to perform better.
 *
 * Use the 'regular' one for big_qs if you don't want to worry about the mbox
 * initialization. */
struct event_queue *get_eventq_raw(void)
{
	/* TODO: (PIN) should be pinned memory */
	struct event_queue_big *big_q = malloc(sizeof(struct event_queue_big));
	memset(big_q, 0, sizeof(struct event_queue_big));
	big_q->ev_mbox = &big_q->ev_imbox;
	return (struct event_queue*)big_q;
}

struct event_queue *get_eventq(int mbox_type)
{
	struct event_queue *big_q = get_eventq_raw();
	event_mbox_init(big_q->ev_mbox, mbox_type);
	return big_q;
}

/* Basic initialization of a single mbox.  If you know the type, you can set up
 * the mbox manually with possibly better performance.  For instance, ucq_init()
 * calls mmap internally.  You could mmap a huge blob on your own and call
 * ucq_raw_init (don't forget to set the mbox_type!) */
void event_mbox_init(struct event_mbox *ev_mbox, int mbox_type)
{
	ev_mbox->type = mbox_type;
	switch (ev_mbox->type) {
	case (EV_MBOX_UCQ):
		ucq_init(&ev_mbox->ucq);
		break;
	case (EV_MBOX_BITMAP):
		evbitmap_init(&ev_mbox->evbm);
		break;
	case (EV_MBOX_CEQ):
		ceq_init(&ev_mbox->ceq, CEQ_OR, CEQ_DEFAULT_SZ, CEQ_DEFAULT_SZ);
		break;
	default:
		printf("Unknown mbox type %d!\n", ev_mbox->type);
		break;
	}
}
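
/* Example (usage sketch, not part of this file): typical allocation and
 * teardown.  get_eventq() initializes the internal mbox for you; the raw
 * variant leaves that to the caller, per the comment above.  The specific flag
 * values below are illustrative only.
 *
 *	struct event_queue *ev_q = get_eventq(EV_MBOX_UCQ);
 *
 *	ev_q->ev_flags = EVENT_IPI | EVENT_SPAM_INDIR;
 *	ev_q->ev_vcore = vcore_id();
 *	// ... register it with a kernel event type or a syscall ...
 *	put_eventq(ev_q);
 *
 *	// Or do the mbox setup yourself:
 *	struct event_queue *raw_q = get_eventq_raw();
 *
 *	event_mbox_init(raw_q->ev_mbox, EV_MBOX_CEQ);
 *	// ...
 *	put_eventq(raw_q);
 */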

/* Give it up.  I don't recommend calling these unless you're sure the queues
 * aren't in use (unregistered, etc). (TODO: consider some checks for this) */
void put_eventq_raw(struct event_queue *ev_q)
{
	/* if we use something other than malloc, we'll need to be aware that
	 * ev_q is actually an event_queue_big.  One option is to use the flags,
	 * though this could be error prone. */
	free(ev_q);
}

void put_eventq(struct event_queue *ev_q)
{
	event_mbox_cleanup(ev_q->ev_mbox);
	put_eventq_raw(ev_q);
}

void event_mbox_cleanup(struct event_mbox *ev_mbox)
{
	switch (ev_mbox->type) {
	case (EV_MBOX_UCQ):
		ucq_free_pgs(&ev_mbox->ucq);
		break;
	case (EV_MBOX_BITMAP):
		evbitmap_cleanup(&ev_mbox->evbm);
		break;
	case (EV_MBOX_CEQ):
		ceq_cleanup(&ev_mbox->ceq);
		break;
	default:
		printf("Unknown mbox type %d!\n", ev_mbox->type);
		break;
	}
}

/* Need to point this event_q to an mbox - usually to a vcpd */
struct event_queue *get_eventq_slim(void)
{
	/* TODO: (PIN) should be pinned memory */
	struct event_queue *ev_q = malloc(sizeof(struct event_queue));
	memset(ev_q, 0, sizeof(struct event_queue));
	return ev_q;
}

/* Gets a small ev_q, with ev_mbox pointing to the vcpd mbox of vcoreid.  If
 * ev_flags has EVENT_VCORE_PRIVATE set, it'll give you the private mbox.  o/w,
 * you'll get the public one. */
struct event_queue *get_eventq_vcpd(uint32_t vcoreid, int ev_flags)
{
	struct event_queue *ev_q = get_eventq_slim();
	if (ev_flags & EVENT_VCORE_PRIVATE)
		ev_q->ev_mbox = &vcpd_of(vcoreid)->ev_mbox_private;
	else
		ev_q->ev_mbox = &vcpd_of(vcoreid)->ev_mbox_public;
	return ev_q;
}

void put_eventq_slim(struct event_queue *ev_q)
{
	/* if we use something other than malloc, we'll need to be aware that
	 * ev_q is not an event_queue_big. */
	free(ev_q);
}

void put_eventq_vcpd(struct event_queue *ev_q)
{
	put_eventq_slim(ev_q);
}

/* Sets ev_q to be the receiving end for kernel event ev_type */
void register_kevent_q(struct event_queue *ev_q, unsigned int ev_type)
{
	__procdata.kernel_evts[ev_type] = ev_q;
}

/* Clears the event, returning an ev_q if there was one there.  You'll need to
 * free it. */
struct event_queue *clear_kevent_q(unsigned int ev_type)
{
	struct event_queue *ev_q = __procdata.kernel_evts[ev_type];

	__procdata.kernel_evts[ev_type] = 0;
	return ev_q;
}

/* Enables an IPI/event combo for ev_type sent to vcoreid's default mbox.  Pass
 * EVENT_IPI in ev_flags if you want the IPI.  If you want the event to go to
 * the vcore private mbox (meaning no other core should ever handle it), send in
 * EVENT_VCORE_PRIVATE with ev_flags.
 *
 * This is the simplest thing applications may want, and shows how you can put
 * the other event functions together to get similar things done. */
void enable_kevent(unsigned int ev_type, uint32_t vcoreid, int ev_flags)
{
	struct event_queue *ev_q = get_eventq_vcpd(vcoreid, ev_flags);

	ev_q->ev_flags = ev_flags;
	ev_q->ev_vcore = vcoreid;
	ev_q->ev_handler = 0;
	wmb();	/* make sure ev_q is filled out before registering */
	register_kevent_q(ev_q, ev_type);
}
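
/* Example (usage sketch, not part of this file): the flag and event constants
 * come from ros/event.h; the specific choices (EV_USER_IPI, vcore 0, EVENT_IPI)
 * are only illustrative.
 *
 *	// Ask the kernel to post EV_USER_IPI to vcore 0's public vcpd mbox
 *	// and to IPI the vcore, so vcore context will run handle_events().
 *	enable_kevent(EV_USER_IPI, 0, EVENT_IPI);
 *
 *	// Later, to stop (the returned ev_q is the vcpd-backed one from
 *	// get_eventq_vcpd(); see disable_kevent()'s comment before freeing):
 *	struct event_queue *ev_q = disable_kevent(EV_USER_IPI);
 */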

/* Stop receiving the events (one could be on the way).  Caller needs to be
 * careful, since the kernel might be sending an event to the ev_q.  Depending
 * on the ev_q, it may be hard to know when it is done (for instance, if all
 * syscalls you ever registered with the ev_q are done, then it would be okay
 * to free it).  o/w, don't free it. */
struct event_queue *disable_kevent(unsigned int ev_type)
{
	return clear_kevent_q(ev_type);
}

/********* Event Handling / Reception ***********/
/* Somewhat ghetto helper, for the lazy.  If all you care about is an event
 * number, this will see if the event happened or not.  It will try for a
 * message, but if there is none, it will go for a bit.  Note that multiple
 * bit messages will turn into just one bit. */
unsigned int get_event_type(struct event_mbox *ev_mbox)
{
	struct event_msg local_msg = {0};

	if (extract_one_mbox_msg(ev_mbox, &local_msg))
		return local_msg.ev_type;
	return EV_NONE;
}

/* Attempts to register ev_q with sysc, so long as sysc is not done/progress.
 * Returns true if it succeeded, and false otherwise.  False means that the
 * syscall is done, and does not need an event set (and should be handled
 * accordingly).
 *
 * A copy of this is in glibc/sysdeps/akaros/syscall.c.  Keep them in sync. */
bool register_evq(struct syscall *sysc, struct event_queue *ev_q)
{
	int old_flags;

	sysc->ev_q = ev_q;
	wrmb();	/* don't let that write pass any future reads (flags) */
	/* Try and set the SC_UEVENT flag (so the kernel knows to look at ev_q)
	 */
	do {
		/* no cmb() needed, the atomic_read will reread flags */
		old_flags = atomic_read(&sysc->flags);
		/* Spin if the kernel is mucking with syscall flags */
		while (old_flags & SC_K_LOCK)
			old_flags = atomic_read(&sysc->flags);
		/* If the kernel finishes while we are trying to sign up for an
		 * event, we need to bail out */
		if (old_flags & (SC_DONE | SC_PROGRESS)) {
			/* not necessary, but might help with bugs */
			sysc->ev_q = 0;
			return FALSE;
		}
	} while (!atomic_cas(&sysc->flags, old_flags, old_flags | SC_UEVENT));
	return TRUE;
}

/* De-registers a syscall, so that the kernel will not send an event when it is
 * done.  The call could already be SC_DONE, or could even finish while we try
 * to unset SC_UEVENT.
 *
 * There is a chance the kernel sent an event if you didn't do this in time, but
 * once this returns, the kernel won't send a message.
 *
 * If the kernel is trying to send a message right now, this will spin (on
 * SC_K_LOCK).  We need to make sure we deregistered, and that if a message
 * is coming, that it already was sent (and possibly overflowed), before
 * returning. */
void deregister_evq(struct syscall *sysc)
{
	int old_flags;

	sysc->ev_q = 0;
	wrmb();	/* don't let that write pass any future reads (flags) */
	/* Try and unset the SC_UEVENT flag */
	do {
		/* no cmb() needed, the atomic_read will reread flags */
		old_flags = atomic_read(&sysc->flags);
		/* Spin if the kernel is mucking with syscall flags */
		while (old_flags & SC_K_LOCK)
			old_flags = atomic_read(&sysc->flags);
		/* Note we don't care if the SC_DONE flag is getting set.  We
		 * just need to avoid clobbering flags */
	} while (!atomic_cas(&sysc->flags, old_flags, old_flags & ~SC_UEVENT));
}
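
/* Example (sketch of the intended pattern, assuming an ev_q that was set up
 * elsewhere and a 'sysc' that was issued asynchronously; a 2LS normally does
 * this for you in its blocking path):
 *
 *	if (!register_evq(sysc, ev_q)) {
 *		// The syscall completed before we could sign up: handle it
 *		// right here, no event will arrive for it.
 *		handle_completed_syscall(sysc);	// hypothetical helper
 *	} else {
 *		// The kernel will post an event (EV_SYSCALL, carrying the sysc
 *		// pointer) to ev_q when it completes.  If we later decide not
 *		// to wait, call deregister_evq(sysc) before reusing or freeing
 *		// the syscall struct.
 *	}
 */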

/* Actual Event Handling */

/* List of handler lists, process-wide.  They all must return (don't context
 * switch to a u_thread) */
struct ev_handler *ev_handlers[MAX_NR_EVENT] = {0};
spinpdrlock_t ev_h_wlock = SPINPDR_INITIALIZER;

int register_ev_handler(unsigned int ev_type, handle_event_t handler,
                        void *data)
{
	/* Nasty uthread code assumes this was malloced */
	struct ev_handler *new_h = malloc(sizeof(struct ev_handler));

	if (!new_h)
		return -1;
	new_h->func = handler;
	new_h->data = data;
	spin_pdr_lock(&ev_h_wlock);
	new_h->next = ev_handlers[ev_type];
	wmb();	/* make sure new_h is done before publishing to readers */
	ev_handlers[ev_type] = new_h;
	spin_pdr_unlock(&ev_h_wlock);
	return 0;
}
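
/* Example (usage sketch, not part of this file): handlers follow the
 * handle_event_t signature used by run_ev_handlers() below, and they must
 * return (no blocking, no uthread context switches).  The event type and the
 * per-handler state struct here are illustrative only.
 *
 *	static void handle_my_event(struct event_msg *ev_msg,
 *	                            unsigned int ev_type, void *data)
 *	{
 *		struct my_state *st = data;	// hypothetical per-handler state
 *
 *		// ev_msg may be a 'bit' with no payload, depending on the mbox
 *		printd("ev %d, arg2 0x%x\n", ev_type, ev_msg->ev_arg2);
 *	}
 *
 *	register_ev_handler(EV_USER_IPI, handle_my_event, &my_state);
 */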

int deregister_ev_handler(unsigned int ev_type, handle_event_t handler,
                          void *data)
{
	/* TODO: User-level RCU */
	printf("Failed to dereg handler, not supported yet!\n");
	return -1;
}

static void run_ev_handlers(unsigned int ev_type, struct event_msg *ev_msg)
{
	struct ev_handler *handler;

	/* TODO: RCU read lock */
	handler = ev_handlers[ev_type];
	while (handler) {
		handler->func(ev_msg, ev_type, handler->data);
		handler = handler->next;
	}
}

/* Attempts to extract a message from an mbox, copying it into ev_msg.
 * Returns TRUE on success. */
bool extract_one_mbox_msg(struct event_mbox *ev_mbox, struct event_msg *ev_msg)
{
	switch (ev_mbox->type) {
	case (EV_MBOX_UCQ):
		return get_ucq_msg(&ev_mbox->ucq, ev_msg);
	case (EV_MBOX_BITMAP):
		return get_evbitmap_msg(&ev_mbox->evbm, ev_msg);
	case (EV_MBOX_CEQ):
		return get_ceq_msg(&ev_mbox->ceq, ev_msg);
	default:
		printf("Unknown mbox type %d!\n", ev_mbox->type);
		return FALSE;
	}
}

/* Attempts to handle a message.  Returns 1 if we dequeued a msg, 0 o/w. */
int handle_one_mbox_msg(struct event_mbox *ev_mbox)
{
	struct event_msg local_msg;
	unsigned int ev_type;

	/* extract returns TRUE on success, we return 1. */
	if (!extract_one_mbox_msg(ev_mbox, &local_msg))
		return 0;
	ev_type = local_msg.ev_type;
	assert(ev_type < MAX_NR_EVENT);
	printd("[event] mbox %p, ev_type: %d\n", ev_mbox, ev_type);
	run_ev_handlers(ev_type, &local_msg);
	return 1;
}

/* Handle an mbox.  This is the receive-side processing of an event_queue.  It
 * takes an ev_mbox, since the vcpd mbox isn't a regular ev_q.  Returns 1 if we
 * handled something, 0 o/w. */
int handle_mbox(struct event_mbox *ev_mbox)
{
	int retval = 0;
	printd("[event] handling ev_mbox %p on vcore %d\n", ev_mbox,
	       vcore_id());
	/* Some stack-smashing bugs cause this to fail */
	assert(ev_mbox);
	/* Handle all full messages, tracking if we do at least one. */
	while (handle_one_mbox_msg(ev_mbox))
		retval = 1;
	return retval;
}

/* Empty if the mbox has no messages and no bits need to be checked */
bool mbox_is_empty(struct event_mbox *ev_mbox)
{
	switch (ev_mbox->type) {
	case (EV_MBOX_UCQ):
		return ucq_is_empty(&ev_mbox->ucq);
	case (EV_MBOX_BITMAP):
		return evbitmap_is_empty(&ev_mbox->evbm);
	case (EV_MBOX_CEQ):
		return ceq_is_empty(&ev_mbox->ceq);
	default:
		printf("Unknown mbox type %d!\n", ev_mbox->type);
		return FALSE;
	}
}

/* The EV_EVENT handler - extract the ev_q from the message. */
void handle_ev_ev(struct event_msg *ev_msg, unsigned int ev_type, void *data)
{
	struct event_queue *ev_q;

	/* EV_EVENT can't handle not having a message / being a bit.  If we got
	 * a bit message, it's a bug somewhere */
	assert(ev_msg);
	ev_q = ev_msg->ev_arg3;
	/* Same deal, a null ev_q is probably a bug, or someone being a jackass
	 */
	assert(ev_q);
	/* Clear pending, so we can start getting INDIRs and IPIs again.  We
	 * must set this before (compared to handle_events, then set it, then
	 * handle again), since there is no guarantee handle_event_q() will
	 * return.  If there is a pending preemption, the vcore quickly yields
	 * and will deal with the remaining events in the future - meaning it
	 * won't return to here. */
	ev_q->ev_alert_pending = FALSE;
	wmb();	/* don't let the pending write pass the signaling of an ev recv */
	handle_event_q(ev_q);
}

/* Handles VCPD events (public and private).  The kernel always sets
 * notif_pending after posting a message to either public or private mailbox.
 * When this returns, as far as we are concerned, notif_pending is FALSE.
 * However, a concurrent kernel writer could have reset it to true.  This is
 * fine; whenever we leave VC ctx we double check notif_pending.  Returns 1 or 2
 * if we actually handled a message, 0 o/w.
 *
 * WARNING: this might not return and/or current_uthread may change. */
int handle_events(uint32_t vcoreid)
{
	struct preempt_data *vcpd = vcpd_of(vcoreid);
	int retval = 0;

	vcpd->notif_pending = FALSE;
	wrmb();	/* prevent future reads from happening before notif_p write */
	retval += handle_mbox(&vcpd->ev_mbox_private);
	retval += handle_mbox(&vcpd->ev_mbox_public);
	return retval;
}
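
/* Example (sketch of where this gets called from; the real entry path lives in
 * the 2LS / uthread code and differs in its details):
 *
 *	void vcore_entry(void)
 *	{
 *		uint32_t vcoreid = vcore_id();
 *
 *		// Drain both vcpd mboxes; handlers run here in vcore context.
 *		handle_events(vcoreid);
 *		// Finish off any remote public mbox we were asked to poll.
 *		try_handle_remote_mbox();
 *		// ... then restart a uthread / ask the 2LS what to run.
 *	}
 */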

/* Handles the events on ev_q IAW the event_handlers[].  If the ev_q is
 * application specific, then this will dispatch/handle based on its flags. */
void handle_event_q(struct event_queue *ev_q)
{
	printd("[event] handling ev_q %p on vcore %d\n", ev_q, vcore_id());
	/* If the program wants to handle the ev_q on its own: */
	if (ev_q->ev_handler) {
		/* Remember this can't block or page fault */
		ev_q->ev_handler(ev_q);
		return;
	}
	/* Raw ev_qs that haven't been connected to an mbox, user bug: */
	assert(ev_q->ev_mbox);
	/* The "default" ev_handler, common enough that I don't want a func ptr
	 */
	handle_mbox(ev_q->ev_mbox);
}

/* Sends the calling vcore a message to its public mbox.  This is purposefully
 * limited to just the calling vcore, since in future versions, we can send via
 * ucqs directly (in many cases).  That will require the caller to be the
 * vcoreid, due to some preemption recovery issues (another ucq poller is
 * waiting on us when we got preempted, and we never up nr_cons). */
void send_self_vc_msg(struct event_msg *ev_msg)
{
	// TODO: try to use UCQs (requires additional support)
	/* ev_type actually gets ignored currently.  ev_msg is what matters if
	 * it is non-zero.  FALSE means it's going to the public mbox */
	sys_self_notify(vcore_id(), ev_msg->ev_type, ev_msg, FALSE);
}

/* Helper: makes the current core handle a remote vcore's VCPD public mbox
 * events.
 *
 * Both cases (whether we are handling someone else's already or not) use some
 * method of telling our future self what to do.  When we aren't already
 * handling it, we use TLS, and jump to vcore entry.  When we are already
 * handling, then we send a message to ourself, which we deal with when we
 * handle our own events (which is later in vcore entry).
 *
 * We need to reset the stack and deal with it in vcore entry to avoid recursing
 * deeply and running off the transition stack.  (handler calling handle event).
 *
 * Note that we might not be the one that gets the message we send.  If we pull
 * a sys_change_to, someone else might be polling our public message box.  All
 * we're doing is making sure that we don't forget to check rem_vcoreid's mbox.
 *
 * Finally, note that this function might not return.  However, it'll handle the
 * details related to vcpd mboxes, so you don't need to use the
 * ev_might_not_return() helpers with this. */
void handle_vcpd_mbox(uint32_t rem_vcoreid)
{
	uint32_t vcoreid = vcore_id();
	struct preempt_data *vcpd = vcpd_of(vcoreid);
	struct event_msg local_msg = {0};
	assert(vcoreid != rem_vcoreid);
	/* If they are empty, then we're done */
	if (mbox_is_empty(&vcpd_of(rem_vcoreid)->ev_mbox_public))
		return;
	if (__vc_handle_an_mbox) {
		/* we might be already handling them, in which case, abort */
		if (__vc_rem_vcoreid == rem_vcoreid)
			return;
		/* Already handling message for someone, need to send ourselves
		 * a message to check rem_vcoreid, which we'll process later. */
		local_msg.ev_type = EV_CHECK_MSGS;
		local_msg.ev_arg2 = rem_vcoreid;	/* 32bit arg */
		send_self_vc_msg(&local_msg);
		return;
	}
	/* No return after here */
	/* At this point, we aren't in the process of handling someone else's
	 * messages, so just tell our future self what to do */
	__vc_handle_an_mbox = TRUE;
	__vc_rem_vcoreid = rem_vcoreid;
	/* Reset the stack and start over in vcore context */
	set_stack_pointer((void*)vcpd->vcore_stack);
	vcore_entry();
	assert(0);
}

/* Handle remote vcpd public mboxes, if that's what we want to do.  Call this
 * from vcore entry, pairs with handle_vcpd_mbox(). */
void try_handle_remote_mbox(void)
{
	if (__vc_handle_an_mbox) {
		handle_mbox(&vcpd_of(__vc_rem_vcoreid)->ev_mbox_public);
		/* only clear the flag when we have returned from handling
		 * messages.  if an event handler (like preempt_recover)
		 * doesn't return, we'll clear this flag elsewhere.  (it's
		 * actually not a big deal if we don't). */
		cmb();
		__vc_handle_an_mbox = FALSE;
	}
}

/* Event handler helpers */

/* For event handlers that might not return, we need to call this before the
 * command that might not return.  If we were handling a remote vcore's
 * messages, it'll send ourselves a message that we (or someone who polls us)
 * will get, so that someone finishes off that vcore's messages.  Doesn't
 * matter who does, so long as someone does.
 *
 * This returns whether or not we were handling someone's messages.  Pass the
 * parameter to ev_we_returned() */
bool ev_might_not_return(void)
{
	struct event_msg local_msg = {0};
	bool were_handling_remotes = FALSE;
	if (__vc_handle_an_mbox) {
		/* slight chance we finished with their mbox (were on the last
		 * one) */
		if (!mbox_is_empty(&vcpd_of(__vc_rem_vcoreid)->ev_mbox_public))
		{
			/* But we aren't, so we'll need to send a message */
			local_msg.ev_type = EV_CHECK_MSGS;
			local_msg.ev_arg2 = __vc_rem_vcoreid;	/* 32bit arg */
			send_self_vc_msg(&local_msg);
		}
		/* Either way, we're not working on this one now.  Note this is
		 * more of an optimization - it'd be harmless (I think) to poll
		 * another vcore's pub mbox once when we pop up in vc_entry in
		 * the future */
		__vc_handle_an_mbox = FALSE;
		return TRUE;
	}
	return FALSE;
}

/* Call this when you return, paired up with ev_might_not_return().  If
 * ev_might_not_return() cleared __vc_handle_an_mbox, we'll turn it back on. */
void ev_we_returned(bool were_handling_remotes)
{
	if (were_handling_remotes)
		__vc_handle_an_mbox = TRUE;
}
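
/* Example (sketch of the intended pairing, from the point of view of an event
 * handler that might not return; the maybe-not-returning call here is just a
 * stand-in):
 *
 *	static void handle_foo(struct event_msg *ev_msg, unsigned int ev_type,
 *	                       void *data)
 *	{
 *		bool were_handling = ev_might_not_return();
 *
 *		maybe_change_to_another_vcore();	// hypothetical; may not return
 *		// Only reached if we did return:
 *		ev_we_returned(were_handling);
 *	}
 */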

/* Debugging */
void print_ev_msg(struct event_msg *msg)
{
	printf("MSG at %p\n", msg);
	printf("\ttype: %d\n", msg->ev_type);
	printf("\targ1 (16): 0x%4x\n", msg->ev_arg1);
	printf("\targ2 (32): 0x%8x\n", msg->ev_arg2);
	printf("\targ3 (ptr): %p\n", msg->ev_arg3);
	printf("\targ4 (64): 0x%16llx\n", (unsigned long long)msg->ev_arg4);
}

/* Uthreads blocking on event queues
 *
 * It'd be nice to have a uthread sleep until an event queue has some activity
 * (e.g. a new message).  It'd also be nice to wake up early with a timer.  It
 * is tempting to try something like an INDIR and have one evq multiplex two
 * others (the real event and an alarm).  But then you can't separate the two
 * streams; what if one thread sleeps on just the event at the same time?  What
 * if we want to support something like Go's select: a thread wants to block
 * until there is some activity on some channel?
 *
 * Ultimately, we want to allow M uthreads to block on possibly different
 * subsets of N event queues.
 *
 * Every uthread will have a sleep controller, and every event queue will have a
 * wakeup controller.  There are up to MxN linkage structures connecting these.
 *
 * We'll use the event_queue handler to override the default event processing.
 * This means the event queues that are used for blocking uthreads can *only* be
 * used for that; the regular event processing will not happen.  This is mostly
 * true.  It is possible to extract events from an evq's mbox concurrently.
 *
 * I briefly considered having one global lock to protect all of the lists and
 * structures.  That's lousy for the obvious scalability reason, but it seemed
 * like it'd make things easier, especially when I thought I needed locks in
 * both the ectlr and the uctlr (in early versions, I considered having the
 * handler yank itself out of the ectlr, copying a message into that struct, or
 * o/w needing protection).  On occasion, we run into the "I'd like to split my
 * lock between two components and still somehow synchronize" issue (e.g. FD
 * taps, with the FDT lock and the blocking/whatever that goes on in a device).
 * Whenever that comes up, we usually can get some help from other shared memory
 * techniques.  For FD taps, it's the kref.  For us, it's post-and-poke, though
 * it didn't solve all of our problems - I use it as a tool with some basic
 * shared memory signalling. */

struct evq_wait_link;
TAILQ_HEAD(wait_link_tailq, evq_wait_link);

/* Bookkeeping for the uthread sleeping on a bunch of event queues.
 *
 * Notes on concurrency: most fields are not protected.  check_evqs is racy, and
 * written to by handlers.  The tailq is only used by the uthread.  blocked is
 * never concurrently *written*; see __uth_wakeup_poke() for details. */
struct uth_sleep_ctlr {
	struct uthread			*uth;
	struct spin_pdr_lock		in_use;
	bool				check_evqs;
	bool				blocked;
	struct poke_tracker		poker;
	struct wait_link_tailq		evqs;
};

/* Attaches to an event_queue (ev_udata), tracks the uthreads for this evq */
struct evq_wakeup_ctlr {
	/* If we ever use a sync_obj, that would replace waiters.  But also
	 * note that we want a pointer to something other than the uthread,
	 * and currently we also wake all threads - there's no scheduling
	 * decision. */
	struct wait_link_tailq		waiters;
	struct spin_pdr_lock		lock;
};

/* Up to MxN of these, N of them per uthread. */
struct evq_wait_link {
	struct uth_sleep_ctlr		*uth_ctlr;
	TAILQ_ENTRY(evq_wait_link)	link_uth;
	struct evq_wakeup_ctlr		*evq_ctlr;
	TAILQ_ENTRY(evq_wait_link)	link_evq;
};

/* Poke function: ensures the uth managed by uctlr wakes up.  poke() ensures
 * there is only one thread in this function at a time.  However, it could be
 * called spuriously, which is why we check 'blocked.' */
static void __uth_wakeup_poke(void *arg)
{
	struct uth_sleep_ctlr *uctlr = arg;

	/* There are no concurrent writes to 'blocked'.  Blocked is only ever
	 * written when the uth sleeps and only ever cleared here.  Once the uth
	 * writes it, it does not write it again until after we clear it.
	 *
	 * This is still racy - we could see !blocked, then blocked gets set.
	 * In that case, the poke failed, and that is harmless.  The uth will
	 * see 'check_evqs', which was set before poke, which would be before
	 * writing blocked, and the uth checks 'check_evqs' after writing. */
	if (uctlr->blocked) {
		uctlr->blocked = FALSE;
		cmb();	/* clear blocked before starting the uth */
		uthread_runnable(uctlr->uth);
	}
}

static void uth_sleep_ctlr_init(struct uth_sleep_ctlr *uctlr,
                                struct uthread *uth)
{
	uctlr->uth = uth;
	spin_pdr_init(&uctlr->in_use);
	uctlr->check_evqs = FALSE;
	uctlr->blocked = FALSE;
	poke_init(&uctlr->poker, __uth_wakeup_poke);
	TAILQ_INIT(&uctlr->evqs);
}

/* This handler runs when the ev_q is checked.  Instead of doing anything with
 * the ev_q, we make sure that every uthread that was waiting on us wakes up.
 * The uthreads could be waiting on several evqs, so there could be multiple
 * independent wake-up attempts, hence the poke.  Likewise, the uthread could be
 * awake when we poke.  The uthread will check check_evqs after sleeping, in
 * case we poke before it blocks (and the poke fails).
 *
 * Also, there could be concurrent callers of this handler, and other uthreads
 * signing up for a wakeup. */
void evq_wakeup_handler(struct event_queue *ev_q)
{
	struct evq_wakeup_ctlr *ectlr = ev_q->ev_udata;
	struct evq_wait_link *i;

	assert(ectlr);
	spin_pdr_lock(&ectlr->lock);
	/* Note we wake up all sleepers, even though only one is likely to get
	 * the message.  See the notes in unlink_ectlr() for more info. */
	TAILQ_FOREACH(i, &ectlr->waiters, link_evq) {
		i->uth_ctlr->check_evqs = TRUE;
		cmb();	/* order check write before poke (poke has atomic) */
		poke(&i->uth_ctlr->poker, i->uth_ctlr);
	}
	spin_pdr_unlock(&ectlr->lock);
}

/* Helper, attaches a wakeup controller to the event queue. */
void evq_attach_wakeup_ctlr(struct event_queue *ev_q)
{
	struct evq_wakeup_ctlr *ectlr = malloc(sizeof(struct evq_wakeup_ctlr));

	memset(ectlr, 0, sizeof(struct evq_wakeup_ctlr));
	spin_pdr_init(&ectlr->lock);
	TAILQ_INIT(&ectlr->waiters);
	ev_q->ev_udata = ectlr;
	ev_q->ev_handler = evq_wakeup_handler;
}

void evq_remove_wakeup_ctlr(struct event_queue *ev_q)
{
	free(ev_q->ev_udata);
	ev_q->ev_udata = 0;
	ev_q->ev_handler = 0;
}

static void link_uctlr_ectlr(struct uth_sleep_ctlr *uctlr,
                             struct evq_wakeup_ctlr *ectlr,
                             struct evq_wait_link *link)
{
	/* No lock needed for the uctlr; we're the only one modifying evqs */
	link->uth_ctlr = uctlr;
	TAILQ_INSERT_HEAD(&uctlr->evqs, link, link_uth);
	/* Once we add ourselves to the ectlr list, we could start getting
	 * poked */
	link->evq_ctlr = ectlr;
	spin_pdr_lock(&ectlr->lock);
	TAILQ_INSERT_HEAD(&ectlr->waiters, link, link_evq);
	spin_pdr_unlock(&ectlr->lock);
}

/* Disconnects us from a wakeup controller.
 *
 * Our evq handlers wake up *all* uthreads that are waiting for activity
 * (broadcast).  It's a tradeoff.  If the list of uthreads is long, then it is
 * wasted effort.  An alternative is to wake up exactly one, with slightly
 * greater overheads.  In the exactly-one case, multiple handlers could wake
 * this uth up at once, but we can only extract one message.  If we do the
 * single wake up, then when we detach from an ectlr, we need to peek in the
 * mbox to see if it is not empty, and conditionally run its handler again, such
 * that no uthread sits on an ectlr that has activity/pending messages (in
 * essence, level triggered). */
static void unlink_ectlr(struct evq_wait_link *link)
{
	struct evq_wakeup_ctlr *ectlr = link->evq_ctlr;

	spin_pdr_lock(&ectlr->lock);
	TAILQ_REMOVE(&ectlr->waiters, link, link_evq);
	spin_pdr_unlock(&ectlr->lock);
}

/* Helper: polls all evqs once and extracts the first message available.  The
 * message is copied into ev_msg, and the evq with the activity is copied into
 * which_evq (if it is non-zero).  Returns TRUE on success. */
static bool extract_evqs_msg(struct event_queue *evqs[], size_t nr_evqs,
                             struct event_msg *ev_msg,
                             struct event_queue **which_evq)
{
	struct event_queue *evq_i;
	bool ret = FALSE;

	/* We need to have notifs disabled when extracting messages from some
	 * mboxes.  Many mboxes have some form of busy waiting between
	 * consumers (userspace).  If we're just a uthread, we could wind up on
	 * a runqueue somewhere while someone else spins, possibly in VC ctx. */
	uth_disable_notifs();
	for (int i = 0; i < nr_evqs; i++) {
		evq_i = evqs[i];
		if (extract_one_mbox_msg(evq_i->ev_mbox, ev_msg)) {
			if (which_evq)
				*which_evq = evq_i;
			ret = TRUE;
			break;
		}
	}
	uth_enable_notifs();
	return ret;
}

/* Yield callback */
static void __uth_blockon_evq_cb(struct uthread *uth, void *arg)
{
	struct uth_sleep_ctlr *uctlr = arg;

	uthread_has_blocked(uth, UTH_EXT_BLK_EVENTQ);
	cmb();	/* actually block before saying 'blocked' */
	uctlr->blocked = TRUE;	/* can be woken up now */
	wrmb();	/* write 'blocked' before read 'check_evqs' */
	/* If someone set check_evqs, we should wake up.  We're competing with
	 * other wakers via poke (we may have already woken up!). */
	if (uctlr->check_evqs)
		poke(&uctlr->poker, uctlr);
	/* Once we say we're blocked, we could be woken up (possibly by our
	 * poke here) and the uthread could run on another core.  Holding this
	 * lock prevents the uthread from quickly returning and freeing the
	 * memory of uctlr before we have a chance to check_evqs or poke. */
	spin_pdr_unlock(&uctlr->in_use);
}

/* Direct version, with *evqs[]. */
void uth_blockon_evqs_arr(struct event_msg *ev_msg,
                          struct event_queue **which_evq,
                          struct event_queue *evqs[], size_t nr_evqs)
{
	struct uth_sleep_ctlr uctlr;
	struct evq_wait_link linkage[nr_evqs];

	/* Catch user mistakes.  If they lack a handler, they didn't attach.
	 * They are probably using our evq_wakeup_handler, but they might have
	 * their own wrapper function. */
	for (int i = 0; i < nr_evqs; i++)
		assert(evqs[i]->ev_handler);
	/* Check for activity on the evqs before going through the hassle of
	 * sleeping.  ("check, signal, check again" pattern). */
	if (extract_evqs_msg(evqs, nr_evqs, ev_msg, which_evq))
		return;
	uth_sleep_ctlr_init(&uctlr, current_uthread);
	memset(linkage, 0, sizeof(struct evq_wait_link) * nr_evqs);
	for (int i = 0; i < nr_evqs; i++)
		link_uctlr_ectlr(&uctlr,
				 (struct evq_wakeup_ctlr*)evqs[i]->ev_udata,
				 &linkage[i]);
	/* Mesa-style sleep until we get a message.  Mesa helps a bit here,
	 * since we can just deregister from them all when we're done.  o/w it
	 * is tempting to have us deregister from *the* one in the handler and
	 * extract the message there; which can be tricky and harder to reason
	 * about. */
	while (1) {
		/* We need to make sure only one 'version/ctx' of this thread
		 * is active at a time.  Later on, we'll unlock in vcore ctx on
		 * the other side of a yield.  We could restart from the yield,
		 * return, and free the uctlr before that ctx has a chance to
		 * finish. */
		spin_pdr_lock(&uctlr.in_use);
		/* We're signed up.  We might already have been told to check
		 * the evqs, or there could be messages still sitting in the
		 * evqs.  check_evqs is only ever cleared here, and only ever
		 * set in evq handlers. */
		uctlr.check_evqs = FALSE;
		cmb();	/* look for messages after clearing check_evqs */
		if (extract_evqs_msg(evqs, nr_evqs, ev_msg, which_evq))
			break;
		uthread_yield(TRUE, __uth_blockon_evq_cb, &uctlr);
	}
	/* On the one hand, it's not necessary to unlock, since the memory will
	 * be freed.  But we do need to go through the process to turn on
	 * notifs and adjust the notif_disabled_depth for the case where we
	 * don't yield. */
	spin_pdr_unlock(&uctlr.in_use);
	for (int i = 0; i < nr_evqs; i++)
		unlink_ectlr(&linkage[i]);
}

/* ... are event_queue *s, nr_evqs of them.  This will block until it can
 * extract some message from one of evqs.  The message will be placed in
 * ev_msg, and the particular evq it extracted it from will be placed in
 * which_evq, if which_evq is non-zero. */
void uth_blockon_evqs(struct event_msg *ev_msg, struct event_queue **which_evq,
                      size_t nr_evqs, ...)
{
	struct event_queue *evqs[nr_evqs];
	va_list va;

	va_start(va, nr_evqs);
	for (int i = 0; i < nr_evqs; i++)
		evqs[i] = va_arg(va, struct event_queue *);
	va_end(va);
	uth_blockon_evqs_arr(ev_msg, which_evq, evqs, nr_evqs);
}
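
/* Example (usage sketch, not part of this file): to block a uthread on one or
 * more event queues, each evq must have a wakeup controller attached (which
 * overrides its ev_handler).  The data/alarm split and the two helpers at the
 * end are illustrative only.
 *
 *	struct event_queue *evq_data = get_eventq(EV_MBOX_UCQ);
 *	struct event_queue *evq_alarm = get_eventq(EV_MBOX_UCQ);
 *	struct event_msg msg;
 *	struct event_queue *which;
 *
 *	evq_attach_wakeup_ctlr(evq_data);
 *	evq_attach_wakeup_ctlr(evq_alarm);
 *	// ... register evq_data / evq_alarm with whatever will post events ...
 *	uth_blockon_evqs(&msg, &which, 2, evq_data, evq_alarm);
 *	if (which == evq_alarm)
 *		handle_timeout();	// hypothetical
 *	else
 *		handle_data(&msg);	// hypothetical
 */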

/* ... are event_queue *s, nr_evqs of them.  This will attempt to extract some
 * message from one of evqs.  The message will be placed in ev_msg, and the
 * particular evq it extracted it from will be placed in which_evq.  Returns
 * TRUE if it extracted a message. */
bool uth_check_evqs(struct event_msg *ev_msg, struct event_queue **which_evq,
                    size_t nr_evqs, ...)
{
	struct event_queue *evqs[nr_evqs];
	va_list va;

	va_start(va, nr_evqs);
	for (int i = 0; i < nr_evqs; i++)
		evqs[i] = va_arg(va, struct event_queue *);
	va_end(va);
	return extract_evqs_msg(evqs, nr_evqs, ev_msg, which_evq);
}