/* Copyright (c) 2016 Google Inc.
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * 2LS for virtual machines */

#include <vmm/sched.h>
#include <vmm/vmm.h>
#include <vmm/vthread.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <assert.h>
#include <parlib/spinlock.h>
#include <parlib/event.h>
#include <parlib/ucq.h>
#include <parlib/arch/trap.h>
#include <parlib/ros_debug.h>
#include <parlib/vcore_tick.h>
#include <parlib/slab.h>

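/* Period of the vcore preemption tick, armed when we have fewer vcores than we
 * want; see try_to_get_vcores() and sched_pick_thread_nice(). */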
int vmm_sched_period_usec = 1000;

/* For now, we only have one VM managed by the 2LS.  If we ever expand that,
 * we'll need something analogous to current_uthread, so the 2LS knows which VM
 * it is working on. */
static struct virtual_machine *current_vm;

static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
/* Runnable queues, broken up by thread type. */
static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
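/* In greedy mode, guests don't sit on a queue.  Slot i holds the runnable
 * guest (or its ctlr buddy) for gpcore i; enqueue_vmm_thread() writes it and
 * the vcore that owns the slot consumes it.  See the race rules in
 * sched_pick_thread_greedy(). */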
static struct vmm_thread **greedy_rnbl_guests;
/* Counts of *unblocked* threads.  Unblocked = Running + Runnable. */
static atomic_t nr_unblk_tasks;
static atomic_t nr_unblk_guests;
/* Global evq for all syscalls.  Could make this per vcore or whatever. */
static struct event_queue *sysc_evq;
static struct kmem_cache *task_thread_cache;

static void vmm_sched_init(void);
static void vmm_sched_entry(void);
static void vmm_thread_runnable(struct uthread *uth);
static void vmm_thread_paused(struct uthread *uth);
static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
static void vmm_thread_has_blocked(struct uthread *uth, int flags);
static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx);
static void vmm_thread_exited(struct uthread *uth);
static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg);

struct schedule_ops vmm_sched_ops = {
        .sched_init = vmm_sched_init,
        .sched_entry = vmm_sched_entry,
        .thread_runnable = vmm_thread_runnable,
        .thread_paused = vmm_thread_paused,
        .thread_blockon_sysc = vmm_thread_blockon_sysc,
        .thread_has_blocked = vmm_thread_has_blocked,
        .thread_refl_fault = vmm_thread_refl_fault,
        .thread_exited = vmm_thread_exited,
        .thread_create = vmm_thread_create,
};

struct schedule_ops *sched_ops = &vmm_sched_ops;

/* Helpers */
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data);
static void acct_thread_blocked(struct vmm_thread *vth);
static void acct_thread_unblocked(struct vmm_thread *vth);
static void enqueue_vmm_thread(struct vmm_thread *vth);
static int task_thread_ctor(void *obj, void *priv, int flags);
static void task_thread_dtor(void *obj, void *priv);
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                           int type);
static void *__alloc_stack(size_t stacksize);
static void __free_stack(void *stacktop, size_t stacksize);

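/* The 2LS runs in one of two modes.  In the default "nice" mode, we request
 * one vcore per unblocked thread and yield cores we don't need.  In "greedy"
 * mode (parlib's never-yield option), we grab one vcore per guest pcore plus
 * vcore 0 for task threads, keep them all, and halt instead of yielding. */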
static bool sched_is_greedy(void)
{
        return parlib_never_yield;
}

static unsigned int sched_nr_greedy_cores(void)
{
        if (!current_vm)
                return 1;
        return current_vm->nr_gpcs + 1;
}

static void restart_thread(struct syscall *sysc)
{
        struct uthread *ut_restartee = (struct uthread*)sysc->u_data;

        /* uthread stuff here: */
        assert(ut_restartee);
        assert(ut_restartee->sysc == sysc);     /* set in uthread.c */
        ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */
        vmm_thread_runnable(ut_restartee);
}

static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data)
{
        struct syscall *sysc;

        /* I think we can make this assert now.  If not, check pthread.c.  (The
         * concern was old ev_qs firing and running this handler.) */
        assert(ev_msg);
        sysc = ev_msg->ev_arg3;
        assert(sysc);
        restart_thread(sysc);
}

/* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
 * attempt to route the notifs/IPIs to vcoreid. */
static struct event_queue *setup_sysc_evq(int vcoreid)
{
        struct event_queue *evq;
        uintptr_t mmap_block;

        mmap_block = (uintptr_t)mmap(0, PGSIZE * 2,
                                     PROT_WRITE | PROT_READ,
                                     MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE,
                                     -1, 0);
        evq = get_eventq_raw();
        /* mmap returns MAP_FAILED (not 0) on error, so check for that. */
        assert(mmap_block != (uintptr_t)MAP_FAILED);
        assert(evq);
        evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
        evq->ev_vcore = vcoreid;
        evq->ev_mbox->type = EV_MBOX_UCQ;
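        /* The two pages mmapped above seed the UCQ; ucq_init_raw() takes the
         * pair (roughly: one page in use, one spare). */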
        ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
        return evq;
}

static void vmm_sched_init(void)
{
        struct task_thread *thread0;

        /* Note that thread0 doesn't belong to a VM.  We can set this during
         * vmm_init() if we need to. */
        thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
        assert(thread0);
        acct_thread_unblocked((struct vmm_thread*)thread0);
        thread0->stacksize = USTACK_NUM_PAGES * PGSIZE;
        thread0->stacktop = (void*)USTACKTOP;
        /* for lack of a better vcore, might as well send to 0 */
        sysc_evq = setup_sysc_evq(0);
        uthread_2ls_init((struct uthread*)thread0, vmm_handle_syscall, NULL);
        task_thread_cache = kmem_cache_create("task threads",
                                              sizeof(struct vmm_thread),
                                              __alignof__(struct vmm_thread), 0,
                                              task_thread_ctor,
                                              task_thread_dtor, NULL);
}

/* The scheduling policy is encapsulated in the next few functions (from here
 * down to sched_entry()). */

static int desired_nr_vcores(void)
{
        /* Sanity checks on our accounting. */
        assert(atomic_read(&nr_unblk_guests) >= 0);
        assert(atomic_read(&nr_unblk_tasks) >= 0);
        /* Lockless peek.  This is always an estimate.  Some of our tasks
         * busy-wait, so it's not enough to just give us one vcore for all
         * tasks, yet. */
        return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks);
}

static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
{
        struct vmm_thread *vth;

        vth = TAILQ_FIRST(tq);
        if (vth)
                TAILQ_REMOVE(tq, vth, tq_next);
        return vth;
}

static struct vmm_thread *pick_a_thread_degraded(void)
{
        struct vmm_thread *vth;

        spin_pdr_lock(&queue_lock);
        vth = __pop_first(&rnbl_tasks);
        if (!vth)
                vth = __pop_first(&rnbl_guests);
        spin_pdr_unlock(&queue_lock);
        return vth;
}

/* We have plenty of cores - run whatever we want.  We'll prioritize tasks. */
static struct vmm_thread *pick_a_thread_plenty(void)
{
        struct vmm_thread *vth = 0;

        spin_pdr_lock(&queue_lock);
        if (!vth)
                vth = __pop_first(&rnbl_tasks);
        if (!vth)
                vth = __pop_first(&rnbl_guests);
        spin_pdr_unlock(&queue_lock);
        return vth;
}

static void yield_current_uth(void)
{
        struct vmm_thread *vth;

        if (!current_uthread)
                return;
        vth = (struct vmm_thread*)stop_current_uthread();
        enqueue_vmm_thread(vth);
}

/* Helper, tries to get the right number of vcores.  Returns TRUE if we think
 * we have enough, FALSE otherwise.
 *
 * TODO: this doesn't handle a lot of issues, like preemption, how to
 * run/yield our vcores, dynamic changes in the number of runnables, where
 * to send events, how to avoid interfering with gpcs, etc. */
static bool try_to_get_vcores(void)
{
        int nr_vcores_wanted;
        bool have_enough;

        if (sched_is_greedy())
                return num_vcores() == sched_nr_greedy_cores();
        nr_vcores_wanted = desired_nr_vcores();
        have_enough = nr_vcores_wanted <= num_vcores();
        if (have_enough) {
                vcore_tick_disable();
                return TRUE;
        }
        vcore_tick_enable(vmm_sched_period_usec);
        vcore_request_total(nr_vcores_wanted);
        return FALSE;
}
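
/* When we're short on vcores, the tick armed above forces a reschedule every
 * vmm_sched_period_usec, so runnable threads time-share whatever cores we did
 * get (see the vcore_tick_poll() check in sched_pick_thread_nice()). */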

static void stats_run_vth(struct vmm_thread *vth)
{
        vth->nr_runs++;
        if (vth->prev_vcoreid != vcore_id()) {
                vth->prev_vcoreid = vcore_id();
                vth->nr_resched++;
        }
}

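/* Greedy-mode core layout: vcore 0 runs task threads; vcore i (1 <= i <=
 * nr_gpcs) runs the guest/ctlr pair for gpcore i - 1. */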
/* TODO: This assumes we get all of our vcores. */
static struct vmm_thread *sched_pick_thread_greedy(void)
{
        struct vmm_thread *vth;

        if (current_uthread) {
                stats_run_vth((struct vmm_thread*)current_uthread);
                run_current_uthread();
        }
        if (vcore_id() == 0) {
                spin_pdr_lock(&queue_lock);
                vth = __pop_first(&rnbl_tasks);
                spin_pdr_unlock(&queue_lock);
                return vth;
        }
        /* This races with enqueue_vmm_thread, which can run on another core.
         * Here are the rules:
         * - set when runnable (race free, only one state for the thread at a
         *   time)
         * - cleared when we run it (race free, we're the only runners)
         * - if we take an interrupt, we'll just run_current_uthread and not
         *   check
         * - if we vmexit, we'll run the buddy directly */
        assert(vcore_id() <= current_vm->nr_gpcs);
        vth = greedy_rnbl_guests[vcore_id() - 1];
        if (vth)
                greedy_rnbl_guests[vcore_id() - 1] = NULL;
        return vth;
}

static struct vmm_thread *sched_pick_thread_nice(void)
{
        struct vmm_thread *vth;
        bool have_enough;

        have_enough = try_to_get_vcores();
        if (!have_enough && vcore_tick_poll()) {
                /* slightly less than ideal: we grab the queue lock twice */
                yield_current_uth();
        }
        if (current_uthread) {
                stats_run_vth((struct vmm_thread*)current_uthread);
                run_current_uthread();
        }
        if (have_enough)
                vth = pick_a_thread_plenty();
        else
                vth = pick_a_thread_degraded();
        return vth;
}

static void __attribute__((noreturn)) vmm_sched_entry(void)
{
        struct vmm_thread *vth;

        if (sched_is_greedy()) {
                vth = sched_pick_thread_greedy();
                if (!vth) {
                        /* sys_halt_core will return, but we need to restart
                         * the vcore.  We might have woken due to an event, and
                         * we'll need to handle_events and do the other things
                         * dealt with by uthread code. */
                        if (vcore_id() == 0)
                                sys_halt_core(0);
                        /* In greedy mode, yield will abort and we'll just
                         * restart the vcore. */
                        vcore_yield_or_restart();
                }
        } else {
                vth = sched_pick_thread_nice();
                if (!vth)
                        vcore_yield_or_restart();
        }
        stats_run_vth(vth);
        run_uthread((struct uthread*)vth);
}

static void vmm_thread_runnable(struct uthread *uth)
{
        /* A thread that was blocked is now runnable.  This counts as becoming
         * unblocked (running + runnable). */
        acct_thread_unblocked((struct vmm_thread*)uth);
        enqueue_vmm_thread((struct vmm_thread*)uth);
}

static void vmm_thread_paused(struct uthread *uth)
{
        /* The thread stopped for some reason, usually a preemption.  We'd like
         * to just run it whenever we get a chance.  Note that it didn't become
         * 'blocked' - it's still runnable. */
        enqueue_vmm_thread((struct vmm_thread*)uth);
}

static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
{
        struct syscall *sysc = (struct syscall*)syscall;

        acct_thread_blocked((struct vmm_thread*)uth);
        sysc->u_data = uth;
        if (!register_evq(sysc, sysc_evq)) {
                /* Lost the race with the call being done.  The kernel won't
                 * send the event.  Just restart it. */
                restart_thread(sysc);
        }
        /* GIANT WARNING: do not touch the thread after this point. */
}

static void vmm_thread_has_blocked(struct uthread *uth, int flags)
{
        /* The thread blocked on something like a mutex.  It's not runnable, so
         * we don't need to put it on a list, but we do need to account for it
         * not running.  We'll find out (via thread_runnable) when it starts up
         * again. */
        acct_thread_blocked((struct vmm_thread*)uth);
}

static void refl_error(struct uthread *uth, unsigned int trap_nr,
                       unsigned int err, unsigned long aux)
{
        printf("Thread has unhandled fault: %d, err: %d, aux: %p\n",
               trap_nr, err, (void*)aux);
        /* Note that uthread.c already copied out our ctx into the uth
         * struct. */
        print_user_context(&uth->u_ctx);
        printf("Turn on printx to spew unhandled, malignant trap info\n");
        exit(-1);
}

static bool handle_page_fault(struct uthread *uth, unsigned int err,
                              unsigned long aux)
{
        if (!(err & PF_VMR_BACKED))
                return FALSE;
        syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
        __block_uthread_on_async_sysc(uth);
        return TRUE;
}

static void vmm_thread_refl_hw_fault(struct uthread *uth,
                                     unsigned int trap_nr,
                                     unsigned int err, unsigned long aux)
{
        switch (trap_nr) {
        case HW_TRAP_PAGE_FAULT:
                if (!handle_page_fault(uth, err, aux))
                        refl_error(uth, trap_nr, err, aux);
                break;
        default:
                refl_error(uth, trap_nr, err, aux);
        }
}

/* Yield callback for __ctlr_entry */
static void __swap_to_gth(struct uthread *uth, void *dummy)
{
        struct ctlr_thread *cth = (struct ctlr_thread*)uth;

        /* We just immediately run our buddy.  The ctlr and the guest are
         * accounted together ("pass the token" back and forth). */
        current_uthread = NULL;
        stats_run_vth((struct vmm_thread*)cth->buddy);
        run_uthread((struct uthread*)cth->buddy);
        assert(0);
}

/* All ctlr threads start here, each time their guest has a fault.  They can
 * block and unblock along the way.  Once a ctlr does its final uthread_yield,
 * the next time it will start again from the top. */
static void __ctlr_entry(void)
{
        struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
        struct virtual_machine *vm = gth_to_vm(cth->buddy);

        if (!handle_vmexit(cth->buddy)) {
                struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);

                fprintf(stderr, "vmm: handle_vmexit returned false\n");
                fprintf(stderr, "Note: this may be a kernel module, not the kernel\n");
                fprintf(stderr, "RSP was %p, ", (void *)vm_tf->tf_rsp);
                fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
                /* TODO: properly walk the kernel page tables to map the tf_rip
                 * to a physical address.  For now, however, this hack is good
                 * enough. */
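                /* (Keeping the low 30 bits of RIP assumes the guest kernel
                 * text lives within a 1 GiB linear mapping - an assumption,
                 * per the TODO above, not a guarantee.) */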
                hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
                showstatus(stderr, cth->buddy);
                exit(0);
        }
        /* We want to atomically yield and start/reenqueue our buddy.  We do
         * so in vcore context on the other side of the yield. */
        uthread_yield(FALSE, __swap_to_gth, 0);
}

static void vmm_thread_refl_vm_fault(struct uthread *uth)
{
        struct guest_thread *gth = (struct guest_thread*)uth;
        struct ctlr_thread *cth = gth->buddy;

        gth->nr_vmexits++;
        /* The ctlr starts from the top every time we get a new fault. */
        cth->uthread.flags |= UTHREAD_SAVED;
        init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
                      (uintptr_t)(cth->stacktop));
        /* We just immediately run our buddy.  The ctlr and the guest are
         * accounted together ("pass the token" back and forth). */
        current_uthread = NULL;
        stats_run_vth((struct vmm_thread*)cth);
        run_uthread((struct uthread*)cth);
        assert(0);
}

static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx)
{
        switch (ctx->type) {
        case ROS_HW_CTX:
                /* Guests should only ever VM exit */
                assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
                vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
                                         __arch_refl_get_err(ctx),
                                         __arch_refl_get_aux(ctx));
                break;
        case ROS_VM_CTX:
                vmm_thread_refl_vm_fault(uth);
                break;
        default:
                assert(0);
        }
}

static void task_thread_dtor(void *obj, void *priv)
{
        struct task_thread *tth = (struct task_thread*)obj;

        uthread_cleanup((struct uthread*)tth);
        __free_stack(tth->stacktop, tth->stacksize);
}

static void task_thread_exit(struct task_thread *tth)
{
        struct uthread *uth = (struct uthread*)tth;

        if (uth->flags & UTHREAD_IS_THREAD0)
                return;
        kmem_cache_free(task_thread_cache, tth);
}

static void ctlr_thread_exit(struct ctlr_thread *cth)
{
        __vthread_exited((struct vthread*)cth->buddy);
}

static void vmm_thread_exited(struct uthread *uth)
{
        struct vmm_thread *vth = (struct vmm_thread*)uth;

        assert(vth->type != VMM_THREAD_GUEST);

        acct_thread_blocked(vth);
        switch (vth->type) {
        case VMM_THREAD_TASK:
                task_thread_exit((struct task_thread*)uth);
                break;
        case VMM_THREAD_CTLR:
                ctlr_thread_exit((struct ctlr_thread*)uth);
                break;
        case VMM_THREAD_GUEST:
                panic("Guest threads shouldn't be able to exit");
        }
}

static void destroy_guest_thread(struct guest_thread *gth)
{
        struct ctlr_thread *cth = gth->buddy;

        __free_stack(cth->stacktop, cth->stacksize);
        uthread_cleanup((struct uthread*)cth);
        free(cth);
        uthread_cleanup((struct uthread*)gth);
        free(gth);
}

struct guest_thread *create_guest_thread(struct virtual_machine *vm,
                                         unsigned int gpcoreid,
                                         struct vmm_gpcore_init *gpci)
{
        struct guest_thread *gth;
        struct ctlr_thread *cth;
        /* Guests won't use TLS; they always operate in Ring V.  The controller
         * might - not because of anything we do, but because of glibc calls. */
        struct uth_thread_attr gth_attr = {.want_tls = FALSE};
        struct uth_thread_attr cth_attr = {.want_tls = TRUE};

        gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
        cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
        if (!gth || !cth) {
                free(gth);
                free(cth);
                return 0;
        }
        gth->buddy = cth;
        cth->buddy = gth;
        gth->gpc_id = gpcoreid;
        gth->gpci = *gpci;
        cth->stacksize = VMM_THR_STACKSIZE;
        cth->stacktop = __alloc_stack(cth->stacksize);
        if (!cth->stacktop) {
                free(gth);
                free(cth);
                return 0;
        }
        gth->uthread.u_ctx.type = ROS_VM_CTX;
        gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
        uthread_init((struct uthread*)gth, &gth_attr);
        uthread_init((struct uthread*)cth, &cth_attr);
        gth->halt_mtx = uth_mutex_alloc();
        gth->halt_cv = uth_cond_var_alloc();
        return gth;
}

static void ev_handle_diag(struct event_msg *ev_msg, unsigned int ev_type,
                           void *data)
{
        struct virtual_machine *vm = current_vm;
        struct guest_thread *gth;
        struct ctlr_thread *cth;
        bool reset = FALSE;

        if (ev_msg && (ev_msg->ev_arg1 == 1))
                reset = TRUE;

        fprintf(stderr, "\nSCHED stats:\n---------------\n");
        for (int i = 0; i < vm->nr_gpcs; i++) {
                gth = gpcid_to_gth(vm, i);
                cth = gth->buddy;
                fprintf(stderr, "\tGPC %2d: %lu resched, %lu gth runs, %lu ctl runs, %lu user-handled vmexits\n",
                        i,
                        ((struct vmm_thread*)gth)->nr_resched,
                        ((struct vmm_thread*)gth)->nr_runs,
                        ((struct vmm_thread*)cth)->nr_runs,
                        gth->nr_vmexits);
                if (reset) {
                        ((struct vmm_thread*)gth)->nr_resched = 0;
                        ((struct vmm_thread*)gth)->nr_runs = 0;
                        ((struct vmm_thread*)cth)->nr_runs = 0;
                        gth->nr_vmexits = 0;
                }
        }
        fprintf(stderr, "\n\tNr unblocked gpc %lu, Nr unblocked tasks %lu\n",
                atomic_read(&nr_unblk_guests), atomic_read(&nr_unblk_tasks));
}
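
/* The diag handler above is wired to EV_FREE_APPLE_PIE in vmm_init(); sending
 * that event with ev_arg1 == 1 prints and also resets the counters. */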

int vmm_init(struct virtual_machine *vm, struct vmm_gpcore_init *gpcis,
             int flags)
{
        struct guest_thread **gths;

        if (current_vm)
                return -1;
        current_vm = vm;
        /* We should tell the kernel to create all of the GPCs we'll need in
         * advance.
         *
         * We could create the others on the fly, but the kernel's answer for
         * CPUID[0x1] will not have the total number of cores.  If we move that
         * handler to userspace, we can create the SMP-booted GPCs on the fly.
         *
         * We'd also have to deal with gths[] growing dynamically, which would
         * require synchronization. */
        if (syscall(SYS_vmm_add_gpcs, vm->nr_gpcs, gpcis) != vm->nr_gpcs)
                return -1;
        if (flags) {
                if (syscall(SYS_vmm_ctl, VMM_CTL_SET_FLAGS, flags))
                        return -1;
        }
        gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
        if (!gths)
                return -1;
        for (int i = 0; i < vm->nr_gpcs; i++) {
                gths[i] = create_guest_thread(vm, i, &gpcis[i]);
                if (!gths[i]) {
                        for (int j = 0; j < i; j++)
                                destroy_guest_thread(gths[j]);
                        free(gths);
                        return -1;
                }
        }
        wmb(); /* All gths posted before advertising. */
        vm->__gths = gths;
        uthread_mcp_init();
        register_ev_handler(EV_FREE_APPLE_PIE, ev_handle_diag, NULL);
        if (sched_is_greedy()) {
                greedy_rnbl_guests = calloc(vm->nr_gpcs,
                                            sizeof(struct vmm_thread *));
                assert(greedy_rnbl_guests);
                vcore_request_total(sched_nr_greedy_cores());
                syscall(SYS_vmm_ctl, VMM_CTL_SET_EXITS,
                        syscall(SYS_vmm_ctl, VMM_CTL_GET_EXITS) &
                        ~(VMM_CTL_EXIT_HALT | VMM_CTL_EXIT_MWAIT));
        }
        return 0;
}
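
/* A minimal sketch of how a VMM front end might drive this (hypothetical
 * caller; guest memory setup and error handling elided):
 *
 *      struct virtual_machine vm = {.nr_gpcs = 1};
 *      struct vmm_gpcore_init gpci;
 *
 *      // fill in gpci and load the guest
 *      if (vmm_init(&vm, &gpci, 0))
 *              exit(1);
 *      // set the guest's RIP/RSP via gth_to_vmtf(gpcid_to_gth(&vm, 0)), then:
 *      start_guest_thread(gpcid_to_gth(&vm, 0));
 */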

void start_guest_thread(struct guest_thread *gth)
{
        acct_thread_unblocked((struct vmm_thread*)gth);
        enqueue_vmm_thread((struct vmm_thread*)gth);
}

static void __task_thread_run(void)
{
        struct task_thread *tth = (struct task_thread*)current_uthread;

        uth_2ls_thread_exit(tth->func(tth->arg));
}

static int task_thread_ctor(void *obj, void *priv, int flags)
{
        struct vmm_thread *vth = (struct vmm_thread*)obj;
        struct task_thread *tth = (struct task_thread*)obj;

        memset(vth, 0, sizeof(struct vmm_thread));
        vth->type = VMM_THREAD_TASK;
        vth->vm = current_vm;
        tth->stacksize = VMM_THR_STACKSIZE;
        tth->stacktop = __alloc_stack(tth->stacksize);
        if (!tth->stacktop)
                return -1;
        return 0;
}

/* Helper, creates and starts a task thread. */
static struct task_thread *__vmm_run_task(struct virtual_machine *vm,
                                          void *(*func)(void *), void *arg,
                                          struct uth_thread_attr *tth_attr)
{
        struct task_thread *tth;

        tth = kmem_cache_alloc(task_thread_cache, 0);
        tth->func = func;
        tth->arg = arg;
        init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run,
                      (uintptr_t)(tth->stacktop));
        uthread_init((struct uthread*)tth, tth_attr);
        acct_thread_unblocked((struct vmm_thread*)tth);
        enqueue_vmm_thread((struct vmm_thread*)tth);
        return tth;
}

struct task_thread *vmm_run_task(struct virtual_machine *vm,
                                 void *(*func)(void *), void *arg)
{
        struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = TRUE};

        return __vmm_run_task(vm, func, arg, &tth_attr);
}
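
/* E.g., a device backend might run a worker with (console_thread being a
 * hypothetical void *(*)(void *) function):
 *
 *      vmm_run_task(vm, console_thread, vm);
 *
 * The task is created detached and counts against nr_unblk_tasks until it
 * exits. */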

static struct uthread *vmm_thread_create(void *(*func)(void *), void *arg)
{
        struct uth_thread_attr tth_attr = {.want_tls = TRUE, .detached = FALSE};
        struct task_thread *tth;

        /* It's OK to not have a VM for a generic thread */
        tth = __vmm_run_task(NULL, func, arg, &tth_attr);
        /* But just in case, let's poison it */
        ((struct vmm_thread*)tth)->vm = (void*)0xdeadbeef;
        return (struct uthread*)tth;
}

/* Helpers for tracking nr_unblk_* threads. */
static void acct_thread_blocked(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                atomic_dec(&nr_unblk_guests);
                break;
        case VMM_THREAD_TASK:
                atomic_dec(&nr_unblk_tasks);
                break;
        }
}

static void acct_thread_unblocked(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                atomic_inc(&nr_unblk_guests);
                break;
        case VMM_THREAD_TASK:
                atomic_inc(&nr_unblk_tasks);
                break;
        }
}

static void greedy_mark_guest_runnable(struct vmm_thread *vth)
{
        int gpcid;

        if (vth->type == VMM_THREAD_GUEST)
                gpcid = ((struct guest_thread*)vth)->gpc_id;
        else
                gpcid = ((struct ctlr_thread*)vth)->buddy->gpc_id;
        /* racing with the reader */
        greedy_rnbl_guests[gpcid] = vth;
}

static void enqueue_vmm_thread(struct vmm_thread *vth)
{
        switch (vth->type) {
        case VMM_THREAD_GUEST:
        case VMM_THREAD_CTLR:
                if (sched_is_greedy()) {
                        greedy_mark_guest_runnable(vth);
                } else {
                        spin_pdr_lock(&queue_lock);
                        TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
                        spin_pdr_unlock(&queue_lock);
                }
                break;
        case VMM_THREAD_TASK:
                spin_pdr_lock(&queue_lock);
                TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
                spin_pdr_unlock(&queue_lock);
                if (sched_is_greedy())
                        vcore_wake(0, false);
                break;
        default:
|  | panic("Bad vmm_thread type %p\n", vth->type); | 
        }
        try_to_get_vcores();
}

static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
                                           int type)
{
        struct vmm_thread *vth;
        int ret;

        ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread),
                             sizeof(struct vmm_thread));
        if (ret)
                return 0;
        memset(vth, 0, sizeof(struct vmm_thread));
        vth->type = type;
        vth->vm = vm;
        return vth;
}

static void __free_stack(void *stacktop, size_t stacksize)
{
        munmap(stacktop - stacksize, stacksize);
}

static void *__alloc_stack(size_t stacksize)
{
        int force_a_page_fault;
        void *stacktop;
        void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
                              MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

        if (stackbot == MAP_FAILED)
                return 0;
        stacktop = stackbot + stacksize;
        /* Want the top of the stack populated, but not the rest of the stack;
         * that'll grow on demand (up to stacksize, then will clobber memory).
         */
        force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int)));
        return stacktop;
}