| /* Copyright (c) 2016 Google Inc. |
| * Barret Rhoden <brho@cs.berkeley.edu> |
| * See LICENSE for details. |
| * |
| * 2LS for virtual machines */ |
| |
| #include <vmm/sched.h> |
| #include <vmm/vmm.h> |
| #include <sys/mman.h> |
| #include <stdlib.h> |
| #include <assert.h> |
| #include <parlib/spinlock.h> |
| #include <parlib/event.h> |
| #include <parlib/ucq.h> |
| #include <parlib/arch/trap.h> |
| #include <parlib/ros_debug.h> |
| #include <benchutil/vcore_tick.h> |
| |
| int vmm_sched_period_usec = 1000; |
| |
| /* For now, we only have one VM managed by the 2LS. If we ever expand that, |
| * we'll need something analogous to current_uthread, so the 2LS knows which VM |
| * it is working on. */ |
| static struct virtual_machine *current_vm; |
| |
| static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER; |
| /* Runnable queues, broken up by thread type. */ |
| static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks); |
| static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests); |
| /* Counts of *unblocked* threads. Unblocked = Running + Runnable. */ |
| static atomic_t nr_unblk_tasks; |
| static atomic_t nr_unblk_guests; |
| /* Global evq for all syscalls. Could make this per vcore or whatever. */ |
| static struct event_queue *sysc_evq; |
| |
| static void vmm_sched_entry(void); |
| static void vmm_thread_runnable(struct uthread *uth); |
| static void vmm_thread_paused(struct uthread *uth); |
| static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc); |
| static void vmm_thread_has_blocked(struct uthread *uth, int flags); |
| static void vmm_thread_refl_fault(struct uthread *uth, |
| struct user_context *ctx); |
| |
| struct schedule_ops vmm_sched_ops = { |
| .sched_entry = vmm_sched_entry, |
| .thread_runnable = vmm_thread_runnable, |
| .thread_paused = vmm_thread_paused, |
| .thread_blockon_sysc = vmm_thread_blockon_sysc, |
| .thread_has_blocked = vmm_thread_has_blocked, |
| .thread_refl_fault = vmm_thread_refl_fault, |
| }; |
| |
| /* Helpers */ |
| static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type, |
| void *data); |
| static void acct_thread_blocked(struct vmm_thread *vth); |
| static void acct_thread_unblocked(struct vmm_thread *vth); |
| static void enqueue_vmm_thread(struct vmm_thread *vth); |
| static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, |
| int type); |
| static void *__alloc_stack(size_t stacksize); |
| static void __free_stack(void *stacktop, size_t stacksize); |
| |
| |
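| /* Helper: restarts a uthread whose syscall (sysc) has completed. Called from |
| * the syscall event handler, and from the blockon path if we lose the race. */ |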
| static void restart_thread(struct syscall *sysc) |
| { |
| struct uthread *ut_restartee = (struct uthread*)sysc->u_data; |
| |
| /* uthread stuff here: */ |
| assert(ut_restartee); |
| assert(ut_restartee->sysc == sysc); /* set in uthread.c */ |
| ut_restartee->sysc = 0; /* so we don't 'reblock' on this later */ |
| vmm_thread_runnable(ut_restartee); |
| } |
| |
| static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type, |
| void *data) |
| { |
| struct syscall *sysc; |
| |
| /* We should be able to assert this now; if not, check pthread.c (the concern |
| * was old ev_qs firing and running this handler). */ |
| assert(ev_msg); |
| sysc = ev_msg->ev_arg3; |
| assert(sysc); |
| restart_thread(sysc); |
| } |
| |
| /* Helper: allocates a UCQ-based event queue suitable for syscalls. Will |
| * attempt to route the notifs/IPIs to vcoreid */ |
| static struct event_queue *setup_sysc_evq(int vcoreid) |
| { |
| struct event_queue *evq; |
| uintptr_t mmap_block; |
| |
| mmap_block = (uintptr_t)mmap(0, PGSIZE * 2, |
| PROT_WRITE | PROT_READ, |
| MAP_POPULATE | MAP_ANONYMOUS, -1, 0); |
| evq = get_eventq_raw(); |
| assert(mmap_block != (uintptr_t)MAP_FAILED && evq); |
| evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP; |
| evq->ev_vcore = vcoreid; |
| evq->ev_mbox->type = EV_MBOX_UCQ; |
| ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE); |
| return evq; |
| } |
| |
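| /* Constructor for the 2LS: adopts the caller as thread0 (a task thread), sets |
| * up the syscall event queue, and registers vmm_sched_ops with the uthread |
| * code. */ |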
| static void __attribute__((constructor)) vmm_lib_init(void) |
| { |
| struct task_thread *thread0; |
| |
| init_once_racy(return); |
| uthread_lib_init(); |
| |
| /* Note that thread0 doesn't belong to a VM. We can set this during |
| * vmm_init() if we need to. */ |
| thread0 = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK); |
| assert(thread0); |
| acct_thread_unblocked((struct vmm_thread*)thread0); |
| thread0->stacksize = USTACK_NUM_PAGES * PGSIZE; |
| thread0->stacktop = (void*)USTACKTOP; |
| /* for lack of a better vcore, might as well send to 0 */ |
| sysc_evq = setup_sysc_evq(0); |
| register_ev_handler(EV_SYSCALL, vmm_handle_syscall, 0); |
| uthread_2ls_init((struct uthread*)thread0, &vmm_sched_ops); |
| } |
| |
| /* The scheduling policy is encapsulated in the next few functions (from here |
| * down to sched_entry()). */ |
| |
| static int desired_nr_vcores(void) |
| { |
| /* Sanity checks on our accounting. */ |
| assert(atomic_read(&nr_unblk_guests) >= 0); |
| assert(atomic_read(&nr_unblk_tasks) >= 0); |
| /* Lockless peek. This is always an estimate. Some of our tasks busy-wait, |
| * so one vcore for all tasks isn't enough yet. */ |
| return atomic_read(&nr_unblk_guests) + atomic_read(&nr_unblk_tasks); |
| } |
| |
| static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq) |
| { |
| struct vmm_thread *vth; |
| |
| vth = TAILQ_FIRST(tq); |
| if (vth) |
| TAILQ_REMOVE(tq, vth, tq_next); |
| return vth; |
| } |
| |
| static struct vmm_thread *pick_a_thread_degraded(void) |
| { |
| struct vmm_thread *vth = 0; |
| static int next_class = VMM_THREAD_GUEST; |
| |
| /* We don't have a lot of cores (maybe 0), so we'll alternate which type of |
| * thread we look at first. Basically, we're RR within a class of threads, |
| * and we'll toggle between those two classes. */ |
| spin_pdr_lock(&queue_lock); |
| if (next_class == VMM_THREAD_GUEST) { |
| if (!vth) |
| vth = __pop_first(&rnbl_guests); |
| if (!vth) |
| vth = __pop_first(&rnbl_tasks); |
| next_class = VMM_THREAD_TASK; |
| } else { |
| if (!vth) |
| vth = __pop_first(&rnbl_tasks); |
| if (!vth) |
| vth = __pop_first(&rnbl_guests); |
| next_class = VMM_THREAD_GUEST; |
| } |
| spin_pdr_unlock(&queue_lock); |
| return vth; |
| } |
| |
| /* We have plenty of cores - run whatever we want. We'll prioritize tasks. */ |
| static struct vmm_thread *pick_a_thread_plenty(void) |
| { |
| struct vmm_thread *vth = 0; |
| |
| spin_pdr_lock(&queue_lock); |
| if (!vth) |
| vth = __pop_first(&rnbl_tasks); |
| if (!vth) |
| vth = __pop_first(&rnbl_guests); |
| spin_pdr_unlock(&queue_lock); |
| return vth; |
| } |
| |
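| /* Helper: stops whatever uthread this vcore is running and requeues it, so the |
| * scheduler can pick another runnable thread. No-op if we aren't running one. */ |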
| static void yield_current_uth(void) |
| { |
| struct vmm_thread *vth; |
| |
| if (!current_uthread) |
| return; |
| vth = (struct vmm_thread*)stop_current_uthread(); |
| enqueue_vmm_thread(vth); |
| } |
| |
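| /* Per-vcore entry point: figure out how many vcores we want, manage the vcore |
| * tick (used to timeslice when we're short on vcores), then run the current |
| * uthread or pick a new one. Note that run_uthread() and |
| * vcore_yield_or_restart() are not expected to return. */ |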
| static void __attribute__((noreturn)) vmm_sched_entry(void) |
| { |
| struct vmm_thread *vth; |
| int nr_vcores_wanted = desired_nr_vcores(); |
| bool have_enough = nr_vcores_wanted <= num_vcores(); |
| |
| /* TODO: this doesn't handle a lot of issues, like preemption, how to |
| * run/yield our vcores, dynamic changes in the number of runnables, where |
| * to send events, how to avoid interfering with gpcs, etc. */ |
| if (have_enough) { |
| vcore_tick_disable(); |
| } else { |
| vcore_tick_enable(vmm_sched_period_usec); |
| vcore_request_total(nr_vcores_wanted); |
| if (vcore_tick_poll()) { |
| /* slightly less than ideal: we grab the queue lock twice */ |
| yield_current_uth(); |
| } |
| } |
| if (current_uthread) |
| run_current_uthread(); |
| if (have_enough) |
| vth = pick_a_thread_plenty(); |
| else |
| vth = pick_a_thread_degraded(); |
| if (!vth) |
| vcore_yield_or_restart(); |
| run_uthread((struct uthread*)vth); |
| } |
| |
| static void vmm_thread_runnable(struct uthread *uth) |
| { |
| /* A thread that was blocked is now runnable. This counts as becoming |
| * unblocked (running + runnable) */ |
| acct_thread_unblocked((struct vmm_thread*)uth); |
| enqueue_vmm_thread((struct vmm_thread*)uth); |
| } |
| |
| static void vmm_thread_paused(struct uthread *uth) |
| { |
| /* The thread stopped for some reason, usually a preemption. We'd like to |
| * just run it whenever we get a chance. Note that it didn't become |
| * 'blocked' - it's still runnable. */ |
| enqueue_vmm_thread((struct vmm_thread*)uth); |
| } |
| |
| static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall) |
| { |
| struct syscall *sysc = (struct syscall*)syscall; |
| |
| acct_thread_blocked((struct vmm_thread*)uth); |
| sysc->u_data = uth; |
| if (!register_evq(sysc, sysc_evq)) { |
| /* Lost the race with the syscall completing. The kernel won't send the |
| * event. Just restart it here. */ |
| restart_thread(sysc); |
| } |
| /* GIANT WARNING: do not touch the thread after this point. */ |
| } |
| |
| static void vmm_thread_has_blocked(struct uthread *uth, int flags) |
| { |
| /* The thread blocked on something like a mutex. It's not runnable, so we |
| * don't need to put it on a list, but we do need to account for it not |
| * running. We'll find out (via thread_runnable) when it starts up again. |
| */ |
| acct_thread_blocked((struct vmm_thread*)uth); |
| } |
| |
| static void refl_error(struct uthread *uth, unsigned int trap_nr, |
| unsigned int err, unsigned long aux) |
| { |
| printf("Thread has unhandled fault: %d, err: %d, aux: %p\n", |
| trap_nr, err, aux); |
| /* Note that uthread.c already copied out our ctx into the uth |
| * struct */ |
| print_user_context(&uth->u_ctx); |
| printf("Turn on printx to spew unhandled, malignant trap info\n"); |
| exit(-1); |
| } |
| |
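| /* Helper: for faults on VMR-backed memory, kick off an async SYS_populate_va |
| * and block the thread on that syscall. Returns FALSE if we can't handle it. */ |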
| static bool handle_page_fault(struct uthread *uth, unsigned int err, |
| unsigned long aux) |
| { |
| if (!(err & PF_VMR_BACKED)) |
| return FALSE; |
| syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1); |
| __block_uthread_on_async_sysc(uth); |
| return TRUE; |
| } |
| |
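| /* Handles a hardware fault reflected to us for a task or ctlr thread. Page |
| * faults on VMR-backed memory are serviced asynchronously; anything else is a |
| * fatal, unhandled fault. */ |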
| static void vmm_thread_refl_hw_fault(struct uthread *uth, |
| unsigned int trap_nr, |
| unsigned int err, unsigned long aux) |
| { |
| switch (trap_nr) { |
| case HW_TRAP_PAGE_FAULT: |
| if (!handle_page_fault(uth, err, aux)) |
| refl_error(uth, trap_nr, err, aux); |
| break; |
| default: |
| refl_error(uth, trap_nr, err, aux); |
| } |
| } |
| |
| /* Yield callback for __ctlr_entry */ |
| static void __swap_to_gth(struct uthread *uth, void *dummy) |
| { |
| struct ctlr_thread *cth = (struct ctlr_thread*)uth; |
| |
| /* We don't re-account for block/unblock. The ctlr and the guest are |
| * accounted together ("pass the token" back and forth). */ |
| enqueue_vmm_thread((struct vmm_thread*)cth->buddy); |
| } |
| |
| /* All ctlr threads start here, each time their guest has a fault. They can |
| * block and unblock along the way. Once a ctlr does its final uthread_yield, |
| * the next time it will start again from the top. */ |
| static void __ctlr_entry(void) |
| { |
| struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread; |
| struct virtual_machine *vm = gth_to_vm(cth->buddy); |
| |
| if (!handle_vmexit(cth->buddy)) { |
| struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy); |
| |
| fprintf(stderr, "vmm: handle_vmexit returned false\n"); |
| fprintf(stderr, "Note: this may be a kernel module, not the kernel\n"); |
| fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip); |
| /* TODO: properly walk the kernel page tables to map the tf_rip |
| * to a physical address. For now, however, this hack is good |
| * enough. |
| */ |
| hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16); |
| showstatus(stderr, cth->buddy); |
| exit(0); |
| } |
| /* We want to atomically yield and start/reenqueue our buddy. We do so in |
| * vcore context on the other side of the yield. */ |
| uthread_yield(FALSE, __swap_to_gth, 0); |
| } |
| |
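| /* A guest thread had a VM exit reflected to us. Point its ctlr buddy at |
| * __ctlr_entry on a fresh stack and queue the ctlr; the guest itself stays off |
| * the run queues until the ctlr passes control back. */ |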
| static void vmm_thread_refl_vm_fault(struct uthread *uth) |
| { |
| struct guest_thread *gth = (struct guest_thread*)uth; |
| struct ctlr_thread *cth = gth->buddy; |
| |
| /* The ctlr starts from the top every time we get a new fault. */ |
| cth->uthread.flags |= UTHREAD_SAVED; |
| init_user_ctx(&cth->uthread.u_ctx, (uintptr_t)&__ctlr_entry, |
| (uintptr_t)(cth->stacktop)); |
| /* We don't re-account for block/unblock. The ctlr and the guest are |
| * accounted together ("pass the token" back and forth). */ |
| enqueue_vmm_thread((struct vmm_thread*)cth); |
| } |
| |
| static void vmm_thread_refl_fault(struct uthread *uth, |
| struct user_context *ctx) |
| { |
| switch (ctx->type) { |
| case ROS_HW_CTX: |
| /* Guests should only ever VM exit */ |
| assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST); |
| vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx), |
| __arch_refl_get_err(ctx), |
| __arch_refl_get_aux(ctx)); |
| break; |
| case ROS_VM_CTX: |
| vmm_thread_refl_vm_fault(uth); |
| break; |
| default: |
| assert(0); |
| } |
| } |
| |
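| /* Tears down a guest thread and its ctlr buddy. Only the ctlr has a stack we |
| * allocated; the guest runs in its VM context. */ |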
| static void destroy_guest_thread(struct guest_thread *gth) |
| { |
| struct ctlr_thread *cth = gth->buddy; |
| |
| __free_stack(cth->stacktop, cth->stacksize); |
| uthread_cleanup((struct uthread*)cth); |
| free(cth); |
| uthread_cleanup((struct uthread*)gth); |
| free(gth); |
| } |
| |
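| /* Allocates and initializes a guest thread and its ctlr buddy for guest pcore |
| * gpcoreid, linking the two. Returns the guest thread, or 0 on failure. */ |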
| static struct guest_thread *create_guest_thread(struct virtual_machine *vm, |
| unsigned int gpcoreid) |
| { |
| struct guest_thread *gth; |
| struct ctlr_thread *cth; |
| /* Guests won't use TLS; they always operate in Ring V. The controller |
| * might - not because of anything we do, but because of glibc calls. */ |
| struct uth_thread_attr gth_attr = {.want_tls = FALSE}; |
| struct uth_thread_attr cth_attr = {.want_tls = TRUE}; |
| |
| gth = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST); |
| cth = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR); |
| if (!gth || !cth) { |
| free(gth); |
| free(cth); |
| return 0; |
| } |
| gth->buddy = cth; |
| cth->buddy = gth; |
| gth->gpc_id = gpcoreid; |
| cth->stacksize = VMM_THR_STACKSIZE; |
| cth->stacktop = __alloc_stack(cth->stacksize); |
| if (!cth->stacktop) { |
| free(gth); |
| free(cth); |
| return 0; |
| } |
| gth->uthread.u_ctx.type = ROS_VM_CTX; |
| gth->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid; |
| /* No need to set up the ctlr's context; it gets re-init'd each time the guest |
| * faults (see vmm_thread_refl_vm_fault()). */ |
| uthread_init((struct uthread*)gth, >h_attr); |
| uthread_init((struct uthread*)cth, &cth_attr); |
| /* TODO: give it a correct FP state. Our current one is probably fine */ |
| restore_fp_state(>h->uthread.as); |
| gth->uthread.flags |= UTHREAD_FPSAVED; |
| gth->halt_mtx = uth_mutex_alloc(); |
| gth->halt_cv = uth_cond_var_alloc(); |
| return gth; |
| } |
| |
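| /* One-time setup for the (single) VM managed by this 2LS: asks the kernel to |
| * set up nr_gpcs guest pcores, creates a guest/ctlr thread pair per gpcore, |
| * and switches the process to an MCP. Returns 0 on success, -1 on error. |
| * |
| * Rough usage sketch from a VMM app (hypothetical caller; everything beyond |
| * nr_gpcs, gths, and the calls shown is up to the caller): |
| * |
| * struct virtual_machine vm = { .nr_gpcs = 1, ... }; |
| * |
| * if (vmm_init(&vm, 0)) |
| * exit(1); |
| * ... load the guest, set up gth_to_vmtf(vm.gths[0]), then: |
| * start_guest_thread(vm.gths[0]); |
| */ |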
| int vmm_init(struct virtual_machine *vm, int flags) |
| { |
| struct guest_thread **gths; |
| |
| if (current_vm) |
| return -1; |
| current_vm = vm; |
| if (syscall(SYS_vmm_setup, vm->nr_gpcs, vm->gpcis, flags) != vm->nr_gpcs) |
| return -1; |
| gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *)); |
| if (!gths) |
| return -1; |
| for (int i = 0; i < vm->nr_gpcs; i++) { |
| gths[i] = create_guest_thread(vm, i); |
| if (!gths[i]) { |
| for (int j = 0; j < i; j++) |
| destroy_guest_thread(gths[j]); |
| free(gths); |
| return -1; |
| } |
| } |
| vm->gths = gths; |
| uthread_mcp_init(); |
| return 0; |
| } |
| |
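| /* Makes a guest thread runnable for the first time: accounts it as unblocked |
| * and puts it on the run queue. */ |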
| void start_guest_thread(struct guest_thread *gth) |
| { |
| acct_thread_unblocked((struct vmm_thread*)gth); |
| enqueue_vmm_thread((struct vmm_thread*)gth); |
| } |
| |
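| /* Yield callback run from vcore context when a task thread's function returns: |
| * account for it blocking (it's exiting) and free its resources. */ |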
| static void __tth_exit_cb(struct uthread *uthread, void *junk) |
| { |
| struct task_thread *tth = (struct task_thread*)uthread; |
| |
| acct_thread_blocked((struct vmm_thread*)tth); |
| uthread_cleanup(uthread); |
| __free_stack(tth->stacktop, tth->stacksize); |
| free(tth); |
| } |
| |
| static void __task_thread_run(void) |
| { |
| struct task_thread *tth = (struct task_thread*)current_uthread; |
| |
| tth->func(tth->arg); |
| uthread_yield(FALSE, __tth_exit_cb, 0); |
| } |
| |
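| /* Spawns a task thread that runs func(arg) on its own stack and queues it. |
| * Returns the thread, or 0 on error. A VMM app might use this for things like |
| * device backends, e.g. vmm_run_task(vm, console_poller, cons_arg) (those |
| * names are hypothetical). */ |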
| struct task_thread *vmm_run_task(struct virtual_machine *vm, |
| void (*func)(void *), void *arg) |
| { |
| struct task_thread *tth; |
| struct uth_thread_attr tth_attr = {.want_tls = TRUE}; |
| |
| tth = (struct task_thread*)alloc_vmm_thread(vm, VMM_THREAD_TASK); |
| if (!tth) |
| return 0; |
| tth->stacksize = VMM_THR_STACKSIZE; |
| tth->stacktop = __alloc_stack(tth->stacksize); |
| if (!tth->stacktop) { |
| free(tth); |
| return 0; |
| } |
| tth->func = func; |
| tth->arg = arg; |
| init_user_ctx(&tth->uthread.u_ctx, (uintptr_t)&__task_thread_run, |
| (uintptr_t)(tth->stacktop)); |
| uthread_init((struct uthread*)tth, &tth_attr); |
| acct_thread_unblocked((struct vmm_thread*)tth); |
| enqueue_vmm_thread((struct vmm_thread*)tth); |
| return tth; |
| } |
| |
| /* Helpers for tracking nr_unblk_* threads. */ |
| static void acct_thread_blocked(struct vmm_thread *vth) |
| { |
| switch (vth->type) { |
| case VMM_THREAD_GUEST: |
| case VMM_THREAD_CTLR: |
| atomic_dec(&nr_unblk_guests); |
| break; |
| case VMM_THREAD_TASK: |
| atomic_dec(&nr_unblk_tasks); |
| break; |
| } |
| } |
| |
| static void acct_thread_unblocked(struct vmm_thread *vth) |
| { |
| switch (vth->type) { |
| case VMM_THREAD_GUEST: |
| case VMM_THREAD_CTLR: |
| atomic_inc(&nr_unblk_guests); |
| break; |
| case VMM_THREAD_TASK: |
| atomic_inc(&nr_unblk_tasks); |
| break; |
| } |
| } |
| |
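| /* Puts a runnable thread on the queue for its type; ctlr threads share the |
| * guest queue, consistent with how they are accounted. */ |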
| static void enqueue_vmm_thread(struct vmm_thread *vth) |
| { |
| spin_pdr_lock(&queue_lock); |
| switch (vth->type) { |
| case VMM_THREAD_GUEST: |
| case VMM_THREAD_CTLR: |
| TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next); |
| break; |
| case VMM_THREAD_TASK: |
| TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next); |
| break; |
| } |
| spin_pdr_unlock(&queue_lock); |
| } |
| |
| static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type) |
| { |
| struct vmm_thread *vth; |
| int ret; |
| |
| ret = posix_memalign((void**)&vth, __alignof__(struct vmm_thread), |
| sizeof(struct vmm_thread)); |
| if (ret) |
| return 0; |
| memset(vth, 0, sizeof(struct vmm_thread)); |
| vth->type = type; |
| vth->vm = vm; |
| return vth; |
| } |
| |
| static void __free_stack(void *stacktop, size_t stacksize) |
| { |
| munmap(stacktop - stacksize, stacksize); |
| } |
| |
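| /* Allocates a stack of stacksize bytes and returns a pointer to its top, or 0 |
| * on failure. Pages are faulted in on demand, except the top page, which we |
| * touch up front. */ |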
| static void *__alloc_stack(size_t stacksize) |
| { |
| int force_a_page_fault; |
| void *stacktop; |
| void *stackbot = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC, |
| MAP_ANONYMOUS, -1, 0); |
| |
| if (stackbot == MAP_FAILED) |
| return 0; |
| stacktop = stackbot + stacksize; |
| /* Want the top of the stack populated, but not the rest of the stack; |
| * that'll be faulted in on demand (up to stacksize; past that, overflows |
| * will clobber whatever is mapped below, since there's no guard page). */ |
| force_a_page_fault = ACCESS_ONCE(*(int*)(stacktop - sizeof(int))); |
| return stacktop; |
| } |