/* blob: 5ecdf6d353a1fc0cd69441451da131f0f4f5724b [file] [log] [blame] [edit] */
/* Copyright (c) 2016 Google Inc.
* Barret Rhoden <brho@cs.berkeley.edu>
* See LICENSE for details.
*
* 2LS for virtual machines */
#include <vmm/sched.h>
#include <vmm/vmm.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <assert.h>
#include <parlib/spinlock.h>
#include <parlib/event.h>
#include <parlib/ucq.h>
#include <parlib/arch/trap.h>
#include <parlib/ros_debug.h>
#include <benchutil/vcore_tick.h>
/* Period, in microseconds, passed to vcore_tick_enable() by the scheduler
 * entry point when it holds fewer vcores than it wants. */
int vmm_sched_period_usec = 1000;
/* For now, we only have one VM managed by the 2LS.  If we ever expand that,
 * we'll need something analogous to current_uthread, so the 2LS knows which VM
 * it is working on.  Set by vmm_init(). */
static struct virtual_machine *current_vm;
/* Held around every access to the runnable queues below. */
static struct spin_pdr_lock queue_lock = SPINPDR_INITIALIZER;
/* Runnable queues, broken up by thread type.  Controller threads share the
 * guest queue (see enqueue_vmm_thread()). */
static struct vmm_thread_tq rnbl_tasks = TAILQ_HEAD_INITIALIZER(rnbl_tasks);
static struct vmm_thread_tq rnbl_guests = TAILQ_HEAD_INITIALIZER(rnbl_guests);
/* Counts of *unblocked* threads.  Unblocked = Running + Runnable.  Guests and
 * their controllers are counted together in nr_unblk_guests. */
static atomic_t nr_unblk_tasks;
static atomic_t nr_unblk_guests;
/* Global evq for all syscalls.  Could make this per vcore or whatever. */
static struct event_queue *sysc_evq;
/* 2LS callbacks; each one is installed in vmm_sched_ops below. */
static void vmm_sched_entry(void);
static void vmm_thread_runnable(struct uthread *uth);
static void vmm_thread_paused(struct uthread *uth);
static void vmm_thread_blockon_sysc(struct uthread *uth, void *sysc);
static void vmm_thread_has_blocked(struct uthread *uth, int flags);
static void vmm_thread_refl_fault(struct uthread *uth,
struct user_context *ctx);
/* Scheduler operations table for this 2LS.  The library constructor hands it
 * to uthread_2ls_init(). */
struct schedule_ops vmm_sched_ops = {
.sched_entry = vmm_sched_entry,
.thread_runnable = vmm_thread_runnable,
.thread_paused = vmm_thread_paused,
.thread_blockon_sysc = vmm_thread_blockon_sysc,
.thread_has_blocked = vmm_thread_has_blocked,
.thread_refl_fault = vmm_thread_refl_fault,
};
/* Helpers: forward declarations for file-local routines that are used before
 * their definitions. */
/* Event handler registered for EV_SYSCALL. */
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
void *data);
/* Adjust the nr_unblk_* counter that matches the thread's type. */
static void acct_thread_blocked(struct vmm_thread *vth);
static void acct_thread_unblocked(struct vmm_thread *vth);
/* Appends a thread to the runnable queue for its type. */
static void enqueue_vmm_thread(struct vmm_thread *vth);
/* Allocates a zeroed vmm_thread of the given type; returns 0 on failure. */
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm,
int type);
/* Stack helpers: __alloc_stack() returns the top of a new mapping (0 on
 * failure) and __free_stack() unmaps it again. */
static void *__alloc_stack(size_t stacksize);
static void __free_stack(void *stacktop, size_t stacksize);
/* Wakes the uthread that was blocked on sysc.  The waiting thread is found
 * through sysc->u_data and handed back to the scheduler with
 * vmm_thread_runnable(). */
static void restart_thread(struct syscall *sysc)
{
	struct uthread *waiter = (struct uthread*)sysc->u_data;

	/* uthread-level bookkeeping: */
	assert(waiter);
	/* uthread.c records which syscall the thread is waiting on */
	assert(waiter->sysc == sysc);
	/* Clear it, so we don't 'reblock' on this syscall later */
	waiter->sysc = 0;
	vmm_thread_runnable(waiter);
}
/* Handler for EV_SYSCALL messages: takes the completed syscall from the
 * message's ev_arg3 and restarts the thread that was waiting on it.  ev_type
 * and data are not used. */
static void vmm_handle_syscall(struct event_msg *ev_msg, unsigned int ev_type,
                               void *data)
{
	struct syscall *done_sysc;

	/* We believe a message is always present by now.  If this ever trips,
	 * check pthread.c (the concern was old ev_qs firing and running this
	 * handler). */
	assert(ev_msg);
	done_sysc = ev_msg->ev_arg3;
	assert(done_sysc);
	restart_thread(done_sysc);
}
/* Helper: allocates a UCQ-based event queue suitable for syscalls.  Will
 * attempt to route the notifs/IPIs to vcoreid.
 *
 * Two pages are mapped for the UCQ; the first page and the second page are
 * passed to ucq_init_raw() as its two page arguments.  Allocation failures are
 * fatal (assert).  Returns the initialized event queue. */
static struct event_queue *setup_sysc_evq(int vcoreid)
{
	struct event_queue *evq;
	void *ucq_pages;
	uintptr_t mmap_block;

	ucq_pages = mmap(0, PGSIZE * 2,
	                 PROT_WRITE | PROT_READ,
	                 MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
	/* mmap signals failure with MAP_FAILED, which is not a null pointer, so
	 * a plain non-zero test would let a failed mapping through. */
	assert(ucq_pages != MAP_FAILED);
	mmap_block = (uintptr_t)ucq_pages;
	evq = get_eventq_raw();
	assert(mmap_block && evq);
	evq->ev_flags = EVENT_IPI | EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
	evq->ev_vcore = vcoreid;
	evq->ev_mbox->type = EV_MBOX_UCQ;
	ucq_init_raw(&evq->ev_mbox->ucq, mmap_block, mmap_block + PGSIZE);
	return evq;
}
/* Library constructor.  Initializes the uthread library, wraps the initial
 * thread in a task_thread, sets up the global syscall event queue and its
 * handler, and installs this 2LS (vmm_sched_ops) via uthread_2ls_init(). */
static void __attribute__((constructor)) vmm_lib_init(void)
{
	struct task_thread *main_tth;

	init_once_racy(return);
	uthread_lib_init();

	/* This first thread is not attached to any VM.  vmm_init() could set
	 * that later if we ever need it. */
	main_tth = (struct task_thread*)alloc_vmm_thread(0, VMM_THREAD_TASK);
	assert(main_tth);
	main_tth->stacktop = (void*)USTACKTOP;
	main_tth->stacksize = USTACK_NUM_PAGES * PGSIZE;
	/* It is already running, so it counts as unblocked from the start. */
	acct_thread_unblocked((struct vmm_thread*)main_tth);

	/* No vcore is an obviously better target, so route syscall events to
	 * vcore 0. */
	sysc_evq = setup_sysc_evq(0);
	register_ev_handler(EV_SYSCALL, vmm_handle_syscall, 0);

	uthread_2ls_init((struct uthread*)main_tth, &vmm_sched_ops);
}
/* The scheduling policy is encapsulated in the next few functions (from here
 * down to sched_entry()). */

/* Returns how many vcores we would like to have: one for every unblocked
 * guest plus one for every unblocked task. */
static int desired_nr_vcores(void)
{
	long nr_wanted;

	/* Sanity checks on our accounting: neither count may go negative. */
	assert(atomic_read(&nr_unblk_guests) >= 0);
	assert(atomic_read(&nr_unblk_tasks) >= 0);
	/* Lockless peek, so the result is only ever an estimate.  Some of our
	 * tasks busy-wait, so a single vcore shared by all tasks is not enough
	 * yet. */
	nr_wanted = atomic_read(&nr_unblk_guests);
	nr_wanted += atomic_read(&nr_unblk_tasks);
	return nr_wanted;
}
/* Removes and returns the thread at the head of tq, or 0 when tq is empty.
 * Takes no lock itself; both callers in this file hold queue_lock. */
static struct vmm_thread *__pop_first(struct vmm_thread_tq *tq)
{
	struct vmm_thread *head = TAILQ_FIRST(tq);

	if (!head)
		return 0;
	TAILQ_REMOVE(tq, head, tq_next);
	return head;
}
/* Picks the next thread to run when we are short on vcores.  Returns 0 when
 * both runnable queues are empty. */
static struct vmm_thread *pick_a_thread_degraded(void)
{
	static int next_class = VMM_THREAD_GUEST;
	struct vmm_thread_tq *preferred, *fallback;
	struct vmm_thread *picked;

	/* We don't have a lot of cores (maybe 0), so each call alternates which
	 * class of thread gets looked at first.  Within a class the order is
	 * round-robin; the class that goes first toggles on every call. */
	spin_pdr_lock(&queue_lock);
	if (next_class == VMM_THREAD_GUEST) {
		preferred = &rnbl_guests;
		fallback = &rnbl_tasks;
		next_class = VMM_THREAD_TASK;
	} else {
		preferred = &rnbl_tasks;
		fallback = &rnbl_guests;
		next_class = VMM_THREAD_GUEST;
	}
	picked = __pop_first(preferred);
	if (!picked)
		picked = __pop_first(fallback);
	spin_pdr_unlock(&queue_lock);
	return picked;
}
/* Picks the next thread to run when we have plenty of cores and can run
 * whatever we want.  Tasks take priority over guests.  Returns 0 when both
 * runnable queues are empty. */
static struct vmm_thread *pick_a_thread_plenty(void)
{
	struct vmm_thread *picked;

	spin_pdr_lock(&queue_lock);
	picked = __pop_first(&rnbl_tasks);
	if (!picked)
		picked = __pop_first(&rnbl_guests);
	spin_pdr_unlock(&queue_lock);
	return picked;
}
/* If this vcore has a current uthread, stops it and puts it back on its
 * runnable queue.  Does nothing otherwise. */
static void yield_current_uth(void)
{
	if (current_uthread) {
		struct vmm_thread *stopped;

		stopped = (struct vmm_thread*)stop_current_uthread();
		enqueue_vmm_thread(stopped);
	}
}
/* 2LS sched_entry callback (installed as vmm_sched_ops.sched_entry).
 *
 * Compares the number of vcores we want against the number we have, manages
 * the vcore tick accordingly, then runs either the current uthread or a newly
 * picked one.  Declared noreturn: every path is expected to end in a call that
 * does not come back. */
static void __attribute__((noreturn)) vmm_sched_entry(void)
{
struct vmm_thread *vth;
int nr_vcores_wanted = desired_nr_vcores();
/* "Enough" means we already hold at least as many vcores as we want. */
bool have_enough = nr_vcores_wanted <= num_vcores();
/* TODO: this doesn't handle a lot of issues, like preemption, how to
 * run/yield our vcores, dynamic changes in the number of runnables, where
 * to send events, how to avoid interfering with gpcs, etc. */
if (have_enough) {
/* Presumably no time slicing is needed when every unblocked thread can have
 * its own vcore. */
vcore_tick_disable();
} else {
/* Short on vcores: arm the periodic tick, ask for the number of vcores we
 * want, and if the poll reports a tick, put the current thread back on its
 * queue so another thread can be picked below. */
vcore_tick_enable(vmm_sched_period_usec);
vcore_request_total(nr_vcores_wanted);
if (vcore_tick_poll()) {
/* slightly less than ideal: we grab the queue lock twice */
yield_current_uth();
}
}
/* Still have a current thread (it was not yielded above): keep running it.
 * NOTE(review): run_current_uthread() is assumed not to return - confirm. */
if (current_uthread)
run_current_uthread();
if (have_enough)
vth = pick_a_thread_plenty();
else
vth = pick_a_thread_degraded();
/* Nothing runnable.  NOTE(review): vcore_yield_or_restart() is assumed not to
 * return; otherwise run_uthread() below would be handed a null thread. */
if (!vth)
vcore_yield_or_restart();
run_uthread((struct uthread*)vth);
}
/* 2LS thread_runnable callback: a thread that had been blocked can run again.
 * That makes it 'unblocked' (running + runnable) for accounting purposes, and
 * it goes onto the runnable queue for its type. */
static void vmm_thread_runnable(struct uthread *uth)
{
	struct vmm_thread *vth = (struct vmm_thread*)uth;

	acct_thread_unblocked(vth);
	enqueue_vmm_thread(vth);
}
/* 2LS thread_paused callback: the thread stopped for some reason, usually a
 * preemption.  It never became 'blocked', so the unblocked counts are left
 * alone; we only requeue it so it runs again when we get a chance. */
static void vmm_thread_paused(struct uthread *uth)
{
	struct vmm_thread *vth = (struct vmm_thread*)uth;

	enqueue_vmm_thread(vth);
}
/* 2LS thread_blockon_sysc callback: uth is blocking on the syscall passed in
 * as an opaque pointer.  Accounts for the thread as blocked, links the thread
 * to the syscall through u_data, and registers the global syscall event queue
 * for the completion event. */
static void vmm_thread_blockon_sysc(struct uthread *uth, void *syscall)
{
struct syscall *sysc = (struct syscall*)syscall;
/* The thread stops counting as unblocked from here on. */
acct_thread_blocked((struct vmm_thread*)uth);
/* restart_thread() finds the thread to wake through u_data. */
sysc->u_data = uth;
if (!register_evq(sysc, sysc_evq)) {
/* Lost the race with the call being done. The kernel won't send the
 * event. Just restart him. */
restart_thread(sysc);
}
/* GIANT WARNING: do not touch the thread after this point. */
}
/* 2LS thread_has_blocked callback: the thread blocked on something like a
 * mutex.  A blocked thread is not runnable, so it is not placed on any queue,
 * but it must stop counting as unblocked.  thread_runnable will tell us when
 * it starts up again.  flags is not used. */
static void vmm_thread_has_blocked(struct uthread *uth, int flags)
{
	struct vmm_thread *vth = (struct vmm_thread*)uth;

	acct_thread_blocked(vth);
}
/* Reports a fault that nobody handled and terminates the process.
 *
 * Prints the trap number, error code and aux value, dumps the thread's saved
 * user context, and then calls exit(-1); this function does not return. */
static void refl_error(struct uthread *uth, unsigned int trap_nr,
                       unsigned int err, unsigned long aux)
{
	/* trap_nr and err are unsigned, so they take %u.  %p requires a void *
	 * argument, so aux is converted explicitly instead of being passed as an
	 * unsigned long. */
	printf("Thread has unhandled fault: %u, err: %u, aux: %p\n",
	       trap_nr, err, (void*)aux);
	/* Note that uthread.c already copied out our ctx into the uth
	 * struct */
	print_user_context(&uth->u_ctx);
	printf("Turn on printx to spew unhandled, malignant trap info\n");
	exit(-1);
}
/* Tries to service a page fault taken by uth.  Only faults whose error code
 * has PF_VMR_BACKED set are handled: for those, an async SYS_populate_va is
 * issued for the address in aux and the thread is blocked on that syscall.
 * Returns TRUE if the fault was taken care of, FALSE otherwise. */
static bool handle_page_fault(struct uthread *uth, unsigned int err,
                              unsigned long aux)
{
	if (err & PF_VMR_BACKED) {
		syscall_async(&uth->local_sysc, SYS_populate_va, aux, 1);
		__block_uthread_on_async_sysc(uth);
		return TRUE;
	}
	return FALSE;
}
/* Handles a reflected hardware fault for uth.  Page faults get a chance to be
 * serviced by handle_page_fault(); a page fault that cannot be serviced, and
 * every other trap, is reported through refl_error(). */
static void vmm_thread_refl_hw_fault(struct uthread *uth,
                                     unsigned int trap_nr,
                                     unsigned int err, unsigned long aux)
{
	if (trap_nr == HW_TRAP_PAGE_FAULT && handle_page_fault(uth, err, aux))
		return;
	refl_error(uth, trap_nr, err, aux);
}
/* Yield callback for __ctlr_entry: uth is the controller thread that just
 * yielded, and its buddy guest thread is put on the runnable queue.  dummy is
 * not used. */
static void __swap_to_gth(struct uthread *uth, void *dummy)
{
	struct guest_thread *guest = ((struct ctlr_thread*)uth)->buddy;

	/* No block/unblock accounting here: a ctlr and its guest are counted as
	 * one, passing the token back and forth. */
	enqueue_vmm_thread((struct vmm_thread*)guest);
}
/* All ctrl threads start here, each time their guest has a fault. They can
 * block and unblock along the way. Once a ctlr does its final uthread_yield,
 * the next time it will start again from the top.
 *
 * Calls handle_vmexit() on the buddy guest thread.  If that returns false,
 * diagnostics are written to stderr and the process exits.  Otherwise the
 * controller yields, and __swap_to_gth() requeues the guest. */
static void __ctlr_entry(void)
{
struct ctlr_thread *cth = (struct ctlr_thread*)current_uthread;
/* NOTE(review): vm is not used anywhere else in this function - confirm
 * whether gth_to_vm() is needed here. */
struct virtual_machine *vm = gth_to_vm(cth->buddy);
if (!handle_vmexit(cth->buddy)) {
struct vm_trapframe *vm_tf = gth_to_vmtf(cth->buddy);
fprintf(stderr, "vmm: handle_vmexit returned false\n");
fprintf(stderr, "Note: this may be a kernel module, not the kernel\n");
fprintf(stderr, "RIP was %p:\n", (void *)vm_tf->tf_rip);
/* TODO: properly walk the kernel page tables to map the tf_rip
 * to a physical address. For now, however, this hack is good
 * enough.
 */
/* Dumps 16 bytes at tf_rip masked down to its low 30 bits. */
hexdump(stderr, (void *)(vm_tf->tf_rip & 0x3fffffff), 16);
showstatus(stderr, cth->buddy);
/* NOTE(review): this failure path exits with status 0 - confirm that is
 * intended. */
exit(0);
}
/* We want to atomically yield and start/reenqueue our buddy. We do so in
 * vcore context on the other side of the yield. */
uthread_yield(FALSE, __swap_to_gth, 0);
}
/* Handles a reflected VM fault: uth is a guest thread, and its buddy
 * controller thread is reset to start at __ctlr_entry and then queued to
 * run. */
static void vmm_thread_refl_vm_fault(struct uthread *uth)
{
	struct ctlr_thread *ctlr = ((struct guest_thread*)uth)->buddy;

	/* The ctlr starts from the top every time we get a new fault, so its
	 * context is rebuilt to enter __ctlr_entry on its own stack. */
	ctlr->uthread.flags |= UTHREAD_SAVED;
	init_user_ctx(&ctlr->uthread.u_ctx, (uintptr_t)&__ctlr_entry,
	              (uintptr_t)(ctlr->stacktop));
	/* No block/unblock accounting here: a ctlr and its guest are counted as
	 * one, passing the token back and forth. */
	enqueue_vmm_thread((struct vmm_thread*)ctlr);
}
/* 2LS thread_refl_fault callback: dispatches a reflected fault according to
 * the type of the faulting context.  Hardware contexts go to
 * vmm_thread_refl_hw_fault(), VM contexts go to vmm_thread_refl_vm_fault(),
 * and any other context type trips an assert. */
static void vmm_thread_refl_fault(struct uthread *uth,
                                  struct user_context *ctx)
{
	if (ctx->type == ROS_HW_CTX) {
		/* Guests should only ever VM exit */
		assert(((struct vmm_thread*)uth)->type != VMM_THREAD_GUEST);
		vmm_thread_refl_hw_fault(uth, __arch_refl_get_nr(ctx),
		                         __arch_refl_get_err(ctx),
		                         __arch_refl_get_aux(ctx));
	} else if (ctx->type == ROS_VM_CTX) {
		vmm_thread_refl_vm_fault(uth);
	} else {
		assert(0);
	}
}
/* Tears down a guest thread together with its buddy controller thread: the
 * controller's stack is unmapped, and both threads are cleaned up and freed.
 * NOTE(review): gth->halt_mtx and gth->halt_cv are not released here - confirm
 * whether they need to be. */
static void destroy_guest_thread(struct guest_thread *gth)
{
	struct ctlr_thread *ctlr = gth->buddy;

	/* Controller first: its stack, then the thread itself. */
	__free_stack(ctlr->stacktop, ctlr->stacksize);
	uthread_cleanup((struct uthread*)ctlr);
	free(ctlr);

	/* Then the guest. */
	uthread_cleanup((struct uthread*)gth);
	free(gth);
}
/* Creates the guest thread for guest pcore gpcoreid of vm, along with the
 * controller thread that is paired with it (each is the other's buddy).  The
 * controller gets a stack of VMM_THR_STACKSIZE bytes.  Returns the guest
 * thread, or 0 if a thread or the stack could not be allocated. */
static struct guest_thread *create_guest_thread(struct virtual_machine *vm,
                                                unsigned int gpcoreid)
{
	/* Guests won't use TLS; they always operate in Ring V.  The controller
	 * might - not because of anything we do, but because of glibc calls. */
	struct uth_thread_attr gth_attr = {.want_tls = FALSE};
	struct uth_thread_attr cth_attr = {.want_tls = TRUE};
	struct guest_thread *guest;
	struct ctlr_thread *ctlr;

	guest = (struct guest_thread*)alloc_vmm_thread(vm, VMM_THREAD_GUEST);
	ctlr = (struct ctlr_thread*)alloc_vmm_thread(vm, VMM_THREAD_CTLR);
	if (!guest || !ctlr)
		goto err_free;

	guest->buddy = ctlr;
	ctlr->buddy = guest;
	guest->gpc_id = gpcoreid;

	ctlr->stacksize = VMM_THR_STACKSIZE;
	ctlr->stacktop = __alloc_stack(ctlr->stacksize);
	if (!ctlr->stacktop)
		goto err_free;

	guest->uthread.u_ctx.type = ROS_VM_CTX;
	guest->uthread.u_ctx.tf.vm_tf.tf_guest_pcoreid = gpcoreid;
	/* The ctlr's context needs no setup here; it gets re-init'd each time
	 * it starts. */
	uthread_init((struct uthread*)guest, &gth_attr);
	uthread_init((struct uthread*)ctlr, &cth_attr);
	/* TODO: give it a correct FP state.  Our current one is probably fine */
	restore_fp_state(&guest->uthread.as);
	guest->uthread.flags |= UTHREAD_FPSAVED;
	guest->halt_mtx = uth_mutex_alloc();
	guest->halt_cv = uth_cond_var_alloc();
	return guest;

err_free:
	/* Either pointer may be null here; free() accepts that. */
	free(guest);
	free(ctlr);
	return 0;
}
/* Sets up vm as the single VM managed by this 2LS.
 *
 * Asks the kernel to set up vm->nr_gpcs guest pcores (SYS_vmm_setup), creates
 * one guest thread (plus its controller) per guest pcore, stores the array in
 * vm->gths, and calls uthread_mcp_init().
 *
 * Returns 0 on success.  Returns -1 if a VM is already being managed, if the
 * kernel did not set up every guest pcore, or if an allocation failed.  On
 * those failures current_vm is cleared again, so a failed attempt neither
 * leaves a stale pointer behind nor blocks a later vmm_init() call. */
int vmm_init(struct virtual_machine *vm, int flags)
{
	struct guest_thread **gths;

	if (current_vm)
		return -1;
	current_vm = vm;
	if (syscall(SYS_vmm_setup, vm->nr_gpcs, vm->gpcis, flags) != vm->nr_gpcs)
		goto err_unclaim;
	gths = malloc(vm->nr_gpcs * sizeof(struct guest_thread *));
	if (!gths)
		goto err_unclaim;
	for (int i = 0; i < vm->nr_gpcs; i++) {
		gths[i] = create_guest_thread(vm, i);
		if (!gths[i]) {
			/* Undo the threads created so far; slot i holds nothing. */
			for (int j = 0; j < i; j++)
				destroy_guest_thread(gths[j]);
			free(gths);
			goto err_unclaim;
		}
	}
	vm->gths = gths;
	uthread_mcp_init();
	return 0;

err_unclaim:
	/* We never finished taking ownership of vm, so drop the claim. */
	current_vm = 0;
	return -1;
}
/* Starts a guest thread: it is counted as unblocked and placed on the runnable
 * queue, to be picked up by the scheduler. */
void start_guest_thread(struct guest_thread *gth)
{
	struct vmm_thread *vth = (struct vmm_thread*)gth;

	acct_thread_unblocked(vth);
	enqueue_vmm_thread(vth);
}
/* Yield callback used by __task_thread_run() once the task's function has
 * returned.  The finished task thread stops counting as unblocked, is cleaned
 * up, and has its stack and thread struct released.  junk is not used. */
static void __tth_exit_cb(struct uthread *uthread, void *junk)
{
	struct task_thread *task = (struct task_thread*)uthread;

	acct_thread_blocked((struct vmm_thread*)task);
	uthread_cleanup(uthread);
	__free_stack(task->stacktop, task->stacksize);
	free(task);
}
/* Entry point for task threads created by vmm_run_task(): runs the task's
 * function with its argument, then yields for the last time so that
 * __tth_exit_cb() can tear the thread down. */
static void __task_thread_run(void)
{
	struct task_thread *self = (struct task_thread*)current_uthread;

	self->func(self->arg);
	uthread_yield(FALSE, __tth_exit_cb, 0);
}
/* Creates a task thread belonging to vm that will call func(arg), and makes it
 * runnable.  The thread gets a stack of VMM_THR_STACKSIZE bytes and wants TLS.
 * Returns the new task thread, or 0 if the thread or its stack could not be
 * allocated. */
struct task_thread *vmm_run_task(struct virtual_machine *vm,
                                 void (*func)(void *), void *arg)
{
	struct uth_thread_attr tth_attr = {.want_tls = TRUE};
	struct task_thread *task;

	task = (struct task_thread*)alloc_vmm_thread(vm, VMM_THREAD_TASK);
	if (!task)
		return 0;
	task->func = func;
	task->arg = arg;
	task->stacksize = VMM_THR_STACKSIZE;
	task->stacktop = __alloc_stack(task->stacksize);
	if (!task->stacktop) {
		free(task);
		return 0;
	}
	/* The thread begins life in __task_thread_run() on its fresh stack. */
	init_user_ctx(&task->uthread.u_ctx, (uintptr_t)&__task_thread_run,
	              (uintptr_t)(task->stacktop));
	uthread_init((struct uthread*)task, &tth_attr);
	acct_thread_unblocked((struct vmm_thread*)task);
	enqueue_vmm_thread((struct vmm_thread*)task);
	return task;
}
/* Helpers for tracking nr_unblk_* threads. */

/* Records that vth is no longer unblocked by decrementing the counter for its
 * type.  Guest and controller threads share nr_unblk_guests; task threads use
 * nr_unblk_tasks.  Any other type leaves both counters alone. */
static void acct_thread_blocked(struct vmm_thread *vth)
{
	if (vth->type == VMM_THREAD_GUEST || vth->type == VMM_THREAD_CTLR)
		atomic_dec(&nr_unblk_guests);
	else if (vth->type == VMM_THREAD_TASK)
		atomic_dec(&nr_unblk_tasks);
}
/* Records that vth has become unblocked by incrementing the counter for its
 * type.  Guest and controller threads share nr_unblk_guests; task threads use
 * nr_unblk_tasks.  Any other type leaves both counters alone. */
static void acct_thread_unblocked(struct vmm_thread *vth)
{
	if (vth->type == VMM_THREAD_GUEST || vth->type == VMM_THREAD_CTLR)
		atomic_inc(&nr_unblk_guests);
	else if (vth->type == VMM_THREAD_TASK)
		atomic_inc(&nr_unblk_tasks);
}
static void enqueue_vmm_thread(struct vmm_thread *vth)
{
spin_pdr_lock(&queue_lock);
switch (vth->type) {
case VMM_THREAD_GUEST:
case VMM_THREAD_CTLR:
TAILQ_INSERT_TAIL(&rnbl_guests, vth, tq_next);
break;
case VMM_THREAD_TASK:
TAILQ_INSERT_TAIL(&rnbl_tasks, vth, tq_next);
break;
}
spin_pdr_unlock(&queue_lock);
}
/* Allocates a vmm_thread aligned to the struct's own alignment, zeroes it, and
 * records its type and owning vm.  Returns the new thread, or 0 if the
 * allocation failed. */
static struct vmm_thread *alloc_vmm_thread(struct virtual_machine *vm, int type)
{
	struct vmm_thread *vth;
	void *mem;

	if (posix_memalign(&mem, __alignof__(struct vmm_thread),
	                   sizeof(struct vmm_thread)))
		return 0;
	vth = memset(mem, 0, sizeof(struct vmm_thread));
	vth->type = type;
	vth->vm = vm;
	return vth;
}
/* Unmaps a stack obtained from __alloc_stack().  stacktop is the address that
 * __alloc_stack() returned, i.e. one past the highest byte, so the mapping
 * begins stacksize bytes below it. */
static void __free_stack(void *stacktop, size_t stacksize)
{
	void *stackbot = stacktop - stacksize;

	munmap(stackbot, stacksize);
}
/* Maps an anonymous readable, writable and executable region of stacksize
 * bytes for use as a stack.  Returns the top of the region (its base plus
 * stacksize), or 0 if the mapping failed. */
static void *__alloc_stack(size_t stacksize)
{
	int touched;
	void *base;
	void *top;

	base = mmap(0, stacksize, PROT_READ | PROT_WRITE | PROT_EXEC,
	            MAP_ANONYMOUS, -1, 0);
	if (base == MAP_FAILED)
		return 0;
	top = base + stacksize;
	/* Read the last int of the region so the top of the stack is populated
	 * up front.  The rest of the stack grows on demand (up to stacksize,
	 * after which it will clobber memory). */
	touched = ACCESS_ONCE(*((int*)top - 1));
	return top;
}