/* Copyright (c) 2015-2016 Google Inc.
* See LICENSE for details. */
#include <parlib/common.h>
#include <vmm/virtio.h>
#include <vmm/virtio_mmio.h>
#include <vmm/virtio_ids.h>
#include <vmm/virtio_config.h>
#include <vmm/mmio.h>
#include <vmm/vmm.h>
#include <parlib/arch/trap.h>
#include <parlib/bitmask.h>
#include <stdio.h>
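/* Returns true if the 'outstanding notification' bit is set in the guest
 * pcore's posted interrupt descriptor. */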
static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
{
return GET_BITMASK_BIT(gpci->posted_irq_desc,
VMX_POSTED_OUTSTANDING_NOTIF);
}
/* Returns true if the hardware will trigger an IRQ for the guest. These
 * virtual IRQs are only processed in certain situations, such as at vmentry
 * or when an IRQ is posted. See 'Evaluation of Pending Virtual Interrupts' in
 * the SDM. */
static bool virtual_irq_is_pending(struct guest_thread *gth)
{
struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
uint8_t rvi, vppr;
	/* Currently, the lower 4 bits are various ways to block IRQs, e.g.
	 * blocking by STI. The other bits must be 0. Presumably any new
	 * bits are types of IRQ blocking. */
if (gth_to_vmtf(gth)->tf_intrinfo1)
return false;
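	/* The virtual PPR lives at offset 0xa0 of the vAPIC page, and RVI is
	 * the low byte of the guest interrupt status. Per the SDM, a pending
	 * virtual IRQ is delivered only if its priority class (the upper
	 * nibble of the vector) exceeds the PPR's class. */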
	vppr = read_mmreg32((uintptr_t)gpci->vapic_addr + 0xa0);
rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
return (rvi & 0xf0) > (vppr & 0xf0);
}
/* Blocks a guest pcore / thread until it has an IRQ pending. Syncs with
* vmm_interrupt_guest(). */
static void sleep_til_irq(struct guest_thread *gth)
{
struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
/* The invariant is that if an IRQ is posted, but not delivered, we will
* not sleep. Anyone who posts an IRQ must signal after setting it.
	 * vmm_interrupt_guest() does this. If we use alternate sources of IRQ
	 * posting, we'll need to revisit this. For more details, see the notes
	 * in the kernel IPI-IRQ fast path.
*
* Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
* possible that the hardware attempted to post the interrupt. In SDM
* parlance, the processor could have "recognized" the virtual IRQ, but
* not delivered it yet. This could happen if the guest had executed
* "sti", but not "hlt" yet. The IRQ was posted and recognized, but not
* delivered ("sti blocking"). Then the guest executes "hlt", and
* vmexits. OUTSTANDING_NOTIF will be clear in this case. RVI should
* be set - at least to the vector we just sent, but possibly to a
* greater vector if multiple were sent. RVI should only be cleared
* after virtual IRQs were actually delivered. So checking
* OUTSTANDING_NOTIF and RVI should suffice.
*
* Note that when we see a notif or pending virtual IRQ, we don't
* actually deliver the IRQ, we'll just restart the guest and the
* hardware will deliver the virtual IRQ at the appropriate time.
*
* The more traditional race here is if the halt starts concurrently
* with the post; that's why we sync with the mutex to make sure there
* is an ordering between the actual halt (this function) and the
* posting. */
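	/* A rough sketch of the posting side (the real code is in
	 * vmm_interrupt_guest()): post the IRQ / set OUTSTANDING_NOTIF first,
	 * then grab halt_mtx, signal halt_cv, and unlock. With that ordering,
	 * either the while loop below sees the pending IRQ before we block, or
	 * the signal arrives after we are queued on the CV and wakes us. */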
uth_mutex_lock(gth->halt_mtx);
while (!(pir_notif_is_set(gpci) || virtual_irq_is_pending(gth)))
uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
uth_mutex_unlock(gth->halt_mtx);
}
enum {
CPUID_0B_LEVEL_SMT = 0,
CPUID_0B_LEVEL_CORE
};
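/* Emulates CPUID leaf 0x0B (extended topology enumeration); returns FALSE for
 * any other leaf. We report a flat topology: one logical processor per core,
 * nr_gpcs cores, and the guest pcore id as the x2APIC ID. */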
static bool handle_cpuid(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
struct virtual_machine *vm = gth_to_vm(gth);
uint32_t level = vm_tf->tf_rcx & 0x0F;
if (vm_tf->tf_rax != 0x0B)
return FALSE;
vm_tf->tf_rip += 2;
vm_tf->tf_rax = 0;
vm_tf->tf_rbx = 0;
vm_tf->tf_rcx = level;
vm_tf->tf_rdx = gth->gpc_id;
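	/* SMT level: report one logical processor per core (no hyperthreads). */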
if (level == CPUID_0B_LEVEL_SMT) {
vm_tf->tf_rax = 0;
vm_tf->tf_rbx = 1;
vm_tf->tf_rcx |= ((level + 1) << 8);
}
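	/* Core level: report all nr_gpcs cores; EAX is the number of x2APIC ID
	 * bits to shift out to reach the next (package) level. */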
if (level == CPUID_0B_LEVEL_CORE) {
uint32_t shift = LOG2_UP(vm->nr_gpcs);
if (shift > 0x1F)
shift = 0x1F;
vm_tf->tf_rax = shift;
vm_tf->tf_rbx = vm->nr_gpcs;
vm_tf->tf_rcx |= ((level + 1) << 8);
}
return TRUE;
}
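/* Handles EPT faults (MMIO and lazily-backed memory). If the GPA falls in a
 * VMR-backed region, we ask the kernel to populate the page. Otherwise we
 * decode the faulting instruction and emulate the access: virtio-mmio devices,
 * the IOAPIC page at 0xfec00000, or the guest's low 4K page. */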
static bool handle_ept_fault(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
struct virtual_machine *vm = gth_to_vm(gth);
uint64_t gpa, *regp;
uint8_t regx;
int store, size;
int advance;
int ret;
if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0,
0, 0);
if (ret <= 0)
panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
ret);
return TRUE;
}
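	/* Decode the faulting instruction: the target GPA, the register
	 * involved (its index and a pointer to its value), whether it was a
	 * store, the access size, and how far to advance RIP afterwards. */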
ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);
if (ret < 0)
return FALSE;
if (ret == VM_PAGE_FAULT) {
/* We were unable to translate RIP due to an ept fault */
vm_tf->tf_trap_inject = VM_TRAP_VALID
| VM_TRAP_ERROR_CODE
| VM_TRAP_HARDWARE
| HW_TRAP_PAGE_FAULT;
return TRUE;
}
assert(size >= 0);
/* TODO use helpers for some of these addr checks. the fee/fec ones
* might be wrong too. */
for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
if (vm->virtio_mmio_devices[i] == NULL)
continue;
if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
continue;
/* TODO: can the guest cause us to spawn off infinite threads?
*/
if (store)
virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa,
size, (uint32_t *)regp);
else
*regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i],
gpa, size);
vm_tf->tf_rip += advance;
return TRUE;
}
if (PG_ADDR(gpa) == 0xfec00000) {
do_ioapic(gth, gpa, regx, regp, store);
} else if (PG_ADDR(gpa) == 0) {
memmove(regp, &vm->low4k[gpa], size);
} else {
fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
vm_tf->tf_exit_reason);
fprintf(stderr, "Returning 0xffffffff\n");
showstatus(stderr, gth);
/* Just fill the whole register for now. */
*regp = (uint64_t) -1;
return FALSE;
}
vm_tf->tf_rip += advance;
return TRUE;
}
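/* VMCALL_PRINTC: the guest passes a single character in RDI. We echo it to
 * stdout and print a '%' after each newline, presumably as a simple prompt
 * marker. */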
static bool handle_vmcall_printc(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
uint8_t byte;
byte = vm_tf->tf_rdi;
printf("%c", byte);
if (byte == '\n')
printf("%c", '%');
fflush(stdout);
return TRUE;
}
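/* VMCALL_SMPBOOT: guest pcore 0 (the BSP) starts the next AP. The AP's
 * initial RIP is passed in RDI and its RSP in RSI; it inherits the BSP's CR3
 * (the guest changes it later anyway). */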
static bool handle_vmcall_smpboot(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
struct vm_trapframe *vm_tf_ap;
struct virtual_machine *vm = gth_to_vm(gth);
int cur_pcores = vm->up_gpcs;
/* Check if we're guest pcore 0. Only the BSP is allowed to start APs.
*/
if (vm_tf->tf_guest_pcoreid != 0) {
fprintf(stderr,
"Only guest pcore 0 is allowed to start APs. core was %ld\n",
vm_tf->tf_guest_pcoreid);
return FALSE;
}
	/* Check if we've reached the maximum; if so, bail out. */
if (vm->nr_gpcs == cur_pcores) {
fprintf(stderr,
"guest tried to start up too many cores. max was %ld, current up %ld\n",
vm->nr_gpcs, cur_pcores);
return FALSE;
}
/* Start up secondary core. */
vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
/* We use the BSP's CR3 for now. This should be fine because they
* change it later anyway. */
vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;
/* Starting RIP is passed in via rdi. */
vm_tf_ap->tf_rip = vm_tf->tf_rdi;
/* Starting RSP is passed in via rsi. */
vm_tf_ap->tf_rsp = vm_tf->tf_rsi;
vm->up_gpcs++;
start_guest_thread(gpcid_to_gth(vm, cur_pcores));
return TRUE;
}
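/* VMCALL_GET_TSCFREQ: returns the TSC frequency in RAX, in KHz assuming
 * get_tsc_freq() reports Hz. */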
static bool handle_vmcall_get_tscfreq(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
vm_tf->tf_rax = get_tsc_freq() / 1000;
return TRUE;
}
static bool handle_vmcall(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
struct virtual_machine *vm = gth_to_vm(gth);
bool retval = FALSE;
if (vm->vmcall)
return vm->vmcall(gth, vm_tf);
switch (vm_tf->tf_rax) {
case VMCALL_PRINTC:
retval = handle_vmcall_printc(gth);
break;
case VMCALL_SMPBOOT:
retval = handle_vmcall_smpboot(gth);
break;
case VMCALL_GET_TSCFREQ:
retval = handle_vmcall_get_tscfreq(gth);
break;
case VMCALL_TRACE_TF:
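		/* Note: rax is reported from r11 (rax itself carried the vmcall
		 * number), and r11 is printed as a sentinel since its original
		 * value is presumably clobbered by the guest's trace helper. */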
trace_printf(" rax 0x%016lx\n", vm_tf->tf_r11);
trace_printf(" rbx 0x%016lx\n", vm_tf->tf_rbx);
trace_printf(" rcx 0x%016lx\n", vm_tf->tf_rcx);
trace_printf(" rdx 0x%016lx\n", vm_tf->tf_rdx);
trace_printf(" rbp 0x%016lx\n", vm_tf->tf_rbp);
trace_printf(" rsi 0x%016lx\n", vm_tf->tf_rsi);
trace_printf(" rdi 0x%016lx\n", vm_tf->tf_rdi);
trace_printf(" r8 0x%016lx\n", vm_tf->tf_r8);
trace_printf(" r9 0x%016lx\n", vm_tf->tf_r9);
trace_printf(" r10 0x%016lx\n", vm_tf->tf_r10);
trace_printf(" r11 0x%016lx\n", 0xdeadbeef);
trace_printf(" r12 0x%016lx\n", vm_tf->tf_r12);
trace_printf(" r13 0x%016lx\n", vm_tf->tf_r13);
trace_printf(" r14 0x%016lx\n", vm_tf->tf_r14);
trace_printf(" r15 0x%016lx\n", vm_tf->tf_r15);
trace_printf(" rip 0x%016lx\n", vm_tf->tf_rip);
trace_printf(" rflg 0x%016lx\n", vm_tf->tf_rflags);
trace_printf(" rsp 0x%016lx\n", vm_tf->tf_rsp);
trace_printf(" cr2 0x%016lx\n", vm_tf->tf_cr2);
trace_printf(" cr3 0x%016lx\n", vm_tf->tf_cr3);
trace_printf("Gpcore 0x%08x\n", vm_tf->tf_guest_pcoreid);
trace_printf("Flags 0x%08x\n", vm_tf->tf_flags);
trace_printf("Inject 0x%08x\n", vm_tf->tf_trap_inject);
trace_printf("ExitRs 0x%08x\n", vm_tf->tf_exit_reason);
trace_printf("ExitQl 0x%08x\n", vm_tf->tf_exit_qual);
trace_printf("Intr1 0x%016lx\n", vm_tf->tf_intrinfo1);
trace_printf("Intr2 0x%016lx\n", vm_tf->tf_intrinfo2);
trace_printf("GIntr 0x----%04x\n",
vm_tf->tf_guest_intr_status);
trace_printf("GVA 0x%016lx\n", vm_tf->tf_guest_va);
trace_printf("GPA 0x%016lx\n", vm_tf->tf_guest_pa);
retval = true;
break;
}
if (retval)
vm_tf->tf_rip += 3;
return retval;
}
static bool handle_io(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
int ret = io(gth);
if (ret < 0)
return FALSE;
if (ret == VM_PAGE_FAULT) {
/* We were unable to translate RIP due to an ept fault */
vm_tf->tf_trap_inject = VM_TRAP_VALID
| VM_TRAP_ERROR_CODE
| VM_TRAP_HARDWARE
| HW_TRAP_PAGE_FAULT;
}
return TRUE;
}
static bool handle_msr(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
		/* Use event injection through vmctl to send a general
		 * protection fault. vmctl.interrupt gets written to the
		 * VM-Entry Interruption-Information Field by vmx. */
vm_tf->tf_trap_inject = VM_TRAP_VALID
| VM_TRAP_ERROR_CODE
| VM_TRAP_HARDWARE
| HW_TRAP_GP_FAULT;
} else {
vm_tf->tf_rip += 2;
}
return TRUE;
}
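/* Emulates an access to the APIC-access page: decode the faulting instruction
 * and forward the read or write to __apic_access(). */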
static bool handle_apic_access(struct guest_thread *gth)
{
uint64_t gpa, *regp;
uint8_t regx;
int store, size;
int advance;
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
return FALSE;
if (__apic_access(gth, gpa, regx, regp, store))
return FALSE;
vm_tf->tf_rip += advance;
return TRUE;
}
static bool handle_halt(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
struct virtual_machine *vm = gth_to_vm(gth);
if (vm->halt_exit)
return FALSE;
/* It's possible the guest disabled IRQs and halted, perhaps waiting on
* an NMI or something. If we need to support that, we can change this.
*/
sleep_til_irq(gth);
vm_tf->tf_rip += 1;
return TRUE;
}
/* The guest is told (via cpuid) that there is no monitor/mwait. Calls to
 * mwait are treated as paravirtualized halts.
*
* We don't support monitor/mwait in software, so if they tried to mwait
* without break-on-interrupt and with interrupts disabled, they'll never
* wake up. So we'll always break on interrupt. */
static bool handle_mwait(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
sleep_til_irq(gth);
vm_tf->tf_rip += 3;
return TRUE;
}
/* Is this a VMM-specific thing, or generic?
 *
 * What do we do when we want to kill the VM? What are our other options? */
bool handle_vmexit(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
switch (vm_tf->tf_exit_reason) {
case EXIT_REASON_CPUID:
return handle_cpuid(gth);
case EXIT_REASON_EPT_VIOLATION:
return handle_ept_fault(gth);
case EXIT_REASON_VMCALL:
return handle_vmcall(gth);
case EXIT_REASON_IO_INSTRUCTION:
return handle_io(gth);
case EXIT_REASON_MSR_WRITE:
case EXIT_REASON_MSR_READ:
return handle_msr(gth);
case EXIT_REASON_APIC_ACCESS:
return handle_apic_access(gth);
case EXIT_REASON_HLT:
return handle_halt(gth);
case EXIT_REASON_MWAIT_INSTRUCTION:
return handle_mwait(gth);
case EXIT_REASON_EXTERNAL_INTERRUPT:
case EXIT_REASON_APIC_WRITE:
/* TODO: just ignore these? */
return TRUE;
default:
fprintf(stderr,
"VMM library: don't know how to handle exit %d\n",
vm_tf->tf_exit_reason);
fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
vm_tf->tf_exit_reason);
return FALSE;
}
}