/* Copyright (c) 2015-2016 Google Inc.
 * See LICENSE for details. */

#include <parlib/common.h>
#include <vmm/virtio.h>
#include <vmm/virtio_mmio.h>
#include <vmm/virtio_ids.h>
#include <vmm/virtio_config.h>
#include <vmm/mmio.h>
#include <vmm/vmm.h>
#include <parlib/arch/trap.h>
#include <parlib/bitmask.h>
#include <stdio.h>

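/* Returns TRUE if the posted interrupt descriptor has its outstanding
 * notification bit set, i.e. an IRQ was posted but not yet delivered. */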
static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
{
	return GET_BITMASK_BIT(gpci->posted_irq_desc, VMX_POSTED_OUTSTANDING_NOTIF);
}

/* Returns true if the hardware will trigger an IRQ for the guest.  These
 * virtual IRQs are only processed under certain situations, like vmentry, and
 * posted IRQs.  See 'Evaluation of Pending Virtual Interrupts' in the SDM. */
static bool virtual_irq_is_pending(struct guest_thread *gth)
{
	struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
	uint8_t rvi, vppr;

	/* Currently, the lower 4 bits are various ways to block IRQs, e.g.
	 * blocking by STI.  The other bits must be 0.  Presumably any new
	 * bits are types of IRQ blocking. */
	if (gth_to_vmtf(gth)->tf_intrinfo1)
		return false;
	vppr = read_mmreg32((uintptr_t)gpci->vapic_addr + 0xa0);
	rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
	return (rvi & 0xf0) > (vppr & 0xf0);
}

/* Blocks a guest pcore / thread until it has an IRQ pending.  Syncs with
 * vmm_interrupt_guest(). */
static void sleep_til_irq(struct guest_thread *gth)
{
	struct vmm_gpcore_init *gpci = gth_to_gpci(gth);

	/* The invariant is that if an IRQ is posted, but not delivered, we
	 * will not sleep.  Anyone who posts an IRQ must signal after setting
	 * it.  vmm_interrupt_guest() does this.  If we use alternate sources
	 * of IRQ posting, we'll need to revisit this.  For more details, see
	 * the notes in the kernel IPI-IRC fast path.
	 *
	 * Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
	 * possible that the hardware attempted to post the interrupt.  In SDM
	 * parlance, the processor could have "recognized" the virtual IRQ,
	 * but not delivered it yet.  This could happen if the guest had
	 * executed "sti", but not "hlt" yet.  The IRQ was posted and
	 * recognized, but not delivered ("sti blocking").  Then the guest
	 * executes "hlt", and vmexits.  OUTSTANDING_NOTIF will be clear in
	 * this case.  RVI should be set - at least to the vector we just
	 * sent, but possibly to a greater vector if multiple were sent.  RVI
	 * should only be cleared after virtual IRQs were actually delivered.
	 * So checking OUTSTANDING_NOTIF and RVI should suffice.
	 *
	 * Note that when we see a notif or pending virtual IRQ, we don't
	 * actually deliver the IRQ, we'll just restart the guest and the
	 * hardware will deliver the virtual IRQ at the appropriate time.
	 *
	 * The more traditional race here is if the halt starts concurrently
	 * with the post; that's why we sync with the mutex to make sure there
	 * is an ordering between the actual halt (this function) and the
	 * posting. */
	uth_mutex_lock(gth->halt_mtx);
	while (!(pir_notif_is_set(gpci) || virtual_irq_is_pending(gth)))
		uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
	uth_mutex_unlock(gth->halt_mtx);
}

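/* Sub-leaf numbers for CPUID leaf 0x0B (extended topology enumeration). */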
enum {
	CPUID_0B_LEVEL_SMT = 0,
	CPUID_0B_LEVEL_CORE
};

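/* Emulates CPUID leaf 0x0B for the guest, reporting a flat topology: each
 * guest pcore is its own core with no SMT siblings, using gpc_id as the
 * x2APIC ID.  Other CPUID leaves are not handled here. */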
static bool handle_cpuid(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);
	uint32_t level = vm_tf->tf_rcx & 0x0F;

	if (vm_tf->tf_rax != 0x0B)
		return FALSE;

	vm_tf->tf_rip += 2;
	vm_tf->tf_rax = 0;
	vm_tf->tf_rbx = 0;
	vm_tf->tf_rcx = level;
	vm_tf->tf_rdx = gth->gpc_id;
	if (level == CPUID_0B_LEVEL_SMT) {
		vm_tf->tf_rax = 0;
		vm_tf->tf_rbx = 1;
		vm_tf->tf_rcx |= ((level + 1) << 8);
	}
	if (level == CPUID_0B_LEVEL_CORE) {
		uint32_t shift = LOG2_UP(vm->nr_gpcs);

		if (shift > 0x1F)
			shift = 0x1F;
		vm_tf->tf_rax = shift;
		vm_tf->tf_rbx = vm->nr_gpcs;
		vm_tf->tf_rcx |= ((level + 1) << 8);
	}

	return TRUE;
}

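/* Handles an EPT violation.  If the faulting GPA is in a VMR-backed region,
 * we just ask the kernel to populate it.  Otherwise, decode the faulting
 * instruction and emulate the access: virtio-mmio devices, the IOAPIC page at
 * 0xfec00000, and reads from the guest's low 4K page. */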
static bool handle_ept_fault(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);
	uint64_t gpa, *regp;
	uint8_t regx;
	int store, size;
	int advance;
	int ret;

	if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
		ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0,
				  0, 0);
		if (ret <= 0)
			panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
			      ret);
		return TRUE;
	}
	ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);

	if (ret < 0)
		return FALSE;
	if (ret == VM_PAGE_FAULT) {
		/* We were unable to translate RIP due to an ept fault */
		vm_tf->tf_trap_inject = VM_TRAP_VALID
					| VM_TRAP_ERROR_CODE
					| VM_TRAP_HARDWARE
					| HW_TRAP_PAGE_FAULT;
		return TRUE;
	}

	assert(size >= 0);
	/* TODO use helpers for some of these addr checks.  the fee/fec ones
	 * might be wrong too. */
	for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
		if (vm->virtio_mmio_devices[i] == NULL)
			continue;
		if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
			continue;
		/* TODO: can the guest cause us to spawn off infinite threads? */
		if (store)
			virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa,
				       size, (uint32_t *)regp);
		else
			*regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i],
					       gpa, size);
		vm_tf->tf_rip += advance;
		return TRUE;
	}
	if (PG_ADDR(gpa) == 0xfec00000) {
		do_ioapic(gth, gpa, regx, regp, store);
	} else if (PG_ADDR(gpa) == 0) {
		memmove(regp, &vm->low4k[gpa], size);
	} else {
		fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
		fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
			vm_tf->tf_exit_reason);
		fprintf(stderr, "Returning 0xffffffff\n");
		showstatus(stderr, gth);
		/* Just fill the whole register for now. */
		*regp = (uint64_t) -1;
		return FALSE;
	}
	vm_tf->tf_rip += advance;
	return TRUE;
}

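/* Backdoor console output: the guest passes one character per vmcall in rdi.
 * A '%' is printed after each newline. */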
static bool handle_vmcall_printc(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	uint8_t byte;

	byte = vm_tf->tf_rdi;
	printf("%c", byte);
	if (byte == '\n')
		printf("%c", '%');
	fflush(stdout);
	return TRUE;
}

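/* VMCALL_SMPBOOT: the BSP (guest pcore 0) brings up the next guest pcore.
 * The AP's entry RIP is passed in rdi and its initial RSP in rsi; the AP
 * starts on the BSP's CR3 until it installs its own page tables. */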
static bool handle_vmcall_smpboot(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct vm_trapframe *vm_tf_ap;
	struct virtual_machine *vm = gth_to_vm(gth);
	int cur_pcores = vm->up_gpcs;

	/* Check if we're guest pcore 0. Only the BSP is allowed to start APs. */
	if (vm_tf->tf_guest_pcoreid != 0) {
		fprintf(stderr,
			"Only guest pcore 0 is allowed to start APs. core was %ld\n",
			vm_tf->tf_guest_pcoreid);
		return FALSE;
	}

	/* Check if we've reached the maximum, if yes, blow out. */
	if (vm->nr_gpcs == cur_pcores) {
		fprintf(stderr,
			"guest tried to start up too many cores. max was %ld, current up %ld\n",
			vm->nr_gpcs, cur_pcores);
		return FALSE;
	}

	/* Start up secondary core. */
	vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
	/* We use the BSP's CR3 for now.  This should be fine because they
	 * change it later anyway. */
	vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;

	/* Starting RIP is passed in via rdi. */
	vm_tf_ap->tf_rip = vm_tf->tf_rdi;

	/* Starting RSP is passed in via rsi. */
	vm_tf_ap->tf_rsp = vm_tf->tf_rsi;

	vm->up_gpcs++;

	start_guest_thread(gpcid_to_gth(vm, cur_pcores));

	return TRUE;
}

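/* VMCALL_GET_TSCFREQ: reports the TSC frequency to the guest in rax, scaled
 * down by 1000 (KHz, assuming get_tsc_freq() reports Hz). */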
static bool handle_vmcall_get_tscfreq(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	vm_tf->tf_rax = get_tsc_freq() / 1000;
	return TRUE;
}

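/* Dispatches a vmcall from the guest; the vmcall number is in rax.  A VM can
 * install its own handler via vm->vmcall, which then fully owns the vmcall
 * (including advancing RIP).  For the default handlers, RIP is advanced past
 * the 3-byte vmcall instruction on success. */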
static bool handle_vmcall(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);
	bool retval = FALSE;

	if (vm->vmcall)
		return vm->vmcall(gth, vm_tf);

	switch (vm_tf->tf_rax) {
	case VMCALL_PRINTC:
		retval = handle_vmcall_printc(gth);
		break;
	case VMCALL_SMPBOOT:
		retval = handle_vmcall_smpboot(gth);
		break;
	case VMCALL_GET_TSCFREQ:
		retval = handle_vmcall_get_tscfreq(gth);
		break;
	case VMCALL_TRACE_TF:
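		/* rax was clobbered with the vmcall number; presumably the
		 * guest stashed its original rax in r11 before the vmcall,
		 * which is why r11 itself is reported as a sentinel below. */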
		trace_printf("  rax  0x%016lx\n",      vm_tf->tf_r11);
		trace_printf("  rbx  0x%016lx\n",      vm_tf->tf_rbx);
		trace_printf("  rcx  0x%016lx\n",      vm_tf->tf_rcx);
		trace_printf("  rdx  0x%016lx\n",      vm_tf->tf_rdx);
		trace_printf("  rbp  0x%016lx\n",      vm_tf->tf_rbp);
		trace_printf("  rsi  0x%016lx\n",      vm_tf->tf_rsi);
		trace_printf("  rdi  0x%016lx\n",      vm_tf->tf_rdi);
		trace_printf("  r8   0x%016lx\n",      vm_tf->tf_r8);
		trace_printf("  r9   0x%016lx\n",      vm_tf->tf_r9);
		trace_printf("  r10  0x%016lx\n",      vm_tf->tf_r10);
		trace_printf("  r11  0x%016lx\n",      0xdeadbeef);
		trace_printf("  r12  0x%016lx\n",      vm_tf->tf_r12);
		trace_printf("  r13  0x%016lx\n",      vm_tf->tf_r13);
		trace_printf("  r14  0x%016lx\n",      vm_tf->tf_r14);
		trace_printf("  r15  0x%016lx\n",      vm_tf->tf_r15);
		trace_printf("  rip  0x%016lx\n",      vm_tf->tf_rip);
		trace_printf("  rflg 0x%016lx\n",      vm_tf->tf_rflags);
		trace_printf("  rsp  0x%016lx\n",      vm_tf->tf_rsp);
		trace_printf("  cr2  0x%016lx\n",      vm_tf->tf_cr2);
		trace_printf("  cr3  0x%016lx\n",      vm_tf->tf_cr3);
		trace_printf("Gpcore 0x%08x\n",        vm_tf->tf_guest_pcoreid);
		trace_printf("Flags  0x%08x\n",        vm_tf->tf_flags);
		trace_printf("Inject 0x%08x\n",        vm_tf->tf_trap_inject);
		trace_printf("ExitRs 0x%08x\n",        vm_tf->tf_exit_reason);
		trace_printf("ExitQl 0x%08x\n",        vm_tf->tf_exit_qual);
		trace_printf("Intr1  0x%016lx\n",      vm_tf->tf_intrinfo1);
		trace_printf("Intr2  0x%016lx\n",      vm_tf->tf_intrinfo2);
		trace_printf("GIntr  0x----%04x\n",    vm_tf->tf_guest_intr_status);
		trace_printf("GVA    0x%016lx\n",      vm_tf->tf_guest_va);
		trace_printf("GPA    0x%016lx\n",      vm_tf->tf_guest_pa);
		retval = TRUE;
		break;
	}

	if (retval)
		vm_tf->tf_rip += 3;

	return retval;
}

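/* Handles a port I/O (in/out) vmexit.  io() does the decode and the device
 * access; if it hit an EPT fault while translating RIP, we inject a hardware
 * page fault into the guest instead. */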
static bool handle_io(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	int ret = io(gth);

	if (ret < 0)
		return FALSE;
	if (ret == VM_PAGE_FAULT) {
		/* We were unable to translate RIP due to an ept fault */
		vm_tf->tf_trap_inject = VM_TRAP_VALID
					| VM_TRAP_ERROR_CODE
					| VM_TRAP_HARDWARE
					| HW_TRAP_PAGE_FAULT;
	}
	return TRUE;
}

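/* Emulates rdmsr/wrmsr.  msrio() returns nonzero on failure, in which case we
 * inject a #GP into the guest; otherwise we skip the 2-byte instruction. */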
static bool handle_msr(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
		/* Use event injection through vmctl to send a general
		 * protection fault.  vmctl.interrupt gets written to the
		 * VM-Entry Interruption-Information Field by vmx. */
		vm_tf->tf_trap_inject = VM_TRAP_VALID
					| VM_TRAP_ERROR_CODE
					| VM_TRAP_HARDWARE
					| HW_TRAP_GP_FAULT;
	} else {
		vm_tf->tf_rip += 2;
	}
	return TRUE;
}

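/* Handles an APIC-access vmexit: decode the faulting instruction, hand it to
 * __apic_access() for emulation, then skip the instruction. */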
static bool handle_apic_access(struct guest_thread *gth)
{
	uint64_t gpa, *regp;
	uint8_t regx;
	int store, size;
	int advance;
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
		return FALSE;
	if (__apic_access(gth, gpa, regx, regp, store))
		return FALSE;
	vm_tf->tf_rip += advance;
	return TRUE;
}

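/* Handles a guest hlt.  If the VM wants halt exits (vm->halt_exit), punt to
 * the caller; otherwise block the guest thread until an IRQ is posted, then
 * skip the 1-byte hlt instruction. */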
static bool handle_halt(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
	struct virtual_machine *vm = gth_to_vm(gth);

	if (vm->halt_exit)
		return FALSE;
	/* It's possible the guest disabled IRQs and halted, perhaps waiting
	 * on an NMI or something.  If we need to support that, we can change
	 * this. */
	sleep_til_irq(gth);
	vm_tf->tf_rip += 1;
	return TRUE;
}

/* The guest is told (via cpuid) that there is no monitor/mwait.  Callers of
 * mwait are paravirtualized halts.
 *
 * We don't support monitor/mwait in software, so if they tried to mwait
 * without break-on-interrupt and with interrupts disabled, they'll never
 * wake up.  So we'll always break on interrupt. */
static bool handle_mwait(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	sleep_til_irq(gth);
	vm_tf->tf_rip += 3;
	return TRUE;
}

/* Is this a vmm specific thing?  or generic?
 *
 * what do we do when we want to kill the vm?  what are our other options? */
bool handle_vmexit(struct guest_thread *gth)
{
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	switch (vm_tf->tf_exit_reason) {
	case EXIT_REASON_CPUID:
		return handle_cpuid(gth);
	case EXIT_REASON_EPT_VIOLATION:
		return handle_ept_fault(gth);
	case EXIT_REASON_VMCALL:
		return handle_vmcall(gth);
	case EXIT_REASON_IO_INSTRUCTION:
		return handle_io(gth);
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_MSR_READ:
		return handle_msr(gth);
	case EXIT_REASON_APIC_ACCESS:
		return handle_apic_access(gth);
	case EXIT_REASON_HLT:
		return handle_halt(gth);
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return handle_mwait(gth);
	case EXIT_REASON_EXTERNAL_INTERRUPT:
	case EXIT_REASON_APIC_WRITE:
		/* TODO: just ignore these? */
		return TRUE;
	default:
		fprintf(stderr, "VMM library: don't know how to handle exit %d\n",
			vm_tf->tf_exit_reason);
		fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
			vm_tf->tf_exit_reason);
		return FALSE;
	}
}