/* Copyright (c) 2015-2016 Google Inc.
* See LICENSE for details. */
#include <parlib/common.h>
#include <vmm/virtio.h>
#include <vmm/virtio_mmio.h>
#include <vmm/virtio_ids.h>
#include <vmm/virtio_config.h>
#include <vmm/mmio.h>
#include <vmm/vmm.h>
#include <parlib/arch/trap.h>
#include <parlib/bitmask.h>
#include <stdio.h>
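/* Returns true if the 'outstanding notification' bit is set in the guest
 * pcore's posted interrupt descriptor. */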
static bool pir_notif_is_set(struct vmm_gpcore_init *gpci)
{
return GET_BITMASK_BIT(gpci->posted_irq_desc,
VMX_POSTED_OUTSTANDING_NOTIF);
}
/* Returns true if the hardware will trigger an IRQ for the guest. These
 * virtual IRQs are only processed in certain situations, such as at vmentry
 * or when an IRQ is posted. See 'Evaluation of Pending Virtual Interrupts' in
 * the SDM. */
static bool virtual_irq_is_pending(struct guest_thread *gth)
{
struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
uint8_t rvi, vppr;
	/* Currently, the lower 4 bits are various ways to block IRQs, e.g.
	 * blocking by STI. The other bits must be 0. Presumably any new
	 * bits are types of IRQ blocking. */
if (gth_to_vmtf(gth)->tf_intrinfo1)
return false;
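	/* The virtual PPR lives at offset 0xa0 of the vAPIC page, and RVI is
	 * the low byte of the guest interrupt status. Per the SDM, a pending
	 * virtual IRQ is delivered only if its priority class (the upper
	 * nibble of the vector) exceeds the PPR's class. */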
	vppr = read_mmreg32((uintptr_t)gpci->vapic_addr + 0xa0);
rvi = gth_to_vmtf(gth)->tf_guest_intr_status & 0xff;
return (rvi & 0xf0) > (vppr & 0xf0);
}
/* Blocks a guest pcore / thread until it has an IRQ pending. Syncs with
* vmm_interrupt_guest(). */
static void sleep_til_irq(struct guest_thread *gth)
{
struct vmm_gpcore_init *gpci = gth_to_gpci(gth);
/* The invariant is that if an IRQ is posted, but not delivered, we will
* not sleep. Anyone who posts an IRQ must signal after setting it.
	 * vmm_interrupt_guest() does this. If we use alternate sources of IRQ
	 * posting, we'll need to revisit this. For more details, see the notes
	 * in the kernel IPI-IRQ fast path.
*
* Although vmm_interrupt_guest() only writes OUTSTANDING_NOTIF, it's
* possible that the hardware attempted to post the interrupt. In SDM
* parlance, the processor could have "recognized" the virtual IRQ, but
* not delivered it yet. This could happen if the guest had executed
* "sti", but not "hlt" yet. The IRQ was posted and recognized, but not
* delivered ("sti blocking"). Then the guest executes "hlt", and
* vmexits. OUTSTANDING_NOTIF will be clear in this case. RVI should
* be set - at least to the vector we just sent, but possibly to a
* greater vector if multiple were sent. RVI should only be cleared
* after virtual IRQs were actually delivered. So checking
* OUTSTANDING_NOTIF and RVI should suffice.
*
* Note that when we see a notif or pending virtual IRQ, we don't
* actually deliver the IRQ, we'll just restart the guest and the
* hardware will deliver the virtual IRQ at the appropriate time.
*
* The more traditional race here is if the halt starts concurrently
* with the post; that's why we sync with the mutex to make sure there
* is an ordering between the actual halt (this function) and the
* posting. */
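	/* A rough sketch of the posting side (the real code is in
	 * vmm_interrupt_guest()): post the IRQ / set OUTSTANDING_NOTIF first,
	 * then grab halt_mtx, signal halt_cv, and unlock. With that ordering,
	 * either the while loop below sees the pending IRQ before we block, or
	 * the signal arrives after we are queued on the CV and wakes us. */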
uth_mutex_lock(gth->halt_mtx);
while (!(pir_notif_is_set(gpci) || virtual_irq_is_pending(gth)))
uth_cond_var_wait(gth->halt_cv, gth->halt_mtx);
uth_mutex_unlock(gth->halt_mtx);
}
enum {
CPUID_0B_LEVEL_SMT = 0,
CPUID_0B_LEVEL_CORE
};
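/* Emulates CPUID leaf 0x0B (extended topology enumeration); returns FALSE for
 * any other leaf. We report a flat topology: one logical processor per core,
 * nr_gpcs cores, and the guest pcore id as the x2APIC ID. */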
static bool handle_cpuid(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
struct virtual_machine *vm = gth_to_vm(gth);
uint32_t level = vm_tf->tf_rcx & 0x0F;
if (vm_tf->tf_rax != 0x0B)
return FALSE;
vm_tf->tf_rip += 2;
vm_tf->tf_rax = 0;
vm_tf->tf_rbx = 0;
vm_tf->tf_rcx = level;
vm_tf->tf_rdx = gth->gpc_id;
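	/* SMT level: report one logical processor per core (no hyperthreads). */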
if (level == CPUID_0B_LEVEL_SMT) {
vm_tf->tf_rax = 0;
vm_tf->tf_rbx = 1;
vm_tf->tf_rcx |= ((level + 1) << 8);
}
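	/* Core level: report all nr_gpcs cores; EAX is the number of x2APIC ID
	 * bits to shift out to reach the next (package) level. */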
if (level == CPUID_0B_LEVEL_CORE) {
uint32_t shift = LOG2_UP(vm->nr_gpcs);
if (shift > 0x1F)
shift = 0x1F;
vm_tf->tf_rax = shift;
vm_tf->tf_rbx = vm->nr_gpcs;
vm_tf->tf_rcx |= ((level + 1) << 8);
}
return TRUE;
}
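/* Handles EPT faults (MMIO and lazily-backed memory). If the GPA falls in a
 * VMR-backed region, we ask the kernel to populate the page. Otherwise we
 * decode the faulting instruction and emulate the access: virtio-mmio devices,
 * the IOAPIC page at 0xfec00000, or the guest's low 4K page. */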
static bool handle_ept_fault(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
struct virtual_machine *vm = gth_to_vm(gth);
uint64_t gpa, *regp;
uint8_t regx;
int store, size;
int advance;
int ret;
if (vm_tf->tf_flags & VMCTX_FL_EPT_VMR_BACKED) {
ret = ros_syscall(SYS_populate_va, vm_tf->tf_guest_pa, 1, 0, 0,
0, 0);
if (ret <= 0)
panic("[user] handle_ept_fault: populate_va failed: ret = %d\n",
ret);
return TRUE;
}
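	/* Decode the faulting instruction: the target GPA, the register
	 * involved (its index and a pointer to its value), whether it was a
	 * store, the access size, and how far to advance RIP afterwards. */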
ret = decode(gth, &gpa, &regx, &regp, &store, &size, &advance);
if (ret < 0)
return FALSE;
if (ret == VM_PAGE_FAULT) {
/* We were unable to translate RIP due to an ept fault */
vm_tf->tf_trap_inject = VM_TRAP_VALID
| VM_TRAP_ERROR_CODE
| VM_TRAP_HARDWARE
| HW_TRAP_PAGE_FAULT;
return TRUE;
}
assert(size >= 0);
/* TODO use helpers for some of these addr checks. the fee/fec ones
* might be wrong too. */
for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) {
if (vm->virtio_mmio_devices[i] == NULL)
continue;
if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr)
continue;
/* TODO: can the guest cause us to spawn off infinite threads?
*/
if (store)
virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa,
size, (uint32_t *)regp);
else
*regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i],
gpa, size);
vm_tf->tf_rip += advance;
return TRUE;
}
if (PG_ADDR(gpa) == 0xfec00000) {
do_ioapic(gth, gpa, regx, regp, store);
} else if (PG_ADDR(gpa) == 0) {
memmove(regp, &vm->low4k[gpa], size);
} else {
fprintf(stderr, "EPT violation: can't handle %p\n", gpa);
fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip,
vm_tf->tf_exit_reason);
fprintf(stderr, "Returning 0xffffffff\n");
showstatus(stderr, gth);
/* Just fill the whole register for now. */
*regp = (uint64_t) -1;
return FALSE;
}
vm_tf->tf_rip += advance;
return TRUE;
}
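/* VMCALL_PRINTC: the guest passes a single character in RDI. We echo it to
 * stdout and print a '%' after each newline, presumably as a simple prompt
 * marker. */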
static bool handle_vmcall_printc(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
uint8_t byte;
byte = vm_tf->tf_rdi;
printf("%c", byte);
if (byte == '\n')
printf("%c", '%');
fflush(stdout);
return TRUE;
}
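/* VMCALL_SMPBOOT: guest pcore 0 (the BSP) starts the next AP. The AP's
 * initial RIP is passed in RDI and its RSP in RSI; it inherits the BSP's CR3
 * (the guest changes it later anyway). */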
static bool handle_vmcall_smpboot(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
struct vm_trapframe *vm_tf_ap;
struct virtual_machine *vm = gth_to_vm(gth);
int cur_pcores = vm->up_gpcs;
/* Check if we're guest pcore 0. Only the BSP is allowed to start APs.
*/
if (vm_tf->tf_guest_pcoreid != 0) {
fprintf(stderr,
"Only guest pcore 0 is allowed to start APs. core was %ld\n",
vm_tf->tf_guest_pcoreid);
return FALSE;
}
	/* Check if we've reached the maximum; if so, bail out. */
if (vm->nr_gpcs == cur_pcores) {
fprintf(stderr,
"guest tried to start up too many cores. max was %ld, current up %ld\n",
vm->nr_gpcs, cur_pcores);
return FALSE;
}
/* Start up secondary core. */
vm_tf_ap = gpcid_to_vmtf(vm, cur_pcores);
/* We use the BSP's CR3 for now. This should be fine because they
* change it later anyway. */
vm_tf_ap->tf_cr3 = vm_tf->tf_cr3;
/* Starting RIP is passed in via rdi. */
vm_tf_ap->tf_rip = vm_tf->tf_rdi;
/* Starting RSP is passed in via rsi. */
vm_tf_ap->tf_rsp = vm_tf->tf_rsi;
vm->up_gpcs++;
start_guest_thread(gpcid_to_gth(vm, cur_pcores));
return TRUE;
}
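/* VMCALL_GET_TSCFREQ: returns the TSC frequency in RAX, in KHz assuming
 * get_tsc_freq() reports Hz. */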
static bool handle_vmcall_get_tscfreq(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
vm_tf->tf_rax = get_tsc_freq() / 1000;
return TRUE;
}
static bool handle_vmcall(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
struct virtual_machine *vm = gth_to_vm(gth);
bool retval = FALSE;
if (vm->vmcall)
return vm->vmcall(gth, vm_tf);
switch (vm_tf->tf_rax) {
case VMCALL_PRINTC:
retval = handle_vmcall_printc(gth);
break;
case VMCALL_SMPBOOT:
retval = handle_vmcall_smpboot(gth);
break;
case VMCALL_GET_TSCFREQ:
retval = handle_vmcall_get_tscfreq(gth);
break;
case VMCALL_TRACE_TF:
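		/* Note: rax is reported from r11 (rax itself carried the vmcall
		 * number), and r11 is printed as a sentinel since its original
		 * value is presumably clobbered by the guest's trace helper. */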
trace_printf(" rax 0x%016lx\n", vm_tf->tf_r11);
trace_printf(" rbx 0x%016lx\n", vm_tf->tf_rbx);
trace_printf(" rcx 0x%016lx\n", vm_tf->tf_rcx);
trace_printf(" rdx 0x%016lx\n", vm_tf->tf_rdx);
trace_printf(" rbp 0x%016lx\n", vm_tf->tf_rbp);
trace_printf(" rsi 0x%016lx\n", vm_tf->tf_rsi);
trace_printf(" rdi 0x%016lx\n", vm_tf->tf_rdi);
trace_printf(" r8 0x%016lx\n", vm_tf->tf_r8);
trace_printf(" r9 0x%016lx\n", vm_tf->tf_r9);
trace_printf(" r10 0x%016lx\n", vm_tf->tf_r10);
trace_printf(" r11 0x%016lx\n", 0xdeadbeef);
trace_printf(" r12 0x%016lx\n", vm_tf->tf_r12);
trace_printf(" r13 0x%016lx\n", vm_tf->tf_r13);
trace_printf(" r14 0x%016lx\n", vm_tf->tf_r14);
trace_printf(" r15 0x%016lx\n", vm_tf->tf_r15);
trace_printf(" rip 0x%016lx\n", vm_tf->tf_rip);
trace_printf(" rflg 0x%016lx\n", vm_tf->tf_rflags);
trace_printf(" rsp 0x%016lx\n", vm_tf->tf_rsp);
trace_printf(" cr2 0x%016lx\n", vm_tf->tf_cr2);
trace_printf(" cr3 0x%016lx\n", vm_tf->tf_cr3);
trace_printf("Gpcore 0x%08x\n", vm_tf->tf_guest_pcoreid);
trace_printf("Flags 0x%08x\n", vm_tf->tf_flags);
trace_printf("Inject 0x%08x\n", vm_tf->tf_trap_inject);
trace_printf("ExitRs 0x%08x\n", vm_tf->tf_exit_reason);
trace_printf("ExitQl 0x%08x\n", vm_tf->tf_exit_qual);
trace_printf("Intr1 0x%016lx\n", vm_tf->tf_intrinfo1);
trace_printf("Intr2 0x%016lx\n", vm_tf->tf_intrinfo2);
trace_printf("GIntr 0x----%04x\n",
vm_tf->tf_guest_intr_status);
trace_printf("GVA 0x%016lx\n", vm_tf->tf_guest_va);
trace_printf("GPA 0x%016lx\n", vm_tf->tf_guest_pa);
retval = true;
break;
}
if (retval)
vm_tf->tf_rip += 3;
return retval;
}
static bool handle_io(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
int ret = io(gth);
if (ret < 0)
return FALSE;
if (ret == VM_PAGE_FAULT) {
/* We were unable to translate RIP due to an ept fault */
vm_tf->tf_trap_inject = VM_TRAP_VALID
| VM_TRAP_ERROR_CODE
| VM_TRAP_HARDWARE
| HW_TRAP_PAGE_FAULT;
}
return TRUE;
}
static bool handle_msr(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) {
		/* Use event injection through vmctl to send a general
		 * protection fault. vmctl.interrupt gets written to the
		 * VM-Entry Interruption-Information Field by vmx. */
vm_tf->tf_trap_inject = VM_TRAP_VALID
| VM_TRAP_ERROR_CODE
| VM_TRAP_HARDWARE
| HW_TRAP_GP_FAULT;
} else {
vm_tf->tf_rip += 2;
}
return TRUE;
}
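/* Emulates an access to the APIC-access page: decode the faulting instruction
 * and forward the read or write to __apic_access(). */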
static bool handle_apic_access(struct guest_thread *gth)
{
uint64_t gpa, *regp;
uint8_t regx;
int store, size;
int advance;
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
return FALSE;
if (__apic_access(gth, gpa, regx, regp, store))
return FALSE;
vm_tf->tf_rip += advance;
return TRUE;
}
static bool handle_halt(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
struct virtual_machine *vm = gth_to_vm(gth);
if (vm->halt_exit)
return FALSE;
/* It's possible the guest disabled IRQs and halted, perhaps waiting on
* an NMI or something. If we need to support that, we can change this.
*/
sleep_til_irq(gth);
vm_tf->tf_rip += 1;
return TRUE;
}
/* The guest is told (via cpuid) that there is no monitor/mwait. Calls to
 * mwait are treated as paravirtualized halts.
*
* We don't support monitor/mwait in software, so if they tried to mwait
* without break-on-interrupt and with interrupts disabled, they'll never
* wake up. So we'll always break on interrupt. */
static bool handle_mwait(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
sleep_til_irq(gth);
vm_tf->tf_rip += 3;
return TRUE;
}
/* Is this a VMM-specific thing, or generic?
 *
 * What do we do when we want to kill the VM? What are our other options? */
bool handle_vmexit(struct guest_thread *gth)
{
struct vm_trapframe *vm_tf = gth_to_vmtf(gth);
switch (vm_tf->tf_exit_reason) {
case EXIT_REASON_CPUID:
return handle_cpuid(gth);
case EXIT_REASON_EPT_VIOLATION:
return handle_ept_fault(gth);
case EXIT_REASON_VMCALL:
return handle_vmcall(gth);
case EXIT_REASON_IO_INSTRUCTION:
return handle_io(gth);
case EXIT_REASON_MSR_WRITE:
case EXIT_REASON_MSR_READ:
return handle_msr(gth);
case EXIT_REASON_APIC_ACCESS:
return handle_apic_access(gth);
case EXIT_REASON_HLT:
return handle_halt(gth);
case EXIT_REASON_MWAIT_INSTRUCTION:
return handle_mwait(gth);
case EXIT_REASON_EXTERNAL_INTERRUPT:
case EXIT_REASON_APIC_WRITE:
/* TODO: just ignore these? */
return TRUE;
default:
fprintf(stderr,
"VMM library: don't know how to handle exit %d\n",
vm_tf->tf_exit_reason);
fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip,
vm_tf->tf_exit_reason);
return FALSE;
}
}