x86: vmm: Rework VMRESUME logic The old code assumed that if we found a loaded VMCS (non-partial trapframe (TF), VMCS still sitting around on the core), then it was ready to be resumed. This would only be true if we previously actually ran the VMCS. It's possible for us to have the VMCS loaded, but never launched. This was pretty easy to do: launch it as an SCP (single-core process). Then the first time we tried to run the VM, it was on Core 0, where it had been created and was still sitting around. When dealing with a VMCS, we must do the following: - load it - launch it - resume it. The old code thought that 'loaded' (lingering on the core) meant 'ready to resume.' The bug was pretty easy to find and sort out, once I had VMEXITs reflected to userspace (instead of locking up the machine) and could see the error number. (exit_qual == 5 -> VMRESUME with non-launched VMCS). Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/arch/x86/process64.c b/kern/arch/x86/process64.c index 6d3c677..f065dc1 100644 --- a/kern/arch/x86/process64.c +++ b/kern/arch/x86/process64.c
@@ -104,24 +104,19 @@ struct per_cpu_info *pcpui = &per_cpu_info[core_id()]; struct proc *p = pcpui->cur_proc; struct guest_pcore *gpc; - bool should_vmresume; if (x86_vmtf_is_partial(tf)) { gpc = lookup_guest_pcore(p, tf->tf_guest_pcoreid); assert(gpc); assert(pcpui->guest_pcoreid == tf->tf_guest_pcoreid); - should_vmresume = TRUE; + assert(gpc->should_vmresume); } else { - gpc = load_guest_pcore(p, tf->tf_guest_pcoreid, &should_vmresume); + gpc = load_guest_pcore(p, tf->tf_guest_pcoreid); if (!gpc) { tf->tf_exit_reason = EXIT_REASON_GUEST_IN_USE; handle_bad_vm_tf(tf); } } - if (should_vmresume) - tf->tf_flags |= VMCTX_FL_VMRESUME; - else - tf->tf_flags &= ~VMCTX_FL_VMRESUME; vmcs_write(GUEST_RSP, tf->tf_rsp); vmcs_write(GUEST_CR3, tf->tf_cr3); vmcs_write(GUEST_RIP, tf->tf_rip); @@ -140,6 +135,15 @@ * will arrive after we vmenter, since IRQs are currently disabled. */ if (test_bit(VMX_POSTED_OUTSTANDING_NOTIF, gpc->posted_irq_desc)) send_self_ipi(I_POKE_CORE); + /* The first time a VMCS is started after being loaded, it must be launched. + * Subsequent starts must be resumes. Once the VMCS is cleared, we start + * with a launch again. Note this is the VMCS, not the GPC unload. */ + if (gpc->should_vmresume) { + tf->tf_flags |= VMCTX_FL_VMRESUME; + } else { + tf->tf_flags &= ~VMCTX_FL_VMRESUME; + gpc->should_vmresume = TRUE; + } /* vmlaunch/resume can fail, so we need to be able to return from this. * Thus we can't clobber rsp via the popq style of setting the registers. * Likewise, we don't want to lose rbp via the clobber list.
diff --git a/kern/arch/x86/vmm/intel/vmx.c b/kern/arch/x86/vmm/intel/vmx.c index 2debc1b..8e1033d 100644 --- a/kern/arch/x86/vmm/intel/vmx.c +++ b/kern/arch/x86/vmm/intel/vmx.c
@@ -1068,7 +1068,6 @@ { ERRSTACK(2); int8_t state = 0; - bool ignore; struct guest_pcore *gpc = kmalloc(sizeof(struct guest_pcore), MEM_WAIT); if (!gpc) @@ -1091,9 +1090,10 @@ printd("%d: gpc->vmcs is %p\n", core_id(), gpc->vmcs); gpc->cpu = -1; gpc->vmcs_core_id = -1; + gpc->should_vmresume = FALSE; disable_irqsave(&state); - vmx_load_guest_pcore(gpc, &ignore); + vmx_load_guest_pcore(gpc); vmx_setup_vmcs(gpc); vmx_setup_initial_guest_state(p, gpci); vmx_unload_guest_pcore(gpc); @@ -1362,6 +1362,7 @@ if (gpc) { vmcs_clear(gpc->vmcs); ept_sync_context(gpc_get_eptp(gpc)); + gpc->should_vmresume = FALSE; wmb(); /* write -1 after clearing */ gpc->vmcs_core_id = -1; PERCPU_VAR(gpc_to_clear_to) = NULL; @@ -1377,14 +1378,13 @@ * instance, only one core can be loading or unloading a particular GPC at a * time. Other cores write to our GPC's vmcs_core_id and vmcs (doing a * vmcs_clear). Once they write vmcs_core_id != -1, it's ours. */ -void vmx_load_guest_pcore(struct guest_pcore *gpc, bool *should_vmresume) +void vmx_load_guest_pcore(struct guest_pcore *gpc) { int remote_core; assert(!irq_is_enabled()); if (gpc->vmcs_core_id == core_id()) { PERCPU_VAR(gpc_to_clear_to) = NULL; - *should_vmresume = TRUE; return; } /* Clear ours *before* waiting on someone else; avoids deadlock (circular @@ -1409,7 +1409,6 @@ vmcs_load(gpc->vmcs); __vmx_setup_pcpu(gpc); gpc->vmcs_core_id = core_id(); - *should_vmresume = FALSE; } void vmx_unload_guest_pcore(struct guest_pcore *gpc)
diff --git a/kern/arch/x86/vmm/intel/vmx.h b/kern/arch/x86/vmm/intel/vmx.h index 631fc5a..38d6b7f 100644 --- a/kern/arch/x86/vmm/intel/vmx.h +++ b/kern/arch/x86/vmm/intel/vmx.h
@@ -314,6 +314,6 @@ uint32_t try_set_0; }; -void vmx_load_guest_pcore(struct guest_pcore *gpc, bool *should_vmresume); +void vmx_load_guest_pcore(struct guest_pcore *gpc); void vmx_unload_guest_pcore(struct guest_pcore *gpc); void vmx_clear_vmcs(void);
diff --git a/kern/arch/x86/vmm/vmm.c b/kern/arch/x86/vmm/vmm.c index 01bcaf8..2694800 100644 --- a/kern/arch/x86/vmm/vmm.c +++ b/kern/arch/x86/vmm/vmm.c
@@ -160,8 +160,7 @@ return p->vmm.guest_pcores[guest_pcoreid]; } -struct guest_pcore *load_guest_pcore(struct proc *p, int guest_pcoreid, - bool *should_vmresume) +struct guest_pcore *load_guest_pcore(struct proc *p, int guest_pcoreid) { struct guest_pcore *gpc; struct per_cpu_info *pcpui = &per_cpu_info[core_id()]; @@ -179,7 +178,7 @@ spin_unlock(&p->vmm.lock); /* We've got dibs on the gpc; we don't need to hold the lock any longer. */ pcpui->guest_pcoreid = guest_pcoreid; - vmx_load_guest_pcore(gpc, should_vmresume); + vmx_load_guest_pcore(gpc); /* Load guest's xcr0 */ lxcr0(gpc->xcr0);
diff --git a/kern/arch/x86/vmm/vmm.h b/kern/arch/x86/vmm/vmm.h index 9505de7..5c043fe 100644 --- a/kern/arch/x86/vmm/vmm.h +++ b/kern/arch/x86/vmm/vmm.h
@@ -22,6 +22,7 @@ unsigned long *posted_irq_desc; struct vmcs *vmcs; int vmcs_core_id; + bool should_vmresume; uint64_t xcr0; uint64_t msr_kern_gs_base; uint64_t msr_star; @@ -75,8 +76,7 @@ void ept_flush(uint64_t eptp); struct guest_pcore *lookup_guest_pcore(struct proc *p, int guest_pcoreid); -struct guest_pcore *load_guest_pcore(struct proc *p, int guest_pcoreid, - bool *should_vmresume); +struct guest_pcore *load_guest_pcore(struct proc *p, int guest_pcoreid); void unload_guest_pcore(struct proc *p, int guest_pcoreid); #define VMM_MSR_EMU_READ 1