| //#define DEBUG | 
 | /** | 
 |  *  vmx.c - The Intel VT-x driver for Dune | 
 |  * | 
 |  * This file is derived from Linux KVM VT-x support. | 
 |  * Copyright (C) 2006 Qumranet, Inc. | 
 |  * Copyright 2010 Red Hat, Inc. and/or its affiliates. | 
 |  * | 
 |  * Original Authors: | 
 |  *   Avi Kivity   <avi@qumranet.com> | 
 |  *   Yaniv Kamay  <yaniv@qumranet.com> | 
 |  * | 
 |  * This modified version is simpler because it avoids the following | 
 |  * features that are not requirements for Dune: | 
 |  *  * Real-mode emulation | 
 |  *  * Nested VT-x support | 
 |  *  * I/O hardware emulation | 
 |  *  * Any of the more esoteric X86 features and registers | 
 |  *  * KVM-specific functionality | 
 |  * | 
 |  * In essence we provide only the minimum functionality needed to run | 
 |  * a process in vmx non-root mode rather than the full hardware emulation | 
 |  * needed to support an entire OS. | 
 |  * | 
 |  * This driver is a research prototype and as such has the following | 
 |  * limitations: | 
 |  * | 
 |  * FIXME: Backward compatibility is currently a non-goal, and only recent | 
 |  * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this | 
 |  * driver. | 
 |  * | 
 * FIXME: Eventually we should handle concurrent users of VT-x more
 |  * gracefully instead of requiring exclusive access. This would allow | 
 |  * Dune to interoperate with KVM and other HV solutions. | 
 |  * | 
 |  * FIXME: We need to support hotplugged physical CPUs. | 
 |  * | 
 |  * Authors: | 
 |  *   Adam Belay   <abelay@stanford.edu> | 
 |  */ | 
 |  | 
 | /* Basic flow. | 
 |  * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things. | 
 * You're left with the feeling that they got part way through and realized they had to have one for each of:
 |  * | 
 |  * 1) your CPU is going to be capable of running VMs, and you need state for that. | 
 |  * | 
 |  * 2) you're about to start a guest, and you need state for that. | 
 |  * | 
 * So there is "get the cpu set up to be able to run VMs" stuff, and "now
 * let's start a guest" stuff.  In Akaros, CPUs will always be set up
 * to run a VM if that is possible. Processes can flip themselves into
 * a VM, and that will require another VMCS.
 |  * | 
 |  * So: at kernel startup time, the SMP boot stuff calls | 
 |  * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which | 
 |  * in the case of this file is intel_vmm_init. That does some code | 
 |  * that sets up stuff for ALL sockets, based on the capabilities of | 
 |  * the socket it runs on. If any cpu supports vmx, it assumes they all | 
 |  * do. That's a realistic assumption. So the call_function_all is kind | 
 |  * of stupid, really; it could just see what's on the current cpu and | 
 |  * assume it's on all. HOWEVER: there are systems in the wild that | 
 |  * can run VMs on some but not all CPUs, due to BIOS mistakes, so we | 
 * might as well allow for the chance that we'll only allow VMMCPs on a
 |  * subset (not implemented yet however).  So: probe all CPUs, get a | 
 |  * count of how many support VMX and, for now, assume they all do | 
 |  * anyway. | 
 |  * | 
 * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
 * which contains all the naughty bits settings for all the cpus that can
 * run a VM.  Realistically, all VMX-capable cpus in a system will have
 * identical configurations.  So: 0 or more cpus can run VMX; all cpus which
 * can run VMX will have the same configuration.
 |  * | 
 * Configure the msr_bitmap. This is the bitmap controlling which MSRs the
 * guest can access without an exit.  Currently that's the FS and GS bases,
 * EFER, the syscall MSRs, and the virtualized x2APIC range; see
 * intel_vmm_init().
 *
 * Reserve bit 0 in the vpid bitmap, as guests can not use that vpid.
 |  * | 
 * Set up what we call the vmxarea. The vmxarea is per-cpu, not
 * per-guest. Once set up, it is left alone.  The ONLY thing we set in
 * there is the revision id. The vmxarea is page-sized per cpu and
 * page-aligned. Note that it can be smaller, but why bother? We know
 * the max size and alignment, and it's convenient.
 |  * | 
 |  * Now that it is set up, enable vmx on all cpus. This involves | 
 |  * testing VMXE in cr4, to see if we've been here before (TODO: delete | 
 |  * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can | 
 * do a VM, then setting VMXE in cr4, calling vmxon (which issues the vmxon
 * instruction), and syncing vpids and epts.  Now the CPU is ready
 |  * to host guests. | 
 |  * | 
 |  * Setting up a guest. | 
 |  * We divide this into two things: vmm_proc_init and vm_run. | 
 |  * Currently, on Intel, vmm_proc_init does nothing. | 
 |  * | 
 * vm_run is really complicated. It is called with a coreid and a
 * vmctl struct. On intel, it calls vmx_launch. vmx_launch is set
 |  * up for a few test cases. If rip is 1, it sets the guest rip to | 
 |  * a function which will deref 0 and should exit with failure 2. If rip is 0, | 
 |  * it calls an infinite loop in the guest. | 
 |  * | 
 |  * The sequence of operations: | 
 |  * create a vcpu | 
 |  * while (1) { | 
 |  * get a vcpu | 
 |  * disable irqs (required or you can't enter the VM) | 
 |  * vmx_run_vcpu() | 
 |  * enable irqs | 
 |  * manage the vm exit | 
 |  * } | 
 |  * | 
 |  * get a vcpu | 
 * See if the current cpu has a vcpu. If so, and it is the same as the vcpu we want,
 |  * vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD. | 
 |  * | 
 |  * If it's not the same, see if the vcpu thinks it is on the core. If it is not, call | 
 |  * __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear the one | 
 * attached to this cpu. Then vmcs_load the vmcs for vcpu on this cpu,
 |  * call __vmx_setup_cpu, mark this vcpu as being attached to this cpu, done. | 
 |  * | 
 * vmx_run_vcpu: this one gets messy, mainly because it's a giant wad
 * of inline assembly with embedded CPP crap. I suspect we'll want to
 * un-inline it someday, but maybe not.  It's called with a vcpu
 * struct from which it loads guest state, and to which it stores
 * non-virtualized host state. It issues a vmlaunch or vmresume
 * instruction as appropriate, and on return, it evaluates whether the
 * launch/resume had an error in that operation. Note this is NOT the
 * same as an error while in the virtual machine; this is an error in
 * startup due to misconfiguration. Depending on what is returned, it's
 * either a failed vm startup or an exit for any of many reasons.
 |  * | 
 |  */ | 
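/* A rough sketch of how the pieces in this file fit together.  The callers
 * live outside this file (vmm.c and the per-core entry path), so the calling
 * sequence below is illustrative, not literal:
 *
 *	intel_vmm_init();                   // once: probe, setup_vmcs_config,
 *	                                    // MSR/IO bitmaps, ept_init
 *	intel_vmm_pcpu_init();              // per cpu: vmxarea + __vmx_enable (vmxon)
 *	gpc = create_guest_pcore(p, gpci);  // per guest core: VMCS + initial state
 *	...
 *	disable_irqsave(&state);
 *	vmx_load_guest_pcore(gpc);          // vmcs_load + __vmx_setup_pcpu
 *	// enter the guest (vmlaunch/vmresume; the entry asm is not in this file)
 *	vmx_unload_guest_pcore(gpc);        // leaves the VMCS 'cached' on this core
 *	enable_irqsave(&state);
 */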
 |  | 
/* Basically: only rename those globals that might conflict
 * with existing names. Leave all else the same.
 * This code is more modern than the other code, yet still
 * well encapsulated, it seems.
 */
 | #include <kmalloc.h> | 
 | #include <string.h> | 
 | #include <stdio.h> | 
 | #include <assert.h> | 
 | #include <error.h> | 
 | #include <pmap.h> | 
 | #include <sys/queue.h> | 
 | #include <smp.h> | 
 | #include <kref.h> | 
 | #include <atomic.h> | 
 | #include <alarm.h> | 
 | #include <event.h> | 
 | #include <umem.h> | 
 | #include <bitops.h> | 
 | #include <arch/types.h> | 
 | #include <syscall.h> | 
 | #include <arch/io.h> | 
 | #include <percpu.h> | 
 |  | 
 | #include <ros/vmm.h> | 
 | #include "vmx.h" | 
 | #include "../vmm.h" | 
 |  | 
 | #include <trap.h> | 
 |  | 
 | #include <smp.h> | 
 | #include <ros/procinfo.h> | 
 |  | 
 | #define currentcpu (&per_cpu_info[core_id()]) | 
 |  | 
 | static unsigned long *msr_bitmap; | 
 | #define VMX_IO_BITMAP_SZ		(1 << 16) /* 64 KB */ | 
 | static unsigned long *io_bitmap; | 
 |  | 
 | int x86_ept_pte_fix_ups = 0; | 
 |  | 
 | struct vmx_capability vmx_capability; | 
 | struct vmcs_config vmcs_config; | 
 |  | 
 | char * const VMX_EXIT_REASON_NAMES[] = { | 
 | 	VMX_EXIT_REASONS | 
 | }; | 
 |  | 
 | static char *cr_access_type[] = { | 
 | 	"move to cr", | 
 | 	"move from cr", | 
 | 	"clts", | 
 | 	"lmsw" | 
 | }; | 
 |  | 
 | static char *cr_gpr[] = { | 
 | 	"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", | 
 | 	"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | 
 | }; | 
 |  | 
 | static int guest_cr_num[16] = { | 
 | 	GUEST_CR0, | 
 | 	-1, | 
 | 	-1, | 
 | 	GUEST_CR3, | 
 | 	GUEST_CR4, | 
 | 	-1, | 
 | 	-1, | 
 | 	-1, | 
 | 	-1,	/* 8? */ | 
 | 	-1, -1, -1, -1, -1, -1, -1 | 
 | }; | 
 |  | 
 | __always_inline unsigned long vmcs_readl(unsigned long field); | 
 | /* See section 24-3 of The Good Book */ | 
 | void | 
 | show_cr_access(uint64_t val) | 
 | { | 
 | 	int crnr = val & 0xf; | 
 | 	int type = (val >> 4) & 3; | 
 | 	int reg = (val >> 11) & 0xf; | 
 |  | 
 | 	print_lock(); | 
 | 	printk("%s: %d: ", cr_access_type[type], crnr); | 
 | 	if (type < 2) { | 
 | 		printk("%s", cr_gpr[reg]); | 
 | 		if (guest_cr_num[crnr] > -1) { | 
			printk(": 0x%lx", vmcs_readl(guest_cr_num[crnr]));
 | 		} | 
 | 	} | 
 | 	printk("\n"); | 
 | 	print_unlock(); | 
 | } | 
 |  | 
 | void | 
 | ept_flush(uint64_t eptp) | 
 | { | 
 | 	ept_sync_context(eptp); | 
 | } | 
 |  | 
 | static void | 
 | vmcs_clear(struct vmcs *vmcs) | 
 | { | 
 | 	uint64_t phys_addr = PADDR(vmcs); | 
 | 	uint8_t error; | 
 |  | 
 | 	asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0":"=qm"(error):"a"(&phys_addr), | 
 | 				  "m"(phys_addr) | 
 | 				  :"cc", "memory"); | 
 | 	if (error) | 
 | 		printk("vmclear fail: %p/%llx\n", vmcs, phys_addr); | 
 | } | 
 |  | 
 | static void | 
 | vmcs_load(struct vmcs *vmcs) | 
 | { | 
 | 	uint64_t phys_addr = PADDR(vmcs); | 
 | 	uint8_t error; | 
 |  | 
 | 	asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0":"=qm"(error):"a"(&phys_addr), | 
 | 				  "m"(phys_addr) | 
 | 				  :"cc", "memory"); | 
 | 	if (error) | 
 | 		printk("vmptrld %p/%llx failed\n", vmcs, phys_addr); | 
 | } | 
 |  | 
 | /* Returns the paddr pointer of the current CPU's VMCS region, or -1 if none. */ | 
 | static physaddr_t | 
 | vmcs_get_current(void) | 
 | { | 
 | 	physaddr_t vmcs_paddr; | 
 | 	/* RAX contains the addr of the location to store the VMCS pointer.  The | 
 | 	 * compiler doesn't know the ASM will deref that pointer, hence the =m */ | 
 | 	asm volatile (ASM_VMX_VMPTRST_RAX:"=m"(vmcs_paddr):"a"(&vmcs_paddr)); | 
 | 	return vmcs_paddr; | 
 | } | 
 |  | 
 | __always_inline unsigned long | 
 | vmcs_readl(unsigned long field) | 
 | { | 
 | 	return vmcs_read(field); | 
 | } | 
 |  | 
 | __always_inline uint16_t | 
 | vmcs_read16(unsigned long field) | 
 | { | 
 | 	return vmcs_readl(field); | 
 | } | 
 |  | 
 | static __always_inline uint32_t | 
 | vmcs_read32(unsigned long field) | 
 | { | 
 | 	return vmcs_readl(field); | 
 | } | 
 |  | 
 | static __always_inline uint64_t | 
 | vmcs_read64(unsigned long field) | 
 | { | 
 | 	return vmcs_readl(field); | 
 | } | 
 |  | 
 | void | 
 | vmwrite_error(unsigned long field, unsigned long value) | 
 | { | 
 | 	printk("vmwrite error: reg %lx value %lx (err %d)\n", | 
 | 		   field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); | 
 | } | 
 |  | 
 | void | 
 | vmcs_writel(unsigned long field, unsigned long value) | 
 | { | 
 | 	if (!vmcs_write(field, value)) | 
 | 		vmwrite_error(field, value); | 
 | } | 
 |  | 
 | static void | 
 | vmcs_write16(unsigned long field, uint16_t value) | 
 | { | 
 | 	vmcs_writel(field, value); | 
 | } | 
 |  | 
 | static void | 
 | vmcs_write32(unsigned long field, uint32_t value) | 
 | { | 
 | 	vmcs_writel(field, value); | 
 | } | 
 |  | 
 | static void | 
 | vmcs_write64(unsigned long field, uint64_t value) | 
 | { | 
 | 	vmcs_writel(field, value); | 
 | } | 
 |  | 
 | void vapic_status_dump_kernel(void *vapic); | 
 |  | 
 | static bool vmx_control_can_be_changed(struct vmxec *v, uint32_t ctl) | 
 | { | 
 | 	return v->hw_changeable & v->policy_changeable & ctl; | 
 | } | 
 |  | 
 | /* | 
 |  * A note on Things You Can't Make Up. | 
 |  * or | 
 |  * "George, you can type this shit, but you can't say it" -- Harrison Ford | 
 |  * | 
 |  * There are 5 VMCS 32-bit words that control guest permissions. If | 
 |  * you set these correctly, you've got a guest that will behave. If | 
 |  * you get even one bit wrong, you've got a guest that will chew your | 
 |  * leg off. Some bits must be 1, some must be 0, and some can be set | 
 |  * either way. To add to the fun, the docs are sort of a docudrama or, | 
 |  * as the quote goes, "interesting if true." | 
 |  * | 
 |  * To determine what bit can be set in what VMCS 32-bit control word, | 
 |  * there are 5 corresponding 64-bit MSRs.  And, to make it even more | 
 |  * fun, the standard set of MSRs have errors in them, i.e. report | 
 |  * incorrect values, for legacy reasons, and so you are supposed to | 
 |  * "look around" to another set, which have correct bits in | 
 |  * them. There are four such 'correct' registers, and they have _TRUE_ | 
 |  * in the names as you can see below. We test for the value of VMCS | 
 |  * control bits in the _TRUE_ registers if possible. The fifth | 
 |  * register, CPU Secondary Exec Controls, which came later, needs no | 
 |  * _TRUE_ variant. | 
 |  * | 
 |  * For each MSR, the high 32 bits tell you what bits can be "1" by a | 
 |  * "1" in that position; the low 32 bits tell you what bit can be "0" | 
 |  * by a "0" in that position. So, for each of 32 bits in a given VMCS | 
 |  * control word, there is a pair of bits in an MSR that tells you what | 
 |  * values it can take. The two bits, of which there are *four* | 
 |  * combinations, describe the *three* possible operations on a | 
 |  * bit. The two bits, taken together, form an untruth table: There are | 
 |  * three possibilities: The VMCS bit can be set to 0 or 1, or it can | 
 |  * only be 0, or only 1. The fourth combination is not supposed to | 
 |  * happen. | 
 |  * | 
 |  * So: there is the 1 bit from the upper 32 bits of the msr. | 
 |  * If this bit is set, then the bit can be 1. If clear, it can not be 1. | 
 |  * | 
 |  * Then there is the 0 bit, from low 32 bits. If clear, the VMCS bit | 
 |  * can be 0. If 1, the VMCS bit can not be 0. | 
 |  * | 
 |  * SO, let's call the 1 bit R1, and the 0 bit R0, we have: | 
 |  *  R1 R0 | 
 |  *  0 0 -> must be 0 | 
 |  *  1 0 -> can be 1, can be 0 | 
 |  *  0 1 -> can not be 1, can not be 0. --> JACKPOT! Not seen yet. | 
 |  *  1 1 -> must be one. | 
 |  * | 
 |  * It's also pretty hard to know what you can and can't set, and | 
 |  * that's led to inadvertent opening of permissions at times.  Because | 
 |  * of this complexity we've decided on the following: the driver must | 
 |  * define EVERY bit, UNIQUELY, for each of the 5 registers, that it wants | 
 |  * set. Further, for any bit that's settable, the driver must specify | 
 |  * a setting; for any bit that's reserved, the driver settings must | 
 |  * match that bit. If there are reserved bits we don't specify, that's | 
 |  * ok; we'll take them as is. | 
 |  * | 
 * We use a set-means-set and a set-means-clear model, i.e. we use one
 * 32-bit word in which a bit we want to be 1 is indicated by a 1,
 * and another 32-bit word in which a bit we want to be 0 is indicated
 * by a 1. This allows us to easily create masks of all bits we're
 * going to set, for example.
 |  * | 
 |  * We have two 32-bit numbers for each 32-bit VMCS field: bits we want | 
 |  * set and bits we want clear.  If you read the MSR for that field, | 
 |  * compute the reserved 0 and 1 settings, and | them together, they | 
 |  * need to result in 0xffffffff. You can see that we can create other | 
 |  * tests for conflicts (i.e. overlap). | 
 |  * | 
 * At this point, I've tested check_vmxec_controls in every way
 * possible, because I kept screwing the bitfields up. You'll get a nice
 * error and it won't work at all, which is what we want: a
 * fail-fast setup, where even errors that might result in correct
 * values are caught -- "right answer, wrong method, zero credit." If there's
 * weirdness in the bits, we don't want to run.
 * The try_set stuff adds particular ugliness, but we have to have it.
 |  */ | 
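/* A worked example of the encoding above, with made-up (illustrative) MSR
 * values: suppose the register reads back high = 0xfff9fffe, low = 0x00000016.
 *
 *	reserved_1      = low & high                 = 0x00000016;  // must be 1
 *	reserved_0      = ~low & ~high               = 0x00060001;  // must be 0
 *	changeable_bits = ~(reserved_0 | reserved_1) = 0xfff9ffe8;
 *	// low & ~high == 0, so no JACKPOT bits in this example.
 *
 * check_vmxec_controls() below does this computation and then checks the
 * driver's must_be_* / try_set_* words against it. */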
 | static bool check_vmxec_controls(struct vmxec *v, bool have_true_msr, | 
 |                                  uint32_t *result) | 
 | { | 
 | 	bool err = false; | 
 | 	uint32_t vmx_msr_low, vmx_msr_high; | 
 | 	uint64_t msr_val; | 
	uint32_t reserved_0, reserved_1, changeable_bits, try1;
 |  | 
 | 	if (have_true_msr) | 
 | 		msr_val = read_msr(v->truemsr); | 
 | 	else | 
 | 		msr_val = read_msr(v->msr); | 
 | 	vmx_msr_high = high32(msr_val); | 
 | 	vmx_msr_low = low32(msr_val); | 
 |  | 
 | 	if (vmx_msr_low & ~vmx_msr_high) | 
 | 		warn("JACKPOT: Conflicting VMX ec ctls for %s, high 0x%08x low 0x%08x", | 
 | 			 v->name, vmx_msr_high, vmx_msr_low); | 
 |  | 
 | 	reserved_0 = (~vmx_msr_low) & (~vmx_msr_high); | 
 | 	reserved_1 = vmx_msr_low & vmx_msr_high; | 
 | 	changeable_bits = ~(reserved_0 | reserved_1); | 
 | 	v->hw_changeable = changeable_bits; | 
 |  | 
 | 	/* | 
 | 	 * this is very much as follows: | 
 | 	 * accept the things I cannot change, | 
 | 	 * change the things I can, | 
 | 	 * know the difference. | 
 | 	 */ | 
 |  | 
 | 	/* Conflict. Don't try to both set and reset bits. */ | 
 | 	if ((v->must_be_1 & (v->must_be_0 | v->try_set_1 | v->try_set_0)) || | 
 | 	    (v->must_be_0 & (v->try_set_1 | v->try_set_0)) || | 
 | 	    (v->try_set_1 & v->try_set_0)) { | 
 | 		printk("%s: must 0 (0x%x) and must be 1 (0x%x) and try_set_0 (0x%x) and try_set_1 (0x%x) overlap\n", | 
 | 		       v->name, v->must_be_0, v->must_be_1, v->try_set_0, v->try_set_1); | 
 | 		err = true; | 
 | 	} | 
 |  | 
 | 	/* coverage */ | 
 | 	if (((v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1) & changeable_bits) != changeable_bits) { | 
		printk("%s: Need to cover 0x%x and have 0x%x,0x%x,0x%x,0x%x\n",
		       v->name, changeable_bits, v->must_be_0, v->must_be_1, v->try_set_0, v->try_set_1);
 | 		err = true; | 
 | 	} | 
 |  | 
 | 	if ((v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1 | reserved_0 | reserved_1) != 0xffffffff) { | 
 | 		printk("%s: incomplete coverage: have 0x%x, want 0x%x\n", | 
 | 		       v->name, v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1 | | 
 | 		       reserved_0 | reserved_1, 0xffffffff); | 
 | 		err = true; | 
 | 	} | 
 |  | 
 | 	/* Don't try to change bits that can't be changed. */ | 
 | 	if ((v->must_be_0 & (reserved_0 | changeable_bits)) != v->must_be_0) { | 
 | 		printk("%s: set to 0 (0x%x) can't be done\n", v->name, v->must_be_0); | 
 | 		err = true; | 
 | 	} | 
 |  | 
 | 	if ((v->must_be_1 & (reserved_1 | changeable_bits)) != v->must_be_1) { | 
 | 		printk("%s: set to 1 (0x%x) can't be done\n", v->name, v->must_be_1); | 
 | 		err = true; | 
 | 	} | 
	// Note we don't REQUIRE that try_set_0 or try_set_1 be possible. We just want to try it.

	// Clear bits in try_set_1 that can't be set.
	try1 = v->try_set_1 & (reserved_1 | changeable_bits);
 |  | 
 | 	/* If there's been any error at all, spill our guts and return. */ | 
 | 	if (err) { | 
 | 		printk("%s: vmx_msr_high 0x%x, vmx_msr_low 0x%x, ", | 
 | 			   v->name, vmx_msr_high, vmx_msr_low); | 
		printk("must_be_0 0x%x, try_set_0 0x%x, reserved_0 0x%x, ",
			   v->must_be_0, v->try_set_0, reserved_0);
		printk("must_be_1 0x%x, try_set_1 0x%x, reserved_1 0x%x, ",
			   v->must_be_1, v->try_set_1, reserved_1);
		printk("changeable_bits 0x%x\n", changeable_bits);
 | 		return false; | 
 | 	} | 
 |  | 
 | 	*result = v->must_be_1 | try1 | reserved_1; | 
 |  | 
 | 	printk("%s: check_vmxec_controls succeeds with result 0x%x\n", | 
 | 		   v->name, *result); | 
 | 	return true; | 
 | } | 
 |  | 
 | /* | 
 |  * We're trying to make this as readable as possible. Realistically, it will | 
 |  * rarely if ever change, if the past is any guide. | 
 |  */ | 
 | static struct vmxec pbec = { | 
 | 	.name = "Pin Based Execution Controls", | 
 | 	.msr = MSR_IA32_VMX_PINBASED_CTLS, | 
 | 	.truemsr = MSR_IA32_VMX_TRUE_PINBASED_CTLS, | 
 |  | 
 | 	.must_be_1 = (PIN_BASED_EXT_INTR_MASK | | 
 | 		     PIN_BASED_NMI_EXITING | | 
 | 		     PIN_BASED_VIRTUAL_NMIS | | 
 | 		     PIN_BASED_POSTED_INTR), | 
 |  | 
 | 	.must_be_0 = (PIN_BASED_VMX_PREEMPTION_TIMER), | 
 | }; | 
 |  | 
 | static struct vmxec cbec = { | 
 | 	.name = "CPU Based Execution Controls", | 
 | 	.msr = MSR_IA32_VMX_PROCBASED_CTLS, | 
 | 	.truemsr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS, | 
 |  | 
 | 	.must_be_1 = ( | 
 | 	         CPU_BASED_MWAIT_EXITING | | 
 | 	         CPU_BASED_HLT_EXITING | | 
 | 		     CPU_BASED_TPR_SHADOW | | 
 | 		     CPU_BASED_RDPMC_EXITING | | 
 | 		     CPU_BASED_CR8_LOAD_EXITING | | 
 | 		     CPU_BASED_CR8_STORE_EXITING | | 
 | 		     CPU_BASED_USE_MSR_BITMAPS | | 
 | 		     CPU_BASED_USE_IO_BITMAPS | | 
 | 		     CPU_BASED_ACTIVATE_SECONDARY_CONTROLS), | 
 |  | 
 | 	.must_be_0 = ( | 
 | 	         CPU_BASED_VIRTUAL_INTR_PENDING | | 
 | 		     CPU_BASED_INVLPG_EXITING | | 
 | 		     CPU_BASED_USE_TSC_OFFSETING | | 
 | 		     CPU_BASED_RDTSC_EXITING | | 
 | 		     CPU_BASED_CR3_LOAD_EXITING | | 
 | 		     CPU_BASED_CR3_STORE_EXITING | | 
 | 		     CPU_BASED_MOV_DR_EXITING | | 
 | 		     CPU_BASED_VIRTUAL_NMI_PENDING | | 
 | 		     CPU_BASED_MONITOR_TRAP | | 
 | 		     CPU_BASED_PAUSE_EXITING | | 
 | 		     CPU_BASED_UNCOND_IO_EXITING), | 
 |  | 
 | 	.try_set_0 = (CPU_BASED_MONITOR_EXITING), | 
 | 	.policy_changeable = ( | 
 | 	         CPU_BASED_HLT_EXITING | | 
 | 	         CPU_BASED_PAUSE_EXITING | | 
 | 	         CPU_BASED_MWAIT_EXITING | | 
 | 	         0), | 
 | }; | 
 |  | 
 | static struct vmxec cb2ec = { | 
 | 	.name = "CPU Based 2nd Execution Controls", | 
 | 	.msr = MSR_IA32_VMX_PROCBASED_CTLS2, | 
 | 	.truemsr = MSR_IA32_VMX_PROCBASED_CTLS2, | 
 |  | 
 | 	.must_be_1 = (SECONDARY_EXEC_ENABLE_EPT | | 
 | 		     SECONDARY_EXEC_APIC_REGISTER_VIRT | | 
 | 		     SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | 
 | 		     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | 
 | 		     SECONDARY_EXEC_ENABLE_INVPCID | | 
 | 		     SECONDARY_EXEC_WBINVD_EXITING), | 
 |  | 
 | 	.must_be_0 = ( | 
 | 		     SECONDARY_EXEC_DESCRIPTOR_EXITING | | 
 | 		     SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | 
 | 		     SECONDARY_EXEC_ENABLE_VPID | | 
 | 		     SECONDARY_EXEC_UNRESTRICTED_GUEST | | 
 | 		     SECONDARY_EXEC_PAUSE_LOOP_EXITING | | 
 | 		     SECONDARY_EXEC_RDRAND_EXITING | | 
 | 		     SECONDARY_EXEC_ENABLE_VMFUNC | | 
 | 		     SECONDARY_EXEC_SHADOW_VMCS | | 
 | 		     SECONDARY_EXEC_RDSEED_EXITING | | 
 | 		     SECONDARY_EPT_VE | | 
 | 		     SECONDARY_ENABLE_XSAV_RESTORE), | 
 |  | 
 | 	.try_set_1 = SECONDARY_EXEC_RDTSCP, | 
 |  | 
 | 	.try_set_0 = SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_PML | 
 |  | 
 | }; | 
 |  | 
 | static struct vmxec vmentry = { | 
 | 	.name = "VMENTRY controls", | 
 | 	.msr = MSR_IA32_VMX_ENTRY_CTLS, | 
 | 	.truemsr = MSR_IA32_VMX_TRUE_ENTRY_CTLS, | 
	/* exact order from vmx.h; only these three are enabled. */
 |  | 
 | 	.must_be_1 =  (VM_ENTRY_LOAD_DEBUG_CONTROLS | /* can't set to 0 */ | 
 | 		      VM_ENTRY_LOAD_IA32_EFER | | 
 | 		      VM_ENTRY_IA32E_MODE), | 
 |  | 
 | 	.must_be_0 = (VM_ENTRY_SMM | | 
 | 		     VM_ENTRY_DEACT_DUAL_MONITOR | | 
 | 		     VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | | 
 | 		     VM_ENTRY_LOAD_IA32_PAT), | 
 | }; | 
 |  | 
 | static struct vmxec vmexit = { | 
 | 	.name = "VMEXIT controls", | 
 | 	.msr = MSR_IA32_VMX_EXIT_CTLS, | 
 | 	.truemsr = MSR_IA32_VMX_TRUE_EXIT_CTLS, | 
 |  | 
 | 	.must_be_1 = (VM_EXIT_SAVE_DEBUG_CONTROLS |	/* can't set to 0 */ | 
 | 				 VM_EXIT_ACK_INTR_ON_EXIT | | 
 | 				 VM_EXIT_SAVE_IA32_EFER | | 
 | 				VM_EXIT_LOAD_IA32_EFER | | 
 | 				VM_EXIT_HOST_ADDR_SPACE_SIZE),	/* 64 bit */ | 
 |  | 
 | 	.must_be_0 = (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | | 
 | 				 VM_EXIT_SAVE_IA32_PAT | | 
 | 				 VM_EXIT_LOAD_IA32_PAT | | 
 | 				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER), | 
 | }; | 
 |  | 
 | static void | 
 | setup_vmcs_config(void *p) | 
 | { | 
 | 	int *ret = p; | 
 | 	struct vmcs_config *vmcs_conf = &vmcs_config; | 
 | 	uint32_t vmx_msr_high; | 
 | 	uint64_t vmx_msr; | 
 | 	bool have_true_msrs = false; | 
 | 	bool ok; | 
 |  | 
 | 	*ret = -EIO; | 
 |  | 
 | 	vmx_msr = read_msr(MSR_IA32_VMX_BASIC); | 
 | 	vmx_msr_high = vmx_msr >> 32; | 
 |  | 
 | 	/* | 
	 * If bit 55 (VMX_BASIC_TRUE_CTLS) is set, then we
 | 	 * can go for the true MSRs.  Else, we ask you to get a better CPU. | 
 | 	 */ | 
 | 	if (vmx_msr & VMX_BASIC_TRUE_CTLS) { | 
 | 		have_true_msrs = true; | 
 | 		printd("Running with TRUE MSRs\n"); | 
 | 	} else { | 
 | 		printk("Running with non-TRUE MSRs, this is old hardware\n"); | 
 | 	} | 
 |  | 
 | 	/* | 
 | 	 * Don't worry that one or more of these might fail and leave | 
 | 	 * the VMCS in some kind of incomplete state. If one of these | 
 | 	 * fails, the caller is going to discard the VMCS. | 
	 * It is written this way to ensure we get the results of all the tests in
	 * one pass, rather than discovering failures one at a time (BMAFR behavior).
 | 	 */ | 
 | 	ok = check_vmxec_controls(&pbec, have_true_msrs, | 
 | 	                          &vmcs_conf->pin_based_exec_ctrl); | 
 | 	ok = check_vmxec_controls(&cbec, have_true_msrs, | 
 | 	                          &vmcs_conf->cpu_based_exec_ctrl) && ok; | 
 | 	/* Only check cb2ec if we're still ok, o/w we may GPF */ | 
 | 	ok = ok && check_vmxec_controls(&cb2ec, have_true_msrs, | 
 | 	                                &vmcs_conf->cpu_based_2nd_exec_ctrl); | 
 | 	ok = check_vmxec_controls(&vmentry, have_true_msrs, | 
 | 	                          &vmcs_conf->vmentry_ctrl) && ok; | 
 | 	ok = check_vmxec_controls(&vmexit, have_true_msrs, | 
 | 	                          &vmcs_conf->vmexit_ctrl) && ok; | 
	if (!ok) {
		printk("vmxec controls are no good.\n");
 | 		return; | 
 | 	} | 
 | 	assert(cpu_has_secondary_exec_ctrls()); | 
 |  | 
 | 	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ | 
 | 	if ((vmx_msr_high & 0x1fff) > PGSIZE) { | 
		printk("(vmx_msr_high & 0x1fff) is 0x%x, > PGSIZE 0x%x\n",
 | 			   vmx_msr_high & 0x1fff, PGSIZE); | 
 | 		return; | 
 | 	} | 
 |  | 
 | 	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ | 
 | 	if (vmx_msr & VMX_BASIC_64) { | 
 | 		printk("VMX doesn't support 64 bit width!\n"); | 
 | 		return; | 
 | 	} | 
 |  | 
 | 	if (((vmx_msr & VMX_BASIC_MEM_TYPE_MASK) >> VMX_BASIC_MEM_TYPE_SHIFT) | 
 | 		!= VMX_BASIC_MEM_TYPE_WB) { | 
 | 		printk("VMX doesn't support WB memory for VMCS accesses!\n"); | 
 | 		return; | 
 | 	} | 
 |  | 
 | 	vmcs_conf->size = vmx_msr_high & 0x1fff; | 
 | 	vmcs_conf->revision_id = (uint32_t) vmx_msr; | 
 |  | 
 | 	/* Read in the caps for runtime checks.  This MSR is only available if | 
 | 	 * secondary controls and ept or vpid is on, which we check earlier */ | 
 | 	vmx_msr = read_msr(MSR_IA32_VMX_EPT_VPID_CAP); | 
 | 	vmx_capability.vpid = high32(vmx_msr); | 
 | 	vmx_capability.ept = low32(vmx_msr); | 
 |  | 
 | 	*ret = 0; | 
 | } | 
 |  | 
 | static struct vmcs *__vmx_alloc_vmcs(int node) | 
 | { | 
 | 	struct vmcs *vmcs; | 
 |  | 
 | 	vmcs = kpages_alloc(vmcs_config.size, MEM_WAIT); | 
 | 	if (!vmcs) | 
 | 		error(ENOMEM, "__vmx_alloc_vmcs: Could not get %d contig bytes", | 
 | 		      vmcs_config.size); | 
 | 	memset(vmcs, 0, vmcs_config.size); | 
 | 	vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ | 
 | 	printd("%d: set rev id %d\n", core_id(), vmcs->revision_id); | 
 | 	return vmcs; | 
 | } | 
 |  | 
 | /** | 
 |  * vmx_alloc_vmcs - allocates a VMCS region | 
 |  * | 
 |  * NOTE: Assumes the new region will be used by the current CPU. | 
 |  * | 
 |  * Returns a valid VMCS region. | 
 |  */ | 
 | static struct vmcs * | 
 | vmx_alloc_vmcs(void) | 
 | { | 
 | 	return __vmx_alloc_vmcs(numa_id()); | 
 | } | 
 |  | 
 | /** | 
 |  * vmx_free_vmcs - frees a VMCS region | 
 |  */ | 
 | static void | 
 | vmx_free_vmcs(struct vmcs *vmcs) | 
 | { | 
 | 	kpages_free(vmcs, vmcs_config.size); | 
 | } | 
 |  | 
 | /* | 
 |  * Set up the vmcs's constant host-state fields, i.e., host-state fields that | 
 |  * will not change in the lifetime of the guest. | 
 |  * Note that host-state that does change is set elsewhere. E.g., host-state | 
 |  * that is set differently for each CPU is set in __vmx_setup_pcpu(), not here. | 
 |  */ | 
 | static void vmx_setup_constant_host_state(void) | 
 | { | 
 | 	pseudodesc_t dt; | 
 |  | 
 | 	vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);	/* 22.2.3 */ | 
 | 	vmcs_writel(HOST_CR4, rcr4());	/* 22.2.3, 22.2.5 */ | 
 | 	vmcs_writel(HOST_CR3, rcr3());	/* 22.2.3 */ | 
 |  | 
 | 	vmcs_write16(HOST_CS_SELECTOR, GD_KT);	/* 22.2.4 */ | 
 | 	vmcs_write16(HOST_DS_SELECTOR, GD_KD);	/* 22.2.4 */ | 
 | 	vmcs_write16(HOST_ES_SELECTOR, GD_KD);	/* 22.2.4 */ | 
 | 	vmcs_write16(HOST_SS_SELECTOR, GD_KD);	/* 22.2.4 */ | 
 | 	vmcs_write16(HOST_TR_SELECTOR, GD_TSS);	/* 22.2.4 */ | 
 |  | 
 | 	native_store_idt(&dt); | 
 | 	vmcs_writel(HOST_IDTR_BASE, dt.pd_base);	/* 22.2.4 */ | 
 |  | 
 | 	extern void vmexit_handler(void); | 
 | 	vmcs_writel(HOST_RIP, (unsigned long)vmexit_handler); | 
 |  | 
 | 	vmcs_write32(HOST_IA32_SYSENTER_CS, read_msr(MSR_IA32_SYSENTER_CS)); | 
 | 	vmcs_writel(HOST_IA32_SYSENTER_EIP, read_msr(MSR_IA32_SYSENTER_EIP)); | 
 |  | 
 | 	vmcs_write32(HOST_IA32_EFER, read_msr(MSR_EFER)); | 
 |  | 
 | 	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) | 
 | 		vmcs_write64(HOST_IA32_PAT, read_msr(MSR_IA32_CR_PAT)); | 
 |  | 
 | 	vmcs_write16(HOST_FS_SELECTOR, 0);	/* 22.2.4 */ | 
 | 	vmcs_write16(HOST_GS_SELECTOR, 0);	/* 22.2.4 */ | 
 | 	vmcs_write(HOST_FS_BASE, 0); | 
 | } | 
 |  | 
 | /* Set up the per-core VMCS fields.  This is the host state that varies from | 
 |  * core to core, which the hardware will switch for us on VM enters/exits. */ | 
 | static void __vmx_setup_pcpu(struct guest_pcore *gpc) | 
 | { | 
 | 	struct per_cpu_info *pcpui = &per_cpu_info[core_id()]; | 
 | 	struct vmx_vmm *vmx = &gpc->proc->vmm.vmx; | 
 |  | 
 | 	vmcs_write(HOST_TR_BASE, (uintptr_t)pcpui->tss); | 
 | 	vmcs_writel(HOST_GDTR_BASE, (uintptr_t)pcpui->gdt); | 
 | 	vmcs_write(HOST_GS_BASE, (uintptr_t)pcpui); | 
 | 	/* TODO: we might need to also set HOST_IA32_PERF_GLOBAL_CTRL.  Need to | 
 | 	 * think about how perf will work with VMs */ | 
 | 	/* Userspace can request changes to the ctls.  They take effect when we | 
 | 	 * reload the GPC, which occurs after a transition from userspace to VM. */ | 
 | 	vmcs_write(PIN_BASED_VM_EXEC_CONTROL, vmx->pin_exec_ctls); | 
 | 	vmcs_write(CPU_BASED_VM_EXEC_CONTROL, vmx->cpu_exec_ctls); | 
 | 	vmcs_write(SECONDARY_VM_EXEC_CONTROL, vmx->cpu2_exec_ctls); | 
 | } | 
 |  | 
 | uint64_t | 
 | construct_eptp(physaddr_t root_hpa) | 
 | { | 
 | 	uint64_t eptp; | 
 |  | 
 | 	/* set WB memory and 4 levels of walk.  we checked these in ept_init */ | 
 | 	eptp = VMX_EPT_MEM_TYPE_WB | (VMX_EPT_GAW_4_LVL << VMX_EPT_GAW_EPTP_SHIFT); | 
 | 	if (cpu_has_vmx_ept_ad_bits()) | 
 | 		eptp |= VMX_EPT_AD_ENABLE_BIT; | 
 | 	eptp |= (root_hpa & PAGE_MASK); | 
 |  | 
 | 	return eptp; | 
 | } | 
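/* For reference, the EPTP layout used above (see the SDM's EPTP format): bits
 * 2:0 are the memory type (6 == WB), bits 5:3 are the page-walk length minus 1
 * (3 for a 4-level walk), bit 6 enables accessed/dirty flags, and the upper
 * bits hold the root page's physical address.  An illustrative value, for a
 * made-up root_hpa of 0x7f8d2000 on a CPU with EPT A/D support:
 *
 *	eptp = 6 | (3 << 3) | (1 << 6) | 0x7f8d2000 = 0x7f8d205e
 */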
 |  | 
 | /* Helper: some fields of the VMCS need a physical page address, e.g. the VAPIC | 
 |  * page.  We have the user address.  This converts the user to phys addr and | 
 |  * sets that up in the VMCS.  Throws on error. */ | 
 | static void vmcs_set_pgaddr(struct proc *p, void *u_addr, | 
 |                             unsigned long field, char *what) | 
 | { | 
 | 	uintptr_t kva; | 
 | 	physaddr_t paddr; | 
 |  | 
 | 	/* Enforce page alignment */ | 
 | 	kva = uva2kva(p, ROUNDDOWN(u_addr, PGSIZE), PGSIZE, PROT_WRITE); | 
 | 	if (!kva) | 
 | 		error(EINVAL, "Unmapped pgaddr %p for VMCS page %s", u_addr, what); | 
 |  | 
 | 	paddr = PADDR(kva); | 
 | 	/* TODO: need to pin the page.  A munmap would actually be okay | 
 | 	 * (though probably we should kill the process), but we need to | 
 | 	 * keep the page from being reused.  A refcnt would do the trick, | 
 | 	 * which we decref when we destroy the guest core/vcpu. Note that | 
 | 	 * this is an assert, not an error, because it represents an error | 
 | 	 * in the kernel itself. */ | 
 | 	assert(!PGOFF(paddr)); | 
 | 	vmcs_writel(field, paddr); | 
	/* These 64-bit VMCS fields are written twice: 'field' gets the full paddr,
	 * and 'field + 1' (the _HIGH encoding) gets the upper 32 bits of the paddr. */
 | 	vmcs_writel(field + 1, paddr >> 32); | 
 | } | 
 |  | 
 | /** | 
 |  * vmx_setup_initial_guest_state - configures the initial state of guest | 
 |  * registers and the VMCS.  Throws on error. | 
 |  */ | 
 | static void vmx_setup_initial_guest_state(struct proc *p, | 
 |                                           struct vmm_gpcore_init *gpci) | 
 | { | 
 | 	unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT | | 
 | 		X86_CR4_PGE | X86_CR4_OSFXSR; | 
 | 	uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE; | 
 |  | 
 | 	/* | 
 | 	 * Allow guest to use xsave and read/write fs/gs base. | 
 | 	 * We require these features to be present on the cpu. | 
 | 	 */ | 
 | 	assert(cpu_has_feat(CPU_FEAT_X86_XSAVE)); | 
 | 	assert(cpu_has_feat(CPU_FEAT_X86_FSGSBASE)); | 
 | 	cr4 |= X86_CR4_RDWRGSFS; | 
 | 	cr4 |= X86_CR4_OSXSAVE; | 
 | 	/* configure control and data registers */ | 
 | 	vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP | | 
 | 				X86_CR0_MP | X86_CR0_ET | X86_CR0_NE); | 
 | 	vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP | | 
 | 				X86_CR0_MP | X86_CR0_ET | X86_CR0_NE); | 
 | 	vmcs_writel(GUEST_CR3, rcr3()); | 
 | 	vmcs_writel(GUEST_CR4, cr4); | 
 | 	/* The only bits that matter in this shadow are those that are | 
 | 	 * set in CR4_GUEST_HOST_MASK.  TODO: do we need to separate | 
 | 	 * the setting of this value from that of | 
 | 	 * CR4_GUEST_HOST_MASK? */ | 
 | 	vmcs_writel(CR4_READ_SHADOW, 0); | 
 | 	vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA | | 
 | 				EFER_SCE | EFER_NX /*| EFER_FFXSR */ ); | 
 | 	vmcs_writel(GUEST_GDTR_BASE, 0); | 
 | 	vmcs_writel(GUEST_GDTR_LIMIT, 0); | 
 | 	vmcs_writel(GUEST_IDTR_BASE, 0); | 
 | 	vmcs_writel(GUEST_IDTR_LIMIT, 0); | 
 | 	vmcs_writel(GUEST_RIP, 0xdeadbeef); | 
 | 	vmcs_writel(GUEST_RSP, 0xdeadbeef); | 
 | 	vmcs_writel(GUEST_RFLAGS, FL_RSVD_1); | 
 | 	vmcs_writel(GUEST_DR7, 0); | 
 |  | 
 | 	/* guest segment bases */ | 
 | 	vmcs_writel(GUEST_CS_BASE, 0); | 
 | 	vmcs_writel(GUEST_DS_BASE, 0); | 
 | 	vmcs_writel(GUEST_ES_BASE, 0); | 
 | 	enforce_user_canon(&gpci->fsbase); | 
 | 	vmcs_writel(GUEST_FS_BASE, gpci->fsbase); | 
 | 	enforce_user_canon(&gpci->gsbase); | 
 | 	vmcs_writel(GUEST_GS_BASE, gpci->gsbase); | 
 | 	vmcs_writel(GUEST_SS_BASE, 0); | 
 |  | 
 | 	/* guest segment access rights */ | 
 | 	vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B); | 
 | 	vmcs_writel(GUEST_DS_AR_BYTES, 0xA093); | 
 | 	vmcs_writel(GUEST_ES_AR_BYTES, 0xA093); | 
 | 	vmcs_writel(GUEST_FS_AR_BYTES, 0xA093); | 
 | 	vmcs_writel(GUEST_GS_AR_BYTES, 0xA093); | 
 | 	vmcs_writel(GUEST_SS_AR_BYTES, 0xA093); | 
 |  | 
 | 	/* guest segment limits */ | 
 | 	vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF); | 
 | 	vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF); | 
 | 	vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF); | 
 | 	vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF); | 
 | 	vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF); | 
 | 	vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF); | 
 |  | 
 | 	/* configure segment selectors */ | 
 | 	vmcs_write16(GUEST_CS_SELECTOR, 0); | 
 | 	vmcs_write16(GUEST_DS_SELECTOR, 0); | 
 | 	vmcs_write16(GUEST_ES_SELECTOR, 0); | 
 | 	vmcs_write16(GUEST_FS_SELECTOR, 0); | 
 | 	vmcs_write16(GUEST_GS_SELECTOR, 0); | 
 | 	vmcs_write16(GUEST_SS_SELECTOR, 0); | 
 | 	vmcs_write16(GUEST_TR_SELECTOR, 0); | 
 |  | 
 | 	/* guest LDTR */ | 
 | 	vmcs_write16(GUEST_LDTR_SELECTOR, 0); | 
 | 	vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082); | 
 | 	vmcs_writel(GUEST_LDTR_BASE, 0); | 
 | 	vmcs_writel(GUEST_LDTR_LIMIT, 0); | 
 |  | 
 | 	/* guest TSS */ | 
 | 	vmcs_writel(GUEST_TR_BASE, 0); | 
 | 	vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS); | 
 | 	vmcs_writel(GUEST_TR_LIMIT, 0xff); | 
 |  | 
 | 	/* initialize sysenter */ | 
 | 	vmcs_write32(GUEST_SYSENTER_CS, 0); | 
 | 	vmcs_writel(GUEST_SYSENTER_ESP, 0); | 
 | 	vmcs_writel(GUEST_SYSENTER_EIP, 0); | 
 |  | 
 | 	/* other random initialization */ | 
 | 	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | 
 | 	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | 
 | 	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | 
 | 	vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | 
 | 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);	/* 22.2.1 */ | 
 |  | 
 | 	/* Initialize posted interrupt notification vector */ | 
 | 	vmcs_write16(POSTED_NOTIFICATION_VEC, I_POKE_GUEST); | 
 |  | 
 | 	/* Clear the EOI exit bitmap */ | 
 | 	vmcs_writel(EOI_EXIT_BITMAP0, 0); | 
 | 	vmcs_writel(EOI_EXIT_BITMAP0_HIGH, 0); | 
 | 	vmcs_writel(EOI_EXIT_BITMAP1, 0); | 
 | 	vmcs_writel(EOI_EXIT_BITMAP1_HIGH, 0); | 
 | 	vmcs_writel(EOI_EXIT_BITMAP2, 0); | 
 | 	vmcs_writel(EOI_EXIT_BITMAP2_HIGH, 0); | 
 | 	vmcs_writel(EOI_EXIT_BITMAP3, 0); | 
 | 	vmcs_writel(EOI_EXIT_BITMAP3_HIGH, 0); | 
 |  | 
 | 	/* Initialize parts based on the users info. */ | 
 | 	vmcs_set_pgaddr(p, gpci->posted_irq_desc, POSTED_INTR_DESC_ADDR, | 
 | 	                "posted_irq_desc"); | 
 | 	vmcs_set_pgaddr(p, gpci->vapic_addr, VIRTUAL_APIC_PAGE_ADDR, | 
 | 	                "vapic_addr"); | 
 | 	vmcs_set_pgaddr(p, gpci->apic_addr, APIC_ACCESS_ADDR, "apic_addr"); | 
 | } | 
 |  | 
 | static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, | 
 | 					    uint32_t msr) { | 
 | 	int f = sizeof(unsigned long); | 
 | 	/* | 
 | 	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | 
 | 	 * have the write-low and read-high bitmap offsets the wrong way round. | 
 | 	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | 
 | 	 */ | 
 | 	if (msr <= 0x1fff) { | 
 | 		__clear_bit(msr, msr_bitmap + 0x000 / f);	/* read-low */ | 
 | 		__clear_bit(msr, msr_bitmap + 0x800 / f);	/* write-low */ | 
 | 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | 
 | 		msr &= 0x1fff; | 
 | 		__clear_bit(msr, msr_bitmap + 0x400 / f);	/* read-high */ | 
 | 		__clear_bit(msr, msr_bitmap + 0xc00 / f);	/* write-high */ | 
 | 	} | 
 | } | 
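/* An illustrative example of the bitmap layout above: MSR_LSTAR is 0xc0000082,
 * which falls in the 'high' range.  After msr &= 0x1fff it is bit 0x82 (130),
 * i.e. byte 0x400 + 130/8 = 0x410, bit 2 for reads, and byte 0xc00 + 16 =
 * 0xc10, bit 2 for writes.  __vmx_disable_intercept_for_msr(msr_bitmap,
 * MSR_LSTAR) clears both of those bits. */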
 |  | 
 | /* note the io_bitmap is big enough for the 64K port space. */ | 
 | static void __vmx_disable_intercept_for_io(unsigned long *io_bitmap, | 
 | 					   uint16_t port) { | 
 | 	__clear_bit(port, io_bitmap); | 
 | } | 
 |  | 
 | static void dumpmsrs(void) { | 
 | 	int i; | 
 | 	int set[] = { | 
 | 		MSR_LSTAR, | 
 | 		MSR_FS_BASE, | 
 | 		MSR_GS_BASE, | 
 | 		MSR_KERNEL_GS_BASE, | 
 | 		MSR_SFMASK, | 
 | 		MSR_IA32_PEBS_ENABLE | 
 | 	}; | 
 | 	for (i = 0; i < ARRAY_SIZE(set); i++) { | 
		printk("0x%x: 0x%llx\n", set[i], read_msr(set[i]));
 | 	} | 
 | 	printk("core id %d\n", core_id()); | 
 | } | 
 |  | 
 | /* Notes on autoloading.  We can't autoload FS_BASE or GS_BASE, according to the | 
 |  * manual, but that's because they are automatically saved and restored when all | 
 |  * of the other architectural registers are saved and restored, such as cs, ds, | 
 |  * es, and other fun things. (See 24.4.1).  We need to make sure we don't | 
 |  * accidentally intercept them too, since they are magically autoloaded. | 
 |  * | 
 |  * We'll need to be careful of any MSR we neither autoload nor intercept | 
 |  * whenever we vmenter/vmexit, and we intercept by default. | 
 |  * | 
 * Other MSRs, such as MSR_IA32_PEBS_ENABLE, only work on certain
 * architectures. */
 | static void setup_msr(struct guest_pcore *gpc) | 
 | { | 
	/* Since CPU_BASED_USE_MSR_BITMAPS is set and the bitmap is mostly 0xff, we
	 * intercept nearly all MSRs (see intel_vmm_init() for the exceptions). */
 | 	vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap)); | 
 |  | 
 | 	vmcs_write64(IO_BITMAP_A, PADDR(io_bitmap)); | 
 | 	vmcs_write64(IO_BITMAP_B, PADDR((uintptr_t)io_bitmap + | 
 | 	                                (VMX_IO_BITMAP_SZ / 2))); | 
 |  | 
 | 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | 
 | 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | 
 | 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | 
 | } | 
 |  | 
 | void vmx_setup_vmx_vmm(struct vmx_vmm *vmx) | 
 | { | 
 | 	vmx->pin_exec_ctls = vmcs_config.pin_based_exec_ctrl; | 
 | 	vmx->cpu_exec_ctls = vmcs_config.cpu_based_exec_ctrl; | 
 | 	vmx->cpu2_exec_ctls = vmcs_config.cpu_based_2nd_exec_ctrl; | 
 | } | 
 |  | 
 | /** | 
 |  *  vmx_setup_vmcs - configures the vmcs with starting parameters | 
 |  */ | 
 | static void vmx_setup_vmcs(struct guest_pcore *gpc) | 
 | { | 
 | 	vmcs_write16(VIRTUAL_PROCESSOR_ID, 0); | 
 | 	vmcs_write64(VMCS_LINK_POINTER, -1ull);	/* 22.3.1.5 */ | 
 |  | 
 | 	vmcs_write64(EPT_POINTER, gpc_get_eptp(gpc)); | 
 |  | 
 | 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); | 
 | 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); | 
 | 	vmcs_write32(CR3_TARGET_COUNT, 0);	/* 22.2.1 */ | 
 |  | 
 | 	setup_msr(gpc); | 
 |  | 
 | 	vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE; | 
 |  | 
 | 	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); | 
 | 	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); | 
 |  | 
 | 	vmcs_writel(CR0_GUEST_HOST_MASK, 0);	// ~0ul); | 
 |  | 
 | 	/* Mask some bits in CR4 as host-owned by setting them in this | 
 | 	 * VMCS entry.  For example, for now, we mark the CR4_VMXE bit | 
 | 	 * as host owned.  Right now, when Linux boots, it wants to | 
 | 	 * set CR4_VMXE to 0 at first, which is fine -- we do not want | 
 | 	 * to think about nested virtualization yet. But if we don't | 
 | 	 * mark this bit as host owned we get a VMEXIT. Marking | 
 | 	 * CR4_VMXE as host owned means that the writes will succeed | 
 | 	 * with no vmexit if the value written matches the | 
 | 	 * corresponding bit in the shadow register. */ | 
 | 	vmcs_writel(CR4_GUEST_HOST_MASK, CR4_VMXE); | 
 |  | 
 | 	//kvm_write_tsc(&vmx->gpc, 0); | 
 | 	vmcs_writel(TSC_OFFSET, 0); | 
 |  | 
 | 	vmx_setup_constant_host_state(); | 
 | } | 
 |  | 
 | /** | 
 |  * create_guest_pcore - allocates and initializes a guest physical core | 
 |  * | 
 * Returns: A new guest_pcore structure
 |  */ | 
 | struct guest_pcore *create_guest_pcore(struct proc *p, | 
 |                                        struct vmm_gpcore_init *gpci) | 
 | { | 
 | 	ERRSTACK(2); | 
 | 	int8_t state = 0; | 
 | 	struct guest_pcore *gpc = kmalloc(sizeof(struct guest_pcore), MEM_WAIT); | 
 |  | 
 | 	if (!gpc) | 
 | 		error(ENOMEM, "create_guest_pcore could not allocate gpc"); | 
 |  | 
 | 	if (waserror()) { | 
 | 		kfree(gpc); | 
 | 		nexterror(); | 
 | 	} | 
 |  | 
 | 	memset(gpc, 0, sizeof(*gpc)); | 
 |  | 
 | 	/* Warning: p here is uncounted (weak) reference */ | 
 | 	gpc->proc = p; | 
 | 	gpc->vmcs = vmx_alloc_vmcs(); | 
 | 	if (waserror()) { | 
 | 		vmx_free_vmcs(gpc->vmcs); | 
 | 		nexterror(); | 
 | 	} | 
 | 	printd("%d: gpc->vmcs is %p\n", core_id(), gpc->vmcs); | 
 | 	gpc->cpu = -1; | 
 | 	gpc->vmcs_core_id = -1; | 
 | 	gpc->should_vmresume = FALSE; | 
 |  | 
 | 	disable_irqsave(&state); | 
 | 	vmx_load_guest_pcore(gpc); | 
 | 	vmx_setup_vmcs(gpc); | 
 | 	vmx_setup_initial_guest_state(p, gpci); | 
 | 	vmx_unload_guest_pcore(gpc); | 
 | 	enable_irqsave(&state); | 
 |  | 
 | 	gpc->xcr0 = __proc_global_info.x86_default_xcr0; | 
 |  | 
 | 	gpc->posted_irq_desc = gpci->posted_irq_desc; | 
 | 	poperror(); | 
 | 	poperror(); | 
 | 	return gpc; | 
 | } | 
 |  | 
 | /** | 
 |  * destroy_guest_pcore - destroys and frees an existing guest physical core | 
 |  * @gpc: the GPC to destroy | 
 |  */ | 
 | void destroy_guest_pcore(struct guest_pcore *gpc) | 
 | { | 
 | 	vmx_free_vmcs(gpc->vmcs); | 
 | 	kfree(gpc); | 
 | } | 
 |  | 
 | static void vmx_step_instruction(void) { | 
 | 	vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + | 
 | 		    vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | 
 | } | 
 |  | 
 | /** | 
 |  * __vmx_enable - low-level enable of VMX mode on the current CPU | 
 |  * @vmxon_buf: an opaque buffer for use as the VMXON region | 
 |  */ | 
 | static int __vmx_enable(struct vmcs *vmxon_buf) { | 
 | 	uint64_t phys_addr = PADDR(vmxon_buf); | 
 | 	uint64_t old, test_bits; | 
 |  | 
 | 	if (rcr4() & X86_CR4_VMXE) { | 
 | 		panic("Should never have this happen"); | 
 | 		return -EBUSY; | 
 | 	} | 
 |  | 
 | 	old = read_msr(MSR_IA32_FEATURE_CONTROL); | 
 |  | 
 | 	test_bits = FEATURE_CONTROL_LOCKED; | 
 | 	test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | 
 |  | 
 | 	if (0)	// tboot_enabled()) | 
 | 		test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; | 
 |  | 
 | 	if ((old & test_bits) != test_bits) { | 
 | 		/* If it's locked, then trying to set it will cause a GPF. | 
 | 		 * No Dune for you! | 
 | 		 */ | 
 | 		if (old & FEATURE_CONTROL_LOCKED) { | 
 | 			printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n"); | 
 | 			return -1; | 
 | 		} | 
 |  | 
 | 		/* enable and lock */ | 
 | 		write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits); | 
 | 	} | 
 | 	lcr4(rcr4() | X86_CR4_VMXE); | 
 |  | 
 | 	__vmxon(phys_addr); | 
 | 	vpid_sync_gpc_global();	/* good idea, even if we aren't using vpids */ | 
 | 	ept_sync_global(); | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | /** | 
 |  * vmx_disable - disables VMX mode on the current CPU | 
 |  */ | 
 | static void vmx_disable(void *unused) { | 
 | 	if (currentcpu->vmx_enabled) { | 
 | 		__vmxoff(); | 
 | 		lcr4(rcr4() & ~X86_CR4_VMXE); | 
 | 		currentcpu->vmx_enabled = 0; | 
 | 	} | 
 | } | 
 |  | 
/* Probe the cpu to see whether it can do vmx.
 * Returns TRUE if it can, FALSE otherwise.
 |  */ | 
 | static bool probe_cpu_vmx(void) { | 
 | 	/* The best way to test this code is: | 
 | 	 * wrmsr -p <cpu> 0x3a 1 | 
 | 	 * This will lock vmx off; then modprobe dune. | 
 | 	 * Frequently, however, systems have all 0x3a registers set to 5, | 
 | 	 * meaning testing is impossible, as vmx can not be disabled. | 
 | 	 * We have to simulate it being unavailable in most cases. | 
 | 	 * The 'test' variable provides an easy way to simulate | 
 | 	 * unavailability of vmx on some, none, or all cpus. | 
 | 	 */ | 
 | 	if (!cpu_has_vmx()) { | 
 | 		printk("Machine does not support VT-x\n"); | 
 | 		return FALSE; | 
 | 	} else { | 
 | 		printk("Machine supports VT-x\n"); | 
 | 		return TRUE; | 
 | 	} | 
 | } | 
 |  | 
 | static int ept_init(void) { | 
 | 	if (!cpu_has_vmx_ept()) { | 
 | 		printk("VMX doesn't support EPT!\n"); | 
 | 		return -1; | 
 | 	} | 
 | 	if (!cpu_has_vmx_eptp_writeback()) { | 
 | 		printk("VMX EPT doesn't support WB memory!\n"); | 
 | 		return -1; | 
 | 	} | 
 | 	if (!cpu_has_vmx_ept_4levels()) { | 
 | 		printk("VMX EPT doesn't support 4 level walks!\n"); | 
 | 		return -1; | 
 | 	} | 
 | 	switch (arch_max_jumbo_page_shift()) { | 
 | 	case PML3_SHIFT: | 
 | 		if (!cpu_has_vmx_ept_1g_page()) { | 
 | 			printk("VMX EPT doesn't support 1 GB pages!\n"); | 
 | 			return -1; | 
 | 		} | 
 | 		break; | 
 | 	case PML2_SHIFT: | 
 | 		if (!cpu_has_vmx_ept_2m_page()) { | 
 | 			printk("VMX EPT doesn't support 2 MB pages!\n"); | 
 | 			return -1; | 
 | 		} | 
 | 		break; | 
 | 	default: | 
		printk("Unexpected jumbo page shift %d\n",
 | 		       arch_max_jumbo_page_shift()); | 
 | 		return -1; | 
 | 	} | 
 | 	if (!cpu_has_vmx_ept_ad_bits()) { | 
 | 		printk("VMX EPT doesn't support accessed/dirty!\n"); | 
 | 		x86_ept_pte_fix_ups |= EPTE_A | EPTE_D; | 
 | 	} | 
 | 	if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) { | 
 | 		printk("VMX EPT can't invalidate PTEs/TLBs!\n"); | 
 | 		return -1; | 
 | 	} | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | /** | 
 * intel_vmm_init sets up the physical core data areas that are required to run
 * a VM at all.  These data areas are not connected to a specific user process
 * in any way.  Instead, they are in some sense externalizing what would
 * otherwise be a very large ball of state that would be inside the CPU.
 |  */ | 
 | int intel_vmm_init(void) { | 
 | 	int r, cpu, ret; | 
 |  | 
 | 	if (!probe_cpu_vmx()) { | 
 | 		return -EOPNOTSUPP; | 
 | 	} | 
 |  | 
 | 	setup_vmcs_config(&ret); | 
 |  | 
 | 	if (ret) { | 
 | 		printk("setup_vmcs_config failed: %d\n", ret); | 
 | 		return ret; | 
 | 	} | 
 |  | 
 | 	msr_bitmap = (unsigned long *)kpage_zalloc_addr(); | 
 | 	if (!msr_bitmap) { | 
 | 		printk("Could not allocate msr_bitmap\n"); | 
 | 		return -ENOMEM; | 
 | 	} | 
 | 	io_bitmap = (unsigned long *)kpages_alloc(VMX_IO_BITMAP_SZ, MEM_WAIT); | 
 | 	if (!io_bitmap) { | 
		printk("Could not allocate io_bitmap\n");
 | 		kfree(msr_bitmap); | 
 | 		return -ENOMEM; | 
 | 	} | 
 | 	/* FIXME: do we need APIC virtualization (flexpriority?) */ | 
 |  | 
 | 	memset(msr_bitmap, 0xff, PAGE_SIZE); | 
 |  | 
 | 	/* The following MSRs are virtualized to the vapic page so there is no | 
 | 	 * write or read from the actual MSR. */ | 
 | 	memset((void *)msr_bitmap + INTEL_X2APIC_MSR_START, 0, | 
 | 	       INTEL_X2APIC_MSR_LENGTH); | 
 | 	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_LAPIC_EOI); | 
 | 	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_LAPIC_TPR); | 
 | 	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_LAPIC_SELF_IPI); | 
 |  | 
 | 	memset(io_bitmap, 0xff, VMX_IO_BITMAP_SZ); | 
 |  | 
 | 	/* These are the only MSRs that are not intercepted.  The hardware takes | 
 | 	 * care of FS_BASE, GS_BASE, and EFER.  We do the rest manually when loading | 
 | 	 * and unloading guest pcores. */ | 
 | 	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE); | 
 | 	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE); | 
 | 	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_EFER); | 
 | 	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE); | 
 | 	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_LSTAR); | 
 | 	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_STAR); | 
 | 	__vmx_disable_intercept_for_msr(msr_bitmap, MSR_SFMASK); | 
 |  | 
 | 	/* TODO: this might be dangerous, since they can do more than just read the | 
 | 	 * CMOS */ | 
 | 	__vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_IDX); | 
 | 	__vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_DATA); | 
 |  | 
 | 	if ((ret = ept_init())) { | 
 | 		printk("EPT init failed, %d\n", ret); | 
 | 		return ret; | 
 | 	} | 
 | 	printk("VMX setup succeeded\n"); | 
 | 	/* If this isn't true (we have VMX but not mwait), then we'll have to look | 
 | 	 * closely at CPU_BASED_MWAIT_EXITING. */ | 
 | 	assert(cpu_has_feat(CPU_FEAT_X86_MWAIT)); | 
 | 	return 0; | 
 | } | 
 |  | 
 | int intel_vmm_pcpu_init(void) | 
 | { | 
 | 	struct vmcs *vmxon_buf; | 
 | 	int ret; | 
 |  | 
 | 	vmxon_buf = __vmx_alloc_vmcs(core_id()); | 
 | 	if (!vmxon_buf) { | 
 | 		printk("setup_vmxarea failed on node %d\n", core_id()); | 
 | 		return -1; | 
 | 	} | 
 |  | 
 | 	ret = __vmx_enable(vmxon_buf); | 
 | 	if (ret) | 
 | 		goto failed; | 
 | 	currentcpu->vmx_enabled = 1; | 
 | 	return 0; | 
 | failed: | 
 | 	printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret); | 
 | 	return ret; | 
 | } | 
 |  | 
 |  | 
 | void vapic_status_dump_kernel(void *vapic) | 
 | { | 
 | 	uint32_t *p = (uint32_t *)vapic; | 
 | 	int i; | 
 | 	printk("-- BEGIN KERNEL APIC STATUS DUMP --\n"); | 
 | 	for (i = 0x100/sizeof(*p); i < 0x180/sizeof(*p); i+=4) { | 
 | 		printk("VISR : 0x%x: 0x%08x\n", i, p[i]); | 
 | 	} | 
 | 	for (i = 0x200/sizeof(*p); i < 0x280/sizeof(*p); i+=4) { | 
 | 		printk("VIRR : 0x%x: 0x%08x\n", i, p[i]); | 
 | 	} | 
 | 	i = 0x0B0/sizeof(*p); | 
 | 	printk("EOI FIELD : 0x%x, 0x%08x\n", i, p[i]); | 
 |  | 
 | 	printk("-- END KERNEL APIC STATUS DUMP --\n"); | 
 | } | 
 |  | 
 | static DEFINE_PERCPU(struct guest_pcore *, gpc_to_clear_to); | 
 |  | 
 | /* Note this is set up to allow spurious pokes.  Someone could arbitrarily send | 
 |  * us this KMSG at any time.  We only actually clear when we've previously | 
 * unloaded the GPC.  gpc_to_clear_to is only set once we've unloaded the GPC
 * and are just 'caching' it on this core. */
 | void vmx_clear_vmcs(void) | 
 | { | 
 | 	struct guest_pcore *gpc; | 
 | 	int8_t irq_state = 0; | 
 |  | 
 | 	disable_irqsave(&irq_state); | 
 | 	gpc = PERCPU_VAR(gpc_to_clear_to); | 
 | 	if (gpc) { | 
 | 		vmcs_clear(gpc->vmcs); | 
 | 		ept_sync_context(gpc_get_eptp(gpc)); | 
 | 		gpc->should_vmresume = FALSE; | 
 | 		wmb(); /* write -1 after clearing */ | 
 | 		gpc->vmcs_core_id = -1; | 
 | 		PERCPU_VAR(gpc_to_clear_to) = NULL; | 
 | 	} | 
 | 	enable_irqsave(&irq_state); | 
 | } | 
 |  | 
 | static void __clear_vmcs(uint32_t srcid, long a0, long a1, long a2) | 
 | { | 
 | 	vmx_clear_vmcs(); | 
 | } | 
 |  | 
 | /* We are safe from races on GPC, other than vmcs and vmcs_core_id.  For | 
 |  * instance, only one core can be loading or unloading a particular GPC at a | 
 |  * time.  Other cores write to our GPC's vmcs_core_id and vmcs (doing a | 
 |  * vmcs_clear).  Once they write vmcs_core_id != -1, it's ours. */ | 
 | void vmx_load_guest_pcore(struct guest_pcore *gpc) | 
 | { | 
 | 	int remote_core; | 
 |  | 
 | 	assert(!irq_is_enabled()); | 
 | 	if (gpc->vmcs_core_id == core_id()) { | 
 | 		PERCPU_VAR(gpc_to_clear_to) = NULL; | 
 | 		return; | 
 | 	} | 
 | 	/* Clear ours *before* waiting on someone else; avoids deadlock (circular | 
 | 	 * wait). */ | 
 | 	__clear_vmcs(0, 0, 0, 0); | 
 | 	remote_core = ACCESS_ONCE(gpc->vmcs_core_id); | 
 | 	if (remote_core != -1) { | 
 | 		/* This is a bit nasty.  It requires the remote core to receive | 
 | 		 * interrupts, which means we're now waiting indefinitely for them to | 
 | 		 * enable IRQs.  They can wait on another core, and so on.  We cleared | 
 | 		 * our vmcs first, so that we won't deadlock on *this*. | 
 | 		 * | 
 | 		 * However, this means we can't wait on another core with IRQs disabled | 
 | 		 * for any *other* reason.  For instance, if some other subsystem | 
 | 		 * decides to have one core wait with IRQs disabled on another, the core | 
 | 		 * that has our VMCS could be waiting on us to do something that we'll | 
 | 		 * never do. */ | 
 | 		send_kernel_message(remote_core, __clear_vmcs, 0, 0, 0, KMSG_IMMEDIATE); | 
 | 		while (gpc->vmcs_core_id != -1) | 
 | 			cpu_relax(); | 
 | 	} | 
 | 	vmcs_load(gpc->vmcs); | 
 | 	__vmx_setup_pcpu(gpc); | 
 | 	gpc->vmcs_core_id = core_id(); | 
 | } | 
 |  | 
 | void vmx_unload_guest_pcore(struct guest_pcore *gpc) | 
 | { | 
 | 	/* We don't have to worry about races yet.  No one will try to load gpc | 
 | 	 * until we've returned and unlocked, and no one will clear an old VMCS to | 
 | 	 * this GPC, since it was cleared before we finished loading (above). */ | 
 | 	assert(!irq_is_enabled()); | 
 | 	gpc->vmcs_core_id = core_id(); | 
 | 	PERCPU_VAR(gpc_to_clear_to) = gpc; | 
 | } | 
 |  | 
 | uint64_t gpc_get_eptp(struct guest_pcore *gpc) | 
 | { | 
 | 	return gpc->proc->env_pgdir.eptp; | 
 | } | 
 |  | 
 | int vmx_ctl_get_exits(struct vmx_vmm *vmx) | 
 | { | 
 | 	int ret = 0; | 
 |  | 
 | 	if (vmx->cpu_exec_ctls & CPU_BASED_HLT_EXITING) | 
 | 		ret |= VMM_CTL_EXIT_HALT; | 
 | 	if (vmx->cpu_exec_ctls & CPU_BASED_PAUSE_EXITING) | 
 | 		ret |= VMM_CTL_EXIT_PAUSE; | 
 | 	if (vmx->cpu_exec_ctls & CPU_BASED_MWAIT_EXITING) | 
 | 		ret |= VMM_CTL_EXIT_MWAIT; | 
 | 	return ret; | 
 | } | 
 |  | 
 | int vmx_ctl_set_exits(struct vmx_vmm *vmx, int vmm_exits) | 
 | { | 
 | 	int toggle_want; | 
 | 	int vmx_toggle_do = 0; | 
 |  | 
 | 	toggle_want = (vmx_ctl_get_exits(vmx) ^ vmm_exits) & VMM_CTL_ALL_EXITS; | 
 | 	if (toggle_want & VMM_CTL_EXIT_HALT) { | 
 | 	    if (!vmx_control_can_be_changed(&cbec, CPU_BASED_HLT_EXITING)) | 
 | 			error(ENOSYS, "VMX can't toggle EXIT_HALT"); | 
 | 		vmx_toggle_do |= CPU_BASED_HLT_EXITING; | 
 | 	} | 
 | 	if (toggle_want & VMM_CTL_EXIT_PAUSE) { | 
 | 	    if (!vmx_control_can_be_changed(&cbec, CPU_BASED_PAUSE_EXITING)) | 
 | 			error(ENOSYS, "VMX can't toggle EXIT_PAUSE"); | 
 | 		vmx_toggle_do |= CPU_BASED_PAUSE_EXITING; | 
 | 	} | 
 | 	if (toggle_want & VMM_CTL_EXIT_MWAIT) { | 
 | 	    if (!vmx_control_can_be_changed(&cbec, CPU_BASED_MWAIT_EXITING)) | 
 | 			error(ENOSYS, "VMX can't toggle EXIT_MWAIT"); | 
 | 		vmx_toggle_do |= CPU_BASED_MWAIT_EXITING; | 
 | 	} | 
 | 	/* This is being read concurrently by load_guest_pcore. */ | 
 | 	WRITE_ONCE(vmx->cpu_exec_ctls, vmx->cpu_exec_ctls ^ vmx_toggle_do); | 
 | 	return 0; | 
 | } |
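/* An illustrative use of the two hooks above: to stop exiting on HLT (assuming
 * the hardware and cbec's policy_changeable allow toggling
 * CPU_BASED_HLT_EXITING), a caller could do:
 *
 *	vmx_ctl_set_exits(vmx, vmx_ctl_get_exits(vmx) & ~VMM_CTL_EXIT_HALT);
 *
 * The new cpu_exec_ctls value takes effect the next time the GPC is loaded,
 * when __vmx_setup_pcpu() writes CPU_BASED_VM_EXEC_CONTROL. */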