| /* Copyright 2015 Google Inc. |
| * |
| * See LICENSE for details. |
| */ |
| |
| /* We're not going to fall into the trap of only compiling support |
| * for AMD OR Intel for an image. It all gets compiled in, and which |
| * one you use depends on cpuinfo, not a compile-time |
| * switch. That's proven to be the best strategy. Conditionally |
| * compiling in support is the path to hell. |
| */ |
| #include <assert.h> |
| #include <pmap.h> |
| #include <smp.h> |
| #include <kmalloc.h> |
| |
| #include <ros/vmm.h> |
| #include "intel/vmx.h" |
| #include "vmm.h" |
| #include <trap.h> |
| #include <umem.h> |
| |
| #include <arch/x86.h> |
| #include <ros/procinfo.h> |
| |
| |
| /* TODO: have better cpuid info storage and checks */ |
| bool x86_supports_vmx = FALSE; |
| |
| /* Figure out what kind of CPU we are on, and if it supports any reasonable |
| * virtualization. For now, if we're not some sort of newer intel, don't |
| * bother. This does all cores. Again, note, we make these decisions at runtime, |
| * to avoid getting into the problems that compile-time decisions can cause. |
| * At this point, of course, it's still all intel. |
| */ |
| void vmm_init(void) |
| { |
| int ret; |
| /* Check first for intel capabilities. This is hence two back-to-back |
| * implementation-dependent checks. That's ok, it's all msr dependent. |
| */ |
| ret = intel_vmm_init(); |
| if (!ret) { |
| x86_supports_vmx = TRUE; |
| return; |
| } |
| |
| /* TODO: AMD. Will we ever care? It's not clear. */ |
| printk("vmm_init failed, ret %d\n", ret); |
| return; |
| } |
| |
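| /* Per-core counterpart to vmm_init: marks this core as not running a guest |
| * pcore and runs the Intel per-core setup if VMX is supported. */ |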
| void vmm_pcpu_init(void) |
| { |
| struct per_cpu_info *pcpui = &per_cpu_info[core_id()]; |
| |
| pcpui->guest_pcoreid = -1; |
| if (!x86_supports_vmx) |
| return; |
| if (!intel_vmm_pcpu_init()) { |
| printd("vmm_pcpu_init worked\n"); |
| return; |
| } |
| /* TODO: AMD. Will we ever care? It's not clear. */ |
| printk("vmm_pcpu_init failed\n"); |
| } |
| |
| /* Initializes a process to run virtual machine contexts, returning the number |
| * initialized, throwing on error. */ |
| int vmm_struct_init(struct proc *p, unsigned int nr_guest_pcores, |
| struct vmm_gpcore_init *u_gpcis, int flags) |
| { |
| ERRSTACK(1); |
| struct vmm *vmm = &p->vmm; |
| struct vmm_gpcore_init gpci; |
| |
| if (flags & ~VMM_ALL_FLAGS) |
| error(EINVAL, "%s: flags is 0x%lx, VMM_ALL_FLAGS is 0x%lx\n", __func__, |
| flags, VMM_ALL_FLAGS); |
| vmm->flags = flags; |
| if (!x86_supports_vmx) |
| error(ENODEV, "This CPU does not support VMX"); |
| qlock(&vmm->qlock); |
| if (waserror()) { |
| qunlock(&vmm->qlock); |
| nexterror(); |
| } |
| |
| /* TODO: just use an atomic test instead of all this locking stuff? */ |
| if (vmm->vmmcp) |
| error(EAGAIN, "We're already running a vmmcp?"); |
| /* Set this early, so cleanup checks the gpc array */ |
| vmm->vmmcp = TRUE; |
| vmm->amd = 0; |
| vmx_setup_vmx_vmm(&vmm->vmx); |
| nr_guest_pcores = MIN(nr_guest_pcores, num_cores); |
| vmm->guest_pcores = kzmalloc(sizeof(void *) * nr_guest_pcores, MEM_WAIT); |
| if (!vmm->guest_pcores) |
| error(ENOMEM, "Allocation of vmm->guest_pcores failed"); |
| |
| for (int i = 0; i < nr_guest_pcores; i++) { |
| if (copy_from_user(&gpci, &u_gpcis[i], sizeof(struct vmm_gpcore_init))) |
| error(EINVAL, "Bad pointer %p for gps", u_gpcis); |
| vmm->guest_pcores[i] = create_guest_pcore(p, &gpci); |
| vmm->nr_guest_pcores = i + 1; |
| } |
| for (int i = 0; i < VMM_VMEXIT_NR_TYPES; i++) |
| vmm->vmexits[i] = 0; |
| qunlock(&vmm->qlock); |
| poperror(); |
| return vmm->nr_guest_pcores; |
| } |
| |
| /* Has no concurrency protection - only call this when you know you have the |
| * only ref to vmm. For instance, from __proc_free, where there is only one ref |
| * to the proc (and thus proc.vmm). */ |
| void __vmm_struct_cleanup(struct proc *p) |
| { |
| struct vmm *vmm = &p->vmm; |
| |
| if (!vmm->vmmcp) |
| return; |
| for (int i = 0; i < vmm->nr_guest_pcores; i++) { |
| if (vmm->guest_pcores[i]) |
| destroy_guest_pcore(vmm->guest_pcores[i]); |
| } |
| kfree(vmm->guest_pcores); |
| ept_flush(p->env_pgdir.eptp); |
| vmm->vmmcp = FALSE; |
| } |
| |
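| /* Sends an IPI to the physical core currently running guest_pcoreid, if any. |
| * This is a best-effort kick, e.g. to get that core to notice a posted IRQ; |
| * the gpc can migrate or stop running at any moment after our unlocked peek. |
| * Returns 0 for "we tried", -1 with an error set for a bad guest_pcoreid. */ |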
| int vmm_poke_guest(struct proc *p, int guest_pcoreid) |
| { |
| struct guest_pcore *gpc; |
| int pcoreid; |
| |
| gpc = lookup_guest_pcore(p, guest_pcoreid); |
| if (!gpc) { |
| set_error(ENOENT, "Bad guest_pcoreid %d", guest_pcoreid); |
| return -1; |
| } |
| /* We're doing an unlocked peek; it could change immediately. This is a |
| * best effort service. */ |
| pcoreid = ACCESS_ONCE(gpc->cpu); |
| if (pcoreid == -1) { |
| /* So we know that we'll miss the poke for the posted IRQ. We could |
| * return an error. However, error handling for this case isn't |
| * particularly helpful (yet). The absence of the error does not mean |
| * the IRQ was posted. We'll still return 0, meaning "the user didn't |
| * mess up; we tried." */ |
| return 0; |
| } |
| send_ipi(pcoreid, I_POKE_GUEST); |
| return 0; |
| } |
| |
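| /* Returns the guest_pcore for guest_pcoreid, or 0 if it is out of range. */ |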
| struct guest_pcore *lookup_guest_pcore(struct proc *p, int guest_pcoreid) |
| { |
| /* nr_guest_pcores is written once at setup and never changed */ |
| if (guest_pcoreid < 0 || guest_pcoreid >= p->vmm.nr_guest_pcores) |
| return 0; |
| return p->vmm.guest_pcores[guest_pcoreid]; |
| } |
| |
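| /* Claims guest_pcoreid for the calling physical core and loads its state: |
| * the VMCS (via vmx_load_guest_pcore), the guest's xcr0, and the manually |
| * managed MSRs. Returns the gpc on success, or 0 if the gpc doesn't exist |
| * or is already loaded on another core. */ |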
| struct guest_pcore *load_guest_pcore(struct proc *p, int guest_pcoreid) |
| { |
| struct guest_pcore *gpc; |
| struct per_cpu_info *pcpui = &per_cpu_info[core_id()]; |
| |
| gpc = lookup_guest_pcore(p, guest_pcoreid); |
| if (!gpc) |
| return 0; |
| assert(pcpui->guest_pcoreid == -1); |
| spin_lock(&p->vmm.lock); |
| if (gpc->cpu != -1) { |
| spin_unlock(&p->vmm.lock); |
| return 0; |
| } |
| gpc->cpu = core_id(); |
| spin_unlock(&p->vmm.lock); |
| /* We've got dibs on the gpc; we don't need to hold the lock any longer. */ |
| pcpui->guest_pcoreid = guest_pcoreid; |
| vmx_load_guest_pcore(gpc); |
| /* Load guest's xcr0 */ |
| lxcr0(gpc->xcr0); |
| |
| /* Manual MSR save/restore */ |
| write_kern_gsbase(gpc->msr_kern_gs_base); |
| if (gpc->msr_star != AKAROS_MSR_STAR) |
| write_msr(MSR_STAR, gpc->msr_star); |
| if (gpc->msr_lstar != AKAROS_MSR_LSTAR) |
| write_msr(MSR_LSTAR, gpc->msr_lstar); |
| if (gpc->msr_sfmask != AKAROS_MSR_SFMASK) |
| write_msr(MSR_SFMASK, gpc->msr_sfmask); |
| |
| return gpc; |
| } |
| |
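| /* Inverse of load_guest_pcore: saves the guest's xcr0 and manually managed |
| * MSRs into the gpc, restores Akaros's defaults, and marks the gpc as no |
| * longer running on any core. The caller must have loaded this gpc on the |
| * calling core. */ |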
| void unload_guest_pcore(struct proc *p, int guest_pcoreid) |
| { |
| struct guest_pcore *gpc; |
| struct per_cpu_info *pcpui = &per_cpu_info[core_id()]; |
| |
| gpc = lookup_guest_pcore(p, guest_pcoreid); |
| assert(gpc); |
| spin_lock(&p->vmm.lock); |
| assert(gpc->cpu != -1); |
| vmx_unload_guest_pcore(gpc); |
| gpc->cpu = -1; |
| |
| /* Save guest's xcr0 and restore Akaros's default. */ |
| gpc->xcr0 = rxcr0(); |
| lxcr0(__proc_global_info.x86_default_xcr0); |
| |
| /* We manage these MSRs manually. */ |
| gpc->msr_kern_gs_base = read_kern_gsbase(); |
| gpc->msr_star = read_msr(MSR_STAR); |
| gpc->msr_lstar = read_msr(MSR_LSTAR); |
| gpc->msr_sfmask = read_msr(MSR_SFMASK); |
| |
| write_kern_gsbase((uint64_t)pcpui); |
| if (gpc->msr_star != AKAROS_MSR_STAR) |
| write_msr(MSR_STAR, AKAROS_MSR_STAR); |
| if (gpc->msr_lstar != AKAROS_MSR_LSTAR) |
| write_msr(MSR_LSTAR, AKAROS_MSR_LSTAR); |
| if (gpc->msr_sfmask != AKAROS_MSR_SFMASK) |
| write_msr(MSR_SFMASK, AKAROS_MSR_SFMASK); |
| |
| /* As soon as we unlock, this gpc can be started on another core */ |
| spin_unlock(&p->vmm.lock); |
| pcpui->guest_pcoreid = -1; |
| } |
| |
| /* Emulated MSR. For now, an MSR value and a pointer to a helper that |
| * performs the requested operation. written/edx/eax let the fake-write |
| * handlers latch the guest's last write so later reads return it. |
| */ |
| struct emmsr { |
| uint32_t reg; |
| char *name; |
| bool (*f)(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode); |
| bool written; |
| uint32_t edx, eax; |
| }; |
| |
| static bool emsr_miscenable(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode); |
| static bool emsr_readonly(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode); |
| static bool emsr_readzero(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode); |
| static bool emsr_fakewrite(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode); |
| static bool emsr_ok(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode); |
| static bool emsr_fake_apicbase(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode); |
| |
| struct emmsr emmsrs[] = { |
| {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable}, |
| {MSR_IA32_SYSENTER_CS, "MSR_IA32_SYSENTER_CS", emsr_ok}, |
| {MSR_IA32_SYSENTER_EIP, "MSR_IA32_SYSENTER_EIP", emsr_ok}, |
| {MSR_IA32_SYSENTER_ESP, "MSR_IA32_SYSENTER_ESP", emsr_ok}, |
| {MSR_IA32_UCODE_REV, "MSR_IA32_UCODE_REV", emsr_fakewrite}, |
| {MSR_CSTAR, "MSR_CSTAR", emsr_fakewrite}, |
| {MSR_IA32_VMX_BASIC_MSR, "MSR_IA32_VMX_BASIC_MSR", emsr_fakewrite}, |
| {MSR_IA32_VMX_PINBASED_CTLS_MSR, "MSR_IA32_VMX_PINBASED_CTLS_MSR", |
| emsr_fakewrite}, |
| {MSR_IA32_VMX_PROCBASED_CTLS_MSR, "MSR_IA32_VMX_PROCBASED_CTLS_MSR", |
| emsr_fakewrite}, |
| {MSR_IA32_VMX_PROCBASED_CTLS2, "MSR_IA32_VMX_PROCBASED_CTLS2", |
| emsr_fakewrite}, |
| {MSR_IA32_VMX_EXIT_CTLS_MSR, "MSR_IA32_VMX_EXIT_CTLS_MSR", |
| emsr_fakewrite}, |
| {MSR_IA32_VMX_ENTRY_CTLS_MSR, "MSR_IA32_VMX_ENTRY_CTLS_MSR", |
| emsr_fakewrite}, |
| {MSR_IA32_ENERGY_PERF_BIAS, "MSR_IA32_ENERGY_PERF_BIAS", |
| emsr_fakewrite}, |
| {MSR_LBR_SELECT, "MSR_LBR_SELECT", emsr_ok}, |
| {MSR_LBR_TOS, "MSR_LBR_TOS", emsr_ok}, |
| {MSR_LBR_NHM_FROM, "MSR_LBR_NHM_FROM", emsr_ok}, |
| {MSR_LBR_NHM_TO, "MSR_LBR_NHM_TO", emsr_ok}, |
| {MSR_LBR_CORE_FROM, "MSR_LBR_CORE_FROM", emsr_ok}, |
| {MSR_LBR_CORE_TO, "MSR_LBR_CORE_TO", emsr_ok}, |
| |
| // grumble. |
| {MSR_OFFCORE_RSP_0, "MSR_OFFCORE_RSP_0", emsr_ok}, |
| {MSR_OFFCORE_RSP_1, "MSR_OFFCORE_RSP_1", emsr_ok}, |
| // louder. |
| {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok}, |
| // aaaaaahhhhhhhhhhhhhhhhhhhhh |
| {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok}, |
| {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok}, |
| {MSR_IA32_PERF_CAPABILITIES, "MSR_IA32_PERF_CAPABILITIES", emsr_readzero}, |
| // unsafe. |
| {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fake_apicbase}, |
| |
| // mostly harmless. |
| {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite}, |
| {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero}, |
| {MSR_IA32_MCG_CAP, "MSR_IA32_MCG_CAP", emsr_readzero}, |
| {MSR_IA32_DEBUGCTLMSR, "MSR_IA32_DEBUGCTLMSR", emsr_fakewrite}, |
| |
| // TBD |
| {MSR_IA32_TSC_DEADLINE, "MSR_IA32_TSC_DEADLINE", emsr_fakewrite}, |
| }; |
| |
| /* this may be the only register that needs special handling. |
| * If there are others then we might want to extend the emmsr struct. |
| */ |
| bool emsr_miscenable(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode) |
| { |
| uint64_t val; |
| uint32_t eax, edx; |
| |
| if (read_msr_safe(msr->reg, &val)) |
| return FALSE; |
| eax = low32(val); |
| eax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; |
| edx = high32(val); |
| /* we just let them read the misc msr for now. */ |
| if (opcode == VMM_MSR_EMU_READ) { |
| vm_tf->tf_rax = eax; |
| vm_tf->tf_rdx = edx; |
| return TRUE; |
| } else { |
| /* if they are writing what is already written, that's ok. */ |
| if (((uint32_t) vm_tf->tf_rax == eax) |
| && ((uint32_t) vm_tf->tf_rdx == edx)) |
| return TRUE; |
| } |
| printk("%s: Wanted to write 0x%x%x, but could not; value was 0x%x%x\n", |
| msr->name, (uint32_t) vm_tf->tf_rdx, (uint32_t) vm_tf->tf_rax, |
| edx, eax); |
| return FALSE; |
| } |
| |
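| /* Reads pass through to the real MSR; writes are rejected. */ |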
| bool emsr_readonly(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode) |
| { |
| uint64_t val; |
| |
| if (read_msr_safe(msr->reg, &val)) |
| return FALSE; |
| if (opcode == VMM_MSR_EMU_READ) { |
| vm_tf->tf_rax = low32(val); |
| vm_tf->tf_rdx = high32(val); |
| return TRUE; |
| } |
| |
| printk("%s: Tried to write a readonly register\n", msr->name); |
| return FALSE; |
| } |
| |
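| /* Reads return zero; writes are rejected. */ |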
| bool emsr_readzero(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode) |
| { |
| if (opcode == VMM_MSR_EMU_READ) { |
| vm_tf->tf_rax = 0; |
| vm_tf->tf_rdx = 0; |
| return TRUE; |
| } |
| |
| printk("%s: Tried to write a readonly register\n", msr->name); |
| return FALSE; |
| } |
| |
| /* Pretend to write it, but never touch the real MSR. The write is latched |
| * in msr->eax/edx, so later reads return what the guest wrote. */ |
| bool emsr_fakewrite(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode) |
| { |
| uint32_t eax, edx; |
| uint64_t val; |
| |
| if (!msr->written) { |
| if (read_msr_safe(msr->reg, &val)) |
| return FALSE; |
| eax = low32(val); |
| edx = high32(val); |
| } else { |
| eax = msr->eax; |
| edx = msr->edx; |
| } |
| /* Reads return the current (possibly faked) value. */ |
| if (opcode == VMM_MSR_EMU_READ) { |
| vm_tf->tf_rax = eax; |
| vm_tf->tf_rdx = edx; |
| return TRUE; |
| } else { |
| msr->edx = vm_tf->tf_rdx; |
| msr->eax = vm_tf->tf_rax; |
| msr->written = TRUE; |
| } |
| return TRUE; |
| } |
| |
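| /* Passes both reads and writes through to the real MSR. */ |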
| bool emsr_ok(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode) |
| { |
| uint64_t val; |
| |
| if (opcode == VMM_MSR_EMU_READ) { |
| if (read_msr_safe(msr->reg, &val)) |
| return FALSE; |
| vm_tf->tf_rax = low32(val); |
| vm_tf->tf_rdx = high32(val); |
| } else { |
| val = (vm_tf->tf_rdx << 32) | (vm_tf->tf_rax & 0xffffffff); |
| if (write_msr_safe(msr->reg, val)) |
| return FALSE; |
| } |
| return TRUE; |
| } |
| |
| /* Fake the APIC base MSR: reads return a canned base (BSP bit set only for |
| * guest pcore 0) until the guest writes it, after which reads return the |
| * latched value. The real MSR is never written. */ |
| bool emsr_fake_apicbase(struct emmsr *msr, struct vm_trapframe *vm_tf, |
| uint32_t opcode) |
| { |
| uint32_t eax, edx; |
| |
| if (!msr->written) { |
| /* TODO: tightly coupled to the addr in vmrunkernel. We want this func |
| * to return the val that vmrunkernel put into the VMCS. */ |
| eax = 0xfee00d00; |
| if (vm_tf->tf_guest_pcoreid != 0) { |
| // Remove BSP bit if not core 0 |
| eax = 0xfee00c00; |
| } |
| edx = 0; |
| } else { |
| edx = msr->edx; |
| eax = msr->eax; |
| } |
| /* Reads return the current (faked) APIC base. */ |
| if (opcode == VMM_MSR_EMU_READ) { |
| vm_tf->tf_rax = eax; |
| vm_tf->tf_rdx = edx; |
| return TRUE; |
| } else { |
| /* if they are writing what is already written, that's ok. */ |
| if (((uint32_t) vm_tf->tf_rax == eax) |
| && ((uint32_t) vm_tf->tf_rdx == edx)) |
| return TRUE; |
| msr->edx = vm_tf->tf_rdx; |
| msr->eax = vm_tf->tf_rax; |
| msr->written = TRUE; |
| } |
| return TRUE; |
| } |
| |
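| /* Emulates the MSR access described by vm_tf (MSR number in tf_rcx, data in |
| * tf_rdx:tf_rax) by dispatching to the matching entry in emmsrs[]. op is |
| * VMM_MSR_EMU_READ for a rdmsr; anything else is treated as a wrmsr. Returns |
| * FALSE if the MSR isn't emulated or the handler refused the access, in which |
| * case the rdmsr/wrmsr exit handler presumably reflects a fault. */ |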
| bool vmm_emulate_msr(struct vm_trapframe *vm_tf, int op) |
| { |
| for (int i = 0; i < ARRAY_SIZE(emmsrs); i++) { |
| if (emmsrs[i].reg != vm_tf->tf_rcx) |
| continue; |
| return emmsrs[i].f(&emmsrs[i], vm_tf, op); |
| } |
| return FALSE; |
| } |