| /* |
| * Kernel-based Virtual Machine driver for Linux |
| * |
| * This module enables machines with Intel VT-x extensions to run virtual |
| * machines without emulation or binary translation. |
| * |
| * Copyright (C) 2006 Qumranet, Inc. |
| * |
| * Authors: |
| * Avi Kivity <avi@qumranet.com> |
| * Yaniv Kamay <yaniv@qumranet.com> |
| * |
| */ |
| |
| #define DEBUG |
| #define LITEVM_DEBUG |
| |
| #include <kmalloc.h> |
| #include <string.h> |
| #include <stdio.h> |
| #include <assert.h> |
| #include <error.h> |
| #include <pmap.h> |
| #include <sys/queue.h> |
| #include <smp.h> |
| #include <kref.h> |
| #include <atomic.h> |
| #include <alarm.h> |
| #include <event.h> |
| #include <umem.h> |
| #include <devalarm.h> |
| #include <arch/types.h> |
| #include <arch/vm.h> |
| #include <arch/emulate.h> |
| #include <arch/vmdebug.h> |
| #include <arch/msr-index.h> |
| |
| #define currentcpu (&per_cpu_info[core_id()]) |
| |
| struct litevm_stat litevm_stat; |
| |
| static struct litevm_stats_debugfs_item { |
| const char *name; |
| uint32_t *data; |
| } debugfs_entries[] = { |
| { "pf_fixed", &litevm_stat.pf_fixed }, |
| { "pf_guest", &litevm_stat.pf_guest }, |
| { "tlb_flush", &litevm_stat.tlb_flush }, |
| { "invlpg", &litevm_stat.invlpg }, |
| { "exits", &litevm_stat.exits }, |
| { "io_exits", &litevm_stat.io_exits }, |
| { "mmio_exits", &litevm_stat.mmio_exits }, |
| { "signal_exits", &litevm_stat.signal_exits }, |
| { "irq_exits", &litevm_stat.irq_exits }, |
| { 0, 0 } |
| }; |
| |
| static struct dentry *debugfs_dir; |
| |
| static const uint32_t vmx_msr_index[] = { |
| #ifdef __x86_64__ |
| MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, |
| #endif |
	MSR_EFER, /* XXX: MSR_K6_STAR omitted; revisit whether it needs save/restore */
| }; |
| #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index)) |
| |
| #ifdef __x86_64__ |
| /* |
| * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt |
| * mechanism (cpu bug AA24) |
| */ |
| #define NR_BAD_MSRS 2 |
| #else |
| #define NR_BAD_MSRS 0 |
| #endif |
| |
| #define TSS_IOPB_BASE_OFFSET 0x66 |
| #define TSS_BASE_SIZE 0x68 |
| #define TSS_IOPB_SIZE (65536 / 8) |
| #define TSS_REDIRECTION_SIZE (256 / 8) |
| #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) |
| |
| #define MSR_IA32_VMX_BASIC_MSR 0x480 |
| #define MSR_IA32_VMX_PINBASED_CTLS_MSR 0x481 |
| #define MSR_IA32_VMX_PROCBASED_CTLS_MSR 0x482 |
| #define MSR_IA32_VMX_EXIT_CTLS_MSR 0x483 |
| #define MSR_IA32_VMX_ENTRY_CTLS_MSR 0x484 |
| |
| #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL |
| #define LMSW_GUEST_MASK 0x0eULL |
| #define CR4_RESEVED_BITS (~((1ULL << 11) - 1)) |
| //#define CR4_VMXE 0x2000 |
| #define CR8_RESEVED_BITS (~0x0fULL) |
| #define EFER_RESERVED_BITS 0xfffffffffffff2fe |
| |
| #ifdef __x86_64__ |
| #define HOST_IS_64 1 |
| #else |
| #define HOST_IS_64 0 |
| #endif |
| |
/* Bit ops are not yet widely used in Akaros, and we're not sure where to put them. */
| /** |
| * __ffs - find first set bit in word |
| * @word: The word to search |
| * |
| * Undefined if no bit exists, so code should check against 0 first. |
| */ |
| static inline unsigned long __ffs(unsigned long word) |
| { |
| print_func_entry(); |
| asm("rep; bsf %1,%0" |
| : "=r" (word) |
| : "rm" (word)); |
| print_func_exit(); |
| return word; |
| } |
| |
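/*
 * Find the guest MSR save/restore slot for a given MSR index, or return 0 if
 * this vcpu does not track that MSR.
 */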
| static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu, uint32_t msr) |
| { |
| print_func_entry(); |
| int i; |
| |
| for (i = 0; i < vcpu->nmsrs; ++i) |
| if (vcpu->guest_msrs[i].index == msr) { |
| print_func_exit(); |
| return &vcpu->guest_msrs[i]; |
| } |
| print_func_exit(); |
| return 0; |
| } |
| |
| struct descriptor_table { |
| uint16_t limit; |
| unsigned long base; |
| } __attribute__((packed)); |
| |
| static void get_gdt(struct descriptor_table *table) |
| { |
| print_func_entry(); |
| asm ("sgdt %0" : "=m"(*table)); |
| print_func_exit(); |
| } |
| |
| static void get_idt(struct descriptor_table *table) |
| { |
| print_func_entry(); |
| asm ("sidt %0" : "=m"(*table)); |
| print_func_exit(); |
| } |
| |
| static uint16_t read_fs(void) |
| { |
| print_func_entry(); |
| uint16_t seg; |
| asm ("mov %%fs, %0" : "=g"(seg)); |
| print_func_exit(); |
| return seg; |
| } |
| |
| static uint16_t read_gs(void) |
| { |
| print_func_entry(); |
| uint16_t seg; |
| asm ("mov %%gs, %0" : "=g"(seg)); |
| print_func_exit(); |
| return seg; |
| } |
| |
| static uint16_t read_ldt(void) |
| { |
| print_func_entry(); |
| uint16_t ldt; |
| asm ("sldt %0" : "=g"(ldt)); |
| print_func_exit(); |
| return ldt; |
| } |
| |
| static void load_fs(uint16_t sel) |
| { |
| print_func_entry(); |
| asm ("mov %0, %%fs" : : "g"(sel)); |
| print_func_exit(); |
| } |
| |
| static void load_gs(uint16_t sel) |
| { |
| print_func_entry(); |
| asm ("mov %0, %%gs" : : "g"(sel)); |
| print_func_exit(); |
| } |
| |
| #ifndef load_ldt |
| static void load_ldt(uint16_t sel) |
| { |
| print_func_entry(); |
| asm ("lldt %0" : : "g"(sel)); |
| print_func_exit(); |
| } |
| #endif |
| |
| static void fx_save(void *image) |
| { |
| print_func_entry(); |
| asm ("fxsave (%0)":: "r" (image)); |
| print_func_exit(); |
| } |
| |
| static void fx_restore(void *image) |
| { |
| print_func_entry(); |
| asm ("fxrstor (%0)":: "r" (image)); |
| print_func_exit(); |
| } |
| |
| static void fpu_init(void) |
| { |
| print_func_entry(); |
| asm ("finit"); |
| print_func_exit(); |
| } |
| |
| struct segment_descriptor { |
| uint16_t limit_low; |
| uint16_t base_low; |
| uint8_t base_mid; |
| uint8_t type : 4; |
| uint8_t system : 1; |
| uint8_t dpl : 2; |
| uint8_t present : 1; |
| uint8_t limit_high : 4; |
| uint8_t avl : 1; |
| uint8_t long_mode : 1; |
| uint8_t default_op : 1; |
| uint8_t granularity : 1; |
| uint8_t base_high; |
| } __attribute__((packed)); |
| |
| #ifdef __x86_64__ |
| // LDT or TSS descriptor in the GDT. 16 bytes. |
| struct segment_descriptor_64 { |
| struct segment_descriptor s; |
| uint32_t base_higher; |
| uint32_t pad_zero; |
| }; |
| |
| #endif |
| |
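/*
 * Compute the linear base of the segment named by a selector by reading its
 * descriptor from the GDT (or the LDT, when the TI bit is set). On x86_64,
 * system descriptors are 16 bytes, so the upper 32 bits of the base come from
 * the extended half of the descriptor.
 */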
| static unsigned long segment_base(uint16_t selector) |
| { |
| print_func_entry(); |
| struct descriptor_table gdt; |
| struct segment_descriptor *d; |
| unsigned long table_base; |
| typedef unsigned long ul; |
| unsigned long v; |
| |
| asm ("sgdt %0" : "=m"(gdt)); |
| table_base = gdt.base; |
| |
| if (selector & 4) { /* from ldt */ |
| uint16_t ldt_selector; |
| |
| asm ("sldt %0" : "=g"(ldt_selector)); |
| table_base = segment_base(ldt_selector); |
| } |
| d = (struct segment_descriptor *)(table_base + (selector & ~7)); |
| v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); |
| #ifdef __x86_64__ |
| if (d->system == 0 |
| && (d->type == 2 || d->type == 9 || d->type == 11)) |
| v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; |
| #endif |
| print_func_exit(); |
| return v; |
| } |
| |
| static unsigned long read_tr_base(void) |
| { |
| print_func_entry(); |
| uint16_t tr; |
| asm ("str %0" : "=g"(tr)); |
| print_func_exit(); |
| return segment_base(tr); |
| } |
| |
| static void reload_tss(void) |
| { |
| print_func_entry(); |
| #ifndef __x86_64__ |
| |
| /* |
| * VT restores TR but not its size. Useless. |
| */ |
| struct descriptor_table gdt; |
| struct segment_descriptor *descs; |
| |
| get_gdt(&gdt); |
| descs = (void *)gdt.base; |
| descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ |
| load_TR_desc(); |
| #endif |
| print_func_exit(); |
| } |
| |
| static struct vmcs_descriptor { |
| int size; |
| int order; |
| uint32_t revision_id; |
| } vmcs_descriptor; |
| |
| static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn) |
| { |
| print_func_entry(); |
| struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn); |
| print_func_exit(); |
| return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0; |
| } |
| |
| |
| |
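/*
 * Copy bytes from a guest virtual address into a host buffer, translating one
 * page at a time with gva_to_hpa(). Stops early if a translation fails and
 * returns the number of bytes actually copied.
 */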
| int litevm_read_guest(struct litevm_vcpu *vcpu, |
| gva_t addr, |
| unsigned long size, |
| void *dest) |
| { |
| print_func_entry(); |
| unsigned char *host_buf = dest; |
| unsigned long req_size = size; |
| |
| while (size) { |
| hpa_t paddr; |
| unsigned now; |
| unsigned offset; |
| hva_t guest_buf; |
| |
| paddr = gva_to_hpa(vcpu, addr); |
| |
| if (is_error_hpa(paddr)) |
| break; |
| guest_buf = (hva_t)KADDR(paddr); |
| offset = addr & ~PAGE_MASK; |
| guest_buf |= offset; |
| now = MIN(size, PAGE_SIZE - offset); |
| memcpy(host_buf, (void*)guest_buf, now); |
| host_buf += now; |
| addr += now; |
| size -= now; |
| } |
| print_func_exit(); |
| return req_size - size; |
| } |
| |
| int litevm_write_guest(struct litevm_vcpu *vcpu, |
| gva_t addr, |
| unsigned long size, |
| void *data) |
| { |
| print_func_entry(); |
| unsigned char *host_buf = data; |
| unsigned long req_size = size; |
| |
| while (size) { |
| hpa_t paddr; |
| unsigned now; |
| unsigned offset; |
| hva_t guest_buf; |
| |
| paddr = gva_to_hpa(vcpu, addr); |
| |
| if (is_error_hpa(paddr)) |
| break; |
| |
| guest_buf = (hva_t)KADDR(paddr); |
| offset = addr & ~PAGE_MASK; |
| guest_buf |= offset; |
| now = MIN(size, PAGE_SIZE - offset); |
| memcpy((void*)guest_buf, host_buf, now); |
| host_buf += now; |
| addr += now; |
| size -= now; |
| } |
| print_func_exit(); |
| return req_size - size; |
| } |
| |
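/*
 * Read the IA32_VMX_BASIC MSR to learn the size, allocation order, and
 * revision id of the VMCS region this processor expects.
 */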
| static void setup_vmcs_descriptor(void) |
| { |
| print_func_entry(); |
| uint64_t msr; |
| |
| msr = read_msr(MSR_IA32_VMX_BASIC_MSR); |
| vmcs_descriptor.size = (msr>>32) & 0x1fff; |
| vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size>>PAGE_SHIFT); |
| vmcs_descriptor.revision_id = (uint32_t)msr; |
| printk("setup_vmcs_descriptor: msr 0x%x, size 0x%x order 0x%x id 0x%x\n", |
| msr, vmcs_descriptor.size, vmcs_descriptor.order, |
| vmcs_descriptor.revision_id); |
| print_func_exit(); |
| }; |
| |
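/*
 * VMCLEAR the given VMCS: flush its cached state to memory and mark it
 * inactive so it can later be made current on any core with VMPTRLD.
 */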
| static void vmcs_clear(struct vmcs *vmcs) |
| { |
| print_func_entry(); |
| uint64_t phys_addr = PADDR(vmcs); |
| uint8_t error; |
| printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr); |
| asm volatile ("vmclear %1; setna %0" |
| : "=m"(error) : "m"(phys_addr) : "cc", "memory" ); |
| if (error) |
| printk("litevm: vmclear fail: %p/%llx\n", |
| vmcs, phys_addr); |
| print_func_exit(); |
| } |
| |
| static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg) |
| { |
| print_func_entry(); |
| struct litevm_vcpu *vcpu = arg; |
| int cpu = core_id(); |
| printd("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n", |
| cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs); |
| |
| if (vcpu->cpu == cpu) |
| vmcs_clear(vcpu->vmcs); |
| |
| if (currentcpu->vmcs == vcpu->vmcs) |
| currentcpu->vmcs = NULL; |
| print_func_exit(); |
| } |
| |
| static int vcpu_slot(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| print_func_exit(); |
| return vcpu - vcpu->litevm->vcpus; |
| } |
| |
| /* |
| * Switches to specified vcpu, until a matching vcpu_put(), but assumes |
| * vcpu mutex is already taken. |
| */ |
| static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| uint64_t phys_addr = PADDR(vcpu->vmcs); |
| int cpu; |
| cpu = core_id(); |
| printk("%d: __vcpu_load phys_addr %p\n", cpu, phys_addr); |
| if (vcpu->cpu != cpu) { |
| handler_wrapper_t *w; |
| smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w); |
| smp_call_wait(w); |
| vcpu->launched = 0; |
| } |
| if (currentcpu->vmcs != vcpu->vmcs) { |
| uint8_t error; |
| |
| currentcpu->vmcs = vcpu->vmcs; |
| asm volatile ("vmptrld %1; setna %0" |
| : "=m"(error) : "m"(phys_addr) : "cc" ); |
| if (error){ |
| printk("litevm: vmptrld %p/%llx fail\n", |
| vcpu->vmcs, phys_addr); |
| error("litevm: vmptrld %p/%llx fail\n", |
| vcpu->vmcs, phys_addr); |
| } |
| } |
| |
| if (vcpu->cpu != cpu) { |
| struct descriptor_table dt; |
| unsigned long sysenter_esp; |
| |
| vcpu->cpu = cpu; |
| /* |
| * Linux uses per-cpu TSS and GDT, so set these when switching |
| * processors. |
| */ |
| vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ |
| get_gdt(&dt); |
| vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ |
| |
| sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP); |
| vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ |
| } |
| print_func_exit(); |
| return vcpu; |
| } |
| |
| /* |
| * Switches to specified vcpu, until a matching vcpu_put() |
| */ |
| static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot) |
| { |
	struct litevm_vcpu *ret;
| print_func_entry(); |
| struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot]; |
| |
| printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu); |
| |
| qlock(&vcpu->mutex); |
| printk("after qlock\n"); |
| if (!vcpu->vmcs) { |
| qunlock(&vcpu->mutex); |
| error("vcpu->vmcs is NULL"); |
| } |
| ret = __vcpu_load(vcpu); |
	print_func_exit();
	return ret;
}
| |
| static void vcpu_put(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| //put_cpu(); |
| qunlock(&vcpu->mutex); |
| print_func_exit(); |
| } |
| |
| |
| static struct vmcs *alloc_vmcs_cpu(int cpu) |
| { |
| print_func_entry(); |
| int node = node_id(); |
| struct vmcs *vmcs; |
| |
| vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT); |
	if (!vmcs) {
| print_func_exit(); |
| return 0; |
| } |
| memset(vmcs, 0, vmcs_descriptor.size); |
| vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */ |
| print_func_exit(); |
| return vmcs; |
| } |
| |
| static struct vmcs *alloc_vmcs(void) |
| { |
| struct vmcs *ret; |
| print_func_entry(); |
| ret = alloc_vmcs_cpu(core_id()); |
| print_func_exit(); |
| return ret; |
| } |
| |
| static int cpu_has_litevm_support(void) |
| { |
| print_func_entry(); |
| uint32_t ecx = cpuid_ecx(1); |
| print_func_exit(); |
	return ecx & (1 << 5); /* CPUID.1:ECX.VMX[bit 5] -> VT */
| } |
| |
| static int vmx_disabled_by_bios(void) |
| { |
| print_func_entry(); |
| uint64_t msr; |
| |
| msr = read_msr(MSR_IA32_FEATURE_CONTROL); |
| print_func_exit(); |
| return (msr & 5) == 1; /* locked but not enabled */ |
| } |
| |
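/*
 * Per-core VMX enable, run via an smp call: allocate and initialize this
 * core's VMXON region, set the lock/enable bits in IA32_FEATURE_CONTROL if
 * the BIOS left them clear, turn on CR4.VMXE and CR0.NE, poke the A20 gate,
 * and finally execute VMXON.
 */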
| static void vm_enable(struct hw_trapframe *hw_tf, void *garbage) |
| { |
| print_func_entry(); |
| int cpu = hw_core_id(); |
| uint64_t phys_addr; |
| uint64_t old; |
| uint64_t status = 0; |
| currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order, |
| KMALLOC_WAIT); |
| if (! currentcpu->vmxarea) |
| return; |
| memset(currentcpu->vmxarea, 0, vmcs_descriptor.size); |
| currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id; |
| phys_addr = PADDR(currentcpu->vmxarea); |
| printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(), |
| currentcpu->vmxarea, (void *)phys_addr); |
| if (phys_addr & 0xfff){ |
| printk("fix vmxarea alignment!"); |
| } |
| printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE); |
| old = read_msr(MSR_IA32_FEATURE_CONTROL); |
| printk("%d: vm_enable, old is %d\n", core_id(), old); |
| if ((old & 5) == 0){ |
| /* enable and lock */ |
| write_msr(MSR_IA32_FEATURE_CONTROL, old | 5); |
| old = read_msr(MSR_IA32_FEATURE_CONTROL); |
| printk("%d:vm_enable, tried to set 5, old is %d\n", core_id(), old); |
| } |
| printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE); |
| lcr4(rcr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */ |
| printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE); |
| printk("%d:cr0 is %x\n", core_id(), rcr0()); |
| lcr0(rcr0() | 0x20); |
| printk("%d:cr0 is %x\n", core_id(), rcr0()); |
| printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92)); |
| outb(0x92, inb(0x92)|2); |
| printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92)); |
| asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:" \ |
| : "=m" (status) : "m"(phys_addr) : "memory", "cc"); |
| printk("%d:vmxon status is %d\n", core_id(), status); |
| printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE); |
| if (! status){ |
| printk("%d:vm_enable: status says fail\n", core_id()); |
| } |
| print_func_exit(); |
| } |
| |
| static void litevm_disable(void *garbage) |
| { |
| print_func_entry(); |
| asm volatile ("vmxoff" : : : "cc"); |
| print_func_exit(); |
| } |
| |
| struct litevm *vmx_open(void) |
| { |
| print_func_entry(); |
| struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT); |
| int i; |
| |
| if (!litevm) { |
| printk("NO LITEVM! MAKES NO SENSE!\n"); |
| error("litevm alloc failed"); |
| print_func_exit(); |
| return 0; |
| } |
| |
| spinlock_init_irqsave(&litevm->lock); |
| LIST_INIT(&litevm->link); |
| for (i = 0; i < LITEVM_MAX_VCPUS; ++i) { |
| struct litevm_vcpu *vcpu = &litevm->vcpus[i]; |
| |
| qlock_init(&vcpu->mutex); |
| vcpu->mmu.root_hpa = INVALID_PAGE; |
| LIST_INIT(&vcpu->link); |
| } |
| printk("vmx_open: busy %d\n", litevm->busy); |
| printk("return %p\n", litevm); |
| print_func_exit(); |
| return litevm; |
| } |
| |
| /* |
| * Free any memory in @free but not in @dont. |
| */ |
| static void litevm_free_physmem_slot(struct litevm_memory_slot *free, |
| struct litevm_memory_slot *dont) |
| { |
| print_func_entry(); |
| int i; |
| |
| if (!dont || free->phys_mem != dont->phys_mem) |
| if (free->phys_mem) { |
| for (i = 0; i < free->npages; ++i){ |
| page_t *page = free->phys_mem[i]; |
| page_decref(page); |
| assert(page_is_free(page2ppn(page))); |
| } |
| kfree(free->phys_mem); |
| } |
| |
| if (!dont || free->dirty_bitmap != dont->dirty_bitmap) |
| kfree(free->dirty_bitmap); |
| |
| free->phys_mem = 0; |
| free->npages = 0; |
| free->dirty_bitmap = 0; |
| print_func_exit(); |
| } |
| |
| static void litevm_free_physmem(struct litevm *litevm) |
| { |
| print_func_entry(); |
| int i; |
| |
| for (i = 0; i < litevm->nmemslots; ++i) |
| litevm_free_physmem_slot(&litevm->memslots[i], 0); |
| print_func_exit(); |
| } |
| |
| static void litevm_free_vmcs(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| if (vcpu->vmcs) { |
| handler_wrapper_t *w; |
| smp_call_function_all(__vcpu_clear, vcpu, &w); |
| smp_call_wait(w); |
| //free_vmcs(vcpu->vmcs); |
| vcpu->vmcs = 0; |
| } |
| print_func_exit(); |
| } |
| |
| static void litevm_free_vcpu(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| litevm_free_vmcs(vcpu); |
| litevm_mmu_destroy(vcpu); |
| print_func_exit(); |
| } |
| |
| static void litevm_free_vcpus(struct litevm *litevm) |
| { |
| print_func_entry(); |
| unsigned int i; |
| |
| for (i = 0; i < LITEVM_MAX_VCPUS; ++i) |
| litevm_free_vcpu(&litevm->vcpus[i]); |
| print_func_exit(); |
| } |
| |
| static int litevm_dev_release(struct litevm *litevm) |
| { |
| print_func_entry(); |
| |
| litevm_free_vcpus(litevm); |
| litevm_free_physmem(litevm); |
| kfree(litevm); |
| print_func_exit(); |
| return 0; |
| } |
| |
| unsigned long vmcs_readl(unsigned long field) |
| { |
| print_func_entry(); |
| unsigned long value; |
| |
| asm volatile ("vmread %1, %0" : "=g"(value) : "r"(field) : "cc"); |
| print_func_exit(); |
| return value; |
| } |
| |
| void vmcs_writel(unsigned long field, unsigned long value) |
| { |
| print_func_entry(); |
| uint8_t error; |
| |
| asm volatile ("vmwrite %1, %2; setna %0" |
| : "=g"(error) : "r"(value), "r"(field) : "cc" ); |
| if (error) |
| printk("vmwrite error: reg %lx value %lx (err %d)\n", |
| field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); |
| print_func_exit(); |
| } |
| |
| static void vmcs_write16(unsigned long field, uint16_t value) |
| { |
| print_func_entry(); |
| vmcs_writel(field, value); |
| print_func_exit(); |
| } |
| |
| static void vmcs_write64(unsigned long field, uint64_t value) |
| { |
| print_func_entry(); |
| #ifdef __x86_64__ |
| vmcs_writel(field, value); |
| #else |
| vmcs_writel(field, value); |
| asm volatile (""); |
| vmcs_writel(field+1, value >> 32); |
| #endif |
| print_func_exit(); |
| } |
| |
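/*
 * Queue a general-protection fault (#GP, error code 0) for injection on the
 * next VM entry via the VM-entry interruption-information field.
 */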
| static void inject_gp(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| printd("inject_general_protection: rip 0x%lx\n", |
| vmcs_readl(GUEST_RIP)); |
| vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); |
| vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
| GP_VECTOR | |
| INTR_TYPE_EXCEPTION | |
| INTR_INFO_DELIEVER_CODE_MASK | |
| INTR_INFO_VALID_MASK); |
| print_func_exit(); |
| } |
| |
| static void update_exception_bitmap(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| if (vcpu->rmode.active) |
| vmcs_write32(EXCEPTION_BITMAP, ~0); |
| else |
| vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); |
| print_func_exit(); |
| } |
| |
| static void enter_pmode(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| unsigned long flags; |
| |
| vcpu->rmode.active = 0; |
| |
| vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); |
| vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); |
| vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); |
| |
| flags = vmcs_readl(GUEST_RFLAGS); |
| flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); |
| flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); |
| vmcs_writel(GUEST_RFLAGS, flags); |
| |
| vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) | |
| (vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK) ); |
| |
| update_exception_bitmap(vcpu); |
| |
| #define FIX_PMODE_DATASEG(seg, save) { \ |
| vmcs_write16(GUEST_##seg##_SELECTOR, 0); \ |
| vmcs_writel(GUEST_##seg##_BASE, 0); \ |
| vmcs_write32(GUEST_##seg##_LIMIT, 0xffff); \ |
| vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93); \ |
| } |
| |
| FIX_PMODE_DATASEG(SS, vcpu->rmode.ss); |
| FIX_PMODE_DATASEG(ES, vcpu->rmode.es); |
| FIX_PMODE_DATASEG(DS, vcpu->rmode.ds); |
| FIX_PMODE_DATASEG(GS, vcpu->rmode.gs); |
| FIX_PMODE_DATASEG(FS, vcpu->rmode.fs); |
| |
| vmcs_write16(GUEST_CS_SELECTOR, |
| vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); |
| vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); |
| print_func_exit(); |
| } |
| |
| static int rmode_tss_base(struct litevm* litevm) |
| { |
| print_func_entry(); |
| gfn_t base_gfn = litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3; |
| print_func_exit(); |
| return base_gfn << PAGE_SHIFT; |
| } |
| |
| static void enter_rmode(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| unsigned long flags; |
| |
| vcpu->rmode.active = 1; |
| |
| vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); |
| vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm)); |
| |
| vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); |
| vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); |
| |
| vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); |
| vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); |
| |
| flags = vmcs_readl(GUEST_RFLAGS); |
| vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; |
| |
| flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
| |
| vmcs_writel(GUEST_RFLAGS, flags); |
| vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK); |
| update_exception_bitmap(vcpu); |
| |
| #define FIX_RMODE_SEG(seg, save) { \ |
| vmcs_write16(GUEST_##seg##_SELECTOR, \ |
| vmcs_readl(GUEST_##seg##_BASE) >> 4); \ |
| vmcs_write32(GUEST_##seg##_LIMIT, 0xffff); \ |
| vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3); \ |
| } |
| |
| vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); |
| vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); |
| |
| FIX_RMODE_SEG(ES, vcpu->rmode.es); |
| FIX_RMODE_SEG(DS, vcpu->rmode.ds); |
| FIX_RMODE_SEG(SS, vcpu->rmode.ss); |
| FIX_RMODE_SEG(GS, vcpu->rmode.gs); |
| FIX_RMODE_SEG(FS, vcpu->rmode.fs); |
| print_func_exit(); |
| } |
| |
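/*
 * Build the three-page real-mode TSS at the top of memory slot 0: a base TSS
 * whose I/O map base points past the interrupt redirection map, followed by
 * the I/O permission bitmap, terminated with an all-ones byte.
 */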
| static int init_rmode_tss(struct litevm* litevm) |
| { |
| print_func_entry(); |
| struct page *p1, *p2, *p3; |
| gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT; |
| char *page; |
| |
| p1 = _gfn_to_page(litevm, fn++); |
| p2 = _gfn_to_page(litevm, fn++); |
| p3 = _gfn_to_page(litevm, fn); |
| |
| if (!p1 || !p2 || !p3) { |
| printk("%s: gfn_to_page failed\n", __FUNCTION__); |
| print_func_exit(); |
| return 0; |
| } |
| |
| page = page2kva(p1); |
| memset(page, 0, PAGE_SIZE); |
	*(uint16_t*)(page + TSS_IOPB_BASE_OFFSET) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
| |
| page = page2kva(p2); |
| memset(page, 0, PAGE_SIZE); |
| |
| page = page2kva(p3); |
| memset(page, 0, PAGE_SIZE); |
| *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; |
| |
| print_func_exit(); |
| return 1; |
| } |
| |
| #ifdef __x86_64__ |
| |
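/*
 * Update the guest's shadow EFER and the VM-entry IA-32e mode control, and
 * mirror the value into the guest MSR save area (clearing LME when LMA is
 * not being set).
 */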
| static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer) |
| { |
| print_func_entry(); |
| struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER); |
| |
| vcpu->shadow_efer = efer; |
| if (efer & EFER_LMA) { |
| vmcs_write32(VM_ENTRY_CONTROLS, |
| vmcs_read32(VM_ENTRY_CONTROLS) | |
| VM_ENTRY_CONTROLS_IA32E_MASK); |
| msr->data = efer; |
| |
| } else { |
| vmcs_write32(VM_ENTRY_CONTROLS, |
| vmcs_read32(VM_ENTRY_CONTROLS) & |
| ~VM_ENTRY_CONTROLS_IA32E_MASK); |
| |
| msr->data = efer & ~EFER_LME; |
| } |
| print_func_exit(); |
| } |
| |
| static void enter_lmode(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| uint32_t guest_tr_ar; |
| |
| guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); |
| if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { |
| printd("%s: tss fixup for long mode. \n", |
| __FUNCTION__); |
| vmcs_write32(GUEST_TR_AR_BYTES, |
| (guest_tr_ar & ~AR_TYPE_MASK) |
| | AR_TYPE_BUSY_64_TSS); |
| } |
| |
| vcpu->shadow_efer |= EFER_LMA; |
| |
| find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME; |
| vmcs_write32(VM_ENTRY_CONTROLS, |
| vmcs_read32(VM_ENTRY_CONTROLS) |
| | VM_ENTRY_CONTROLS_IA32E_MASK); |
| print_func_exit(); |
| } |
| |
| static void exit_lmode(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| vcpu->shadow_efer &= ~EFER_LMA; |
| |
| vmcs_write32(VM_ENTRY_CONTROLS, |
| vmcs_read32(VM_ENTRY_CONTROLS) |
| & ~VM_ENTRY_CONTROLS_IA32E_MASK); |
| print_func_exit(); |
| } |
| |
| #endif |
| |
| static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0) |
| { |
| print_func_entry(); |
| if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) |
| enter_pmode(vcpu); |
| |
| if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) |
| enter_rmode(vcpu); |
| |
| #ifdef __x86_64__ |
| if (vcpu->shadow_efer & EFER_LME) { |
| if (!is_paging() && (cr0 & CR0_PG_MASK)) |
| enter_lmode(vcpu); |
| if (is_paging() && !(cr0 & CR0_PG_MASK)) |
| exit_lmode(vcpu); |
| } |
| #endif |
| |
| vmcs_writel(CR0_READ_SHADOW, cr0); |
| vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON); |
| print_func_exit(); |
| } |
| |
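/*
 * For PAE paging: read the four PDPTEs referenced by cr3 out of guest memory
 * and report whether any present entry has reserved bits set.
 */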
| static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu, |
| unsigned long cr3) |
| { |
| print_func_entry(); |
| gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; |
| unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5; |
| int i; |
| uint64_t pdpte; |
| uint64_t *pdpt; |
| struct litevm_memory_slot *memslot; |
| |
| spin_lock_irqsave(&vcpu->litevm->lock); |
| memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn); |
| /* FIXME: !memslot - emulate? 0xff? */ |
| pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn)); |
| |
| for (i = 0; i < 4; ++i) { |
| pdpte = pdpt[offset + i]; |
| if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) |
| break; |
| } |
| |
| spin_unlock(&vcpu->litevm->lock); |
| |
| print_func_exit(); |
| return i != 4; |
| } |
| |
| static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0) |
| { |
| print_func_entry(); |
| if (cr0 & CR0_RESEVED_BITS) { |
| printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", |
| cr0, guest_cr0()); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| |
| if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) { |
| printd("set_cr0: #GP, CD == 0 && NW == 1\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| |
| if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) { |
| printd("set_cr0: #GP, set PG flag " |
| "and a clear PE flag\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| |
| if (!is_paging() && (cr0 & CR0_PG_MASK)) { |
| #ifdef __x86_64__ |
| if ((vcpu->shadow_efer & EFER_LME)) { |
| uint32_t guest_cs_ar; |
| if (!is_pae()) { |
| printd("set_cr0: #GP, start paging " |
| "in long mode while PAE is disabled\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES); |
| if (guest_cs_ar & SEGMENT_AR_L_MASK) { |
| printd("set_cr0: #GP, start paging " |
| "in long mode while CS.L == 1\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| |
| } |
| } else |
| #endif |
| if (is_pae() && |
| pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { |
| printd("set_cr0: #GP, pdptrs " |
| "reserved bits\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| |
| } |
| |
| __set_cr0(vcpu, cr0); |
| litevm_mmu_reset_context(vcpu); |
| print_func_exit(); |
| return; |
| } |
| |
| static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw) |
| { |
| print_func_entry(); |
| unsigned long cr0 = guest_cr0(); |
| |
| if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) { |
| enter_pmode(vcpu); |
| vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK); |
| |
| } else |
| printd("lmsw: unexpected\n"); |
| |
| vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK) |
| | (msw & LMSW_GUEST_MASK)); |
| print_func_exit(); |
| } |
| |
| static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4) |
| { |
| print_func_entry(); |
| vmcs_writel(CR4_READ_SHADOW, cr4); |
| vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? |
| LITEVM_RMODE_VM_CR4_ALWAYS_ON : LITEVM_PMODE_VM_CR4_ALWAYS_ON)); |
| print_func_exit(); |
| } |
| |
| static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4) |
| { |
| print_func_entry(); |
| if (cr4 & CR4_RESEVED_BITS) { |
| printd("set_cr4: #GP, reserved bits\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| |
| if (is_long_mode()) { |
| if (!(cr4 & CR4_PAE_MASK)) { |
| printd("set_cr4: #GP, clearing PAE while " |
| "in long mode\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| } else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK) |
| && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { |
| printd("set_cr4: #GP, pdptrs reserved bits\n"); |
| inject_gp(vcpu); |
| } |
| |
| if (cr4 & CR4_VMXE_MASK) { |
| printd("set_cr4: #GP, setting VMXE\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| __set_cr4(vcpu, cr4); |
| spin_lock_irqsave(&vcpu->litevm->lock); |
| litevm_mmu_reset_context(vcpu); |
| spin_unlock(&vcpu->litevm->lock); |
| print_func_exit(); |
| } |
| |
| static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3) |
| { |
| print_func_entry(); |
| if (is_long_mode()) { |
| if ( cr3 & CR3_L_MODE_RESEVED_BITS) { |
| printd("set_cr3: #GP, reserved bits\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| } else { |
| if (cr3 & CR3_RESEVED_BITS) { |
| printd("set_cr3: #GP, reserved bits\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| if (is_paging() && is_pae() && |
| pdptrs_have_reserved_bits_set(vcpu, cr3)) { |
| printd("set_cr3: #GP, pdptrs " |
| "reserved bits\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| } |
| |
| vcpu->cr3 = cr3; |
| spin_lock_irqsave(&vcpu->litevm->lock); |
| vcpu->mmu.new_cr3(vcpu); |
| spin_unlock(&vcpu->litevm->lock); |
| print_func_exit(); |
| } |
| |
| static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8) |
| { |
| print_func_entry(); |
| if ( cr8 & CR8_RESEVED_BITS) { |
| printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| vcpu->cr8 = cr8; |
| print_func_exit(); |
| } |
| |
| static uint32_t get_rdx_init_val(void) |
| { |
| print_func_entry(); |
| uint32_t val; |
| |
| asm ("movl $1, %%eax \n\t" |
| "movl %%eax, %0 \n\t" : "=g"(val) ); |
| print_func_exit(); |
| return val; |
| |
| } |
| |
| static void fx_init(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| struct __attribute__ ((__packed__)) fx_image_s { |
| uint16_t control; //fcw |
| uint16_t status; //fsw |
| uint16_t tag; // ftw |
| uint16_t opcode; //fop |
| uint64_t ip; // fpu ip |
| uint64_t operand;// fpu dp |
| uint32_t mxcsr; |
| uint32_t mxcsr_mask; |
| |
| } *fx_image; |
| |
| fx_save(vcpu->host_fx_image); |
| fpu_init(); |
| fx_save(vcpu->guest_fx_image); |
| fx_restore(vcpu->host_fx_image); |
| |
| fx_image = (struct fx_image_s *)vcpu->guest_fx_image; |
| fx_image->mxcsr = 0x1f80; |
| memset(vcpu->guest_fx_image + sizeof(struct fx_image_s), |
| 0, FX_IMAGE_SIZE - sizeof(struct fx_image_s)); |
| print_func_exit(); |
| } |
| |
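/*
 * Write a 32-bit control field after applying the constraints advertised by
 * the given VMX capability MSR: the low dword gives bits that must be 1, the
 * high dword gives bits that are allowed to be 1.
 */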
| static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field, uint32_t val) |
| { |
| print_func_entry(); |
| uint32_t msr_high, msr_low; |
| uint64_t msrval; |
| |
| msrval = read_msr(msr); |
| msr_low = msrval; |
| msr_high = (msrval>>32); |
| |
| val &= msr_high; |
| val |= msr_low; |
| vmcs_write32(vmcs_field, val); |
| print_func_exit(); |
| } |
| |
| /* |
| * Sets up the vmcs for emulated real mode. |
| */ |
| static int litevm_vcpu_setup(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| /* no op on x86_64 */ |
| #define asmlinkage |
| extern asmlinkage void litevm_vmx_return(void); |
| uint32_t host_sysenter_cs; |
| uint32_t junk; |
| uint64_t a; |
| struct descriptor_table dt; |
| int i; |
| int ret; |
| uint64_t tsc; |
| int nr_good_msrs; |
| |
| |
| if (!init_rmode_tss(vcpu->litevm)) { |
| error("vcpu_setup: init_rmode_tss failed"); |
| } |
| |
| memset(vcpu->regs, 0, sizeof(vcpu->regs)); |
| vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); |
| vcpu->cr8 = 0; |
| vcpu->apic_base = 0xfee00000 | |
| /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | |
| MSR_IA32_APICBASE_ENABLE; |
| |
| fx_init(vcpu); |
| |
| #define SEG_SETUP(seg) do { \ |
| vmcs_write16(GUEST_##seg##_SELECTOR, 0); \ |
| vmcs_writel(GUEST_##seg##_BASE, 0); \ |
| vmcs_write32(GUEST_##seg##_LIMIT, 0xffff); \ |
| vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93); \ |
| } while (0) |
| |
| /* |
| * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode |
| * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. |
| */ |
| vmcs_write16(GUEST_CS_SELECTOR, 0xf000); |
| vmcs_writel(GUEST_CS_BASE, 0x000f0000); |
| vmcs_write32(GUEST_CS_LIMIT, 0xffff); |
| vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); |
| |
| SEG_SETUP(DS); |
| SEG_SETUP(ES); |
| SEG_SETUP(FS); |
| SEG_SETUP(GS); |
| SEG_SETUP(SS); |
| |
| vmcs_write16(GUEST_TR_SELECTOR, 0); |
| vmcs_writel(GUEST_TR_BASE, 0); |
| vmcs_write32(GUEST_TR_LIMIT, 0xffff); |
| vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); |
| |
| vmcs_write16(GUEST_LDTR_SELECTOR, 0); |
| vmcs_writel(GUEST_LDTR_BASE, 0); |
| vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); |
| vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); |
| |
| vmcs_write32(GUEST_SYSENTER_CS, 0); |
| vmcs_writel(GUEST_SYSENTER_ESP, 0); |
| vmcs_writel(GUEST_SYSENTER_EIP, 0); |
| |
| vmcs_writel(GUEST_RFLAGS, 0x02); |
| vmcs_writel(GUEST_RIP, 0xfff0); |
| vmcs_writel(GUEST_RSP, 0); |
| |
| vmcs_writel(GUEST_CR3, 0); |
| |
| //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 |
| vmcs_writel(GUEST_DR7, 0x400); |
| |
| vmcs_writel(GUEST_GDTR_BASE, 0); |
| vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); |
| |
| vmcs_writel(GUEST_IDTR_BASE, 0); |
| vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); |
| |
| vmcs_write32(GUEST_ACTIVITY_STATE, 0); |
| vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); |
| vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); |
| |
| /* I/O */ |
| vmcs_write64(IO_BITMAP_A, 0); |
| vmcs_write64(IO_BITMAP_B, 0); |
| |
| tsc = read_tsc(); |
| vmcs_write64(TSC_OFFSET, -tsc); |
| |
| vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ |
| |
| /* Special registers */ |
| vmcs_write64(GUEST_IA32_DEBUGCTL, 0); |
| |
| /* Control */ |
| vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR, |
| PIN_BASED_VM_EXEC_CONTROL, |
| PIN_BASED_EXT_INTR_MASK /* 20.6.1 */ |
| | PIN_BASED_NMI_EXITING /* 20.6.1 */ |
| ); |
| vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR, |
| CPU_BASED_VM_EXEC_CONTROL, |
| CPU_BASED_HLT_EXITING /* 20.6.2 */ |
| | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ |
| | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ |
| | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ |
| | CPU_BASED_INVDPG_EXITING |
| | CPU_BASED_MOV_DR_EXITING |
| | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ |
| ); |
| |
| vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); |
| vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); |
| vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); |
| vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ |
| |
| vmcs_writel(HOST_CR0, rcr0()); /* 22.2.3 */ |
| vmcs_writel(HOST_CR4, rcr4()); /* 22.2.3, 22.2.5 */ |
| vmcs_writel(HOST_CR3, rcr3()); /* 22.2.3 FIXME: shadow tables */ |
| |
| #warning "not setting selectors; do we need them?" |
| #if 0 |
| vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ |
| vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ |
| vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ |
| #endif |
| vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ |
| vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ |
| #if 0 |
| vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ |
| #endif |
| #ifdef __x86_64__ |
| a = read_msr(MSR_FS_BASE); |
| vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ |
| a = read_msr(MSR_GS_BASE); |
| vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ |
| #else |
| vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ |
| vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ |
| #endif |
| |
| #warning "Not setting HOST_TR_SELECTOR" |
| #if 0 |
| vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ |
| #endif |
| |
| get_idt(&dt); |
| vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ |
| |
| |
| vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return); /* 22.2.5 */ |
| |
| /* it's the HIGH 32 bits! */ |
| host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32; |
| vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); |
| a = read_msr(MSR_IA32_SYSENTER_ESP); |
| vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ |
| a = read_msr(MSR_IA32_SYSENTER_EIP); |
| vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ |
| |
| ret = -ENOMEM; |
| vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT); |
| if (!vcpu->guest_msrs) |
| error("guest_msrs kmalloc failed"); |
| vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT); |
| if (!vcpu->host_msrs) |
| error("vcpu->host_msrs kmalloc failed -- storage leaked"); |
| |
| for (i = 0; i < NR_VMX_MSR; ++i) { |
| uint32_t index = vmx_msr_index[i]; |
| uint32_t data_low, data_high; |
| uint64_t data; |
| int j = vcpu->nmsrs; |
| |
| #warning "need readmsr_safe" |
| // if (rdmsr_safe(index, &data_low, &data_high) < 0) |
| // continue; |
| data = read_msr(index); |
| vcpu->host_msrs[j].index = index; |
| vcpu->host_msrs[j].reserved = 0; |
| vcpu->host_msrs[j].data = data; |
| vcpu->guest_msrs[j] = vcpu->host_msrs[j]; |
| ++vcpu->nmsrs; |
| } |
| printk("msrs: %d\n", vcpu->nmsrs); |
| |
| nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS; |
| vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, |
| PADDR(vcpu->guest_msrs + NR_BAD_MSRS)); |
| vmcs_writel(VM_EXIT_MSR_STORE_ADDR, |
| PADDR(vcpu->guest_msrs + NR_BAD_MSRS)); |
| vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, |
| PADDR(vcpu->host_msrs + NR_BAD_MSRS)); |
| vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS, |
| (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */ |
| vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ |
| vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ |
| vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ |
| |
| |
| /* 22.2.1, 20.8.1 */ |
| vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, |
| VM_ENTRY_CONTROLS, 0); |
| vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ |
| |
| vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0); |
| vmcs_writel(TPR_THRESHOLD, 0); |
| |
| vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK); |
| vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK); |
| |
| __set_cr0(vcpu, 0x60000010); // enter rmode |
| __set_cr4(vcpu, 0); |
| #ifdef __x86_64__ |
| __set_efer(vcpu, 0); |
| #endif |
| |
| ret = litevm_mmu_init(vcpu); |
| |
| print_func_exit(); |
| return ret; |
| |
| out_free_guest_msrs: |
| kfree(vcpu->guest_msrs); |
| out: |
| return ret; |
| } |
| |
| /* |
| * Sync the rsp and rip registers into the vcpu structure. This allows |
| * registers to be accessed by indexing vcpu->regs. |
| */ |
| static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); |
| vcpu->rip = vmcs_readl(GUEST_RIP); |
| print_func_exit(); |
| } |
| |
| /* |
| * Syncs rsp and rip back into the vmcs. Should be called after possible |
| * modification. |
| */ |
| static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); |
| vmcs_writel(GUEST_RIP, vcpu->rip); |
| print_func_exit(); |
| } |
| |
| /* |
| * Creates some virtual cpus. Good luck creating more than one. |
| */ |
| int vmx_create_vcpu(struct litevm *litevm, int n) |
| { |
| print_func_entry(); |
| ERRSTACK(1); |
| int r; |
| struct litevm_vcpu *vcpu; |
| struct vmcs *vmcs; |
| char *errstring = NULL; |
| |
| if (n < 0 || n >= LITEVM_MAX_VCPUS){ |
| printk("%d is out of range; LITEVM_MAX_VCPUS is %d", n, LITEVM_MAX_VCPUS); |
| error("%d is out of range; LITEVM_MAX_VCPUS is %d", n, LITEVM_MAX_VCPUS); |
| } |
| |
| vcpu = &litevm->vcpus[n]; |
| |
| qlock(&vcpu->mutex); |
| |
| if (vcpu->vmcs) { |
| qunlock(&vcpu->mutex); |
| printk("VM already exists\n"); |
| error("VM already exists"); |
| } |
| |
| /* I'm a bad person */ |
| //ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN); |
| uint64_t a = (uint64_t) vcpu->fx_buf; |
| a += FX_IMAGE_ALIGN-1; |
| a /= FX_IMAGE_ALIGN; |
| a *= FX_IMAGE_ALIGN; |
| |
| vcpu->host_fx_image = (char*)a; |
| vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; |
| |
| vcpu->cpu = -1; /* First load will set up TR */ |
| vcpu->litevm = litevm; |
| |
| vmcs = alloc_vmcs(); |
| if (!vmcs) { |
| errstring = "vmcs allocate failed"; |
| printk("%s\n", errstring); |
| qunlock(&vcpu->mutex); |
| goto out_free_vcpus; |
| } |
| vmcs_clear(vmcs); |
| printk("after vmcs_clear\n"); |
| vcpu->vmcs = vmcs; |
| vcpu->launched = 0; |
| printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs); |
| error("before vcpu_load"); |
| __vcpu_load(vcpu); |
| |
| printk("PAST vcpu_load\n"); |
| if (waserror()){ |
| /* we really need to fix waserror() */ |
| poperror(); |
| goto out_free_vcpus; |
| } |
| |
| r = litevm_vcpu_setup(vcpu); |
| |
| vcpu_put(vcpu); |
| |
| printk("r is %d\n", r); |
| |
| if (! r) { |
| |
| print_func_exit(); |
| return 0; |
| } |
| |
| errstring = "vcup set failed"; |
| |
| out_free_vcpus: |
| printk("out_free_vcpus: life sucks\n"); |
| litevm_free_vcpu(vcpu); |
| error(errstring); |
| out: |
| print_func_exit(); |
| return r; |
| } |
| |
| /* |
| * Allocate some memory and give it an address in the guest physical address |
| * space. |
| * |
| * Discontiguous memory is allowed, mostly for framebuffers. |
| */ |
| int vm_set_memory_region(struct litevm *litevm, |
| struct litevm_memory_region *mem) |
| { |
| print_func_entry(); |
| ERRSTACK(2); |
| int r; |
| gfn_t base_gfn; |
| unsigned long npages; |
| unsigned long i; |
| struct litevm_memory_slot *memslot; |
| struct litevm_memory_slot old, new; |
| int memory_config_version; |
| void *init_data = mem->init_data; |
| int pass = 1; |
| |
| printk("litevm %p\n", litevm); |
| /* should not happen but ... */ |
| if (! litevm) |
| error("NULL litevm in %s", __func__); |
| |
| if (!mem) |
| error("NULL mem in %s", __func__); |
| |
| if (litevm->busy) |
| error("litevm->busy is set! 0x%x\n", litevm->busy); |
| r = -EINVAL; |
| /* General sanity checks */ |
| if (mem->memory_size & (PAGE_SIZE - 1)) |
| error("mem->memory_size %lld is not page-aligned", mem->memory_size); |
| if (mem->guest_phys_addr & (PAGE_SIZE - 1)) |
| error("guest_phys_addr 0x%llx is not page-aligned", mem->guest_phys_addr); |
| if (mem->slot >= LITEVM_MEMORY_SLOTS) |
| error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS); |
| if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) |
| error("0x%x + 0x%x is < 0x%x", |
| mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr); |
| |
| memslot = &litevm->memslots[mem->slot]; |
| base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; |
| npages = mem->memory_size >> PAGE_SHIFT; |
| |
| if (!npages) |
| mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES; |
| |
	/* This is actually a very tricky retry loop. The use of
	 * error() is a bit dangerous, so we don't use it much.
	 * Consider a rewrite. It would be nice if Akaros could do
	 * the allocation of a bunch of pages for us.
	 */
| raced: |
| printk("raced: pass %d\n", pass); |
| spin_lock_irqsave(&litevm->lock); |
| printk("locked\n"); |
| |
| if (waserror()){ |
| spin_unlock(&litevm->lock); |
| nexterror(); |
| } |
| |
| memory_config_version = litevm->memory_config_version; |
| new = old = *memslot; |
| |
| new.base_gfn = base_gfn; |
| new.npages = npages; |
| new.flags = mem->flags; |
| |
| /* Disallow changing a memory slot's size. */ |
| r = -EINVAL; |
| if (npages && old.npages && npages != old.npages) |
| error("npages is %d, old.npages is %d, can't change", |
| npages, old.npages); |
| |
| /* Check for overlaps */ |
| r = -EEXIST; |
| for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) { |
| struct litevm_memory_slot *s = &litevm->memslots[i]; |
| |
| if (s == memslot) |
| continue; |
| if (!((base_gfn + npages <= s->base_gfn) || |
| (base_gfn >= s->base_gfn + s->npages))) |
| error("Overlap"); |
| } |
| /* |
| * Do memory allocations outside lock. memory_config_version will |
| * detect any races. |
| */ |
| spin_unlock(&litevm->lock); |
| printk("unlocked\n"); |
| poperror(); |
| |
| /* Deallocate if slot is being removed */ |
| if (!npages) |
| new.phys_mem = 0; |
| |
| /* Free page dirty bitmap if unneeded */ |
| if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES)) |
| new.dirty_bitmap = 0; |
| |
| r = -ENOMEM; |
| |
| /* Allocate if a slot is being created */ |
| if (npages && !new.phys_mem) { |
| new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT); |
| |
| if (!new.phys_mem) |
| goto out_free; |
| |
| for (i = 0; i < npages; ++i) { |
| int ret; |
| ret = kpage_alloc(&new.phys_mem[i]); |
| if (ret != ESUCCESS) |
| goto out_free; |
| if (init_data){ |
| printk("init data memcpy(%p,%p,4096);\n", |
| page2kva(new.phys_mem[i]), init_data); |
| memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE); |
| init_data += PAGE_SIZE; |
| } |
| } |
| } |
| |
| /* Allocate page dirty bitmap if needed */ |
| if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { |
| unsigned dirty_bytes;//ALIGN(npages, BITS_PER_LONG) / 8; |
| dirty_bytes = (((npages + BITS_PER_LONG-1)/BITS_PER_LONG)*BITS_PER_LONG)/8; |
| |
| new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT); |
| if (!new.dirty_bitmap){ |
| printk("VM: alloc of %d bytes for map failed\n", dirty_bytes); |
| goto out_free; |
| } |
| } |
| |
| spin_lock_irqsave(&litevm->lock); |
| printk("locked\n"); |
| if (memory_config_version != litevm->memory_config_version) { |
| spin_unlock(&litevm->lock); |
| printk("unlocked, try again\n"); |
| litevm_free_physmem_slot(&new, &old); |
| goto raced; |
| } |
| |
| r = -EAGAIN; |
| if (litevm->busy){ |
| printk("BUSY!\n"); |
| goto out_unlock; |
| } |
| |
| if (mem->slot >= litevm->nmemslots) |
| litevm->nmemslots = mem->slot + 1; |
| |
| *memslot = new; |
| ++litevm->memory_config_version; |
| |
| spin_unlock(&litevm->lock); |
| printk("unlocked\n"); |
| for (i = 0; i < LITEVM_MAX_VCPUS; ++i) { |
| struct litevm_vcpu *vcpu; |
| |
| vcpu = vcpu_load(litevm, i); |
| if (!vcpu) |
| continue; |
| litevm_mmu_reset_context(vcpu); |
| vcpu_put(vcpu); |
| } |
| |
| litevm_free_physmem_slot(&old, &new); |
| print_func_exit(); |
| return 0; |
| |
| out_unlock: |
| spin_unlock(&litevm->lock); |
| printk("out_unlock\n"); |
| out_free: |
| printk("out_free\n"); |
| litevm_free_physmem_slot(&new, &old); |
| out: |
| printk("vm_set_memory_region: return %d\n", r); |
| print_func_exit(); |
| return r; |
| } |
| |
| #if 0 |
| /* |
| * Get (and clear) the dirty memory log for a memory slot. |
| */ |
| static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm, |
| struct litevm_dirty_log *log) |
| { |
| struct litevm_memory_slot *memslot; |
| int r, i; |
| int n; |
| unsigned long any = 0; |
| |
| spin_lock_irqsave(&litevm->lock); |
| |
| /* |
| * Prevent changes to guest memory configuration even while the lock |
| * is not taken. |
| */ |
| ++litevm->busy; |
| spin_unlock(&litevm->lock); |
| r = -EINVAL; |
| if (log->slot >= LITEVM_MEMORY_SLOTS) |
| goto out; |
| |
| memslot = &litevm->memslots[log->slot]; |
| r = -ENOENT; |
| if (!memslot->dirty_bitmap) |
| goto out; |
| |
| n = ALIGN(memslot->npages, 8) / 8; |
| |
| for (i = 0; !any && i < n; ++i) |
| any = memslot->dirty_bitmap[i]; |
| |
| r = -EFAULT; |
| if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) |
| goto out; |
| |
| |
| if (any) { |
| spin_lock_irqsave(&litevm->lock); |
| litevm_mmu_slot_remove_write_access(litevm, log->slot); |
| spin_unlock(&litevm->lock); |
| memset(memslot->dirty_bitmap, 0, n); |
| for (i = 0; i < LITEVM_MAX_VCPUS; ++i) { |
| struct litevm_vcpu *vcpu = vcpu_load(litevm, i); |
| |
| if (!vcpu) |
| continue; |
| flush_guest_tlb(vcpu); |
| vcpu_put(vcpu); |
| } |
| } |
| |
| r = 0; |
| |
| out: |
| spin_lock_irqsave(&litevm->lock); |
| --litevm->busy; |
| spin_unlock(&litevm->lock); |
| return r; |
| } |
| #endif |
| |
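/*
 * Find the memory slot containing the given guest frame number, or return 0
 * if it falls outside every slot.
 */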
| struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn) |
| { |
| print_func_entry(); |
| int i; |
| |
| for (i = 0; i < litevm->nmemslots; ++i) { |
| struct litevm_memory_slot *memslot = &litevm->memslots[i]; |
| |
| if (gfn >= memslot->base_gfn |
| && gfn < memslot->base_gfn + memslot->npages) { |
| print_func_exit(); |
| return memslot; |
| } |
| } |
| print_func_exit(); |
| return 0; |
| } |
| |
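/*
 * Set the dirty bit for a guest frame in its slot's dirty bitmap, if that
 * slot is being dirty-logged. Checks the bit first to avoid a needless
 * atomic read-modify-write.
 */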
| void mark_page_dirty(struct litevm *litevm, gfn_t gfn) |
| { |
| print_func_entry(); |
| int i; |
| struct litevm_memory_slot *memslot = 0; |
| unsigned long rel_gfn; |
| |
| for (i = 0; i < litevm->nmemslots; ++i) { |
| memslot = &litevm->memslots[i]; |
| |
| if (gfn >= memslot->base_gfn |
| && gfn < memslot->base_gfn + memslot->npages) { |
| |
| if (!memslot || !memslot->dirty_bitmap) { |
| print_func_exit(); |
| return; |
| } |
| |
| rel_gfn = gfn - memslot->base_gfn; |
| |
| /* avoid RMW */ |
| if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn)) |
| SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn); |
| print_func_exit(); |
| return; |
| } |
| } |
| print_func_exit(); |
| } |
| |
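/*
 * Advance the guest RIP past the instruction we just emulated and drop any
 * temporary interrupt-shadow (STI / MOV SS blocking) state.
 */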
| static void skip_emulated_instruction(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| unsigned long rip; |
| uint32_t interruptibility; |
| |
| rip = vmcs_readl(GUEST_RIP); |
| rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
| vmcs_writel(GUEST_RIP, rip); |
| |
| /* |
| * We emulated an instruction, so temporary interrupt blocking |
| * should be removed, if set. |
| */ |
| interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); |
| if (interruptibility & 3) |
| vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, |
| interruptibility & ~3); |
| print_func_exit(); |
| } |
| |
| static int emulator_read_std(unsigned long addr, |
| unsigned long *val, |
| unsigned int bytes, |
| struct x86_emulate_ctxt *ctxt) |
| { |
| print_func_entry(); |
| struct litevm_vcpu *vcpu = ctxt->vcpu; |
| void *data = val; |
| |
| while (bytes) { |
| gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); |
| unsigned offset = addr & (PAGE_SIZE-1); |
| unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ? |
| bytes : (unsigned)PAGE_SIZE - offset; |
| unsigned long pfn; |
| struct litevm_memory_slot *memslot; |
| void *page; |
| |
| if (gpa == UNMAPPED_GVA) { |
| print_func_exit(); |
| return X86EMUL_PROPAGATE_FAULT; |
| } |
| pfn = gpa >> PAGE_SHIFT; |
| memslot = gfn_to_memslot(vcpu->litevm, pfn); |
| if (!memslot) { |
| print_func_exit(); |
| return X86EMUL_UNHANDLEABLE; |
| } |
| page = page2kva(gfn_to_page(memslot, pfn)); |
| |
| memcpy(data, page + offset, tocopy); |
| |
| bytes -= tocopy; |
| data += tocopy; |
| addr += tocopy; |
| } |
| |
| print_func_exit(); |
| return X86EMUL_CONTINUE; |
| } |
| |
| static int emulator_write_std(unsigned long addr, |
| unsigned long val, |
| unsigned int bytes, |
| struct x86_emulate_ctxt *ctxt) |
| { |
| print_func_entry(); |
| printk("emulator_write_std: addr %lx n %d\n", |
| addr, bytes); |
| print_func_exit(); |
| return X86EMUL_UNHANDLEABLE; |
| } |
| |
| static int emulator_read_emulated(unsigned long addr, |
| unsigned long *val, |
| unsigned int bytes, |
| struct x86_emulate_ctxt *ctxt) |
| { |
| print_func_entry(); |
| struct litevm_vcpu *vcpu = ctxt->vcpu; |
| |
| if (vcpu->mmio_read_completed) { |
| memcpy(val, vcpu->mmio_data, bytes); |
| vcpu->mmio_read_completed = 0; |
| print_func_exit(); |
| return X86EMUL_CONTINUE; |
| } else if (emulator_read_std(addr, val, bytes, ctxt) |
| == X86EMUL_CONTINUE) { |
| print_func_exit(); |
| return X86EMUL_CONTINUE; |
| } |
| else { |
| gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); |
| if (gpa == UNMAPPED_GVA) { |
| print_func_exit(); |
| return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT; |
| } |
| vcpu->mmio_needed = 1; |
| vcpu->mmio_phys_addr = gpa; |
| vcpu->mmio_size = bytes; |
| vcpu->mmio_is_write = 0; |
| |
| print_func_exit(); |
| return X86EMUL_UNHANDLEABLE; |
| } |
| } |
| |
| static int emulator_write_emulated(unsigned long addr, |
| unsigned long val, |
| unsigned int bytes, |
| struct x86_emulate_ctxt *ctxt) |
| { |
| print_func_entry(); |
| struct litevm_vcpu *vcpu = ctxt->vcpu; |
| gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); |
| |
| if (gpa == UNMAPPED_GVA) { |
| print_func_exit(); |
| return X86EMUL_PROPAGATE_FAULT; |
| } |
| |
| vcpu->mmio_needed = 1; |
| vcpu->mmio_phys_addr = gpa; |
| vcpu->mmio_size = bytes; |
| vcpu->mmio_is_write = 1; |
| memcpy(vcpu->mmio_data, &val, bytes); |
| |
| print_func_exit(); |
| return X86EMUL_CONTINUE; |
| } |
| |
| static int emulator_cmpxchg_emulated(unsigned long addr, |
| unsigned long old, |
| unsigned long new, |
| unsigned int bytes, |
| struct x86_emulate_ctxt *ctxt) |
| { |
| print_func_entry(); |
| static int reported; |
| |
| if (!reported) { |
| reported = 1; |
| printk("litevm: emulating exchange as write\n"); |
| } |
| print_func_exit(); |
| return emulator_write_emulated(addr, new, bytes, ctxt); |
| } |
| |
| static void report_emulation_failure(struct x86_emulate_ctxt *ctxt) |
| { |
| print_func_entry(); |
| static int reported; |
| uint8_t opcodes[4]; |
| unsigned long rip = vmcs_readl(GUEST_RIP); |
| unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE); |
| |
| if (reported) { |
| print_func_exit(); |
| return; |
| } |
| |
| emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt); |
| |
| printk("emulation failed but !mmio_needed?" |
| " rip %lx %02x %02x %02x %02x\n", |
| rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); |
| reported = 1; |
| print_func_exit(); |
| } |
| |
| struct x86_emulate_ops emulate_ops = { |
| .read_std = emulator_read_std, |
| .write_std = emulator_write_std, |
| .read_emulated = emulator_read_emulated, |
| .write_emulated = emulator_write_emulated, |
| .cmpxchg_emulated = emulator_cmpxchg_emulated, |
| }; |
| |
| enum emulation_result { |
| EMULATE_DONE, /* no further processing */ |
| EMULATE_DO_MMIO, /* litevm_run filled with mmio request */ |
| EMULATE_FAIL, /* can't emulate this instruction */ |
| }; |
| |
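/*
 * Run the x86 instruction emulator on the faulting guest instruction. The
 * emulation context is built from the VMCS (CPU mode from RFLAGS.VM and the
 * CS access rights, segment bases for non-64-bit modes); the result maps to
 * EMULATE_DONE, EMULATE_DO_MMIO (with litevm_run->mmio filled in), or
 * EMULATE_FAIL.
 */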
| static int emulate_instruction(struct litevm_vcpu *vcpu, |
| struct litevm_run *run, |
| unsigned long cr2, |
| uint16_t error_code) |
| { |
| print_func_entry(); |
| struct x86_emulate_ctxt emulate_ctxt; |
| int r; |
| uint32_t cs_ar; |
| |
| vcpu_load_rsp_rip(vcpu); |
| |
| cs_ar = vmcs_read32(GUEST_CS_AR_BYTES); |
| |
| emulate_ctxt.vcpu = vcpu; |
| emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS); |
| emulate_ctxt.cr2 = cr2; |
| emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) |
| ? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK) |
| ? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK) |
| ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; |
| |
| if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { |
| emulate_ctxt.cs_base = 0; |
| emulate_ctxt.ds_base = 0; |
| emulate_ctxt.es_base = 0; |
| emulate_ctxt.ss_base = 0; |
| emulate_ctxt.gs_base = 0; |
| emulate_ctxt.fs_base = 0; |
| } else { |
| emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE); |
| emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE); |
| emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE); |
| emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE); |
| emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE); |
| emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE); |
| } |
| |
| vcpu->mmio_is_write = 0; |
| r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); |
| |
| if ((r || vcpu->mmio_is_write) && run) { |
| run->mmio.phys_addr = vcpu->mmio_phys_addr; |
| memcpy(run->mmio.data, vcpu->mmio_data, 8); |
| run->mmio.len = vcpu->mmio_size; |
| run->mmio.is_write = vcpu->mmio_is_write; |
| } |
| |
| if (r) { |
| if (!vcpu->mmio_needed) { |
| report_emulation_failure(&emulate_ctxt); |
| print_func_exit(); |
| return EMULATE_FAIL; |
| } |
| print_func_exit(); |
| return EMULATE_DO_MMIO; |
| } |
| |
| vcpu_put_rsp_rip(vcpu); |
| vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags); |
| |
| if (vcpu->mmio_is_write) { |
| print_func_exit(); |
| return EMULATE_DO_MMIO; |
| } |
| |
| print_func_exit(); |
| return EMULATE_DONE; |
| } |
| |
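| /* Replace the low 32 bits of a control register value with new_val, keeping |
| * the upper half intact. */ |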
| static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val) |
| { |
| print_func_entry(); |
| print_func_exit(); |
| return (curr_cr & ~((1ULL << 32) - 1)) | new_val; |
| } |
| |
| void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base) |
| { |
| print_func_entry(); |
| vmcs_writel(GUEST_GDTR_BASE, base); |
| vmcs_write32(GUEST_GDTR_LIMIT, limit); |
| print_func_exit(); |
| } |
| |
| void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base) |
| { |
| print_func_entry(); |
| vmcs_writel(GUEST_IDTR_BASE, base); |
| vmcs_write32(GUEST_IDTR_LIMIT, limit); |
| print_func_exit(); |
| } |
| |
| void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw, |
| unsigned long *rflags) |
| { |
| print_func_entry(); |
| lmsw(vcpu, msw); |
| *rflags = vmcs_readl(GUEST_RFLAGS); |
| print_func_exit(); |
| } |
| |
| unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr) |
| { |
| print_func_entry(); |
| switch (cr) { |
| case 0: |
| print_func_exit(); |
| return guest_cr0(); |
| case 2: |
| print_func_exit(); |
| return vcpu->cr2; |
| case 3: |
| print_func_exit(); |
| return vcpu->cr3; |
| case 4: |
| print_func_exit(); |
| return guest_cr4(); |
| default: |
| vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); |
| print_func_exit(); |
| return 0; |
| } |
| } |
| |
| void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val, |
| unsigned long *rflags) |
| { |
| print_func_entry(); |
| switch (cr) { |
| case 0: |
| set_cr0(vcpu, mk_cr_64(guest_cr0(), val)); |
| *rflags = vmcs_readl(GUEST_RFLAGS); |
| break; |
| case 2: |
| vcpu->cr2 = val; |
| break; |
| case 3: |
| set_cr3(vcpu, val); |
| break; |
| case 4: |
| set_cr4(vcpu, mk_cr_64(guest_cr4(), val)); |
| break; |
| default: |
| vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); |
| } |
| print_func_exit(); |
| } |
| |
| static int handle_rmode_exception(struct litevm_vcpu *vcpu, |
| int vec, uint32_t err_code) |
| { |
| print_func_entry(); |
| if (!vcpu->rmode.active) { |
| print_func_exit(); |
| return 0; |
| } |
| |
| if (vec == GP_VECTOR && err_code == 0) |
| if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) { |
| print_func_exit(); |
| return 1; |
| } |
| print_func_exit(); |
| return 0; |
| } |
| |
| static int handle_exception(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| uint32_t intr_info, error_code; |
| unsigned long cr2, rip; |
| uint32_t vect_info; |
| enum emulation_result er; |
| |
| vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
| intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
| |
| if ((vect_info & VECTORING_INFO_VALID_MASK) && |
| !is_page_fault(intr_info)) { |
| printk("%s: unexpected, vectoring info 0x%x " |
| "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); |
| } |
| |
| if (is_external_interrupt(vect_info)) { |
| int irq = vect_info & VECTORING_INFO_VECTOR_MASK; |
| SET_BITMASK_BIT_ATOMIC(((uint8_t *)&vcpu->irq_pending), irq); |
| SET_BITMASK_BIT_ATOMIC(((uint8_t *)&vcpu->irq_summary), irq / BITS_PER_LONG); |
| } |
| |
| if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ |
| asm ("int $2"); |
| print_func_exit(); |
| return 1; |
| } |
| error_code = 0; |
| rip = vmcs_readl(GUEST_RIP); |
| if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) |
| error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); |
| if (is_page_fault(intr_info)) { |
| cr2 = vmcs_readl(EXIT_QUALIFICATION); |
| |
| spin_lock_irqsave(&vcpu->litevm->lock); |
| if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) { |
| spin_unlock(&vcpu->litevm->lock); |
| print_func_exit(); |
| return 1; |
| } |
| |
| er = emulate_instruction(vcpu, litevm_run, cr2, error_code); |
| spin_unlock(&vcpu->litevm->lock); |
| |
| switch (er) { |
| case EMULATE_DONE: |
| print_func_exit(); |
| return 1; |
| case EMULATE_DO_MMIO: |
| ++litevm_stat.mmio_exits; |
| litevm_run->exit_reason = LITEVM_EXIT_MMIO; |
| print_func_exit(); |
| return 0; |
| case EMULATE_FAIL: |
| vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); |
| break; |
| default: |
| assert(0); |
| } |
| } |
| |
| if (vcpu->rmode.active && |
| handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, |
| error_code)) { |
| print_func_exit(); |
| return 1; |
| } |
| |
| if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { |
| litevm_run->exit_reason = LITEVM_EXIT_DEBUG; |
| print_func_exit(); |
| return 0; |
| } |
| litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION; |
| litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; |
| litevm_run->ex.error_code = error_code; |
| print_func_exit(); |
| return 0; |
| } |
| |
| static int handle_external_interrupt(struct litevm_vcpu *vcpu, |
| struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| ++litevm_stat.irq_exits; |
| print_func_exit(); |
| return 1; |
| } |
| |
| |
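| /* |
| * Work out the repeat count for a rep string I/O instruction. The address |
| * size implied by CS (or 16 bits in vm86 mode) selects cx/ecx/rcx; scanning |
| * the instruction's prefix bytes lets a 0x67 address-size override flip the |
| * width. Returns 1 with *count set from the low bits of RCX, or 0 if no |
| * opcode byte was found in the bytes that could be read. |
| */ |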
| static int get_io_count(struct litevm_vcpu *vcpu, uint64_t *count) |
| { |
| print_func_entry(); |
| uint64_t inst; |
| gva_t rip; |
| int countr_size; |
| int i, n; |
| |
| if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) { |
| countr_size = 2; |
| } else { |
| uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES); |
| |
| countr_size = (cs_ar & AR_L_MASK) ? 8: |
| (cs_ar & AR_DB_MASK) ? 4: 2; |
| } |
| |
| rip = vmcs_readl(GUEST_RIP); |
| if (countr_size != 8) |
| rip += vmcs_readl(GUEST_CS_BASE); |
| |
| n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst); |
| |
| for (i = 0; i < n; i++) { |
| switch (((uint8_t*)&inst)[i]) { |
| case 0xf0: |
| case 0xf2: |
| case 0xf3: |
| case 0x2e: |
| case 0x36: |
| case 0x3e: |
| case 0x26: |
| case 0x64: |
| case 0x65: |
| case 0x66: |
| break; |
| case 0x67: |
| countr_size = (countr_size == 2) ? 4: (countr_size >> 1); |
| default: |
| goto done; |
| } |
| } |
| print_func_exit(); |
| return 0; |
| done: |
| countr_size *= 8; |
| *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); |
| print_func_exit(); |
| return 1; |
| } |
| |
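| /* |
| * Decode an I/O instruction VM exit into a LITEVM_EXIT_IO request for |
| * userspace: exit-qualification bits 0-2 give size-1, bit 3 the direction, |
| * bit 4 string vs. non-string, bit 5 rep, and bits 16-31 the port number. |
| */ |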
| static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| uint64_t exit_qualification; |
| |
| ++litevm_stat.io_exits; |
| exit_qualification = vmcs_read64(EXIT_QUALIFICATION); |
| litevm_run->exit_reason = LITEVM_EXIT_IO; |
| if (exit_qualification & 8) |
| litevm_run->io.direction = LITEVM_EXIT_IO_IN; |
| else |
| litevm_run->io.direction = LITEVM_EXIT_IO_OUT; |
| litevm_run->io.size = (exit_qualification & 7) + 1; |
| litevm_run->io.string = (exit_qualification & 16) != 0; |
| litevm_run->io.string_down |
| = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; |
| litevm_run->io.rep = (exit_qualification & 32) != 0; |
| litevm_run->io.port = exit_qualification >> 16; |
| if (litevm_run->io.string) { |
| if (!get_io_count(vcpu, &litevm_run->io.count)) { |
| print_func_exit(); |
| return 1; |
| } |
| litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS); |
| } else |
| litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */ |
| print_func_exit(); |
| return 0; |
| } |
| |
| static int handle_invlpg(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| uint64_t address = vmcs_read64(EXIT_QUALIFICATION); |
| int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
| spin_lock_irqsave(&vcpu->litevm->lock); |
| vcpu->mmu.inval_page(vcpu, address); |
| spin_unlock(&vcpu->litevm->lock); |
| vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length); |
| print_func_exit(); |
| return 1; |
| } |
| |
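| /* |
| * Handle control-register accesses intercepted by VT-x. The exit |
| * qualification encodes the control register (bits 0-3), the access type |
| * (bits 4-5: mov-to, mov-from, clts, lmsw) and the gp register involved |
| * (bits 8-11); clts is left to the unhandled path below. |
| */ |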
| static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| uint64_t exit_qualification; |
| int cr; |
| int reg; |
| |
| #ifdef LITEVM_DEBUG |
| if (guest_cpl() != 0) { |
| vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return 1; |
| } |
| #endif |
| |
| exit_qualification = vmcs_read64(EXIT_QUALIFICATION); |
| cr = exit_qualification & 15; |
| reg = (exit_qualification >> 8) & 15; |
| switch ((exit_qualification >> 4) & 3) { |
| case 0: /* mov to cr */ |
| switch (cr) { |
| case 0: |
| vcpu_load_rsp_rip(vcpu); |
| set_cr0(vcpu, vcpu->regs[reg]); |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| return 1; |
| case 3: |
| vcpu_load_rsp_rip(vcpu); |
| set_cr3(vcpu, vcpu->regs[reg]); |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| return 1; |
| case 4: |
| vcpu_load_rsp_rip(vcpu); |
| set_cr4(vcpu, vcpu->regs[reg]); |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| return 1; |
| case 8: |
| vcpu_load_rsp_rip(vcpu); |
| set_cr8(vcpu, vcpu->regs[reg]); |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| return 1; |
| } |
| break; |
| case 1: /* mov from cr */ |
| switch (cr) { |
| case 3: |
| vcpu_load_rsp_rip(vcpu); |
| vcpu->regs[reg] = vcpu->cr3; |
| vcpu_put_rsp_rip(vcpu); |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| return 1; |
| case 8: |
| printd("handle_cr: read CR8 " |
| "cpu erratum AA15\n"); |
| vcpu_load_rsp_rip(vcpu); |
| vcpu->regs[reg] = vcpu->cr8; |
| vcpu_put_rsp_rip(vcpu); |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| return 1; |
| } |
| break; |
| case 3: /* lmsw */ |
| lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); |
| |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| return 1; |
| default: |
| break; |
| } |
| litevm_run->exit_reason = 0; |
| printk("litevm: unhandled control register: op %d cr %d\n", |
| (int)(exit_qualification >> 4) & 3, cr); |
| print_func_exit(); |
| return 0; |
| } |
| |
| static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| uint64_t exit_qualification; |
| unsigned long val; |
| int dr, reg; |
| |
| /* |
| * FIXME: this code assumes the host is debugging the guest. |
| * need to deal with guest debugging itself too. |
| */ |
| exit_qualification = vmcs_read64(EXIT_QUALIFICATION); |
| dr = exit_qualification & 7; |
| reg = (exit_qualification >> 8) & 15; |
| vcpu_load_rsp_rip(vcpu); |
| if (exit_qualification & 16) { |
| /* mov from dr */ |
| switch (dr) { |
| case 6: |
| val = 0xffff0ff0; |
| break; |
| case 7: |
| val = 0x400; |
| break; |
| default: |
| val = 0; |
| } |
| vcpu->regs[reg] = val; |
| } else { |
| /* mov to dr */ |
| } |
| vcpu_put_rsp_rip(vcpu); |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| return 1; |
| } |
| |
| static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| litevm_run->exit_reason = LITEVM_EXIT_CPUID; |
| print_func_exit(); |
| return 0; |
| } |
| |
| static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| uint32_t ecx = vcpu->regs[VCPU_REGS_RCX]; |
| struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx); |
| uint64_t data; |
| |
| if (guest_cpl() != 0) { |
| vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return 1; |
| } |
| |
| switch (ecx) { |
| case MSR_FS_BASE: |
| data = vmcs_readl(GUEST_FS_BASE); |
| break; |
| case MSR_GS_BASE: |
| data = vmcs_readl(GUEST_GS_BASE); |
| break; |
| case MSR_IA32_SYSENTER_CS: |
| data = vmcs_read32(GUEST_SYSENTER_CS); |
| break; |
| case MSR_IA32_SYSENTER_EIP: |
| data = vmcs_read32(GUEST_SYSENTER_EIP); |
| break; |
| case MSR_IA32_SYSENTER_ESP: |
| data = vmcs_read32(GUEST_SYSENTER_ESP); |
| break; |
| case MSR_IA32_MC0_CTL: |
| case MSR_IA32_MCG_STATUS: |
| case MSR_IA32_MCG_CAP: |
| case MSR_IA32_MC0_MISC: |
| case MSR_IA32_MC0_MISC+4: |
| case MSR_IA32_MC0_MISC+8: |
| case MSR_IA32_MC0_MISC+12: |
| case MSR_IA32_MC0_MISC+16: |
| case MSR_IA32_UCODE_REV: |
| /* MTRR registers */ |
| case 0xfe: |
| case 0x200 ... 0x2ff: |
| data = 0; |
| break; |
| case MSR_IA32_APICBASE: |
| data = vcpu->apic_base; |
| break; |
| default: |
| if (msr) { |
| data = msr->data; |
| break; |
| } |
| printk("litevm: unhandled rdmsr: %x\n", ecx); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return 1; |
| } |
| |
| /* FIXME: handling of bits 32:63 of rax, rdx */ |
| vcpu->regs[VCPU_REGS_RAX] = data & -1u; |
| vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| return 1; |
| } |
| |
| #ifdef __x86_64__ |
| |
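| /* |
| * Apply a guest write to MSR_EFER: reject writes to reserved bits or ones |
| * that toggle EFER.LME while paging is enabled (both inject #GP), preserve |
| * the current LMA state, and mirror the result into the guest MSR array. |
| */ |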
| static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer) |
| { |
| print_func_entry(); |
| struct vmx_msr_entry *msr; |
| |
| if (efer & EFER_RESERVED_BITS) { |
| printd("set_efer: 0x%llx #GP, reserved bits\n", |
| efer); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| |
| if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { |
| printd("set_efer: #GP, change LME while paging\n"); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return; |
| } |
| |
| efer &= ~EFER_LMA; |
| efer |= vcpu->shadow_efer & EFER_LMA; |
| |
| vcpu->shadow_efer = efer; |
| |
| msr = find_msr_entry(vcpu, MSR_EFER); |
| |
| if (!(efer & EFER_LMA)) |
| efer &= ~EFER_LME; |
| msr->data = efer; |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| } |
| |
| #endif |
| |
| #define MSR_IA32_TIME_STAMP_COUNTER 0x10 |
| |
| static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| uint32_t ecx = vcpu->regs[VCPU_REGS_RCX]; |
| struct vmx_msr_entry *msr; |
| uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u) |
| | ((uint64_t)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); |
| |
| if (guest_cpl() != 0) { |
| vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return 1; |
| } |
| |
| switch (ecx) { |
| case MSR_FS_BASE: |
| vmcs_writel(GUEST_FS_BASE, data); |
| break; |
| case MSR_GS_BASE: |
| vmcs_writel(GUEST_GS_BASE, data); |
| break; |
| case MSR_IA32_SYSENTER_CS: |
| vmcs_write32(GUEST_SYSENTER_CS, data); |
| break; |
| case MSR_IA32_SYSENTER_EIP: |
| vmcs_write32(GUEST_SYSENTER_EIP, data); |
| break; |
| case MSR_IA32_SYSENTER_ESP: |
| vmcs_write32(GUEST_SYSENTER_ESP, data); |
| break; |
| case MSR_EFER: |
| set_efer(vcpu, data); |
| print_func_exit(); |
| return 1; |
| case MSR_IA32_MC0_STATUS: |
| printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n" |
| , __FUNCTION__, data); |
| break; |
| case MSR_IA32_TIME_STAMP_COUNTER: { |
| uint64_t tsc; |
| |
| tsc = read_tsc(); |
| vmcs_write64(TSC_OFFSET, data - tsc); |
| break; |
| } |
| case MSR_IA32_UCODE_REV: |
| case MSR_IA32_UCODE_WRITE: |
| case 0x200 ... 0x2ff: /* MTRRs */ |
| break; |
| case MSR_IA32_APICBASE: |
| vcpu->apic_base = data; |
| break; |
| default: |
| msr = find_msr_entry(vcpu, ecx); |
| if (msr) { |
| msr->data = data; |
| break; |
| } |
| printk("litevm: unhandled wrmsr: %x\n", ecx); |
| inject_gp(vcpu); |
| print_func_exit(); |
| return 1; |
| } |
| skip_emulated_instruction(vcpu); |
| print_func_exit(); |
| return 1; |
| } |
| |
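| /* The guest has opened its interrupt window, so stop asking for |
| * interrupt-window exits; the pending irq gets injected on the next entry. */ |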
| static int handle_interrupt_window(struct litevm_vcpu *vcpu, |
| struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| /* Turn off interrupt window reporting. */ |
| vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, |
| vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
| & ~CPU_BASED_VIRTUAL_INTR_PENDING); |
| print_func_exit(); |
| return 1; |
| } |
| |
| static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| skip_emulated_instruction(vcpu); |
| if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) { |
| print_func_exit(); |
| return 1; |
| } |
| |
| litevm_run->exit_reason = LITEVM_EXIT_HLT; |
| print_func_exit(); |
| return 0; |
| } |
| |
| /* |
| * The exit handlers return 1 if the exit was handled fully and guest execution |
| * may resume. Otherwise they set the litevm_run parameter to indicate what needs |
| * to be done to userspace and return 0. |
| */ |
| static int (*litevm_vmx_exit_handlers[])(struct litevm_vcpu *vcpu, |
| struct litevm_run *litevm_run) = { |
| [EXIT_REASON_EXCEPTION_NMI] = handle_exception, |
| [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, |
| [EXIT_REASON_IO_INSTRUCTION] = handle_io, |
| [EXIT_REASON_INVLPG] = handle_invlpg, |
| [EXIT_REASON_CR_ACCESS] = handle_cr, |
| [EXIT_REASON_DR_ACCESS] = handle_dr, |
| [EXIT_REASON_CPUID] = handle_cpuid, |
| [EXIT_REASON_MSR_READ] = handle_rdmsr, |
| [EXIT_REASON_MSR_WRITE] = handle_wrmsr, |
| [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, |
| [EXIT_REASON_HLT] = handle_halt, |
| }; |
| |
| static const int litevm_vmx_max_exit_handlers = |
| sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers); |
| |
| /* |
| * The guest has exited. See if we can fix it or if we need userspace |
| * assistance. |
| */ |
| static int litevm_handle_exit(struct litevm_run *litevm_run, struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
| uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON); |
| |
| if ((vectoring_info & VECTORING_INFO_VALID_MASK) && |
| exit_reason != EXIT_REASON_EXCEPTION_NMI) |
| printk("%s: unexpected, valid vectoring info and " |
| "exit reason is 0x%x\n", __FUNCTION__, exit_reason); |
| litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
| if (exit_reason < litevm_vmx_max_exit_handlers |
| && litevm_vmx_exit_handlers[exit_reason]) { |
| print_func_exit(); |
| return litevm_vmx_exit_handlers[exit_reason](vcpu, litevm_run); |
| } else { |
| litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN; |
| litevm_run->hw.hardware_exit_reason = exit_reason; |
| } |
| print_func_exit(); |
| return 0; |
| } |
| |
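| /* |
| * Deliver an interrupt to a real-mode guest by hand, the way the CPU would: |
| * fetch the vector's ip:cs pair from the real-mode IVT at irq*4, push FLAGS, |
| * CS and IP onto the guest stack, clear IF/AC/TF, and point CS:IP at the |
| * handler. |
| */ |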
| static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq) |
| { |
| print_func_entry(); |
| uint16_t ent[2]; |
| uint16_t cs; |
| uint16_t ip; |
| unsigned long flags; |
| unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); |
| uint16_t sp = vmcs_readl(GUEST_RSP); |
| uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT); |
| |
| if (sp > ss_limit || ((sp - 6) > sp)) { |
| vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", |
| __FUNCTION__, |
| vmcs_readl(GUEST_RSP), |
| vmcs_readl(GUEST_SS_BASE), |
| vmcs_read32(GUEST_SS_LIMIT)); |
| print_func_exit(); |
| return; |
| } |
| |
| if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) != |
| sizeof(ent)) { |
| //vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); |
| print_func_exit(); |
| return; |
| } |
| |
| flags = vmcs_readl(GUEST_RFLAGS); |
| cs = vmcs_readl(GUEST_CS_BASE) >> 4; |
| ip = vmcs_readl(GUEST_RIP); |
| |
| |
| if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 || |
| litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 || |
| litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) { |
| //vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); |
| print_func_exit(); |
| return; |
| } |
| |
| vmcs_writel(GUEST_RFLAGS, flags & |
| ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); |
| vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; |
| vmcs_writel(GUEST_CS_BASE, ent[1] << 4); |
| vmcs_writel(GUEST_RIP, ent[0]); |
| vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); |
| print_func_exit(); |
| } |
| |
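| /* Pop the lowest-numbered pending irq off the pending bitmaps and inject it, |
| * via the real-mode path above or the VM-entry interruption-info field. */ |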
| static void litevm_do_inject_irq(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| int word_index = __ffs(vcpu->irq_summary); |
| int bit_index = __ffs(vcpu->irq_pending[word_index]); |
| int irq = word_index * BITS_PER_LONG + bit_index; |
| |
| /* clear_bit() isn't available here and the akaros bitops are still |
| * unproven, so clear the pending bit by hand; use 1UL so the shift is |
| * well-defined for bit indices >= 32 on 64-bit. |
| */ |
| vcpu->irq_pending[word_index] &= ~(1UL << bit_index); |
| if (!vcpu->irq_pending[word_index]) |
| vcpu->irq_summary &= ~(1UL << word_index); |
| |
| if (vcpu->rmode.active) { |
| inject_rmode_irq(vcpu, irq); |
| print_func_exit(); |
| return; |
| } |
| vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
| irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); |
| print_func_exit(); |
| } |
| |
| static void litevm_try_inject_irq(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) |
| && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0) |
| /* |
| * Interrupts enabled, and not blocked by sti or mov ss. Good. |
| */ |
| litevm_do_inject_irq(vcpu); |
| else |
| /* |
| * Interrupts blocked. Wait for unblock. |
| */ |
| vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, |
| vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
| | CPU_BASED_VIRTUAL_INTR_PENDING); |
| print_func_exit(); |
| } |
| |
| static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu) |
| { |
| print_func_entry(); |
| struct litevm_guest_debug *dbg = &vcpu->guest_debug; |
| |
| #warning "no debugging guests yet" |
| assert(0); |
| /* |
| set_debugreg(dbg->bp[0], 0); |
| set_debugreg(dbg->bp[1], 1); |
| set_debugreg(dbg->bp[2], 2); |
| set_debugreg(dbg->bp[3], 3); |
| */ |
| if (dbg->singlestep) { |
| unsigned long flags; |
| |
| flags = vmcs_readl(GUEST_RFLAGS); |
| flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; |
| vmcs_writel(GUEST_RFLAGS, flags); |
| } |
| print_func_exit(); |
| } |
| |
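| /* Shuttle a batch of MSRs between the hardware and a vmx_msr_entry array |
| * around VM entry and exit. */ |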
| static void load_msrs(struct vmx_msr_entry *e, int n) |
| { |
| print_func_entry(); |
| int i; |
| |
| for (i = 0; i < n; ++i) |
| write_msr(e[i].index, e[i].data); |
| print_func_exit(); |
| } |
| |
| static void save_msrs(struct vmx_msr_entry *e, int n) |
| { |
| print_func_entry(); |
| int i; |
| |
| for (i = 0; i < n; ++i) |
| e[i].data = read_msr(e[i].index); |
| print_func_exit(); |
| } |
| |
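| /* |
| * Run a vcpu until it needs help from userspace. Each pass saves the host |
| * fs/gs/ldt selectors and MSRs, injects a pending interrupt if the guest can |
| * take one, swaps FPU and MSR state, and enters the vmlaunch/vmresume asm |
| * block; after the VM exit it restores host state and either loops (exit |
| * handled in the kernel) or returns with litevm_run describing what |
| * userspace must do. |
| */ |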
| int vm_run(struct litevm *litevm, struct litevm_run *litevm_run) |
| { |
| print_func_entry(); |
| struct litevm_vcpu *vcpu; |
| uint8_t fail; |
| uint16_t fs_sel, gs_sel, ldt_sel; |
| int fs_gs_ldt_reload_needed; |
| |
| if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS) |
| error("vcpu is %d but must be in the range 0..%d\n", |
| litevm_run->vcpu, LITEVM_MAX_VCPUS - 1); |
| |
| vcpu = vcpu_load(litevm, litevm_run->vcpu); |
| if (!vcpu) |
| error("vcpu_load failed"); |
| |
| if (litevm_run->emulated) { |
| skip_emulated_instruction(vcpu); |
| litevm_run->emulated = 0; |
| } |
| |
| if (litevm_run->mmio_completed) { |
| memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8); |
| vcpu->mmio_read_completed = 1; |
| } |
| |
| vcpu->mmio_needed = 0; |
| |
| again: |
| /* |
| * Set host fs and gs selectors. Unfortunately, 22.2.3 does not |
| * allow segment selectors with cpl > 0 or ti == 1. |
| */ |
| fs_sel = read_fs(); |
| gs_sel = read_gs(); |
| ldt_sel = read_ldt(); |
| fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel; |
| if (!fs_gs_ldt_reload_needed) { |
| vmcs_write16(HOST_FS_SELECTOR, fs_sel); |
| vmcs_write16(HOST_GS_SELECTOR, gs_sel); |
| } else { |
| vmcs_write16(HOST_FS_SELECTOR, 0); |
| vmcs_write16(HOST_GS_SELECTOR, 0); |
| } |
| |
| #ifdef __x86_64__ |
| vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); |
| vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); |
| #endif |
| |
| if (vcpu->irq_summary && |
| !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) |
| litevm_try_inject_irq(vcpu); |
| |
| if (vcpu->guest_debug.enabled) |
| litevm_guest_debug_pre(vcpu); |
| |
| fx_save(vcpu->host_fx_image); |
| fx_restore(vcpu->guest_fx_image); |
| |
| save_msrs(vcpu->host_msrs, vcpu->nmsrs); |
| load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); |
| |
| asm ( |
| /* Store host registers */ |
| "pushf \n\t" |
| #ifdef __x86_64__ |
| "push %%rax; push %%rbx; push %%rdx;" |
| "push %%rsi; push %%rdi; push %%rbp;" |
| "push %%r8; push %%r9; push %%r10; push %%r11;" |
| "push %%r12; push %%r13; push %%r14; push %%r15;" |
| "push %%rcx \n\t" |
| "vmwrite %%rsp, %2 \n\t" |
| #else |
| "pusha; push %%ecx \n\t" |
| "vmwrite %%esp, %2 \n\t" |
| #endif |
| /* Check if vmlaunch or vmresume is needed */ |
| "cmp $0, %1 \n\t" |
| /* Load guest registers. Don't clobber flags. */ |
| #ifdef __x86_64__ |
| "mov %c[cr2](%3), %%rax \n\t" |
| "mov %%rax, %%cr2 \n\t" |
| "mov %c[rax](%3), %%rax \n\t" |
| "mov %c[rbx](%3), %%rbx \n\t" |
| "mov %c[rdx](%3), %%rdx \n\t" |
| "mov %c[rsi](%3), %%rsi \n\t" |
| "mov %c[rdi](%3), %%rdi \n\t" |
| "mov %c[rbp](%3), %%rbp \n\t" |
| "mov %c[r8](%3), %%r8 \n\t" |
| "mov %c[r9](%3), %%r9 \n\t" |
| "mov %c[r10](%3), %%r10 \n\t" |
| "mov %c[r11](%3), %%r11 \n\t" |
| "mov %c[r12](%3), %%r12 \n\t" |
| "mov %c[r13](%3), %%r13 \n\t" |
| "mov %c[r14](%3), %%r14 \n\t" |
| "mov %c[r15](%3), %%r15 \n\t" |
| "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ |
| #else |
| "mov %c[cr2](%3), %%eax \n\t" |
| "mov %%eax, %%cr2 \n\t" |
| "mov %c[rax](%3), %%eax \n\t" |
| "mov %c[rbx](%3), %%ebx \n\t" |
| "mov %c[rdx](%3), %%edx \n\t" |
| "mov %c[rsi](%3), %%esi \n\t" |
| "mov %c[rdi](%3), %%edi \n\t" |
| "mov %c[rbp](%3), %%ebp \n\t" |
| "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ |
| #endif |
| /* Enter guest mode */ |
| "jne launched \n\t" |
| "vmlaunch \n\t" |
| "jmp litevm_vmx_return \n\t" |
| "launched: vmresume \n\t" |
| ".globl litevm_vmx_return \n\t" |
| "litevm_vmx_return: " |
| /* Save guest registers, load host registers, keep flags */ |
| #ifdef __x86_64__ |
| "xchg %3, 0(%%rsp) \n\t" |
| "mov %%rax, %c[rax](%3) \n\t" |
| "mov %%rbx, %c[rbx](%3) \n\t" |
| "pushq 0(%%rsp); popq %c[rcx](%3) \n\t" |
| "mov %%rdx, %c[rdx](%3) \n\t" |
| "mov %%rsi, %c[rsi](%3) \n\t" |
| "mov %%rdi, %c[rdi](%3) \n\t" |
| "mov %%rbp, %c[rbp](%3) \n\t" |
| "mov %%r8, %c[r8](%3) \n\t" |
| "mov %%r9, %c[r9](%3) \n\t" |
| "mov %%r10, %c[r10](%3) \n\t" |
| "mov %%r11, %c[r11](%3) \n\t" |
| "mov %%r12, %c[r12](%3) \n\t" |
| "mov %%r13, %c[r13](%3) \n\t" |
| "mov %%r14, %c[r14](%3) \n\t" |
| "mov %%r15, %c[r15](%3) \n\t" |
| "mov %%cr2, %%rax \n\t" |
| "mov %%rax, %c[cr2](%3) \n\t" |
| "mov 0(%%rsp), %3 \n\t" |
| |
| "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" |
| "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" |
| "pop %%rbp; pop %%rdi; pop %%rsi;" |
| "pop %%rdx; pop %%rbx; pop %%rax \n\t" |
| #else |
| "xchg %3, 0(%%esp) \n\t" |
| "mov %%eax, %c[rax](%3) \n\t" |
| "mov %%ebx, %c[rbx](%3) \n\t" |
| "pushl 0(%%esp); popl %c[rcx](%3) \n\t" |
| "mov %%edx, %c[rdx](%3) \n\t" |
| "mov %%esi, %c[rsi](%3) \n\t" |
| "mov %%edi, %c[rdi](%3) \n\t" |
| "mov %%ebp, %c[rbp](%3) \n\t" |
| "mov %%cr2, %%eax \n\t" |
| "mov %%eax, %c[cr2](%3) \n\t" |
| "mov 0(%%esp), %3 \n\t" |
| |
| "pop %%ecx; popa \n\t" |
| #endif |
| "setbe %0 \n\t" |
| "popf \n\t" |
| : "=g" (fail) |
| : "r"(vcpu->launched), "r"((unsigned long)HOST_RSP), |
| "c"(vcpu), |
| [rax]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])), |
| [rbx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])), |
| [rcx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])), |
| [rdx]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])), |
| [rsi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])), |
| [rdi]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])), |
| [rbp]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])), |
| #ifdef __x86_64__ |
| [r8 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8 ])), |
| [r9 ]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9 ])), |
| [r10]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])), |
| [r11]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])), |
| [r12]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])), |
| [r13]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])), |
| [r14]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])), |
| [r15]"i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])), |
| #endif |
| [cr2]"i"(offsetof(struct litevm_vcpu, cr2)) |
| : "cc", "memory" ); |
| |
| ++litevm_stat.exits; |
| printk("vm_run exits"); |
| save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); |
| load_msrs(vcpu->host_msrs, NR_BAD_MSRS); |
| |
| fx_save(vcpu->guest_fx_image); |
| fx_restore(vcpu->host_fx_image); |
| |
| #ifndef __x86_64__ |
| asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
| #endif |
| |
| litevm_run->exit_type = 0; |
| if (fail) { |
| litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY; |
| litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); |
| } else { |
| if (fs_gs_ldt_reload_needed) { |
| load_ldt(ldt_sel); |
| load_fs(fs_sel); |
| /* |
| * If we have to reload gs, we must take care to |
| * preserve our gs base. |
| */ |
| disable_irq(); |
| load_gs(gs_sel); |
| #ifdef __x86_64__ |
| write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); |
| #endif |
| enable_irq(); |
| |
| reload_tss(); |
| } |
| vcpu->launched = 1; |
| litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT; |
| if (litevm_handle_exit(litevm_run, vcpu)) { |
| /* Give the scheduler a chance to reschedule. */ |
| vcpu_put(vcpu); |
| #warning "how to tell if signal is pending" |
| /* |
| if (signal_pending(current)) { |
| ++litevm_stat.signal_exits; |
| return -EINTR; |
| } |
| */ |
| kthread_yield(); |
| /* Cannot fail - no vcpu unplug yet. */ |
| vcpu_load(litevm, vcpu_slot(vcpu)); |
| goto again; |
| } |
| } |
| |
| vcpu_put(vcpu); |
| printk("vm_run returns\n"); |
| print_func_exit(); |
| return 0; |
| } |
| |
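| /* Copy the vcpu's general-purpose registers (plus rsp, rip and rflags from |
| * the VMCS) out to a litevm_regs structure for userspace. */ |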
| static int litevm_dev_ioctl_get_regs(struct litevm *litevm, struct litevm_regs *regs) |
| { |
| print_func_entry(); |
| struct litevm_vcpu *vcpu; |
| |
| if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) { |
| print_func_exit(); |
| return -EINVAL; |
| } |
| |
| vcpu = vcpu_load(litevm, regs->vcpu); |
| if (!vcpu) { |
| print_func_exit(); |
| return -ENOENT; |
| } |
| |
| regs->rax = vcpu->regs[VCPU_REGS_RAX]; |
| regs->rbx = vcpu->regs[VCPU_REGS_RBX]; |
| regs->rcx = vcpu->regs[VCPU_REGS_RCX]; |
| regs->rdx = vcpu->regs[VCPU_REGS_RDX]; |
| regs->rsi = vcpu->regs[VCPU_REGS_RSI]; |
| regs->rdi = vcpu->regs[VCPU_REGS_RDI]; |
| regs->rsp = vmcs_readl(GUEST_RSP); |
| regs->rbp = vcpu->regs[VCPU_REGS_RBP]; |
| #ifdef __x86_64__ |
| regs->r8 = vcpu->regs[VCPU_REGS_R8]; |
| regs->r9 = vcpu->regs[VCPU_REGS_R9]; |
| regs->r10 = vcpu->regs[VCPU_REGS_R10]; |
| regs->r11 = vcpu->regs[VCPU_REGS_R11]; |
| regs->r12 = vcpu->regs[VCPU_REGS_R12]; |
| regs->r13 = vcpu->regs[VCPU_REGS_R13]; |
| regs->r14 = vcpu->regs[VCPU_REGS_R14]; |
| regs->r15 = vcpu->regs[VCPU_REGS_R15]; |
| #endif |
| |
| regs->rip = vmcs_readl(GUEST_RIP); |
| regs->rflags = vmcs_readl(GUEST_RFLAGS); |
| |
| /* |
| * Don't leak debug flags in case they were set for guest debugging |
| */ |
| if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) |
| regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); |
| |
| vcpu_put(vcpu); |
| |
| print_func_exit(); |
| return 0; |
| } |
| |
| static int litevm_dev_ioctl_set_regs(struct litevm *litevm, struct litevm_regs *regs) |
| { |
| print_func_entry(); |
| struct litevm_vcpu *vcpu; |
| |
| if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) { |
| print_func_exit(); |
| return -EINVAL; |
| } |
| |
| vcpu = vcpu_load(litevm, regs->vcpu); |
| if (!vcpu) { |
| print_func_exit(); |
| return -ENOENT; |
| } |
| |
| vcpu->regs[VCPU_REGS_RAX] = regs->rax; |
| vcpu->regs[VCPU_REGS_RBX] = regs->rbx; |
| vcpu->regs[VCPU_REGS_RCX] = regs->rcx; |
| vcpu->regs[VCPU_REGS_RDX] = regs->rdx; |
| vcpu->regs[VCPU_REGS_RSI] = regs->rsi; |
| vcpu->regs[VCPU_REGS_RDI] = regs->rdi; |
| vmcs_writel(GUEST_RSP, regs->rsp); |
| vcpu->regs[VCPU_REGS_RBP] = regs->rbp; |
| #ifdef __x86_64__ |
| vcpu->regs[VCPU_REGS_R8] = regs->r8; |
| vcpu->regs[VCPU_REGS_R9] = regs->r9; |
| vcpu->regs[VCPU_REGS_R10] = regs->r10; |
| vcpu->regs[VCPU_REGS_R11] = regs->r11; |
| vcpu->regs[VCPU_REGS_R12] = regs->r12; |
| vcpu->regs[VCPU_REGS_R13] = regs->r13; |
| vcpu->regs[VCPU_REGS_R14] = regs->r14; |
| vcpu->regs[VCPU_REGS_R15] = regs->r15; |
| #endif |
| |
| vmcs_writel(GUEST_RIP, regs->rip); |
| vmcs_writel(GUEST_RFLAGS, regs->rflags); |
| |
| vcpu_put(vcpu); |
| |
| print_func_exit(); |
| return 0; |
| } |
| |
| static int litevm_dev_ioctl_get_sregs(struct litevm *litevm, struct litevm_sregs *sregs) |
| { |
| print_func_entry(); |
| struct litevm_vcpu *vcpu; |
| |
| if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) { |
| print_func_exit(); |
| return -EINVAL; |
| } |
| vcpu = vcpu_load(litevm, sregs->vcpu); |
| if (!vcpu) { |
| print_func_exit(); |
| return -ENOENT; |
| } |
| |
| #define get_segment(var, seg) \ |
| do { \ |
| uint32_t ar; \ |
| \ |
| sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \ |
| sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \ |
| sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \ |
| ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \ |
| if (ar & AR_UNUSABLE_MASK) ar = 0; \ |
| sregs->var.type = ar & 15; \ |
| sregs->var.s = (ar >> 4) & 1; \ |
| sregs->var.dpl = (ar >> 5) & 3; \ |
| sregs->var.present = (ar >> 7) & 1; \ |
| sregs->var.avl = (ar >> 12) & 1; \ |
| sregs->var.l = (ar >> 13) & 1; \ |
| sregs->var.db = (ar >> 14) & 1; \ |
| sregs->var.g = (ar >> 15) & 1; \ |
| sregs->var.unusable = (ar >> 16) & 1; \ |
| } while (0); |
| |
| get_segment(cs, CS); |
| get_segment(ds, DS); |
| get_segment(es, ES); |
| get_segment(fs, FS); |
| get_segment(gs, GS); |
| get_segment(ss, SS); |
| |
| get_segment(tr, TR); |
| get_segment(ldt, LDTR); |
| #undef get_segment |
| |
| #define get_dtable(var, table) \ |
| sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \ |
| sregs->var.base = vmcs_readl(GUEST_##table##_BASE) |
| |
| get_dtable(idt, IDTR); |
| get_dtable(gdt, GDTR); |
| #undef get_dtable |
| |
| sregs->cr0 = guest_cr0(); |
| sregs->cr2 = vcpu->cr2; |
| sregs->cr3 = vcpu->cr3; |
| sregs->cr4 = guest_cr4(); |
| sregs->cr8 = vcpu->cr8; |
| sregs->efer = vcpu->shadow_efer; |
| sregs->apic_base = vcpu->apic_base; |
| |
| sregs->pending_int = vcpu->irq_summary != 0; |
| |
| vcpu_put(vcpu); |
| |
| print_func_exit(); |
| return 0; |
| } |
| |
| static int litevm_dev_ioctl_set_sregs(struct litevm *litevm, struct litevm_sregs *sregs) |
| { |
| print_func_entry(); |
| struct litevm_vcpu *vcpu; |
| int mmu_reset_needed = 0; |
| |
| if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) { |
| print_func_exit(); |
| return -EINVAL; |
| } |
| vcpu = vcpu_load(litevm, sregs->vcpu); |
| if (!vcpu) { |
| print_func_exit(); |
| return -ENOENT; |
| } |
| |
| #define set_segment(var, seg) \ |
| do { \ |
| uint32_t ar; \ |
| \ |
| vmcs_writel(GUEST_##seg##_BASE, sregs->var.base); \ |
| vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \ |
| vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \ |
| if (sregs->var.unusable) { \ |
| ar = (1 << 16); \ |
| } else { \ |
| ar = (sregs->var.type & 15); \ |
| ar |= (sregs->var.s & 1) << 4; \ |
| ar |= (sregs->var.dpl & 3) << 5; \ |
| ar |= (sregs->var.present & 1) << 7; \ |
| ar |= (sregs->var.avl & 1) << 12; \ |
| ar |= (sregs->var.l & 1) << 13; \ |
| ar |= (sregs->var.db & 1) << 14; \ |
| ar |= (sregs->var.g & 1) << 15; \ |
| } \ |
| vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \ |
| } while (0); |
| |
| set_segment(cs, CS); |
| set_segment(ds, DS); |
| set_segment(es, ES); |
| set_segment(fs, FS); |
| set_segment(gs, GS); |
| set_segment(ss, SS); |
| |
| set_segment(tr, TR); |
| |
| set_segment(ldt, LDTR); |
| #undef set_segment |
| |
| #define set_dtable(var, table) \ |
| vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \ |
| vmcs_writel(GUEST_##table##_BASE, sregs->var.base) |
| |
| set_dtable(idt, IDTR); |
| set_dtable(gdt, GDTR); |
| #undef set_dtable |
| |
| vcpu->cr2 = sregs->cr2; |
| mmu_reset_needed |= vcpu->cr3 != sregs->cr3; |
| vcpu->cr3 = sregs->cr3; |
| |
| vcpu->cr8 = sregs->cr8; |
| |
| mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; |
| #ifdef __x86_64__ |
| __set_efer(vcpu, sregs->efer); |
| #endif |
| vcpu->apic_base = sregs->apic_base; |
| |
| mmu_reset_needed |= guest_cr0() != sregs->cr0; |
| vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0); |
| update_exception_bitmap(vcpu); |
| vmcs_writel(CR0_READ_SHADOW, sregs->cr0); |
| vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON); |
| |
| mmu_reset_needed |= guest_cr4() != sregs->cr4; |
| __set_cr4(vcpu, sregs->cr4); |
| |
| if (mmu_reset_needed) |
| litevm_mmu_reset_context(vcpu); |
| vcpu_put(vcpu); |
| |
| print_func_exit(); |
| return 0; |
| } |
| |
| /* |
| * Translate a guest virtual address to a guest physical address. |
| */ |
| static int litevm_dev_ioctl_translate(struct litevm *litevm, struct litevm_translation *tr) |
| { |
| print_func_entry(); |
| unsigned long vaddr = tr->linear_address; |
| struct litevm_vcpu *vcpu; |
| gpa_t gpa; |
| |
| vcpu = vcpu_load(litevm, tr->vcpu); |
| if (!vcpu) { |
| print_func_exit(); |
| return -ENOENT; |
| } |
| spin_lock_irqsave(&litevm->lock); |
| gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); |
| tr->physical_address = gpa; |
| tr->valid = gpa != UNMAPPED_GVA; |
| tr->writeable = 1; |
| tr->usermode = 0; |
| spin_unlock(&litevm->lock); |
| vcpu_put(vcpu); |
| |
| print_func_exit(); |
| return 0; |
| } |
| |
| #if 0 |
| static int litevm_dev_ioctl_interrupt(struct litevm *litevm, struct litevm_interrupt *irq) |
| { |
| struct litevm_vcpu *vcpu; |
| |
| if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS) |
| return -EINVAL; |
| if (irq->irq < 0 || irq->irq >= 256) |
| return -EINVAL; |
| vcpu = vcpu_load(litevm, irq->vcpu); |
| if (!vcpu) |
| return -ENOENT; |
| |
| set_bit(irq->irq, vcpu->irq_pending); |
| set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); |
| |
| vcpu_put(vcpu); |
| |
| return 0; |
| } |
| #endif |
| |
| #if 0 |
| static int litevm_dev_ioctl_debug_guest(struct litevm *litevm, |
| struct litevm_debug_guest *dbg) |
| { |
| struct litevm_vcpu *vcpu; |
| unsigned long dr7 = 0x400; |
| uint32_t exception_bitmap; |
| int old_singlestep; |
| |
| if (dbg->vcpu < 0 || dbg->vcpu >= LITEVM_MAX_VCPUS) |
| return -EINVAL; |
| vcpu = vcpu_load(litevm, dbg->vcpu); |
| if (!vcpu) |
| return -ENOENT; |
| |
| exception_bitmap = vmcs_read32(EXCEPTION_BITMAP); |
| old_singlestep = vcpu->guest_debug.singlestep; |
| |
| vcpu->guest_debug.enabled = dbg->enabled; |
| if (vcpu->guest_debug.enabled) { |
| int i; |
| |
| dr7 |= 0x200; /* exact */ |
| for (i = 0; i < 4; ++i) { |
| if (!dbg->breakpoints[i].enabled) |
| continue; |
| vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; |
| dr7 |= 2 << (i*2); /* global enable */ |
| dr7 |= 0 << (i*4+16); /* execution breakpoint */ |
| } |
| |
| exception_bitmap |= (1u << 1); /* Trap debug exceptions */ |
| |
| vcpu->guest_debug.singlestep = dbg->singlestep; |
| } else { |
| exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */ |
| vcpu->guest_debug.singlestep = 0; |
| } |
| |
| if (old_singlestep && !vcpu->guest_debug.singlestep) { |
| unsigned long flags; |
| |
| flags = vmcs_readl(GUEST_RFLAGS); |
| flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); |
| vmcs_writel(GUEST_RFLAGS, flags); |
| } |
| |
| vmcs_write32(EXCEPTION_BITMAP, exception_bitmap); |
| vmcs_writel(GUEST_DR7, dr7); |
| |
| vcpu_put(vcpu); |
| |
| return 0; |
| } |
| #endif |
| |
| #if 0 |
| long litevm_control(struct litevm *litevm, int command, unsigned long arg) |
| { |
| int r = -EINVAL; |
| |
| switch (command) { |
| case LITEVM_CREATE_VCPU: { |
| r = create_vcpu(litevm, arg); |
| if (r) |
| goto out; |
| break; |
| } |
| case LITEVM_RUN: { |
| struct litevm_run litevm_run; |
| |
| r = -EFAULT; |
| if (copy_from_user(&litevm_run, (void *)arg, sizeof litevm_run)) |
| goto out; |
| r = litevm_dev_ioctl_run(litevm, &litevm_run); |
| if (r < 0) |
| goto out; |
| r = -EFAULT; |
| if (copy_to_user((void *)arg, &litevm_run, sizeof litevm_run)) |
| goto out; |
| r = 0; |
| break; |
| } |
| case LITEVM_GET_REGS: { |
| struct litevm_regs litevm_regs; |
| |
| r = -EFAULT; |
| if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs)) |
| goto out; |
| r = litevm_dev_ioctl_get_regs(litevm, &litevm_regs); |
| if (r) |
| goto out; |
| r = -EFAULT; |
| if (copy_to_user((void *)arg, &litevm_regs, sizeof litevm_regs)) |
| goto out; |
| r = 0; |
| break; |
| } |
| case LITEVM_SET_REGS: { |
| struct litevm_regs litevm_regs; |
| |
| r = -EFAULT; |
| if (copy_from_user(&litevm_regs, (void *)arg, sizeof litevm_regs)) |
| goto out; |
| r = litevm_dev_ioctl_set_regs(litevm, &litevm_regs); |
| if (r) |
| goto out; |
| r = 0; |
| break; |
| } |
| case LITEVM_GET_SREGS: { |
| struct litevm_sregs litevm_sregs; |
| |
| r = -EFAULT; |
| if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs)) |
| goto out; |
| r = litevm_dev_ioctl_get_sregs(litevm, &litevm_sregs); |
| if (r) |
| goto out; |
| r = -EFAULT; |
| if (copy_to_user((void *)arg, &litevm_sregs, sizeof litevm_sregs)) |
| goto out; |
| r = 0; |
| break; |
| } |
| case LITEVM_SET_SREGS: { |
| struct litevm_sregs litevm_sregs; |
| |
| r = -EFAULT; |
| if (copy_from_user(&litevm_sregs, (void *)arg, sizeof litevm_sregs)) |
| goto out; |
| r = litevm_dev_ioctl_set_sregs(litevm, &litevm_sregs); |
| if (r) |
| goto out; |
| r = 0; |
| break; |
| } |
| case LITEVM_TRANSLATE: { |
| struct litevm_translation tr; |
| |
| r = -EFAULT; |
| if (copy_from_user(&tr, (void *)arg, sizeof tr)) |
| goto out; |
| r = litevm_dev_ioctl_translate(litevm, &tr); |
| if (r) |
| goto out; |
| r = -EFAULT; |
| if (copy_to_user((void *)arg, &tr, sizeof tr)) |
| goto out; |
| r = 0; |
| break; |
| } |
| case LITEVM_INTERRUPT: { |
| struct litevm_interrupt irq; |
| |
| r = -EFAULT; |
| if (copy_from_user(&irq, (void *)arg, sizeof irq)) |
| goto out; |
| r = litevm_dev_ioctl_interrupt(litevm, &irq); |
| if (r) |
| goto out; |
| r = 0; |
| break; |
| } |
| case LITEVM_DEBUG_GUEST: { |
| struct litevm_debug_guest dbg; |
| |
| r = -EFAULT; |
| if (copy_from_user(&dbg, (void *)arg, sizeof dbg)) |
| goto out; |
| r = litevm_dev_ioctl_debug_guest(litevm, &dbg); |
| if (r) |
| goto out; |
| r = 0; |
| break; |
| } |
| case LITEVM_SET_MEMORY_REGION: { |
| struct litevm_memory_region litevm_mem; |
| |
| r = -EFAULT; |
| if (copy_from_user(&litevm_mem, (void *)arg, sizeof litevm_mem)) |
| goto out; |
| r = litevm_dev_ioctl_set_memory_region(litevm, &litevm_mem); |
| if (r) |
| goto out; |
| break; |
| } |
| case LITEVM_GET_DIRTY_LOG: { |
| struct litevm_dirty_log log; |
| |
| r = -EFAULT; |
| if (copy_from_user(&log, (void *)arg, sizeof log)) |
| goto out; |
| r = litevm_dev_ioctl_get_dirty_log(litevm, &log); |
| if (r) |
| goto out; |
| break; |
| } |
| default: |
| ; |
| } |
| out: |
| return r; |
| } |
| #endif |
| |
| #if 0 |
| static int litevm_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
| { |
| struct litevm *litevm = vma->vm_file->private_data; |
| struct litevm_memory_slot *slot; |
| struct page *page; |
| |
| slot = gfn_to_memslot(litevm, vmf->pgoff); |
| if (!slot) |
| return VM_FAULT_SIGBUS; |
| page = gfn_to_page(slot, vmf->pgoff); |
| if (!page) |
| return VM_FAULT_SIGBUS; |
| |
| get_page(page); |
| vmf->page = page; |
| return 0; |
| } |
| #endif |
| |
| #if 0 |
| static int litevm_reboot(struct notifier_block *notifier, unsigned long val, |
| void *v) |
| { |
| panic("litevm_reboot"); |
| if (val == SYS_RESTART) { |
| /* |
| * Some (well, at least mine) BIOSes hang on reboot if |
| * in vmx root mode. |
| */ |
| printk("litevm: exiting vmx mode\n"); |
| handler_wrapper_t *w; |
| smp_call_function_all(litevm_disable, 0, &w); |
| smp_call_wait(w); |
| } |
| return NOTIFY_OK; |
| return 0; |
| } |
| #endif |
| |
| hpa_t bad_page_address; |
| |
| int vmx_init(void) |
| { |
| print_func_entry(); |
| handler_wrapper_t *w; |
| int r = 0; |
| |
| if (!cpu_has_litevm_support()) { |
| printk("litevm: no hardware support\n"); |
| print_func_exit(); |
| return -EOPNOTSUPP; |
| } |
| if (vmx_disabled_by_bios()) { |
| printk("litevm: disabled by bios\n"); |
| print_func_exit(); |
| return -EOPNOTSUPP; |
| } |
| |
| setup_vmcs_descriptor(); |
| smp_call_function_all(vm_enable, 0, &w); |
| if (smp_call_wait(w)) { |
| printk("litevm_init. smp_call_wait failed. Expect a panic.\n"); |
| } |
| |
| if ((bad_page_address = PADDR(kpage_zalloc_addr())) == 0ULL) { |
| r = -ENOMEM; |
| } |
| |
| print_func_exit(); |
| return r; |
| } |
| |
| static void litevm_exit(void) |
| { |
| print_func_entry(); |
| //free_litevm_area(); |
| //__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); |
| print_func_exit(); |
| } |
| |
| |