/*
* Kernel-based Virtual Machine driver for Linux
*
* This module enables machines with Intel VT-x extensions to run virtual
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*
*/
#define DEBUG
#define LITEVM_DEBUG
#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <pmap.h>
#include <sys/queue.h>
#include <smp.h>
#include <kref.h>
#include <atomic.h>
#include <alarm.h>
#include <event.h>
#include <umem.h>
#include <devalarm.h>
#include <arch/types.h>
#include <arch/vm.h>
#include <arch/emulate.h>
#include <arch/vmdebug.h>
#include <arch/msr-index.h>
void monitor(void *);
#define currentcpu (&per_cpu_info[core_id()])
#define QLOCK_init(x) {printd("qlock_init %p\n", x); qlock_init(x); printd("%p lock_inited\n", x);}
#define QLOCK(x) {printd("qlock %p\n", x); qlock(x); printd("%p locked\n", x);}
#define QUNLOCK(x) {printd("qunlock %p\n", x); qunlock(x); printd("%p unlocked\n", x);}
#define SPLI_irqsave(x){printd("spin_lock_init %p:", x); spinlock_init(x); printd("inited\n");}
#define SPLL(x){printd("spin_lock %p\n", x); spin_lock_irqsave(x); printd("%p locked\n", x);}
#define SPLU(x){printd("spin_unlock %p\n", x); spin_unlock(x); printd("%p unlocked\n", x);}
struct litevm_stat litevm_stat;
static struct litevm_stats_debugfs_item {
const char *name;
uint32_t *data;
} debugfs_entries[] = {
	{ "pf_fixed", &litevm_stat.pf_fixed },
	{ "pf_guest", &litevm_stat.pf_guest },
	{ "tlb_flush", &litevm_stat.tlb_flush },
	{ "invlpg", &litevm_stat.invlpg },
	{ "exits", &litevm_stat.exits },
	{ "io_exits", &litevm_stat.io_exits },
	{ "mmio_exits", &litevm_stat.mmio_exits },
	{ "signal_exits", &litevm_stat.signal_exits },
	{ "irq_exits", &litevm_stat.irq_exits },
	{ 0, 0 }
};
static struct dentry *debugfs_dir;
static const uint32_t vmx_msr_index[] = {
#ifdef __x86_64__
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
#endif
MSR_EFER, // wtf? MSR_K6_STAR,
};
static const char* vmx_msr_name[] = {
#ifdef __x86_64__
"MSR_SYSCALL_MASK", "MSR_LSTAR", "MSR_CSTAR", "MSR_KERNEL_GS_BASE",
#endif
"MSR_EFER", // wtf? MSR_K6_STAR,
};
#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
#ifdef __x86_64__
/*
 * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard VT
 * MSR save/load mechanism (CPU bug AA24).
*/
#define NR_BAD_MSRS 2
#else
#define NR_BAD_MSRS 0
#endif
#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
#define TSS_REDIRECTION_SIZE (256 / 8)
#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
#define MSR_IA32_VMX_BASIC_MSR 0x480
#define MSR_IA32_VMX_PINBASED_CTLS_MSR 0x481
#define MSR_IA32_VMX_PROCBASED_CTLS_MSR 0x482
#define MSR_IA32_VMX_EXIT_CTLS_MSR 0x483
#define MSR_IA32_VMX_ENTRY_CTLS_MSR 0x484
#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
#define LMSW_GUEST_MASK 0x0eULL
#define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
//#define CR4_VMXE 0x2000
#define CR8_RESEVED_BITS (~0x0fULL)
#define EFER_RESERVED_BITS 0xfffffffffffff2fe
#ifdef __x86_64__
#define HOST_IS_64 1
#else
#define HOST_IS_64 0
#endif
int vm_set_memory_region(struct litevm *litevm,
struct litevm_memory_region *mem);
/* Bit ops are not yet widely used in Akaros, and we're not sure where to put them. */
/**
* __ffs - find first set bit in word
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static inline unsigned long __ffs(unsigned long word)
{
print_func_entry();
asm("rep; bsf %1,%0":"=r"(word)
: "rm"(word));
print_func_exit();
return word;
}
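/* Return the guest MSR slot tracking @msr, or 0 if this vcpu does not save
 * and restore that MSR. */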
static struct vmx_msr_entry *find_msr_entry(struct litevm_vcpu *vcpu,
uint32_t msr)
{
print_func_entry();
int i;
for (i = 0; i < vcpu->nmsrs; ++i)
if (vcpu->guest_msrs[i].index == msr) {
print_func_exit();
return &vcpu->guest_msrs[i];
}
print_func_exit();
return 0;
}
struct descriptor_table {
uint16_t limit;
unsigned long base;
} __attribute__ ((packed));
static void get_gdt(struct descriptor_table *table)
{
print_func_entry();
asm("sgdt %0":"=m"(*table));
print_func_exit();
}
static void get_idt(struct descriptor_table *table)
{
print_func_entry();
asm("sidt %0":"=m"(*table));
print_func_exit();
}
static uint16_t read_fs(void)
{
//print_func_entry();
uint16_t seg;
asm("mov %%fs, %0":"=g"(seg));
//print_func_exit();
return seg;
}
static uint16_t read_gs(void)
{
//print_func_entry();
uint16_t seg;
asm("mov %%gs, %0":"=g"(seg));
//print_func_exit();
return seg;
}
static uint16_t read_ldt(void)
{
//print_func_entry();
uint16_t ldt;
asm("sldt %0":"=g"(ldt));
//print_func_exit();
return ldt;
}
static void load_fs(uint16_t sel)
{
//print_func_entry();
asm("mov %0, %%fs": :"g"(sel));
//print_func_exit();
}
static void load_gs(uint16_t sel)
{
//print_func_entry();
asm("mov %0, %%gs": :"g"(sel));
//print_func_exit();
}
#ifndef load_ldt
static void load_ldt(uint16_t sel)
{
//print_func_entry();
asm("lldt %0": :"g"(sel));
//print_func_exit();
}
#endif
static void fx_save(void *image)
{
//print_func_entry();
asm("fxsave (%0)"::"r"(image));
//print_func_exit();
}
static void fx_restore(void *image)
{
//print_func_entry();
asm("fxrstor (%0)"::"r"(image));
//print_func_exit();
}
static void fpu_init(void)
{
print_func_entry();
asm("finit");
print_func_exit();
}
struct segment_descriptor {
uint16_t limit_low;
uint16_t base_low;
uint8_t base_mid;
uint8_t type:4;
uint8_t system:1;
uint8_t dpl:2;
uint8_t present:1;
uint8_t limit_high:4;
uint8_t avl:1;
uint8_t long_mode:1;
uint8_t default_op:1;
uint8_t granularity:1;
uint8_t base_high;
} __attribute__ ((packed));
#ifdef __x86_64__
// LDT or TSS descriptor in the GDT. 16 bytes.
struct segment_descriptor_64 {
struct segment_descriptor s;
uint32_t base_higher;
uint32_t pad_zero;
};
#endif
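/* Compute the linear base address of the segment named by @selector by
 * reading its descriptor from the GDT (or the LDT when the TI bit is set).
 * On x86_64, system descriptors are 16 bytes, so pick up the upper base too. */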
static unsigned long segment_base(uint16_t selector)
{
print_func_entry();
struct descriptor_table gdt;
struct segment_descriptor *d;
unsigned long table_base;
typedef unsigned long ul;
unsigned long v;
asm("sgdt %0":"=m"(gdt));
table_base = gdt.base;
if (selector & 4) { /* from ldt */
uint16_t ldt_selector;
asm("sldt %0":"=g"(ldt_selector));
table_base = segment_base(ldt_selector);
}
d = (struct segment_descriptor *)(table_base + (selector & ~7));
v = d->base_low | ((ul) d->base_mid << 16) | ((ul) d->base_high << 24);
#ifdef __x86_64__
if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
v |= ((ul) ((struct segment_descriptor_64 *)d)->base_higher) << 32;
#endif
print_func_exit();
return v;
}
static unsigned long read_tr_base(void)
{
print_func_entry();
uint16_t tr;
asm("str %0":"=g"(tr));
print_func_exit();
return segment_base(tr);
}
static void reload_tss(void)
{
print_func_entry();
#ifndef __x86_64__
/*
* VT restores TR but not its size. Useless.
*/
struct descriptor_table gdt;
struct segment_descriptor *descs;
get_gdt(&gdt);
descs = (void *)gdt.base;
descs[GD_TSS].type = 9; /* available TSS */
load_TR_desc();
#endif
print_func_exit();
}
static struct vmcs_descriptor {
int size;
int order;
uint32_t revision_id;
} vmcs_descriptor;
static inline struct page *_gfn_to_page(struct litevm *litevm, gfn_t gfn)
{
print_func_entry();
struct litevm_memory_slot *slot = gfn_to_memslot(litevm, gfn);
print_func_exit();
return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
}
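/* Copy up to @size bytes from guest virtual address @addr into @dest,
 * translating and copying one page at a time.  Stops early if a translation
 * fails and returns the number of bytes actually copied. */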
int litevm_read_guest(struct litevm_vcpu *vcpu,
gva_t addr, unsigned long size, void *dest)
{
print_func_entry();
unsigned char *host_buf = dest;
unsigned long req_size = size;
while (size) {
hpa_t paddr;
unsigned now;
unsigned offset;
hva_t guest_buf;
paddr = gva_to_hpa(vcpu, addr);
if (is_error_hpa(paddr))
break;
guest_buf = (hva_t) KADDR(paddr);
offset = addr & ~PAGE_MASK;
guest_buf |= offset;
now = MIN(size, PAGE_SIZE - offset);
memcpy(host_buf, (void *)guest_buf, now);
host_buf += now;
addr += now;
size -= now;
}
print_func_exit();
return req_size - size;
}
int litevm_write_guest(struct litevm_vcpu *vcpu,
gva_t addr, unsigned long size, void *data)
{
print_func_entry();
unsigned char *host_buf = data;
unsigned long req_size = size;
while (size) {
hpa_t paddr;
unsigned now;
unsigned offset;
hva_t guest_buf;
paddr = gva_to_hpa(vcpu, addr);
if (is_error_hpa(paddr))
break;
guest_buf = (hva_t) KADDR(paddr);
offset = addr & ~PAGE_MASK;
guest_buf |= offset;
now = MIN(size, PAGE_SIZE - offset);
memcpy((void *)guest_buf, host_buf, now);
host_buf += now;
addr += now;
size -= now;
}
print_func_exit();
return req_size - size;
}
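/* Read the IA32_VMX_BASIC MSR to learn the VMCS region size, the page
 * allocation order we need for it, and the revision id to stamp into each
 * VMCS we create. */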
static void setup_vmcs_descriptor(void)
{
print_func_entry();
uint64_t msr;
msr = read_msr(MSR_IA32_VMX_BASIC_MSR);
vmcs_descriptor.size = (msr >> 32) & 0x1fff;
vmcs_descriptor.order = LOG2_UP(vmcs_descriptor.size >> PAGE_SHIFT);
vmcs_descriptor.revision_id = (uint32_t) msr;
printk("setup_vmcs_descriptor: msr 0x%x, size 0x%x order 0x%x id 0x%x\n",
msr, vmcs_descriptor.size, vmcs_descriptor.order,
vmcs_descriptor.revision_id);
print_func_exit();
};
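/* VMCLEAR the given VMCS: flush its cached state to memory and mark it
 * inactive so it can be loaded on another core. */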
static void vmcs_clear(struct vmcs *vmcs)
{
print_func_entry();
uint64_t phys_addr = PADDR(vmcs);
uint8_t error;
printk("%d: vmcs %p phys_addr %p\n", core_id(), vmcs, (void *)phys_addr);
asm volatile ("vmclear %1; setna %0":"=m" (error):"m"(phys_addr):"cc",
"memory");
if (error)
printk("litevm: vmclear fail: %p/%llx\n", vmcs, phys_addr);
print_func_exit();
}
static void __vcpu_clear(struct hw_trapframe *hw_tf, void *arg)
{
print_func_entry();
struct litevm_vcpu *vcpu = arg;
int cpu = core_id();
printd
("__vcpu_clear: cpu %d vcpu->cpu %d currentcpu->vmcs %p vcpu->vmcs %p\n",
cpu, vcpu->cpu, currentcpu->vmcs, vcpu->vmcs);
if (vcpu->cpu == cpu)
vmcs_clear(vcpu->vmcs);
if (currentcpu->vmcs == vcpu->vmcs)
currentcpu->vmcs = NULL;
print_func_exit();
}
static int vcpu_slot(struct litevm_vcpu *vcpu)
{
print_func_entry();
print_func_exit();
return vcpu - vcpu->litevm->vcpus;
}
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
static struct litevm_vcpu *__vcpu_load(struct litevm_vcpu *vcpu)
{
print_func_entry();
uint64_t phys_addr = PADDR(vcpu->vmcs);
int cpu;
cpu = core_id();
printk("__vcpu_load: vcpu->cpu %d cpu %d\n", vcpu->cpu, cpu);
if ((vcpu->cpu != cpu) && (vcpu->cpu != -1)){
handler_wrapper_t *w;
smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, &w);
smp_call_wait(w);
vcpu->launched = 0;
}
printk("2 ..");
if (currentcpu->vmcs != vcpu->vmcs) {
uint8_t error;
currentcpu->vmcs = vcpu->vmcs;
asm volatile ("vmptrld %1; setna %0":"=m" (error):"m"(phys_addr):"cc");
if (error) {
printk("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
error("litevm: vmptrld %p/%llx fail\n", vcpu->vmcs, phys_addr);
}
}
printk("3 ..");
if (vcpu->cpu != cpu) {
struct descriptor_table dt;
unsigned long sysenter_esp;
vcpu->cpu = cpu;
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
* processors.
*/
vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
get_gdt(&dt);
vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
}
print_func_exit();
return vcpu;
}
/*
 * Switches to the specified vcpu, until a matching vcpu_put(),
 * and leaves it locked!
*/
static struct litevm_vcpu *vcpu_load(struct litevm *litevm, int vcpu_slot)
{
struct litevm_vcpu *ret;
print_func_entry();
struct litevm_vcpu *vcpu = &litevm->vcpus[vcpu_slot];
printk("vcpu_slot %d vcpu %p\n", vcpu_slot, vcpu);
QLOCK(&vcpu->mutex);
printk("Locked\n");
if (!vcpu->vmcs) {
QUNLOCK(&vcpu->mutex);
printk("vcpu->vmcs for vcpu %p is NULL", vcpu);
error("vcpu->vmcs is NULL");
}
ret = __vcpu_load(vcpu);
print_func_exit();
return ret;
}
static void vcpu_put(struct litevm_vcpu *vcpu)
{
print_func_entry();
//put_cpu();
QUNLOCK(&vcpu->mutex);
print_func_exit();
}
static struct vmcs *alloc_vmcs_cpu(int cpu)
{
print_func_entry();
int node = node_id();
struct vmcs *vmcs;
vmcs = get_cont_pages_node(node, vmcs_descriptor.order, KMALLOC_WAIT);
if (!vmcs) {
print_func_exit();
printk("no memory for vcpus");
error("no memory for vcpus");
}
memset(vmcs, 0, vmcs_descriptor.size);
vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
print_func_exit();
return vmcs;
}
static struct vmcs *alloc_vmcs(void)
{
struct vmcs *ret;
print_func_entry();
ret = alloc_vmcs_cpu(core_id());
print_func_exit();
return ret;
}
static int cpu_has_litevm_support(void)
{
int ret;
print_func_entry();
/* sigh ... qemu. */
char vid[16];
if (vendor_id(vid) < 0)
return 0;
printk("vendor id is %s\n", vid);
if (vid[0] == 'Q') /* qemu */
return 0;
if (vid[0] == 'A') /* AMD or qemu claiming to be AMD */
return 0;
uint32_t ecx = cpuid_ecx(1);
ret = ecx & (1 << 5); /* CPUID.1:ECX.VMX[bit 5] -> VT */
printk("%s: CPUID.1:ECX.VMX[bit 5] -> VT is%s available\n", __func__, ret ? "" : " NOT");
print_func_exit();
return ret;
}
static int vmx_disabled_by_bios(void)
{
print_func_entry();
uint64_t msr;
msr = read_msr(MSR_IA32_FEATURE_CONTROL);
print_func_exit();
return (msr & 5) == 1; /* locked but not enabled */
}
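/* Per-core VMX setup: allocate and initialize this core's VMXON region, set
 * the lock/enable bits in IA32_FEATURE_CONTROL if the BIOS left them clear,
 * turn on CR4.VMXE, and execute VMXON. */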
static void vm_enable(struct hw_trapframe *hw_tf, void *garbage)
{
print_func_entry();
int cpu = hw_core_id();
uint64_t phys_addr;
uint64_t old;
uint64_t status = 0;
currentcpu->vmxarea = get_cont_pages_node(core_id(), vmcs_descriptor.order,
KMALLOC_WAIT);
if (!currentcpu->vmxarea)
return;
memset(currentcpu->vmxarea, 0, vmcs_descriptor.size);
currentcpu->vmxarea->revision_id = vmcs_descriptor.revision_id;
phys_addr = PADDR(currentcpu->vmxarea);
printk("%d: currentcpu->vmxarea %p phys_addr %p\n", core_id(),
currentcpu->vmxarea, (void *)phys_addr);
if (phys_addr & 0xfff) {
printk("fix vmxarea alignment!");
}
printk("%d: CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
old = read_msr(MSR_IA32_FEATURE_CONTROL);
printk("%d: vm_enable, old is %d\n", core_id(), old);
if ((old & 5) == 0) {
/* enable and lock */
write_msr(MSR_IA32_FEATURE_CONTROL, old | 5);
old = read_msr(MSR_IA32_FEATURE_CONTROL);
printk("%d:vm_enable, tried to set 5, old is %d\n", core_id(), old);
}
printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
lcr4(rcr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
printk("%d:cr0 is %x\n", core_id(), rcr0());
lcr0(rcr0() | 0x20);
printk("%d:cr0 is %x\n", core_id(), rcr0());
printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
outb(0x92, inb(0x92) | 2);
printk("%d:A20 is %d (0x2 should be set)\n", core_id(), inb(0x92));
asm volatile ("vmxon %1\njbe 1f\nmovl $1, %0\n1:":"=m" (status):"m"
(phys_addr):"memory", "cc");
printk("%d:vmxon status is %d\n", core_id(), status);
printk("%d:CR4 is 0x%x, and VMXE is %x\n", core_id(), rcr4(), CR4_VMXE);
if (!status) {
printk("%d:vm_enable: status says fail\n", core_id());
}
print_func_exit();
}
static void litevm_disable(void *garbage)
{
print_func_entry();
asm volatile ("vmxoff":::"cc");
print_func_exit();
}
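/* Allocate a struct litevm and initialize its lock and the per-vcpu mutexes.
 * The vcpus get a VMCS only later, in vmx_create_vcpu(). */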
struct litevm *vmx_open(void)
{
print_func_entry();
struct litevm *litevm = kzmalloc(sizeof(struct litevm), KMALLOC_WAIT);
int i;
printk("vmx_open: litevm is %p\n", litevm);
if (!litevm) {
printk("NO LITEVM! MAKES NO SENSE!\n");
error("litevm alloc failed");
print_func_exit();
return 0;
}
SPLI_irqsave(&litevm->lock);
LIST_INIT(&litevm->link);
for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
struct litevm_vcpu *vcpu = &litevm->vcpus[i];
printk("init vcpu %p\n", vcpu);
QLOCK_init(&vcpu->mutex);
vcpu->mmu.root_hpa = INVALID_PAGE;
vcpu->litevm = litevm;
LIST_INIT(&vcpu->link);
}
printk("vmx_open: busy %d\n", litevm->busy);
printk("return %p\n", litevm);
print_func_exit();
return litevm;
}
/*
* Free any memory in @free but not in @dont.
*/
static void litevm_free_physmem_slot(struct litevm_memory_slot *free,
struct litevm_memory_slot *dont)
{
print_func_entry();
int i;
if (!dont || free->phys_mem != dont->phys_mem)
if (free->phys_mem) {
for (i = 0; i < free->npages; ++i) {
page_t *page = free->phys_mem[i];
page_decref(page);
assert(page_is_free(page2ppn(page)));
}
kfree(free->phys_mem);
}
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
kfree(free->dirty_bitmap);
free->phys_mem = 0;
free->npages = 0;
free->dirty_bitmap = 0;
print_func_exit();
}
static void litevm_free_physmem(struct litevm *litevm)
{
print_func_entry();
int i;
for (i = 0; i < litevm->nmemslots; ++i)
litevm_free_physmem_slot(&litevm->memslots[i], 0);
print_func_exit();
}
static void litevm_free_vmcs(struct litevm_vcpu *vcpu)
{
print_func_entry();
if (vcpu->vmcs) {
handler_wrapper_t *w;
smp_call_function_all(__vcpu_clear, vcpu, &w);
smp_call_wait(w);
//free_vmcs(vcpu->vmcs);
vcpu->vmcs = 0;
}
print_func_exit();
}
static void litevm_free_vcpu(struct litevm_vcpu *vcpu)
{
print_func_entry();
litevm_free_vmcs(vcpu);
litevm_mmu_destroy(vcpu);
print_func_exit();
}
static void litevm_free_vcpus(struct litevm *litevm)
{
print_func_entry();
unsigned int i;
for (i = 0; i < LITEVM_MAX_VCPUS; ++i)
litevm_free_vcpu(&litevm->vcpus[i]);
print_func_exit();
}
static int litevm_dev_release(struct litevm *litevm)
{
print_func_entry();
litevm_free_vcpus(litevm);
litevm_free_physmem(litevm);
kfree(litevm);
print_func_exit();
return 0;
}
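/* VMREAD/VMWRITE accessors for the currently loaded VMCS. */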
unsigned long vmcs_readl(unsigned long field)
{
unsigned long value;
asm volatile ("vmread %1, %0":"=g" (value):"r"(field):"cc");
return value;
}
void vmcs_writel(unsigned long field, unsigned long value)
{
uint8_t error;
asm volatile ("vmwrite %1, %2; setna %0":"=g" (error):"r"(value),
"r"(field):"cc");
if (error)
printk("vmwrite error: reg %lx value %lx (err %d)\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
static void vmcs_write16(unsigned long field, uint16_t value)
{
vmcs_writel(field, value);
}
static void vmcs_write64(unsigned long field, uint64_t value)
{
print_func_entry();
#ifdef __x86_64__
vmcs_writel(field, value);
#else
vmcs_writel(field, value);
asm volatile ("");
vmcs_writel(field + 1, value >> 32);
#endif
print_func_exit();
}
static void inject_gp(struct litevm_vcpu *vcpu)
{
print_func_entry();
printd("inject_general_protection: rip 0x%lx\n", vmcs_readl(GUEST_RIP));
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
GP_VECTOR |
INTR_TYPE_EXCEPTION |
INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK);
print_func_exit();
}
static void update_exception_bitmap(struct litevm_vcpu *vcpu)
{
print_func_entry();
if (vcpu->rmode.active)
vmcs_write32(EXCEPTION_BITMAP, ~0);
else
vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
print_func_exit();
}
static void enter_pmode(struct litevm_vcpu *vcpu)
{
print_func_entry();
unsigned long flags;
vcpu->rmode.active = 0;
vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
(vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK));
update_exception_bitmap(vcpu);
#define FIX_PMODE_DATASEG(seg, save) { \
vmcs_write16(GUEST_##seg##_SELECTOR, 0); \
vmcs_writel(GUEST_##seg##_BASE, 0); \
vmcs_write32(GUEST_##seg##_LIMIT, 0xffff); \
vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93); \
}
FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
vmcs_write16(GUEST_CS_SELECTOR,
vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
print_func_exit();
}
static int rmode_tss_base(struct litevm *litevm)
{
print_func_entry();
gfn_t base_gfn =
litevm->memslots[0].base_gfn + litevm->memslots[0].npages - 3;
print_func_exit();
return base_gfn << PAGE_SHIFT;
}
static void enter_rmode(struct litevm_vcpu *vcpu)
{
print_func_entry();
unsigned long flags;
vcpu->rmode.active = 1;
vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->litevm));
vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
printk("FLAGS 0x%x\n", flags);
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
update_exception_bitmap(vcpu);
#define FIX_RMODE_SEG(seg, save) { \
vmcs_write16(GUEST_##seg##_SELECTOR, \
vmcs_readl(GUEST_##seg##_BASE) >> 4); \
vmcs_write32(GUEST_##seg##_LIMIT, 0xffff); \
vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3); \
}
vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
FIX_RMODE_SEG(ES, vcpu->rmode.es);
FIX_RMODE_SEG(DS, vcpu->rmode.ds);
FIX_RMODE_SEG(SS, vcpu->rmode.ss);
FIX_RMODE_SEG(GS, vcpu->rmode.gs);
FIX_RMODE_SEG(FS, vcpu->rmode.fs);
print_func_exit();
}
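/* Build the fake real-mode TSS in the last three pages of slot 0: a zeroed
 * TSS with its I/O bitmap offset set, the I/O permission bitmap itself, and
 * a terminating all-ones byte. */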
static int init_rmode_tss(struct litevm *litevm)
{
print_func_entry();
struct page *p1, *p2, *p3;
gfn_t fn = rmode_tss_base(litevm) >> PAGE_SHIFT;
char *page;
p1 = _gfn_to_page(litevm, fn++);
p2 = _gfn_to_page(litevm, fn++);
p3 = _gfn_to_page(litevm, fn);
if (!p1 || !p2 || !p3) {
printk("%s: gfn_to_page failed\n", __FUNCTION__);
print_func_exit();
return 0;
}
page = page2kva(p1);
memset(page, 0, PAGE_SIZE);
*(uint16_t *) (page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
page = page2kva(p2);
memset(page, 0, PAGE_SIZE);
page = page2kva(p3);
memset(page, 0, PAGE_SIZE);
*(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
print_func_exit();
return 1;
}
#ifdef __x86_64__
static void __set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
{
print_func_entry();
struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
vcpu->shadow_efer = efer;
if (efer & EFER_LMA) {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) |
VM_ENTRY_CONTROLS_IA32E_MASK);
msr->value = efer;
} else {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) &
~VM_ENTRY_CONTROLS_IA32E_MASK);
msr->value = efer & ~EFER_LME;
}
print_func_exit();
}
static void enter_lmode(struct litevm_vcpu *vcpu)
{
print_func_entry();
uint32_t guest_tr_ar;
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
printd("%s: tss fixup for long mode. \n", __FUNCTION__);
vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
}
vcpu->shadow_efer |= EFER_LMA;
find_msr_entry(vcpu, MSR_EFER)->value |= EFER_LMA | EFER_LME;
vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
| VM_ENTRY_CONTROLS_IA32E_MASK);
print_func_exit();
}
static void exit_lmode(struct litevm_vcpu *vcpu)
{
print_func_entry();
vcpu->shadow_efer &= ~EFER_LMA;
vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS)
& ~VM_ENTRY_CONTROLS_IA32E_MASK);
print_func_exit();
}
#endif
static void __set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
{
print_func_entry();
if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
enter_pmode(vcpu);
if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
enter_rmode(vcpu);
#ifdef __x86_64__
if (vcpu->shadow_efer & EFER_LME) {
if (!is_paging() && (cr0 & CR0_PG_MASK))
enter_lmode(vcpu);
if (is_paging() && !(cr0 & CR0_PG_MASK))
exit_lmode(vcpu);
}
#endif
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, cr0 | LITEVM_VM_CR0_ALWAYS_ON);
print_func_exit();
}
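/* For a PAE guest, check the four PDPTEs referenced by @cr3 and return
 * nonzero if any present entry has reserved bits set. */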
static int pdptrs_have_reserved_bits_set(struct litevm_vcpu *vcpu,
unsigned long cr3)
{
print_func_entry();
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
unsigned offset = (cr3 & (PAGE_SIZE - 1)) >> 5;
int i;
uint64_t pdpte;
uint64_t *pdpt;
struct litevm_memory_slot *memslot;
SPLL(&vcpu->litevm->lock);
memslot = gfn_to_memslot(vcpu->litevm, pdpt_gfn);
/* FIXME: !memslot - emulate? 0xff? */
pdpt = page2kva(gfn_to_page(memslot, pdpt_gfn));
for (i = 0; i < 4; ++i) {
pdpte = pdpt[offset + i];
if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
break;
}
SPLU(&vcpu->litevm->lock);
print_func_exit();
return i != 4;
}
static void set_cr0(struct litevm_vcpu *vcpu, unsigned long cr0)
{
print_func_entry();
if (cr0 & CR0_RESEVED_BITS) {
printd("set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, guest_cr0());
inject_gp(vcpu);
print_func_exit();
return;
}
if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
printd("set_cr0: #GP, CD == 0 && NW == 1\n");
inject_gp(vcpu);
print_func_exit();
return;
}
if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
printd("set_cr0: #GP, set PG flag " "and a clear PE flag\n");
inject_gp(vcpu);
print_func_exit();
return;
}
if (!is_paging() && (cr0 & CR0_PG_MASK)) {
#ifdef __x86_64__
if ((vcpu->shadow_efer & EFER_LME)) {
uint32_t guest_cs_ar;
if (!is_pae()) {
printd("set_cr0: #GP, start paging "
"in long mode while PAE is disabled\n");
inject_gp(vcpu);
print_func_exit();
return;
}
guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
if (guest_cs_ar & SEGMENT_AR_L_MASK) {
printd("set_cr0: #GP, start paging "
"in long mode while CS.L == 1\n");
inject_gp(vcpu);
print_func_exit();
return;
}
} else
#endif
if (is_pae() && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
printd("set_cr0: #GP, pdptrs " "reserved bits\n");
inject_gp(vcpu);
print_func_exit();
return;
}
}
__set_cr0(vcpu, cr0);
litevm_mmu_reset_context(vcpu);
print_func_exit();
return;
}
static void lmsw(struct litevm_vcpu *vcpu, unsigned long msw)
{
print_func_entry();
unsigned long cr0 = guest_cr0();
if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
enter_pmode(vcpu);
vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
} else
printd("lmsw: unexpected\n");
vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
| (msw & LMSW_GUEST_MASK));
print_func_exit();
}
static void __set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
{
print_func_entry();
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
LITEVM_RMODE_VM_CR4_ALWAYS_ON :
LITEVM_PMODE_VM_CR4_ALWAYS_ON));
print_func_exit();
}
static void set_cr4(struct litevm_vcpu *vcpu, unsigned long cr4)
{
print_func_entry();
if (cr4 & CR4_RESEVED_BITS) {
printd("set_cr4: #GP, reserved bits\n");
inject_gp(vcpu);
print_func_exit();
return;
}
if (is_long_mode()) {
if (!(cr4 & CR4_PAE_MASK)) {
printd("set_cr4: #GP, clearing PAE while " "in long mode\n");
inject_gp(vcpu);
print_func_exit();
return;
}
} else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
&& pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
printd("set_cr4: #GP, pdptrs reserved bits\n");
inject_gp(vcpu);
}
if (cr4 & CR4_VMXE_MASK) {
printd("set_cr4: #GP, setting VMXE\n");
inject_gp(vcpu);
print_func_exit();
return;
}
__set_cr4(vcpu, cr4);
SPLL(&vcpu->litevm->lock);
litevm_mmu_reset_context(vcpu);
SPLU(&vcpu->litevm->lock);
print_func_exit();
}
static void set_cr3(struct litevm_vcpu *vcpu, unsigned long cr3)
{
print_func_entry();
if (is_long_mode()) {
if (cr3 & CR3_L_MODE_RESEVED_BITS) {
printd("set_cr3: #GP, reserved bits\n");
inject_gp(vcpu);
print_func_exit();
return;
}
} else {
if (cr3 & CR3_RESEVED_BITS) {
printd("set_cr3: #GP, reserved bits\n");
inject_gp(vcpu);
print_func_exit();
return;
}
if (is_paging() && is_pae() && pdptrs_have_reserved_bits_set(vcpu, cr3)) {
printd("set_cr3: #GP, pdptrs " "reserved bits\n");
inject_gp(vcpu);
print_func_exit();
return;
}
}
vcpu->cr3 = cr3;
SPLL(&vcpu->litevm->lock);
vcpu->mmu.new_cr3(vcpu);
SPLU(&vcpu->litevm->lock);
print_func_exit();
}
static void set_cr8(struct litevm_vcpu *vcpu, unsigned long cr8)
{
print_func_entry();
if (cr8 & CR8_RESEVED_BITS) {
printd("set_cr8: #GP, reserved bits 0x%lx\n", cr8);
inject_gp(vcpu);
print_func_exit();
return;
}
vcpu->cr8 = cr8;
print_func_exit();
}
static uint32_t get_rdx_init_val(void)
{
print_func_entry();
uint32_t val;
asm("movl $1, %%eax \n\t" "movl %%eax, %0 \n\t":"=g"(val));
print_func_exit();
return val;
}
static void fx_init(struct litevm_vcpu *vcpu)
{
print_func_entry();
struct __attribute__ ((__packed__)) fx_image_s {
uint16_t control; //fcw
uint16_t status; //fsw
uint16_t tag; // ftw
uint16_t opcode; //fop
uint64_t ip; // fpu ip
uint64_t operand; // fpu dp
uint32_t mxcsr;
uint32_t mxcsr_mask;
} *fx_image;
fx_save(vcpu->host_fx_image);
fpu_init();
fx_save(vcpu->guest_fx_image);
fx_restore(vcpu->host_fx_image);
fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
fx_image->mxcsr = 0x1f80;
memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
print_func_exit();
}
static void vmcs_write32_fixedbits(uint32_t msr, uint32_t vmcs_field,
uint32_t val)
{
uint32_t msr_high, msr_low;
uint64_t msrval;
msrval = read_msr(msr);
msr_low = msrval;
msr_high = (msrval >> 32);
val &= msr_high;
val |= msr_low;
vmcs_write32(vmcs_field, val);
}
/*
* Sets up the vmcs for emulated real mode.
*/
static int litevm_vcpu_setup(struct litevm_vcpu *vcpu)
{
print_func_entry();
/* no op on x86_64 */
#define asmlinkage
extern asmlinkage void litevm_vmx_return(void);
uint32_t host_sysenter_cs;
uint32_t junk;
uint64_t a;
struct descriptor_table dt;
int i;
int ret;
uint64_t tsc;
int nr_good_msrs;
memset(vcpu->regs, 0, sizeof(vcpu->regs));
vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
vcpu->cr8 = 0;
vcpu->apic_base = 0xfee00000 |
/*for vcpu 0 */ MSR_IA32_APICBASE_BSP |
MSR_IA32_APICBASE_ENABLE;
fx_init(vcpu);
#define SEG_SETUP(seg) do { \
vmcs_write16(GUEST_##seg##_SELECTOR, 0); \
vmcs_writel(GUEST_##seg##_BASE, 0); \
vmcs_write32(GUEST_##seg##_LIMIT, 0xffff); \
vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93); \
} while (0)
/*
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
* insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
*/
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0x000f0000);
vmcs_write32(GUEST_CS_LIMIT, 0xffff);
vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
SEG_SETUP(DS);
SEG_SETUP(ES);
SEG_SETUP(FS);
SEG_SETUP(GS);
SEG_SETUP(SS);
vmcs_write16(GUEST_TR_SELECTOR, 0);
vmcs_writel(GUEST_TR_BASE, 0);
vmcs_write32(GUEST_TR_LIMIT, 0xffff);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
vmcs_write16(GUEST_LDTR_SELECTOR, 0);
vmcs_writel(GUEST_LDTR_BASE, 0);
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
vmcs_write32(GUEST_SYSENTER_CS, 0);
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
vmcs_writel(GUEST_RFLAGS, 0x02);
vmcs_writel(GUEST_RIP, 0xfff0);
vmcs_writel(GUEST_RSP, 0);
vmcs_writel(GUEST_CR3, 0);
//todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
vmcs_writel(GUEST_DR7, 0x400);
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_ACTIVITY_STATE, 0);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
/* I/O */
vmcs_write64(IO_BITMAP_A, 0);
vmcs_write64(IO_BITMAP_B, 0);
tsc = read_tsc();
vmcs_write64(TSC_OFFSET, -tsc);
vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
/* Special registers */
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
/* Control */
vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_EXT_INTR_MASK /* 20.6.1 */
| PIN_BASED_NMI_EXITING /* 20.6.1 */
);
vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR, CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_HLT_EXITING /* 20.6.2 */
| CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
| CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */
| CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */
| CPU_BASED_INVDPG_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */
);
vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
vmcs_writel(HOST_CR0, rcr0()); /* 22.2.3 */
vmcs_writel(HOST_CR4, rcr4()); /* 22.2.3, 22.2.5 */
vmcs_writel(HOST_CR3, rcr3()); /* 22.2.3 FIXME: shadow tables */
vmcs_write16(HOST_CS_SELECTOR, GD_KT); /* 22.2.4 */
vmcs_write16(HOST_DS_SELECTOR, GD_KD); /* 22.2.4 */
vmcs_write16(HOST_ES_SELECTOR, GD_KD); /* 22.2.4 */
vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
vmcs_write16(HOST_SS_SELECTOR, GD_KD); /* 22.2.4 */
#ifdef __x86_64__
a = read_msr(MSR_FS_BASE);
vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
a = read_msr(MSR_GS_BASE);
vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
#else
vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
#endif
vmcs_write16(HOST_TR_SELECTOR, GD_TSS * 8); /* 22.2.4 */
get_idt(&dt);
vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
vmcs_writel(HOST_RIP, (unsigned long)litevm_vmx_return); /* 22.2.5 */
/* it's the HIGH 32 bits! */
host_sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS) >> 32;
vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
a = read_msr(MSR_IA32_SYSENTER_ESP);
vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
a = read_msr(MSR_IA32_SYSENTER_EIP);
vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
ret = -ENOMEM;
vcpu->guest_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
if (!vcpu->guest_msrs)
error("guest_msrs kmalloc failed");
vcpu->host_msrs = kmalloc(PAGE_SIZE, KMALLOC_WAIT);
if (!vcpu->host_msrs)
error("vcpu->host_msrs kmalloc failed -- storage leaked");
for (i = 0; i < NR_VMX_MSR; ++i) {
uint32_t index = vmx_msr_index[i];
uint32_t data_low, data_high;
uint64_t data;
int j = vcpu->nmsrs;
#warning "need readmsr_safe"
// if (rdmsr_safe(index, &data_low, &data_high) < 0)
// continue;
data = read_msr(index);
vcpu->host_msrs[j].index = index;
vcpu->host_msrs[j].reserved = 0;
vcpu->host_msrs[j].value = data;
vcpu->guest_msrs[j] = vcpu->host_msrs[j];
++vcpu->nmsrs;
}
printk("msrs: %d\n", vcpu->nmsrs);
nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
vmcs_writel(VM_EXIT_MSR_STORE_ADDR, PADDR(vcpu->guest_msrs + NR_BAD_MSRS));
vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, PADDR(vcpu->host_msrs + NR_BAD_MSRS));
vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS, (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
/* 22.2.1, 20.8.1 */
vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, VM_ENTRY_CONTROLS, 0);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
vmcs_writel(TPR_THRESHOLD, 0);
vmcs_writel(CR0_GUEST_HOST_MASK, LITEVM_GUEST_CR0_MASK);
vmcs_writel(CR4_GUEST_HOST_MASK, LITEVM_GUEST_CR4_MASK);
__set_cr0(vcpu, 0x60000010); // enter rmode
__set_cr4(vcpu, 0);
#ifdef __x86_64__
__set_efer(vcpu, 0);
#endif
ret = litevm_mmu_init(vcpu);
print_func_exit();
return ret;
out_free_guest_msrs:
kfree(vcpu->guest_msrs);
out:
return ret;
}
/*
* Sync the rsp and rip registers into the vcpu structure. This allows
* registers to be accessed by indexing vcpu->regs.
*/
static void vcpu_load_rsp_rip(struct litevm_vcpu *vcpu)
{
print_func_entry();
vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
vcpu->rip = vmcs_readl(GUEST_RIP);
print_func_exit();
}
/*
* Syncs rsp and rip back into the vmcs. Should be called after possible
* modification.
*/
static void vcpu_put_rsp_rip(struct litevm_vcpu *vcpu)
{
print_func_entry();
vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
vmcs_writel(GUEST_RIP, vcpu->rip);
print_func_exit();
}
/*
* Creates some virtual cpus. Good luck creating more than one.
*/
int vmx_create_vcpu(struct litevm *litevm, int n)
{
print_func_entry();
ERRSTACK(2);
int r;
struct litevm_vcpu *vcpu;
struct vmcs *vmcs;
char *errstring = NULL;
if (n < 0 || n >= LITEVM_MAX_VCPUS) {
printk("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
LITEVM_MAX_VCPUS);
error("%d is out of range; LITEVM_MAX_VCPUS is %d", n,
LITEVM_MAX_VCPUS);
}
printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
vcpu = &litevm->vcpus[n];
printk("vmx_create_vcpu: @%d, %p\n", n, vcpu);
QLOCK(&vcpu->mutex);
if (vcpu->vmcs) {
QUNLOCK(&vcpu->mutex);
printk("VM already exists\n");
error("VM already exists");
}
printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
/* I'm a bad person */
//ALIGN(vcpu->fx_buf, FX_IMAGE_ALIGN);
uint64_t a = (uint64_t) vcpu->fx_buf;
a += FX_IMAGE_ALIGN - 1;
a /= FX_IMAGE_ALIGN;
a *= FX_IMAGE_ALIGN;
vcpu->host_fx_image = (char *)a;
vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
vcpu->cpu = -1; /* First load will set up TR */
vcpu->litevm = litevm;
printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
if (waserror()){
printk("ERR 1 in %s, %s\n", __func__, current_errstr());
QUNLOCK(&vcpu->mutex);
litevm_free_vcpu(vcpu);
nexterror();
}
printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
vmcs = alloc_vmcs();
vmcs_clear(vmcs);
printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
printk("after vmcs_clear\n");
vcpu->vmcs = vmcs;
printk("vcpu %p set vmcs to %p\n", vcpu, vmcs);
vcpu->launched = 0;
printk("vcpu %p slot %d vmcs is %p\n", vcpu, n, vmcs);
__vcpu_load(vcpu);
printk("PAST vcpu_load\n");
if (waserror()) {
/* we really need to fix waserror() */
printk("vcpu_setup failed: %s\n", current_errstr());
QUNLOCK(&vcpu->mutex);
nexterror();
}
/* need memory for the rmode_tss. I have no idea how this happened
* originally in kvm.
*/
/* this sucks. */
QUNLOCK(&vcpu->mutex);
void *v;
struct litevm_memory_region vmr;
vmr.slot = 0;
vmr.flags = 0;
vmr.guest_phys_addr = /* guess. */ 0x1000000;
vmr.memory_size = 0x10000;
vmr.init_data = NULL;
if (vm_set_memory_region(litevm, &vmr))
printk("vm_set_memory_region failed");
printk("set memory region done\n");
if (!init_rmode_tss(litevm)) {
error("vcpu_setup: init_rmode_tss failed");
}
QLOCK(&vcpu->mutex);
r = litevm_vcpu_setup(vcpu);
vcpu_put(vcpu);
printk("r is %d\n", r);
if (!r) {
poperror();
print_func_exit();
return 0;
}
errstring = "vcup set failed";
out_free_vcpus:
out:
print_func_exit();
return r;
}
/*
* Allocate some memory and give it an address in the guest physical address
* space.
*
* Discontiguous memory is allowed, mostly for framebuffers.
*/
int vm_set_memory_region(struct litevm *litevm,
struct litevm_memory_region *mem)
{
print_func_entry();
ERRSTACK(2);
int r;
gfn_t base_gfn;
unsigned long npages;
unsigned long i;
struct litevm_memory_slot *memslot;
struct litevm_memory_slot old, new;
int memory_config_version;
void *init_data = mem->init_data;
int pass = 1;
printk("%s: slot %d base %08x npages %d\n",
__func__,
mem->slot, mem->guest_phys_addr,
mem->memory_size);
/* should not happen but ... */
if (!litevm)
error("NULL litevm in %s", __func__);
if (!mem)
error("NULL mem in %s", __func__);
/* I don't care right now. *
if (litevm->busy)
error("litevm->busy is set! 0x%x\n", litevm->busy);
*/
r = -EINVAL;
/* General sanity checks */
if (mem->memory_size & (PAGE_SIZE - 1))
error("mem->memory_size %lld is not page-aligned", mem->memory_size);
if (mem->guest_phys_addr & (PAGE_SIZE - 1))
error("guest_phys_addr 0x%llx is not page-aligned",
mem->guest_phys_addr);
if (mem->slot >= LITEVM_MEMORY_SLOTS)
error("Slot %d is >= %d", mem->slot, LITEVM_MEMORY_SLOTS);
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
error("0x%x + 0x%x is < 0x%x",
mem->guest_phys_addr, mem->memory_size, mem->guest_phys_addr);
memslot = &litevm->memslots[mem->slot];
base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
npages = mem->memory_size >> PAGE_SHIFT;
if (!npages)
mem->flags &= ~LITEVM_MEM_LOG_DIRTY_PAGES;
	/* This is actually a very tricky loop.  The use of error() is a bit
	 * dangerous, so we don't use it much.  Consider a rewrite.  It would be
	 * nice if Akaros could do the allocation of a bunch of pages for us.
	 */
raced:
printk("raced: pass %d\n", pass);
printk("LOCK %p, locked %d\n", &litevm->lock, spin_locked(&litevm->lock));
void monitor(void *);
monitor(NULL);
SPLL(&litevm->lock);
printk("locked\n");
if (waserror()) {
printk("error in %s, %s\n", __func__, current_errstr());
SPLU(&litevm->lock);
nexterror();
}
memory_config_version = litevm->memory_config_version;
new = old = *memslot;
printk("memory_config_version %d\n", memory_config_version);
new.base_gfn = base_gfn;
new.npages = npages;
new.flags = mem->flags;
/* Disallow changing a memory slot's size. */
r = -EINVAL;
if (npages && old.npages && npages != old.npages)
error("npages is %d, old.npages is %d, can't change",
npages, old.npages);
/* Check for overlaps */
r = -EEXIST;
for (i = 0; i < LITEVM_MEMORY_SLOTS; ++i) {
struct litevm_memory_slot *s = &litevm->memslots[i];
printk("Region %d: base gfn 0x%x npages %d\n", s->base_gfn, s->npages);
if (s == memslot)
continue;
if (!((base_gfn + npages <= s->base_gfn) ||
(base_gfn >= s->base_gfn + s->npages)))
error("Overlap");
}
/*
* Do memory allocations outside lock. memory_config_version will
* detect any races.
*/
SPLU(&litevm->lock);
printk("unlocked\n");
poperror();
/* Deallocate if slot is being removed */
if (!npages)
new.phys_mem = 0;
/* Free page dirty bitmap if unneeded */
if (!(new.flags & LITEVM_MEM_LOG_DIRTY_PAGES))
new.dirty_bitmap = 0;
r = -ENOMEM;
/* Allocate if a slot is being created */
if (npages && !new.phys_mem) {
new.phys_mem = kzmalloc(npages * sizeof(struct page *), KMALLOC_WAIT);
if (!new.phys_mem)
goto out_free;
for (i = 0; i < npages; ++i) {
int ret;
ret = kpage_alloc(&new.phys_mem[i]);
printk("PAGEALLOC: va %p pa %p\n",page2kva(new.phys_mem[i]),page2pa(new.phys_mem[i]));
if (ret != ESUCCESS)
goto out_free;
if (init_data) {
printk("init data memcpy(%p,%p,4096);\n",
page2kva(new.phys_mem[i]), init_data);
memcpy(page2kva(new.phys_mem[i]), init_data, PAGE_SIZE);
init_data += PAGE_SIZE;
} else {
int j;
//memset(page2kva(new.phys_mem[i]), 0xf4 /* hlt */, PAGE_SIZE);
uint8_t *cp = page2kva(new.phys_mem[i]);
memset(cp, 0, PAGE_SIZE);
if (base_gfn < 0x100000){
for(j = 0; j < PAGE_SIZE; j += 2){
// XORL %RAX, %RAX
cp[j] = 0x31; cp[j+1] = 0xc0;
}
// 1: jmp 1b
cp[4094] = 0xeb;
cp[4095] = 0xfe;
}
init_data += PAGE_SIZE;
}
}
}
/* Allocate page dirty bitmap if needed */
if ((new.flags & LITEVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
unsigned dirty_bytes; //ALIGN(npages, BITS_PER_LONG) / 8;
dirty_bytes =
(((npages + BITS_PER_LONG -
1) / BITS_PER_LONG) * BITS_PER_LONG) / 8;
new.dirty_bitmap = kzmalloc(dirty_bytes, KMALLOC_WAIT);
if (!new.dirty_bitmap) {
printk("VM: alloc of %d bytes for map failed\n", dirty_bytes);
goto out_free;
}
}
SPLL(&litevm->lock);
printk("locked\n");
if (memory_config_version != litevm->memory_config_version) {
SPLU(&litevm->lock);
printk("unlocked, try again\n");
litevm_free_physmem_slot(&new, &old);
goto raced;
}
r = -EAGAIN;
if (litevm->busy) {
printk("BUSY!\n");
goto out_unlock;
}
if (mem->slot >= litevm->nmemslots)
litevm->nmemslots = mem->slot + 1;
*memslot = new;
++litevm->memory_config_version;
SPLU(&litevm->lock);
printk("unlocked\n");
for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
struct litevm_vcpu *vcpu;
vcpu = vcpu_load(litevm, i);
if (!vcpu){
printk("%s: no cpu %d\n", __func__, i);
continue;
}
litevm_mmu_reset_context(vcpu);
vcpu_put(vcpu);
}
litevm_free_physmem_slot(&old, &new);
print_func_exit();
return 0;
out_unlock:
SPLU(&litevm->lock);
printk("out_unlock\n");
out_free:
printk("out_free\n");
litevm_free_physmem_slot(&new, &old);
out:
printk("vm_set_memory_region: return %d\n", r);
print_func_exit();
return r;
}
#if 0
/*
* Get (and clear) the dirty memory log for a memory slot.
*/
static int litevm_dev_ioctl_get_dirty_log(struct litevm *litevm,
struct litevm_dirty_log *log)
{
struct litevm_memory_slot *memslot;
int r, i;
int n;
unsigned long any = 0;
SPLL(&litevm->lock);
/*
* Prevent changes to guest memory configuration even while the lock
* is not taken.
*/
++litevm->busy;
SPLU(&litevm->lock);
r = -EINVAL;
if (log->slot >= LITEVM_MEMORY_SLOTS)
goto out;
memslot = &litevm->memslots[log->slot];
r = -ENOENT;
if (!memslot->dirty_bitmap)
goto out;
n = ALIGN(memslot->npages, 8) / 8;
for (i = 0; !any && i < n; ++i)
any = memslot->dirty_bitmap[i];
r = -EFAULT;
if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
goto out;
if (any) {
SPLL(&litevm->lock);
litevm_mmu_slot_remove_write_access(litevm, log->slot);
SPLU(&litevm->lock);
memset(memslot->dirty_bitmap, 0, n);
for (i = 0; i < LITEVM_MAX_VCPUS; ++i) {
struct litevm_vcpu *vcpu = vcpu_load(litevm, i);
if (!vcpu)
continue;
flush_guest_tlb(vcpu);
vcpu_put(vcpu);
}
}
r = 0;
out:
SPLL(&litevm->lock);
--litevm->busy;
SPLU(&litevm->lock);
return r;
}
#endif
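/* Return the memory slot containing guest frame @gfn, or 0 if no slot
 * covers it. */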
struct litevm_memory_slot *gfn_to_memslot(struct litevm *litevm, gfn_t gfn)
{
print_func_entry();
int i;
printk("%s: litevm %p gfn %d\n", litevm, gfn);
for (i = 0; i < litevm->nmemslots; ++i) {
struct litevm_memory_slot *memslot = &litevm->memslots[i];
printk("%s: slot %d gfn 0x%lx base_gfn %lx npages %x\n",
__func__, i, gfn,memslot->base_gfn, memslot->npages);
if (gfn >= memslot->base_gfn
&& gfn < memslot->base_gfn + memslot->npages) {
print_func_exit();
return memslot;
}
}
print_func_exit();
return 0;
}
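/* Set @gfn's bit in its slot's dirty bitmap, if that slot keeps one. */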
void mark_page_dirty(struct litevm *litevm, gfn_t gfn)
{
print_func_entry();
int i;
struct litevm_memory_slot *memslot = 0;
unsigned long rel_gfn;
for (i = 0; i < litevm->nmemslots; ++i) {
memslot = &litevm->memslots[i];
if (gfn >= memslot->base_gfn
&& gfn < memslot->base_gfn + memslot->npages) {
if (!memslot || !memslot->dirty_bitmap) {
print_func_exit();
return;
}
rel_gfn = gfn - memslot->base_gfn;
/* avoid RMW */
if (!GET_BITMASK_BIT(memslot->dirty_bitmap, rel_gfn))
SET_BITMASK_BIT_ATOMIC(memslot->dirty_bitmap, rel_gfn);
print_func_exit();
return;
}
}
print_func_exit();
}
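/* Advance the guest RIP past the instruction that caused the exit and drop
 * any temporary (STI/MOV-SS) interrupt blocking, since we emulated it. */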
static void skip_emulated_instruction(struct litevm_vcpu *vcpu)
{
print_func_entry();
unsigned long rip;
uint32_t interruptibility;
rip = vmcs_readl(GUEST_RIP);
rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
vmcs_writel(GUEST_RIP, rip);
/*
* We emulated an instruction, so temporary interrupt blocking
* should be removed, if set.
*/
interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
if (interruptibility & 3)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3);
print_func_exit();
}
static int emulator_read_std(unsigned long addr,
unsigned long *val,
unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
print_func_entry();
struct litevm_vcpu *vcpu = ctxt->vcpu;
void *data = val;
while (bytes) {
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
unsigned offset = addr & (PAGE_SIZE - 1);
unsigned tocopy = bytes < (unsigned)PAGE_SIZE - offset ?
bytes : (unsigned)PAGE_SIZE - offset;
unsigned long pfn;
struct litevm_memory_slot *memslot;
void *page;
if (gpa == UNMAPPED_GVA) {
print_func_exit();
return X86EMUL_PROPAGATE_FAULT;
}
pfn = gpa >> PAGE_SHIFT;
memslot = gfn_to_memslot(vcpu->litevm, pfn);
if (!memslot) {
print_func_exit();
return X86EMUL_UNHANDLEABLE;
}
page = page2kva(gfn_to_page(memslot, pfn));
memcpy(data, page + offset, tocopy);
bytes -= tocopy;
data += tocopy;
addr += tocopy;
}
print_func_exit();
return X86EMUL_CONTINUE;
}
static int emulator_write_std(unsigned long addr,
unsigned long val,
unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
print_func_entry();
printk("emulator_write_std: addr %lx n %d\n", addr, bytes);
print_func_exit();
return X86EMUL_UNHANDLEABLE;
}
static int emulator_read_emulated(unsigned long addr,
unsigned long *val,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
print_func_entry();
struct litevm_vcpu *vcpu = ctxt->vcpu;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
vcpu->mmio_read_completed = 0;
print_func_exit();
return X86EMUL_CONTINUE;
} else if (emulator_read_std(addr, val, bytes, ctxt)
== X86EMUL_CONTINUE) {
print_func_exit();
return X86EMUL_CONTINUE;
} else {
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
if (gpa == UNMAPPED_GVA) {
print_func_exit();
return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
}
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
vcpu->mmio_size = bytes;
vcpu->mmio_is_write = 0;
print_func_exit();
return X86EMUL_UNHANDLEABLE;
}
}
static int emulator_write_emulated(unsigned long addr,
unsigned long val,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
print_func_entry();
struct litevm_vcpu *vcpu = ctxt->vcpu;
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
if (gpa == UNMAPPED_GVA) {
print_func_exit();
return X86EMUL_PROPAGATE_FAULT;
}
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
vcpu->mmio_size = bytes;
vcpu->mmio_is_write = 1;
memcpy(vcpu->mmio_data, &val, bytes);
print_func_exit();
return X86EMUL_CONTINUE;
}
static int emulator_cmpxchg_emulated(unsigned long addr,
unsigned long old,
unsigned long new,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
print_func_entry();
static int reported;
if (!reported) {
reported = 1;
printk("litevm: emulating exchange as write\n");
}
print_func_exit();
return emulator_write_emulated(addr, new, bytes, ctxt);
}
static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
{
print_func_entry();
static int reported;
uint8_t opcodes[4];
unsigned long rip = vmcs_readl(GUEST_RIP);
unsigned long rip_linear = rip + vmcs_readl(GUEST_CS_BASE);
if (reported) {
print_func_exit();
return;
}
emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
printk("emulation failed but !mmio_needed?"
" rip %lx %02x %02x %02x %02x\n",
rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
reported = 1;
print_func_exit();
}
struct x86_emulate_ops emulate_ops = {
.read_std = emulator_read_std,
.write_std = emulator_write_std,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
};
enum emulation_result {
EMULATE_DONE, /* no further processing */
EMULATE_DO_MMIO, /* litevm_run filled with mmio request */
EMULATE_FAIL, /* can't emulate this instruction */
};
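/* Run the x86 emulator on the instruction at the current guest RIP.  Fills in
 * the mmio fields of @run when the access must be completed by user space. */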
static int emulate_instruction(struct litevm_vcpu *vcpu,
struct litevm_run *run,
unsigned long cr2, uint16_t error_code)
{
print_func_entry();
struct x86_emulate_ctxt emulate_ctxt;
int r;
uint32_t cs_ar;
vcpu_load_rsp_rip(vcpu);
cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
emulate_ctxt.vcpu = vcpu;
emulate_ctxt.eflags = vmcs_readl(GUEST_RFLAGS);
emulate_ctxt.cr2 = cr2;
emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
? X86EMUL_MODE_REAL : (cs_ar & AR_L_MASK)
? X86EMUL_MODE_PROT64 : (cs_ar & AR_DB_MASK)
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
emulate_ctxt.cs_base = 0;
emulate_ctxt.ds_base = 0;
emulate_ctxt.es_base = 0;
emulate_ctxt.ss_base = 0;
emulate_ctxt.gs_base = 0;
emulate_ctxt.fs_base = 0;
} else {
emulate_ctxt.cs_base = vmcs_readl(GUEST_CS_BASE);
emulate_ctxt.ds_base = vmcs_readl(GUEST_DS_BASE);
emulate_ctxt.es_base = vmcs_readl(GUEST_ES_BASE);
emulate_ctxt.ss_base = vmcs_readl(GUEST_SS_BASE);
emulate_ctxt.gs_base = vmcs_readl(GUEST_GS_BASE);
emulate_ctxt.fs_base = vmcs_readl(GUEST_FS_BASE);
}
vcpu->mmio_is_write = 0;
r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
if ((r || vcpu->mmio_is_write) && run) {
run->mmio.phys_addr = vcpu->mmio_phys_addr;
memcpy(run->mmio.data, vcpu->mmio_data, 8);
run->mmio.len = vcpu->mmio_size;
run->mmio.is_write = vcpu->mmio_is_write;
}
if (r) {
if (!vcpu->mmio_needed) {
report_emulation_failure(&emulate_ctxt);
print_func_exit();
return EMULATE_FAIL;
}
print_func_exit();
return EMULATE_DO_MMIO;
}
vcpu_put_rsp_rip(vcpu);
vmcs_writel(GUEST_RFLAGS, emulate_ctxt.eflags);
if (vcpu->mmio_is_write) {
print_func_exit();
return EMULATE_DO_MMIO;
}
print_func_exit();
return EMULATE_DONE;
}
static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
{
print_func_entry();
print_func_exit();
return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}
void realmode_lgdt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
{
print_func_entry();
vmcs_writel(GUEST_GDTR_BASE, base);
vmcs_write32(GUEST_GDTR_LIMIT, limit);
print_func_exit();
}
void realmode_lidt(struct litevm_vcpu *vcpu, uint16_t limit, unsigned long base)
{
print_func_entry();
vmcs_writel(GUEST_IDTR_BASE, base);
vmcs_write32(GUEST_IDTR_LIMIT, limit);
print_func_exit();
}
void realmode_lmsw(struct litevm_vcpu *vcpu, unsigned long msw,
unsigned long *rflags)
{
print_func_entry();
lmsw(vcpu, msw);
*rflags = vmcs_readl(GUEST_RFLAGS);
print_func_exit();
}
unsigned long realmode_get_cr(struct litevm_vcpu *vcpu, int cr)
{
print_func_entry();
switch (cr) {
case 0:
print_func_exit();
return guest_cr0();
case 2:
print_func_exit();
return vcpu->cr2;
case 3:
print_func_exit();
return vcpu->cr3;
case 4:
print_func_exit();
return guest_cr4();
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
print_func_exit();
return 0;
}
}
void realmode_set_cr(struct litevm_vcpu *vcpu, int cr, unsigned long val,
unsigned long *rflags)
{
print_func_entry();
switch (cr) {
case 0:
set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
*rflags = vmcs_readl(GUEST_RFLAGS);
break;
case 2:
vcpu->cr2 = val;
break;
case 3:
set_cr3(vcpu, val);
break;
case 4:
set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
break;
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
}
print_func_exit();
}
static int handle_rmode_exception(struct litevm_vcpu *vcpu,
int vec, uint32_t err_code)
{
print_func_entry();
if (!vcpu->rmode.active) {
print_func_exit();
return 0;
}
if (vec == GP_VECTOR && err_code == 0)
if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) {
print_func_exit();
return 1;
}
print_func_exit();
return 0;
}
static int handle_exception(struct litevm_vcpu *vcpu,
struct litevm_run *litevm_run)
{
print_func_entry();
uint32_t intr_info, error_code;
unsigned long cr2, rip;
uint32_t vect_info;
enum emulation_result er;
vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
printk("vect_info %x intro_info %x\n", vect_info, intr_info);
printk("page fault? %d\n", is_page_fault(intr_info));
if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) {
printk("%s: unexpected, vectoring info 0x%x "
"intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
}
if (is_external_interrupt(vect_info)) {
printk("extern interrupt\n");
int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_pending), irq);
SET_BITMASK_BIT_ATOMIC(((uint8_t *) & vcpu->irq_summary),
irq / BITS_PER_LONG);
}
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
printk("nmi\n");
asm("int $2");
print_func_exit();
return 1;
}
error_code = 0;
rip = vmcs_readl(GUEST_RIP);
printk("GUEST_RIP %x\n", rip);
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
printk("PAGE FAULT!\n");
cr2 = vmcs_readl(EXIT_QUALIFICATION);
SPLL(&vcpu->litevm->lock);
if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
SPLU(&vcpu->litevm->lock);
print_func_exit();
return 1;
}
er = emulate_instruction(vcpu, litevm_run, cr2, error_code);
SPLU(&vcpu->litevm->lock);
switch (er) {
case EMULATE_DONE:
print_func_exit();
return 1;
case EMULATE_DO_MMIO:
++litevm_stat.mmio_exits;
litevm_run->exit_reason = LITEVM_EXIT_MMIO;
print_func_exit();
return 0;
case EMULATE_FAIL:
vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
break;
default:
assert(0);
}
}
if (vcpu->rmode.active &&
handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
error_code)) {
printk("RMODE EXCEPTION might have been handled\n");
print_func_exit();
return 1;
}
if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
(INTR_TYPE_EXCEPTION | 1)) {
litevm_run->exit_reason = LITEVM_EXIT_DEBUG;
print_func_exit();
return 0;
}
litevm_run->exit_reason = LITEVM_EXIT_EXCEPTION;
litevm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
litevm_run->ex.error_code = error_code;
print_func_exit();
return 0;
}
static int handle_external_interrupt(struct litevm_vcpu *vcpu,
struct litevm_run *litevm_run)
{
//print_func_entry();
++litevm_stat.irq_exits;
//print_func_exit();
return 1;
}
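/* For a string I/O instruction, scan the opcode prefixes at the guest RIP to
 * determine the effective address size, then read the repeat count from RCX
 * masked to that width.  Returns 0 if the count could not be determined. */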
static int get_io_count(struct litevm_vcpu *vcpu, uint64_t * count)
{
print_func_entry();
uint64_t inst;
gva_t rip;
int countr_size;
int i, n;
if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
countr_size = 2;
} else {
uint32_t cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
countr_size = (cs_ar & AR_L_MASK) ? 8 : (cs_ar & AR_DB_MASK) ? 4 : 2;
}
rip = vmcs_readl(GUEST_RIP);
if (countr_size != 8)
rip += vmcs_readl(GUEST_CS_BASE);
n = litevm_read_guest(vcpu, rip, sizeof(inst), &inst);
for (i = 0; i < n; i++) {
switch (((uint8_t *) & inst)[i]) {
case 0xf0:
case 0xf2:
case 0xf3:
case 0x2e:
case 0x36:
case 0x3e:
case 0x26:
case 0x64:
case 0x65:
case 0x66:
break;
case 0x67:
countr_size = (countr_size == 2) ? 4 : (countr_size >> 1);
default:
goto done;
}
}
print_func_exit();
return 0;
done:
countr_size *= 8;
*count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
print_func_exit();
return 1;
}
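/* Decode an I/O exit qualification into the litevm_run io request (direction,
 * size, port, string/rep info) for user space to complete. */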
static int handle_io(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
{
print_func_entry();
uint64_t exit_qualification;
++litevm_stat.io_exits;
exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
litevm_run->exit_reason = LITEVM_EXIT_IO;
if (exit_qualification & 8)
litevm_run->io.direction = LITEVM_EXIT_IO_IN;
else
litevm_run->io.direction = LITEVM_EXIT_IO_OUT;
litevm_run->io.size = (exit_qualification & 7) + 1;
litevm_run->io.string = (exit_qualification & 16) != 0;
litevm_run->io.string_down
= (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
litevm_run->io.rep = (exit_qualification & 32) != 0;
litevm_run->io.port = exit_qualification >> 16;
if (litevm_run->io.string) {
if (!get_io_count(vcpu, &litevm_run->io.count)) {
print_func_exit();
return 1;
}
litevm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
} else
litevm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
print_func_exit();
return 0;
}
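/*
 * INVLPG: invalidate the shadow-MMU mapping for the guest-supplied address
 * and advance the guest RIP past the instruction.
 */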
static int handle_invlpg(struct litevm_vcpu *vcpu,
struct litevm_run *litevm_run)
{
print_func_entry();
uint64_t address = vmcs_read64(EXIT_QUALIFICATION);
int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
SPLL(&vcpu->litevm->lock);
vcpu->mmu.inval_page(vcpu, address);
SPLU(&vcpu->litevm->lock);
vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
print_func_exit();
return 1;
}
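/*
 * Control-register access exits.  The exit qualification gives the CR
 * number, the general-purpose register involved, and whether this is a
 * mov to/from CR or an lmsw; unhandled combinations are punted to userspace.
 */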
static int handle_cr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
{
print_func_entry();
uint64_t exit_qualification;
int cr;
int reg;
#ifdef LITEVM_DEBUG
if (guest_cpl() != 0) {
vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
inject_gp(vcpu);
print_func_exit();
return 1;
}
#endif
exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
cr = exit_qualification & 15;
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
switch (cr) {
case 0:
vcpu_load_rsp_rip(vcpu);
set_cr0(vcpu, vcpu->regs[reg]);
skip_emulated_instruction(vcpu);
print_func_exit();
return 1;
case 3:
vcpu_load_rsp_rip(vcpu);
set_cr3(vcpu, vcpu->regs[reg]);
skip_emulated_instruction(vcpu);
print_func_exit();
return 1;
case 4:
vcpu_load_rsp_rip(vcpu);
set_cr4(vcpu, vcpu->regs[reg]);
skip_emulated_instruction(vcpu);
print_func_exit();
return 1;
case 8:
vcpu_load_rsp_rip(vcpu);
set_cr8(vcpu, vcpu->regs[reg]);
skip_emulated_instruction(vcpu);
print_func_exit();
return 1;
};
break;
case 1: /*mov from cr */
switch (cr) {
case 3:
vcpu_load_rsp_rip(vcpu);
vcpu->regs[reg] = vcpu->cr3;
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
print_func_exit();
return 1;
case 8:
printd("handle_cr: read CR8 " "cpu erratum AA15\n");
vcpu_load_rsp_rip(vcpu);
vcpu->regs[reg] = vcpu->cr8;
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
print_func_exit();
return 1;
}
break;
case 3: /* lmsw */
lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
skip_emulated_instruction(vcpu);
print_func_exit();
return 1;
default:
break;
}
litevm_run->exit_reason = 0;
printk("litevm: unhandled control register: op %d cr %d\n",
(int)(exit_qualification >> 4) & 3, cr);
print_func_exit();
return 0;
}
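/*
 * Debug-register access exits.  Reads return benign architectural reset
 * values (DR6/DR7); writes are currently ignored.  See the FIXME below
 * about guests that debug themselves.
 */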
static int handle_dr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
{
print_func_entry();
uint64_t exit_qualification;
unsigned long val;
int dr, reg;
/*
* FIXME: this code assumes the host is debugging the guest.
* need to deal with guest debugging itself too.
*/
exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
dr = exit_qualification & 7;
reg = (exit_qualification >> 8) & 15;
vcpu_load_rsp_rip(vcpu);
if (exit_qualification & 16) {
/* mov from dr */
switch (dr) {
case 6:
val = 0xffff0ff0;
break;
case 7:
val = 0x400;
break;
default:
val = 0;
}
vcpu->regs[reg] = val;
} else {
/* mov to dr */
}
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
print_func_exit();
return 1;
}
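/* CPUID exits are forwarded to userspace for emulation. */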
static int handle_cpuid(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
{
print_func_entry();
litevm_run->exit_reason = LITEVM_EXIT_CPUID;
print_func_exit();
return 0;
}
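/*
 * RDMSR: satisfy reads from VMCS guest state where possible, from the
 * vcpu's saved MSR array otherwise; unknown MSRs inject #GP.
 */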
static int handle_rdmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
{
print_func_entry();
uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
uint64_t data;
if (guest_cpl() != 0) {
vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
inject_gp(vcpu);
print_func_exit();
return 1;
}
switch (ecx) {
case MSR_FS_BASE:
data = vmcs_readl(GUEST_FS_BASE);
break;
case MSR_GS_BASE:
data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_IA32_SYSENTER_CS:
data = vmcs_read32(GUEST_SYSENTER_CS);
break;
case MSR_IA32_SYSENTER_EIP:
data = vmcs_read32(GUEST_SYSENTER_EIP);
break;
case MSR_IA32_SYSENTER_ESP:
data = vmcs_read32(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_MC0_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MCG_CAP:
case MSR_IA32_MC0_MISC:
case MSR_IA32_MC0_MISC + 4:
case MSR_IA32_MC0_MISC + 8:
case MSR_IA32_MC0_MISC + 12:
case MSR_IA32_MC0_MISC + 16:
case MSR_IA32_UCODE_REV:
/* MTRR registers */
case 0xfe:
case 0x200 ... 0x2ff:
data = 0;
break;
case MSR_IA32_APICBASE:
data = vcpu->apic_base;
break;
default:
if (msr) {
data = msr->value;
break;
}
printk("litevm: unhandled rdmsr: %x\n", ecx);
inject_gp(vcpu);
print_func_exit();
return 1;
}
/* FIXME: handling of bits 32:63 of rax, rdx */
vcpu->regs[VCPU_REGS_RAX] = data & -1u;
vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
skip_emulated_instruction(vcpu);
print_func_exit();
return 1;
}
#ifdef __x86_64__
static void set_efer(struct litevm_vcpu *vcpu, uint64_t efer)
{
print_func_entry();
struct vmx_msr_entry *msr;
if (efer & EFER_RESERVED_BITS) {
printd("set_efer: 0x%llx #GP, reserved bits\n", efer);
inject_gp(vcpu);
print_func_exit();
return;
}
if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
printd("set_efer: #GP, change LME while paging\n");
inject_gp(vcpu);
print_func_exit();
return;
}
efer &= ~EFER_LMA;
efer |= vcpu->shadow_efer & EFER_LMA;
vcpu->shadow_efer = efer;
msr = find_msr_entry(vcpu, MSR_EFER);
if (!(efer & EFER_LMA))
efer &= ~EFER_LME;
msr->value = efer;
skip_emulated_instruction(vcpu);
print_func_exit();
}
#endif
#define MSR_IA32_TIME_STAMP_COUNTER 0x10
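/*
 * WRMSR: mirror writes into VMCS guest state or the vcpu's saved MSR array.
 * EFER and the TSC get special treatment; unknown MSRs inject #GP.
 */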
static int handle_wrmsr(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
{
print_func_entry();
uint32_t ecx = vcpu->regs[VCPU_REGS_RCX];
struct vmx_msr_entry *msr;
uint64_t data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
| ((uint64_t) (vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
if (guest_cpl() != 0) {
vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
inject_gp(vcpu);
print_func_exit();
return 1;
}
switch (ecx) {
case MSR_FS_BASE:
vmcs_writel(GUEST_FS_BASE, data);
break;
case MSR_GS_BASE:
vmcs_writel(GUEST_GS_BASE, data);
break;
case MSR_IA32_SYSENTER_CS:
vmcs_write32(GUEST_SYSENTER_CS, data);
break;
case MSR_IA32_SYSENTER_EIP:
vmcs_write32(GUEST_SYSENTER_EIP, data);
break;
case MSR_IA32_SYSENTER_ESP:
vmcs_write32(GUEST_SYSENTER_ESP, data);
break;
case MSR_EFER:
set_efer(vcpu, data);
print_func_exit();
return 1;
case MSR_IA32_MC0_STATUS:
printk("%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data);
break;
case MSR_IA32_TIME_STAMP_COUNTER:{
uint64_t tsc;
tsc = read_tsc();
vmcs_write64(TSC_OFFSET, data - tsc);
break;
}
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
case 0x200 ... 0x2ff: /* MTRRs */
break;
case MSR_IA32_APICBASE:
vcpu->apic_base = data;
break;
default:
msr = find_msr_entry(vcpu, ecx);
if (msr) {
msr->value = data;
break;
}
printk("litevm: unhandled wrmsr: %x\n", ecx);
inject_gp(vcpu);
print_func_exit();
return 1;
}
skip_emulated_instruction(vcpu);
print_func_exit();
return 1;
}
static int handle_interrupt_window(struct litevm_vcpu *vcpu,
struct litevm_run *litevm_run)
{
print_func_entry();
/* Turn off interrupt window reporting. */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
& ~CPU_BASED_VIRTUAL_INTR_PENDING);
print_func_exit();
return 1;
}
static int handle_halt(struct litevm_vcpu *vcpu, struct litevm_run *litevm_run)
{
print_func_entry();
skip_emulated_instruction(vcpu);
if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) {
print_func_exit();
return 1;
}
litevm_run->exit_reason = LITEVM_EXIT_HLT;
print_func_exit();
return 0;
}
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the litevm_run parameter to indicate what needs
* to be done to userspace and return 0.
*/
static int (*litevm_vmx_exit_handlers[]) (struct litevm_vcpu * vcpu,
struct litevm_run * litevm_run) = {
[EXIT_REASON_EXCEPTION_NMI] = handle_exception,
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
[EXIT_REASON_CPUID] = handle_cpuid,
[EXIT_REASON_MSR_READ] = handle_rdmsr,
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
};
static const int litevm_vmx_max_exit_handlers =
sizeof(litevm_vmx_exit_handlers) / sizeof(*litevm_vmx_exit_handlers);
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
static int litevm_handle_exit(struct litevm_run *litevm_run,
struct litevm_vcpu *vcpu)
{
//print_func_entry();
uint32_t vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
uint32_t exit_reason = vmcs_read32(VM_EXIT_REASON);
//printk("vectoring_info %08x exit_reason %x\n", vectoring_info, exit_reason);
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
exit_reason != EXIT_REASON_EXCEPTION_NMI)
printk("%s: unexpected, valid vectoring info and "
"exit reason is 0x%x\n", __FUNCTION__, exit_reason);
litevm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
if (exit_reason < litevm_vmx_max_exit_handlers
&& litevm_vmx_exit_handlers[exit_reason]) {
//printk("reason is KNOWN\n");
//print_func_exit();
return litevm_vmx_exit_handlers[exit_reason] (vcpu, litevm_run);
} else {
printk("reason is UNKNOWN\n");
litevm_run->exit_reason = LITEVM_EXIT_UNKNOWN;
litevm_run->hw.hardware_exit_reason = exit_reason;
}
//print_func_exit();
return 0;
}
static void inject_rmode_irq(struct litevm_vcpu *vcpu, int irq)
{
print_func_entry();
uint16_t ent[2];
uint16_t cs;
uint16_t ip;
unsigned long flags;
unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
uint16_t sp = vmcs_readl(GUEST_RSP);
uint32_t ss_limit = vmcs_read32(GUEST_SS_LIMIT);
/* This is the 'does it wrap' test. */
/* This original test elicited complaints from the C compiler.
* It's a bit too Klever for me.
if (sp > ss_limit || ((sp - 6) > sp)) {
*/
if (sp > ss_limit || (sp < 6)) {
vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
__FUNCTION__,
vmcs_readl(GUEST_RSP),
vmcs_readl(GUEST_SS_BASE), vmcs_read32(GUEST_SS_LIMIT));
print_func_exit();
return;
}
if (litevm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
sizeof(ent)) {
//vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
print_func_exit();
return;
}
flags = vmcs_readl(GUEST_RFLAGS);
cs = vmcs_readl(GUEST_CS_BASE) >> 4;
ip = vmcs_readl(GUEST_RIP);
if (litevm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
litevm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
litevm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
//vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
print_func_exit();
return;
}
vmcs_writel(GUEST_RFLAGS, flags &
~(X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
vmcs_writel(GUEST_RIP, ent[0]);
vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
print_func_exit();
}
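/*
 * Pick the lowest pending IRQ, clear its pending bits, and inject it either
 * via the real-mode path above or through the VM-entry interruption field.
 */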
static void litevm_do_inject_irq(struct litevm_vcpu *vcpu)
{
print_func_entry();
int word_index = __ffs(vcpu->irq_summary);
int bit_index = __ffs(vcpu->irq_pending[word_index]);
int irq = word_index * BITS_PER_LONG + bit_index;
/* don't have clear_bit and I'm not sure the akaros
* bitops are really going to work.
*/
vcpu->irq_pending[word_index] &= ~(1 << bit_index);
if (!vcpu->irq_pending[word_index])
vcpu->irq_summary &= ~(1 << word_index);
if (vcpu->rmode.active) {
inject_rmode_irq(vcpu, irq);
print_func_exit();
return;
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
print_func_exit();
}
static void litevm_try_inject_irq(struct litevm_vcpu *vcpu)
{
print_func_entry();
if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
&& (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
/*
* Interrupts enabled, and not blocked by sti or mov ss. Good.
*/
litevm_do_inject_irq(vcpu);
else
/*
* Interrupts blocked. Wait for unblock.
*/
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
| CPU_BASED_VIRTUAL_INTR_PENDING);
print_func_exit();
}
static void litevm_guest_debug_pre(struct litevm_vcpu *vcpu)
{
print_func_entry();
struct litevm_guest_debug *dbg = &vcpu->guest_debug;
/*
set_debugreg(dbg->bp[0], 0);
set_debugreg(dbg->bp[1], 1);
set_debugreg(dbg->bp[2], 2);
set_debugreg(dbg->bp[3], 3);
*/
if (dbg->singlestep) {
unsigned long flags;
flags = vmcs_readl(GUEST_RFLAGS);
flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
vmcs_writel(GUEST_RFLAGS, flags);
}
print_func_exit();
}
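/*
 * Bulk MSR save/restore helpers, used around VM entry for the MSRs the CPU
 * does not switch automatically.
 */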
static void load_msrs(struct vmx_msr_entry *e, int n)
{
//print_func_entry();
int i;
if (!e) {
printk("LOAD MSR WITH NULL POINTER?\n");
error("LOAD MSR WITH NULL POINTER?");
}
for (i = 0; i < n; ++i) {
//printk("Load MSR (%lx), with %lx\n", e[i].index, e[i].data);
write_msr(e[i].index, e[i].value);
//printk("Done\n");
}
//print_func_exit();
}
static void save_msrs(struct vmx_msr_entry *e, int n)
{
//print_func_entry();
int i;
for (i = 0; i < n; ++i)
e[i].value = read_msr(e[i].index);
//print_func_exit();
}
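/*
 * Main run loop: load the vcpu, set up host segment and MSR state, inject
 * any pending interrupt, then vmlaunch/vmresume via the asm block below.
 * On exit we restore host state and either loop back for another entry or
 * return to userspace with the exit reason filled in.
 */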
int vm_run(struct litevm *litevm, struct litevm_run *litevm_run)
{
print_func_entry();
struct litevm_vcpu *vcpu;
uint8_t fail;
uint16_t fs_sel, gs_sel, ldt_sel;
int fs_gs_ldt_reload_needed;
if (litevm_run->vcpu < 0 || litevm_run->vcpu >= LITEVM_MAX_VCPUS)
error("vcpu is %d but must be in the range %d..%d\n",
litevm_run->vcpu, LITEVM_MAX_VCPUS);
vcpu = vcpu_load(litevm, litevm_run->vcpu);
if (!vcpu)
error("vcpu_load failed");
printk("Loaded\n");
if (litevm_run->emulated) {
skip_emulated_instruction(vcpu);
litevm_run->emulated = 0;
}
printk("Emulated\n");
if (litevm_run->mmio_completed) {
memcpy(vcpu->mmio_data, litevm_run->mmio.data, 8);
vcpu->mmio_read_completed = 1;
}
printk("mmio completed\n");
vcpu->mmio_needed = 0;
again:
/*
* Set host fs and gs selectors. Unfortunately, 22.2.3 does not
* allow segment selectors with cpl > 0 or ti == 1.
*/
fs_sel = read_fs();
//printk("fs_sel %x\n", fs_sel);
gs_sel = read_gs();
//printk("gs_sel %x\n", gs_sel);
ldt_sel = read_ldt();
//printk("ldt_sel %x\n", ldt_sel);
fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
if (!fs_gs_ldt_reload_needed) {
vmcs_write16(HOST_FS_SELECTOR, fs_sel);
vmcs_write16(HOST_GS_SELECTOR, gs_sel);
} else {
vmcs_write16(HOST_FS_SELECTOR, 0);
vmcs_write16(HOST_GS_SELECTOR, 0);
}
//printk("reloaded gs and gs\n");
#ifdef __x86_64__
vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
//printk("Set FS_BASE and GS_BASE");
#endif
if (vcpu->irq_summary &&
!(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
litevm_try_inject_irq(vcpu);
if (vcpu->guest_debug.enabled)
litevm_guest_debug_pre(vcpu);
fx_save(vcpu->host_fx_image);
fx_restore(vcpu->guest_fx_image);
save_msrs(vcpu->host_msrs, vcpu->nmsrs);
load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
printk("GO FOR IT! %08lx\n", vmcs_readl(GUEST_RIP));
asm(
/* Store host registers */
"pushf \n\t"
#ifdef __x86_64__
"push %%rax; push %%rbx; push %%rdx;"
"push %%rsi; push %%rdi; push %%rbp;"
"push %%r8; push %%r9; push %%r10; push %%r11;"
"push %%r12; push %%r13; push %%r14; push %%r15;"
"push %%rcx \n\t" "vmwrite %%rsp, %2 \n\t"
#else
"pusha; push %%ecx \n\t" "vmwrite %%esp, %2 \n\t"
#endif
/* Check if vmlaunch or vmresume is needed */
"cmp $0, %1 \n\t"
/* Load guest registers. Don't clobber flags. */
#ifdef __x86_64__
"mov %c[cr2](%3), %%rax \n\t" "mov %%rax, %%cr2 \n\t" "mov %c[rax](%3), %%rax \n\t" "mov %c[rbx](%3), %%rbx \n\t" "mov %c[rdx](%3), %%rdx \n\t" "mov %c[rsi](%3), %%rsi \n\t" "mov %c[rdi](%3), %%rdi \n\t" "mov %c[rbp](%3), %%rbp \n\t" "mov %c[r8](%3), %%r8 \n\t" "mov %c[r9](%3), %%r9 \n\t" "mov %c[r10](%3), %%r10 \n\t" "mov %c[r11](%3), %%r11 \n\t" "mov %c[r12](%3), %%r12 \n\t" "mov %c[r13](%3), %%r13 \n\t" "mov %c[r14](%3), %%r14 \n\t" "mov %c[r15](%3), %%r15 \n\t" "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
#else
"mov %c[cr2](%3), %%eax \n\t" "mov %%eax, %%cr2 \n\t" "mov %c[rax](%3), %%eax \n\t" "mov %c[rbx](%3), %%ebx \n\t" "mov %c[rdx](%3), %%edx \n\t" "mov %c[rsi](%3), %%esi \n\t" "mov %c[rdi](%3), %%edi \n\t" "mov %c[rbp](%3), %%ebp \n\t" "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
#endif
/* Enter guest mode */
"jne launched \n\t"
"vmlaunch \n\t"
"jmp litevm_vmx_return \n\t"
"launched: vmresume \n\t"
".globl litevm_vmx_return \n\t" "litevm_vmx_return: "
/* Save guest registers, load host registers, keep flags */
#ifdef __x86_64__
"xchg %3, 0(%%rsp) \n\t"
"mov %%rax, %c[rax](%3) \n\t"
"mov %%rbx, %c[rbx](%3) \n\t"
"pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
"mov %%rdx, %c[rdx](%3) \n\t"
"mov %%rsi, %c[rsi](%3) \n\t"
"mov %%rdi, %c[rdi](%3) \n\t"
"mov %%rbp, %c[rbp](%3) \n\t"
"mov %%r8, %c[r8](%3) \n\t"
"mov %%r9, %c[r9](%3) \n\t"
"mov %%r10, %c[r10](%3) \n\t"
"mov %%r11, %c[r11](%3) \n\t"
"mov %%r12, %c[r12](%3) \n\t"
"mov %%r13, %c[r13](%3) \n\t"
"mov %%r14, %c[r14](%3) \n\t"
"mov %%r15, %c[r15](%3) \n\t"
"mov %%cr2, %%rax \n\t"
"mov %%rax, %c[cr2](%3) \n\t"
"mov 0(%%rsp), %3 \n\t"
"pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
"pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
"pop %%rbp; pop %%rdi; pop %%rsi;"
"pop %%rdx; pop %%rbx; pop %%rax \n\t"
#else
"xchg %3, 0(%%esp) \n\t"
"mov %%eax, %c[rax](%3) \n\t"
"mov %%ebx, %c[rbx](%3) \n\t"
"pushl 0(%%esp); popl %c[rcx](%3) \n\t"
"mov %%edx, %c[rdx](%3) \n\t"
"mov %%esi, %c[rsi](%3) \n\t"
"mov %%edi, %c[rdi](%3) \n\t"
"mov %%ebp, %c[rbp](%3) \n\t"
"mov %%cr2, %%eax \n\t"
"mov %%eax, %c[cr2](%3) \n\t"
"mov 0(%%esp), %3 \n\t" "pop %%ecx; popa \n\t"
#endif
"setbe %0 \n\t" "popf \n\t":"=g"(fail)
: "r"(vcpu->launched), "r"((unsigned long)HOST_RSP),
"c"(vcpu),
[rax] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RAX])),
[rbx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBX])),
[rcx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RCX])),
[rdx] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDX])),
[rsi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RSI])),
[rdi] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RDI])),
[rbp] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_RBP])),
#ifdef __x86_64__
[r8] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R8])),
[r9] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R9])),
[r10] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R10])),
[r11] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R11])),
[r12] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R12])),
[r13] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R13])),
[r14] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R14])),
[r15] "i"(offsetof(struct litevm_vcpu, regs[VCPU_REGS_R15])),
#endif
[cr2] "i"(offsetof(struct litevm_vcpu, cr2))
:"cc", "memory");
++litevm_stat.exits;
printk("vm_run exits! %08lx flags %08lx\n", vmcs_readl(GUEST_RIP),
vmcs_readl(GUEST_RFLAGS));
save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
fx_save(vcpu->guest_fx_image);
fx_restore(vcpu->host_fx_image);
#ifndef __x86_64__
asm("mov %0, %%ds; mov %0, %%es": :"r"(__USER_DS));
#endif
litevm_run->exit_type = 0;
if (fail) {
printk("FAIL\n");
litevm_run->exit_type = LITEVM_EXIT_TYPE_FAIL_ENTRY;
litevm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
printk("reason %d\n", litevm_run->exit_reason);
} else {
printk("NOT FAIL\n");
if (fs_gs_ldt_reload_needed) {
load_ldt(ldt_sel);
load_fs(fs_sel);
/*
* If we have to reload gs, we must take care to
* preserve our gs base.
*/
disable_irq();
load_gs(gs_sel);
#ifdef __x86_64__
write_msr(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
#endif
enable_irq();
reload_tss();
}
vcpu->launched = 1;
litevm_run->exit_type = LITEVM_EXIT_TYPE_VM_EXIT;
//printk("Let's see why it exited\n");
if (litevm_handle_exit(litevm_run, vcpu)) {
/* Give the scheduler a chance to reschedule. */
#if 0
vcpu_put(vcpu);
#warning "how to tell if signal is pending"
/*
if (signal_pending(current)) {
++litevm_stat.signal_exits;
return -EINTR;
}
*/
/* consider getting rid of this for now.
Maybe it is just breaking things. */
kthread_yield();
/* Cannot fail - no vcpu unplug yet. */
vcpu_load(litevm, vcpu_slot(vcpu));
#endif
monitor(NULL);
goto again;
}
}
done:
printk("vm_run exits! %08lx flags %08lx\n", vmcs_readl(GUEST_RIP),
vmcs_readl(GUEST_RFLAGS));
vcpu_put(vcpu);
printk("vm_run returns\n");
print_func_exit();
return 0;
}
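/*
 * Copy the general-purpose register state out of the vcpu (and RSP, RIP,
 * RFLAGS out of the VMCS) for userspace, masking off TF/RF if they were
 * only set for guest single-stepping.
 */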
static int litevm_dev_ioctl_get_regs(struct litevm *litevm,
struct litevm_regs *regs)
{
print_func_entry();
struct litevm_vcpu *vcpu;
if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
print_func_exit();
return -EINVAL;
}
vcpu = vcpu_load(litevm, regs->vcpu);
if (!vcpu) {
print_func_exit();
return -ENOENT;
}
regs->rax = vcpu->regs[VCPU_REGS_RAX];
regs->rbx = vcpu->regs[VCPU_REGS_RBX];
regs->rcx = vcpu->regs[VCPU_REGS_RCX];
regs->rdx = vcpu->regs[VCPU_REGS_RDX];
regs->rsi = vcpu->regs[VCPU_REGS_RSI];
regs->rdi = vcpu->regs[VCPU_REGS_RDI];
regs->rsp = vmcs_readl(GUEST_RSP);
regs->rbp = vcpu->regs[VCPU_REGS_RBP];
#ifdef __x86_64__
regs->r8 = vcpu->regs[VCPU_REGS_R8];
regs->r9 = vcpu->regs[VCPU_REGS_R9];
regs->r10 = vcpu->regs[VCPU_REGS_R10];
regs->r11 = vcpu->regs[VCPU_REGS_R11];
regs->r12 = vcpu->regs[VCPU_REGS_R12];
regs->r13 = vcpu->regs[VCPU_REGS_R13];
regs->r14 = vcpu->regs[VCPU_REGS_R14];
regs->r15 = vcpu->regs[VCPU_REGS_R15];
#endif
regs->rip = vmcs_readl(GUEST_RIP);
regs->rflags = vmcs_readl(GUEST_RFLAGS);
/*
* Don't leak debug flags in case they were set for guest debugging
*/
if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
vcpu_put(vcpu);
print_func_exit();
return 0;
}
static int litevm_dev_ioctl_set_regs(struct litevm *litevm,
struct litevm_regs *regs)
{
print_func_entry();
struct litevm_vcpu *vcpu;
if (regs->vcpu < 0 || regs->vcpu >= LITEVM_MAX_VCPUS) {
print_func_exit();
return -EINVAL;
}
vcpu = vcpu_load(litevm, regs->vcpu);
if (!vcpu) {
print_func_exit();
return -ENOENT;
}
vcpu->regs[VCPU_REGS_RAX] = regs->rax;
vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
vmcs_writel(GUEST_RSP, regs->rsp);
vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
#ifdef __x86_64__
vcpu->regs[VCPU_REGS_R8] = regs->r8;
vcpu->regs[VCPU_REGS_R9] = regs->r9;
vcpu->regs[VCPU_REGS_R10] = regs->r10;
vcpu->regs[VCPU_REGS_R11] = regs->r11;
vcpu->regs[VCPU_REGS_R12] = regs->r12;
vcpu->regs[VCPU_REGS_R13] = regs->r13;
vcpu->regs[VCPU_REGS_R14] = regs->r14;
vcpu->regs[VCPU_REGS_R15] = regs->r15;
#endif
vmcs_writel(GUEST_RIP, regs->rip);
vmcs_writel(GUEST_RFLAGS, regs->rflags);
vcpu_put(vcpu);
print_func_exit();
return 0;
}
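/*
 * Read segment registers, descriptor tables, and control/EFER/APIC state
 * out of the VMCS and vcpu; the access-rights byte is unpacked into the
 * discrete fields of each segment.
 */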
static int litevm_dev_ioctl_get_sregs(struct litevm *litevm,
struct litevm_sregs *sregs)
{
print_func_entry();
struct litevm_vcpu *vcpu;
if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
print_func_exit();
return -EINVAL;
}
vcpu = vcpu_load(litevm, sregs->vcpu);
if (!vcpu) {
print_func_exit();
return -ENOENT;
}
#define get_segment(var, seg) \
do { \
uint32_t ar; \
\
sregs->var.base = vmcs_readl(GUEST_##seg##_BASE); \
sregs->var.limit = vmcs_read32(GUEST_##seg##_LIMIT); \
sregs->var.selector = vmcs_read16(GUEST_##seg##_SELECTOR); \
ar = vmcs_read32(GUEST_##seg##_AR_BYTES); \
if (ar & AR_UNUSABLE_MASK) ar = 0; \
sregs->var.type = ar & 15; \
sregs->var.s = (ar >> 4) & 1; \
sregs->var.dpl = (ar >> 5) & 3; \
sregs->var.present = (ar >> 7) & 1; \
sregs->var.avl = (ar >> 12) & 1; \
sregs->var.l = (ar >> 13) & 1; \
sregs->var.db = (ar >> 14) & 1; \
sregs->var.g = (ar >> 15) & 1; \
sregs->var.unusable = (ar >> 16) & 1; \
} while (0);
get_segment(cs, CS);
get_segment(ds, DS);
get_segment(es, ES);
get_segment(fs, FS);
get_segment(gs, GS);
get_segment(ss, SS);
get_segment(tr, TR);
get_segment(ldt, LDTR);
#undef get_segment
#define get_dtable(var, table) \
sregs->var.limit = vmcs_read32(GUEST_##table##_LIMIT), \
sregs->var.base = vmcs_readl(GUEST_##table##_BASE)
get_dtable(idt, IDTR);
get_dtable(gdt, GDTR);
#undef get_dtable
sregs->cr0 = guest_cr0();
sregs->cr2 = vcpu->cr2;
sregs->cr3 = vcpu->cr3;
sregs->cr4 = guest_cr4();
sregs->cr8 = vcpu->cr8;
sregs->efer = vcpu->shadow_efer;
sregs->apic_base = vcpu->apic_base;
sregs->pending_int = vcpu->irq_summary != 0;
vcpu_put(vcpu);
print_func_exit();
return 0;
}
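/*
 * Inverse of get_sregs.  Changes to CR0/CR3/CR4/EFER force a shadow-MMU
 * reset since the guest's paging mode may have changed.
 */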
static int litevm_dev_ioctl_set_sregs(struct litevm *litevm,
struct litevm_sregs *sregs)
{
print_func_entry();
struct litevm_vcpu *vcpu;
int mmu_reset_needed = 0;
if (sregs->vcpu < 0 || sregs->vcpu >= LITEVM_MAX_VCPUS) {
print_func_exit();
return -EINVAL;
}
vcpu = vcpu_load(litevm, sregs->vcpu);
if (!vcpu) {
print_func_exit();
return -ENOENT;
}
#define set_segment(var, seg) \
do { \
uint32_t ar; \
\
vmcs_writel(GUEST_##seg##_BASE, sregs->var.base); \
vmcs_write32(GUEST_##seg##_LIMIT, sregs->var.limit); \
vmcs_write16(GUEST_##seg##_SELECTOR, sregs->var.selector); \
if (sregs->var.unusable) { \
ar = (1 << 16); \
} else { \
ar = (sregs->var.type & 15); \
ar |= (sregs->var.s & 1) << 4; \
ar |= (sregs->var.dpl & 3) << 5; \
ar |= (sregs->var.present & 1) << 7; \
ar |= (sregs->var.avl & 1) << 12; \
ar |= (sregs->var.l & 1) << 13; \
ar |= (sregs->var.db & 1) << 14; \
ar |= (sregs->var.g & 1) << 15; \
} \
vmcs_write32(GUEST_##seg##_AR_BYTES, ar); \
} while (0);
set_segment(cs, CS);
set_segment(ds, DS);
set_segment(es, ES);
set_segment(fs, FS);
set_segment(gs, GS);
set_segment(ss, SS);
set_segment(tr, TR);
set_segment(ldt, LDTR);
#undef set_segment
#define set_dtable(var, table) \
vmcs_write32(GUEST_##table##_LIMIT, sregs->var.limit), \
vmcs_writel(GUEST_##table##_BASE, sregs->var.base)
set_dtable(idt, IDTR);
set_dtable(gdt, GDTR);
#undef set_dtable
vcpu->cr2 = sregs->cr2;
mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
vcpu->cr3 = sregs->cr3;
vcpu->cr8 = sregs->cr8;
mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
#ifdef __x86_64__
__set_efer(vcpu, sregs->efer);
#endif
vcpu->apic_base = sregs->apic_base;
mmu_reset_needed |= guest_cr0() != sregs->cr0;
vcpu->rmode.active = ((sregs->cr0 & CR0_PE_MASK) == 0);
update_exception_bitmap(vcpu);
vmcs_writel(CR0_READ_SHADOW, sregs->cr0);
vmcs_writel(GUEST_CR0, sregs->cr0 | LITEVM_VM_CR0_ALWAYS_ON);
mmu_reset_needed |= guest_cr4() != sregs->cr4;
__set_cr4(vcpu, sregs->cr4);
if (mmu_reset_needed)
litevm_mmu_reset_context(vcpu);
vcpu_put(vcpu);
print_func_exit();
return 0;
}
/*
* Translate a guest virtual address to a guest physical address.
*/
static int litevm_dev_ioctl_translate(struct litevm *litevm,
struct litevm_translation *tr)
{
print_func_entry();
unsigned long vaddr = tr->linear_address;
struct litevm_vcpu *vcpu;
gpa_t gpa;
vcpu = vcpu_load(litevm, tr->vcpu);
if (!vcpu) {
print_func_exit();
return -ENOENT;
}
SPLL(&litevm->lock);
gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
tr->usermode = 0;
SPLU(&litevm->lock);
vcpu_put(vcpu);
print_func_exit();
return 0;
}
#if 0
static int litevm_dev_ioctl_interrupt(struct litevm *litevm,
struct litevm_interrupt *irq)
{
struct litevm_vcpu *vcpu;
if (irq->vcpu < 0 || irq->vcpu >= LITEVM_MAX_VCPUS)
return -EINVAL;
if (irq->irq < 0 || irq->irq >= 256)
return -EINVAL;
vcpu = vcpu_load(litevm, irq->vcpu);
if (!vcpu)
return -ENOENT;
set_bit(irq->irq, vcpu->irq_pending);
set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
vcpu_put(vcpu);
return 0;
}
#endif
#if 0
static int litevm_dev_ioctl_debug_guest(struct litevm *litevm,
struct litevm_debug_guest *dbg)
{
struct litevm_vcpu *vcpu;
unsigned long dr7 = 0x400;
uint32_t exception_bitmap;
int old_singlestep;
if (dbg->vcpu < 0 || dbg->vcpu >= LITEVM_MAX_VCPUS)
return -EINVAL;
vcpu = vcpu_load(litevm, dbg->vcpu);
if (!vcpu)
return -ENOENT;
exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
old_singlestep = vcpu->guest_debug.singlestep;
vcpu->guest_debug.enabled = dbg->enabled;
if (vcpu->guest_debug.enabled) {
int i;
dr7 |= 0x200; /* exact */
for (i = 0; i < 4; ++i) {
if (!dbg->breakpoints[i].enabled)
continue;
vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
dr7 |= 2 << (i * 2); /* global enable */
dr7 |= 0 << (i * 4 + 16); /* execution breakpoint */
}
exception_bitmap |= (1u << 1); /* Trap debug exceptions */
vcpu->guest_debug.singlestep = dbg->singlestep;
} else {
exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
vcpu->guest_debug.singlestep = 0;
}
if (old_singlestep && !vcpu->guest_debug.singlestep) {
unsigned long flags;
flags = vmcs_readl(GUEST_RFLAGS);
flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
vmcs_writel(GUEST_RFLAGS, flags);
}
vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
vmcs_writel(GUEST_DR7, dr7);
vcpu_put(vcpu);
return 0;
}
#endif
#if 0
long litevm_control(struct litevm *litevm, int command, unsigned long arg)
{
int r = -EINVAL;
switch (command) {
case LITEVM_CREATE_VCPU:{
r = create_vcpu(litevm, arg);
if (r)
goto out;
break;
}
case LITEVM_RUN:{
struct litevm_run litevm_run;
r = -EFAULT;
if (copy_from_user(&litevm_run, (void *)arg, sizeof litevm_run))
goto out;
r = litevm_dev_ioctl_run(litevm, &litevm_run);
if (r < 0)
goto out;
r = -EFAULT;
if (copy_to_user((void *)arg, &litevm_run, sizeof litevm_run))
goto out;
r = 0;
break;
}
case LITEVM_GET_REGS:{
struct litevm_regs litevm_regs;
r = -EFAULT;
if (copy_from_user
(&litevm_regs, (void *)arg, sizeof litevm_regs))
goto out;
r = litevm_dev_ioctl_get_regs(litevm, &litevm_regs);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user((void *)arg, &litevm_regs, sizeof litevm_regs))
goto out;
r = 0;
break;
}
case LITEVM_SET_REGS:{
struct litevm_regs litevm_regs;
r = -EFAULT;
if (copy_from_user
(&litevm_regs, (void *)arg, sizeof litevm_regs))
goto out;
r = litevm_dev_ioctl_set_regs(litevm, &litevm_regs);
if (r)
goto out;
r = 0;
break;
}
case LITEVM_GET_SREGS:{
struct litevm_sregs litevm_sregs;
r = -EFAULT;
if (copy_from_user
(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
goto out;
r = litevm_dev_ioctl_get_sregs(litevm, &litevm_sregs);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user
((void *)arg, &litevm_sregs, sizeof litevm_sregs))
goto out;
r = 0;
break;
}
case LITEVM_SET_SREGS:{
struct litevm_sregs litevm_sregs;
r = -EFAULT;
if (copy_from_user
(&litevm_sregs, (void *)arg, sizeof litevm_sregs))
goto out;
r = litevm_dev_ioctl_set_sregs(litevm, &litevm_sregs);
if (r)
goto out;
r = 0;
break;
}
case LITEVM_TRANSLATE:{
struct litevm_translation tr;
r = -EFAULT;
if (copy_from_user(&tr, (void *)arg, sizeof tr))
goto out;
r = litevm_dev_ioctl_translate(litevm, &tr);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user((void *)arg, &tr, sizeof tr))
goto out;
r = 0;
break;
}
case LITEVM_INTERRUPT:{
struct litevm_interrupt irq;
r = -EFAULT;
if (copy_from_user(&irq, (void *)arg, sizeof irq))
goto out;
r = litevm_dev_ioctl_interrupt(litevm, &irq);
if (r)
goto out;
r = 0;
break;
}
case LITEVM_DEBUG_GUEST:{
struct litevm_debug_guest dbg;
r = -EFAULT;
if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
goto out;
r = litevm_dev_ioctl_debug_guest(litevm, &dbg);
if (r)
goto out;
r = 0;
break;
}
case LITEVM_SET_MEMORY_REGION:{
struct litevm_memory_region litevm_mem;
r = -EFAULT;
if (copy_from_user(&litevm_mem, (void *)arg, sizeof litevm_mem))
goto out;
r = litevm_dev_ioctl_set_memory_region(litevm, &litevm_mem);
if (r)
goto out;
break;
}
case LITEVM_GET_DIRTY_LOG:{
struct litevm_dirty_log log;
r = -EFAULT;
if (copy_from_user(&log, (void *)arg, sizeof log))
goto out;
r = litevm_dev_ioctl_get_dirty_log(litevm, &log);
if (r)
goto out;
break;
}
default:
;
}
out:
return r;
}
#endif
#if 0
static int litevm_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct litevm *litevm = vma->vm_file->private_data;
struct litevm_memory_slot *slot;
struct page *page;
slot = gfn_to_memslot(litevm, vmf->pgoff);
if (!slot)
return VM_FAULT_SIGBUS;
page = gfn_to_page(slot, vmf->pgoff);
if (!page)
return VM_FAULT_SIGBUS;
get_page(page);
vmf->page = page;
return 0;
}
#endif
#if 0
static int litevm_reboot(struct notifier_block *notifier, unsigned long val,
void *v)
{
panic("litevm_reboot");
if (val == SYS_RESTART) {
/*
* Some (well, at least mine) BIOSes hang on reboot if
* in vmx root mode.
*/
printk("litevm: exiting vmx mode\n");
handler_wrapper_t *w;
smp_call_function_all(litevm_disable, 0, &w);
smp_call_wait(w);
}
return NOTIFY_OK;
return 0;
}
#endif
hpa_t bad_page_address;
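/*
 * Module init: verify VT-x support, build the VMCS descriptor, enable VMX
 * on every core, and allocate the page whose physical address is exported
 * as bad_page_address.
 */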
int vmx_init(void)
{
print_func_entry();
handler_wrapper_t *w;
int r = 0;
if (!cpu_has_litevm_support()) {
printk("litevm: no hardware support\n");
print_func_exit();
return -EOPNOTSUPP;
}
if (vmx_disabled_by_bios()) {
printk("litevm: disabled by bios\n");
print_func_exit();
return -EOPNOTSUPP;
}
setup_vmcs_descriptor();
smp_call_function_all(vm_enable, 0, &w);
if (smp_call_wait(w)) {
printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
}
if ((bad_page_address = PADDR(kpage_zalloc_addr())) == 0ULL) {
r = -ENOMEM;
}
print_func_exit();
return r;
}
static void litevm_exit(void)
{
print_func_entry();
//free_litevm_area();
//__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
print_func_exit();
}