VMM: Call EPT ops for every KPT op At this point, the EPT should equal the KPT, up to UVPT. I disconnected the EPT fault handler for now. tests/vmmcp still works, since everything is pre-faulted in. We'll need to change it to do the initial faults from the VM.
diff --git a/kern/arch/x86/arch.h b/kern/arch/x86/arch.h index bf6f85e..bee50ea 100644 --- a/kern/arch/x86/arch.h +++ b/kern/arch/x86/arch.h
@@ -14,8 +14,6 @@ #define __always_inline inline __attribute__((always_inline)) static inline void breakpoint(void) __attribute__((always_inline)); -static inline void invlpg(void *addr) __attribute__((always_inline)); -static inline void tlbflush(void) __attribute__((always_inline)); static inline void icache_flush_page(void *va, void *kva) __attribute__((always_inline)); static inline uint64_t read_tsc(void) __attribute__((always_inline)); @@ -42,24 +40,16 @@ void print_cpuinfo(void); void show_mapping(pgdir_t pgdir, uintptr_t start, size_t size); int vendor_id(char *); +/* pmap.c */ +void invlpg(void *addr); +void tlbflush(void); +void tlb_flush_global(void); static inline void breakpoint(void) { asm volatile("int3"); } -static inline void invlpg(void *addr) -{ - asm volatile("invlpg (%0)" : : "r" (addr) : "memory"); -} - -static inline void tlbflush(void) -{ - unsigned long cr3; - asm volatile("mov %%cr3,%0" : "=r" (cr3)); - asm volatile("mov %0,%%cr3" : : "r" (cr3)); -} - static inline void icache_flush_page(void *va, void *kva) { // x86 handles self-modifying code (mostly) without SW support
diff --git a/kern/arch/x86/kpt.h b/kern/arch/x86/kpt.h new file mode 100644 index 0000000..3415418 --- /dev/null +++ b/kern/arch/x86/kpt.h
@@ -0,0 +1,94 @@ +/* Copyright (c) 2015 Google Inc. + * Barret Rhoden <brho@cs.berkeley.edu> + * See LICENSE for details. + * + * 64 bit KPT helpers */ + +#ifndef ROS_ARCH_KPT_H +#define ROS_ARCH_KPT_H + +#include <arch/ros/mmu64.h> + +static inline bool kpte_is_present(kpte_t *kpte) +{ + return *kpte & PTE_P ? TRUE : FALSE; +} + +static inline bool kpte_is_unmapped(kpte_t *kpte) +{ + return *kpte == 0; +} + +static inline bool kpte_is_mapped(kpte_t *kpte) +{ + return *kpte != 0; +} + +static inline bool kpte_is_paged_out(kpte_t *kpte) +{ + return *kpte != 0; +} + +static inline bool kpte_is_dirty(kpte_t *kpte) +{ + return *kpte & PTE_D ? TRUE : FALSE; +} + +static inline bool kpte_is_accessed(kpte_t *kpte) +{ + return *kpte & PTE_A ? TRUE : FALSE; +} + +static inline bool kpte_is_jumbo(kpte_t *kpte) +{ + return *kpte & PTE_PS ? TRUE : FALSE; +} + +static inline physaddr_t kpte_get_paddr(kpte_t *kpte) +{ + return (physaddr_t)*kpte & ~(PGSIZE - 1); +} + +/* Returns the PTE in an unsigned long, for debugging mostly. */ +static inline unsigned long kpte_print(kpte_t *kpte) +{ + return *kpte; +} + +static inline void kpte_write(kpte_t *kpte, physaddr_t pa, int perm) +{ + assert(!PGOFF(pa)); + *kpte = pa | perm; +} + +static inline void kpte_clear_present(kpte_t *kpte) +{ + *kpte &= ~PTE_P; +} + +static inline void kpte_clear(kpte_t *kpte) +{ + *kpte = 0; +} + +static inline bool kpte_has_perm_ur(kpte_t *kpte) +{ + return (*kpte & PTE_USER_RO) == PTE_USER_RO; +} + +static inline bool kpte_has_perm_urw(kpte_t *kpte) +{ + return (*kpte & PTE_USER_RW) == PTE_USER_RW; +} + +static inline int kpte_get_perm(kpte_t *kpte) +{ + return *kpte & PTE_PERM; +} + +static inline void kpte_replace_perm(kpte_t *kpte, int perm) +{ + *kpte = (*kpte & ~PTE_PERM) | perm; +} + +#endif /* ROS_ARCH_KPT_H */
diff --git a/kern/arch/x86/pmap.c b/kern/arch/x86/pmap.c index 2fe9f56..903387c 100644 --- a/kern/arch/x86/pmap.c +++ b/kern/arch/x86/pmap.c
@@ -99,6 +99,22 @@ enable_irqsave(&state); } +void invlpg(void *addr) +{ + asm volatile("invlpg (%0)" : : "r" (addr) : "memory"); + if (per_cpu_info[core_id()].vmx_enabled) + ept_inval_addr((uintptr_t)addr); +} + +void tlbflush(void) +{ + unsigned long cr3; + asm volatile("mov %%cr3,%0" : "=r" (cr3)); + asm volatile("mov %0,%%cr3" : : "r" (cr3)); + if (per_cpu_info[core_id()].vmx_enabled) + ept_inval_context(); +} + /* Flushes a TLB, including global pages. We should always have the CR4_PGE * flag set, but just in case, we'll check. Toggling this bit flushes the TLB. */ @@ -108,6 +124,9 @@ if (cr4 & CR4_PGE) { lcr4(cr4 & ~CR4_PGE); lcr4(cr4); - } else + } else { lcr3(rcr3()); + } + if (per_cpu_info[core_id()].vmx_enabled) + ept_inval_global(); }
diff --git a/kern/arch/x86/pmap64.c b/kern/arch/x86/pmap64.c index 901a9af..d9a2dd8 100644 --- a/kern/arch/x86/pmap64.c +++ b/kern/arch/x86/pmap64.c
@@ -80,9 +80,11 @@ static kpte_t *__pml_walk(kpte_t *pml, uintptr_t va, int flags, int pml_shift) { kpte_t *kpte; + epte_t *epte; void *new_pml_kva; kpte = &pml[PMLx(va, pml_shift)]; + epte = kpte_to_epte(kpte); if (walk_is_complete(kpte, pml_shift, flags)) return kpte; if (!(*kpte & PTE_P)) { @@ -98,6 +100,14 @@ * translation is !User). We put the perms on the last entry, not the * intermediates. */ *kpte = PADDR(new_pml_kva) | PTE_P | PTE_U | PTE_W; + /* The physaddr of the new_pml is one page higher than the KPT page. A + * few other things: + * - for the same reason that we have U and X set on all intermediate + * PTEs, we now set R, X, and W for the EPTE. + * - All EPTEs have U perms + * - We can't use epte_write since we're working on intermediate PTEs, + * and they don't have the memory type set. */ + *epte = (PADDR(new_pml_kva) + PGSIZE) | EPTE_R | EPTE_X | EPTE_W; } return __pml_walk(kpte2pml(*kpte), va, flags, pml_shift - BITS_PER_PML); }
diff --git a/kern/arch/x86/pmap_ops.h b/kern/arch/x86/pmap_ops.h index 78c05fb..8e7e574 100644 --- a/kern/arch/x86/pmap_ops.h +++ b/kern/arch/x86/pmap_ops.h
@@ -12,14 +12,14 @@ #ifndef ROS_ARCH_PMAPS_OPS_H #define ROS_ARCH_PMAPS_OPS_H +#include <arch/vmm/ept.h> +#include <arch/kpt.h> + /* TODO: (EPT) build a CONFIG mode where we assert the EPT agrees with the KPT * for all of the read ops */ static inline bool pte_walk_okay(pte_t pte) { - /* walk_okay should only be called after a walk, when we have both a KPTE - * and an EPTE */ - dassert(pte.kpte ? TRUE : !pte.epte); return pte.kpte ? TRUE : FALSE; } @@ -43,92 +43,69 @@ * - unmapped: completely unused. (0 value) */ static inline bool pte_is_present(pte_t pte) { -#if 0 /* could do some debuggin like this. painful. */ - bool ret_kpte, ret_epte; - assert(pte.kpte || pte.epte); - ret_kpte = pte.kpte ? (*pte.kpte & PTE_P ? TRUE : FALSE) : 0; - /* TODO: EPT check */ - ret_epte = pte.epte ? (*pte.epte & PTE_P ? TRUE : FALSE) : 0; - if (pte.kpte && pte.epte) - assert(ret_kpte == ret_epte); - return pte.kpte ? ret_kpte : ret_epte; -#endif - return pte.kpte ? (*pte.kpte & PTE_P ? TRUE : FALSE) - : 0; /* TODO: EPT check */ + return kpte_is_present(pte.kpte); } static inline bool pte_is_unmapped(pte_t pte) { - return pte.kpte ? PAGE_UNMAPPED(*pte.kpte) - : 0; /* TODO: EPT check */ + return kpte_is_unmapped(pte.kpte); } static inline bool pte_is_mapped(pte_t pte) { - return pte.kpte ? !PAGE_UNMAPPED(*pte.kpte) - : 0; /* TODO: EPT check */ + return kpte_is_mapped(pte.kpte); } static inline bool pte_is_paged_out(pte_t pte) { - return pte.kpte ? PAGE_PAGED_OUT(*pte.kpte) - : 0; /* TODO: EPT check */ + return kpte_is_paged_out(pte.kpte); } static inline bool pte_is_dirty(pte_t pte) { - return pte.kpte ? (*pte.kpte & PTE_D ? TRUE : FALSE) - : 0; /* TODO: EPT check */ + return kpte_is_dirty(pte.kpte) || + epte_is_dirty(kpte_to_epte(pte.kpte)); } static inline bool pte_is_accessed(pte_t pte) { - return pte.kpte ? (*pte.kpte & PTE_A ? 
TRUE : FALSE) - : 0; /* TODO: EPT check */ + return kpte_is_accessed(pte.kpte) || + epte_is_accessed(kpte_to_epte(pte.kpte)); } /* Used in debugging code - want something better involving the walk */ static inline bool pte_is_jumbo(pte_t pte) { - return pte.kpte ? (*pte.kpte & PTE_PS ? TRUE : FALSE) - : 0; /* TODO: EPT check */ + return kpte_is_jumbo(pte.kpte); } static inline physaddr_t pte_get_paddr(pte_t pte) { - return pte.kpte ? PTE_ADDR(*pte.kpte) - : 0; /* TODO: EPT check */ + return kpte_get_paddr(pte.kpte); } /* Returns the PTE in an unsigned long, for debugging mostly. */ static inline unsigned long pte_print(pte_t pte) { - return pte.kpte ? *pte.kpte - : 0; /* TODO: EPT check */ + return kpte_print(pte.kpte); } static inline void pte_write(pte_t pte, physaddr_t pa, int perm) { - if (pte.kpte) - *pte.kpte = PTE(pa2ppn(pa), perm); - if (pte.epte) - /* TODO: EPT write (if EPT) */; + kpte_write(pte.kpte, pa, perm); + epte_write(kpte_to_epte(pte.kpte), pa, perm); } static inline void pte_clear_present(pte_t pte) { - if (pte.kpte) - *pte.kpte &= ~PTE_P; - if (pte.epte) - /* TODO: EPT write (if EPT) */; + kpte_clear_present(pte.kpte); + epte_clear_present(kpte_to_epte(pte.kpte)); } static inline void pte_clear(pte_t pte) { - if (pte.kpte) - *pte.kpte = 0; - if (pte.epte) - /* TODO: EPT write (if EPT) */; + kpte_clear(pte.kpte); + epte_clear(kpte_to_epte(pte.kpte)); } /* These are used by memcpy_*_user, but are very dangerous (and possibly used @@ -139,30 +116,25 @@ * to an intermediate PTE, we'd miss that. */ static inline bool pte_has_perm_ur(pte_t pte) { - return pte.kpte ? (*pte.kpte & PTE_USER_RO ? TRUE : FALSE) - : 0; /* TODO: EPT check */ + return kpte_has_perm_ur(pte.kpte); } static inline bool pte_has_perm_urw(pte_t pte) { - return pte.kpte ? (*pte.kpte & PTE_USER_RW ? TRUE : FALSE) - : 0; /* TODO: EPT check */ + return kpte_has_perm_urw(pte.kpte); } /* return the arch-independent format for prots - whatever you'd expect to * receive for pte_write. 
Careful with the ret, since a valid type is 0. */ static inline int pte_get_perm(pte_t pte) { - return pte.kpte ? *pte.kpte & PTE_PERM - : 0; /* TODO: EPT check */ + return kpte_get_perm(pte.kpte); } static inline void pte_replace_perm(pte_t pte, int perm) { - if (pte.kpte) - *pte.kpte = (*pte.kpte & ~PTE_PERM) | perm; - if (pte.epte) - /* TODO: EPT write (if EPT) */; + kpte_replace_perm(pte.kpte, perm); + epte_replace_perm(kpte_to_epte(pte.kpte), perm); } #endif /* ROS_ARCH_PMAPS_OPS_H */
diff --git a/kern/arch/x86/vmm/ept.h b/kern/arch/x86/vmm/ept.h new file mode 100644 index 0000000..b2d3da9 --- /dev/null +++ b/kern/arch/x86/vmm/ept.h
@@ -0,0 +1,176 @@ +/* Copyright (c) 2015 Google Inc. + * Barret Rhoden <brho@cs.berkeley.edu> + * See LICENSE for details. + * + * 64 bit EPT helpers */ + +#ifndef ROS_ARCH_VMM_EPT_H +#define ROS_ARCH_VMM_EPT_H + +#include <arch/vmm/intel/vmx.h> /* for sync/flush helpers */ +#include <smp.h> /* for current */ + +/* Some EPTE PTE flags are only valid for the last PTEs in a walk */ +#define EPTE_R (1ULL << 0) /* Readable */ +#define EPTE_W (1ULL << 1) /* Writeable */ +#define EPTE_X (1ULL << 2) /* Executable */ +#define EPTE_MEM_BITS (7ULL << 3) /* Memory type specifier */ +#define EPTE_IGN_PAT (1ULL << 6) /* Ignore PAT */ +#define EPTE_PS (1ULL << 7) /* Jumbo Page Size */ +#define EPTE_A (1ULL << 8) /* Accessed */ +#define EPTE_D (1ULL << 9) /* Dirty */ +#define EPTE_SUP_VE (1ULL << 63) /* Suppress virt exceptions */ +#define EPTE_P (EPTE_R | EPTE_W | EPTE_X) + +/* Types available for the EPTE_MEM_TYPE */ +#define EPT_MEM_TYPE_UC 0 +#define EPT_MEM_TYPE_WC 1 +#define EPT_MEM_TYPE_WT 4 +#define EPT_MEM_TYPE_WP 5 +#define EPT_MEM_TYPE_WB 6 +/* Helper to align the type to its location in the PTE */ +#define EPT_MEM_TYPE(type) ((type) << 3) + +/* Some machines don't support A and D EPTE bits. We'll |= 1 in those cases. */ +extern int x86_ept_pte_fix_ups; + +static inline epte_t *kpte_to_epte(kpte_t *kpte) +{ + return (epte_t*)(((uintptr_t)kpte) + PGSIZE); +} + +static inline bool epte_is_present(epte_t *epte) +{ + /* Actually, certain combos, like W but not R could be misconfigurations */ + return *epte & EPTE_P ? TRUE : FALSE; +} + +static inline bool epte_is_unmapped(epte_t *epte) +{ + return *epte == 0; +} + +static inline bool epte_is_mapped(epte_t *epte) +{ + return *epte != 0; +} + +static inline bool epte_is_paged_out(epte_t *epte) +{ + return *epte != 0; +} + +/* Some Intel machines don't support A or D. In these cases, we must assume + * the pages have been accessed or dirtied... 
*/ +static inline bool epte_is_dirty(epte_t *epte) +{ + return (*epte | x86_ept_pte_fix_ups) & EPTE_D ? TRUE : FALSE; +} + +static inline bool epte_is_accessed(epte_t *epte) +{ + return (*epte | x86_ept_pte_fix_ups) & EPTE_A ? TRUE : FALSE; +} + +static inline bool epte_is_jumbo(epte_t *epte) +{ + return *epte & EPTE_PS ? TRUE : FALSE; +} + +static inline physaddr_t epte_get_paddr(epte_t *epte) +{ + /* 63:52 are ignored/flags. 51:12 are the addr. Technically 51:N must be + * 0, where N is the physical addr width */ + return *epte & 0x000ffffffffff000; +} + +static inline int __pte_to_epte_perm(int perm) +{ + switch (perm) { + /* Since we keep the EPT in lockstep with the KPT, we might get some + * mapping requests for the kernel (e.g. vmap_pmem). */ + case PTE_KERN_RW: + case PTE_KERN_RO: + case PTE_NONE: + return 0; + case PTE_USER_RW: + return EPTE_W | EPTE_R | EPTE_X; + case PTE_USER_RO: + return EPTE_R | EPTE_X; + default: + panic("Bad PTE type 0x%x\n", perm); + } +} + +static inline void epte_write(epte_t *epte, physaddr_t pa, int settings) +{ + /* Could put in a check against the max physaddr len */ + epte_t temp = pa; + temp |= __pte_to_epte_perm(settings & PTE_PERM); + temp |= settings & PTE_PS ? EPTE_PS : 0; + /* All memory is WB by default, but the guest can override that with their + * PAT on the first page walk (guest KPT/cr3) */ + temp |= EPT_MEM_TYPE(EPT_MEM_TYPE_WB); + *epte = temp; +} + +static inline void epte_clear_present(epte_t *epte) +{ + *epte &= ~EPTE_P; +} + +static inline void epte_clear(epte_t *epte) +{ + *epte = 0; +} + +static inline bool epte_has_perm_ur(epte_t *epte) +{ + return (*epte & (EPTE_R | EPTE_X)) == (EPTE_R | EPTE_X); +} + +static inline bool epte_has_perm_urw(epte_t *epte) +{ + return (*epte & (EPTE_R | EPTE_W | EPTE_X)) == (EPTE_R | EPTE_W | EPTE_X); +} + +/* We want to know User and Writable, in the 'PTE' sense. All present epte + * entries are User PTEs. 
*/ +static inline int epte_get_perm(epte_t *epte) +{ + int settings = 0; + if (*epte & EPTE_P) { + settings |= PTE_P | PTE_U; + settings |= *epte & EPTE_W ? PTE_W : 0; + } + //settings |= *epte & EPTE_PS ? PTE_PS : 0; /* TODO */ + return settings; +} + +/* Again, we're replacing the old perms with U and/or W. Any non-U are ignored, + * as with epte_write. R (and X) are implied. */ +static inline void epte_replace_perm(epte_t *epte, int settings) +{ + *epte = (*epte & ~EPTE_P) | __pte_to_epte_perm(settings & PTE_PERM); +} + +/* These ops might be the same for AMD as Intel; in which case we can move the + * body of these ept_sync_* funcs into here */ +static inline void ept_inval_addr(unsigned long addr) +{ + if (current && current->vmm.vmmcp) + ept_sync_individual_addr(current->env_pgdir.eptp, addr); +} + +static inline void ept_inval_context(void) +{ + if (current && current->vmm.vmmcp) + ept_sync_context(current->env_pgdir.eptp); +} + +static inline void ept_inval_global(void) +{ + ept_sync_global(); +} + +#endif /* ROS_ARCH_VMM_EPT_H */
diff --git a/kern/arch/x86/vmm/intel/Kbuild b/kern/arch/x86/vmm/intel/Kbuild index a0f377a..152480f 100644 --- a/kern/arch/x86/vmm/intel/Kbuild +++ b/kern/arch/x86/vmm/intel/Kbuild
@@ -1,2 +1 @@ obj-y += vmx.o -obj-y += ept.o
diff --git a/kern/arch/x86/vmm/intel/ept.c b/kern/arch/x86/vmm/intel/ept.c deleted file mode 100644 index 08ff585..0000000 --- a/kern/arch/x86/vmm/intel/ept.c +++ /dev/null
@@ -1,335 +0,0 @@ -/** - * ept.c - Support for Intel's Extended Page Tables - * - * Authors: - * Adam Belay <abelay@stanford.edu> - * - * Right now we support EPT by making a sort of 'shadow' copy of the Linux - * process page table. In the future, a more invasive architecture port - * to VMX x86 could provide better performance by eliminating the need for - * two copies of each page table entry, relying instead on only the EPT - * format. - * - * This code is only a prototype and could benefit from a more comprehensive - * review in terms of performance and correctness. Also, the implications - * of threaded processes haven't been fully considered. - * - * Some of the low-level EPT functions are based on KVM. - * Original Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - */ - -#include <kmalloc.h> -#include <string.h> -#include <stdio.h> -#include <assert.h> -#include <error.h> -#include <pmap.h> -#include <sys/queue.h> -#include <smp.h> -#include <kref.h> -#include <atomic.h> -#include <alarm.h> -#include <event.h> -#include <umem.h> -#include <bitops.h> -#include <arch/types.h> -#include <syscall.h> -#include <monitor.h> - -#include "vmx.h" -#include "../vmm.h" - -#include "cpufeature.h" - -#define EPT_LEVELS 4 /* 0 through 3 */ -#define HUGE_PAGE_SIZE 2097152 -#define PageHuge(x) (0) - -#define VMX_EPT_FAULT_READ 0x01 -#define VMX_EPT_FAULT_WRITE 0x02 -#define VMX_EPT_FAULT_INS 0x04 - -typedef unsigned long epte_t; - -#define __EPTE_READ 0x01 -#define __EPTE_WRITE 0x02 -#define __EPTE_EXEC 0x04 -#define __EPTE_IPAT 0x40 -#define __EPTE_SZ 0x80 -#define __EPTE_TYPE(n) (((n) & 0x7) << 3) - -enum { - EPTE_TYPE_UC = 0, /* uncachable */ - EPTE_TYPE_WC = 1, /* write combining */ - EPTE_TYPE_WT = 4, /* write through */ - EPTE_TYPE_WP = 5, /* write protected */ - EPTE_TYPE_WB = 6, /* write back */ -}; - -#define __EPTE_NONE 0 -#define __EPTE_FULL (__EPTE_READ | __EPTE_WRITE | __EPTE_EXEC) - -#define EPTE_ADDR (~(PAGE_SIZE - 1)) -#define 
EPTE_FLAGS (PAGE_SIZE - 1) - -static inline uintptr_t epte_addr(epte_t epte) -{ - return (epte & EPTE_ADDR); -} - -static inline uintptr_t epte_page_vaddr(epte_t epte) -{ - return (uintptr_t) KADDR(epte_addr(epte)); -} - -static inline epte_t epte_flags(epte_t epte) -{ - return (epte & EPTE_FLAGS); -} - -static inline int epte_present(epte_t epte) -{ - return (epte & __EPTE_FULL) > 0; -} - -static inline int epte_big(epte_t epte) -{ - return (epte & __EPTE_SZ) > 0; -} - -#define ADDR_TO_IDX(la, n) \ - ((((unsigned long) (la)) >> (12 + 9 * (n))) & ((1 << 9) - 1)) - -/* for now we assume in 'current' */ -static int -ept_lookup_gpa(epte_t *dir, void *gpa, int level, int create, epte_t **epte_out) -{ - int i; - - for (i = EPT_LEVELS - 1; i > level; i--) { - int idx = ADDR_TO_IDX(gpa, i); - printk("%d: gpa %p, idx %p\n", i, gpa, idx); - if (!epte_present(dir[idx])) { - printk("not present\n"); - void *page; - - if (!create) - return -ENOENT; - - page = (void *) kpage_zalloc_addr(); - if (!page) - return -ENOMEM; - printk("page %p\n", page); - dir[idx] = epte_addr(PADDR(page)) | - __EPTE_FULL; - printk("Set %p[%p] to %p\n", dir, idx, dir[idx]); - } - - if (epte_big(dir[idx])) { - if (i != 1) - return -EINVAL; - level = i; - break; - } - - dir = (epte_t *) epte_page_vaddr(dir[idx]); - printk("Dir for next pass: %p\n", dir); - } - - *epte_out = &dir[ADDR_TO_IDX(gpa, level)]; - printk("Final ept is %p\n", *epte_out); - return 0; -} - -static void free_ept_page(epte_t epte) -{ - // TODO: clean this up. - void *page = KADDR(epte & ~0xfff); - //struct page *page = pfn_to_page(epte_addr(epte) >> PAGE_SHIFT); - - kfree(page); -} - -static void vmx_free_ept(unsigned long ept_root) -{ - epte_t *pgd = (epte_t *) KADDR(ept_root); - int i, j, k, l; - - // TODO: change all instances of 512 to something. 
- for (i = 0; i < 512; i++) { - epte_t *pud = (epte_t *) epte_page_vaddr(pgd[i]); - if (!epte_present(pgd[i])) - continue; - - for (j = 0; j < 512; j++) { - epte_t *pmd = (epte_t *) epte_page_vaddr(pud[j]); - if (!epte_present(pud[j])) - continue; - if (epte_flags(pud[j]) & __EPTE_SZ) - continue; - - for (k = 0; k < 512; k++) { - epte_t *pte = (epte_t *) epte_page_vaddr(pmd[k]); - if (!epte_present(pmd[k])) - continue; - if (epte_flags(pmd[k]) & __EPTE_SZ) { - free_ept_page(pmd[k]); - continue; - } - - for (l = 0; l < 512; l++) { - if (!epte_present(pte[l])) - continue; - - free_ept_page(pte[l]); - } - - kfree(pte); - } - - kfree(pmd); - } - - kfree(pud); - } - - kfree(pgd); -} - -static int ept_clear_epte(epte_t *epte) -{ - if (*epte == __EPTE_NONE) - return 0; - - free_ept_page(*epte); - *epte = __EPTE_NONE; - - return 1; -} - -/* We're given a guest physical and a host physical. */ -static int ept_set_epte(epte_t *dir, int make_write, unsigned long gpa, unsigned long hpa) -{ - int ret = -1; - epte_t *epte, flags; - struct page *page = NULL; - - // We're going to assume locking is done by this point. - // TODO: PageHuge - - ret = ept_lookup_gpa(dir, (void *) gpa, PageHuge(page) ? 1 : 0, 1, &epte); - if (ret) { - printk("ept: failed to lookup EPT entry\n"); - return ret; - } - - printk("=====================> epte %p is %p\n", epte, *epte); - if (epte_present(*epte) && (epte_big(*epte) || !PageHuge(page))) { - printk("PRESENT? WTF? 
OK ...\n"); - monitor(NULL); - //ept_clear_epte(epte); - } else { - flags = __EPTE_READ | __EPTE_EXEC | __EPTE_WRITE | - __EPTE_TYPE(EPTE_TYPE_WB) | __EPTE_IPAT; - if (make_write) - flags |= __EPTE_WRITE; - - /* TODO: fix thishuge page shit.*/ - if (PageHuge(page)) { - flags |= __EPTE_SZ; - if (epte_present(*epte) && !epte_big(*epte)){ - panic("free huge page?"); - //free_page(epte_page_vaddr(*epte)); - } - /* FIXME: free L0 entries too */ - *epte = epte_addr(PADDR(page) & ~((1 << 21) - 1)) | - flags; - } else { - *epte = epte_addr(hpa) | flags; - printk("Set epte to %p\n", *epte); - } - } - return 0; -} - -// TODO: kill this? -// NOTE: guest physical is 1:1 mapped to host virtual. This is NOT -// like dune at all. -int vmx_do_ept_fault(void *dir, unsigned long gpa, unsigned long hpa, int fault_flags) -{ - int ret; - int make_write = (fault_flags & VMX_EPT_FAULT_WRITE) ? 1 : 0; - - printk("ept: GPA: 0x%lx, GVA: 0x%lx, flags: %x\n", - gpa, hpa, fault_flags); - - ret = ept_set_epte((epte_t *)dir, make_write, gpa, hpa); - - return ret; -} - -/* - * ept_fault_pages pre-faults pages in the range start..end - */ -int ept_fault_pages(void *dir, uint32_t start, uint32_t end) -{ - uint64_t i; - int ret; - for(i = start; i < end; i++) { - uint64_t addr = i << 12; - ret = vmx_do_ept_fault((epte_t*)dir, i, i, VMX_EPT_FAULT_WRITE); - if (ret) { - return ret; - } - } - return 0; -} -/** - * ept_invalidate_page - removes a page from the EPT - * @vcpu: the vcpu - * @mm: the process's mm_struct - * @addr: the address of the page - * - * Returns 1 if the page was removed, 0 otherwise - */ -static int ept_invalidate_page(epte_t *dir, unsigned long addr) -{ - int ret; - epte_t *epte; - void *gpa = (void *) addr; - - ret = ept_lookup_gpa(dir, (void *) gpa, 0, 0, &epte); - if (ret) { - return 0; - } - - ret = ept_clear_epte(epte); - - /* TODO: sync individual? 
- if (ret) - vmx_ept_sync_individual_addr(vcpu, (gpa_t) gpa); - */ - - return ret; -} - -/** - * ept_check_page - determines if a page is mapped in the ept - * @vcpu: the vcpu - * @mm: the process's mm_struct - * @addr: the address of the page - * - * Returns 1 if the page is mapped, 0 otherwise - */ -int ept_check_page(void *dir, unsigned long addr) -{ - int ret; - epte_t *epte; - void *gpa = (void *) addr; - - ret = ept_lookup_gpa((epte_t *)dir, gpa, 0, 0, &epte); - - return ret; -}
diff --git a/kern/arch/x86/vmm/intel/vmx.c b/kern/arch/x86/vmm/intel/vmx.c index 25bd9be..ef0bb45 100644 --- a/kern/arch/x86/vmm/intel/vmx.c +++ b/kern/arch/x86/vmm/intel/vmx.c
@@ -173,6 +173,8 @@ static unsigned long *msr_bitmap; +int x86_ept_pte_fix_ups = 0; + struct vmx_capability vmx_capability; struct vmcs_config vmcs_config; @@ -654,67 +656,6 @@ //put_cpu(); } -static void __vmx_sync_helper(struct hw_trapframe *hw_tf, void *ptr) -{ - struct vmx_vcpu *vcpu = ptr; - - ept_sync_context(vcpu_get_eptp(vcpu)); -} - -struct sync_addr_args { - struct vmx_vcpu *vcpu; - gpa_t gpa; -}; - -static void __vmx_sync_individual_addr_helper(struct hw_trapframe *hw_tf, void *ptr) -{ - struct sync_addr_args *args = ptr; - -// ept_sync_individual_addr( - -} - -/** - * vmx_ept_sync_global - used to evict everything in the EPT - * @vcpu: the vcpu - */ -void vmx_ept_sync_vcpu(struct vmx_vcpu *vcpu) -{ - handler_wrapper_t *w; - - smp_call_function_single(vcpu->cpu, - __vmx_sync_helper, (void *) vcpu, &w); - - if (smp_call_wait(w)) { - printk("litevm_init. smp_call_wait failed. Expect a panic.\n"); - } - - -} - -/** - * vmx_ept_sync_individual_addr - used to evict an individual address - * @vcpu: the vcpu - * @gpa: the guest-physical address - */ -void vmx_ept_sync_individual_addr(struct vmx_vcpu *vcpu, gpa_t gpa) -{ - struct sync_addr_args args; - args.vcpu = vcpu; - args.gpa = gpa; - - handler_wrapper_t *w; - - - smp_call_function_single(vcpu->cpu, - __vmx_sync_individual_addr_helper, (void *) &args, &w); - - if (smp_call_wait(w)) { - printk("litevm_init. smp_call_wait failed. 
Expect a panic.\n"); - } - -} - /** * vmx_dump_cpu - prints the CPU state * @vcpu: VCPU to print @@ -1212,7 +1153,7 @@ if (page) { uint64_t hpa = page2pa(page); printk("hpa for %p is %p\n", gpa, hpa); - ret = vmx_do_ept_fault(vcpu->proc->env_pgdir.epte, gpa, hpa, exit_qual); + ret = -1; printk("vmx_do_ept_fault returns %d\n", ret); } @@ -1531,7 +1472,7 @@ } if (!cpu_has_vmx_ept_ad_bits()) { printk("VMX EPT doesn't support accessed/dirty!\n"); - /* TODO: set the pmap_ops accordingly */ + x86_ept_pte_fix_ups |= EPTE_A | EPTE_D; } if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) { printk("VMX EPT can't invalidate PTEs/TLBs!\n");
diff --git a/kern/arch/x86/vmm/intel/vmx.h b/kern/arch/x86/vmm/intel/vmx.h index d329d19..c69168d 100644 --- a/kern/arch/x86/vmm/intel/vmx.h +++ b/kern/arch/x86/vmm/intel/vmx.h
@@ -502,6 +502,9 @@ #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul +#define VMX_EPT_FAULT_READ 0x01 +#define VMX_EPT_FAULT_WRITE 0x02 +#define VMX_EPT_FAULT_INS 0x04 #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"