VMM: Call EPT ops for every KPT op
At this point, the EPT should equal the KPT, up to UVPT.
I disconnected the EPT fault handler for now. tests/vmmcp still works,
since everything is pre-faulted in. We'll need to change it to do the
initial faults from the VM.
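
A caller-side sketch of what "EPT ops for every KPT op" buys us (not part of
this patch; pgdir_walk()/page2pa() are the existing helpers, the function
itself is hypothetical, and error handling is trimmed):

	static int map_one_page(struct proc *p, uintptr_t va, struct page *pg)
	{
		pte_t pte = pgdir_walk(p->env_pgdir, (void*)va, TRUE);

		if (!pte_walk_okay(pte))
			return -ENOMEM;
		/* writes the KPTE and, via kpte_to_epte(), the EPTE one page above */
		pte_write(pte, page2pa(pg), PTE_USER_RW);
		return 0;
	}
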
diff --git a/kern/arch/x86/arch.h b/kern/arch/x86/arch.h
index bf6f85e..bee50ea 100644
--- a/kern/arch/x86/arch.h
+++ b/kern/arch/x86/arch.h
@@ -14,8 +14,6 @@
#define __always_inline inline __attribute__((always_inline))
static inline void breakpoint(void) __attribute__((always_inline));
-static inline void invlpg(void *addr) __attribute__((always_inline));
-static inline void tlbflush(void) __attribute__((always_inline));
static inline void icache_flush_page(void *va, void *kva)
__attribute__((always_inline));
static inline uint64_t read_tsc(void) __attribute__((always_inline));
@@ -42,24 +40,16 @@
void print_cpuinfo(void);
void show_mapping(pgdir_t pgdir, uintptr_t start, size_t size);
int vendor_id(char *);
+/* pmap.c */
+void invlpg(void *addr);
+void tlbflush(void);
+void tlb_flush_global(void);
static inline void breakpoint(void)
{
asm volatile("int3");
}
-static inline void invlpg(void *addr)
-{
- asm volatile("invlpg (%0)" : : "r" (addr) : "memory");
-}
-
-static inline void tlbflush(void)
-{
- unsigned long cr3;
- asm volatile("mov %%cr3,%0" : "=r" (cr3));
- asm volatile("mov %0,%%cr3" : : "r" (cr3));
-}
-
static inline void icache_flush_page(void *va, void *kva)
{
// x86 handles self-modifying code (mostly) without SW support
diff --git a/kern/arch/x86/kpt.h b/kern/arch/x86/kpt.h
new file mode 100644
index 0000000..3415418
--- /dev/null
+++ b/kern/arch/x86/kpt.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2015 Google Inc.
+ * Barret Rhoden <brho@cs.berkeley.edu>
+ * See LICENSE for details.
+ *
+ * 64 bit KPT helpers */
+
+#ifndef ROS_ARCH_KPT_H
+#define ROS_ARCH_KPT_H
+
+#include <arch/ros/mmu64.h>
+
+static inline bool kpte_is_present(kpte_t *kpte)
+{
+ return *kpte & PTE_P ? TRUE : FALSE;
+}
+
+static inline bool kpte_is_unmapped(kpte_t *kpte)
+{
+ return *kpte == 0;
+}
+
+static inline bool kpte_is_mapped(kpte_t *kpte)
+{
+ return *kpte != 0;
+}
+
+static inline bool kpte_is_paged_out(kpte_t *kpte)
+{
+ return *kpte != 0;
+}
+
+static inline bool kpte_is_dirty(kpte_t *kpte)
+{
+ return *kpte & PTE_D ? TRUE : FALSE;
+}
+
+static inline bool kpte_is_accessed(kpte_t *kpte)
+{
+ return *kpte & PTE_A ? TRUE : FALSE;
+}
+
+static inline bool kpte_is_jumbo(kpte_t *kpte)
+{
+ return *kpte & PTE_PS ? TRUE : FALSE;
+}
+
+static inline physaddr_t kpte_get_paddr(kpte_t *kpte)
+{
+ return (physaddr_t)*kpte & ~(PGSIZE - 1);
+}
+
+/* Returns the PTE in an unsigned long, for debugging mostly. */
+static inline unsigned long kpte_print(kpte_t *kpte)
+{
+ return *kpte;
+}
+
+static inline void kpte_write(kpte_t *kpte, physaddr_t pa, int perm)
+{
+ assert(!PGOFF(pa));
+ *kpte = pa | perm;
+}
+
+static inline void kpte_clear_present(kpte_t *kpte)
+{
+ *kpte &= ~PTE_P;
+}
+
+static inline void kpte_clear(kpte_t *kpte)
+{
+ *kpte = 0;
+}
+
+static inline bool kpte_has_perm_ur(kpte_t *kpte)
+{
+ return (*kpte & PTE_USER_RO) == PTE_USER_RO;
+}
+
+static inline bool kpte_has_perm_urw(kpte_t *kpte)
+{
+ return (*kpte & PTE_USER_RW) == PTE_USER_RW;
+}
+
+static inline int kpte_get_perm(kpte_t *kpte)
+{
+ return *kpte & PTE_PERM;
+}
+
+static inline void kpte_replace_perm(kpte_t *kpte, int perm)
+{
+ *kpte = (*kpte & ~PTE_PERM) | perm;
+}
+
+#endif /* ROS_ARCH_KPT_H */
diff --git a/kern/arch/x86/pmap.c b/kern/arch/x86/pmap.c
index 2fe9f56..903387c 100644
--- a/kern/arch/x86/pmap.c
+++ b/kern/arch/x86/pmap.c
@@ -99,6 +99,22 @@
enable_irqsave(&state);
}
+void invlpg(void *addr)
+{
+ asm volatile("invlpg (%0)" : : "r" (addr) : "memory");
+ if (per_cpu_info[core_id()].vmx_enabled)
+ ept_inval_addr((uintptr_t)addr);
+}
+
+void tlbflush(void)
+{
+ unsigned long cr3;
+ asm volatile("mov %%cr3,%0" : "=r" (cr3));
+ asm volatile("mov %0,%%cr3" : : "r" (cr3));
+ if (per_cpu_info[core_id()].vmx_enabled)
+ ept_inval_context();
+}
+
/* Flushes a TLB, including global pages. We should always have the CR4_PGE
* flag set, but just in case, we'll check. Toggling this bit flushes the TLB.
*/
@@ -108,6 +124,9 @@
if (cr4 & CR4_PGE) {
lcr4(cr4 & ~CR4_PGE);
lcr4(cr4);
- } else
+ } else {
lcr3(rcr3());
+ }
+ if (per_cpu_info[core_id()].vmx_enabled)
+ ept_inval_global();
}
diff --git a/kern/arch/x86/pmap64.c b/kern/arch/x86/pmap64.c
index 901a9af..d9a2dd8 100644
--- a/kern/arch/x86/pmap64.c
+++ b/kern/arch/x86/pmap64.c
@@ -80,9 +80,11 @@
static kpte_t *__pml_walk(kpte_t *pml, uintptr_t va, int flags, int pml_shift)
{
kpte_t *kpte;
+ epte_t *epte;
void *new_pml_kva;
kpte = &pml[PMLx(va, pml_shift)];
+ epte = kpte_to_epte(kpte);
if (walk_is_complete(kpte, pml_shift, flags))
return kpte;
if (!(*kpte & PTE_P)) {
@@ -98,6 +100,14 @@
* translation is !User). We put the perms on the last entry, not the
* intermediates. */
*kpte = PADDR(new_pml_kva) | PTE_P | PTE_U | PTE_W;
+ /* The EPT PML's physaddr is one page above the KPT page's (new_pml_kva). A
+ * few other things:
+ * - for the same reason that we have U and X set on all intermediate
+ * PTEs, we now set R, X, and W for the EPTE.
+ * - All EPTEs have U perms
+ * - We can't use epte_write since we're working on intermediate PTEs,
+ * and they don't have the memory type set. */
+ *epte = (PADDR(new_pml_kva) + PGSIZE) | EPTE_R | EPTE_X | EPTE_W;
}
return __pml_walk(kpte2pml(*kpte), va, flags, pml_shift - BITS_PER_PML);
}
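
The "+ PGSIZE" above assumes each PML is backed by two physically contiguous
pages: the KPT page first, its EPT mirror directly after. A sketch of that
layout (the allocation itself is outside this hunk; the get_cont_pages() call
is shown only as an assumption about how the pair gets allocated):

	/* order-1 allocation: 2^1 contiguous pages, KPT page then EPT page */
	void *pgs = get_cont_pages(1, 0);
	kpte_t *kpt_pml = (kpte_t*)pgs;
	epte_t *ept_pml = (epte_t*)((char*)pgs + PGSIZE);

	assert(kpte_to_epte(kpt_pml) == ept_pml);
	assert(PADDR(ept_pml) == PADDR(kpt_pml) + PGSIZE);
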
diff --git a/kern/arch/x86/pmap_ops.h b/kern/arch/x86/pmap_ops.h
index 78c05fb..8e7e574 100644
--- a/kern/arch/x86/pmap_ops.h
+++ b/kern/arch/x86/pmap_ops.h
@@ -12,14 +12,14 @@
#ifndef ROS_ARCH_PMAPS_OPS_H
#define ROS_ARCH_PMAPS_OPS_H
+#include <arch/vmm/ept.h>
+#include <arch/kpt.h>
+
/* TODO: (EPT) build a CONFIG mode where we assert the EPT agrees with the KPT
* for all of the read ops */
static inline bool pte_walk_okay(pte_t pte)
{
- /* walk_okay should only be called after a walk, when we have both a KPTE
- * and an EPTE */
- dassert(pte.kpte ? TRUE : !pte.epte);
return pte.kpte ? TRUE : FALSE;
}
@@ -43,92 +43,69 @@
* - unmapped: completely unused. (0 value) */
static inline bool pte_is_present(pte_t pte)
{
-#if 0 /* could do some debuggin like this. painful. */
- bool ret_kpte, ret_epte;
- assert(pte.kpte || pte.epte);
- ret_kpte = pte.kpte ? (*pte.kpte & PTE_P ? TRUE : FALSE) : 0;
- /* TODO: EPT check */
- ret_epte = pte.epte ? (*pte.epte & PTE_P ? TRUE : FALSE) : 0;
- if (pte.kpte && pte.epte)
- assert(ret_kpte == ret_epte);
- return pte.kpte ? ret_kpte : ret_epte;
-#endif
- return pte.kpte ? (*pte.kpte & PTE_P ? TRUE : FALSE)
- : 0; /* TODO: EPT check */
+ return kpte_is_present(pte.kpte);
}
static inline bool pte_is_unmapped(pte_t pte)
{
- return pte.kpte ? PAGE_UNMAPPED(*pte.kpte)
- : 0; /* TODO: EPT check */
+ return kpte_is_unmapped(pte.kpte);
}
static inline bool pte_is_mapped(pte_t pte)
{
- return pte.kpte ? !PAGE_UNMAPPED(*pte.kpte)
- : 0; /* TODO: EPT check */
+ return kpte_is_mapped(pte.kpte);
}
static inline bool pte_is_paged_out(pte_t pte)
{
- return pte.kpte ? PAGE_PAGED_OUT(*pte.kpte)
- : 0; /* TODO: EPT check */
+ return kpte_is_paged_out(pte.kpte);
}
static inline bool pte_is_dirty(pte_t pte)
{
- return pte.kpte ? (*pte.kpte & PTE_D ? TRUE : FALSE)
- : 0; /* TODO: EPT check */
+ return kpte_is_dirty(pte.kpte) ||
+ epte_is_dirty(kpte_to_epte(pte.kpte));
}
static inline bool pte_is_accessed(pte_t pte)
{
- return pte.kpte ? (*pte.kpte & PTE_A ? TRUE : FALSE)
- : 0; /* TODO: EPT check */
+ return kpte_is_accessed(pte.kpte) ||
+ epte_is_accessed(kpte_to_epte(pte.kpte));
}
/* Used in debugging code - want something better involving the walk */
static inline bool pte_is_jumbo(pte_t pte)
{
- return pte.kpte ? (*pte.kpte & PTE_PS ? TRUE : FALSE)
- : 0; /* TODO: EPT check */
+ return kpte_is_jumbo(pte.kpte);
}
static inline physaddr_t pte_get_paddr(pte_t pte)
{
- return pte.kpte ? PTE_ADDR(*pte.kpte)
- : 0; /* TODO: EPT check */
+ return kpte_get_paddr(pte.kpte);
}
/* Returns the PTE in an unsigned long, for debugging mostly. */
static inline unsigned long pte_print(pte_t pte)
{
- return pte.kpte ? *pte.kpte
- : 0; /* TODO: EPT check */
+ return kpte_print(pte.kpte);
}
static inline void pte_write(pte_t pte, physaddr_t pa, int perm)
{
- if (pte.kpte)
- *pte.kpte = PTE(pa2ppn(pa), perm);
- if (pte.epte)
- /* TODO: EPT write (if EPT) */;
+ kpte_write(pte.kpte, pa, perm);
+ epte_write(kpte_to_epte(pte.kpte), pa, perm);
}
static inline void pte_clear_present(pte_t pte)
{
- if (pte.kpte)
- *pte.kpte &= ~PTE_P;
- if (pte.epte)
- /* TODO: EPT write (if EPT) */;
+ kpte_clear_present(pte.kpte);
+ epte_clear_present(kpte_to_epte(pte.kpte));
}
static inline void pte_clear(pte_t pte)
{
- if (pte.kpte)
- *pte.kpte = 0;
- if (pte.epte)
- /* TODO: EPT write (if EPT) */;
+ kpte_clear(pte.kpte);
+ epte_clear(kpte_to_epte(pte.kpte));
}
/* These are used by memcpy_*_user, but are very dangerous (and possibly used
@@ -139,30 +116,25 @@
* to an intermediate PTE, we'd miss that. */
static inline bool pte_has_perm_ur(pte_t pte)
{
- return pte.kpte ? (*pte.kpte & PTE_USER_RO ? TRUE : FALSE)
- : 0; /* TODO: EPT check */
+ return kpte_has_perm_ur(pte.kpte);
}
static inline bool pte_has_perm_urw(pte_t pte)
{
- return pte.kpte ? (*pte.kpte & PTE_USER_RW ? TRUE : FALSE)
- : 0; /* TODO: EPT check */
+ return kpte_has_perm_urw(pte.kpte);
}
/* return the arch-independent format for prots - whatever you'd expect to
* receive for pte_write. Careful with the ret, since a valid type is 0. */
static inline int pte_get_perm(pte_t pte)
{
- return pte.kpte ? *pte.kpte & PTE_PERM
- : 0; /* TODO: EPT check */
+ return kpte_get_perm(pte.kpte);
}
static inline void pte_replace_perm(pte_t pte, int perm)
{
- if (pte.kpte)
- *pte.kpte = (*pte.kpte & ~PTE_PERM) | perm;
- if (pte.epte)
- /* TODO: EPT write (if EPT) */;
+ kpte_replace_perm(pte.kpte, perm);
+ epte_replace_perm(kpte_to_epte(pte.kpte), perm);
}
#endif /* ROS_ARCH_PMAPS_OPS_H */
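
One possible shape for the TODO'd "assert the EPT agrees with the KPT" mode,
sketched here with a hypothetical config knob (CONFIG_EPT_KPT_CHECK is not a
real option in this patch):

	static inline void pte_assert_ept_agrees(pte_t pte)
	{
	#ifdef CONFIG_EPT_KPT_CHECK
		epte_t *epte = kpte_to_epte(pte.kpte);

		/* only user mappings are mirrored; kernel perms become empty EPTEs */
		if (kpte_has_perm_ur(pte.kpte)) {
			assert(epte_is_present(epte));
			assert(kpte_get_paddr(pte.kpte) == epte_get_paddr(epte));
		}
	#endif
	}
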
diff --git a/kern/arch/x86/vmm/ept.h b/kern/arch/x86/vmm/ept.h
new file mode 100644
index 0000000..b2d3da9
--- /dev/null
+++ b/kern/arch/x86/vmm/ept.h
@@ -0,0 +1,176 @@
+/* Copyright (c) 2015 Google Inc.
+ * Barret Rhoden <brho@cs.berkeley.edu>
+ * See LICENSE for details.
+ *
+ * 64 bit EPT helpers */
+
+#ifndef ROS_ARCH_VMM_EPT_H
+#define ROS_ARCH_VMM_EPT_H
+
+#include <arch/vmm/intel/vmx.h> /* for sync/flush helpers */
+#include <smp.h> /* for current */
+
+/* Some EPTE PTE flags are only valid for the last PTEs in a walk */
+#define EPTE_R (1ULL << 0) /* Readable */
+#define EPTE_W (1ULL << 1) /* Writeable */
+#define EPTE_X (1ULL << 2) /* Executable */
+#define EPTE_MEM_BITS (7ULL << 3) /* Memory type specifier */
+#define EPTE_IGN_PAT (1ULL << 6) /* Ignore PAT */
+#define EPTE_PS (1ULL << 7) /* Jumbo Page Size */
+#define EPTE_A (1ULL << 8) /* Accessed */
+#define EPTE_D (1ULL << 9) /* Dirty */
+#define EPTE_SUP_VE (1ULL << 63) /* Suppress virt exceptions */
+#define EPTE_P (EPTE_R | EPTE_W | EPTE_X)
+
+/* Types available for the EPTE memory type (EPTE_MEM_BITS) */
+#define EPT_MEM_TYPE_UC 0
+#define EPT_MEM_TYPE_WC 1
+#define EPT_MEM_TYPE_WT 4
+#define EPT_MEM_TYPE_WP 5
+#define EPT_MEM_TYPE_WB 6
+/* Helper to align the type to its location in the PTE */
+#define EPT_MEM_TYPE(type) ((type) << 3)
+
+/* Some machines don't support A and D EPTE bits; we OR them in via x86_ept_pte_fix_ups. */
+extern int x86_ept_pte_fix_ups;
+
+static inline epte_t *kpte_to_epte(kpte_t *kpte)
+{
+ return (epte_t*)(((uintptr_t)kpte) + PGSIZE);
+}
+
+static inline bool epte_is_present(epte_t *epte)
+{
+ /* Actually, certain combos, like W but not R, could be misconfigurations */
+ return *epte & EPTE_P ? TRUE : FALSE;
+}
+
+static inline bool epte_is_unmapped(epte_t *epte)
+{
+ return *epte == 0;
+}
+
+static inline bool epte_is_mapped(epte_t *epte)
+{
+ return *epte != 0;
+}
+
+static inline bool epte_is_paged_out(epte_t *epte)
+{
+ return *epte != 0;
+}
+
+/* Some Intel machines don't support A or D. In these cases, we must assume
+ * the pages have been accessed or dirtied... */
+static inline bool epte_is_dirty(epte_t *epte)
+{
+ return (*epte | x86_ept_pte_fix_ups) & EPTE_D ? TRUE : FALSE;
+}
+
+static inline bool epte_is_accessed(epte_t *epte)
+{
+ return (*epte | x86_ept_pte_fix_ups) & EPTE_A ? TRUE : FALSE;
+}
+
+static inline bool epte_is_jumbo(epte_t *epte)
+{
+ return *epte & EPTE_PS ? TRUE : FALSE;
+}
+
+static inline physaddr_t epte_get_paddr(epte_t *epte)
+{
+ /* 63:52 are ignored/flags. 51:12 are the addr. Technically 51:N must be
+ * 0, where N is the physical addr width */
+ return *epte & 0x000ffffffffff000;
+}
+
+static inline int __pte_to_epte_perm(int perm)
+{
+ switch (perm) {
+ /* Since we keep the EPT in lockstep with the KPT, we might get some
+ * mapping requests for the kernel (e.g. vmap_pmem). */
+ case PTE_KERN_RW:
+ case PTE_KERN_RO:
+ case PTE_NONE:
+ return 0;
+ case PTE_USER_RW:
+ return EPTE_W | EPTE_R | EPTE_X;
+ case PTE_USER_RO:
+ return EPTE_R | EPTE_X;
+ default:
+ panic("Bad PTE type 0x%x\n", perm);
+ }
+}
+
+static inline void epte_write(epte_t *epte, physaddr_t pa, int settings)
+{
+ /* Could put in a check against the max physaddr len */
+ epte_t temp = pa;
+ temp |= __pte_to_epte_perm(settings & PTE_PERM);
+ temp |= settings & PTE_PS ? EPTE_PS : 0;
+ /* All memory is WB by default, but the guest can override that with their
+ * PAT on the first page walk (guest KPT/cr3) */
+ temp |= EPT_MEM_TYPE(EPT_MEM_TYPE_WB);
+ *epte = temp;
+}
+
+static inline void epte_clear_present(epte_t *epte)
+{
+ *epte &= ~EPTE_P;
+}
+
+static inline void epte_clear(epte_t *epte)
+{
+ *epte = 0;
+}
+
+static inline bool epte_has_perm_ur(epte_t *epte)
+{
+ return (*epte & (EPTE_R | EPTE_X)) == (EPTE_R | EPTE_X);
+}
+
+static inline bool epte_has_perm_urw(epte_t *epte)
+{
+ return (*epte & (EPTE_R | EPTE_W | EPTE_X)) == (EPTE_R | EPTE_W | EPTE_X);
+}
+
+/* We want to know User and Writable, in the 'PTE' sense. All present epte
+ * entries are User PTEs. */
+static inline int epte_get_perm(epte_t *epte)
+{
+ int settings = 0;
+ if (*epte & EPTE_P) {
+ settings |= PTE_P | PTE_U;
+ settings |= *epte & EPTE_W ? PTE_W : 0;
+ }
+ //settings |= *epte & EPTE_PS ? PTE_PS : 0; /* TODO */
+ return settings;
+}
+
+/* Again, we're replacing the old perms with U and/or W. Any non-U are ignored,
+ * as with epte_write. R (and X) are implied. */
+static inline void epte_replace_perm(epte_t *epte, int settings)
+{
+ *epte = (*epte & ~EPTE_P) | __pte_to_epte_perm(settings & PTE_PERM);
+}
+
+/* These ops might be the same for AMD as for Intel, in which case we can move
+ * the bodies of the ept_sync_* funcs in here */
+static inline void ept_inval_addr(unsigned long addr)
+{
+ if (current && current->vmm.vmmcp)
+ ept_sync_individual_addr(current->env_pgdir.eptp, addr);
+}
+
+static inline void ept_inval_context(void)
+{
+ if (current && current->vmm.vmmcp)
+ ept_sync_context(current->env_pgdir.eptp);
+}
+
+static inline void ept_inval_global(void)
+{
+ ept_sync_global();
+}
+
+#endif /* ROS_ARCH_VMM_EPT_H */
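
For a feel of how the invalidation helpers get exercised, here's a hypothetical
unmap path built only from names in this patch plus the existing pgdir_walk()
(this helper is not part of the patch):

	static void unmap_one_page(struct proc *p, uintptr_t va)
	{
		pte_t pte = pgdir_walk(p->env_pgdir, (void*)va, FALSE);

		if (!pte_walk_okay(pte) || !pte_is_mapped(pte))
			return;
		pte_clear(pte);		/* kpte_clear() plus epte_clear() */
		invlpg((void*)va);	/* INVLPG, plus ept_inval_addr() if VMX is on */
	}
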
diff --git a/kern/arch/x86/vmm/intel/Kbuild b/kern/arch/x86/vmm/intel/Kbuild
index a0f377a..152480f 100644
--- a/kern/arch/x86/vmm/intel/Kbuild
+++ b/kern/arch/x86/vmm/intel/Kbuild
@@ -1,2 +1 @@
obj-y += vmx.o
-obj-y += ept.o
diff --git a/kern/arch/x86/vmm/intel/ept.c b/kern/arch/x86/vmm/intel/ept.c
deleted file mode 100644
index 08ff585..0000000
--- a/kern/arch/x86/vmm/intel/ept.c
+++ /dev/null
@@ -1,335 +0,0 @@
-/**
- * ept.c - Support for Intel's Extended Page Tables
- *
- * Authors:
- * Adam Belay <abelay@stanford.edu>
- *
- * Right now we support EPT by making a sort of 'shadow' copy of the Linux
- * process page table. In the future, a more invasive architecture port
- * to VMX x86 could provide better performance by eliminating the need for
- * two copies of each page table entry, relying instead on only the EPT
- * format.
- *
- * This code is only a prototype and could benefit from a more comprehensive
- * review in terms of performance and correctness. Also, the implications
- * of threaded processes haven't been fully considered.
- *
- * Some of the low-level EPT functions are based on KVM.
- * Original Authors:
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- */
-
-#include <kmalloc.h>
-#include <string.h>
-#include <stdio.h>
-#include <assert.h>
-#include <error.h>
-#include <pmap.h>
-#include <sys/queue.h>
-#include <smp.h>
-#include <kref.h>
-#include <atomic.h>
-#include <alarm.h>
-#include <event.h>
-#include <umem.h>
-#include <bitops.h>
-#include <arch/types.h>
-#include <syscall.h>
-#include <monitor.h>
-
-#include "vmx.h"
-#include "../vmm.h"
-
-#include "cpufeature.h"
-
-#define EPT_LEVELS 4 /* 0 through 3 */
-#define HUGE_PAGE_SIZE 2097152
-#define PageHuge(x) (0)
-
-#define VMX_EPT_FAULT_READ 0x01
-#define VMX_EPT_FAULT_WRITE 0x02
-#define VMX_EPT_FAULT_INS 0x04
-
-typedef unsigned long epte_t;
-
-#define __EPTE_READ 0x01
-#define __EPTE_WRITE 0x02
-#define __EPTE_EXEC 0x04
-#define __EPTE_IPAT 0x40
-#define __EPTE_SZ 0x80
-#define __EPTE_TYPE(n) (((n) & 0x7) << 3)
-
-enum {
- EPTE_TYPE_UC = 0, /* uncachable */
- EPTE_TYPE_WC = 1, /* write combining */
- EPTE_TYPE_WT = 4, /* write through */
- EPTE_TYPE_WP = 5, /* write protected */
- EPTE_TYPE_WB = 6, /* write back */
-};
-
-#define __EPTE_NONE 0
-#define __EPTE_FULL (__EPTE_READ | __EPTE_WRITE | __EPTE_EXEC)
-
-#define EPTE_ADDR (~(PAGE_SIZE - 1))
-#define EPTE_FLAGS (PAGE_SIZE - 1)
-
-static inline uintptr_t epte_addr(epte_t epte)
-{
- return (epte & EPTE_ADDR);
-}
-
-static inline uintptr_t epte_page_vaddr(epte_t epte)
-{
- return (uintptr_t) KADDR(epte_addr(epte));
-}
-
-static inline epte_t epte_flags(epte_t epte)
-{
- return (epte & EPTE_FLAGS);
-}
-
-static inline int epte_present(epte_t epte)
-{
- return (epte & __EPTE_FULL) > 0;
-}
-
-static inline int epte_big(epte_t epte)
-{
- return (epte & __EPTE_SZ) > 0;
-}
-
-#define ADDR_TO_IDX(la, n) \
- ((((unsigned long) (la)) >> (12 + 9 * (n))) & ((1 << 9) - 1))
-
-/* for now we assume in 'current' */
-static int
-ept_lookup_gpa(epte_t *dir, void *gpa, int level, int create, epte_t **epte_out)
-{
- int i;
-
- for (i = EPT_LEVELS - 1; i > level; i--) {
- int idx = ADDR_TO_IDX(gpa, i);
- printk("%d: gpa %p, idx %p\n", i, gpa, idx);
- if (!epte_present(dir[idx])) {
- printk("not present\n");
- void *page;
-
- if (!create)
- return -ENOENT;
-
- page = (void *) kpage_zalloc_addr();
- if (!page)
- return -ENOMEM;
- printk("page %p\n", page);
- dir[idx] = epte_addr(PADDR(page)) |
- __EPTE_FULL;
- printk("Set %p[%p] to %p\n", dir, idx, dir[idx]);
- }
-
- if (epte_big(dir[idx])) {
- if (i != 1)
- return -EINVAL;
- level = i;
- break;
- }
-
- dir = (epte_t *) epte_page_vaddr(dir[idx]);
- printk("Dir for next pass: %p\n", dir);
- }
-
- *epte_out = &dir[ADDR_TO_IDX(gpa, level)];
- printk("Final ept is %p\n", *epte_out);
- return 0;
-}
-
-static void free_ept_page(epte_t epte)
-{
- // TODO: clean this up.
- void *page = KADDR(epte & ~0xfff);
- //struct page *page = pfn_to_page(epte_addr(epte) >> PAGE_SHIFT);
-
- kfree(page);
-}
-
-static void vmx_free_ept(unsigned long ept_root)
-{
- epte_t *pgd = (epte_t *) KADDR(ept_root);
- int i, j, k, l;
-
- // TODO: change all instances of 512 to something.
- for (i = 0; i < 512; i++) {
- epte_t *pud = (epte_t *) epte_page_vaddr(pgd[i]);
- if (!epte_present(pgd[i]))
- continue;
-
- for (j = 0; j < 512; j++) {
- epte_t *pmd = (epte_t *) epte_page_vaddr(pud[j]);
- if (!epte_present(pud[j]))
- continue;
- if (epte_flags(pud[j]) & __EPTE_SZ)
- continue;
-
- for (k = 0; k < 512; k++) {
- epte_t *pte = (epte_t *) epte_page_vaddr(pmd[k]);
- if (!epte_present(pmd[k]))
- continue;
- if (epte_flags(pmd[k]) & __EPTE_SZ) {
- free_ept_page(pmd[k]);
- continue;
- }
-
- for (l = 0; l < 512; l++) {
- if (!epte_present(pte[l]))
- continue;
-
- free_ept_page(pte[l]);
- }
-
- kfree(pte);
- }
-
- kfree(pmd);
- }
-
- kfree(pud);
- }
-
- kfree(pgd);
-}
-
-static int ept_clear_epte(epte_t *epte)
-{
- if (*epte == __EPTE_NONE)
- return 0;
-
- free_ept_page(*epte);
- *epte = __EPTE_NONE;
-
- return 1;
-}
-
-/* We're given a guest physical and a host physical. */
-static int ept_set_epte(epte_t *dir, int make_write, unsigned long gpa, unsigned long hpa)
-{
- int ret = -1;
- epte_t *epte, flags;
- struct page *page = NULL;
-
- // We're going to assume locking is done by this point.
- // TODO: PageHuge
-
- ret = ept_lookup_gpa(dir, (void *) gpa, PageHuge(page) ? 1 : 0, 1, &epte);
- if (ret) {
- printk("ept: failed to lookup EPT entry\n");
- return ret;
- }
-
- printk("=====================> epte %p is %p\n", epte, *epte);
- if (epte_present(*epte) && (epte_big(*epte) || !PageHuge(page))) {
- printk("PRESENT? WTF? OK ...\n");
- monitor(NULL);
- //ept_clear_epte(epte);
- } else {
- flags = __EPTE_READ | __EPTE_EXEC | __EPTE_WRITE |
- __EPTE_TYPE(EPTE_TYPE_WB) | __EPTE_IPAT;
- if (make_write)
- flags |= __EPTE_WRITE;
-
- /* TODO: fix thishuge page shit.*/
- if (PageHuge(page)) {
- flags |= __EPTE_SZ;
- if (epte_present(*epte) && !epte_big(*epte)){
- panic("free huge page?");
- //free_page(epte_page_vaddr(*epte));
- }
- /* FIXME: free L0 entries too */
- *epte = epte_addr(PADDR(page) & ~((1 << 21) - 1)) |
- flags;
- } else {
- *epte = epte_addr(hpa) | flags;
- printk("Set epte to %p\n", *epte);
- }
- }
- return 0;
-}
-
-// TODO: kill this?
-// NOTE: guest physical is 1:1 mapped to host virtual. This is NOT
-// like dune at all.
-int vmx_do_ept_fault(void *dir, unsigned long gpa, unsigned long hpa, int fault_flags)
-{
- int ret;
- int make_write = (fault_flags & VMX_EPT_FAULT_WRITE) ? 1 : 0;
-
- printk("ept: GPA: 0x%lx, GVA: 0x%lx, flags: %x\n",
- gpa, hpa, fault_flags);
-
- ret = ept_set_epte((epte_t *)dir, make_write, gpa, hpa);
-
- return ret;
-}
-
-/*
- * ept_fault_pages pre-faults pages in the range start..end
- */
-int ept_fault_pages(void *dir, uint32_t start, uint32_t end)
-{
- uint64_t i;
- int ret;
- for(i = start; i < end; i++) {
- uint64_t addr = i << 12;
- ret = vmx_do_ept_fault((epte_t*)dir, i, i, VMX_EPT_FAULT_WRITE);
- if (ret) {
- return ret;
- }
- }
- return 0;
-}
-/**
- * ept_invalidate_page - removes a page from the EPT
- * @vcpu: the vcpu
- * @mm: the process's mm_struct
- * @addr: the address of the page
- *
- * Returns 1 if the page was removed, 0 otherwise
- */
-static int ept_invalidate_page(epte_t *dir, unsigned long addr)
-{
- int ret;
- epte_t *epte;
- void *gpa = (void *) addr;
-
- ret = ept_lookup_gpa(dir, (void *) gpa, 0, 0, &epte);
- if (ret) {
- return 0;
- }
-
- ret = ept_clear_epte(epte);
-
- /* TODO: sync individual?
- if (ret)
- vmx_ept_sync_individual_addr(vcpu, (gpa_t) gpa);
- */
-
- return ret;
-}
-
-/**
- * ept_check_page - determines if a page is mapped in the ept
- * @vcpu: the vcpu
- * @mm: the process's mm_struct
- * @addr: the address of the page
- *
- * Returns 1 if the page is mapped, 0 otherwise
- */
-int ept_check_page(void *dir, unsigned long addr)
-{
- int ret;
- epte_t *epte;
- void *gpa = (void *) addr;
-
- ret = ept_lookup_gpa((epte_t *)dir, gpa, 0, 0, &epte);
-
- return ret;
-}
diff --git a/kern/arch/x86/vmm/intel/vmx.c b/kern/arch/x86/vmm/intel/vmx.c
index 25bd9be..ef0bb45 100644
--- a/kern/arch/x86/vmm/intel/vmx.c
+++ b/kern/arch/x86/vmm/intel/vmx.c
@@ -173,6 +173,8 @@
static unsigned long *msr_bitmap;
+int x86_ept_pte_fix_ups = 0;
+
struct vmx_capability vmx_capability;
struct vmcs_config vmcs_config;
@@ -654,67 +656,6 @@
//put_cpu();
}
-static void __vmx_sync_helper(struct hw_trapframe *hw_tf, void *ptr)
-{
- struct vmx_vcpu *vcpu = ptr;
-
- ept_sync_context(vcpu_get_eptp(vcpu));
-}
-
-struct sync_addr_args {
- struct vmx_vcpu *vcpu;
- gpa_t gpa;
-};
-
-static void __vmx_sync_individual_addr_helper(struct hw_trapframe *hw_tf, void *ptr)
-{
- struct sync_addr_args *args = ptr;
-
-// ept_sync_individual_addr(
-
-}
-
-/**
- * vmx_ept_sync_global - used to evict everything in the EPT
- * @vcpu: the vcpu
- */
-void vmx_ept_sync_vcpu(struct vmx_vcpu *vcpu)
-{
- handler_wrapper_t *w;
-
- smp_call_function_single(vcpu->cpu,
- __vmx_sync_helper, (void *) vcpu, &w);
-
- if (smp_call_wait(w)) {
- printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
- }
-
-
-}
-
-/**
- * vmx_ept_sync_individual_addr - used to evict an individual address
- * @vcpu: the vcpu
- * @gpa: the guest-physical address
- */
-void vmx_ept_sync_individual_addr(struct vmx_vcpu *vcpu, gpa_t gpa)
-{
- struct sync_addr_args args;
- args.vcpu = vcpu;
- args.gpa = gpa;
-
- handler_wrapper_t *w;
-
-
- smp_call_function_single(vcpu->cpu,
- __vmx_sync_individual_addr_helper, (void *) &args, &w);
-
- if (smp_call_wait(w)) {
- printk("litevm_init. smp_call_wait failed. Expect a panic.\n");
- }
-
-}
-
/**
* vmx_dump_cpu - prints the CPU state
* @vcpu: VCPU to print
@@ -1212,7 +1153,7 @@
if (page) {
uint64_t hpa = page2pa(page);
printk("hpa for %p is %p\n", gpa, hpa);
- ret = vmx_do_ept_fault(vcpu->proc->env_pgdir.epte, gpa, hpa, exit_qual);
+ ret = -1;
printk("vmx_do_ept_fault returns %d\n", ret);
}
@@ -1531,7 +1472,7 @@
}
if (!cpu_has_vmx_ept_ad_bits()) {
printk("VMX EPT doesn't support accessed/dirty!\n");
- /* TODO: set the pmap_ops accordingly */
+ x86_ept_pte_fix_ups |= EPTE_A | EPTE_D;
}
if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
printk("VMX EPT can't invalidate PTEs/TLBs!\n");
diff --git a/kern/arch/x86/vmm/intel/vmx.h b/kern/arch/x86/vmm/intel/vmx.h
index d329d19..c69168d 100644
--- a/kern/arch/x86/vmm/intel/vmx.h
+++ b/kern/arch/x86/vmm/intel/vmx.h
@@ -502,6 +502,9 @@
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
+#define VMX_EPT_FAULT_READ 0x01
+#define VMX_EPT_FAULT_WRITE 0x02
+#define VMX_EPT_FAULT_INS 0x04
#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"