/**
* ept.c - Support for Intel's Extended Page Tables
*
* Authors:
* Adam Belay <abelay@stanford.edu>
*
* Right now we support EPT by making a sort of 'shadow' copy of the Linux
* process page table. In the future, a more invasive architecture port
* to VMX x86 could provide better performance by eliminating the need for
* two copies of each page table entry, relying instead on only the EPT
* format.
*
* This code is only a prototype and could benefit from a more comprehensive
* review in terms of performance and correctness. Also, the implications
* of threaded processes haven't been fully considered.
*
* Some of the low-level EPT functions are based on KVM.
* Original Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*/
#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <pmap.h>
#include <sys/queue.h>
#include <smp.h>
#include <kref.h>
#include <atomic.h>
#include <alarm.h>
#include <event.h>
#include <umem.h>
#include <bitops.h>
#include <arch/types.h>
#include <syscall.h>
#include <monitor.h>
#include "vmx.h"
#include "../vmm.h"
#include "cpufeature.h"
#define EPT_LEVELS 4 /* levels 0 through 3 */
#define HUGE_PAGE_SIZE 2097152 /* 2 MiB */
#define PageHuge(x) (0) /* no huge-page support yet */

/* Access bits from the EPT-violation exit qualification */
#define VMX_EPT_FAULT_READ 0x01
#define VMX_EPT_FAULT_WRITE 0x02
#define VMX_EPT_FAULT_INS 0x04
typedef unsigned long epte_t; /* a 64-bit EPT entry */

#define __EPTE_READ 0x01 /* bit 0: read access allowed */
#define __EPTE_WRITE 0x02 /* bit 1: write access allowed */
#define __EPTE_EXEC 0x04 /* bit 2: instruction fetch allowed */
#define __EPTE_IPAT 0x40 /* bit 6: ignore guest PAT memory type */
#define __EPTE_SZ 0x80 /* bit 7: entry maps a large page */
#define __EPTE_TYPE(n) (((n) & 0x7) << 3) /* bits 5:3: EPT memory type */
enum {
EPTE_TYPE_UC = 0, /* uncacheable */
EPTE_TYPE_WC = 1, /* write combining */
EPTE_TYPE_WT = 4, /* write through */
EPTE_TYPE_WP = 5, /* write protected */
EPTE_TYPE_WB = 6, /* write back */
};
#define __EPTE_NONE 0
#define __EPTE_FULL (__EPTE_READ | __EPTE_WRITE | __EPTE_EXEC)
#define EPTE_ADDR (~(PAGE_SIZE - 1)) /* mask for the physical-address bits */
#define EPTE_FLAGS (PAGE_SIZE - 1) /* mask for the low flag bits */
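
/*
 * For illustration, a fully-permissioned 4 KiB leaf entry for write-back
 * cacheable memory would be composed like this (a sketch; 'hpa' stands in
 * for a page-aligned host-physical address):
 *
 *	epte_t leaf = (hpa & EPTE_ADDR) | __EPTE_FULL |
 *		__EPTE_TYPE(EPTE_TYPE_WB) | __EPTE_IPAT;
 *
 * With __EPTE_IPAT set, the guest's PAT is ignored and the EPT memory type
 * in bits 5:3 alone determines cacheability.
 */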
static inline uintptr_t epte_addr(epte_t epte)
{
return (epte & EPTE_ADDR);
}
static inline uintptr_t epte_page_vaddr(epte_t epte)
{
return (uintptr_t) KADDR(epte_addr(epte));
}
static inline epte_t epte_flags(epte_t epte)
{
return (epte & EPTE_FLAGS);
}
static inline int epte_present(epte_t epte)
{
return (epte & __EPTE_FULL) > 0;
}
static inline int epte_big(epte_t epte)
{
return (epte & __EPTE_SZ) > 0;
}
#define ADDR_TO_IDX(la, n) \
((((unsigned long) (la)) >> (12 + 9 * (n))) & ((1 << 9) - 1))
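
/*
 * Worked example: ADDR_TO_IDX() extracts the 9-bit table index for EPT
 * level 'n' from a guest-physical address.  For gpa = 0x8080604000:
 *
 *	ADDR_TO_IDX(gpa, 3) == 1	(bits 47:39, top-level table)
 *	ADDR_TO_IDX(gpa, 2) == 2	(bits 38:30)
 *	ADDR_TO_IDX(gpa, 1) == 3	(bits 29:21)
 *	ADDR_TO_IDX(gpa, 0) == 4	(bits 20:12, 4 KiB page level)
 */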
/* For now we assume we're operating on behalf of 'current'. */
static int
ept_lookup_gpa(epte_t *dir, void *gpa, int level, int create, epte_t **epte_out)
{
	int i;

	for (i = EPT_LEVELS - 1; i > level; i--) {
		int idx = ADDR_TO_IDX(gpa, i);

		printk("%d: gpa %p, idx %d\n", i, gpa, idx);
		if (!epte_present(dir[idx])) {
			void *page;

			printk("not present\n");
			if (!create)
				return -ENOENT;
			page = (void *) kpage_zalloc_addr();
			if (!page)
				return -ENOMEM;
			printk("page %p\n", page);
			dir[idx] = epte_addr(PADDR(page)) | __EPTE_FULL;
			printk("Set %p[%d] to 0x%lx\n", dir, idx, dir[idx]);
		}
		if (epte_big(dir[idx])) {
			/* Large-page leaves are only valid at level 1 (2 MiB). */
			if (i != 1)
				return -EINVAL;
			level = i;
			break;
		}
		dir = (epte_t *) epte_page_vaddr(dir[idx]);
		printk("Dir for next pass: %p\n", dir);
	}
	*epte_out = &dir[ADDR_TO_IDX(gpa, level)];
	printk("Final epte is %p\n", *epte_out);
	return 0;
}
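
/*
 * A minimal usage sketch (error handling elided): walk to the 4 KiB level,
 * creating intermediate tables on demand, and install a leaf entry.
 * 'ept_root', 'gpa', and 'hpa' are stand-ins supplied by the caller.
 *
 *	epte_t *epte;
 *
 *	if (ept_lookup_gpa(ept_root, (void *) gpa, 0, 1, &epte) == 0)
 *		*epte = epte_addr(hpa) | __EPTE_FULL |
 *			__EPTE_TYPE(EPTE_TYPE_WB) | __EPTE_IPAT;
 */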
static void free_ept_page(epte_t epte)
{
	/* TODO: use proper page refcounting rather than a raw kfree. */
	kfree((void *) epte_page_vaddr(epte));
}
/* Each EPT table is one 4 KiB page holding 512 8-byte entries. */
#define NR_EPT_ENTRIES 512

static void vmx_free_ept(unsigned long ept_root)
{
	epte_t *pgd = (epte_t *) KADDR(ept_root);
	int i, j, k, l;

	for (i = 0; i < NR_EPT_ENTRIES; i++) {
		epte_t *pud;

		if (!epte_present(pgd[i]))
			continue;
		pud = (epte_t *) epte_page_vaddr(pgd[i]);
		for (j = 0; j < NR_EPT_ENTRIES; j++) {
			epte_t *pmd;

			if (!epte_present(pud[j]))
				continue;
			if (epte_flags(pud[j]) & __EPTE_SZ)
				continue;
			pmd = (epte_t *) epte_page_vaddr(pud[j]);
			for (k = 0; k < NR_EPT_ENTRIES; k++) {
				epte_t *pte;

				if (!epte_present(pmd[k]))
					continue;
				if (epte_flags(pmd[k]) & __EPTE_SZ) {
					/* 2 MiB leaf: free the backing page directly. */
					free_ept_page(pmd[k]);
					continue;
				}
				pte = (epte_t *) epte_page_vaddr(pmd[k]);
				for (l = 0; l < NR_EPT_ENTRIES; l++) {
					if (!epte_present(pte[l]))
						continue;
					free_ept_page(pte[l]);
				}
				kfree(pte);
			}
			kfree(pmd);
		}
		kfree(pud);
	}
	kfree(pgd);
}
/* Clear an epte, freeing its backing page.  Returns 1 if an entry was
 * actually cleared, 0 if it was already empty. */
static int ept_clear_epte(epte_t *epte)
{
if (*epte == __EPTE_NONE)
return 0;
free_ept_page(*epte);
*epte = __EPTE_NONE;
return 1;
}
/* We're given a guest physical address and a host physical address. */
static int ept_set_epte(epte_t *dir, int make_write, unsigned long gpa,
                        unsigned long hpa)
{
	int ret;
	epte_t *epte, flags;
	struct page *page = NULL;

	/* We assume the caller has done any necessary locking by this point.
	 * TODO: PageHuge is stubbed to 0, so the huge-page paths below are
	 * currently dead. */
	ret = ept_lookup_gpa(dir, (void *) gpa, PageHuge(page) ? 1 : 0, 1,
	                     &epte);
	if (ret) {
		printk("ept: failed to lookup EPT entry\n");
		return ret;
	}
	printk("ept: epte %p is 0x%lx\n", epte, *epte);
	if (epte_present(*epte) && (epte_big(*epte) || !PageHuge(page))) {
		/* We can't handle remapping an already-present entry yet;
		 * drop to the monitor for debugging. */
		printk("ept: entry %p unexpectedly present\n", epte);
		monitor(NULL);
		//ept_clear_epte(epte);
	} else {
		flags = __EPTE_READ | __EPTE_EXEC |
		        __EPTE_TYPE(EPTE_TYPE_WB) | __EPTE_IPAT;
		if (make_write)
			flags |= __EPTE_WRITE;
		/* TODO: fix this huge-page handling. */
		if (PageHuge(page)) {
			flags |= __EPTE_SZ;
			if (epte_present(*epte) && !epte_big(*epte)) {
				panic("free huge page?");
				//free_page(epte_page_vaddr(*epte));
			}
			/* FIXME: free L0 entries too */
			*epte = epte_addr(PADDR(page) & ~(HUGE_PAGE_SIZE - 1)) |
				flags;
		} else {
			*epte = epte_addr(hpa) | flags;
			printk("Set epte to 0x%lx\n", *epte);
		}
	}
	return 0;
}
// TODO: kill this?
// NOTE: guest physical is 1:1 mapped to host virtual. This is NOT
// like dune at all.
int vmx_do_ept_fault(void *dir, unsigned long gpa, unsigned long hpa,
                     int fault_flags)
{
	int ret;
	int make_write = (fault_flags & VMX_EPT_FAULT_WRITE) ? 1 : 0;

	printk("ept: GPA: 0x%lx, HPA: 0x%lx, flags: %x\n",
	       gpa, hpa, fault_flags);
	ret = ept_set_epte((epte_t *) dir, make_write, gpa, hpa);
	return ret;
}
/*
 * ept_fault_pages pre-faults the pages with PFNs in the range start
 * (inclusive) to end (exclusive), identity-mapping guest physical to
 * host physical.
 */
int ept_fault_pages(void *dir, uint32_t start, uint32_t end)
{
	uint64_t i;
	int ret;

	for (i = start; i < end; i++) {
		uint64_t addr = i << 12;	/* PFN -> byte address */

		ret = vmx_do_ept_fault((epte_t *) dir, addr, addr,
		                       VMX_EPT_FAULT_WRITE);
		if (ret)
			return ret;
	}
	return 0;
}
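
/*
 * For example, identity-mapping the first 1 MiB of guest-physical memory
 * (PFNs 0 through 255) would look like this, with 'ept_root' standing in
 * for the caller's top-level EPT table:
 *
 *	ret = ept_fault_pages(ept_root, 0, 256);
 */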
/**
 * ept_invalidate_page - removes a page from the EPT
 * @dir: the EPT top-level table (PML4 equivalent)
 * @addr: the guest-physical address of the page
 *
 * Returns 1 if the page was removed, 0 otherwise
 */
static int ept_invalidate_page(epte_t *dir, unsigned long addr)
{
	int ret;
	epte_t *epte;
	void *gpa = (void *) addr;

	ret = ept_lookup_gpa(dir, gpa, 0, 0, &epte);
	if (ret)
		return 0;
	ret = ept_clear_epte(epte);
	/* TODO: sync individual?
	if (ret)
		vmx_ept_sync_individual_addr(vcpu, (gpa_t) gpa);
	*/
	return ret;
}
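
/*
 * A hypothetical caller (none exists in this file yet) would pair a
 * successful invalidation with an EPT TLB flush, e.g. the INVEPT-based
 * sync mentioned in the TODO above:
 *
 *	if (ept_invalidate_page(ept_root, gpa))
 *		;	// TODO: INVEPT/sync for this gpa
 */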
/**
 * ept_check_page - determines if a page is mapped in the ept
 * @dir: the EPT top-level table (PML4 equivalent)
 * @addr: the guest-physical address of the page
 *
 * Returns 0 if the page is mapped, a negative errno otherwise
 */
int ept_check_page(void *dir, unsigned long addr)
{
	int ret;
	epte_t *epte;
	void *gpa = (void *) addr;

	ret = ept_lookup_gpa((epte_t *) dir, gpa, 0, 0, &epte);
	return ret;
}
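
/*
 * Note that ept_check_page() follows errno conventions: 0 means mapped.
 * An illustrative caller:
 *
 *	if (ept_check_page(ept_root, gpa) == 0)
 *		printk("gpa 0x%lx is backed\n", gpa);
 */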