| /* Copyright (c) 2013 The Regents of the University of California |
| * Barret Rhoden <brho@cs.berkeley.edu> |
| * See LICENSE for details. |
| * |
| * 64 bit virtual memory / address space management (and a touch of pmem). |
| * |
| * TODO: |
 * - better testing: check my helper funcs, do a variety of inserts/segment
 *   maps, remove it all, etc (esp with jumbos).  Check permissions and the
 *   existence of mappings.
| * - mapping segments doesn't support having a PTE already present |
| * - mtrrs break big machines |
| * - jumbo pages are only supported at the VM layer, not PM (a jumbo is 2^9 |
| * little pages, for example) |
| * - usermemwalk and freeing might need some help (in higher layers of the |
| * kernel). */ |
| |
| #include <arch/x86.h> |
| #include <arch/arch.h> |
| #include <arch/mmu.h> |
| #include <arch/apic.h> |
| #include <error.h> |
| #include <sys/queue.h> |
| #include <atomic.h> |
| #include <string.h> |
| #include <assert.h> |
| #include <pmap.h> |
| #include <env.h> |
| #include <stdio.h> |
| #include <kmalloc.h> |
| #include <page_alloc.h> |
| #include <umem.h> |
| |
| extern char boot_pml4[], gdt64[], gdt64desc[]; |
| pgdir_t boot_pgdir; |
| physaddr_t boot_cr3; |
| segdesc_t *gdt; |
| pseudodesc_t gdt_pd; |
| |
| #define PG_WALK_SHIFT_MASK 0x00ff /* first byte = target shift */ |
| #define PG_WALK_CREATE 0x0100 |
| |
| kpte_t *pml_walk(kpte_t *pml, uintptr_t va, int flags); |
| typedef int (*kpte_cb_t)(kpte_t *kpte, uintptr_t kva, int pml_shift, |
| bool visited_subs, void *arg); |
| int pml_for_each(kpte_t *pml, uintptr_t start, size_t len, kpte_cb_t callback, |
| void *arg); |
| /* Helpers for PML for-each walks */ |
| static inline bool pte_is_final(pte_t pte, int pml_shift) |
| { |
| return (pml_shift == PML1_SHIFT) || pte_is_jumbo(pte); |
| } |
| |
| static inline bool pte_is_intermediate(pte_t pte, int pml_shift) |
| { |
| return !pte_is_final(pte, pml_shift); |
| } |
| |
| /* Helper: gets the kpte_t pointer which is the base of the PML4 from pgdir */ |
| static kpte_t *pgdir_get_kpt(pgdir_t pgdir) |
| { |
| return pgdir.kpte; |
| } |
| |
| /* Helper: returns true if we do not need to walk the page table any further. |
| * |
| * The caller may or may not know if a jumbo is desired. pml_shift determines |
| * which layer we are at in the page walk, and flags contains the target level |
| * we're looking for, like a jumbo or a default. |
| * |
| * Regardless of the desired target, if we find a jumbo page, we're also done. |
| */ |
| static bool walk_is_complete(kpte_t *kpte, int pml_shift, int flags) |
| { |
| if ((pml_shift == (flags & PG_WALK_SHIFT_MASK)) || (*kpte & PTE_PS)) |
| return TRUE; |
| return FALSE; |
| } |
| |
/* Helper: converts an intermediate kpte to the KVA of the next PML.  PTE_ADDR
 * should only be used on a PTE that holds the physical address of the next
 * PML, i.e. not on a final PTE in the page table walk. */
| static kpte_t *kpte2pml(kpte_t kpte) |
| { |
| return (kpte_t*)KADDR(PTE_ADDR(kpte)); |
| } |
| |
| static kpte_t *__pml_walk(kpte_t *pml, uintptr_t va, int flags, int pml_shift) |
| { |
| kpte_t *kpte; |
| epte_t *epte; |
| void *new_pml_kva; |
| |
| kpte = &pml[PMLx(va, pml_shift)]; |
| epte = kpte_to_epte(kpte); |
| if (walk_is_complete(kpte, pml_shift, flags)) |
| return kpte; |
| if (!kpte_is_present(kpte)) { |
| if (!(flags & PG_WALK_CREATE)) |
| return NULL; |
		new_pml_kva = kpages_alloc(2 * PGSIZE, MEM_WAIT);
		/* Might want better error handling (we're probably out of
		 * memory) */
		if (!new_pml_kva)
			return NULL;
		memset(new_pml_kva, 0, PGSIZE * 2);
| /* We insert the new PT into the PML with U and W perms. |
| * Permissions on page table walks are anded together (if any of |
| * them are !User, the translation is !User). We put the perms |
| * on the last entry, not the intermediates. */ |
| *kpte = PADDR(new_pml_kva) | PTE_P | PTE_U | PTE_W; |
| /* For a dose of paranoia, we'll avoid mapping intermediate |
| * eptes when we know we're using an address that should never |
		 * be ept-accessible. */
| if (va < ULIM) { |
| /* The physaddr of the new_pml is one page higher than |
| * the KPT page. |
| * A few other things: |
			 * - for the same reason that we have U and W set on
			 *   all intermediate KPTEs, we now set R, X, and W
			 *   for the EPTE.
| * - All EPTEs have U perms |
			 * - We can't use epte_write since we're working on
| * intermediate PTEs, and they don't have the memory |
| * type set. */ |
| *epte = (PADDR(new_pml_kva) + PGSIZE) | EPTE_R | EPTE_X |
| | EPTE_W; |
| } |
| } |
| return __pml_walk(kpte2pml(*kpte), va, flags, pml_shift - BITS_PER_PML); |
| } |
| |
| /* Returns a pointer to the page table entry corresponding to va. Flags has |
| * some options and selects which level of the page table we're happy with |
| * stopping at. Normally, this is PML1 for a normal page (e.g. flags = |
| * PML1_SHIFT), but could be for a jumbo page (PML3 or PML2 entry). |
| * |
| * Flags also controls whether or not intermediate page tables are created or |
| * not. This is useful for when we are checking whether or not a mapping |
| * exists, but aren't interested in creating intermediate tables that will not |
| * get filled. When we want to create intermediate pages (i.e. we're looking |
| * for the PTE to insert a page), pass in PG_WALK_CREATE with flags. |
| * |
| * Returns 0 on error or absence of a PTE for va. */ |
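/* Example (sketch): looking up, and creating if needed, the final PTE for a
 * normal 4 KB kernel mapping.  Dropping PG_WALK_CREATE turns this into a pure
 * existence check.  'pgdir', 'va', and 'pa' are hypothetical:
 *
 *	kpte_t *kpte = pml_walk(pgdir_get_kpt(pgdir), va,
 *	                        PML1_SHIFT | PG_WALK_CREATE);
 *
 *	if (kpte)
 *		pte_write(kpte, pa, PTE_KERN_RW);
 */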
| kpte_t *pml_walk(kpte_t *pml, uintptr_t va, int flags) |
| { |
| return __pml_walk(pml, va, flags, PML4_SHIFT); |
| } |
| |
| /* Helper: determines how much va needs to be advanced until it is aligned to |
| * pml_shift. */ |
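/* E.g. va = 0x201000 with pml_shift = PML2_SHIFT (2 MB chunks): the next 2 MB
 * boundary is 0x400000, so this returns 0x1ff000.  If va is already aligned,
 * the final mask turns the full-chunk result back into 0. */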
| static uintptr_t amt_til_aligned(uintptr_t va, int pml_shift) |
| { |
| /* find the lower bits of va, subtract them from the shift to see what |
| * we would need to add to get to the shift. va might be aligned |
| * already, and we subtracted 0, so we mask off the top part again. */ |
| return ((1UL << pml_shift) - (va & ((1UL << pml_shift) - 1))) & |
| ((1UL << pml_shift) - 1); |
| } |
| |
| /* Helper: determines how much of size we can take, in chunks of pml_shift */ |
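/* E.g. size = 0x5ff000 with pml_shift = PML2_SHIFT returns 0x400000: two full
 * 2 MB chunks, leaving 0x1ff000 to be handled with a smaller page size. */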
| static uintptr_t amt_of_aligned_bytes(uintptr_t size, int pml_shift) |
| { |
| /* creates a mask all 1s from MSB down to (including) shift */ |
| return (~((1UL << pml_shift) - 1)) & size; |
| } |
| |
| /* Helper: Advance kpte, given old_pte. Will do pml walks when necessary. */ |
| static kpte_t *get_next_pte(kpte_t *old_pte, kpte_t *pgdir, uintptr_t va, |
| int flags) |
| { |
	/* A PTE pointer (as opposed to its contents) is an address within a
	 * page table.  So long as we stay inside the same PML, we can just
	 * advance via pointer arithmetic.  If we advance old_pte and it points
	 * to the beginning of a page (offset == 0), we've walked off the end
	 * of our original PML, and need to get a new one. */
| old_pte++; |
| if (!PGOFF(old_pte)) |
| return pml_walk(pgdir, va, flags); |
| return old_pte; |
| } |
| |
| /* Helper: maps pages from va to pa for size bytes, all for a given page size */ |
| static void map_my_pages(kpte_t *pgdir, uintptr_t va, size_t size, |
| physaddr_t pa, int perm, int pml_shift) |
| { |
| /* set to trigger a pml walk on the first get_next */ |
| kpte_t *kpte = (kpte_t*)PGSIZE - 1; |
| size_t pgsize = 1UL << pml_shift; |
| |
| for (size_t i = 0; i < size; i += pgsize, va += pgsize, |
| pa += pgsize) { |
| kpte = get_next_pte(kpte, pgdir, va, |
| PG_WALK_CREATE | pml_shift); |
| assert(kpte); |
| pte_write(kpte, pa, perm | (pml_shift != PML1_SHIFT ? PTE_PS |
| : 0)); |
| printd("Wrote *kpte %p, for va %p to pa %p tried to cover %p\n", |
| *kpte, va, pa, amt_mapped); |
| } |
| } |
| |
/* Maps all pages possible from va->pa, up to size, preferring to use pages of
 * the size given by pml_shift (page size == 1 << pml_shift).  Assumes that it
 * is possible to map va to pa at the given shift. */
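/* Example walk-through (sketch), assuming pa has the same alignment as va:
 * mapping size 0x500000 at va 0x201000 with pml_shift = PML2_SHIFT first
 * recurses to map 0x1ff000 with 4 KB pages (up to the 2 MB boundary at
 * 0x400000), then maps 0x200000 as a single 2 MB page, and finally recurses
 * again to cover the remaining 0x101000 with 4 KB pages. */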
| static uintptr_t __map_segment(kpte_t *pgdir, uintptr_t va, size_t size, |
| physaddr_t pa, int perm, int pml_shift) |
| { |
| printd("__map_segment, va %p, size %p, pa %p, shift %d\n", va, size, |
| pa, pml_shift); |
| uintptr_t amt_to_submap, amt_to_map, amt_mapped = 0; |
| |
| amt_to_submap = amt_til_aligned(va, pml_shift); |
| amt_to_submap = MIN(amt_to_submap, size); |
| if (amt_to_submap) { |
| amt_mapped = __map_segment(pgdir, va, amt_to_submap, pa, perm, |
| pml_shift - BITS_PER_PML); |
| va += amt_mapped; |
| pa += amt_mapped; |
| size -= amt_mapped; |
| } |
| /* Now we're either aligned and ready to map, or size == 0 */ |
| amt_to_map = amt_of_aligned_bytes(size, pml_shift); |
| if (amt_to_map) { |
| map_my_pages(pgdir, va, amt_to_map, pa, perm, pml_shift); |
| va += amt_to_map; |
| pa += amt_to_map; |
| size -= amt_to_map; |
| amt_mapped += amt_to_map; |
| } |
| /* Map whatever is left over */ |
| if (size) |
| amt_mapped += __map_segment(pgdir, va, size, pa, perm, |
| pml_shift - BITS_PER_PML); |
| return amt_mapped; |
| } |
| |
| /* Returns the maximum pml shift possible between a va->pa mapping. It is the |
| * number of least-significant bits the two addresses have in common. For |
| * instance, if the two pages are 0x456000 and 0x156000, this returns 20. For |
| * regular pages, it will be at least 12 (every page ends in 0x000). |
| * |
 * The max pml shift possible for a va->pa mapping is determined by the least
 * significant bit that differs between va and pa.
 *
 * We could optimize this a bit, since we know the low 12 bits are the same and
 * the caller clamps the result to arch_max_jumbo_page_shift(). */
| static int max_possible_shift(uintptr_t va, uintptr_t pa) |
| { |
| int shift = 0; |
| if (va == pa) |
| return sizeof(uintptr_t) * 8; |
| while ((va & 1) == (pa & 1)) { |
| va >>= 1; |
| pa >>= 1; |
| shift++; |
| } |
| return shift; |
| } |
| |
| /* Map [va, va+size) of virtual (linear) address space to physical [pa, pa+size) |
| * in the page table rooted at pgdir. Size is a multiple of PGSIZE. Use |
| * permission bits perm for the entries. Set pml_shift to the shift of the |
| * largest page size you're willing to use. |
| * |
| * Doesn't handle having pages currently mapped yet, and while supporting that |
| * is relatively easy, doing an insertion of small pages into an existing jumbo |
| * would be trickier. Might have the vmem region code deal with this. |
| * |
 * Don't use this to set the PAT flag on jumbo pages in perm, unless you are
 * absolutely sure you won't map regular pages. */
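/* Example (sketch): mapping a hypothetical 4 MB physically-contiguous region
 * at physical address 'dev_pa' to a kernel virtual address 'dev_va', letting
 * map_segment use jumbo pages when alignment and hardware allow:
 *
 *	map_segment(boot_pgdir, dev_va, 0x400000, dev_pa, PTE_KERN_RW | PTE_G,
 *	            arch_max_jumbo_page_shift());
 */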
| void map_segment(pgdir_t pgdir, uintptr_t va, size_t size, physaddr_t pa, |
| int perm, int pml_shift) |
| { |
| int max_shift_possible; |
| if (PGOFF(va) || PGOFF(pa) || PGOFF(size)) |
| panic("Asked to map with bad alignment. va %p, pa %p, size %p\n", |
| va, pa, size); |
	/* Given the requested pml_shift, try and use larger pages.  We'll
	 * figure out the largest possible jumbo page, up to whatever we were
	 * asked for. */
| if (pml_shift != PGSHIFT) { |
| max_shift_possible = max_possible_shift(va, pa); |
| max_shift_possible = MIN(max_shift_possible, |
| arch_max_jumbo_page_shift()); |
| /* Assumes we were given a proper PML shift 12, 21, 30, etc */ |
| while (pml_shift > max_shift_possible) |
| pml_shift -= BITS_PER_PML; |
| } |
| assert((pml_shift == PML1_SHIFT) || |
| (pml_shift == PML2_SHIFT) || |
| (pml_shift == PML3_SHIFT)); |
| __map_segment(pgdir_get_kpt(pgdir), va, size, pa, perm, pml_shift); |
| } |
| |
/* For every PTE in [start, start + len), call callback(kpte, kva, shift, etc),
 * including PTEs that are not present.  pml_shift is the shift/size of pml.
| * |
| * This will recurse down into sub PMLs, and perform the CB in a |
| * depth-first-search. The CB will be told which level of the paging it is at, |
| * via 'shift'. |
| * |
 * The CB will also run on intermediate PTEs: that is, it gets called on PTEs
 * that point to page tables, not just on those mapping (jumbo) pages.  If the
 * CB returns anything other than 0, we'll abort and propagate that back out
 * from for_each. */
| static int __pml_for_each(kpte_t *pml, uintptr_t start, size_t len, |
| kpte_cb_t callback, void *arg, int pml_shift) |
| { |
| int ret; |
| bool visited_all_subs; |
| kpte_t *kpte_s, *kpte_e, *kpte_i; |
| uintptr_t kva, pgsize = 1UL << pml_shift; |
| |
| if (!len) |
| return 0; |
| kpte_s = &pml[PMLx(start, pml_shift)]; |
| /* Later, we'll loop up to and including kpte_e. Since start + len |
| * might not be page aligned, we'll need to include the final kpte. If |
| * it is aligned, we don't want to visit, so we subtract one so that the |
| * aligned case maps to the index below its normal kpte. */ |
| kpte_e = &pml[PMLx(start + len - 1, pml_shift)]; |
| /* tracks the virt addr kpte_i works on, rounded for this PML */ |
| kva = ROUNDDOWN(start, pgsize); |
| printd("start %p PMLx(S) %d, end-inc %p PMLx(E) %d shift %d, kva %p\n", |
| start, PMLx(start, pml_shift), start + len - 1, |
| PMLx(start + len - 1, pml_shift), pml_shift, kva); |
| for (kpte_i = kpte_s; kpte_i <= kpte_e; kpte_i++, kva += pgsize) { |
| visited_all_subs = FALSE; |
| /* Complete only on the last level (PML1_SHIFT) or on a jumbo */ |
| if (kpte_is_present(kpte_i) && |
| (!walk_is_complete(kpte_i, pml_shift, PML1_SHIFT))) { |
| /* only pass truncated end points (e.g. start may not be |
| * page aligned) when we're on the first (or last) item. |
| * For the middle entries, we want the subpmls to |
| * process the full range they are responsible for: |
| * [kva, kva + pgsize). */ |
| uintptr_t sub_start = MAX(kva, start); |
| size_t sub_len = MIN(start + len - sub_start, |
| kva + pgsize - sub_start); |
| |
| ret = __pml_for_each(kpte2pml(*kpte_i), sub_start, |
| sub_len, callback, arg, |
| pml_shift - BITS_PER_PML); |
| if (ret) |
| return ret; |
			/* based on sub_{start,len}, we can tell if our sub
			 * visited all of its PTEs. */
| if ((sub_start == kva) && (sub_len == pgsize)) |
| visited_all_subs = TRUE; |
| } |
| if ((ret = callback(kpte_i, kva, pml_shift, visited_all_subs, |
| arg))) |
| return ret; |
| } |
| return 0; |
| } |
| |
| int pml_for_each(kpte_t *pml, uintptr_t start, size_t len, kpte_cb_t callback, |
| void *arg) |
| { |
| return __pml_for_each(pml, start, len, callback, arg, PML4_SHIFT); |
| } |
| |
| /* Unmaps [va, va + size) from pgdir, freeing any intermediate page tables for |
| * non-kernel mappings. This does not free the actual memory pointed to by the |
| * page tables, nor does it flush the TLB. */ |
| int unmap_segment(pgdir_t pgdir, uintptr_t va, size_t size) |
| { |
| int pt_free_cb(kpte_t *kpte, uintptr_t kva, int shift, |
| bool visited_subs, void *data) |
| { |
| if (!kpte_is_present(kpte)) |
| return 0; |
| if (pte_is_final(kpte, shift)) { |
| pte_clear(kpte); |
| return 0; |
| } |
| /* Never remove intermediate pages for any kernel mappings. |
| * This is also important for x86 so that we don't accidentally |
| * free any of the boot PMLs, which aren't two-page alloc'd from |
| * kpages_arena. */ |
| if (kva >= ULIM) |
| return 0; |
| /* If we haven't visited all of our subs, we might still have |
| * some mappings hanging off this page table. */ |
| if (!visited_subs) { |
			/* the first kpte of the child is also the pml */
			kpte_t *kpte_i = kpte2pml(*kpte);
| /* make sure we have no PTEs in use */ |
| for (int i = 0; i < NPTENTRIES; i++, kpte_i++) { |
| if (*kpte_i) |
| return 0; |
| } |
| } |
| kpages_free(KADDR(PTE_ADDR(*kpte)), 2 * PGSIZE); |
| pte_clear(kpte); |
| return 0; |
| } |
| return pml_for_each(pgdir_get_kpt(pgdir), va, size, pt_free_cb, 0); |
| } |
| |
| /* Older interface for page table walks - will return the PTE corresponding to |
| * VA. If create is 1, it'll create intermediate tables. This can return jumbo |
| * PTEs, but only if they already exist. Otherwise, (with create), it'll walk |
| * to the lowest PML. If the walk fails due to a lack of intermediate tables or |
| * memory, this returns 0 (subject to change based on pte_t). */ |
| pte_t pgdir_walk(pgdir_t pgdir, const void *va, int create) |
| { |
	int flags = PML1_SHIFT;

	if (create == 1)
		flags |= PG_WALK_CREATE;
| return pml_walk(pgdir_get_kpt(pgdir), (uintptr_t)va, flags); |
| } |
| |
| static int pml_perm_walk(kpte_t *pml, const void *va, int pml_shift) |
| { |
| kpte_t *kpte; |
| int perms_here; |
| |
| kpte = &pml[PMLx(va, pml_shift)]; |
| if (!kpte_is_present(kpte)) |
| return 0; |
| perms_here = *kpte & PTE_PERM; |
| if (walk_is_complete(kpte, pml_shift, PML1_SHIFT)) |
| return perms_here; |
| return pml_perm_walk(kpte2pml(*kpte), va, pml_shift - BITS_PER_PML) & |
| perms_here; |
| } |
| |
| /* Returns the effective permissions for PTE_U, PTE_W, and PTE_P on a given |
| * virtual address. Note we need to consider the composition of every PTE in |
| * the page table walk (we bit-and all of them together) */ |
| int get_va_perms(pgdir_t pgdir, const void *va) |
| { |
| return pml_perm_walk(pgdir_get_kpt(pgdir), va, PML4_SHIFT); |
| } |
| |
| #define check_sym_va(sym, addr) \ |
| ({ \ |
| if ((sym) != (addr)) \ |
| panic("Error: " #sym " is %p, should be " #addr, sym); \ |
| }) |
| |
| static void check_syms_va(void) |
| { |
| /* Make sure our symbols are up to date (see arch/ros/mmu64.h) */ |
| check_sym_va(KERN_LOAD_ADDR, 0xffffffffc0000000); |
| check_sym_va(IOAPIC_BASE, 0xffffffffbff00000); |
| check_sym_va(VPT_TOP, 0xffffff0000000000); |
| check_sym_va(VPT, 0xfffffe8000000000); |
| check_sym_va(KERN_VMAP_TOP, 0xfffffe8000000000); |
| check_sym_va(KERNBASE, 0xffff800000000000); |
| check_sym_va(ULIM, 0x0000800000000000); |
| check_sym_va(UVPT, 0x00007f8000000000); |
| check_sym_va(UGINFO, 0x00007f7fffe00000); |
| check_sym_va(UINFO, 0x00007f7fffc00000); |
| check_sym_va(UWLIM, 0x00007f7fffc00000); |
| check_sym_va(UDATA, 0x00007f7fffa00000); |
| check_sym_va(UGDATA, 0x00007f7fff9ff000); |
| check_sym_va(UMAPTOP, 0x00007f7fff9ff000); |
| check_sym_va(USTACKTOP, 0x00007f7fff9ff000); |
| check_sym_va(BRK_END, 0x0000300000000000); |
| } |
| |
| /* Initializes anything related to virtual memory. Paging is already on, but we |
| * have a slimmed down page table. */ |
| void vm_init(void) |
| { |
| int max_jumbo_shift; |
| kpte_t *boot_kpt = KADDR(get_boot_pml4()); |
| |
| boot_cr3 = get_boot_pml4(); |
| boot_pgdir.kpte = boot_kpt; |
| boot_pgdir.eptp = 0; |
| gdt = KADDR(get_gdt64()); |
| |
| /* We need to limit our mappings on machines that don't support 1GB |
| * pages */ |
| max_jumbo_shift = arch_max_jumbo_page_shift(); |
| check_syms_va(); |
| /* KERNBASE mapping: we already have 512 GB complete (one full |
| * PML3_REACH). It's okay if we have extra, just need to make sure we |
| * reach max_paddr. */ |
| if (KERNBASE + PML3_REACH < (uintptr_t)KADDR(max_paddr)) { |
| map_segment(boot_pgdir, KERNBASE + PML3_REACH, |
| max_paddr - PML3_REACH, 0x0 + PML3_REACH, |
| PTE_KERN_RW | PTE_G, max_jumbo_shift); |
| } |
| /* For the LAPIC and IOAPIC, we use PAT (but not *the* PAT flag) to make |
| * these type UC */ |
| map_segment(boot_pgdir, IOAPIC_BASE, APIC_SIZE, IOAPIC_PBASE, |
| PTE_NOCACHE | PTE_KERN_RW | PTE_G, max_jumbo_shift); |
| /* VPT mapping: recursive PTE inserted at the VPT spot */ |
| boot_kpt[PML4(VPT)] = PADDR(boot_kpt) | PTE_KERN_RW; |
| /* same for UVPT, accessible by userspace (RO). */ |
| boot_kpt[PML4(UVPT)] = PADDR(boot_kpt) | PTE_USER_RO; |
	/* set up core 0's MTRRs now (mostly for debugging) */
| setup_default_mtrrs(0); |
| /* Our current gdt_pd (gdt64desc) is pointing to a physical address for |
| * the GDT. We need to switch over to pointing to one with a virtual |
| * address, so we can later unmap the low memory */ |
| gdt_pd = (pseudodesc_t) {sizeof(segdesc_t) * SEG_COUNT - 1, |
| (uintptr_t)gdt}; |
| asm volatile("lgdt %0" : : "m"(gdt_pd)); |
| } |
| |
| void x86_cleanup_bootmem(void) |
| { |
| /* the boot page tables weren't alloc'd the same as other pages, so |
| * we'll need to do some hackery to 'free' them. This doesn't actually |
	 * free anything - it just unmaps, but leaves 2 KPTs (4 pages) sitting
| * around. */ |
| //unmap_segment(boot_pgdir, 0, PML3_PTE_REACH); // want to do this |
| boot_pgdir.kpte[0] = 0; |
| tlb_flush_global(); |
| } |
| |
| /* Walks len bytes from start, executing 'callback' on every PTE, passing it a |
| * specific VA and whatever arg is passed in. Note, this cannot handle jumbo |
| * pages. |
| * |
| * This is just a clumsy wrapper around the more powerful pml_for_each, which |
| * can handle jumbo and intermediate pages. */ |
| int env_user_mem_walk(struct proc *p, void *start, size_t len, |
| mem_walk_callback_t callback, void *arg) |
| { |
| struct tramp_package { |
| struct proc *p; |
| mem_walk_callback_t cb; |
| void *cb_arg; |
| }; |
| int trampoline_cb(kpte_t *kpte, uintptr_t kva, int shift, |
| bool visited_subs, void *data) |
| { |
| struct tramp_package *tp = (struct tramp_package*)data; |
| assert(tp->cb); |
		/* memwalk CBs don't know how to handle intermediates or
		 * jumbos */
| if (shift != PML1_SHIFT) |
| return 0; |
| return tp->cb(tp->p, kpte, (void*)kva, tp->cb_arg); |
| } |
| |
| struct tramp_package local_tp; |
| local_tp.p = p; |
| local_tp.cb = callback; |
| local_tp.cb_arg = arg; |
| return pml_for_each(pgdir_get_kpt(p->env_pgdir), (uintptr_t)start, len, |
| trampoline_cb, &local_tp); |
| } |
| |
| /* Frees (decrefs) all pages of the process's page table, including the page |
| * directory. Does not free the memory that is actually mapped. */ |
| void env_pagetable_free(struct proc *p) |
| { |
| unmap_segment(p->env_pgdir, 0, UVPT - 0); |
	/* the page directory itself is not pointed to by a PTE, so
	 * unmap_segment never freed it */
| kpages_free(pgdir_get_kpt(p->env_pgdir), 2 * PGSIZE); |
| tlbflush(); |
| } |
| |
| /* Remove the inner page tables along va's walk. The internals are more |
| * powerful. We'll eventually want better arch-indep VM functions. */ |
| error_t pagetable_remove(pgdir_t pgdir, void *va) |
| { |
| return unmap_segment(pgdir, (uintptr_t)va, PGSIZE); |
| } |
| |
| void page_check(void) |
| { |
| } |
| |
/* Similar to the kernel's page table walk, but walks the guest page tables for
 * a guest virtual address.  Takes a proc and a user virtual (guest physical)
 * address for the PML, returning the actual PTE (copied out of userspace). */
| static kpte_t __guest_pml_walk(struct proc *p, kpte_t *u_pml, uintptr_t gva, |
| int flags, int pml_shift) |
| { |
| kpte_t pte; |
| |
| if (memcpy_from_user(p, &pte, &u_pml[PMLx(gva, pml_shift)], |
| sizeof(kpte_t))) { |
| warn("Buggy pml %p, tried %p\n", u_pml, |
| &u_pml[PMLx(gva, pml_shift)]); |
| return 0; |
| } |
| if (walk_is_complete(&pte, pml_shift, flags)) |
| return pte; |
| if (!kpte_is_present(&pte)) |
| return 0; |
| return __guest_pml_walk(p, (kpte_t*)PTE_ADDR(pte), gva, flags, |
| pml_shift - BITS_PER_PML); |
| } |
| |
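/* Translates a guest virtual address to a guest physical address by walking
 * the guest page tables rooted at cr3, returning 0 on failure.  For example
 * (sketch), with a 4 KB guest mapping of gva 0x400123 onto gpa 0x7000000, the
 * walk ends with shift == PML1_SHIFT and this returns 0x7000123: the frame
 * address from the guest PTE plus the page offset from gva. */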
| uintptr_t gva2gpa(struct proc *p, uintptr_t cr3, uintptr_t gva) |
| { |
| kpte_t pte; |
| int shift = PML1_SHIFT; |
| |
| pte = __guest_pml_walk(p, (kpte_t*)cr3, gva, shift, PML4_SHIFT); |
| if (!pte) |
| return 0; |
| /* TODO: Jumbos mess with us. We need to know the shift the walk did. |
| * This is a little nasty, but will work til we make Akaros more |
| * jumbo-aware. */ |
| while (pte & PTE_PS) { |
| shift += BITS_PER_PML; |
| pte = __guest_pml_walk(p, (kpte_t*)cr3, gva, shift, PML4_SHIFT); |
| if (!pte) |
| return 0; |
| } |
| return (pte & ~((1 << shift) - 1)) | (gva & ((1 << shift) - 1)); |
| } |
| |
| /* Sets up the page directory, based on boot_copy. |
| * |
| * For x86, to support VMs, all processes will have an EPT and a KPT. Ideally, |
| * we'd use the same actual PT for both, but we can't thanks to the EPT design. |
| * Although they are not the same actual PT, they have the same contents. |
| * |
| * The KPT-EPT invariant is that the KPT and EPT hold the same mappings from |
| * [0,UVPT), so long as some lock is held. Right now, the lock is the pte_lock, |
| * but it could be a finer-grained lock (e.g. on lower level PTs) in the future. |
| * |
| * Part of the reason for the invariant is so that a pgdir walk on the process's |
| * address space will get the 'same' PTE for both the KPT and the EPT. For |
| * instance, if a page is present in the KPT, a pte is present and points to the |
| * same physical page in the EPT. Likewise, both the KPT and EPT agree on jumbo |
| * mappings. |
| * |
| * I went with UVPT for the upper limit of equality btw the KPT and EPT for a |
| * couple reasons: I wanted something static (technically the physaddr width is |
| * runtime dependent), and we'll never actually PF high enough for it to make a |
| * difference. Plus, the UVPT is something that would need to be changed for |
| * the EPT too, if we supported it at all. |
| * |
| * Each page table page is actually two contiguous pages. The lower is the KPT. |
| * The upper is the EPT. Order-1 page allocs are a little harder, but the |
| * tradeoff is simplicity in all of the pm code. Given a KPTE, we can find an |
| * EPTE with no hassle. Note that this two-page business is a tax on *all* |
| * processes, which is less than awesome. |
| * |
| * Another note is that the boot page tables are *not* double-pages. The EPT |
| * won't cover those spaces (e.g. kernbase mapping), so it's not necessary, and |
| * it's a pain in the ass to get it to work (can't align to 2*PGSIZE without |
| * grub complaining, and we might run into issues with freeing memory in the |
| * data segment). */ |
| int arch_pgdir_setup(pgdir_t boot_copy, pgdir_t *new_pd) |
| { |
| kpte_t *kpt; |
| epte_t *ept; |
| |
| kpt = kpages_alloc(2 * PGSIZE, MEM_WAIT); |
| memcpy(kpt, boot_copy.kpte, PGSIZE); |
| ept = kpte_to_epte(kpt); |
| memset(ept, 0, PGSIZE); |
| |
| /* This bit of paranoia slows process creation a little, but makes sure |
| * that there is nothing below ULIM in boot_pgdir. Any PML4 entries |
| * copied from boot_pgdir (e.g. the kernel's memory) will be *shared* |
	 * among all processes, including *everything* under those PML4
	 * entries' reach (e.g. PML4_PTE_REACH = 512 GB), and any activity there
	 * would need to be synchronized.
| * |
| * We could do this once at boot time, but that would miss out on |
| * potential changes to the boot_pgdir at runtime. |
| * |
| * We could also just memset that region to 0. For now, I want to catch |
| * whatever mappings exist, since they are probably bugs. */ |
| for (int i = 0; i < PML4(ULIM - 1); i++) |
| assert(kpt[i] == 0); |
| |
| /* VPT and UVPT map the proc's page table, with different permissions.*/ |
| kpt[PML4(VPT)] = build_kpte(PADDR(kpt), PTE_KERN_RW); |
| kpt[PML4(UVPT)] = build_kpte(PADDR(kpt), PTE_USER_RO); |
| |
| new_pd->kpte = kpt; |
| new_pd->eptp = construct_eptp(PADDR(ept)); |
| return 0; |
| } |
| |
| physaddr_t arch_pgdir_get_cr3(pgdir_t pd) |
| { |
| return PADDR(pd.kpte); |
| } |
| |
| void arch_pgdir_clear(pgdir_t *pd) |
| { |
| pd->kpte = 0; |
| pd->eptp = 0; |
| } |
| |
| /* Returns the page shift of the largest jumbo supported */ |
| int arch_max_jumbo_page_shift(void) |
| { |
| uint32_t edx; |
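
	/* CPUID 0x80000001 EDX bit 26 is the 1 GB page (pdpe1gb) flag. */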
| cpuid(0x80000001, 0x0, 0, 0, 0, &edx); |
| return edx & (1 << 26) ? PML3_SHIFT : PML2_SHIFT; |
| } |
| |
| /* Adds empty intermediate PTs to the top-most PML in pgdir for the given range. |
| * On a 4-PML system, this will add entries to PML4, consisting of a bunch of |
| * empty PML3s, such that [va, va+len) has intermediate tables in pgdir. |
| * |
| * A few related notes: |
| * |
| * The boot_pgdir is where we do the original kernel mappings. All of the PML4 |
| * entries are filled in, pointing to intermediate PML3s. All other pgdirs copy |
| * the kernel mapping, which means they have the same content. That content |
| * never changes at runtime. What changes is the contents of the PML3s and |
| * below, which are pointed to by all pgdirs. |
| * |
| * The proc pgdirs do not have KPT or EPT mappings above ULIM, so if the |
| * intermediate PTs have EPT entries, it's just a waste of memory, but not a |
| * mapping the user could exploit. |
| * |
| * On occasion, there might be code that maps things into boot_pgdir below ULIM, |
| * though right now this is just an out-of-branch "mmap a page at 0" debugging |
| * hack. */ |
| void arch_add_intermediate_pts(pgdir_t pgdir, uintptr_t va, size_t len) |
| { |
| kpte_t *pml4 = pgdir_get_kpt(pgdir); |
| kpte_t *kpte; |
| epte_t *epte; |
| void *new_pml_kva; |
| |
| for (size_t i = 0; i < len; i += PML4_PTE_REACH, va += PML4_PTE_REACH) { |
| kpte = &pml4[PML4(va)]; |
| epte = kpte_to_epte(kpte); |
| if (kpte_is_present(kpte)) |
| continue; |
| new_pml_kva = kpages_zalloc(2 * PGSIZE, MEM_WAIT); |
| /* We insert the same as for __pml_walk. */ |
| *kpte = PADDR(new_pml_kva) | PTE_P | PTE_U | PTE_W; |
| if (va < ULIM) |
| *epte = (PADDR(new_pml_kva) + PGSIZE) | EPTE_R | EPTE_X |
| | EPTE_W; |
| } |
| } |
| |
| /* Debugging */ |
| static int print_pte(kpte_t *kpte, uintptr_t kva, int shift, bool visited_subs, |
| void *data) |
| { |
| if (kpte_is_unmapped(kpte)) |
| return 0; |
| print_lock(); |
| switch (shift) { |
| case (PML1_SHIFT): |
| printk("\t"); |
| /* fall-through */ |
| case (PML2_SHIFT): |
| printk("\t"); |
| /* fall-through */ |
| case (PML3_SHIFT): |
| printk("\t"); |
| } |
| printk("KVA: %p, PTE val %p, shift %d, visit %d%s\n", kva, *kpte, shift, |
| visited_subs, (*kpte & PTE_PS ? " (jumbo)" : "")); |
| print_unlock(); |
| return 0; |
| } |
| |
| void debug_print_pgdir(kpte_t *pgdir) |
| { |
	if (!pgdir)
| pgdir = KADDR(rcr3()); |
| printk("Printing the entire page table set for %p, DFS\n", pgdir); |
| /* Need to be careful we avoid VPT/UVPT, o/w we'll recurse */ |
| pml_for_each(pgdir, 0, UVPT, print_pte, 0); |
| if (arch_max_jumbo_page_shift() < PML3_SHIFT) |
| printk("(skipping kernbase mapping - too many entries)\n"); |
| else |
| pml_for_each(pgdir, KERNBASE, VPT - KERNBASE, print_pte, 0); |
| pml_for_each(pgdir, VPT_TOP, MAX_VADDR - VPT_TOP, print_pte, 0); |
| } |
| |
| /* Debug helper - makes sure the KPT == EPT for [0, UVPT) */ |
| int debug_check_kpt_ept(void) |
| { |
| int db_cb(kpte_t *kpte, uintptr_t kva, int shift, bool visited_subs, |
| void *data) |
| { |
| epte_t *epte = kpte_to_epte(kpte); |
| char *reason; |
| int pa_offset = 0; |
| |
| if (kpte_is_present(kpte) != epte_is_present(epte)) { |
| reason = "present bit"; |
| goto fail; |
| } |
| if (kpte_is_mapped(kpte) != epte_is_mapped(epte)) { |
| reason = "mapped or not"; |
| goto fail; |
| } |
| if (kpte_is_jumbo(kpte) != epte_is_jumbo(epte)) { |
| reason = "jumbo"; |
| goto fail; |
| } |
| /* Intermediate PTEs have the EPTE pointing to PADDR + PGSIZE */ |
| if (pte_is_present(kpte) && pte_is_intermediate(kpte, shift)) |
| pa_offset = PGSIZE; |
| if (kpte_get_paddr(kpte) + pa_offset != epte_get_paddr(epte)) { |
| reason = "paddr"; |
| goto fail; |
| } |
| if ((kpte_get_settings(kpte) & PTE_PERM) != |
| (epte_get_settings(epte) & PTE_PERM)) { |
| reason = "permissions"; |
| goto fail; |
| } |
| return 0; |
| |
| fail: |
| panic("kpte %p (%p) epte %p (%p) kva %p shift %d: %s", |
| kpte, *kpte, epte, *epte, kva, shift, reason); |
| return -1; |
| } |
| return pml_for_each(current->env_pgdir.kpte, 0, UVPT - 0, db_cb, 0); |
| } |