kern/src/pagemap.c - upstream - Git at Google

 /* Copyright (c) 2014 The Regents of the University of California
  * Barret Rhoden <brho@cs.berkeley.edu>
  * See LICENSE for details.
  *
  * Page mapping: maps an object (inode or block dev) in page size chunks.
  * Analogous to Linux's "struct address_space" */

 #include <pmap.h>
 #include <atomic.h>
 #include <radix.h>
 #include <kref.h>
 #include <assert.h>
 #include <stdio.h>
 #include <pagemap.h>
 #include <rcu.h>

 /* Appends @vmr to the tail of @pm's list of VM regions (pm->pm_vmrs), linking
  * through the VMR's vm_pm_link field.  The list is modified with pm->pm_lock
  * held. */
 void pm_add_vmr(struct page_map *pm, struct vm_region *vmr)
 {
 	/* note that the VMR being reverse-mapped by the PM is protected by the
 	 * PM's lock.  we clearly need a write lock here, but removal also needs
 	 * a write lock, so later when removal holds this, it delays munmaps and
 	 * keeps the VMR connected. */
 	spin_lock(&pm->pm_lock);
 	TAILQ_INSERT_TAIL(&pm->pm_vmrs, vmr, vm_pm_link);
 	spin_unlock(&pm->pm_lock);
 }

 /* Unlinks @vmr from @pm's list of VM regions (pm->pm_vmrs).  The list is
  * modified with pm->pm_lock held, the same lock the list walkers in this file
  * take.  No check is made here that @vmr is actually on the list. */
 void pm_remove_vmr(struct page_map *pm, struct vm_region *vmr)
 {
 	spin_lock(&pm->pm_lock);
 	TAILQ_REMOVE(&pm->pm_vmrs, vmr, vm_pm_link);
 	spin_unlock(&pm->pm_lock);
 }

 /* PM slot void *s look like this:
  *
  * |--11--|--1--|----52 or 20 bits--|
  * | ref  | flag|    ppn of page    |
  *              \  <--- meta shift -/
  *
  * The setter funcs return the void* that should update slot_val; it doesn't
  * change slot_val in place (it's a val, not the addr) */

 /* Bit position of the single flag bit in a slot value.  All bits below it hold
  * the page's ppn: 52 bits when CONFIG_64BIT is defined, 20 bits otherwise. */
 #ifdef CONFIG_64BIT
 # define PM_FLAGS_SHIFT 52
 #else
 # define PM_FLAGS_SHIFT 20
 #endif
 /* The refcnt field occupies every bit above the flag bit. */
 #define PM_REFCNT_SHIFT (PM_FLAGS_SHIFT + 1)

 /* Mask selecting the flag bit.  NOTE(review): nothing in the visible part of
  * this file references it. */
 #define PM_UNUSED_FLAG (1UL << PM_FLAGS_SHIFT)

 /* Returns the refcnt field of a slot value, i.e. every bit at or above
  * PM_REFCNT_SHIFT, shifted down to bit 0 and converted to int. */
 static int pm_slot_check_refcnt(void *slot_val)
 {
 	unsigned long raw = (unsigned long)slot_val;

 	return raw >> PM_REFCNT_SHIFT;
 }

 /* Returns @slot_val with its refcnt field bumped by one.  The slot itself is
  * not touched; the caller stores the returned value. */
 static void *pm_slot_inc_refcnt(void *slot_val)
 {
 	unsigned long one_ref = 1UL << PM_REFCNT_SHIFT;
 	void *bumped = (void*)((unsigned long)slot_val + one_ref);

 	/* The count can only read as zero here if the field was all-ones (a
 	 * "negative" count) before the add and wrapped around. */
 	assert(pm_slot_check_refcnt(bumped) > 0);
 	return bumped;
 }

 /* Returns @slot_val with its refcnt field lowered by one.  Asserts that the
  * count was positive beforehand.  The slot itself is not touched. */
 static void *pm_slot_dec_refcnt(void *slot_val)
 {
 	unsigned long one_ref = 1UL << PM_REFCNT_SHIFT;

 	assert(pm_slot_check_refcnt(slot_val) > 0);
 	return (void*)((unsigned long)slot_val - one_ref);
 }

 /* Decodes the page stored in a slot value.  An all-zero slot value means "no
  * page" and yields 0; otherwise the bits below PM_FLAGS_SHIFT are treated as a
  * ppn and converted with ppn2page(). */
 static struct page *pm_slot_get_page(void *slot_val)
 {
 	unsigned long ppn_mask = (1UL << PM_FLAGS_SHIFT) - 1;

 	if (slot_val == 0)
 		return 0;
 	return ppn2page((unsigned long)slot_val & ppn_mask);
 }

 /* Returns @slot_val with its ppn field replaced by @pg's ppn.  The flag and
  * refcnt bits (everything at or above PM_FLAGS_SHIFT) are carried over
  * unchanged.  The slot itself is not touched; the caller stores the result.
  *
  * @pg may be NULL, which callers in this file use to mean "yank the page out
  * of the slot".  That is encoded as ppn 0, matching pm_slot_get_page()'s
  * treatment of an all-zero slot as empty.  NULL is never handed to
  * page2ppn(): NOTE(review) its definition is not visible here, and if it is a
  * plain pointer difference against 'pages' it would produce a bogus ppn whose
  * high bits land in the flag/refcnt fields. */
 static void *pm_slot_set_page(void *slot_val, struct page *pg)
 {
 	unsigned long ppn_mask = (1UL << PM_FLAGS_SHIFT) - 1;
 	unsigned long ppn;

 	assert(pg != pages);	/* we should never alloc page 0, for sanity */
 	ppn = pg ? page2ppn(pg) : 0;
 	return (void*)(ppn | ((unsigned long)slot_val & ~ppn_mask));
 }

 /* Sets up an empty page map: no pages, no attached VMRs, fresh locks.
  *
  * @op is stored as the PM's operations table.  @host should be an fs_file; it
  * is stored in pm_file as an uncounted reference. */
 void pm_init(struct page_map *pm, struct page_map_operations *op, void *host)
 {
 	/* plain fields */
 	pm->pm_num_pages = 0;
 	pm->pm_file = host;
 	pm->pm_op = op;
 	/* containers: the page tree and the list of VMRs mapping this PM */
 	radix_tree_init(&pm->pm_tree);
 	TAILQ_INIT(&pm->pm_vmrs);
 	/* locks */
 	spinlock_init(&pm->pm_lock);
 	qlock_init(&pm->pm_qlock);
 }

 /* Looks up the index'th page in the page map, returning a refcnt'd reference
  * that needs to be dropped with pm_put_page, or 0 if it was not in the map.
  *
  * "Not in the map" covers both cases visible below: the radix tree has no slot
  * for @index, or the slot exists but currently decodes to no page.
  *
  * The reference taken is a count in the slot value itself (see
  * pm_slot_inc_refcnt), installed with a compare-and-swap so that a concurrent
  * change to the slot makes us re-read and retry. */
 static struct page *pm_find_page(struct page_map *pm, unsigned long index)
 {
 	void **tree_slot;
 	void *old_slot_val, *slot_val;
 	struct page *page = 0;

 	/* We use rcu to protect our radix walk, specifically the tree_slot
 	 * pointer.  We get our own 'pm refcnt' on the slot itself, which
 	 * doesn't need RCU. */
 	rcu_read_lock();
 	/* We're syncing with removal.  The deal is that if we grab the page
 	 * (and we'd only do that if the page != 0), we up the slot ref and
 	 * clear removal.  A remover will only remove it if removal is still
 	 * set.  If we grab and release while removal is in progress, even
 	 * though we no longer hold the ref, we have unset removal.  Also, to
 	 * prevent removal where we get a page well before the removal process,
 	 * the removal won't even bother when the slot refcnt is upped.
 	 *
 	 * NOTE(review): the code below only bumps the refcnt; it does not
 	 * touch any "removal" flag bit.  The removers in this file rely on the
 	 * refcnt check plus their own CAS instead. */
 	tree_slot = radix_lookup_slot(&pm->pm_tree, index);
 	if (!tree_slot)
 		goto out;
 	do {
 		/* Snapshot the slot once per attempt; the CAS below only
 		 * succeeds if the slot still holds this exact value. */
 		old_slot_val = ACCESS_ONCE(*tree_slot);
 		slot_val = old_slot_val;
 		page = pm_slot_get_page(slot_val);
 		/* Slot exists but the page was yanked: report "not found" */
 		if (!page)
 			goto out;
 		slot_val = pm_slot_inc_refcnt(slot_val); /* not a page kref */
 	} while (!atomic_cas_ptr(tree_slot, old_slot_val, slot_val));
 	/* The page's back-pointer must name the slot we found it in */
 	assert(page->pg_tree_slot == tree_slot);
 out:
 	rcu_read_unlock();
 	return page;
 }

 /* Tries to place @page at @index in the page map.  Returns 0 on success, or
  * whatever error radix_insert() reported: per the original contract, EEXIST if
  * something was already there or ENOMEM if we ran out of memory.
  *
  * On success the caller's page ref is handed over to the slot and the caller
  * holds a PM slot ref instead (the slot is inserted with a refcnt of one).
  * That slot ref is enough to keep the page alive.  On failure the caller still
  * owns its page ref.
  *
  * Nothing is assumed about the contents of the page; that's the caller's
  * business. */
 static int pm_insert_page(struct page_map *pm, unsigned long index,
                           struct page *page)
 {
 	void **slot;
 	void *encoded;
 	int err;

 	page->pg_mapping = pm;	/* debugging */
 	page->pg_index = index;
 	/* Poison the back-pointer until the real slot is known.  Nobody
 	 * should look at it before we drop the qlock: the only other reader is
 	 * removal, which needs the PM write lock too. */
 	page->pg_tree_slot = (void*)0xdeadbeef;
 	/* Start from an empty slot value, take one slot ref, then fold in the
 	 * page (this is where the caller's page ref moves to the slot). */
 	encoded = pm_slot_inc_refcnt(0);
 	encoded = pm_slot_set_page(encoded, page);

 	qlock(&pm->pm_qlock);
 	err = radix_insert(&pm->pm_tree, index, encoded, &slot);
 	if (!err) {
 		page->pg_tree_slot = slot;
 		pm->pm_num_pages++;
 	}
 	qunlock(&pm->pm_qlock);
 	return err;
 }

 /* Drops one PM slot ref on @page (i.e. one user of the PM page is done).  The
  * page ref held by the PM itself is left alone.
  *
  * The slot is located through the page's pg_tree_slot back-pointer. */
 void pm_put_page(struct page *page)
 {
 	void **slot = page->pg_tree_slot;

 	/* Sanity: the page must point at a slot that holds this very page and
 	 * has at least one ref outstanding. */
 	assert(slot);
 	assert(pm_slot_get_page(*slot) == page);
 	assert(pm_slot_check_refcnt(*slot) > 0);
 	/* A plain atomic subtract of one refcnt unit is enough; no CAS loop
 	 * is needed for the decrement. */
 	atomic_add((atomic_t*)slot, -(1UL << PM_REFCNT_SHIFT));
 }

 /* Makes sure the index'th page of the mapped object is loaded in the page cache
  * and returns its location via **pp.
  *
  * You'll get a pm-slot refcnt back, which you need to put when you're done.
  *
  * Returns 0 on success, -ENOMEM if a fresh page could not be allocated, or the
  * error from pm_insert_page() for anything other than -EEXIST.  *pp is written
  * only on success.  A failing readpage() op trips an assert rather than being
  * returned. */
 int pm_load_page(struct page_map *pm, unsigned long index, struct page **pp)
 {
 	struct page *page;
 	int error;

 	page = pm_find_page(pm, index);
 	while (!page) {
 		if (kpage_alloc(&page))
 			return -ENOMEM;
 		/* important that UP_TO_DATE is not set.  once we put it in the
 		 * PM, others can find it, and we still need to fill it. */
 		atomic_set(&page->pg_flags, PG_LOCKED | PG_PAGEMAP);
 		/* The sem needs to be initted before anyone can try to lock it,
 		 * meaning before it is in the page cache.  We also want it
 		 * locked preemptively, by setting signals = 0. */
 		sem_init(&page->pg_sem, 0);
 		error = pm_insert_page(pm, index, page);
 		switch (error) {
 		case 0:
 			/* Our page is in the PM, already locked, and we hold
 			 * the slot ref: go fill it. */
 			goto load_locked_page;
 		case -EEXIST:
 			/* the page was mapped already (benign race), just get
 			 * rid of our page and try again (the only case that
 			 * uses the while) */
 			atomic_set(&page->pg_flags, 0);
 			page_decref(page);
 			page = pm_find_page(pm, index);
 			break;
 		default:
 			atomic_set(&page->pg_flags, 0);
 			page_decref(page);
 			return error;
 		}
 	}
 	assert(page);
 	assert(pm_slot_check_refcnt(*page->pg_tree_slot));
 	assert(pm_slot_get_page(*page->pg_tree_slot) == page);
 	/* Fast path: someone already filled the page */
 	if (atomic_read(&page->pg_flags) & PG_UPTODATE) {
 		*pp = page;
 		/* index is an unsigned long, so it needs %lu, not %d */
 		printd("pm %p FOUND page %p, addr %p, idx %lu\n", pm, page,
 		       page2kva(page), index);
 		return 0;
 	}
 	lock_page(page);
 	/* double-check.  if we blocked on lock_page, it was probably for
 	 * someone else loading.  plus, we can't load a page more than once (it
 	 * could clobber newer writes) */
 	if (atomic_read(&page->pg_flags) & PG_UPTODATE) {
 		unlock_page(page);
 		*pp = page;
 		return 0;
 	}
 	/* fall through */
 load_locked_page:
 	/* We hold the page lock here on every path that reaches this label */
 	error = pm->pm_op->readpage(pm, page);
 	assert(!error);
 	assert(atomic_read(&page->pg_flags) & PG_UPTODATE);
 	unlock_page(page);
 	*pp = page;
 	printd("pm %p LOADS page %p, addr %p, idx %lu\n", pm, page,
 	       page2kva(page), index);
 	return 0;
 }

 /* Non-blocking variant of pm_load_page(): only succeeds if the page is already
  * in the page map and marked PG_UPTODATE.
  *
  * On success returns 0 and stores the page in *pp; the caller then holds a
  * pm-slot ref that must be dropped with pm_put_page().  Otherwise returns
  * -EAGAIN, holds no ref, and leaves *pp untouched. */
 int pm_load_page_nowait(struct page_map *pm, unsigned long index,
                         struct page **pp)
 {
 	struct page *found = pm_find_page(pm, index);

 	if (found && (atomic_read(&found->pg_flags) & PG_UPTODATE)) {
 		*pp = found;
 		return 0;
 	}
 	/* Present but not filled yet: give back the ref we just took.
 	 * TODO: could have a read_nowait pm_op */
 	if (found)
 		pm_put_page(found);
 	return -EAGAIN;
 }

 /* Tells whether file page index @pg_idx falls inside the file window that @vmr
  * maps.  The window starts at page vm_foff >> PGSHIFT and is
  * (vm_end - vm_base) >> PGSHIFT pages long.  A VMR whose vm_ready is not set
  * never matches. */
 static bool vmr_has_page_idx(struct vm_region *vmr, unsigned long pg_idx)
 {
 	unsigned long first_pg, nr_pgs;

 	if (!vmr->vm_ready)
 		return false;
 	first_pg = vmr->vm_foff >> PGSHIFT;
 	nr_pgs = (vmr->vm_end - vmr->vm_base) >> PGSHIFT;
 	if (pg_idx < first_pg)
 		return false;
 	return pg_idx < first_pg + nr_pgs;
 }

 /* Runs CB on every PTE in the VMR that corresponds to the file's pg_idx, for up
  * to max_nr_pgs.
  *
  * The requested file range [pg_idx, pg_idx + max_nr_pgs) is clipped to the
  * part of the file that @vmr maps.  If nothing is left after clipping, the
  * callback is never invoked.  @vmr is handed to the walk as the callback's
  * argument. */
 static void vmr_for_each(struct vm_region *vmr, unsigned long pg_idx,
                          unsigned long max_nr_pgs, mem_walk_callback_t callback)
 {
 	uintptr_t start_va;
 	off64_t file_off = pg_idx << PGSHIFT;
 	size_t max_len = (~(size_t)0 >> PGSHIFT) << PGSHIFT;
 	size_t len, skip;

 	/* Saturate rather than letting the shift silently drop high bits.
 	 * For "everything" callers (max_nr_pgs == ULONG_MAX) this yields the
 	 * same value the plain shift did. */
 	if (max_nr_pgs > (~(size_t)0 >> PGSHIFT))
 		len = max_len;
 	else
 		len = max_nr_pgs << PGSHIFT;

 	if (file_off < vmr->vm_foff) {
 		/* The range starts before the VMR's window: trim the front.
 		 * If the whole range lies before the window there is nothing
 		 * to walk; subtracting anyway would wrap len around and make
 		 * us walk the entire VMR. */
 		skip = vmr->vm_foff - file_off;
 		if (len <= skip)
 			return;
 		len -= skip;
 		file_off = vmr->vm_foff;
 	}

 	start_va = vmr->vm_base + (file_off - vmr->vm_foff);
 	if (start_va < vmr->vm_base) {
 		warn("wraparound! %p %p %p %p", start_va, vmr->vm_base,
 		     vmr->vm_foff, pg_idx);
 		return;
 	}
 	/* The range starts past the end of the VMR */
 	if (start_va >= vmr->vm_end)
 		return;

 	/* Trim the tail to the end of the VMR */
 	len = MIN(len, vmr->vm_end - start_va);
 	if (!len)
 		return;
 	env_user_mem_walk(vmr->vm_proc, (void*)start_va, len, callback, vmr);
 }

 /* Reports whether any VMR attached to @pm covers file page @pg_idx, as judged
  * by vmr_has_page_idx().  The VMR list is walked with pm->pm_lock held; the
  * answer can be stale as soon as the lock is dropped. */
 static bool pm_has_vmr_with_page(struct page_map *pm, unsigned long pg_idx)
 {
 	struct vm_region *vmr_i;
 	bool found = false;

 	spin_lock(&pm->pm_lock);
 	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
 		if (vmr_has_page_idx(vmr_i, pg_idx)) {
 			found = true;
 			break;
 		}
 	}
 	spin_unlock(&pm->pm_lock);
 	return found;
 }

 static bool __remove_or_zero_cb(void **slot, unsigned long tree_idx, void *arg)
 {
 	struct page_map *pm = arg;
 	struct page *page;
 	void *old_slot_val, *slot_val;

 	old_slot_val = ACCESS_ONCE(*slot);
 	slot_val = old_slot_val;
 	page = pm_slot_get_page(slot_val);
 	/* We shouldn't have an item in the tree without a page, unless there's
 	 * another removal.  Currently, this CB is called with a qlock. */
 	assert(page);
 	/* Don't even bother with VMRs that might have faulted in the page */
 	if (pm_has_vmr_with_page(pm, tree_idx)) {
 		memset(page2kva(page), 0, PGSIZE);
 		return false;
 	}
 	/* syncing with lookups, writebacks, etc - anyone who gets a ref on a PM
 	 * leaf/page (e.g. pm_load_page / pm_find_page. */
 	slot_val = pm_slot_set_page(slot_val, NULL);
 	if (pm_slot_check_refcnt(slot_val) ||
 	        !atomic_cas_ptr(slot, old_slot_val, slot_val)) {
 		memset(page2kva(page), 0, PGSIZE);
 		return false;
 	}
 	/* We yanked the page out.  The radix tree still has an item until we
 	 * return true, but this is fine.  Future lock-free lookups will now
 	 * fail (since the page is 0), and insertions will block on the write
 	 * lock. */
 	atomic_set(&page->pg_flags, 0);	/* cause/catch bugs */
 	page_decref(page);
 	return true;
 }

 /* For each page the PM holds in [start_idx, start_idx + nr_pgs), either
  * removes it from the PM or, if it cannot be removed, zeroes its contents (see
  * __remove_or_zero_cb).  The walk runs with the PM's qlock held.
  *
  * Asserts that the range is non-empty and does not wrap. */
 void pm_remove_or_zero_pages(struct page_map *pm, unsigned long start_idx,
                              unsigned long nr_pgs)
 {
 	unsigned long end_idx;

 	end_idx = start_idx + nr_pgs;
 	assert(end_idx > start_idx);

 	qlock(&pm->pm_qlock);
 	radix_for_each_slot_in_range(&pm->pm_tree, start_idx, end_idx,
 	                             __remove_or_zero_cb, pm);
 	qunlock(&pm->pm_qlock);
 }

 /* Memory-walk callback: moves a PTE's dirty bit onto its struct page.  @arg is
  * the VMR being walked; @p and @va are unused.
  *
  * PTEs that are not present or not dirty are skipped.  Otherwise PG_DIRTY is
  * set on the backing page (if not already set), the PTE's dirty bit is
  * cleared, and the VMR is flagged as needing a TLB shootdown.
  *
  * Always returns 0. */
 static int __pm_mark_and_clear_dirty(struct proc *p, pte_t pte, void *va,
                                      void *arg)
 {
 	struct vm_region *vmr = arg;
 	struct page *page;

 	if (!pte_is_present(pte) || !pte_is_dirty(pte))
 		return 0;
 	/* Only translate the PTE's address once we know the PTE is present;
 	 * the address bits of a non-present PTE need not name a real page. */
 	page = pa2page(pte_get_paddr(pte));
 	if (!(atomic_read(&page->pg_flags) & PG_DIRTY))
 		atomic_or(&page->pg_flags, PG_DIRTY);
 	pte_clear_dirty(pte);
 	vmr->vm_shootdown_needed = true;
 	return 0;
 }

 /* Dirty PTE bits will get marked to the struct page itself, and the PTEs will
  * have the dirty bit cleared.  VMRs that need a shootdown are marked.  Note
  * this only marks PTEs and VMRs if they were the one to do some of the
  * dirtying.
  *
  * Only VMRs that are both writable (PROT_WRITE) and shared (MAP_SHARED) are
  * walked.  The VMR list is walked under pm->pm_lock, and each VMR's PTEs are
  * walked under its process's pte_lock. */
 static void mark_and_clear_dirty_ptes(struct page_map *pm)
 {
 	struct vm_region *vmr_i;

 	spin_lock(&pm->pm_lock);
 	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
 		/* Read-only mappings can't have dirtied anything */
 		if (!(vmr_i->vm_prot & PROT_WRITE))
 			continue;
 		/* Only care about shared mappings, not private.  Private
 		 * mappings have a reference to the file, but the pages are not
 		 * in the page cache - they hang directly off the PTEs (for
 		 * now). */
 		if (!(vmr_i->vm_flags & MAP_SHARED))
 			continue;
 		spin_lock(&vmr_i->vm_proc->pte_lock);
 		/* (0, ULONG_MAX): cover every file page the VMR maps */
 		vmr_for_each(vmr_i, 0, ULONG_MAX, __pm_mark_and_clear_dirty);
 		spin_unlock(&vmr_i->vm_proc->pte_lock);
 	}
 	spin_unlock(&pm->pm_lock);
 }

 /* Issues a TLB shootdown to the owning process of every VMR on @pm that has
  * vm_shootdown_needed set, clearing the flag first.  The list is walked with
  * pm->pm_lock held. */
 static void shootdown_vmrs(struct page_map *pm)
 {
 	struct vm_region *vmr_i;

 	/* The VMR flag shootdown_needed is owned by the PM.  Each VMR is hooked
 	 * to at most one file, so there's no issue there.  We might have a proc
 	 * that has multiple non-private VMRs in the same file, but it shouldn't
 	 * be a big enough issue to worry about. */
 	spin_lock(&pm->pm_lock);
 	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
 		if (!vmr_i->vm_shootdown_needed)
 			continue;
 		vmr_i->vm_shootdown_needed = false;
 		proc_tlbshootdown(vmr_i->vm_proc, 0, 0);
 	}
 	spin_unlock(&pm->pm_lock);
 }

 /* Send any queued WBs that haven't been sent yet.
  *
  * Currently an empty stub: queue_writeback() in this file issues each
  * writepage() immediately, so nothing is ever left queued. */
 static void flush_queued_writebacks(struct page_map *pm)
 {
 	/* TODO (WB) */
 }

 /* Batches up pages to be written back, preferably as one big op.  If we have a
  * bunch outstanding, we'll send them.
  *
  * For now there is no batching: the page is passed straight to the PM's
  * writepage() op.  Whatever writepage() returns is not examined here. */
 static void queue_writeback(struct page_map *pm, struct page *page)
 {
 	/* TODO (WB): add a bulk op (instead of only writepage()), collect
 	 * extents, and send them to the device.  Probably do something similar
 	 * for reads. */
 	pm->pm_op->writepage(pm, page);
 }

 /* Radix-tree callback for pm_writeback_pages().  @arg is the page_map.
  *
  * If the slot's page has PG_DIRTY set, the flag is cleared and the page is
  * queued for writeback.  Always returns false, so the tree walker never
  * removes the item. */
 static bool __writeback_cb(void **slot, unsigned long tree_idx, void *arg)
 {
 	struct page_map *pm = arg;
 	struct page *page = pm_slot_get_page(*slot);

 	/* We're qlocked, so all items should have pages. */
 	assert(page);
 	if (!(atomic_read(&page->pg_flags) & PG_DIRTY))
 		return false;
 	/* Clear the flag before writing */
 	atomic_and(&page->pg_flags, ~PG_DIRTY);
 	queue_writeback(pm, page);
 	return false;
 }

 /* Every dirty page gets written back, regardless of whether it's in a VMR or
  * not.  All the dirty bits get cleared too, before writing back.
  *
  * Everything below runs with the PM's qlock held, in this order:
  *  1. dirty bits are moved from PTEs to their struct pages,
  *  2. VMRs whose PTEs were changed get a TLB shootdown,
  *  3. every page in the tree with PG_DIRTY set is queued for writeback,
  *  4. any still-queued writebacks are flushed (currently a no-op). */
 void pm_writeback_pages(struct page_map *pm)
 {
 	qlock(&pm->pm_qlock);
 	mark_and_clear_dirty_ptes(pm);
 	shootdown_vmrs(pm);
 	radix_for_each_slot(&pm->pm_tree, __writeback_cb, pm);
 	flush_queued_writebacks(pm);
 	qunlock(&pm->pm_qlock);
 }

 /* Radix-tree callback for pm_free_unused_pages().  @arg is the page_map.
  *
  * Frees the slot's page if nobody holds a slot ref on it and no VMR attached
  * to the PM covers its index.  A dirty page is written back before being
  * freed.  Returns true if the page was released (the tree walker then drops
  * the item), false if the page was left in place.
  *
  * Called with the PM's qlock held (see the caller). */
 static bool __flush_unused_cb(void **slot, unsigned long tree_idx, void *arg)
 {
 	struct page_map *pm = arg;
 	struct page *page;
 	void *old_slot_val, *slot_val;

 	/* One snapshot of the slot drives every decision below */
 	old_slot_val = ACCESS_ONCE(*slot);
 	page = pm_slot_get_page(old_slot_val);
 	/* We're qlocked, so all items should have pages. */
 	assert(page);
 	/* Under any contention, we just skip it */
 	if (pm_slot_check_refcnt(old_slot_val))
 		return false;
 	slot_val = pm_slot_set_page(old_slot_val, NULL);
 	if (!atomic_cas_ptr(slot, old_slot_val, slot_val))
 		return false;
 	/* At this point, we yanked the page.  any concurrent wait-free users
 	 * that want to get this page will fail (pm_find_page /
 	 * pm_load_page_nowait).  They will block on the qlock that we hold when
 	 * they try to insert a page (as part of pm_load_page, for both reading
 	 * or writing).  We can still bail out and everything will be fine, so
 	 * long as we put the page back.
 	 *
 	 * We can't tell from looking at the page if it was actually faulted
 	 * into the VMR; we just know it was possible.  (currently).  Also, we
 	 * need to do this check after removing the page from the PM slot, since
 	 * the mm faulting code (hpf) will attempt a non-blocking PM lookup. */
 	if (pm_has_vmr_with_page(pm, tree_idx)) {
 		/* No one should be writing to it.  We hold the qlock, and any
 		 * readers should not have increffed while the page was NULL.
 		 * So the slot still holds what we installed, and the snapshot
 		 * (same page, refcnt 0) is exactly what belongs back there. */
 		WRITE_ONCE(*slot, old_slot_val);
 		return false;
 	}
 	/* Need to check PG_DIRTY *after* checking VMRs.  o/w we could check,
 	 * PAUSE, see no VMRs.  But in the meantime, we had a VMR that munmapped
 	 * and wrote-back the dirty flag. */
 	if (atomic_read(&page->pg_flags) & PG_DIRTY) {
 		/* If we want to batch these, we'll also have to batch the
 		 * freeing, which isn't a big deal.  Just do it before freeing
 		 * and before unlocking the PM; we don't want someone to load
 		 * the page from the backing store and get an old value. */
 		pm->pm_op->writepage(pm, page);
 	}
 	/* All clear - the page is unused and (now) clean. */
 	atomic_set(&page->pg_flags, 0);	/* catch bugs */
 	page_decref(page);
 	/* Keep the count in step with pm_insert_page()'s increment.  The
 	 * qlock we are called under protects it there too. */
 	pm->pm_num_pages--;
 	return true;
 }

 /* Unused pages (not currently involved in a read, write, or mmap) are pruned.
  * Dirty pages are written back first.
  *
  * We ignore anything mapped in a VMR.  Not bothering with unmapping or
  * shootdowns or anything.  At least for now.
  *
  * The whole tree is walked with the PM's qlock held; the per-page decision is
  * made by __flush_unused_cb(). */
 void pm_free_unused_pages(struct page_map *pm)
 {
 	qlock(&pm->pm_qlock);
 	radix_for_each_slot(&pm->pm_tree, __flush_unused_cb, pm);
 	qunlock(&pm->pm_qlock);
 }

 /* Radix-tree callback for pm_destroy().  Releases the PM's ref on the slot's
  * page and returns true so the tree walker drops the item.  @tree_idx and @arg
  * are unused.
  *
  * Asserts that the slot holds a page and that nobody holds a slot ref on it. */
 static bool __destroy_cb(void **slot, unsigned long tree_idx, void *arg)
 {
 	struct page *page = pm_slot_get_page(*slot);

 	/* Same sanity check the other tree callbacks make; fail loudly here
 	 * rather than dereferencing a null page below. */
 	assert(page);
 	/* Should be no users or need to sync */
 	assert(pm_slot_check_refcnt(*slot) == 0);
 	atomic_set(&page->pg_flags, 0);	/* catch bugs */
 	page_decref(page);
 	return true;
 }

 /* Tears down a page map: every page still in the tree is released via
  * __destroy_cb(), then the radix tree itself is destroyed.
  *
  * No lock is taken here, and __destroy_cb() asserts that no slot refs are
  * outstanding, so the caller must ensure nobody else is using the PM. */
 void pm_destroy(struct page_map *pm)
 {
 	radix_for_each_slot(&pm->pm_tree, __destroy_cb, pm);
 	radix_tree_destroy(&pm->pm_tree);
 }

 /* Debug helper: prints the PM's address, its pm_num_pages counter, and one
  * line per attached VMR (owning pid, VA range, prot, flags, file-or-chunk
  * pointer, file offset).  The VMR list is walked with pm->pm_lock held. */
 void print_page_map_info(struct page_map *pm)
 {
 	struct vm_region *vmr_i;
 	printk("Page Map %p\n", pm);
 	printk("\tNum pages: %lu\n", pm->pm_num_pages);
 	spin_lock(&pm->pm_lock);
 	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
 		/* NOTE(review): vm_base, vm_end and vm_foff are printed with
 		 * %p; confirm those fields' types match the format. */
 		printk("\tVMR proc %d: (%p - %p): 0x%08x, 0x%08x, %p, %p\n",
 		       vmr_i->vm_proc->pid, vmr_i->vm_base, vmr_i->vm_end,
 		       vmr_i->vm_prot, vmr_i->vm_flags,
 		       foc_pointer(vmr_i->__vm_foc), vmr_i->vm_foff);
 	}
 	spin_unlock(&pm->pm_lock);
 }

 /* Debug check for a page that is expected to be in use through a page map.
  * Pages for which page_is_pagemap() is false are ignored.  For the rest,
  * asserts that the page's slot back-pointer is set, that the slot holds this
  * very page, and that at least one slot ref is outstanding.
  *
  * @str is currently unused. */
 void pm_page_asserter(struct page *page, char *str)
 {
 	void **slot;

 	if (!page_is_pagemap(page))
 		return;
 	slot = page->pg_tree_slot;
 	assert(slot);
 	assert(pm_slot_get_page(*slot) == page);
 	assert(pm_slot_check_refcnt(*slot) > 0);
 }
	/* Copyright (c) 2014 The Regents of the University of California
	* Barret Rhoden <brho@cs.berkeley.edu>
	* See LICENSE for details.
	*
	* Page mapping: maps an object (inode or block dev) in page size chunks.
	* Analagous to Linux's "struct address space" */

	#include <pmap.h>
	#include <atomic.h>
	#include <radix.h>
	#include <kref.h>
	#include <assert.h>
	#include <stdio.h>
	#include <pagemap.h>
	#include <rcu.h>

	void pm_add_vmr(struct page_map pm, struct vm_region vmr)
	{
	/* note that the VMR being reverse-mapped by the PM is protected by the
	* PM's lock. we clearly need a write lock here, but removal also needs
	* a write lock, so later when removal holds this, it delays munmaps and
	* keeps the VMR connected. */
	spin_lock(&pm->pm_lock);
	TAILQ_INSERT_TAIL(&pm->pm_vmrs, vmr, vm_pm_link);
	spin_unlock(&pm->pm_lock);
	}

	void pm_remove_vmr(struct page_map pm, struct vm_region vmr)
	{
	spin_lock(&pm->pm_lock);
	TAILQ_REMOVE(&pm->pm_vmrs, vmr, vm_pm_link);
	spin_unlock(&pm->pm_lock);
	}

	/* PM slot void *s look like this:
	*
	* \|--11--\|--1--\|----52 or 20 bits--\|
	* \| ref \| flag\| ppn of page \|
	* \ <--- meta shift -/
	*
	* The setter funcs return the void* that should update slot_val; it doesn't
	* change slot_val in place (it's a val, not the addr) */

	#ifdef CONFIG_64BIT
	# define PM_FLAGS_SHIFT 52
	#else
	# define PM_FLAGS_SHIFT 20
	#endif
	#define PM_REFCNT_SHIFT (PM_FLAGS_SHIFT + 1)

	#define PM_UNUSED_FLAG (1UL << PM_FLAGS_SHIFT)

	static int pm_slot_check_refcnt(void *slot_val)
	{
	return (unsigned long)slot_val >> PM_REFCNT_SHIFT;
	}

	static void pm_slot_inc_refcnt(void slot_val)
	{
	void *ret;

	ret = (void*)((unsigned long)slot_val + (1UL << PM_REFCNT_SHIFT));
	/* Catches previously negative refcnts */
	assert(pm_slot_check_refcnt(ret) > 0);
	return ret;
	}

	static void pm_slot_dec_refcnt(void slot_val)
	{
	assert(pm_slot_check_refcnt(slot_val) > 0);
	return (void*)((unsigned long)slot_val - (1UL << PM_REFCNT_SHIFT));
	}

	static struct page pm_slot_get_page(void slot_val)
	{
	if (!slot_val)
	return 0;
	return ppn2page((unsigned long)slot_val & ((1UL << PM_FLAGS_SHIFT) - 1));
	}

	static void pm_slot_set_page(void slot_val, struct page *pg)
	{
	assert(pg != pages); /* we should never alloc page 0, for sanity */
	return (void*)(page2ppn(pg) \| ((unsigned long)slot_val &
	~((1UL << PM_FLAGS_SHIFT) - 1)));
	}

	/* Initializes a PM. Host should be an fs_file. The reference this stores is
	* uncounted. */
	void pm_init(struct page_map pm, struct page_map_operations op, void *host)
	{
	pm->pm_file = host;
	radix_tree_init(&pm->pm_tree);
	pm->pm_num_pages = 0;
	pm->pm_op = op;
	qlock_init(&pm->pm_qlock);
	spinlock_init(&pm->pm_lock);
	TAILQ_INIT(&pm->pm_vmrs);
	}

	/* Looks up the index'th page in the page map, returning a refcnt'd reference
	* that need to be dropped with pm_put_page, or 0 if it was not in the map. */
	static struct page pm_find_page(struct page_map pm, unsigned long index)
	{
	void **tree_slot;
	void old_slot_val, slot_val;
	struct page *page = 0;

	/* We use rcu to protect our radix walk, specifically the tree_slot
	* pointer. We get our own 'pm refcnt' on the slot itself, which
	* doesn't need RCU. */
	rcu_read_lock();
	/* We're syncing with removal. The deal is that if we grab the page
	* (and we'd only do that if the page != 0), we up the slot ref and
	* clear removal. A remover will only remove it if removal is still
	* set. If we grab and release while removal is in progress, even
	* though we no longer hold the ref, we have unset removal. Also, to
	* prevent removal where we get a page well before the removal process,
	* the removal won't even bother when the slot refcnt is upped. */
	tree_slot = radix_lookup_slot(&pm->pm_tree, index);
	if (!tree_slot)
	goto out;
	do {
	old_slot_val = ACCESS_ONCE(*tree_slot);
	slot_val = old_slot_val;
	page = pm_slot_get_page(slot_val);
	if (!page)
	goto out;
	slot_val = pm_slot_inc_refcnt(slot_val); /* not a page kref */
	} while (!atomic_cas_ptr(tree_slot, old_slot_val, slot_val));
	assert(page->pg_tree_slot == tree_slot);
	out:
	rcu_read_unlock();
	return page;
	}

	/* Attempts to insert the page into the page_map, returns 0 for success, or an
	* error code if there was one already (EEXIST) or we ran out of memory
	* (ENOMEM).
	*
	* On success, callers lose their page ref, but get a PM slot ref. This slot
	* ref is sufficient to keep the page alive (slot ref protects the page ref)..
	*
	* Makes no assumptions about the quality of the data loaded, that's up to the
	* caller. */
	static int pm_insert_page(struct page_map *pm, unsigned long index,
	struct page *page)
	{
	int ret;
	void **tree_slot;
	void *slot_val = 0;

	page->pg_mapping = pm; /* debugging */
	page->pg_index = index;
	/* no one should be looking at the tree slot til we stop write locking.
	* the only other one who looks is removal, who requires a PM write
	* lock. */
	page->pg_tree_slot = (void)0xdeadbeef; / poison */
	slot_val = pm_slot_inc_refcnt(slot_val);
	/* passing the page ref from the caller to the slot */
	slot_val = pm_slot_set_page(slot_val, page);
	qlock(&pm->pm_qlock);
	ret = radix_insert(&pm->pm_tree, index, slot_val, &tree_slot);
	if (ret) {
	qunlock(&pm->pm_qlock);
	return ret;
	}
	page->pg_tree_slot = tree_slot;
	pm->pm_num_pages++;
	qunlock(&pm->pm_qlock);
	return 0;
	}

	/* Decrefs the PM slot ref (usage of a PM page). The PM's page ref remains. */
	void pm_put_page(struct page *page)
	{
	void **tree_slot = page->pg_tree_slot;

	assert(tree_slot);
	assert(pm_slot_get_page(*tree_slot) == page);
	assert(pm_slot_check_refcnt(*tree_slot) > 0);
	/* decref, don't care about CASing */
	atomic_add((atomic_t*)tree_slot, -(1UL << PM_REFCNT_SHIFT));
	}

	/* Makes sure the index'th page of the mapped object is loaded in the page cache
	* and returns its location via **pp.
	*
	* You'll get a pm-slot refcnt back, which you need to put when you're done. */
	int pm_load_page(struct page_map pm, unsigned long index, struct page *pp)
	{
	struct page *page;
	int error;

	page = pm_find_page(pm, index);
	while (!page) {
	if (kpage_alloc(&page))
	return -ENOMEM;
	/* important that UP_TO_DATE is not set. once we put it in the
	* PM, others can find it, and we still need to fill it. */
	atomic_set(&page->pg_flags, PG_LOCKED \| PG_PAGEMAP);
	/* The sem needs to be initted before anyone can try to lock it,
	* meaning before it is in the page cache. We also want it
	* locked preemptively, by setting signals = 0. */
	sem_init(&page->pg_sem, 0);
	error = pm_insert_page(pm, index, page);
	switch (error) {
	case 0:
	goto load_locked_page;
	break;
	case -EEXIST:
	/* the page was mapped already (benign race), just get
	* rid of our page and try again (the only case that
	* uses the while) */
	atomic_set(&page->pg_flags, 0);
	page_decref(page);
	page = pm_find_page(pm, index);
	break;
	default:
	atomic_set(&page->pg_flags, 0);
	page_decref(page);
	return error;
	}
	}
	assert(page);
	assert(pm_slot_check_refcnt(*page->pg_tree_slot));
	assert(pm_slot_get_page(*page->pg_tree_slot) == page);
	if (atomic_read(&page->pg_flags) & PG_UPTODATE) {
	*pp = page;
	printd("pm %p FOUND page %p, addr %p, idx %d\n", pm, page,
	page2kva(page), index);
	return 0;
	}
	lock_page(page);
	/* double-check. if we we blocked on lock_page, it was probably for
	* someone else loading. plus, we can't load a page more than once (it
	* could clobber newer writes) */
	if (atomic_read(&page->pg_flags) & PG_UPTODATE) {
	unlock_page(page);
	*pp = page;
	return 0;
	}
	/* fall through */
	load_locked_page:
	error = pm->pm_op->readpage(pm, page);
	assert(!error);
	assert(atomic_read(&page->pg_flags) & PG_UPTODATE);
	unlock_page(page);
	*pp = page;
	printd("pm %p LOADS page %p, addr %p, idx %d\n", pm, page,
	page2kva(page), index);
	return 0;
	}

	int pm_load_page_nowait(struct page_map *pm, unsigned long index,
	struct page **pp)
	{
	struct page *page = pm_find_page(pm, index);

	if (!page)
	return -EAGAIN;
	if (!(atomic_read(&page->pg_flags) & PG_UPTODATE)) {
	/* TODO: could have a read_nowait pm_op */
	pm_put_page(page);
	return -EAGAIN;
	}
	*pp = page;
	return 0;
	}

	static bool vmr_has_page_idx(struct vm_region *vmr, unsigned long pg_idx)
	{
	unsigned long nr_pgs = (vmr->vm_end - vmr->vm_base) >> PGSHIFT;
	unsigned long start_pg = vmr->vm_foff >> PGSHIFT;

	if (!vmr->vm_ready)
	return false;
	return ((start_pg <= pg_idx) && (pg_idx < start_pg + nr_pgs));
	}

	/* Runs CB on every PTE in the VMR that corresponds to the file's pg_idx, for up
	* to max_nr_pgs. */
	static void vmr_for_each(struct vm_region *vmr, unsigned long pg_idx,
	unsigned long max_nr_pgs, mem_walk_callback_t callback)
	{
	uintptr_t start_va;
	off64_t file_off = pg_idx << PGSHIFT;
	size_t len = max_nr_pgs << PGSHIFT;

	if (file_off < vmr->vm_foff) {
	len -= vmr->vm_foff - file_off;
	file_off = vmr->vm_foff;
	}

	start_va = vmr->vm_base + (file_off - vmr->vm_foff);
	if (start_va < vmr->vm_base) {
	warn("wraparound! %p %p %p %p", start_va, vmr->vm_base,
	vmr->vm_foff, pg_idx);
	return;
	}
	if (start_va >= vmr->vm_end)
	return;

	len = MIN(len, vmr->vm_end - start_va);
	if (!len)
	return;
	env_user_mem_walk(vmr->vm_proc, (void*)start_va, len, callback, vmr);
	}

	static bool pm_has_vmr_with_page(struct page_map *pm, unsigned long pg_idx)
	{
	struct vm_region *vmr_i;

	spin_lock(&pm->pm_lock);
	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
	if (vmr_has_page_idx(vmr_i, pg_idx)) {
	spin_unlock(&pm->pm_lock);
	return true;
	}
	}
	spin_unlock(&pm->pm_lock);
	return false;
	}

	static bool __remove_or_zero_cb(void *slot, unsigned long tree_idx, void arg)
	{
	struct page_map *pm = arg;
	struct page *page;
	void old_slot_val, slot_val;

	old_slot_val = ACCESS_ONCE(*slot);
	slot_val = old_slot_val;
	page = pm_slot_get_page(slot_val);
	/* We shouldn't have an item in the tree without a page, unless there's
	* another removal. Currently, this CB is called with a qlock. */
	assert(page);
	/* Don't even bother with VMRs that might have faulted in the page */
	if (pm_has_vmr_with_page(pm, tree_idx)) {
	memset(page2kva(page), 0, PGSIZE);
	return false;
	}
	/* syncing with lookups, writebacks, etc - anyone who gets a ref on a PM
	* leaf/page (e.g. pm_load_page / pm_find_page. */
	slot_val = pm_slot_set_page(slot_val, NULL);
	if (pm_slot_check_refcnt(slot_val) \|\|
	!atomic_cas_ptr(slot, old_slot_val, slot_val)) {
	memset(page2kva(page), 0, PGSIZE);
	return false;
	}
	/* We yanked the page out. The radix tree still has an item until we
	* return true, but this is fine. Future lock-free lookups will now
	* fail (since the page is 0), and insertions will block on the write
	* lock. */
	atomic_set(&page->pg_flags, 0); /* cause/catch bugs */
	page_decref(page);
	return true;
	}

	void pm_remove_or_zero_pages(struct page_map *pm, unsigned long start_idx,
	unsigned long nr_pgs)
	{
	unsigned long end_idx = start_idx + nr_pgs;

	assert(end_idx > start_idx);
	qlock(&pm->pm_qlock);
	radix_for_each_slot_in_range(&pm->pm_tree, start_idx, end_idx,
	__remove_or_zero_cb, pm);
	qunlock(&pm->pm_qlock);
	}

	static int __pm_mark_and_clear_dirty(struct proc p, pte_t pte, void va,
	void *arg)
	{
	struct page *page = pa2page(pte_get_paddr(pte));
	struct vm_region *vmr = arg;

	if (!pte_is_present(pte) \|\| !pte_is_dirty(pte))
	return 0;
	if (!(atomic_read(&page->pg_flags) & PG_DIRTY))
	atomic_or(&page->pg_flags, PG_DIRTY);
	pte_clear_dirty(pte);
	vmr->vm_shootdown_needed = true;
	return 0;
	}

	/* Dirty PTE bits will get marked to the struct page itself, and the PTEs will
	* have the dirty bit cleared. VMRs that need a shootdown are marked. Note
	* this only marks PTEs and VMRs if they were the one to do some of the
	* dirtying. */
	static void mark_and_clear_dirty_ptes(struct page_map *pm)
	{
	struct vm_region *vmr_i;
	pte_t pte;

	spin_lock(&pm->pm_lock);
	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
	if (!(vmr_i->vm_prot & PROT_WRITE))
	continue;
	/* Only care about shared mappings, not private. Private
	* mappings have a reference to the file, but the pages are not
	* in the page cache - they hang directly off the PTEs (for
	* now). */
	if (!(vmr_i->vm_flags & MAP_SHARED))
	continue;
	spin_lock(&vmr_i->vm_proc->pte_lock);
	vmr_for_each(vmr_i, 0, ULONG_MAX, __pm_mark_and_clear_dirty);
	spin_unlock(&vmr_i->vm_proc->pte_lock);
	}
	spin_unlock(&pm->pm_lock);
	}

	static void shootdown_vmrs(struct page_map *pm)
	{
	struct vm_region *vmr_i;

	/* The VMR flag shootdown_needed is owned by the PM. Each VMR is hooked
	* to at most one file, so there's no issue there. We might have a proc
	* that has multiple non-private VMRs in the same file, but it shouldn't
	* be a big enough issue to worry about. */
	spin_lock(&pm->pm_lock);
	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
	if (vmr_i->vm_shootdown_needed) {
	vmr_i->vm_shootdown_needed = false;
	proc_tlbshootdown(vmr_i->vm_proc, 0, 0);
	}
	}
	spin_unlock(&pm->pm_lock);
	}

	/* Send any queued WBs that haven't been sent yet. */
	static void flush_queued_writebacks(struct page_map *pm)
	{
	/* TODO (WB) */
	}

	/* Batches up pages to be written back, preferably as one big op. If we have a
	* bunch outstanding, we'll send them. */
	static void queue_writeback(struct page_map pm, struct page page)
	{
	/* TODO (WB): add a bulk op (instead of only writepage()), collect
	* extents, and send them to the device. Probably do something similar
	* for reads. */
	pm->pm_op->writepage(pm, page);
	}

	static bool __writeback_cb(void *slot, unsigned long tree_idx, void arg)
	{
	struct page_map *pm = arg;
	struct page page = pm_slot_get_page(slot);

	/* We're qlocked, so all items should have pages. */
	assert(page);
	if (atomic_read(&page->pg_flags) & PG_DIRTY) {
	atomic_and(&page->pg_flags, ~PG_DIRTY);
	queue_writeback(pm, page);
	}
	return false;
	}

	/* Every dirty page gets written back, regardless of whether it's in a VMR or
	 * not. All the dirty bits get cleared too, before writing back.
	 *
	 * The whole operation runs with pm->pm_qlock held. */
	void pm_writeback_pages(struct page_map *pm)
	{
	qlock(&pm->pm_qlock);
	/* Move dirty bits from PTEs of shared, writable VMRs onto the pages */
	mark_and_clear_dirty_ptes(pm);
	/* Shoot down the VMRs flagged by the previous step */
	shootdown_vmrs(pm);
	/* Clear PG_DIRTY and queue a writeback for each dirty page in the tree */
	radix_for_each_slot(&pm->pm_tree, __writeback_cb, pm);
	flush_queued_writebacks(pm);
	qunlock(&pm->pm_qlock);
	}

	static bool __flush_unused_cb(void *slot, unsigned long tree_idx, void arg)
	{
	struct page_map *pm = arg;
	struct page page = pm_slot_get_page(slot);
	void old_slot_val, slot_val;

	/* We're qlocked, so all items should have pages. */
	assert(page);
	old_slot_val = ACCESS_ONCE(*slot);
	slot_val = old_slot_val;
	/* Under any contention, we just skip it */
	if (pm_slot_check_refcnt(slot_val))
	return false;
	assert(pm_slot_get_page(slot_val) == page);
	slot_val = pm_slot_set_page(slot_val, NULL);
	if (!atomic_cas_ptr(slot, old_slot_val, slot_val))
	return false;
	/* At this point, we yanked the page. any concurrent wait-free users
	* that want to get this page will fail (pm_find_page /
	* pm_load_page_nowait). They will block on the qlock that we hold when
	* they try to insert a page (as part of pm_load_page, for both reading
	* or writing). We can still bail out and everything will be fine, so
	* long as we put the page back.
	*
	* We can't tell from looking at the page if it was actually faulted
	* into the VMR; we just know it was possible. (currently). Also, we
	* need to do this check after removing the page from the PM slot, since
	* the mm faulting code (hpf) will attempt a non-blocking PM lookup. */
	if (pm_has_vmr_with_page(pm, tree_idx)) {
	slot_val = pm_slot_set_page(slot_val, page);
	/* No one should be writing to it. We hold the qlock, and any
	* readers should not have increffed while the page was NULL. */
	WRITE_ONCE(*slot, slot_val);
	return false;
	}
	/* Need to check PG_DIRTY after checking VMRs. o/w we could check,
	* PAUSE, see no VMRs. But in the meantime, we had a VMR that munmapped
	* and wrote-back the dirty flag. */
	if (atomic_read(&page->pg_flags) & PG_DIRTY) {
	/* If we want to batch these, we'll also have to batch the
	* freeing, which isn't a big deal. Just do it before freeing
	* and before unlocking the PM; we don't want someone to load
	* the page from the backing store and get an old value. */
	pm->pm_op->writepage(pm, page);
	}
	/* All clear - the page is unused and (now) clean. */
	atomic_set(&page->pg_flags, 0); /* catch bugs */
	page_decref(page);
	return true;
	}

	/* Unused pages (not currently involved in a read, write, or mmap) are pruned.
	* Dirty pages are written back first.
	*
	* We ignore anything mapped in a VMR. Not bothering with unmapping or
	* shootdowns or anything. At least for now.
	*
	* Runs __flush_unused_cb on every slot of the PM's tree with pm->pm_qlock
	* held. */
	void pm_free_unused_pages(struct page_map *pm)
	{
	qlock(&pm->pm_qlock);
	radix_for_each_slot(&pm->pm_tree, __flush_unused_cb, pm);
	qunlock(&pm->pm_qlock);
	}

	static bool __destroy_cb(void *slot, unsigned long tree_idx, void arg)
	{
	struct page page = pm_slot_get_page(slot);

	/* Should be no users or need to sync */
	assert(pm_slot_check_refcnt(*slot) == 0);
	atomic_set(&page->pg_flags, 0); /* catch bugs */
	page_decref(page);
	return true;
	}

	/* Tears down a page map: runs __destroy_cb on every slot of the PM's tree
	 * (which asserts there are no refs and drops each page), then destroys the
	 * tree itself.  No PM lock is taken here.
	 * NOTE(review): presumably the caller guarantees there are no remaining
	 * users of the PM - confirm against callers. */
	void pm_destroy(struct page_map *pm)
	{
	radix_for_each_slot(&pm->pm_tree, __destroy_cb, pm);
	radix_tree_destroy(&pm->pm_tree);
	}

	/* Debug dump of a page map: prints the PM's address and page count, then one
	 * line per attached VMR (owning pid, address range, prot, flags, backing
	 * file-or-chan pointer, and file offset).  The VMR list is walked under
	 * pm->pm_lock. */
	void print_page_map_info(struct page_map *pm)
	{
		struct vm_region *vmr;

		printk("Page Map %p\n", pm);
		printk("\tNum pages: %lu\n", pm->pm_num_pages);

		spin_lock(&pm->pm_lock);
		TAILQ_FOREACH(vmr, &pm->pm_vmrs, vm_pm_link) {
			printk("\tVMR proc %d: (%p - %p): 0x%08x, 0x%08x, %p, %p\n",
			       vmr->vm_proc->pid,
			       vmr->vm_base,
			       vmr->vm_end,
			       vmr->vm_prot,
			       vmr->vm_flags,
			       foc_pointer(vmr->__vm_foc),
			       vmr->vm_foff);
		}
		spin_unlock(&pm->pm_lock);
	}

	void pm_page_asserter(struct page page, char str)
	{
	void **tree_slot = page->pg_tree_slot;

	if (!page_is_pagemap(page))
	return;
	assert(tree_slot);
	assert(pm_slot_get_page(*tree_slot) == page);
	assert(pm_slot_check_refcnt(*tree_slot) > 0);
	}