/* Copyright (c) 2014 The Regents of the University of California
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Page mapping: maps an object (inode or block dev) in page size chunks.
 * Analogous to Linux's "struct address_space". */

#include <pmap.h>
#include <atomic.h>
#include <radix.h>
#include <kref.h>
#include <assert.h>
#include <stdio.h>
#include <pagemap.h>
#include <rcu.h>

void pm_add_vmr(struct page_map *pm, struct vm_region *vmr)
{
	/* The list of VMRs reverse-mapped by the PM is protected by the PM's
	 * lock.  Insertion and removal both grab it, as do the page-removal
	 * paths when they scan for VMRs, so while such a scan holds the lock,
	 * munmaps are delayed and the VMR stays connected. */
	spin_lock(&pm->pm_lock);
	TAILQ_INSERT_TAIL(&pm->pm_vmrs, vmr, vm_pm_link);
	spin_unlock(&pm->pm_lock);
}

void pm_remove_vmr(struct page_map *pm, struct vm_region *vmr)
{
	spin_lock(&pm->pm_lock);
	TAILQ_REMOVE(&pm->pm_vmrs, vmr, vm_pm_link);
	spin_unlock(&pm->pm_lock);
}

/* PM slot values (the void *s stored in the radix tree) look like this:
 *
 * |--11--|--1--|----52 or 20 bits--|
 * | ref  | flag|    ppn of page    |
 *              \  <--- meta shift -/
 *
 * The setter funcs return the void * that should be stored back into slot_val;
 * they don't change slot_val in place (it's a value, not the address). */

#ifdef CONFIG_64BIT
# define PM_FLAGS_SHIFT 52
#else
# define PM_FLAGS_SHIFT 20
#endif
#define PM_REFCNT_SHIFT (PM_FLAGS_SHIFT + 1)

#define PM_UNUSED_FLAG (1UL << PM_FLAGS_SHIFT)
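
/* Worked example (illustrative only): on 64-bit, a slot holding ppn 0x42 with
 * two outstanding slot refs and the flag bit clear encodes as
 *
 *	(2UL << PM_REFCNT_SHIFT) | 0x42 == 0x0040000000000042
 *
 * so pm_slot_check_refcnt() returns 2 and pm_slot_get_page() recovers ppn
 * 0x42. */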

static int pm_slot_check_refcnt(void *slot_val)
{
	return (unsigned long)slot_val >> PM_REFCNT_SHIFT;
}

static void *pm_slot_inc_refcnt(void *slot_val)
{
	void *ret;

	ret = (void*)((unsigned long)slot_val + (1UL << PM_REFCNT_SHIFT));
	/* Catches previously negative refcnts */
	assert(pm_slot_check_refcnt(ret) > 0);
	return ret;
}

static void *pm_slot_dec_refcnt(void *slot_val)
{
	assert(pm_slot_check_refcnt(slot_val) > 0);
	return (void*)((unsigned long)slot_val - (1UL << PM_REFCNT_SHIFT));
}

static struct page *pm_slot_get_page(void *slot_val)
{
	if (!slot_val)
		return 0;
	return ppn2page((unsigned long)slot_val & ((1UL << PM_FLAGS_SHIFT) - 1));
}

static void *pm_slot_set_page(void *slot_val, struct page *pg)
{
	assert(pg != pages);	/* we should never alloc page 0, for sanity */
	return (void*)(page2ppn(pg) | ((unsigned long)slot_val &
	                               ~((1UL << PM_FLAGS_SHIFT) - 1)));
}
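
/* The helpers above are pure functions on slot values.  The canonical update
 * pattern (used by pm_find_page() and the removal CBs below) is a read /
 * modify / CAS-retry loop; a minimal sketch, here taking a slot ref:
 *
 *	do {
 *		old_slot_val = ACCESS_ONCE(*tree_slot);
 *		slot_val = pm_slot_inc_refcnt(old_slot_val);
 *	} while (!atomic_cas_ptr(tree_slot, old_slot_val, slot_val));
 */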

/* Initializes a PM.  Host should be an fs_file.  The reference this stores is
 * uncounted. */
void pm_init(struct page_map *pm, struct page_map_operations *op, void *host)
{
	pm->pm_file = host;
	radix_tree_init(&pm->pm_tree);
	pm->pm_num_pages = 0;
	pm->pm_op = op;
	qlock_init(&pm->pm_qlock);
	spinlock_init(&pm->pm_lock);
	TAILQ_INIT(&pm->pm_vmrs);
}
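
/* Minimal wiring sketch (hypothetical names: foo_readpage/foo_writepage are
 * assumed to match the pm_op->readpage/writepage signatures used below, and
 * 'file->pm' is an illustrative embedded page_map; other page_map_operations
 * fields, if any, are omitted):
 *
 *	static struct page_map_operations foo_pm_ops = {
 *		.readpage = foo_readpage,
 *		.writepage = foo_writepage,
 *	};
 *
 *	pm_init(&file->pm, &foo_pm_ops, file);
 */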

/* Looks up the index'th page in the page map, returning a refcnt'd reference
 * that needs to be dropped with pm_put_page, or 0 if it was not in the map. */
static struct page *pm_find_page(struct page_map *pm, unsigned long index)
{
	void **tree_slot;
	void *old_slot_val, *slot_val;
	struct page *page = 0;

	/* We use RCU to protect our radix walk, specifically the tree_slot pointer.
	 * We get our own 'pm refcnt' on the slot itself, which doesn't need RCU. */
	rcu_read_lock();
	/* We're syncing with removal.  If the slot holds a page (the only case
	 * where we grab it), we up the slot refcnt with a CAS on the slot.
	 * Removers clear the page pointer with their own CAS and back off if the
	 * refcnt is up; if we race, one of the two CASes fails, so a page we
	 * successfully increffed can't be yanked out from under us. */
	tree_slot = radix_lookup_slot(&pm->pm_tree, index);
	if (!tree_slot)
		goto out;
	do {
		old_slot_val = ACCESS_ONCE(*tree_slot);
		slot_val = old_slot_val;
		page = pm_slot_get_page(slot_val);
		if (!page)
			goto out;
		slot_val = pm_slot_inc_refcnt(slot_val);	/* not a page kref */
	} while (!atomic_cas_ptr(tree_slot, old_slot_val, slot_val));
	assert(page->pg_tree_slot == tree_slot);
out:
	rcu_read_unlock();
	return page;
}

/* Attempts to insert the page into the page_map, returns 0 for success, or an
 * error code if there was one already (-EEXIST) or we ran out of memory
 * (-ENOMEM).
 *
 * On success, callers *lose* their page ref, but get a PM slot ref.  This slot
 * ref is sufficient to keep the page alive (slot ref protects the page ref).
 *
 * Makes no assumptions about the quality of the data loaded, that's up to the
 * caller. */
static int pm_insert_page(struct page_map *pm, unsigned long index,
                          struct page *page)
{
	int ret;
	void **tree_slot;
	void *slot_val = 0;

	page->pg_mapping = pm;	/* debugging */
	page->pg_index = index;
	/* no one should be looking at the tree slot til we finish the insert and
	 * unlock.  the only other one who looks without a slot ref is removal,
	 * which also requires the qlock. */
	page->pg_tree_slot = (void*)0xdeadbeef;	/* poison */
	slot_val = pm_slot_inc_refcnt(slot_val);
	/* passing the page ref from the caller to the slot */
	slot_val = pm_slot_set_page(slot_val, page);
	qlock(&pm->pm_qlock);
	ret = radix_insert(&pm->pm_tree, index, slot_val, &tree_slot);
	if (ret) {
		qunlock(&pm->pm_qlock);
		return ret;
	}
	page->pg_tree_slot = tree_slot;
	pm->pm_num_pages++;
	qunlock(&pm->pm_qlock);
	return 0;
}

/* Decrefs the PM slot ref (usage of a PM page).  The PM's page ref remains. */
void pm_put_page(struct page *page)
{
	void **tree_slot = page->pg_tree_slot;

	assert(tree_slot);
	assert(pm_slot_get_page(*tree_slot) == page);
	assert(pm_slot_check_refcnt(*tree_slot) > 0);
	/* decref, don't care about CASing */
	atomic_add((atomic_t*)tree_slot, -(1UL << PM_REFCNT_SHIFT));
}

/* Makes sure the index'th page of the mapped object is loaded in the page cache
 * and returns its location via **pp.
 *
 * You'll get a pm-slot refcnt back, which you need to put when you're done. */
int pm_load_page(struct page_map *pm, unsigned long index, struct page **pp)
{
	struct page *page;
	int error;

	page = pm_find_page(pm, index);
	while (!page) {
		if (kpage_alloc(&page))
			return -ENOMEM;
		/* important that PG_UPTODATE is not set.  once we put it in the PM,
		 * others can find it, and we still need to fill it. */
		atomic_set(&page->pg_flags, PG_LOCKED | PG_PAGEMAP);
		/* The sem needs to be initted before anyone can try to lock it, meaning
		 * before it is in the page cache.  We also want it locked preemptively,
		 * by setting signals = 0. */
		sem_init(&page->pg_sem, 0);
		error = pm_insert_page(pm, index, page);
		switch (error) {
			case 0:
				goto load_locked_page;
				break;
			case -EEXIST:
				/* the page was mapped already (benign race), just get rid of
				 * our page and try again (the only case that uses the while) */
				atomic_set(&page->pg_flags, 0);
				page_decref(page);
				page = pm_find_page(pm, index);
				break;
			default:
				atomic_set(&page->pg_flags, 0);
				page_decref(page);
				return error;
		}
	}
	assert(page);
	assert(pm_slot_check_refcnt(*page->pg_tree_slot));
	assert(pm_slot_get_page(*page->pg_tree_slot) == page);
	if (atomic_read(&page->pg_flags) & PG_UPTODATE) {
		*pp = page;
		printd("pm %p FOUND page %p, addr %p, idx %lu\n", pm, page,
		       page2kva(page), index);
		return 0;
	}
	lock_page(page);
	/* double-check.  if we blocked on lock_page, it was probably for someone
	 * else loading.  plus, we can't load a page more than once (it could
	 * clobber newer writes) */
	if (atomic_read(&page->pg_flags) & PG_UPTODATE) {
		unlock_page(page);
		*pp = page;
		return 0;
	}
	/* fall through */
load_locked_page:
	error = pm->pm_op->readpage(pm, page);
	assert(!error);
	assert(atomic_read(&page->pg_flags) & PG_UPTODATE);
	unlock_page(page);
	*pp = page;
	printd("pm %p LOADS page %p, addr %p, idx %lu\n", pm, page,
	       page2kva(page), index);
	return 0;
}
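
/* Typical caller pattern, as a sketch (the destination buffer and copy size
 * are hypothetical, and error handling is trimmed): the load pins a slot ref,
 * which the caller drops with pm_put_page() when done with the page.
 *
 *	struct page *page;
 *
 *	if (!pm_load_page(pm, index, &page)) {
 *		memcpy(buf, page2kva(page), PGSIZE);
 *		pm_put_page(page);
 *	}
 */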

int pm_load_page_nowait(struct page_map *pm, unsigned long index,
                        struct page **pp)
{
	struct page *page = pm_find_page(pm, index);

	if (!page)
		return -EAGAIN;
	if (!(atomic_read(&page->pg_flags) & PG_UPTODATE)) {
		/* TODO: could have a read_nowait pm_op */
		pm_put_page(page);
		return -EAGAIN;
	}
	*pp = page;
	return 0;
}

static bool vmr_has_page_idx(struct vm_region *vmr, unsigned long pg_idx)
{
	unsigned long nr_pgs = (vmr->vm_end - vmr->vm_base) >> PGSHIFT;
	unsigned long start_pg = vmr->vm_foff >> PGSHIFT;

	if (!vmr->vm_ready)
		return false;
	return ((start_pg <= pg_idx) && (pg_idx < start_pg + nr_pgs));
}
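
/* Worked example (illustrative numbers): a VMR spanning 0x100000-0x104000
 * (four pages) with vm_foff 0x3000 covers file page indices 3 through 6
 * inclusive. */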

/* Runs CB on every PTE in the VMR that corresponds to the file's pg_idx, for up
 * to max_nr_pgs. */
static void vmr_for_each(struct vm_region *vmr, unsigned long pg_idx,
                         unsigned long max_nr_pgs, mem_walk_callback_t callback)
{
	uintptr_t start_va;
	off64_t file_off = pg_idx << PGSHIFT;
	size_t len = max_nr_pgs << PGSHIFT;

	if (file_off < vmr->vm_foff) {
		len -= vmr->vm_foff - file_off;
		file_off = vmr->vm_foff;
	}

	start_va = vmr->vm_base + (file_off - vmr->vm_foff);
	if (start_va < vmr->vm_base) {
		warn("wraparound! %p %p %p %p", start_va, vmr->vm_base, vmr->vm_foff,
		     pg_idx);
		return;
	}
	if (start_va >= vmr->vm_end)
		return;

	len = MIN(len, vmr->vm_end - start_va);
	if (!len)
		return;
	env_user_mem_walk(vmr->vm_proc, (void*)start_va, len, callback, vmr);
}
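
/* Continuing the example above: pg_idx 5 gives file_off 0x5000, which is
 * 0x2000 past vm_foff, so the walk starts at start_va 0x102000 and len is
 * clipped to the 0x2000 remaining before vm_end. */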

static bool pm_has_vmr_with_page(struct page_map *pm, unsigned long pg_idx)
{
	struct vm_region *vmr_i;

	spin_lock(&pm->pm_lock);
	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
		if (vmr_has_page_idx(vmr_i, pg_idx)) {
			spin_unlock(&pm->pm_lock);
			return true;
		}
	}
	spin_unlock(&pm->pm_lock);
	return false;
}

static bool __remove_or_zero_cb(void **slot, unsigned long tree_idx, void *arg)
{
	struct page_map *pm = arg;
	struct page *page;
	void *old_slot_val, *slot_val;

	old_slot_val = ACCESS_ONCE(*slot);
	slot_val = old_slot_val;
	page = pm_slot_get_page(slot_val);
	/* We shouldn't have an item in the tree without a page, unless there's
	 * another removal.  Currently, this CB is called with the qlock held. */
	assert(page);
	/* Don't even bother with VMRs that might have faulted in the page */
	if (pm_has_vmr_with_page(pm, tree_idx)) {
		memset(page2kva(page), 0, PGSIZE);
		return false;
	}
	/* syncing with lookups, writebacks, etc - anyone who gets a ref on a PM
	 * leaf/page (e.g. pm_load_page / pm_find_page). */
	slot_val = pm_slot_set_page(slot_val, NULL);
	if (pm_slot_check_refcnt(slot_val) ||
	        !atomic_cas_ptr(slot, old_slot_val, slot_val)) {
		memset(page2kva(page), 0, PGSIZE);
		return false;
	}
	/* We yanked the page out.  The radix tree still has an item until we return
	 * true, but this is fine.  Future lock-free lookups will now fail (since
	 * the page is 0), and insertions will block on the qlock. */
	atomic_set(&page->pg_flags, 0);	/* cause/catch bugs */
	page_decref(page);
	return true;
}

void pm_remove_or_zero_pages(struct page_map *pm, unsigned long start_idx,
                             unsigned long nr_pgs)
{
	unsigned long end_idx = start_idx + nr_pgs;

	assert(end_idx > start_idx);
	qlock(&pm->pm_qlock);
	radix_for_each_slot_in_range(&pm->pm_tree, start_idx, end_idx,
	                             __remove_or_zero_cb, pm);
	qunlock(&pm->pm_qlock);
}

static int __pm_mark_and_clear_dirty(struct proc *p, pte_t pte, void *va,
                                     void *arg)
{
	struct page *page = pa2page(pte_get_paddr(pte));
	struct vm_region *vmr = arg;

	if (!pte_is_present(pte) || !pte_is_dirty(pte))
		return 0;
	if (!(atomic_read(&page->pg_flags) & PG_DIRTY))
		atomic_or(&page->pg_flags, PG_DIRTY);
	pte_clear_dirty(pte);
	vmr->vm_shootdown_needed = true;
	return 0;
}

/* Dirty PTE bits get transferred to the struct page itself (PG_DIRTY), and the
 * PTEs have their dirty bits cleared.  VMRs that need a shootdown are marked.
 * Note PTEs are only cleared and VMRs only marked if they did some of the
 * dirtying. */
static void mark_and_clear_dirty_ptes(struct page_map *pm)
{
	struct vm_region *vmr_i;
	pte_t pte;

	spin_lock(&pm->pm_lock);
	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
		if (!(vmr_i->vm_prot & PROT_WRITE))
			continue;
		/* Only care about shared mappings, not private.  Private mappings have
		 * a reference to the file, but the pages are not in the page cache -
		 * they hang directly off the PTEs (for now). */
		if (!(vmr_i->vm_flags & MAP_SHARED))
			continue;
		spin_lock(&vmr_i->vm_proc->pte_lock);
		vmr_for_each(vmr_i, 0, ULONG_MAX, __pm_mark_and_clear_dirty);
		spin_unlock(&vmr_i->vm_proc->pte_lock);
	}
	spin_unlock(&pm->pm_lock);
}

static void shootdown_vmrs(struct page_map *pm)
{
	struct vm_region *vmr_i;

	/* The VMR flag shootdown_needed is owned by the PM.  Each VMR is hooked to
	 * at most one file, so there's no issue there.  We might have a proc that
	 * has multiple non-private VMRs in the same file, but it shouldn't be a big
	 * enough issue to worry about. */
	spin_lock(&pm->pm_lock);
	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
		if (vmr_i->vm_shootdown_needed) {
			vmr_i->vm_shootdown_needed = false;
			proc_tlbshootdown(vmr_i->vm_proc, 0, 0);
		}
	}
	spin_unlock(&pm->pm_lock);
}

/* Send any queued WBs that haven't been sent yet. */
static void flush_queued_writebacks(struct page_map *pm)
{
	/* TODO (WB) */
}

/* Batches up pages to be written back, preferably as one big op.  If we have a
 * bunch outstanding, we'll send them. */
static void queue_writeback(struct page_map *pm, struct page *page)
{
	/* TODO (WB): add a bulk op (instead of only writepage()), collect extents,
	 * and send them to the device.  Probably do something similar for reads. */
	pm->pm_op->writepage(pm, page);
}

static bool __writeback_cb(void **slot, unsigned long tree_idx, void *arg)
{
	struct page_map *pm = arg;
	struct page *page = pm_slot_get_page(*slot);

	/* We're qlocked, so all items should have pages. */
	assert(page);
	if (atomic_read(&page->pg_flags) & PG_DIRTY) {
		atomic_and(&page->pg_flags, ~PG_DIRTY);
		queue_writeback(pm, page);
	}
	return false;
}

/* Every dirty page gets written back, regardless of whether it's in a VMR or
 * not.  All the dirty bits get cleared too, before writing back. */
void pm_writeback_pages(struct page_map *pm)
{
	qlock(&pm->pm_qlock);
	mark_and_clear_dirty_ptes(pm);
	shootdown_vmrs(pm);
	radix_for_each_slot(&pm->pm_tree, __writeback_cb, pm);
	flush_queued_writebacks(pm);
	qunlock(&pm->pm_qlock);
}

static bool __flush_unused_cb(void **slot, unsigned long tree_idx, void *arg)
{
	struct page_map *pm = arg;
	struct page *page = pm_slot_get_page(*slot);
	void *old_slot_val, *slot_val;

	/* We're qlocked, so all items should have pages. */
	assert(page);
	old_slot_val = ACCESS_ONCE(*slot);
	slot_val = old_slot_val;
	/* Under any contention, we just skip it */
	if (pm_slot_check_refcnt(slot_val))
		return false;
	assert(pm_slot_get_page(slot_val) == page);
	slot_val = pm_slot_set_page(slot_val, NULL);
	if (!atomic_cas_ptr(slot, old_slot_val, slot_val))
		return false;
	/* At this point, we yanked the page.  Any concurrent wait-free users that
	 * want to get this page will fail (pm_find_page / pm_load_page_nowait).
	 * They will block on the qlock that we hold when they try to insert a page
	 * (as part of pm_load_page, for both reading or writing).  We can still
	 * bail out and everything will be fine, so long as we put the page back.
	 *
	 * We can't tell from looking at the page if it was actually faulted into
	 * the VMR; we just know it was possible (currently).  Also, we need to do
	 * this check after removing the page from the PM slot, since the mm
	 * faulting code (hpf) will attempt a non-blocking PM lookup. */
	if (pm_has_vmr_with_page(pm, tree_idx)) {
		slot_val = pm_slot_set_page(slot_val, page);
		/* No one should be writing to it.  We hold the qlock, and any readers
		 * should not have increffed while the page was NULL. */
		WRITE_ONCE(*slot, slot_val);
		return false;
	}
	/* Need to check PG_DIRTY *after* checking VMRs.  Otherwise we could see a
	 * clean page and pause, then see no VMRs, while in the meantime a VMR
	 * munmapped and propagated its PTE dirty bit into PG_DIRTY - and we'd free
	 * a dirty page without writing it back. */
	if (atomic_read(&page->pg_flags) & PG_DIRTY) {
		/* If we want to batch these, we'll also have to batch the freeing,
		 * which isn't a big deal.  Just do it before freeing and before
		 * unlocking the PM; we don't want someone to load the page from the
		 * backing store and get an old value. */
		pm->pm_op->writepage(pm, page);
	}
	/* All clear - the page is unused and (now) clean. */
	atomic_set(&page->pg_flags, 0);	/* catch bugs */
	page_decref(page);
	return true;
}

/* Unused pages (not currently involved in a read, write, or mmap) are pruned.
 * Dirty pages are written back first.
 *
 * We ignore anything mapped in a VMR.  Not bothering with unmapping or
 * shootdowns or anything.  At least for now. */
void pm_free_unused_pages(struct page_map *pm)
{
	qlock(&pm->pm_qlock);
	radix_for_each_slot(&pm->pm_tree, __flush_unused_cb, pm);
	qunlock(&pm->pm_qlock);
}

static bool __destroy_cb(void **slot, unsigned long tree_idx, void *arg)
{
	struct page *page = pm_slot_get_page(*slot);

	/* Should be no users or need to sync */
	assert(pm_slot_check_refcnt(*slot) == 0);
	atomic_set(&page->pg_flags, 0);	/* catch bugs */
	page_decref(page);
	return true;
}

void pm_destroy(struct page_map *pm)
{
	radix_for_each_slot(&pm->pm_tree, __destroy_cb, pm);
	radix_tree_destroy(&pm->pm_tree);
}

void print_page_map_info(struct page_map *pm)
{
	struct vm_region *vmr_i;

	printk("Page Map %p\n", pm);
	printk("\tNum pages: %lu\n", pm->pm_num_pages);
	spin_lock(&pm->pm_lock);
	TAILQ_FOREACH(vmr_i, &pm->pm_vmrs, vm_pm_link) {
		printk("\tVMR proc %d: (%p - %p): 0x%08x, 0x%08x, %p, %p\n",
		       vmr_i->vm_proc->pid, vmr_i->vm_base, vmr_i->vm_end,
		       vmr_i->vm_prot, vmr_i->vm_flags, foc_pointer(vmr_i->__vm_foc),
		       vmr_i->vm_foff);
	}
	spin_unlock(&pm->pm_lock);
}

void pm_page_asserter(struct page *page, char *str)
{
	void **tree_slot = page->pg_tree_slot;

	if (!page_is_pagemap(page))
		return;
	assert(tree_slot);
	assert(pm_slot_get_page(*tree_slot) == page);
	assert(pm_slot_check_refcnt(*tree_slot) > 0);
}