Page map interface and munmap changes

The PM interface was slimmed a bit; we'll still need to change the usages of pm_put_page a little for mmap. munmap is a bit more efficient now, and the VMRs are explicitly unmapped. We need to do this instead of just blindly decreffing an entire address space, since PM pages must be handled differently from private pages.
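For orientation before the diff: a page-cache consumer now only sees pm_load_page() and pm_put_page(); find/insert/remove are internal to pagemap.c. A minimal sketch of the intended pattern, modeled on the vfs.c read path further down (variable names are placeholders and error handling is trimmed):

    struct page *page;
    int error;
    /* pm_load_page() hands back an incref'd, up-to-date page (loading it if needed) */
    if ((error = pm_load_page(pm, index, &page)))
        return error;
    memcpy(buf, page2kva(page) + page_off, copy_amt);
    /* the page stays in the cache; we only drop the reference we were given */
    pm_put_page(page);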
diff --git a/kern/include/mm.h b/kern/include/mm.h
index cf1affa..b8dd752 100644
--- a/kern/include/mm.h
+++ b/kern/include/mm.h
@@ -58,7 +58,7 @@
 struct vm_region *find_vmr(struct proc *p, uintptr_t va);
 struct vm_region *find_first_vmr(struct proc *p, uintptr_t va);
 void isolate_vmrs(struct proc *p, uintptr_t va, size_t len);
-void destroy_vmrs(struct proc *p);
+void unmap_and_destroy_vmrs(struct proc *p);
 int duplicate_vmrs(struct proc *p, struct proc *new_p);
 void print_vmrs(struct proc *p);
diff --git a/kern/include/page_alloc.h b/kern/include/page_alloc.h
index b0a7e4c..bda74ce 100644
--- a/kern/include/page_alloc.h
+++ b/kern/include/page_alloc.h
@@ -32,6 +32,7 @@
 #define PG_UPTODATE    0x002    /* page map, filled with file data */
 #define PG_DIRTY       0x004    /* page map, data is dirty */
 #define PG_BUFFER      0x008    /* is a buffer page, has BHs */
+#define PG_PAGEMAP     0x010    /* belongs to a page map */
 
 /* TODO: this struct is not protected from concurrent operations in some
  * functions.  If you want to lock on it, use the spinlock in the semaphore.
diff --git a/kern/include/pagemap.h b/kern/include/pagemap.h
index e302bf3..bbbed5e 100644
--- a/kern/include/pagemap.h
+++ b/kern/include/pagemap.h
@@ -55,9 +55,7 @@
 
 /* Page cache functions */
 void pm_init(struct page_map *pm, struct page_map_operations *op, void *host);
-struct page *pm_find_page(struct page_map *pm, unsigned long index);
-int pm_insert_page(struct page_map *pm, unsigned long index, struct page *page);
-int pm_remove_page(struct page_map *pm, struct page *page);
 int pm_load_page(struct page_map *pm, unsigned long index, struct page **pp);
+void pm_put_page(struct page *page);
 
 #endif /* ROS_KERN_PAGEMAP_H */
diff --git a/kern/src/blockdev.c b/kern/src/blockdev.c
index 8e4f3b5..1f12299 100644
--- a/kern/src/blockdev.c
+++ b/kern/src/blockdev.c
@@ -276,7 +276,7 @@
  * reclaiming will be in page sized chunks from the page cache. */
 void bdev_put_buffer(struct buffer_head *bh)
 {
-    page_decref(bh->bh_page);
+    pm_put_page(bh->bh_page);
 }
 
 /* Block device page map ops: */
diff --git a/kern/src/mm.c b/kern/src/mm.c
index 52cbc2a..3fba9bc 100644
--- a/kern/src/mm.c
+++ b/kern/src/mm.c
@@ -25,6 +25,8 @@
 #include <vfs.h>
 #include <smp.h>
 
+static int __vmr_free_pgs(struct proc *p, pte_t *pte, void *va, void *arg);
+
 struct kmem_cache *vmr_kcache;
 
 void vmr_init(void)
@@ -234,13 +236,16 @@
     split_vmr(vmr, va + len);
 }
 
-/* Destroys all vmrs of a process - important for when files are mmap()d and
- * probably later when we share memory regions */
-void destroy_vmrs(struct proc *p)
+void unmap_and_destroy_vmrs(struct proc *p)
 {
-    struct vm_region *vm_i;
-    TAILQ_FOREACH(vm_i, &p->vm_regions, vm_link)
-        destroy_vmr(vm_i);
+    struct vm_region *vmr_i, *vmr_temp;
+    /* need the safe style, since destroy_vmr modifies the list */
+    TAILQ_FOREACH_SAFE(vmr_i, &p->vm_regions, vm_link, vmr_temp) {
+        /* note this CB sets the PTE = 0, regardless of if it was P or not */
+        env_user_mem_walk(p, (void*)vmr_i->vm_base,
+                          vmr_i->vm_end - vmr_i->vm_base, __vmr_free_pgs, 0);
+        destroy_vmr(vmr_i);
+    }
 }
 
 /* Helper: copies the contents of pages from p to new p.  For pages that aren't
@@ -288,37 +293,6 @@
     }
     return env_user_mem_walk(p, (void*)va_start, va_end - va_start,
                              &copy_page, new_p);
-    /* here's how to do it without the env_user_mem_walk */
-#if 0
-    pte_t *old_pte;
-    struct page *pp;
-    /* For each page, check the old PTE and copy present pages over */
-    for (uintptr_t va_i = va_start; va_i < va_end; va_i += PGSIZE) {
-        old_pte = pgdir_walk(p->env_pgdir, (void*)va_i, 0);
-        if (!old_pte)
-            continue;
-        if (PAGE_PRESENT(*old_pte)) {
-            /* TODO: check for jumbos */
-            if (upage_alloc(new_p, &pp, 0))
-                return -ENOMEM;
-            if (page_insert(new_p->env_pgdir, pp, (void*)va_i,
-                            *old_pte & PTE_PERM)) {
-                page_decref(pp);
-                return -ENOMEM;
-            }
-            pagecopy(page2kva(pp), ppn2kva(PTE2PPN(*old_pte)));
-            page_decref(pp);
-        } else if (PAGE_PAGED_OUT(*old_pte)) {
-            /* TODO: (SWAP) will need to either make a copy or CoW/refcnt the
-             * backend store.  For now, this PTE will be the same as the
-             * original PTE */
-            panic("Swapping not supported!");
-        } else {
-            panic("Weird PTE %p in %s!", *old_pte, __FUNCTION__);
-        }
-    }
-    return 0;
-#endif
 }
 
 /* This will make new_p have the same VMRs as p, and it will make sure all
@@ -648,43 +622,74 @@
     return ret;
 }
 
+static int __munmap_mark_not_present(struct proc *p, pte_t *pte, void *va,
+                                     void *arg)
+{
+    bool *shootdown_needed = (bool*)arg;
+    struct page *page;
+    /* could put in some checks here for !P and also !0 */
+    if (!PAGE_PRESENT(*pte))    /* unmapped (== 0) *ptes are also not PTE_P */
+        return 0;
+    page = ppn2page(PTE2PPN(*pte));
+    *pte &= ~PTE_P;
+    *shootdown_needed = TRUE;
+    return 0;
+}
+
+/* If our page is actually in the PM, we don't do anything.  All a page map
+ * really needs is for our VMR to no longer track it (vmr being in the pm's
+ * list) and to not point at its pages (mark it 0, dude).
+ *
+ * But private mappings mess with that a bit.  Luckily, we can tell by looking
+ * at a page whether the specific page is in the PM or not.  If it isn't, we
+ * still need to free our "VMR local" copy.
+ *
+ * For pages in a PM, we're racing with PM removers.  Both of us sync with the
+ * mm lock, so once we hold the lock, it's a matter of whether or not the PTE is
+ * 0 or not.  If it isn't, then we're still okay to look at the page.  Consider
+ * the PTE a weak ref on the page.  So long as you hold the mm lock, you can
+ * look at the PTE and know the page isn't being freed. */
+static int __vmr_free_pgs(struct proc *p, pte_t *pte, void *va, void *arg)
+{
+    struct page *page;
+    if (!*pte)
+        return 0;
+    page = ppn2page(PTE2PPN(*pte));
+    *pte = 0;
+    if (!(atomic_read(&page->pg_flags) & PG_PAGEMAP))
+        page_decref(page);
+    return 0;
+}
+
 int __do_munmap(struct proc *p, uintptr_t addr, size_t len)
 {
-    struct vm_region *vmr, *next_vmr;
+    struct vm_region *vmr, *next_vmr, *first_vmr;
     pte_t *pte;
     bool shootdown_needed = FALSE;
 
     /* TODO: this will be a bit slow, since we end up doing three linear
      * searches (two in isolate, one in find_first). */
     isolate_vmrs(p, addr, len);
-    vmr = find_first_vmr(p, addr);
+    first_vmr = find_first_vmr(p, addr);
+    vmr = first_vmr;
     while (vmr && vmr->vm_base < addr + len) {
-        for (uintptr_t va = vmr->vm_base; va < vmr->vm_end; va += PGSIZE) {
-            pte = pgdir_walk(p->env_pgdir, (void*)va, 0);
-            if (!pte)
-                continue;
-            if (PAGE_PRESENT(*pte)) {
-                /* TODO: (TLB) race here, where the page can be given out before
-                 * the shootdown happened.  Need to put it on a temp list. */
-                page_t *page = ppn2page(PTE2PPN(*pte));
-                *pte = 0;
-                page_decref(page);
-                shootdown_needed = TRUE;
-            } else if (PAGE_PAGED_OUT(*pte)) {
-                /* TODO: (SWAP) mark free in the swapfile or whatever.  For now,
-                 * PAGED_OUT is also being used to mean "hasn't been mapped
-                 * yet".  Note we now allow PAGE_UNMAPPED, unlike older
-                 * versions of mmap(). */
-                panic("Swapping not supported!");
-                *pte = 0;
-            }
-        }
+        env_user_mem_walk(p, (void*)vmr->vm_base, vmr->vm_end - vmr->vm_base,
+                          __munmap_mark_not_present, &shootdown_needed);
+        vmr = TAILQ_NEXT(vmr, vm_link);
+    }
+    /* we haven't freed the pages yet; still using the PTEs to store the them.
+     * There should be no races with inserts/faults, since we still hold the mm
+     * lock since the previous CB. */
+    if (shootdown_needed)
+        proc_tlbshootdown(p, addr, addr + len);
+    vmr = first_vmr;
+    while (vmr && vmr->vm_base < addr + len) {
+        env_user_mem_walk(p, (void*)vmr->vm_base, vmr->vm_end - vmr->vm_base,
+                          __vmr_free_pgs, 0);
        next_vmr = TAILQ_NEXT(vmr, vm_link);
        destroy_vmr(vmr);
        vmr = next_vmr;
     }
-    if (shootdown_needed)
-        proc_tlbshootdown(p, addr, addr + len);
     return 0;
 }
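To make the new ordering in __do_munmap() explicit, here is the same two-pass structure condensed from the hunk above, with commentary added (the statements themselves come straight from the diff; nothing new is introduced):

    /* Pass 1: hide the pages -- clear PTE_P but leave the PPN in the PTE */
    vmr = first_vmr;
    while (vmr && vmr->vm_base < addr + len) {
        env_user_mem_walk(p, (void*)vmr->vm_base, vmr->vm_end - vmr->vm_base,
                          __munmap_mark_not_present, &shootdown_needed);
        vmr = TAILQ_NEXT(vmr, vm_link);
    }
    /* One shootdown covers the whole range; the mm lock keeps faults/inserts out */
    if (shootdown_needed)
        proc_tlbshootdown(p, addr, addr + len);
    /* Pass 2: no TLB can reach the pages now.  Zero the PTEs, decref only the
     * private (non-PG_PAGEMAP) pages, then tear down the VMRs themselves. */
    vmr = first_vmr;
    while (vmr && vmr->vm_base < addr + len) {
        env_user_mem_walk(p, (void*)vmr->vm_base, vmr->vm_end - vmr->vm_base,
                          __vmr_free_pgs, 0);
        next_vmr = TAILQ_NEXT(vmr, vm_link);
        destroy_vmr(vmr);
        vmr = next_vmr;
    }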
diff --git a/kern/src/pagemap.c b/kern/src/pagemap.c
index 7db0905..c984911 100644
--- a/kern/src/pagemap.c
+++ b/kern/src/pagemap.c
@@ -26,7 +26,7 @@
 /* Looks up the index'th page in the page map, returning an incref'd reference,
  * or 0 if it was not in the map. */
-struct page *pm_find_page(struct page_map *pm, unsigned long index)
+static struct page *pm_find_page(struct page_map *pm, unsigned long index)
 {
     spin_lock(&pm->pm_tree_lock);
     struct page *page = (struct page*)radix_lookup(&pm->pm_tree, index);
@@ -40,14 +40,16 @@
  * error code if there was one already (EEXIST) or we ran out of memory
  * (ENOMEM).  On success, this will preemptively lock the page, and will also
  * store a reference to the page in the pm. */
-int pm_insert_page(struct page_map *pm, unsigned long index, struct page *page)
+static int pm_insert_page(struct page_map *pm, unsigned long index,
+                          struct page *page)
 {
     int error = 0;
     spin_lock(&pm->pm_tree_lock);
     error = radix_insert(&pm->pm_tree, index, page);
     if (!error) {
         page_incref(page);
-        atomic_or(&page->pg_flags, PG_LOCKED | PG_BUFFER);
+        /* setting PG_BUF since we know it'll be used for IO later... */
+        atomic_or(&page->pg_flags, PG_LOCKED | PG_BUFFER | PG_PAGEMAP);
         page->pg_sem.nr_signals = 0;    /* ensure others will block */
         page->pg_mapping = pm;
         page->pg_index = index;
@@ -57,25 +59,9 @@
     return error;
 }
 
-/* Removes the page, including its reference.  Not sure yet what interface we
- * want to this (pm and index or page), and this has never been used.  There are
- * also issues with when you want to call this, since a page in the cache may be
- * mmap'd by someone else. */
-int pm_remove_page(struct page_map *pm, struct page *page)
+void pm_put_page(struct page *page)
 {
-    void *retval;
-    warn("pm_remove_page() hasn't been thought through or tested.");
-    /* TODO: check for dirty pages, don't let them be removed right away.  Need
-     * to schedule them for writeback, and then remove them later (callback).
-     * Also, need to be careful - anyone holding a reference to a page can dirty
-     * it concurrently. */
-    spin_lock(&pm->pm_tree_lock);
-    retval = radix_delete(&pm->pm_tree, page->pg_index);
-    spin_unlock(&pm->pm_tree_lock);
-    assert(retval == (void*)page);
     page_decref(page);
-    pm->pm_num_pages--;
-    return 0;
 }
 
 /* Makes sure the index'th page of the mapped object is loaded in the page cache
@@ -143,6 +129,6 @@
     assert(!error);
     /* Unlock, since we're done with the page and it is up to date */
     unlock_page(page);
-    assert(page->pg_flags & PG_UPTODATE);
+    assert(atomic_read(&page->pg_flags) & PG_UPTODATE);
     return 0;
 }
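The PG_PAGEMAP bit that pm_insert_page() now sets is what the teardown paths key off of. Condensed from __vmr_free_pgs() in the mm.c hunk above, with the reasoning spelled out as comments (illustrative restatement, not new code):

    /* A PTE pointing at a page-map page is only a weak reference: clearing the
     * PTE is enough, since the PM's radix tree holds its own ref.  A private,
     * "VMR local" copy is ours alone, so we must drop its reference here. */
    if (!(atomic_read(&page->pg_flags) & PG_PAGEMAP))
        page_decref(page);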
diff --git a/kern/src/process.c b/kern/src/process.c
index ebcb244..71326f0 100644
--- a/kern/src/process.c
+++ b/kern/src/process.c
@@ -400,6 +400,12 @@
     return p;
 }
 
+static int __cb_assert_no_pg(struct proc *p, pte_t *pte, void *va, void *arg)
+{
+    assert(!*pte);
+    return 0;
+}
+
 /* This is called by kref_put(), once the last reference to the process is
  * gone.  Don't call this otherwise (it will panic).  It will clean up the
  * address space and deallocate any other used memory. */
@@ -421,7 +427,8 @@
     kfree(p->fgrp);
     kref_put(&p->fs_env.root->d_kref);
     kref_put(&p->fs_env.pwd->d_kref);
-    destroy_vmrs(p);
+    /* now we'll finally decref files for the file-backed vmrs */
+    unmap_and_destroy_vmrs(p);
     frontend_proc_free(p);    /* TODO: please remove me one day */
     /* Free any colors allocated to this process */
     if (p->cache_colors_map != global_cache_colors_map) {
@@ -435,9 +442,11 @@
         panic("Proc not in the pid table in %s", __FUNCTION__);
     spin_unlock(&pid_hash_lock);
     put_free_pid(p->pid);
-    /* Flush all mapped pages in the user portion of the address space */
-    env_user_mem_free(p, 0, UVPT);
-    /* These need to be free again, since they were allocated with a refcnt. */
+    /* all memory below UMAPTOP should have been freed via the VMRs.  the stuff
+     * above is the global page and procinfo/procdata */
+    env_user_mem_free(p, (void*)UMAPTOP, UVPT - UMAPTOP); /* 3rd arg = len... */
+    env_user_mem_walk(p, 0, UMAPTOP, __cb_assert_no_pg, 0);
+    /* These need to be freed again, since they were allocated with a refcnt. */
     free_cont_pages(p->procinfo, LOG2_UP(PROCINFO_NUM_PAGES));
     free_cont_pages(p->procdata, LOG2_UP(PROCDATA_NUM_PAGES));
@@ -826,7 +835,10 @@
      * a parent sleeping on our pipe, that parent won't wake up to decref until
      * the pipe closes.  And if the parent doesnt decref, we don't free.
      * alternatively, we could send a SIGCHILD to the parent, but that would
-     * require parent's to never ignore that signal (or risk never reaping) */
+     * require parent's to never ignore that signal (or risk never reaping).
+     *
+     * Also note that any mmap'd files will still be mmapped.  You can close the
+     * file after mmapping, with no effect. */
     close_9ns_files(p, FALSE);
     close_all_files(&p->open_files, FALSE);
     /* Tell the ksched about our death, and which cores we freed up */
diff --git a/kern/src/syscall.c b/kern/src/syscall.c
index e5ee981..210537a 100644
--- a/kern/src/syscall.c
+++ b/kern/src/syscall.c
@@ -582,7 +582,7 @@
 #endif
     /* When we destroy our memory regions, accessing cur_sysc would PF */
     pcpui->cur_kthread->sysc = 0;
-    destroy_vmrs(p);
+    unmap_and_destroy_vmrs(p);
     close_9ns_files(p, TRUE);
     close_all_files(&p->open_files, TRUE);
     env_user_mem_free(p, 0, UMAPTOP);
diff --git a/kern/src/vfs.c b/kern/src/vfs.c
index fb56671..4151bd3 100644
--- a/kern/src/vfs.c
+++ b/kern/src/vfs.c
@@ -1168,7 +1168,7 @@
         }
         buf += copy_amt;
         page_off = 0;
-        page_decref(page);    /* it's still in the cache, we just don't need it */
+        pm_put_page(page);    /* it's still in the cache, we just don't need it */
     }
     assert(buf == buf_end);
     *offset += count;
@@ -1218,7 +1218,8 @@
         }
         buf += copy_amt;
         page_off = 0;
-        page_decref(page);    /* it's still in the cache, we just don't need it */
+        atomic_or(&page->pg_flags, PG_DIRTY);
+        pm_put_page(page);    /* it's still in the cache, we just don't need it */
     }
     assert(buf == buf_end);
     *offset += count;
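The write path is the one place that now dirties a page before releasing it; the assumption is that a later writeback pass will pick up PG_DIRTY pages, since nothing flushes them yet. A minimal sketch of that pattern (placeholder variable names; the real loop in vfs.c also handles partial pages and load errors):

    memcpy(page2kva(page) + page_off, buf, copy_amt);
    atomic_or(&page->pg_flags, PG_DIRTY);    /* mark it for eventual writeback */
    pm_put_page(page);                       /* still cached; just drop our ref */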