WIP-pop-3000
diff --git a/kern/arch/x86/entry64.S b/kern/arch/x86/entry64.S index 61774cd..bcc3634 100644 --- a/kern/arch/x86/entry64.S +++ b/kern/arch/x86/entry64.S
@@ -287,7 +287,8 @@ _start: movl $stack32top, %esp push %ebx # save mulitboot info - movw $0x1234,0x472 # warm boot + #movw $0x1234,0x472 # warm boot + movw $0x4321,0x472 # preserve ram... movl $0x80000001, %eax # some machines / VMs might not support long mode cpuid
diff --git a/kern/arch/x86/page_alloc.c b/kern/arch/x86/page_alloc.c index e0161cb..6dece87 100644 --- a/kern/arch/x86/page_alloc.c +++ b/kern/arch/x86/page_alloc.c
@@ -9,6 +9,9 @@ #include <multiboot.h> #include <arena.h> +physaddr_t mefs_start; +size_t mefs_size; + /* Helper. Adds free entries to the base arena. Most entries are page aligned, * though on some machines below EXTPHYSMEM we may have some that aren't. */ static void parse_mboot_region(struct multiboot_mmap_entry *entry, void *data) @@ -18,6 +21,14 @@ size_t len = entry->len; extern char end[]; +// XXX +if (start == 0x0000000100000000) { + mefs_start = (uintptr_t)KADDR(start); + mefs_size = len; + return; +} + + if (entry->type != MULTIBOOT_MEMORY_AVAILABLE) return; /* Skip anything over max_paddr - might be bad entries(?) */
diff --git a/kern/arch/x86/smp_boot.c b/kern/arch/x86/smp_boot.c index 133b9d6..fdd7b2d 100644 --- a/kern/arch/x86/smp_boot.c +++ b/kern/arch/x86/smp_boot.c
@@ -101,7 +101,7 @@ } // this needs to be set in smp_entry too... -#define trampoline_pg 0x00001000UL +#define trampoline_pg KADDR(0x00001000UL) extern char smp_entry[]; extern char smp_entry_end[]; extern char smp_boot_lock[]; @@ -128,11 +128,12 @@ struct per_cpu_info *pcpui0 = &per_cpu_info[0]; page_t *smp_stack; + //XXX + //x86_cleanup_bootmem(); // NEED TO GRAB A LOWMEM FREE PAGE FOR AP BOOTUP CODE // page1 (2nd page) is reserved, hardcoded in pmap.c - memset(KADDR(trampoline_pg), 0, PGSIZE); - memcpy(KADDR(trampoline_pg), (void *)smp_entry, - smp_entry_end - smp_entry); + memset(trampoline_pg, 0, PGSIZE); + memcpy(trampoline_pg, smp_entry, smp_entry_end - smp_entry); /* Make sure the trampoline page is mapped. 64 bit already has the tramp pg * mapped (1 GB of lowmem), so this is a nop. */ @@ -183,6 +184,7 @@ #endif /* CONFIG_DISABLE_SMT */ /* cleans up the trampoline page, and any other low boot mem mappings */ + //XXX x86_cleanup_bootmem(); /* trampoline_pg had a refcount of 2 earlier, so we need to dec once more to * free it but only if all cores are in (or we reset / reinit those that
diff --git a/kern/arch/x86/smp_entry64.S b/kern/arch/x86/smp_entry64.S index 6c9e350..caab39b 100644 --- a/kern/arch/x86/smp_entry64.S +++ b/kern/arch/x86/smp_entry64.S
@@ -62,6 +62,18 @@ orl $(CR0_PE | CR0_PG | CR0_WP | CR0_NE | CR0_MP), %eax andl $(~(CR0_AM | CR0_TS | CR0_EM | CR0_CD | CR0_NW)), %eax movl %eax, %cr0 + + # paging is on, but we don't have that low page mapped. hence the crash + # here + # even if we made this work, we'd probably want pg 0 mapped until the + # IDT is up + # + # 1)maybe smp boot in entry.S and pause + # lock after long_mode + # 2)faster syscalls with bools and whatnot. see email + + #1:jmp 1b + # load the 64bit GDT and jump to long mode (symbol from entry64) lgdt gdt64desc # Want to jump to the label long_mode, but we need to relocate to code
diff --git a/kern/arch/x86/trap.c b/kern/arch/x86/trap.c index 01b0dd5..0fac56e 100644 --- a/kern/arch/x86/trap.c +++ b/kern/arch/x86/trap.c
@@ -552,6 +552,7 @@ // Handle processor exceptions. switch(hw_tf->tf_trapno) { case T_BRKPT: + // XXX consider finalizing, since the BT / mon lacks fsbase if (!in_kernel(hw_tf)) backtrace_user_ctx(current, current_ctx); else
diff --git a/kern/drivers/Kbuild b/kern/drivers/Kbuild index 86e9b80..1d8588c 100644 --- a/kern/drivers/Kbuild +++ b/kern/drivers/Kbuild
@@ -1,3 +1,4 @@ +obj-y += fs/ obj-y += net/ obj-y += dev/ obj-y += timers/
diff --git a/kern/drivers/dev/gtfs.c b/kern/drivers/dev/gtfs.c index bd375a0..4c3771b 100644 --- a/kern/drivers/dev/gtfs.c +++ b/kern/drivers/dev/gtfs.c
@@ -28,6 +28,9 @@ struct kref users; }; +// XXX +static struct gtfs *recent; + /* Blob hanging off the fs_file->priv. The backend chans are only accessed, * (changed or used) with the corresponding fs_file qlock held. That's the * primary use of the qlock - we might be able to avoid qlocking with increfs @@ -752,6 +755,8 @@ nexterror(); } gtfs = kzmalloc(sizeof(struct gtfs), MEM_WAIT); +// XXX +recent = gtfs; /* This 'users' kref is the one that every distinct frontend chan has. * These come from attaches and successful, 'moving' walks. */ kref_init(>fs->users, gtfs_release, 1); @@ -766,6 +771,8 @@ tf_kref_put(tfs->root); tfs_destroy(tfs); kfree(gtfs); + // XXX + recent = NULL; nexterror(); } /* stores the ref for 'backend' inside tfs->root */ @@ -880,3 +887,51 @@ .mmap = tree_chan_mmap, .chan_ctl = gtfs_chan_ctl, }; + +// XXX +int xme(int op) +{ + if (!recent) + return -1; + switch (op) { + case 1: + printk("dumping GTFS (Qidpath, Ref)\n-----------------\n"); + __tfs_dump(&recent->tfs); + break; + case 3: + gtfs_free_memory(recent); + break; + default: + printk("give me an op\n"); + return -1; + } + return 0; +} + +// XXX +// +// would like some torture tests +// +//ktask changes: +// maybe better framework for ktasks and rendez. +// and ktask names +// also, would like a better kthread interface regarding the string. +// like an snprintf or something. +// +// either that, or just have the ktask code malloc and free its own +// copy of the name. so we don't have different behavior for each. +// probably this +// also, this kth might be on another core. same as the mgmt kth, +// since we don't have kthread affinity. +// right now, whoever does the wakeup hosts the core, right? +// maybe have ktasks head to core 0 or some affinity +// enum / #define for the gp_ktask_ctl states +// there's some element of post-and-poke to it, esp regarding rcu_barrier +// also, we now care if a ktask is running or not, more like +// schedulable entities. 
no one ever had a pointer to a ktask before +// for_each_mgmt_core, is_mgmt_core() +// dynamically changing this is harder? +// +// cache-align fields in rcu_state +// actually, probably should have a ktask struct, and a pointer hangs off +// rcu_state
diff --git a/kern/drivers/dev/kfs.c b/kern/drivers/dev/kfs.c index 3a5541e..e4032e3 100644 --- a/kern/drivers/dev/kfs.c +++ b/kern/drivers/dev/kfs.c
@@ -337,3 +337,220 @@ .mmap = tree_chan_mmap, .chan_ctl = kfs_chan_ctl, }; + + +// XXX misc TODO +// -------------------------------------------------- +// bash doesn't give us errstr... +// e.g. +// bash-4.3$ echo ffff >> /prog/goo +// bash: /prog/goo: Operation not permitted +// bash-4.3$ ash +// / $ echo ffff >> /prog/goo +// ash: can't create /prog/goo: devpermcheck(goo, 0644, 03102) failed +// that's a little weird. it was already created... could be an ash +// thing +// / $ write_to /prog/goo fff +// Can't open path: Operation not permitted, devpermcheck(goo, 0644, 03) +// failed +// +// a little better. +// why are the perms fucked? that was umask, and the owner is eve, but our +// username is nanwan or something. maybe nothing. but not eve. +// need umask 0002 or just 0, so we don't make a file 644 that we can't +// write +// +// bash when tabbing out cd, shows us all files, not just directories. +// not ash. they do the readdir, then stat everything +// some difference with stat, they can't tell it's (not) a dir? +// not sure - bash does the readdir, but doesn't do the stat right away. +// the function it is in (rl_filename_completion_function) doesn't seem to +// care about directories vs files. maybe it's not getting the right comp +// code? bash does do a stat, but only after printing the name +// rmdir doesn't do it either. also doesn't do it on busybox. +// +// +// our linux list.h could use some safety shit, like WRITE_ONCE. update to the +// most recent one, perhaps? +// +// hashing +// - consider storing the hash in the tf. might only be done twice +// - might be harder to resize, esp with RCU readers. might need a seq. +// - consider hashing on the parent QID too. +// - consider bucket locks +// - consider exclusivity checks on insert (or caller's responsibility) +// +// ns devtab func signature issue +// qbwrite and whatnot is ssize_t, and some cases can return -1. +// who calls that? +// how do we call devtab.bread (e.g. 
pipe) +// these funcs don't always throw +// ipbwrite just says it wrote it all. +// prob should review the functions like pipebread +// +// convD2M is unsigned int +// +// netifbread/bwrite/read/write +// +// have waserror() check for irq/trap depth +// +// +// XXX bigger shit +// +// +// how do we trigger the shrink of the cache? (memory pressure) +// - need to talk to the instance, e.g. versions of gtfs/tmpfs +// - walking the NS to find those chans is hard +// - having a CB where they register with the memory system might be better +// +// maybe related: some sort of chan op that returns an FD for a ctl FS +// imagine you want to get access to a control plane for a mounted +// device, such as a #gtfs. you want to fuck with various settings. +// +// how do you attach this? +// it probably doesn't speak 9p, so it'd be a bind +// but sort of like mnt, we had a path to a chan, then did chanctl, +// then the result of that is bound +// - we need something to attach. that chan_ctl can return an +// attachable chan to something else within the device? +// but then every op e.g. gtfs_write would need to know if it was +// talking to the real thing or something else +// maybe it'd be better to have an 'introspection' device, a different +// #peek or something. +// - this device takes a chan, like mount, as arguments for its attach +// and it has a small set of kobj/sysfs like ops that the peekee +// implements +// - just a device that knows about another device and can have custom +// hooks/commands/etc +// - though this might not work as well with 9p. issue is the +// interface between devices - if it's not 9p/devtab, then we're +// somewhat screwed +// say we had a chan flag, with tainting, e.g. CTL_PLANE or something +// we'll still never be able to have a device that supports this just +// have e.g. tree_chan_walk as its method. everything gets a layer. 
+// +// +// btw, chan_ctl's numbers are currently independent of fcntls, and there is no +// way to talk directly to chan_ctl (just like you can't call dev.create). not +// a problem yet, but if we want arbitrary chan_ctl, then we might change the +// numbers +// for instance, if i wanted to add a hokey chan_ctl for gtfs memory +// writeback or debugging. i can't access that from userspace. hence +// kfunc +// rel to the #peek device, chan_ctl might be the source for some blob +// pointer / hook. if userspace provides an FD, like mnt, then we'd need a +// way to get it. +// and the numbers for that are the CCTL_X, which are e.g. F_SETFL +// maybe. or maybe we have interposition layers, esp since F_GETFD is +// about the FD, not the chan. +// +// +// want a gtfs ktask that syncs or LRU frees on occasion? +// +// glibc uses the old convD2M and friends (also grep STATFIXLEN) +// +// RCU +// ^^^^^^^^^ +// +// better mmap infrastructure +// get rid of file_or_chan once we sort this out. right now, it has +// both chan and fs_file +// +// mmap notes +// +// newer note: +// we have foc_dev_mmap, but that doesn't pm_add_vmr. +// it could, but we also have pm_add_vmr when duplicating etc +// maybe that dev_mmap op ought to do both the pm_add_vmr and remove. +// call the op on both ends +// we'll need a counter for the number of dirtiable VMRs +// +// also, consider nesting / layering devices, even through the TFS. +// we might want to pass through to the block device/backend if it has an +// mmap op, since that could tell us the page-ish struct to use +// +// when we talk to 9ns, we want to handle things other than PMs. like +// arbitrary physical memory +// optional callback for unmapping (i.e. when the device wants to +// revoke the VMR connection, e.g. PM flusher) +// +// instead of PM, maybe something a little higher +// like the fs_file +// or the PM itself points to those pages. not quite a PM, in that +// it doesn't allocate pages. 
we just want to know where to point. +// +// tempted to have __vm_foc be an fs_file, though sometimes we need +// its absolute path (perf), which is a chan feature. +// +// what's the connection from VMR to file and from PM back to VMR? +// IIRC, PM has weak refs on the VMRs, VMRs have refs on file -> PM +// VMRs have refs on files/chan: the mmap lasts beyond the FD closing +// though it might not need to be the chan. could be fs_file +// depends on what the interface is - everything with chans and +// whatnot, multiplexed through a devtab[c->type].mmap op. +// 9p mmap op? probably not +// say you want to access a NIC on another machine +// 9p mnt - can you do that? it'll fake it with a mmap on +// the frontend, implemented with reads to the backend +// +// fs_file is doing some nasty things with usernames. everyone is eve, +// basically, and there's no real accounting. +// could change e.g. dir->uid to refcnts on struct user +// refcnting is a bit nasty, want something like 'users never go away' +// also need to interpet 9p's usernames. +// like lookup, given name, hook in +// need something for unknown users. eve? mount owner? +// also, sort out any other rules for the dir->strings. e.g. ext can be 0 +// +// missing chown +// that, and glibc set errno, but has an old errstr +// bash-4.3$ mv /prog/file /prog/f2 +// mv: can't preserve ownership of '/prog/f2': Function not implemented, could not find name f2, dev root +// +// +// XXX VM shit +// can we move all the PG_ flags out of struct page? +// we have PG_REMOVAL and PM_REMOVAL. ffs. +// PG_REMOVAL is used to communicate through the mem_walk +// callbacks +// PG_DIRTY is the response from the CB for a particular +// page too. so it's bidirectional +// there's a giant sem in there too, for load_page +// can we have the radix point to something other than a page? +// like some on-demand struct that has all the flags +// we'll need a way for vmr_for_each to communicate back to +// us. +// do we want a pml walk? 
slightly better than a +// foreach-pte_walk, since we don't have to go up and down. +// but the downside is we don't know the radix slot / PM info +// for a specific PTE. +// is there something we could pass that they can quickly +// find it? (rewalking the radix isn't 'quickly'). if so, +// we'd just do another PTE +// +// seems like we have two structures that are both radix +// trees: PMLs and pm_tree. would be nice to merge. can +// we walk them in sync? or use the same one? +// no to most, since a proc's KPT has many unrelated VMRs +// +// also, munmap is making a pass to mark not present +// anyways. (in regards to the for-each-pte-walk shit) +// +// maybe make all VMRs point to a "PM", even anon ones, instead of using +// the PTEs to track pages. +// - then replace all of it with the radixvm trees +// - and this thing can track whatever paddrs we're pointing to +// - PTEs become weak refs, unlike the weird shit mm does now +// - fs files or pms? (separate issues) +// - and to some extent, all of anon mem is really one giant PM, not N +// separate ones, and the VMRs are windows into that PM. +// - revisit locking the 'fs_file' and len check. anon won't have len. +// +// +// side note: whenever we free pages, they stay in the slab layers, so it's hard +// to tell we're actually freeing them + + // XXX + // + // install this, maybe (requires sqlite3) + // https://github.com/juntaki/gogtags
diff --git a/kern/drivers/fs/Kbuild b/kern/drivers/fs/Kbuild new file mode 100644 index 0000000..eb48bea --- /dev/null +++ b/kern/drivers/fs/Kbuild
@@ -0,0 +1 @@ +obj-y += mefs/
diff --git a/kern/drivers/fs/mefs/Kbuild b/kern/drivers/fs/mefs/Kbuild new file mode 100644 index 0000000..827aa96 --- /dev/null +++ b/kern/drivers/fs/mefs/Kbuild
@@ -0,0 +1,2 @@ +obj-y += block.o +obj-y += mefs.o
diff --git a/kern/drivers/fs/mefs/block.c b/kern/drivers/fs/mefs/block.c new file mode 100644 index 0000000..a9bcd54 --- /dev/null +++ b/kern/drivers/fs/mefs/block.c
@@ -0,0 +1,358 @@ +/* Copyright (c) 2016, 2018 Google Inc + * Barret Rhoden <brho@cs.berkeley.edu> + * See LICENSE for details. + * + * Memory Extent Filesystem block allocation + * + * We use a power-of-two list allocator, similar to the arena allocator. + * There's no xalloc, importing, qcaches, or anything like that. The superblock + * is analogous to the base arena: it must be self-sufficient. + * + * All of the structures are "on disk." In theory, that should change as often + * as a filesystem's disk structures change, which is rarely if ever. Once it + * is done. =) All values are in host-endian, and we operate directly on RAM. + * + * There's no protection for metadata corruption - if you crash in the middle of + * a tree-changing operation, you're out of luck. + * + * Unlike the arena allocator, we don't return a "void *", we actually return + * a pointer to the btag. All of our users (the rest of mefs) will put up with + * this layer of indirection. In exchange, we don't have to muck around with + * hash tables to find the btag when the segment is freed. + * + * This all assumes the caller manages synchronization (e.g. locks). + * + * This uses the BSD list macros, which technically is not guaranteed to not + * change. If someone wants to replace them with local versions that are bound + * to the filesystem's disk format, then be my guest. This doesn't use the + * rbtree code, though we probably could with the same justification for using + * the BSD list code. But it'd be a bit more of a pain to roll our own for + * that, and I doubt it is necessary. 
+ */ + +#include "mefs.h" +#include <err.h> + +static struct mefs_btag *__get_from_freelists(struct mefs_superblock *sb, + int list_idx); +static bool __account_alloc(struct mefs_superblock *sb, struct mefs_btag *bt, + size_t size, struct mefs_btag *new); + +/* Bootstrap from the free list */ +static void __ensure_some_btags(struct mefs_superblock *sb) +{ + struct mefs_btag *bt, *tags; + size_t nr_bts = MEFS_QUANTUM / sizeof(struct mefs_btag); + + if (!BSD_LIST_EMPTY(&sb->unused_btags)) + return; + bt = __get_from_freelists(sb, LOG2_UP(MEFS_QUANTUM)); + if (!bt) + error(ENOMEM, "Unable to get more BTs in mefs!"); + tags = (struct mefs_btag*)bt->start; + if (__account_alloc(sb, bt, MEFS_QUANTUM, &tags[0])) { + /* We used the tag[0]; we'll have to skip over it now. */ + tags++; + nr_bts--; + } + for (int i = 0; i < nr_bts; i++) + BSD_LIST_INSERT_HEAD(&sb->unused_btags, &tags[i], misc_link); +} + +static struct mefs_btag *__get_btag(struct mefs_superblock *sb) +{ + struct mefs_btag *bt; + + bt = BSD_LIST_FIRST(&sb->unused_btags); + assert(bt); + BSD_LIST_REMOVE(bt, misc_link); + return bt; +} + +static void __free_btag(struct mefs_superblock *sb, struct mefs_btag *bt) +{ + BSD_LIST_INSERT_HEAD(&sb->unused_btags, bt, misc_link); +} + +static void __track_free_seg(struct mefs_superblock *sb, struct mefs_btag *bt) +{ + int list_idx = LOG2_DOWN(bt->size); + + bt->status = MEFS_BTAG_FREE; + BSD_LIST_INSERT_HEAD(&sb->free_segs[list_idx], bt, misc_link); +} + +static void __untrack_free_seg(struct mefs_superblock *sb, struct mefs_btag *bt) +{ + BSD_LIST_REMOVE(bt, misc_link); +} + +/* This is a little slow, and is a consequence of not using a tree. However, + * the common case caller was when @bt was created from an old one, and is + * likely to be right after it. The old one is the @hint, which is where to + * start our search. 
*/ +static void __insert_btag(struct mefs_btag_list *list, struct mefs_btag *bt, + struct mefs_btag *hint) +{ + struct mefs_btag *i, *last = NULL; + bool hinted = false; + + BSD_LIST_FOREACH(i, list, all_link) { + if (!hinted && hint) { + i = hint; + hinted = true; + } + if (bt->start < i->start) { + BSD_LIST_INSERT_BEFORE(i, bt, all_link); + return; + } + if (bt->start == i->start) + panic("BT %p == i %p in list %p!", bt, i, list); + last = i; + } + if (last) + BSD_LIST_INSERT_AFTER(last, bt, all_link); + else + BSD_LIST_INSERT_HEAD(list, bt, all_link); +} + +/* Unlink the arena allocator, we don't track the segments on an allocated list. + * Our caller is the one that keeps track of it. */ +static void __track_alloc_seg(struct mefs_superblock *sb, struct mefs_btag *bt) +{ + bt->status = MEFS_BTAG_ALLOC; +} + +/* Helper: we decided we want to alloc part of @bt, which has been removed from + * its old list. We need @size units. The rest can go back. + * + * Takes @new, which we'll use if we need a new btag. If @new is NULL, we'll + * allocate one. If we used the caller's btag, we'll return TRUE. */ +static bool __account_alloc(struct mefs_superblock *sb, struct mefs_btag *bt, + size_t size, struct mefs_btag *new) +{ + bool ret = FALSE; + + assert(bt->status == MEFS_BTAG_FREE); + if (bt->size != size) { + assert(bt->size > size); + if (new) + ret = TRUE; + else + new = __get_btag(sb); + new->start = bt->start + size; + new->size = bt->size - size; + bt->size = size; + __track_free_seg(sb, new); + __insert_btag(&sb->all_segs, new, bt); + } + __track_alloc_seg(sb, bt); + sb->amt_alloc_segs += size; + return ret; +} + +static struct mefs_btag *__get_from_freelists(struct mefs_superblock *sb, + int list_idx) +{ + struct mefs_btag *ret = NULL; + + for (int i = list_idx; i < MEFS_NR_FREE_LISTS; i++) { + ret = BSD_LIST_FIRST(&sb->free_segs[i]); + if (ret) { + BSD_LIST_REMOVE(ret, misc_link); + break; + } + } + return ret; +} + +/* This uses the arena's "best fit" policy. 
You could imagine building a + * version that cares about alignment too, for e.g. huge pages. */ +struct mefs_btag *mefs_ext_alloc(struct mefs_superblock *sb, size_t size) +{ + int list_idx = LOG2_DOWN(size); + struct mefs_btag *bt_i, *best = NULL; + + if (!size) + error(EINVAL, "mefs_ext_alloc with 0 size!"); + __ensure_some_btags(sb); + size = ROUNDUP(size, MEFS_QUANTUM); + BSD_LIST_FOREACH(bt_i, &sb->free_segs[list_idx], misc_link) { + if (bt_i->size >= size) { + if (!best || (best->size > bt_i->size)) + best = bt_i; + } + } + if (best) + BSD_LIST_REMOVE(best, misc_link); + else + best = __get_from_freelists(sb, list_idx + 1); + if (!best) + error(ENOMEM, "couldn't find segment in mefs!"); + __account_alloc(sb, best, size, NULL); + return best; +} + +static bool __merge_right_to_left(struct mefs_superblock *sb, + struct mefs_btag *left, + struct mefs_btag *right) +{ + if (left->status != MEFS_BTAG_FREE) + return false; + if (right->status != MEFS_BTAG_FREE) + return false; + if (left->start + left->size == right->start) { + __untrack_free_seg(sb, left); + __untrack_free_seg(sb, right); + left->size += right->size; + __track_free_seg(sb, left); + BSD_LIST_REMOVE(right, all_link); + __free_btag(sb, right); + return true; + } + return false; +} + +static void __coalesce_free_seg(struct mefs_superblock *sb, + struct mefs_btag *bt) +{ + struct mefs_btag *bt_p, *bt_n; + + bt_n = BSD_LIST_NEXT(bt, all_link); + if (bt_n) + __merge_right_to_left(sb, bt, bt_n); + bt_p = BSD_LIST_PREV(bt, &sb->all_segs, mefs_btag, all_link); + if (bt_p) + __merge_right_to_left(sb, bt_p, bt); +} + +void mefs_ext_free(struct mefs_superblock *sb, struct mefs_btag *bt) +{ + void *to_free_addr = 0; + size_t to_free_sz = 0; + + sb->amt_alloc_segs -= bt->size; + __track_free_seg(sb, bt); + /* Don't use bt after this: */ + __coalesce_free_seg(sb, bt); + sb->amt_total_segs -= to_free_sz; +} + +/* Bump allocates space in the segment [seg_alloc, seg_alloc + seg_amt). 
+ * Returns the allocation address and updates the allocator's values by + * reference. Throws on error. */ +static void *bump_zalloc(size_t amt, size_t align, uintptr_t *seg_alloc, + size_t *seg_amt) +{ + size_t align_diff; + void *ret; + + align_diff = ALIGN(*seg_alloc, align) - *seg_alloc; + if (*seg_amt < amt + align_diff) + error(ENOMEM, "Out of space in mefs SB"); + *seg_amt -= align_diff; + *seg_alloc += align_diff; + ret = (void*)*seg_alloc; + *seg_alloc += amt; + *seg_amt -= amt; + memset(ret, 0, amt); + return ret; +} + +/* Creates a superblock and adds the memory segment. The SB will be at the + * beginning of the segment. */ +struct mefs_superblock *mefs_super_create(uintptr_t init_seg, size_t size) +{ + struct mefs_superblock *sb; + struct mefs_btag *bt; + uintptr_t seg_alloc = init_seg; + size_t seg_amt = size; + + sb = bump_zalloc(sizeof(*sb), __alignof__(*sb), &seg_alloc, &seg_amt); + memcpy(sb->magic, MEFS_MAGIC, sizeof(sb->magic)); + BSD_LIST_INIT(&sb->all_segs); + BSD_LIST_INIT(&sb->unused_btags); + for (int i = 0; i < MEFS_NR_FREE_LISTS; i++) + BSD_LIST_INIT(&sb->free_segs[i]); + + bt = bump_zalloc(sizeof(*bt), __alignof__(*bt), &seg_alloc, &seg_amt); + BSD_LIST_INSERT_HEAD(&sb->unused_btags, bt, misc_link); + + seg_alloc = ALIGN(seg_alloc, MEFS_QUANTUM); + seg_amt = ALIGN_DOWN(seg_amt, MEFS_QUANTUM); + + mefs_super_add(sb, seg_alloc, seg_amt); + + return sb; +} + +/* Ignoring size for now. Could use it for sanity checks. 
*/ +struct mefs_superblock *mefs_super_attach(uintptr_t init_seg, size_t size) +{ + struct mefs_superblock *sb; + + init_seg = ALIGN(init_seg, sizeof(*sb)); + sb = (struct mefs_superblock*)init_seg; + if (strcmp(sb->magic, MEFS_MAGIC)) + return NULL; + return sb; +} + +void mefs_super_add(struct mefs_superblock *sb, uintptr_t seg, size_t size) +{ + struct mefs_btag *bt; + + __ensure_some_btags(sb); + bt = __get_btag(sb); + bt->start = seg; + bt->size = size; + sb->amt_total_segs += size; + __track_free_seg(sb, bt); + __insert_btag(&sb->all_segs, bt, NULL); +} + +void mefs_super_destroy(struct mefs_superblock *sb) +{ + memset(sb->magic, 0xa, sizeof(sb->magic)); +} + +void mefs_super_dump(struct mefs_superblock *sb) +{ + struct mefs_btag *i; + + printk("All segs\n"); + BSD_LIST_FOREACH(i, &sb->all_segs, all_link) + printk("bt %p, start %p, +%lu, %s\n", i, i->start, i->size, + i->status == MEFS_BTAG_FREE ? "free" : "alloc"); +} + +#include <time.h> + +static inline void kb_wait(void) +{ + int i; + + for (i = 0; i < 0x10000; i++) { + if ((inb(0x64) & 0x02) == 0) + break; + udelay(2); + } +} + +void food() +{ + outb(0xcf9, 0x6); + printk("ACPI cf9 FAILED\n"); +} + +void foot() +{ + for (int i = 0; i < 10; i++) { + kb_wait(); + udelay(50); + outb(0x64, 0xfe); /* Pulse reset low */ + udelay(50); + } + + printk("KBD FAILED\n"); +}
diff --git a/kern/drivers/fs/mefs/mefs.c b/kern/drivers/fs/mefs/mefs.c new file mode 100644 index 0000000..83a23e3 --- /dev/null +++ b/kern/drivers/fs/mefs/mefs.c
/* Copyright (c) 2018 Google Inc
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * #mefs: Memory Extent Filesystem
 *
 * It's designed to run on memory segments, supporting a small number of files
 * whose sizes are bimodal - either small, or potentially very large.  Small
 * files are O(PGSIZE).  Large files are O(TB).
 *
 * We're not designing for persistence in the face of failures, hardcore
 * performance, or anything like that.  I'd like it to be simple, yet capable of
 * handling very large files.
 *
 * There's only one instance of mefs, similar to KFS and unlike tmpfs.  All
 * attaches get the same FS.
 */

#include <ns.h>
#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <tree_file.h>
#include <pmap.h>

#include "mefs.h"

struct dev mefs_devtab;

/* The singleton FS instance: the generic tree_filesystem must be the first
 * member (we cast back and forth between struct mefs * and struct
 * tree_filesystem *), plus the extent allocator's superblock and a counter
 * used to hand out unique qid paths. */
struct mefs {
	struct tree_filesystem		tfs;
	struct mefs_superblock		*sb;
	atomic_t			qid;
};

static struct mefs mefs[1];

/* Returns the next unique qid path for this FS (atomically incremented). */
static uint64_t mefs_get_qid_path(struct mefs *mefs)
{
	return atomic_fetch_and_add(&mefs->qid, 1);
}

static char *devname(void)
{
	return mefs_devtab.name;
}

static void mefs_tf_free(struct tree_file *tf)
{
	/* We have nothing special hanging off the TF */
}

static void mefs_tf_unlink(struct tree_file *parent, struct tree_file *child)
{
	/* This is the "+1 for existing" ref. */
	tf_kref_put(child);
}

/* Common TF setup: init the dir, assign a fresh qid, and take the "+1 for
 * existing" kref (see below). */
static void __mefs_tf_init(struct tree_file *tf, int dir_type, int dir_dev,
			   struct username *user, int perm)
{
	struct dir *dir = &tf->file.dir;

	fs_file_init_dir(&tf->file, dir_type, dir_dev, user, perm);
	/* The TFS pointer is really our struct mefs (tfs is its first member). */
	dir->qid.path = mefs_get_qid_path((struct mefs*)tf->tfs);
	dir->qid.vers = 0;
	/* This is the "+1 for existing" ref.  There is no backing store for the
	 * FS, such as a disk or 9p, so we can't get rid of a file until it is
	 * unlinked and decreffed.  Note that KFS doesn't use pruners or anything
	 * else. */
	__kref_get(&tf->kref, 1);
}

/* Note: If your TFS doesn't support symlinks, you need to error out */
static void mefs_tf_create(struct tree_file *parent, struct tree_file *child,
			   int perm)
{
	__mefs_tf_init(child, parent->file.dir.type, parent->file.dir.dev, &eve,
	               perm);
}

static void mefs_tf_rename(struct tree_file *tf, struct tree_file *old_parent,
                           struct tree_file *new_parent, const char *name,
                           int flags)
{
	/* We don't have a backend, so we don't need to do anything additional
	 * for rename. */
}

static bool mefs_tf_has_children(struct tree_file *parent)
{
	/* The tree_file parent list is complete and not merely a cache for a
	 * real backend. */
	return !list_empty(&parent->children);
}

struct tree_file_ops mefs_tf_ops = {
	.free = mefs_tf_free,
	.unlink = mefs_tf_unlink,
	.lookup = NULL,
	.create = mefs_tf_create,
	.rename = mefs_tf_rename,
	.has_children = mefs_tf_has_children,
};

/* Fills page with its contents from its backing store file.  For mefs (as for
 * KFS), that means we're creating or extending a file, and the contents are 0.
 * Note the page/offset might be beyond the current file length, based on the
 * current pagemap code. */
static int mefs_pm_readpage(struct page_map *pm, struct page *pg)
{
	memset(page2kva(pg), 0, PGSIZE);
	atomic_or(&pg->pg_flags, PG_UPTODATE);
	/* Pretend that we blocked while filling this page.  This catches a lot
	 * of bugs.  It does slightly slow down the kernel, but it's only when
	 * filling the page cache, and considering we are using a RAMFS, you
	 * shouldn't measure things that actually rely on KFS's performance. */
	kthread_usleep(1);
	return 0;
}

/* Meant to take the page from PM and flush to backing store.  There is no
 * backing store, so it's a no-op. */
static int mefs_pm_writepage(struct page_map *pm, struct page *pg)
{
	return 0;
}

/* No-op: nothing to release below the page cache (see the trailing notes on
 * hole-punching with extent-granularity storage). */
static void mefs_fs_punch_hole(struct fs_file *f, off64_t begin, off64_t end)
{
}

static bool mefs_fs_can_grow_to(struct fs_file *f, size_t len)
{
	/* TODO: implement some sort of memory limit */
	return true;
}

struct fs_file_ops mefs_fs_ops = {
	.readpage = mefs_pm_readpage,
	.writepage = mefs_pm_writepage,
	.punch_hole = mefs_fs_punch_hole,
	.can_grow_to = mefs_fs_can_grow_to,
};

/* Recovers our struct mefs from a chan (tfs is the first member of mefs). */
static struct mefs *chan_to_mefs(struct chan *c)
{
	struct tree_file *tf = chan_to_tree_file(c);

	return (struct mefs*)(tf->tfs);
}

/* Set by the arch's multiboot memory parsing (kern/arch/x86/page_alloc.c). */
extern physaddr_t mefs_start;
extern size_t mefs_size;

/* Devtab init: attach to (or create) the superblock in the reserved memory
 * segment, then bring up the tree_filesystem.  Throws are caught locally so a
 * bad segment doesn't take down boot. */
static void mefs_init(void)
{
	ERRSTACK(1);
	struct tree_filesystem *tfs = (struct tree_filesystem*)mefs;
	struct mefs_superblock *sb;

	if (waserror()) {
		printk("#mefs threw %s\n", current_errstr());
		poperror();
		return;
	}
	if (!mefs_start)
		error(ENOENT, "Couldn't find mefs_start, aborting");
	sb = mefs_super_attach(mefs_start, mefs_size);
	if (sb) {
		printk("Found existing mefs sb at %p, reconnecting.\n", sb);
	} else {
		sb = mefs_super_create(mefs_start, mefs_size);
		printk("Created new mefs sb at %p\n", sb);

		/* NOTE(review): these alloc/free calls look like temporary
		 * allocator smoke tests (they leak extents on purpose) -
		 * presumably to be removed before this lands.  Confirm. */
		mefs_ext_alloc(sb, PGSIZE << 0);
		mefs_ext_alloc(sb, PGSIZE << 0);
		void * x = mefs_ext_alloc(sb, PGSIZE << 10);
		mefs_ext_alloc(sb, PGSIZE << 5);
		mefs_ext_alloc(sb, PGSIZE << 1);
		mefs_ext_free(sb, x);
		mefs_ext_alloc(sb, PGSIZE << 7);
	}
	mefs_super_dump(sb);

	mefs->sb = sb;
	// XXX

	/* This gives us one ref on root, which we'll never drop. */
	tfs_init(tfs);
	tfs->tf_ops = mefs_tf_ops;
	tfs->fs_ops = mefs_fs_ops;

	// XXX
	/* This gives us an extra refcnt on tfs->root.  This is "+1 for
	 * existing."  It is decreffed during the purge CB.  The dir_type is
	 * our index in devtab. */
	__mefs_tf_init(tfs->root, &mefs_devtab - devtab, 0, &eve, DMDIR | 0777);
	poperror();
}

/* All attaches share the one FS: every chan starts at the same root. */
static struct chan *mefs_attach(char *spec)
{
	struct tree_filesystem *tfs = (struct tree_filesystem*)mefs;

	return tree_file_alloc_chan(tfs->root, &mefs_devtab, "#mefs");
}

/* CCTL_SYNC is a no-op (no backing store); everything else is rejected. */
static unsigned long mefs_chan_ctl(struct chan *c, int op, unsigned long a1,
				   unsigned long a2, unsigned long a3,
				   unsigned long a4)
{
	switch (op) {
	case CCTL_SYNC:
		return 0;
	default:
		error(EINVAL, "%s does not support %d", __func__, op);
	}
}

struct dev mefs_devtab __devtab = {
	.name = "mefs",
	.reset = devreset,
	.init = mefs_init,
	.shutdown = devshutdown,
	.attach = mefs_attach,
	.walk = tree_chan_walk,
	.stat = tree_chan_stat,
	.open = tree_chan_open,
	.create = tree_chan_create,
	.close = tree_chan_close,
	.read = tree_chan_read,
	.bread = devbread,
	.write = tree_chan_write,
	.bwrite = devbwrite,
	.remove = tree_chan_remove,
	.rename = tree_chan_rename,
	.wstat = tree_chan_wstat,
	.power = devpower,
	.chaninfo = devchaninfo,
	.mmap = tree_chan_mmap,
	.chan_ctl = mefs_chan_ctl,
};

/* Author's WIP notes (kept for context): */
// XXX
//
// syslinux or something didn't work - the segment was zeroed.
//	might need a kexec
//	device teardown? none of that shit was tested. (NICs)
//	k, it's a large ball.
//		need that ball to not be in the 'overwrite' spot
//		the new one defines the size of the overwrite spot too (elf
//		parse, etc)
//	need a chunk of code, running on its own protected page tables
//		need that to also not be in the overwrite spot
//		protected gdt too, and stack page. can disable irqs...
//		memcpy to the final location, jump to it.
//		basically the elf parser, similar to loadelf.c
//		ah, but can't use any external code either.
//	maybe kexec is a super-slim OS
//		actually, we can bundle it with the target OS image.
//		set up its PT in advance?
//			need to do it at runtime, since we need the paddr
//
//
// will want to destroy the super aggressively. or at least have commands for
// it, so that if we e.g. barcher a new kernel, we're not stuck with the bugs
//
// init is hokey. would like to grow and shrink, and need to sync btw the base
// arena, mefs, and whatever we do to communicate to our future self.
//	actually, mefs will describe itself
//	but the future self / multiboot memory detection is trickier
//	handing segments back is a little trickier (can make a yank function,
//	then arena add. though that fragments the space slightly)
//
// don't forget some way to sync, if necessary (since we don't sync on unmount)
//	btw, should unmount.c also sync?
//
// btw, for hole-punching, we might not be able to free the intermediate data
// easily. would need to break it up.
//	issue is that we don't have individual blocks - we have a large
//	structure. and the arena code won't take something that didn't have a
//	btag
diff --git a/kern/drivers/fs/mefs/mefs.h b/kern/drivers/fs/mefs/mefs.h new file mode 100644 index 0000000..465817c --- /dev/null +++ b/kern/drivers/fs/mefs/mefs.h
/* Copyright (c) 2018 Google Inc
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Internal interface between the mefs device (mefs.c) and its extent
 * allocator: boundary tags, the on-segment superblock, and the allocator API.
 */

#pragma once

#include <sys/types.h>
#include <ros/arch/mmu.h>
#include <sys/queue.h>

/* Values for mefs_btag.status */
#define MEFS_BTAG_FREE 1
#define MEFS_BTAG_ALLOC 2

/* Boundary tag describing one extent [start, start + size).  all_link is
 * rooted at all_segs in the SB.  misc_link is used for the unused_btags list
 * (btag cache) or the free seg list. */
struct mefs_btag {
	BSD_LIST_ENTRY(mefs_btag)	all_link;
	BSD_LIST_ENTRY(mefs_btag)	misc_link;
	uintptr_t			start;
	size_t				size;
	int				status;
};
BSD_LIST_HEAD(mefs_btag_list, mefs_btag);

/* 64 is the most powers of two we can express with 64 bits. */
#define MEFS_NR_FREE_LISTS 64
/* Minimum allocation granularity for extents. */
#define MEFS_QUANTUM PGSIZE
/* 7 chars + NUL, matching the 8-byte magic[] below. */
#define MEFS_MAGIC "MEFS001"

/* Lives at the start of the memory segment, so a later boot can re-attach.
 * all_segs is the sorted list of all btags that cover the memory space.  i.e.
 * not the unused btags, but all the btags for the allocated and free memory.
 * free_segs is segregated by power-of-two size class. */
struct mefs_superblock {
	char				magic[8];
	struct mefs_btag_list		all_segs;
	struct mefs_btag_list		unused_btags;
	struct mefs_btag_list		free_segs[MEFS_NR_FREE_LISTS];
	size_t				amt_total_segs;
	size_t				amt_alloc_segs;
};

/* Builds a fresh superblock at the start of [init_seg, init_seg + size). */
struct mefs_superblock *mefs_super_create(uintptr_t init_seg, size_t size);
/* Finds an existing superblock at init_seg; NULL if the magic is missing. */
struct mefs_superblock *mefs_super_attach(uintptr_t init_seg, size_t size);
/* Donates [seg, seg + size) to the superblock as free space. */
void mefs_super_add(struct mefs_superblock *sb, uintptr_t seg, size_t size);
/* Clobbers the magic so future attaches fail. */
void mefs_super_destroy(struct mefs_superblock *sb);
/* Debug dump of all boundary tags. */
void mefs_super_dump(struct mefs_superblock *sb);
/* Extent alloc/free, in multiples of MEFS_QUANTUM. */
struct mefs_btag *mefs_ext_alloc(struct mefs_superblock *sb, size_t size);
void mefs_ext_free(struct mefs_superblock *sb, struct mefs_btag *bt);
diff --git a/kern/src/arena.c b/kern/src/arena.c index 26a5991..050004f 100644 --- a/kern/src/arena.c +++ b/kern/src/arena.c
@@ -1060,6 +1060,9 @@ struct rb_node *rb_p, *rb_n; struct btag *bt_p, *bt_n; + // XXX this could merge more. if we succeed on a merge, then we might move + // to a higher list, and then be able to merge again with both left and + // right rb_n = rb_next(&bt->all_link); if (rb_n) { bt_n = container_of(rb_n, struct btag, all_link);
diff --git a/kern/src/ns/tree_file.c b/kern/src/ns/tree_file.c index 14e5f8d..b7c2dc5 100644 --- a/kern/src/ns/tree_file.c +++ b/kern/src/ns/tree_file.c
@@ -87,6 +87,11 @@ struct tree_filesystem *tfs = tf->tfs; tf->tfs->tf_ops.free(tf); + + // XXX + printk("FREEING %s\n", tree_file_to_name(tf)); + + if (tf->flags & TF_F_IS_ROOT) { assert(tfs->root == tf); assert(!parent);
diff --git a/kern/src/rcu.c b/kern/src/rcu.c index 23a8a6c..71c1d31 100644 --- a/kern/src/rcu.c +++ b/kern/src/rcu.c
@@ -91,7 +91,7 @@ extern int rcu_num_lvls; /* Controls whether we skip cores when we expedite, which forces tardy cores. */ -static bool rcu_debug_tardy; +static bool rcu_debug_tardy = true; /* Externed in rcu_tree_helper.c */ struct rcu_state rcu_state; @@ -610,3 +610,55 @@ rpi->booted = true; } } + +// XXX +struct bar { + int x; + struct rcu_head h; +}; + +void foo() +{ + rcu_dump_rcu_node_tree(&rcu_state); + printk("gp num %d, completed %d\n", rcu_state.gpnum, rcu_state.completed); + struct bar *bar = kmalloc(sizeof(struct bar), MEM_WAIT); + + kfree_rcu(bar, h); +} + + +static void increment(struct rcu_head *head) +{ + struct bar *b = container_of(head, struct bar, h); + + WRITE_ONCE(b->x, b->x + 1); +} + +static void __torture(uint32_t srcid, long a0, long a1, long a2) +{ + struct bar *bars = kzmalloc(sizeof(struct bar) * 1000, MEM_WAIT); + + #define NR_CBS 50 + + for (int i = 0; i < NR_CBS; i++) { + bars[i].x = i; + call_rcu(&bars[i].h, increment); + } + udelay(1000); + /* We know the CBs have not run yet, since this CPU hasn't had a QS */ + for (int i = 0; i < NR_CBS; i++) + assert(bars[i].x == i); + rcu_barrier(); /* might hurt. could imagine a local barrier */ + for (int i = 0; i < NR_CBS; i++) + assert(bars[i].x == i + 1); + kfree(bars); +} + +void torture(void) +{ + /* Most all of this time is spent on core 0 (mpstat) */ + for_each_core(i) { + for (int j = 0; j < 20; j++) + send_kernel_message(i, __torture, 0, 0, 0, KMSG_ROUTINE); + } +}
diff --git a/kern/src/rcu_tree_helper.c b/kern/src/rcu_tree_helper.c index 5c99032..3b36aa6 100644 --- a/kern/src/rcu_tree_helper.c +++ b/kern/src/rcu_tree_helper.c
@@ -46,7 +46,7 @@ /* Number of cores RCU thinks exist. Set to 0 or nothing to use 'num_cores'. * The extra cores are called 'fake cores' in rcu.c, and are used for testing * the tree. */ -int rcu_num_cores; +int rcu_num_cores = 78; /* in rcu.c */ extern struct rcu_state rcu_state;