kern/drivers/fs/mefs/mefs.c - upstream - Git at Google

 /* Copyright (c) 2018 Google Inc
  * Barret Rhoden <brho@cs.berkeley.edu>
  * See LICENSE for details.
  *
  * #mefs: Memory Extent Filesystem
  *
  * It's designed to run on memory segments, supporting a small number of files
  * whose sizes are bimodal - either small, or potentially very large.  Small
  * files are O(PGSIZE).  Large files are O(TB).
  *
  * We're not designing for persistence in the face of failures, hardcore
  * performance, or anything like that.  I'd like it to be simple, yet capable of
  * handling very large files.
  *
  * There's only one instance of mefs, similar to KFS and unlike tmpfs.  All
  * attaches get the same FS.
  */

 #include <ns.h>
 #include <kmalloc.h>
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
 #include <error.h>
 #include <tree_file.h>
 #include <pmap.h>

 #include "mefs.h"

 struct dev mefs_devtab;

 struct mefs {
 	struct tree_filesystem		tfs;
 	struct mefs_superblock		*sb;


 	atomic_t					qid;
 };

 static struct mefs mefs[1];

 static uint64_t mefs_get_qid_path(struct mefs *mefs)
 {
 	return atomic_fetch_and_add(&mefs->qid, 1);
 }

 static char *devname(void)
 {
 	return mefs_devtab.name;
 }

 static void mefs_tf_free(struct tree_file *tf)
 {
 	/* We have nothing special hanging off the TF */
 }

 static void mefs_tf_unlink(struct tree_file *parent, struct tree_file *child)
 {
 	/* This is the "+1 for existing" ref. */
 	tf_kref_put(child);
 }

 static void __mefs_tf_init(struct tree_file *tf, int dir_type, int dir_dev,
                             struct username *user, int perm)
 {
 	struct dir *dir = &tf->file.dir;

 	fs_file_init_dir(&tf->file, dir_type, dir_dev, user, perm);
 	dir->qid.path = mefs_get_qid_path((struct mefs*)tf->tfs);
 	dir->qid.vers = 0;
 	/* This is the "+1 for existing" ref.  There is no backing store for the FS,
 	 * such as a disk or 9p, so we can't get rid of a file until it is unlinked
 	 * and decreffed.  Note that KFS doesn't use pruners or anything else. */
 	__kref_get(&tf->kref, 1);
 }

 /* Note: If your TFS doesn't support symlinks, you need to error out */
 static void mefs_tf_create(struct tree_file *parent, struct tree_file *child,
                             int perm)
 {
 	__mefs_tf_init(child, parent->file.dir.type, parent->file.dir.dev, &eve,
 	                perm);
 }

 static void mefs_tf_rename(struct tree_file *tf, struct tree_file *old_parent,
                             struct tree_file *new_parent, const char *name,
                             int flags)
 {
 	/* We don't have a backend, so we don't need to do anything additional for
 	 * rename. */
 }

 static bool mefs_tf_has_children(struct tree_file *parent)
 {
 	/* The tree_file parent list is complete and not merely a cache for a real
 	 * backend. */
 	return !list_empty(&parent->children);
 }

 struct tree_file_ops mefs_tf_ops = {
 	.free = mefs_tf_free,
 	.unlink = mefs_tf_unlink,
 	.lookup = NULL,
 	.create = mefs_tf_create,
 	.rename = mefs_tf_rename,
 	.has_children = mefs_tf_has_children,
 };

 /* Fills page with its contents from its backing store file.  For KFS, that
  * means we're creating or extending a file, and the contents are 0.  Note the
  * page/offset might be beyond the current file length, based on the current
  * pagemap code. */
 static int mefs_pm_readpage(struct page_map *pm, struct page *pg)
 {
 	memset(page2kva(pg), 0, PGSIZE);
 	atomic_or(&pg->pg_flags, PG_UPTODATE);
 	/* Pretend that we blocked while filing this page.  This catches a lot of
 	 * bugs.  It does slightly slow down the kernel, but it's only when filling
 	 * the page cache, and considering we are using a RAMFS, you shouldn't
 	 * measure things that actually rely on KFS's performance. */
 	kthread_usleep(1);
 	return 0;
 }

 /* Meant to take the page from PM and flush to backing store.  There is no
  * backing store. */
 static int mefs_pm_writepage(struct page_map *pm, struct page *pg)
 {
 	return 0;
 }

 static void mefs_fs_punch_hole(struct fs_file *f, off64_t begin, off64_t end)
 {
 }

 static bool mefs_fs_can_grow_to(struct fs_file *f, size_t len)
 {
 	/* TODO: implement some sort of memory limit */
 	return true;
 }

 struct fs_file_ops mefs_fs_ops = {
 	.readpage = mefs_pm_readpage,
 	.writepage = mefs_pm_writepage,
 	.punch_hole = mefs_fs_punch_hole,
 	.can_grow_to = mefs_fs_can_grow_to,
 };

 static struct mefs *chan_to_mefs(struct chan *c)
 {
 	struct tree_file *tf = chan_to_tree_file(c);

 	return (struct mefs*)(tf->tfs);
 }

 extern physaddr_t mefs_start;
 extern size_t mefs_size;

 static void mefs_init(void)
 {
 	ERRSTACK(1);
 	struct tree_filesystem *tfs = (struct tree_filesystem*)mefs;
 	struct mefs_superblock *sb;

 	if (waserror()) {
 		printk("#mefs threw %s\n", current_errstr());
 		poperror();
 		return;
 	}
 	if (!mefs_start)
 		error(ENOENT, "Couldn't find mefs_start, aborting");
 	sb = mefs_super_attach(mefs_start, mefs_size);
 	if (sb) {
 		printk("Found existing mefs sb at %p, reconnecting.\n", sb);
 	} else {
 		sb = mefs_super_create(mefs_start, mefs_size);
 		printk("Created new mefs sb at %p\n", sb);

 		mefs_ext_alloc(sb, PGSIZE << 0);
 		mefs_ext_alloc(sb, PGSIZE << 0);
 		void * x = mefs_ext_alloc(sb, PGSIZE << 10);
 		mefs_ext_alloc(sb, PGSIZE << 5);
 		mefs_ext_alloc(sb, PGSIZE << 1);
 		mefs_ext_free(sb, x);
 		mefs_ext_alloc(sb, PGSIZE << 7);
 	}
 	mefs_super_dump(sb);

 	mefs->sb = sb;
 // XXX


 	/* This gives us one ref on root, which we'll never drop. */
 	tfs_init(tfs);
 	tfs->tf_ops = mefs_tf_ops;
 	tfs->fs_ops = mefs_fs_ops;

 	// XXX
 	/* This gives us an extra refcnt on tfs->root.  This is "+1 for existing."
 	 * It is decreffed during the purge CB. */
 	__mefs_tf_init(tfs->root, &mefs_devtab - devtab, 0, &eve, DMDIR | 0777);
 	poperror();
 }

 static struct chan *mefs_attach(char *spec)
 {
 	struct tree_filesystem *tfs = (struct tree_filesystem*)mefs;

 	return tree_file_alloc_chan(tfs->root, &mefs_devtab, "#mefs");
 }

 static unsigned long mefs_chan_ctl(struct chan *c, int op, unsigned long a1,
                                     unsigned long a2, unsigned long a3,
                                     unsigned long a4)
 {
 	switch (op) {
 	case CCTL_SYNC:
 		return 0;
 	default:
 		error(EINVAL, "%s does not support %d", __func__, op);
 	}
 }

 struct dev mefs_devtab __devtab = {
 	.name = "mefs",
 	.reset = devreset,
 	.init = mefs_init,
 	.shutdown = devshutdown,
 	.attach = mefs_attach,
 	.walk = tree_chan_walk,
 	.stat = tree_chan_stat,
 	.open = tree_chan_open,
 	.create = tree_chan_create,
 	.close = tree_chan_close,
 	.read = tree_chan_read,
 	.bread = devbread,
 	.write = tree_chan_write,
 	.bwrite = devbwrite,
 	.remove = tree_chan_remove,
 	.rename = tree_chan_rename,
 	.wstat = tree_chan_wstat,
 	.power = devpower,
 	.chaninfo = devchaninfo,
 	.mmap = tree_chan_mmap,
 	.chan_ctl = mefs_chan_ctl,
 };


 // 	XXX
 //
 //	syslinux or something didn't work - the segment was zeroed.
 //			might need a kexec
 //				device teardown?  none of that shit was tested. (NICs)
 //			k, it's a large ball.
 //				need that ball to not be in the 'overwrite' spot
 //				the new one defines the size of the overwrite spot too (elf
 //				parse, etc)
 //			need a chunk of code, running on its own protected page tables
 //				need that to also not be in the overwrite spot
 //			protected gdt too, and stack page.  can disable irqs...
 //			memcpy to the final location, jump to it.
 //				basically the elf parser, similar to loadelf.c
 //				ah, but can't use any external code either.
 //			maybe kexec is a super-slim OS
 //				actually, we can bundle it with the target OS image.
 //				set up its PT in advance?
 //					need to do it at runtime, since we need the paddr
 //
 //
 //
 //	will want to destroy the super aggressively.  or at least have commands for
 //	it, so that if we e.g. barcher a new kernel, we're not stuck with the bugs
 //
 //	init is hokey.  would like to grow and shrink, and need to sync btw the base
 //	arena, mefs, and whatever we do to communicate to our future self.
 //		actually, mefs will describe itself
 //		but the future self / multiboot memory detection is trickier
 //		handing segments back is a little trickier (can make a yank function,
 //		then arena add.  though that fragments the space slightly)
 //
 //
 // don't forget some way to sync, if necessary (since we don't sync on unmount)
 // 		btw, should unmount.c also sync?
 //
 //
 //
 // 	btw, for hole-punching, we might not be able to free the intermediate data
 // 	easily.  would need to break it up.
 // 		issue is that we don't have individual blocks - we have a large
 // 		structure.  and the arena code won't take something that didn't have a
 // 		btag
	/* Copyright (c) 2018 Google Inc
	* Barret Rhoden <brho@cs.berkeley.edu>
	* See LICENSE for details.
	*
	* #mefs: Memory Extent Filesystem
	*
	* It's designed to run on memory segments, supporting a small number of files
	* whose sizes are bimodal - either small, or potentially very large. Small
	* files are O(PGSIZE). Large files are O(TB).
	*
	* We're not designing for persistence in the face of failures, hardcore
	* performance, or anything like that. I'd like it to be simple, yet capable of
	* handling very large files.
	*
	* There's only one instance of mefs, similar to KFS and unlike tmpfs. All
	* attaches get the same FS.
	*/

	#include <ns.h>
	#include <kmalloc.h>
	#include <string.h>
	#include <stdio.h>
	#include <assert.h>
	#include <error.h>
	#include <tree_file.h>
	#include <pmap.h>

	#include "mefs.h"

	struct dev mefs_devtab;

	struct mefs {
	struct tree_filesystem tfs;
	struct mefs_superblock *sb;



	atomic_t qid;
	};

	static struct mefs mefs[1];

	static uint64_t mefs_get_qid_path(struct mefs *mefs)
	{
	return atomic_fetch_and_add(&mefs->qid, 1);
	}

	static char *devname(void)
	{
	return mefs_devtab.name;
	}

	static void mefs_tf_free(struct tree_file *tf)
	{
	/* We have nothing special hanging off the TF */
	}

	static void mefs_tf_unlink(struct tree_file parent, struct tree_file child)
	{
	/* This is the "+1 for existing" ref. */
	tf_kref_put(child);
	}

	static void __mefs_tf_init(struct tree_file *tf, int dir_type, int dir_dev,
	struct username *user, int perm)
	{
	struct dir *dir = &tf->file.dir;

	fs_file_init_dir(&tf->file, dir_type, dir_dev, user, perm);
	dir->qid.path = mefs_get_qid_path((struct mefs*)tf->tfs);
	dir->qid.vers = 0;
	/* This is the "+1 for existing" ref. There is no backing store for the FS,
	* such as a disk or 9p, so we can't get rid of a file until it is unlinked
	* and decreffed. Note that KFS doesn't use pruners or anything else. */
	__kref_get(&tf->kref, 1);
	}

	/* Note: If your TFS doesn't support symlinks, you need to error out */
	static void mefs_tf_create(struct tree_file parent, struct tree_file child,
	int perm)
	{
	__mefs_tf_init(child, parent->file.dir.type, parent->file.dir.dev, &eve,
	perm);
	}

	static void mefs_tf_rename(struct tree_file tf, struct tree_file old_parent,
	struct tree_file new_parent, const char name,
	int flags)
	{
	/* We don't have a backend, so we don't need to do anything additional for
	* rename. */
	}

	static bool mefs_tf_has_children(struct tree_file *parent)
	{
	/* The tree_file parent list is complete and not merely a cache for a real
	* backend. */
	return !list_empty(&parent->children);
	}

	struct tree_file_ops mefs_tf_ops = {
	.free = mefs_tf_free,
	.unlink = mefs_tf_unlink,
	.lookup = NULL,
	.create = mefs_tf_create,
	.rename = mefs_tf_rename,
	.has_children = mefs_tf_has_children,
	};

	/* Fills page with its contents from its backing store file. For KFS, that
	* means we're creating or extending a file, and the contents are 0. Note the
	* page/offset might be beyond the current file length, based on the current
	* pagemap code. */
	static int mefs_pm_readpage(struct page_map pm, struct page pg)
	{
	memset(page2kva(pg), 0, PGSIZE);
	atomic_or(&pg->pg_flags, PG_UPTODATE);
	/* Pretend that we blocked while filing this page. This catches a lot of
	* bugs. It does slightly slow down the kernel, but it's only when filling
	* the page cache, and considering we are using a RAMFS, you shouldn't
	* measure things that actually rely on KFS's performance. */
	kthread_usleep(1);
	return 0;
	}

	/* Meant to take the page from PM and flush to backing store. There is no
	* backing store. */
	static int mefs_pm_writepage(struct page_map pm, struct page pg)
	{
	return 0;
	}

	static void mefs_fs_punch_hole(struct fs_file *f, off64_t begin, off64_t end)
	{
	}

	static bool mefs_fs_can_grow_to(struct fs_file *f, size_t len)
	{
	/* TODO: implement some sort of memory limit */
	return true;
	}

	struct fs_file_ops mefs_fs_ops = {
	.readpage = mefs_pm_readpage,
	.writepage = mefs_pm_writepage,
	.punch_hole = mefs_fs_punch_hole,
	.can_grow_to = mefs_fs_can_grow_to,
	};

	static struct mefs chan_to_mefs(struct chan c)
	{
	struct tree_file *tf = chan_to_tree_file(c);

	return (struct mefs*)(tf->tfs);
	}

	extern physaddr_t mefs_start;
	extern size_t mefs_size;

	static void mefs_init(void)
	{
	ERRSTACK(1);
	struct tree_filesystem tfs = (struct tree_filesystem)mefs;
	struct mefs_superblock *sb;

	if (waserror()) {
	printk("#mefs threw %s\n", current_errstr());
	poperror();
	return;
	}
	if (!mefs_start)
	error(ENOENT, "Couldn't find mefs_start, aborting");
	sb = mefs_super_attach(mefs_start, mefs_size);
	if (sb) {
	printk("Found existing mefs sb at %p, reconnecting.\n", sb);
	} else {
	sb = mefs_super_create(mefs_start, mefs_size);
	printk("Created new mefs sb at %p\n", sb);

	mefs_ext_alloc(sb, PGSIZE << 0);
	mefs_ext_alloc(sb, PGSIZE << 0);
	void * x = mefs_ext_alloc(sb, PGSIZE << 10);
	mefs_ext_alloc(sb, PGSIZE << 5);
	mefs_ext_alloc(sb, PGSIZE << 1);
	mefs_ext_free(sb, x);
	mefs_ext_alloc(sb, PGSIZE << 7);
	}
	mefs_super_dump(sb);

	mefs->sb = sb;
	// XXX


	/* This gives us one ref on root, which we'll never drop. */
	tfs_init(tfs);
	tfs->tf_ops = mefs_tf_ops;
	tfs->fs_ops = mefs_fs_ops;

	// XXX
	/* This gives us an extra refcnt on tfs->root. This is "+1 for existing."
	* It is decreffed during the purge CB. */
	__mefs_tf_init(tfs->root, &mefs_devtab - devtab, 0, &eve, DMDIR \| 0777);
	poperror();
	}

	static struct chan mefs_attach(char spec)
	{
	struct tree_filesystem tfs = (struct tree_filesystem)mefs;

	return tree_file_alloc_chan(tfs->root, &mefs_devtab, "#mefs");
	}

	static unsigned long mefs_chan_ctl(struct chan *c, int op, unsigned long a1,
	unsigned long a2, unsigned long a3,
	unsigned long a4)
	{
	switch (op) {
	case CCTL_SYNC:
	return 0;
	default:
	error(EINVAL, "%s does not support %d", __func__, op);
	}
	}

	struct dev mefs_devtab __devtab = {
	.name = "mefs",
	.reset = devreset,
	.init = mefs_init,
	.shutdown = devshutdown,
	.attach = mefs_attach,
	.walk = tree_chan_walk,
	.stat = tree_chan_stat,
	.open = tree_chan_open,
	.create = tree_chan_create,
	.close = tree_chan_close,
	.read = tree_chan_read,
	.bread = devbread,
	.write = tree_chan_write,
	.bwrite = devbwrite,
	.remove = tree_chan_remove,
	.rename = tree_chan_rename,
	.wstat = tree_chan_wstat,
	.power = devpower,
	.chaninfo = devchaninfo,
	.mmap = tree_chan_mmap,
	.chan_ctl = mefs_chan_ctl,
	};


	// XXX
	//
	// syslinux or something didn't work - the segment was zeroed.
	// might need a kexec
	// device teardown? none of that shit was tested. (NICs)
	// k, it's a large ball.
	// need that ball to not be in the 'overwrite' spot
	// the new one defines the size of the overwrite spot too (elf
	// parse, etc)
	// need a chunk of code, running on its own protected page tables
	// need that to also not be in the overwrite spot
	// protected gdt too, and stack page. can disable irqs...
	// memcpy to the final location, jump to it.
	// basically the elf parser, similar to loadelf.c
	// ah, but can't use any external code either.
	// maybe kexec is a super-slim OS
	// actually, we can bundle it with the target OS image.
	// set up its PT in advance?
	// need to do it at runtime, since we need the paddr
	//
	//
	//
	// will want to destroy the super aggressively. or at least have commands for
	// it, so that if we e.g. barcher a new kernel, we're not stuck with the bugs
	//
	// init is hokey. would like to grow and shrink, and need to sync btw the base
	// arena, mefs, and whatever we do to communicate to our future self.
	// actually, mefs will describe itself
	// but the future self / multiboot memory detection is trickier
	// handing segments back is a little trickier (can make a yank function,
	// then arena add. though that fragments the space slightly)
	//
	//
	// don't forget some way to sync, if necessary (since we don't sync on unmount)
	// btw, should unmount.c also sync?
	//
	//
	//
	// btw, for hole-punching, we might not be able to free the intermediate data
	// easily. would need to break it up.
	// issue is that we don't have individual blocks - we have a large
	// structure. and the arena code won't take something that didn't have a
	// btag