blob: e4032e36304271cf5573b3fd252d6b7d6e2e4ae7 [file] [log] [blame]
/* Copyright (c) 2018 Google Inc
* Barret Rhoden <brho@cs.berkeley.edu>
* See LICENSE for details.
*
* #kfs, in-memory ram filesystem, pulling from the kernel's embedded CPIO
*/
#include <ns.h>
#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <tree_file.h>
#include <pmap.h>
#include <cpio.h>
/* Tentative definition, so that code appearing before the full initializer at
 * the bottom of this file (e.g. devname() and kfs_init()) can refer to the
 * device. */
struct dev kfs_devtab;
/* The single, global KFS instance. */
struct kfs {
/* Generic tree-filesystem state.  kfs_init() sets it up with tfs_init() and
 * installs the KFS op tables on it. */
struct tree_filesystem tfs;
/* Counter that kfs_get_qid_path() atomically bumps by one per call to hand
 * out qid paths. */
atomic_t qid;
} kfs;
/* Produces the qid path for a new KFS file by atomically adding one to the
 * filesystem-wide counter.  The result is whatever atomic_fetch_and_add()
 * yields (going by its name, the counter's value from before the add). */
static uint64_t kfs_get_qid_path(void)
{
	uint64_t path = atomic_fetch_and_add(&kfs.qid, 1);

	return path;
}
static char *devname(void)
{
return kfs_devtab.name;
}
/* tree_file 'free' hook.  KFS attaches no private state to a tree_file, so
 * there is nothing to release here. */
static void kfs_tf_free(struct tree_file *tf)
{
	(void)tf;
}
/* tree_file 'unlink' hook.  Every KFS file carries an extra "+1 for
 * existing" reference, taken in __kfs_tf_init(); unlinking is where that
 * reference is dropped.  The parent is not needed. */
static void kfs_tf_unlink(struct tree_file *parent, struct tree_file *child)
{
	(void)parent;
	tf_kref_put(child);
}
/* Common initialization for a KFS tree_file: sets up the embedded fs_file's
 * dir with the given type, dev, owner and permissions, assigns a qid, and
 * takes the "+1 for existing" reference. */
static void __kfs_tf_init(struct tree_file *tf, int dir_type, int dir_dev,
                          struct username *user, int perm)
{
	fs_file_init_dir(&tf->file, dir_type, dir_dev, user, perm);
	/* The qid path comes from the KFS counter; the version starts at 0. */
	tf->file.dir.qid.path = kfs_get_qid_path();
	tf->file.dir.qid.vers = 0;
	/* "+1 for existing": KFS has no backing store (no disk, no 9p) to
	 * refetch a file from, so a file must stay around until it has been
	 * unlinked and decreffed.  KFS uses no pruners or anything similar. */
	__kref_get(&tf->kref, 1);
}
/* Note: If your TFS doesn't support symlinks, you need to error out */
static void kfs_tf_create(struct tree_file *parent, struct tree_file *child,
int perm)
{
__kfs_tf_init(child, parent->file.dir.type, parent->file.dir.dev, &eve,
perm);
}
/* tree_file 'rename' hook.  With no backend to keep in sync, KFS has no
 * extra work to do for a rename. */
static void kfs_tf_rename(struct tree_file *tf, struct tree_file *old_parent,
                          struct tree_file *new_parent, const char *name,
                          int flags)
{
	(void)tf;
	(void)old_parent;
	(void)new_parent;
	(void)name;
	(void)flags;
}
/* tree_file 'has_children' hook.  For KFS the parent's child list is the
 * complete truth (it is not just a cache of some real backend), so checking
 * whether that list is empty is sufficient. */
static bool kfs_tf_has_children(struct tree_file *parent)
{
	bool childless = list_empty(&parent->children);

	return !childless;
}
/* Tree-file callbacks for KFS.  kfs_init() copies this table into
 * kfs.tfs.tf_ops. */
struct tree_file_ops kfs_tf_ops = {
.free = kfs_tf_free,
.unlink = kfs_tf_unlink,
/* No lookup op.  NOTE(review): presumably because there is no backend to
 * consult beyond the in-memory tree - confirm against the tree_file code. */
.lookup = NULL,
.create = kfs_tf_create,
.rename = kfs_tf_rename,
.has_children = kfs_tf_has_children,
};
/* Page-map 'readpage' op: fill a page from the backing store.  KFS has none,
 * so this only happens when a file is being created or extended, and the
 * page's contents are zero.  Based on the current pagemap code, the
 * page/offset might be beyond the current file length.
 *
 * Always returns 0. */
static int kfs_pm_readpage(struct page_map *pm, struct page *pg)
{
	void *kva = page2kva(pg);

	memset(kva, 0, PGSIZE);
	atomic_or(&pg->pg_flags, PG_UPTODATE);
	/* Pretend that we blocked while filling this page, which catches a lot
	 * of bugs.  It slows the kernel down slightly, but only when filling
	 * the page cache - and since this is a RAMFS, you shouldn't be
	 * measuring anything that relies on KFS's performance anyway. */
	kthread_usleep(1);
	return 0;
}
/* Page-map 'writepage' op: meant to flush a page from the PM to the backing
 * store.  KFS has no backing store, so this is a no-op that reports
 * success (0). */
static int kfs_pm_writepage(struct page_map *pm, struct page *pg)
{
	(void)pm;
	(void)pg;
	return 0;
}
/* fs_file 'punch_hole' op.  KFS does nothing here. */
static void kfs_fs_punch_hole(struct fs_file *f, off64_t begin, off64_t end)
{
	(void)f;
	(void)begin;
	(void)end;
}
/* fs_file 'can_grow_to' op.  KFS currently allows a file to grow to any
 * length.
 *
 * TODO: implement some sort of memory limit */
static bool kfs_fs_can_grow_to(struct fs_file *f, size_t len)
{
	(void)f;
	(void)len;
	return true;
}
/* fs_file callbacks for KFS.  kfs_init() copies this table into
 * kfs.tfs.fs_ops.  All four ops are defined in this file. */
struct fs_file_ops kfs_fs_ops = {
.readpage = kfs_pm_readpage,
.writepage = kfs_pm_writepage,
.punch_hole = kfs_fs_punch_hole,
.can_grow_to = kfs_fs_can_grow_to,
};
/* Creates the directory 'path', relative to 'root', using the mode bits from
 * the CPIO header.
 *
 * Consumes root's chan, even on error.  Returns the new directory's chan, or
 * NULL (after a warning) if the create threw. */
static struct chan *__add_kfs_dir(struct chan *root, char *path,
                                  struct cpio_bin_hdr *c_bhdr)
{
	ERRSTACK(1);
	struct chan *dir_chan;

	if (waserror()) {
		warn("failed to add %s", path);
		cclose(root);
		poperror();
		return NULL;
	}
	dir_chan = namec_from(root, path, Acreate, O_EXCL,
	                      DMDIR | c_bhdr->c_mode, NULL);
	poperror();
	return dir_chan;
}
/* Creates the symlink 'path', relative to 'root'.  The link target is the
 * CPIO entry's payload, which is copied into a local, NUL-terminated buffer
 * first.  The link is created with rwx permissions for user, group, and
 * other.
 *
 * On error, root's chan is closed, a warning is printed, and NULL is
 * returned.  Otherwise returns the new symlink's chan. */
static struct chan *__add_kfs_symlink(struct chan *root, char *path,
                                      struct cpio_bin_hdr *c_bhdr)
{
	ERRSTACK(1);
	struct chan *link;
	/* One extra byte for the terminator added below. */
	char dest[c_bhdr->c_filesize + 1];

	if (waserror()) {
		warn("failed to add %s", path);
		cclose(root);
		poperror();
		return NULL;
	}
	strncpy(dest, c_bhdr->c_filestart, c_bhdr->c_filesize);
	dest[c_bhdr->c_filesize] = 0;
	link = namec_from(root, path, Acreate, O_EXCL,
	                  DMSYMLINK | S_IRWXU | S_IRWXG | S_IRWXO, dest);
	poperror();
	return link;
}
/* Creates the regular file 'path', relative to 'root', with the mode from the
 * CPIO header, then writes the CPIO entry's payload into it through the
 * device's write op.
 *
 * If the create throws, root's chan is closed here.  If filling the file
 * fails (the write throws, or makes no progress), the new file's chan is
 * closed.  Either way a warning is printed and NULL is returned.  On success,
 * returns the new file's chan, which the caller is expected to close. */
static struct chan *__add_kfs_file(struct chan *root, char *path,
                                   struct cpio_bin_hdr *c_bhdr)
{
	ERRSTACK(1);
	struct chan *c;
	off64_t offset = 0;
	size_t ret, amt = c_bhdr->c_filesize;
	void *buf = c_bhdr->c_filestart;

	if (waserror()) {
		warn("failed to add %s", path);
		cclose(root);
		poperror();
		return NULL;
	}
	c = namec_from(root, path, Acreate, O_EXCL | O_RDWR, c_bhdr->c_mode, NULL);
	poperror();
	if (waserror()) {
		warn("failed to modify %s", path);
		cclose(c);
		poperror();
		return NULL;
	}
	/* Writes may be short; keep going until the whole payload is in. */
	while (amt) {
		ret = devtab[c->type].write(c, buf + offset, amt, offset);
		if (!ret) {
			/* A write that accepts nothing would make this loop spin
			 * forever.  Treat it like any other failure to fill the file.
			 * Pop our handler first so it cannot run a second cclose. */
			poperror();
			warn("failed to modify %s", path);
			cclose(c);
			return NULL;
		}
		amt -= ret;
		offset += ret;
	}
	poperror();
	return c;
}
static int add_kfs_entry(struct cpio_bin_hdr *c_bhdr, void *cb_arg)
{
struct tree_file *root = cb_arg;
char *path = c_bhdr->c_filename;
struct chan *c;
struct tree_file *tf;
struct timespec ts;
/* Root of the FS, already part of KFS */
if (!strcmp(path, "."))
return 0;
c = tree_file_alloc_chan(root, &kfs_devtab, "#kfs");
switch (c_bhdr->c_mode & CPIO_FILE_MASK) {
case (CPIO_DIRECTORY):
c = __add_kfs_dir(c, path, c_bhdr);
break;
case (CPIO_SYMLINK):
c = __add_kfs_symlink(c, path, c_bhdr);
break;
case (CPIO_REG_FILE):
c = __add_kfs_file(c, path, c_bhdr);
break;
default:
cclose(c);
warn("Unknown file type %d in the CPIO!",
c_bhdr->c_mode & CPIO_FILE_MASK);
return -1;
}
if (!c)
return -1;
tf = chan_to_tree_file(c);
ts.tv_sec = c_bhdr->c_mtime;
ts.tv_nsec = 0;
/* Lockless */
__set_acmtime_to(&tf->file, FSF_ATIME | FSF_BTIME | FSF_CTIME | FSF_MTIME,
&ts);
/* TODO: consider UID/GID. Right now, everything is owned by eve. */
cclose(c);
return 0;
}
/* Location and size of the CPIO archive embedded in the kernel image, as
 * filled in by kfs_get_cpio_info(). */
struct cpio_info {
/* Start address of the archive. */
void *base;
/* Size of the archive, in bytes. */
size_t sz;
};
/* Fills 'ci' with the location and size of the embedded initramfs CPIO.
 * Both values come from the _binary_obj_kern_initramfs_cpio_* symbols: the
 * start symbol's address is the base, and the size symbol's *address*, cast
 * to an integer, is the length. */
static void kfs_get_cpio_info(struct cpio_info *ci)
{
	extern uint8_t _binary_obj_kern_initramfs_cpio_start[];
	extern uint8_t _binary_obj_kern_initramfs_cpio_size[];
	void *start = (void*)_binary_obj_kern_initramfs_cpio_start;
	size_t len = (size_t)_binary_obj_kern_initramfs_cpio_size;

	ci->base = start;
	ci->sz = len;
}
/* Walks the CPIO archive described by 'ci', calling add_kfs_entry() on each
 * entry with the KFS root as the callback argument. */
static void kfs_extract_cpio(struct cpio_info *ci)
{
	struct tree_file *kfs_root = kfs.tfs.root;

	parse_cpio_entries(ci->base, ci->sz, add_kfs_entry, kfs_root);
}
/* Gives the memory that held the embedded CPIO archive to the base arena.
 *
 * The base arena requires page aligned, page sized segments, so the region
 * is trimmed inward: the start is rounded up to a page boundary and the
 * remaining length is rounded down to a whole number of pages.  If nothing
 * is left after trimming (an archive smaller than a page, or one that does
 * not cover a full aligned page), nothing is added. */
static void kfs_free_cpio(struct cpio_info *ci)
{
	void *base = ci->base;
	size_t sz = ci->sz;
	/* Bytes between the archive's start and the next page boundary. */
	size_t head_gap = ROUNDUP(base, PGSIZE) - base;

	/* Without this check, the subtraction below would wrap around and we
	 * would hand an enormous bogus region to the arena. */
	if (sz <= head_gap)
		return;
	sz = ROUNDDOWN(sz - head_gap, PGSIZE);
	if (!sz)
		return;
	base = ROUNDUP(base, PGSIZE);
	/* Careful - the CPIO is part of the kernel blob and a code address. */
	base = KBASEADDR(base);
	/* sz is a size_t; cast so the argument matches the conversion. */
	printk("Freeing %lu MB of CPIO RAM\n", (unsigned long)(sz >> 20));
	arena_add(base_arena, base, sz, MEM_WAIT);
}
/* Device init: builds the KFS tree, populates it from the embedded CPIO
 * archive, gives the archive's memory back, and sets kern_slash to a chan on
 * the KFS root. */
static void kfs_init(void)
{
	struct cpio_info cpio[1];
	struct tree_filesystem *tfs = &kfs.tfs;

	/* tfs_init() leaves us with one ref on tfs->root. */
	tfs_init(tfs);
	tfs->tf_ops = kfs_tf_ops;
	tfs->fs_ops = kfs_fs_ops;
	/* This adds the "+1 for existing" ref on tfs->root. */
	__kfs_tf_init(tfs->root, &kfs_devtab - devtab, 0, &eve, DMDIR | 0777);
	/* Other devices might want to create things like kthreads that run the
	 * LRU pruner or PM sweeper at this point.  KFS just unpacks the CPIO. */
	kfs_get_cpio_info(cpio);
	kfs_extract_cpio(cpio);
	kfs_free_cpio(cpio);
	/* The chan holds another kref on the root.  Note that each attach gets
	 * a ref and each new process gets a ref. */
	kern_slash = tree_file_alloc_chan(tfs->root, &kfs_devtab, "/");
}
/* Device attach: returns a new chan, named "#kfs", on the KFS root.  The
 * root tree_file gets a new kref for the attach chan.  'spec' is ignored. */
static struct chan *kfs_attach(char *spec)
{
	struct tree_file *kfs_root = kfs.tfs.root;

	(void)spec;
	return tree_file_alloc_chan(kfs_root, &kfs_devtab, "#kfs");
}
/* Device chan_ctl op.  CCTL_SYNC is accepted and returns 0 without doing
 * anything; every other op throws EINVAL. */
static unsigned long kfs_chan_ctl(struct chan *c, int op, unsigned long a1,
                                  unsigned long a2, unsigned long a3,
                                  unsigned long a4)
{
	if (op == CCTL_SYNC)
		return 0;
	error(EINVAL, "%s does not support %d", __func__, op);
}
/* The #kfs device table entry.  init, attach, and chan_ctl are defined in
 * this file; every other op points at a tree_chan_* or dev* function defined
 * elsewhere.
 *
 * NOTE(review): __devtab presumably is what places this entry into the
 * devtab[] array (kfs_init() computes its index as &kfs_devtab - devtab) -
 * confirm against the macro's definition. */
struct dev kfs_devtab __devtab = {
.name = "kfs",
.reset = devreset,
.init = kfs_init,
.shutdown = devshutdown,
.attach = kfs_attach,
.walk = tree_chan_walk,
.stat = tree_chan_stat,
.open = tree_chan_open,
.create = tree_chan_create,
.close = tree_chan_close,
.read = tree_chan_read,
.bread = devbread,
.write = tree_chan_write,
.bwrite = devbwrite,
.remove = tree_chan_remove,
.rename = tree_chan_rename,
.wstat = tree_chan_wstat,
.power = devpower,
.chaninfo = devchaninfo,
.mmap = tree_chan_mmap,
.chan_ctl = kfs_chan_ctl,
};
// XXX misc TODO
// --------------------------------------------------
// bash doesn't give us errstr...
// e.g.
// bash-4.3$ echo ffff >> /prog/goo
// bash: /prog/goo: Operation not permitted
// bash-4.3$ ash
// / $ echo ffff >> /prog/goo
// ash: can't create /prog/goo: devpermcheck(goo, 0644, 03102) failed
// that's a little weird. it was already created... could be an ash
// thing
// / $ write_to /prog/goo fff
// Can't open path: Operation not permitted, devpermcheck(goo, 0644, 03)
// failed
//
// a little better.
// why are the perms fucked? that was umask, and the owner is eve, but our
// username is nanwan or something. maybe nothing. but not eve.
// need umask 0002 or just 0, so we don't make a file 644 that we can't
// write
//
// bash when tabbing out cd, shows us all files, not just directories.
// not ash. they do the readdir, then stat everything
// some difference with stat, they can't tell it's (not) a dir?
// not sure - bash does the readdir, but doesn't do the stat right away.
// the function it is in (rl_filename_completion_function) doesn't seem to
// care about directories vs files. maybe it's not getting the right comp
// code? bash does do a stat, but only after printing the name
// rmdir doesn't do it either. also doesn't do it on busybox.
//
//
// our linux list.h could use some safety shit, like WRITE_ONCE. update to the
// most recent one, perhaps?
//
// hashing
// - consider storing the hash in the tf. might only be done twice
// - might be harder to resize, esp with RCU readers. might need a seq.
// - consider hashing on the parent QID too.
// - consider bucket locks
// - consider exclusivity checks on insert (or caller's responsibility)
//
// ns devtab func signature issue
// qbwrite and whatnot is ssize_t, and some cases can return -1.
// who calls that?
// how do we call devtab.bread (e.g. pipe)
// these funcs don't always throw
// ipbwrite just says it wrote it all.
// prob should review the functions like pipebread
//
// convD2M is unsigned int
//
// netifbread/bwrite/read/write
//
// have waserror() check for irq/trap depth
//
//
// XXX bigger shit
//
//
// how do we trigger the shrink of the cache? (memory pressure)
// - need to talk to the instance, e.g. versions of gtfs/tmpfs
// - walking the NS to find those chans is hard
// - having a CB where they register with the memory system might be better
//
// maybe related: some sort of chan op that returns an FD for a ctl FS
// imagine you want to get access to a control plane for a mounted
// device, such as a #gtfs. you want to fuck with various settings.
//
// how do you attach this?
// it probably doesn't speak 9p, so it'd be a bind
// but sort of like mnt, we had a path to a chan, then did chanctl,
// then the result of that is bound
// - we need something to attach. that chan_ctl can return an
// attachable chan to something else within the device?
// but then every op e.g. gtfs_write would need to know if it was
// talking to the real thing or something else
// maybe it'd be better to have an 'introspection' device, a different
// #peek or something.
// - this device takes a chan, like mount, as arguments for its attach
// and it has a small set of kobj/sysfs like ops that the peekee
// implements
// - just a device that knows about another device and can have custom
// hooks/commands/etc
// - though this might not work as well with 9p. issue is the
// interface between devices - if it's not 9p/devtab, then we're
// somewhat screwed
// say we had a chan flag, with tainting, e.g. CTL_PLANE or something
// we'll still never be able to have a device that supports this just
// have e.g. tree_chan_walk as its method. everything gets a layer.
//
//
// btw, chan_ctl's numbers are currently independent of fcntls, and there is no
// way to talk directly to chan_ctl (just like you can't call dev.create). not
// a problem yet, but if we want arbitrary chan_ctl, then we might change the
// numbers
// for instance, if i wanted to add a hokey chan_ctl for gtfs memory
// writeback or debugging. i can't access that from userspace. hence
// kfunc
// rel to the #peek device, chan_ctl might be the source for some blob
// pointer / hook. if userspace provides an FD, like mnt, then we'd need a
// way to get it.
// and the numbers for that are the CCTL_X, which are e.g. F_SETFL
// maybe. or maybe we have interposition layers, esp since F_GETFD is
// about the FD, not the chan.
//
//
// want a gtfs ktask that syncs or LRU frees on occasion?
//
// glibc uses the old convD2M and friends (also grep STATFIXLEN)
//
// RCU
// ^^^^^^^^^
//
// better mmap infrastructure
// get rid of file_or_chan once we sort this out. right now, it has
// both chan and fs_file
//
// mmap notes
//
// newer note:
// we have foc_dev_mmap, but that doesn't pm_add_vmr.
// it could, but we also have pm_add_vmr when duplicating etc
// maybe that dev_mmap op ought to do both the pm_add_vmr and remove.
// call the op on both ends
// we'll need a counter for the number of dirtiable VMRs
//
// also, consider nesting / layering devices, even through the TFS.
// we might want to pass through to the block device/backend if it has an
// mmap op, since that could tell us the page-ish struct to use
//
// when we talk to 9ns, we want to handle things other than PMs. like
// arbitrary physical memory
// optional callback for unmapping (i.e. when the device wants to
// revoke the VMR connection, e.g. PM flusher)
//
// instead of PM, maybe something a little higher
// like the fs_file
// or the PM itself points to those pages. not quite a PM, in that
// it doesn't allocate pages. we just want to know where to point.
//
// tempted to have __vm_foc be an fs_file, though sometimes we need
// its absolute path (perf), which is a chan feature.
//
// what's the connection from VMR to file and from PM back to VMR?
// IIRC, PM has weak refs on the VMRs, VMRs have refs on file -> PM
// VMRs have refs on files/chan: the mmap lasts beyond the FD closing
// though it might not need to be the chan. could be fs_file
// depends on what the interface is - everything with chans and
// whatnot, multiplexed through a devtab[c->type].mmap op.
// 9p mmap op? probably not
// say you want to access a NIC on another machine
// 9p mnt - can you do that? it'll fake it with a mmap on
// the frontend, implemented with reads to the backend
//
// fs_file is doing some nasty things with usernames. everyone is eve,
// basically, and there's no real accounting.
// could change e.g. dir->uid to refcnts on struct user
// refcnting is a bit nasty, want something like 'users never go away'
// also need to interpret 9p's usernames.
// like lookup, given name, hook in
// need something for unknown users. eve? mount owner?
// also, sort out any other rules for the dir->strings. e.g. ext can be 0
//
// missing chown
// that, and glibc set errno, but has an old errstr
// bash-4.3$ mv /prog/file /prog/f2
// mv: can't preserve ownership of '/prog/f2': Function not implemented, could not find name f2, dev root
//
//
// XXX VM shit
// can we move all the PG_ flags out of struct page?
// we have PG_REMOVAL and PM_REMOVAL. ffs.
// PG_REMOVAL is used to communicate through the mem_walk
// callbacks
// PG_DIRTY is the response from the CB for a particular
// page too. so it's bidirectional
// there's a giant sem in there too, for load_page
// can we have the radix point to something other than a page?
// like some on-demand struct that has all the flags
// we'll need a way for vmr_for_each to communicate back to
// us.
// do we want a pml walk? slightly better than a
// foreach-pte_walk, since we don't have to go up and down.
// but the downside is we don't know the radix slot / PM info
// for a specific PTE.
// is there something we could pass that they can quickly
// find it? (rewalking the radix isn't 'quickly'). if so,
// we'd just do another PTE
//
// seems like we have two structures that are both radix
// trees: PMLs and pm_tree. would be nice to merge. can
// we walk them in sync? or use the same one?
// no to most, since a proc's KPT has many unrelated VMRs
//
// also, munmap is making a pass to mark not present
// anyways. (in regards to the for-each-pte-walk shit)
//
// maybe make all VMRs point to a "PM", even anon ones, instead of using
// the PTEs to track pages.
// - then replace all of it with the radixvm trees
// - and this thing can track whatever paddrs we're pointing to
// - PTEs become weak refs, unlike the weird shit mm does now
// - fs files or pms? (separate issues)
// - and to some extent, all of anon mem is really one giant PM, not N
// separate ones, and the VMRs are windows into that PM.
// - revisit locking the 'fs_file' and len check. anon won't have len.
//
//
// side note: whenever we free pages, they stay in the slab layers, so it's hard
// to tell we're actually freeing them
// XXX
//
// install this, maybe (requires sqlite3)
// https://github.com/juntaki/gogtags