WIP-pop-3000
diff --git a/kern/arch/x86/entry64.S b/kern/arch/x86/entry64.S index 61774cd..bcc3634 100644 --- a/kern/arch/x86/entry64.S +++ b/kern/arch/x86/entry64.S
@@ -287,7 +287,8 @@ _start: movl $stack32top, %esp push %ebx # save mulitboot info - movw $0x1234,0x472 # warm boot + #movw $0x1234,0x472 # warm boot + movw $0x4321,0x472 # preserve ram... movl $0x80000001, %eax # some machines / VMs might not support long mode cpuid
diff --git a/kern/arch/x86/page_alloc.c b/kern/arch/x86/page_alloc.c index e0161cb..6dece87 100644 --- a/kern/arch/x86/page_alloc.c +++ b/kern/arch/x86/page_alloc.c
@@ -9,6 +9,9 @@ #include <multiboot.h> #include <arena.h> +physaddr_t mefs_start; +size_t mefs_size; + /* Helper. Adds free entries to the base arena. Most entries are page aligned, * though on some machines below EXTPHYSMEM we may have some that aren't. */ static void parse_mboot_region(struct multiboot_mmap_entry *entry, void *data) @@ -18,6 +21,14 @@ size_t len = entry->len; extern char end[]; +// XXX +if (start == 0x0000000100000000) { + mefs_start = (uintptr_t)KADDR(start); + mefs_size = len; + return; +} + + if (entry->type != MULTIBOOT_MEMORY_AVAILABLE) return; /* Skip anything over max_paddr - might be bad entries(?) */
diff --git a/kern/arch/x86/smp_boot.c b/kern/arch/x86/smp_boot.c index 133b9d6..fdd7b2d 100644 --- a/kern/arch/x86/smp_boot.c +++ b/kern/arch/x86/smp_boot.c
@@ -101,7 +101,7 @@ } // this needs to be set in smp_entry too... -#define trampoline_pg 0x00001000UL +#define trampoline_pg KADDR(0x00001000UL) extern char smp_entry[]; extern char smp_entry_end[]; extern char smp_boot_lock[]; @@ -128,11 +128,12 @@ struct per_cpu_info *pcpui0 = &per_cpu_info[0]; page_t *smp_stack; + //XXX + //x86_cleanup_bootmem(); // NEED TO GRAB A LOWMEM FREE PAGE FOR AP BOOTUP CODE // page1 (2nd page) is reserved, hardcoded in pmap.c - memset(KADDR(trampoline_pg), 0, PGSIZE); - memcpy(KADDR(trampoline_pg), (void *)smp_entry, - smp_entry_end - smp_entry); + memset(trampoline_pg, 0, PGSIZE); + memcpy(trampoline_pg, smp_entry, smp_entry_end - smp_entry); /* Make sure the trampoline page is mapped. 64 bit already has the tramp pg * mapped (1 GB of lowmem), so this is a nop. */ @@ -183,6 +184,7 @@ #endif /* CONFIG_DISABLE_SMT */ /* cleans up the trampoline page, and any other low boot mem mappings */ + //XXX x86_cleanup_bootmem(); /* trampoline_pg had a refcount of 2 earlier, so we need to dec once more to * free it but only if all cores are in (or we reset / reinit those that
diff --git a/kern/arch/x86/smp_entry64.S b/kern/arch/x86/smp_entry64.S index 6c9e350..caab39b 100644 --- a/kern/arch/x86/smp_entry64.S +++ b/kern/arch/x86/smp_entry64.S
@@ -62,6 +62,18 @@ orl $(CR0_PE | CR0_PG | CR0_WP | CR0_NE | CR0_MP), %eax andl $(~(CR0_AM | CR0_TS | CR0_EM | CR0_CD | CR0_NW)), %eax movl %eax, %cr0 + + # paging is on, but we don't have that low page mapped. hence the crash + # here + # even if we made this work, we'd probably want pg 0 mapped until the + # IDT is up + # + # 1)maybe smp boot in entry.S and pause + # lock after long_mode + # 2)faster syscalls with bools and whatnot. see email + + #1:jmp 1b + # load the 64bit GDT and jump to long mode (symbol from entry64) lgdt gdt64desc # Want to jump to the label long_mode, but we need to relocate to code
diff --git a/kern/arch/x86/trap.c b/kern/arch/x86/trap.c index 01b0dd5..0fac56e 100644 --- a/kern/arch/x86/trap.c +++ b/kern/arch/x86/trap.c
@@ -552,6 +552,7 @@ // Handle processor exceptions. switch(hw_tf->tf_trapno) { case T_BRKPT: + // XXX consider finalizing, since the BT / mon lacks fsbase if (!in_kernel(hw_tf)) backtrace_user_ctx(current, current_ctx); else
diff --git a/kern/drivers/Kbuild b/kern/drivers/Kbuild index 86e9b80..1d8588c 100644 --- a/kern/drivers/Kbuild +++ b/kern/drivers/Kbuild
@@ -1,3 +1,4 @@ +obj-y += fs/ obj-y += net/ obj-y += dev/ obj-y += timers/
diff --git a/kern/drivers/dev/gtfs.c b/kern/drivers/dev/gtfs.c index bd375a0..4c3771b 100644 --- a/kern/drivers/dev/gtfs.c +++ b/kern/drivers/dev/gtfs.c
@@ -28,6 +28,9 @@ struct kref users; }; +// XXX +static struct gtfs *recent; + /* Blob hanging off the fs_file->priv. The backend chans are only accessed, * (changed or used) with the corresponding fs_file qlock held. That's the * primary use of the qlock - we might be able to avoid qlocking with increfs @@ -752,6 +755,8 @@ nexterror(); } gtfs = kzmalloc(sizeof(struct gtfs), MEM_WAIT); +// XXX +recent = gtfs; /* This 'users' kref is the one that every distinct frontend chan has. * These come from attaches and successful, 'moving' walks. */ kref_init(>fs->users, gtfs_release, 1); @@ -766,6 +771,8 @@ tf_kref_put(tfs->root); tfs_destroy(tfs); kfree(gtfs); + // XXX + recent = NULL; nexterror(); } /* stores the ref for 'backend' inside tfs->root */ @@ -880,3 +887,51 @@ .mmap = tree_chan_mmap, .chan_ctl = gtfs_chan_ctl, }; + +// XXX +int xme(int op) +{ + if (!recent) + return -1; + switch (op) { + case 1: + printk("dumping GTFS (Qidpath, Ref)\n-----------------\n"); + __tfs_dump(&recent->tfs); + break; + case 3: + gtfs_free_memory(recent); + break; + default: + printk("give me an op\n"); + return -1; + } + return 0; +} + +// XXX +// +// would like some torture tests +// +//ktask changes: +// maybe better framework for ktasks and rendez. +// and ktask names +// also, would like a better kthread interface regarding the string. +// like an snprintf or something. +// +// either that, or just have the ktask code malloc and free its own +// copy of the name. so we don't have different behavior for each. +// probably this +// also, this kth might be on another core. same as the mgmt kth, +// since we don't have kthread affinity. +// right now, whoever does the wakeup hosts the core, right? +// maybe have ktasks head to core 0 or some affinity +// enum / #define for the gp_ktask_ctl states +// there's some element of post-and-poke to it, esp regarding rcu_barrier +// also, we now care if a ktask is running or not, more like +// schedulable entities. 
no one ever had a pointer to a ktask before +// for_each_mgmt_core, is_mgmt_core() +// dynamically changing this is harder? +// +// cache-align fields in rcu_state +// actually, probably should have a ktask struct, and a pointer hangs off +// rcu_state
diff --git a/kern/drivers/dev/kfs.c b/kern/drivers/dev/kfs.c index 3a5541e..e4032e3 100644 --- a/kern/drivers/dev/kfs.c +++ b/kern/drivers/dev/kfs.c
@@ -337,3 +337,220 @@ .mmap = tree_chan_mmap, .chan_ctl = kfs_chan_ctl, }; + + +// XXX misc TODO +// -------------------------------------------------- +// bash doesn't give us errstr... +// e.g. +// bash-4.3$ echo ffff >> /prog/goo +// bash: /prog/goo: Operation not permitted +// bash-4.3$ ash +// / $ echo ffff >> /prog/goo +// ash: can't create /prog/goo: devpermcheck(goo, 0644, 03102) failed +// that's a little weird. it was already created... could be an ash +// thing +// / $ write_to /prog/goo fff +// Can't open path: Operation not permitted, devpermcheck(goo, 0644, 03) +// failed +// +// a little better. +// why are the perms fucked? that was umask, and the owner is eve, but our +// username is nanwan or something. maybe nothing. but not eve. +// need umask 0002 or just 0, so we don't make a file 644 that we can't +// write +// +// bash when tabbing out cd, shows us all files, not just directories. +// not ash. they do the readdir, then stat everything +// some difference with stat, they can't tell it's (not) a dir? +// not sure - bash does the readdir, but doesn't do the stat right away. +// the function it is in (rl_filename_completion_function) doesn't seem to +// care about directories vs files. maybe it's not getting the right comp +// code? bash does do a stat, but only after printing the name +// rmdir doesn't do it either. also doesn't do it on busybox. +// +// +// our linux list.h could use some safety shit, like WRITE_ONCE. update to the +// most recent one, perhaps? +// +// hashing +// - consider storing the hash in the tf. might only be done twice +// - might be harder to resize, esp with RCU readers. might need a seq. +// - consider hashing on the parent QID too. +// - consider bucket locks +// - consider exclusivity checks on insert (or caller's responsibility) +// +// ns devtab func signature issue +// qbwrite and whatnot is ssize_t, and some cases can return -1. +// who calls that? +// how do we call devtab.bread (e.g. 
pipe) +// these funcs don't always throw +// ipbwrite just says it wrote it all. +// prob should review the functions like pipebread +// +// convD2M is unsigned int +// +// netifbread/bwrite/read/write +// +// have waserror() check for irq/trap depth +// +// +// XXX bigger shit +// +// +// how do we trigger the shrink of the cache? (memory pressure) +// - need to talk to the instance, e.g. versions of gtfs/tmpfs +// - walking the NS to find those chans is hard +// - having a CB where they register with the memory system might be better +// +// maybe related: some sort of chan op that returns an FD for a ctl FS +// imagine you want to get access to a control plane for a mounted +// device, such as a #gtfs. you want to fuck with various settings. +// +// how do you attach this? +// it probably doesn't speak 9p, so it'd be a bind +// but sort of like mnt, we had a path to a chan, then did chanctl, +// then the result of that is bound +// - we need something to attach. that chan_ctl can return an +// attachable chan to something else within the device? +// but then every op e.g. gtfs_write would need to know if it was +// talking to the real thing or something else +// maybe it'd be better to have an 'introspection' device, a different +// #peek or something. +// - this device takes a chan, like mount, as arguments for its attach +// and it has a small set of kobj/sysfs like ops that the peekee +// implements +// - just a device that knows about another device and can have custom +// hooks/commands/etc +// - though this might not work as well with 9p. issue is the +// interface between devices - if it's not 9p/devtab, then we're +// somewhat screwed +// say we had a chan flag, with tainting, e.g. CTL_PLANE or something +// we'll still never be able to have a device that supports this just +// have e.g. tree_chan_walk as its method. everything gets a layer. 
+// +// +// btw, chan_ctl's numbers are currently independent of fcntls, and there is no +// way to talk directly to chan_ctl (just like you can't call dev.create). not +// a problem yet, but if we want arbitrary chan_ctl, then we might change the +// numbers +// for instance, if i wanted to add a hokey chan_ctl for gtfs memory +// writeback or debugging. i can't access that from userspace. hence +// kfunc +// rel to the #peek device, chan_ctl might be the source for some blob +// pointer / hook. if userspace provides an FD, like mnt, then we'd need a +// way to get it. +// and the numbers for that are the CCTL_X, which are e.g. F_SETFL +// maybe. or maybe we have interposition layers, esp since F_GETFD is +// about the FD, not the chan. +// +// +// want a gtfs ktask that syncs or LRU frees on occasion? +// +// glibc uses the old convD2M and friends (also grep STATFIXLEN) +// +// RCU +// ^^^^^^^^^ +// +// better mmap infrastructure +// get rid of file_or_chan once we sort this out. right now, it has +// both chan and fs_file +// +// mmap notes +// +// newer note: +// we have foc_dev_mmap, but that doesn't pm_add_vmr. +// it could, but we also have pm_add_vmr when duplicating etc +// maybe that dev_mmap op ought to do both the pm_add_vmr and remove. +// call the op on both ends +// we'll need a counter for the number of dirtiable VMRs +// +// also, consider nesting / layering devices, even through the TFS. +// we might want to pass through to the block device/backend if it has an +// mmap op, since that could tell us the page-ish struct to use +// +// when we talk to 9ns, we want to handle things other than PMs. like +// arbitrary physical memory +// optional callback for unmapping (i.e. when the device wants to +// revoke the VMR connection, e.g. PM flusher) +// +// instead of PM, maybe something a little higher +// like the fs_file +// or the PM itself points to those pages. not quite a PM, in that +// it doesn't allocate pages. 
we just want to know where to point. +// +// tempted to have __vm_foc be an fs_file, though sometimes we need +// its absolute path (perf), which is a chan feature. +// +// what's the connection from VMR to file and from PM back to VMR? +// IIRC, PM has weak refs on the VMRs, VMRs have refs on file -> PM +// VMRs have refs on files/chan: the mmap lasts beyond the FD closing +// though it might not need to be the chan. could be fs_file +// depends on what the interface is - everything with chans and +// whatnot, multiplexed through a devtab[c->type].mmap op. +// 9p mmap op? probably not +// say you want to access a NIC on another machine +// 9p mnt - can you do that? it'll fake it with a mmap on +// the frontend, implemented with reads to the backend +// +// fs_file is doing some nasty things with usernames. everyone is eve, +// basically, and there's no real accounting. +// could change e.g. dir->uid to refcnts on struct user +// refcnting is a bit nasty, want something like 'users never go away' +// also need to interpet 9p's usernames. +// like lookup, given name, hook in +// need something for unknown users. eve? mount owner? +// also, sort out any other rules for the dir->strings. e.g. ext can be 0 +// +// missing chown +// that, and glibc set errno, but has an old errstr +// bash-4.3$ mv /prog/file /prog/f2 +// mv: can't preserve ownership of '/prog/f2': Function not implemented, could not find name f2, dev root +// +// +// XXX VM shit +// can we move all the PG_ flags out of struct page? +// we have PG_REMOVAL and PM_REMOVAL. ffs. +// PG_REMOVAL is used to communicate through the mem_walk +// callbacks +// PG_DIRTY is the response from the CB for a particular +// page too. so it's bidirectional +// there's a giant sem in there too, for load_page +// can we have the radix point to something other than a page? +// like some on-demand struct that has all the flags +// we'll need a way for vmr_for_each to communicate back to +// us. +// do we want a pml walk? 
slightly better than a +// foreach-pte_walk, since we don't have to go up and down. +// but the downside is we don't know the radix slot / PM info +// for a specific PTE. +// is there something we could pass that they can quickly +// find it? (rewalking the radix isn't 'quickly'). if so, +// we'd just do another PTE +// +// seems like we have two structures that are both radix +// trees: PMLs and pm_tree. would be nice to merge. can +// we walk them in sync? or use the same one? +// no to most, since a proc's KPT has many unrelated VMRs +// +// also, munmap is making a pass to mark not present +// anyways. (in regards to the for-each-pte-walk shit) +// +// maybe make all VMRs point to a "PM", even anon ones, instead of using +// the PTEs to track pages. +// - then replace all of it with the radixvm trees +// - and this thing can track whatever paddrs we're pointing to +// - PTEs become weak refs, unlike the weird shit mm does now +// - fs files or pms? (separate issues) +// - and to some extent, all of anon mem is really one giant PM, not N +// separate ones, and the VMRs are windows into that PM. +// - revisit locking the 'fs_file' and len check. anon won't have len. +// +// +// side note: whenever we free pages, they stay in the slab layers, so it's hard +// to tell we're actually freeing them + + // XXX + // + // install this, maybe (requires sqlite3) + // https://github.com/juntaki/gogtags
diff --git a/kern/drivers/fs/Kbuild b/kern/drivers/fs/Kbuild new file mode 100644 index 0000000..eb48bea --- /dev/null +++ b/kern/drivers/fs/Kbuild
@@ -0,0 +1 @@ +obj-y += mefs/
diff --git a/kern/drivers/fs/mefs/Kbuild b/kern/drivers/fs/mefs/Kbuild new file mode 100644 index 0000000..827aa96 --- /dev/null +++ b/kern/drivers/fs/mefs/Kbuild
@@ -0,0 +1,2 @@ +obj-y += block.o +obj-y += mefs.o
diff --git a/kern/drivers/fs/mefs/block.c b/kern/drivers/fs/mefs/block.c new file mode 100644 index 0000000..a9bcd54 --- /dev/null +++ b/kern/drivers/fs/mefs/block.c
@@ -0,0 +1,358 @@ +/* Copyright (c) 2016, 2018 Google Inc + * Barret Rhoden <brho@cs.berkeley.edu> + * See LICENSE for details. + * + * Memory Extent Filesystem block allocation + * + * We use a power-of-two list allocator, similar to the arena allocator. + * There's no xalloc, importing, qcaches, or anything like that. The superblock + * is analogous to the base arena: it must be self-sufficient. + * + * All of the structures are "on disk." In theory, that should change as often + * as a filesystem's disk structures change, which is rarely if ever. Once it + * is done. =) All values are in host-endian, and we operate directly on RAM. + * + * There's no protection for metadata corruption - if you crash in the middle of + * a tree-changing operation, you're out of luck. + * + * Unlike the arena allocator, we don't return a "void *", we actually return + * a pointer to the btag. All of our users (the rest of mefs) will put up with + * this layer of indirection. In exchange, we don't have to muck around with + * hash tables to find the btag when the segment is freed. + * + * This all assumes the caller manages synchronization (e.g. locks). + * + * This uses the BSD list macros, which technically is not guaranteed to not + * change. If someone wants to replace them with local versions that are bound + * to the filesystem's disk format, then be my guest. This doesn't use the + * rbtree code, though we probably could with the same justification for using + * the BSD list code. But it'd be a bit more of a pain to roll our own for + * that, and I doubt it is necessary. 
+ */ + +#include "mefs.h" +#include <err.h> + +static struct mefs_btag *__get_from_freelists(struct mefs_superblock *sb, + int list_idx); +static bool __account_alloc(struct mefs_superblock *sb, struct mefs_btag *bt, + size_t size, struct mefs_btag *new); + +/* Bootstrap from the free list */ +static void __ensure_some_btags(struct mefs_superblock *sb) +{ + struct mefs_btag *bt, *tags; + size_t nr_bts = MEFS_QUANTUM / sizeof(struct mefs_btag); + + if (!BSD_LIST_EMPTY(&sb->unused_btags)) + return; + bt = __get_from_freelists(sb, LOG2_UP(MEFS_QUANTUM)); + if (!bt) + error(ENOMEM, "Unable to get more BTs in mefs!"); + tags = (struct mefs_btag*)bt->start; + if (__account_alloc(sb, bt, MEFS_QUANTUM, &tags[0])) { + /* We used the tag[0]; we'll have to skip over it now. */ + tags++; + nr_bts--; + } + for (int i = 0; i < nr_bts; i++) + BSD_LIST_INSERT_HEAD(&sb->unused_btags, &tags[i], misc_link); +} + +static struct mefs_btag *__get_btag(struct mefs_superblock *sb) +{ + struct mefs_btag *bt; + + bt = BSD_LIST_FIRST(&sb->unused_btags); + assert(bt); + BSD_LIST_REMOVE(bt, misc_link); + return bt; +} + +static void __free_btag(struct mefs_superblock *sb, struct mefs_btag *bt) +{ + BSD_LIST_INSERT_HEAD(&sb->unused_btags, bt, misc_link); +} + +static void __track_free_seg(struct mefs_superblock *sb, struct mefs_btag *bt) +{ + int list_idx = LOG2_DOWN(bt->size); + + bt->status = MEFS_BTAG_FREE; + BSD_LIST_INSERT_HEAD(&sb->free_segs[list_idx], bt, misc_link); +} + +static void __untrack_free_seg(struct mefs_superblock *sb, struct mefs_btag *bt) +{ + BSD_LIST_REMOVE(bt, misc_link); +} + +/* This is a little slow, and is a consequence of not using a tree. However, + * the common case caller was when @bt was created from an old one, and is + * likely to be right after it. The old one is the @hint, which is where to + * start our search. 
*/ +static void __insert_btag(struct mefs_btag_list *list, struct mefs_btag *bt, + struct mefs_btag *hint) +{ + struct mefs_btag *i, *last = NULL; + bool hinted = false; + + BSD_LIST_FOREACH(i, list, all_link) { + if (!hinted && hint) { + i = hint; + hinted = true; + } + if (bt->start < i->start) { + BSD_LIST_INSERT_BEFORE(i, bt, all_link); + return; + } + if (bt->start == i->start) + panic("BT %p == i %p in list %p!", bt, i, list); + last = i; + } + if (last) + BSD_LIST_INSERT_AFTER(last, bt, all_link); + else + BSD_LIST_INSERT_HEAD(list, bt, all_link); +} + +/* Unlink the arena allocator, we don't track the segments on an allocated list. + * Our caller is the one that keeps track of it. */ +static void __track_alloc_seg(struct mefs_superblock *sb, struct mefs_btag *bt) +{ + bt->status = MEFS_BTAG_ALLOC; +} + +/* Helper: we decided we want to alloc part of @bt, which has been removed from + * its old list. We need @size units. The rest can go back. + * + * Takes @new, which we'll use if we need a new btag. If @new is NULL, we'll + * allocate one. If we used the caller's btag, we'll return TRUE. */ +static bool __account_alloc(struct mefs_superblock *sb, struct mefs_btag *bt, + size_t size, struct mefs_btag *new) +{ + bool ret = FALSE; + + assert(bt->status == MEFS_BTAG_FREE); + if (bt->size != size) { + assert(bt->size > size); + if (new) + ret = TRUE; + else + new = __get_btag(sb); + new->start = bt->start + size; + new->size = bt->size - size; + bt->size = size; + __track_free_seg(sb, new); + __insert_btag(&sb->all_segs, new, bt); + } + __track_alloc_seg(sb, bt); + sb->amt_alloc_segs += size; + return ret; +} + +static struct mefs_btag *__get_from_freelists(struct mefs_superblock *sb, + int list_idx) +{ + struct mefs_btag *ret = NULL; + + for (int i = list_idx; i < MEFS_NR_FREE_LISTS; i++) { + ret = BSD_LIST_FIRST(&sb->free_segs[i]); + if (ret) { + BSD_LIST_REMOVE(ret, misc_link); + break; + } + } + return ret; +} + +/* This uses the arena's "best fit" policy. 
You could imagine building a + * version that cares about alignment too, for e.g. huge pages. */ +struct mefs_btag *mefs_ext_alloc(struct mefs_superblock *sb, size_t size) +{ + int list_idx = LOG2_DOWN(size); + struct mefs_btag *bt_i, *best = NULL; + + if (!size) + error(EINVAL, "mefs_ext_alloc with 0 size!"); + __ensure_some_btags(sb); + size = ROUNDUP(size, MEFS_QUANTUM); + BSD_LIST_FOREACH(bt_i, &sb->free_segs[list_idx], misc_link) { + if (bt_i->size >= size) { + if (!best || (best->size > bt_i->size)) + best = bt_i; + } + } + if (best) + BSD_LIST_REMOVE(best, misc_link); + else + best = __get_from_freelists(sb, list_idx + 1); + if (!best) + error(ENOMEM, "couldn't find segment in mefs!"); + __account_alloc(sb, best, size, NULL); + return best; +} + +static bool __merge_right_to_left(struct mefs_superblock *sb, + struct mefs_btag *left, + struct mefs_btag *right) +{ + if (left->status != MEFS_BTAG_FREE) + return false; + if (right->status != MEFS_BTAG_FREE) + return false; + if (left->start + left->size == right->start) { + __untrack_free_seg(sb, left); + __untrack_free_seg(sb, right); + left->size += right->size; + __track_free_seg(sb, left); + BSD_LIST_REMOVE(right, all_link); + __free_btag(sb, right); + return true; + } + return false; +} + +static void __coalesce_free_seg(struct mefs_superblock *sb, + struct mefs_btag *bt) +{ + struct mefs_btag *bt_p, *bt_n; + + bt_n = BSD_LIST_NEXT(bt, all_link); + if (bt_n) + __merge_right_to_left(sb, bt, bt_n); + bt_p = BSD_LIST_PREV(bt, &sb->all_segs, mefs_btag, all_link); + if (bt_p) + __merge_right_to_left(sb, bt_p, bt); +} + +void mefs_ext_free(struct mefs_superblock *sb, struct mefs_btag *bt) +{ + void *to_free_addr = 0; + size_t to_free_sz = 0; + + sb->amt_alloc_segs -= bt->size; + __track_free_seg(sb, bt); + /* Don't use bt after this: */ + __coalesce_free_seg(sb, bt); + sb->amt_total_segs -= to_free_sz; +} + +/* Bump allocates space in the segment [seg_alloc, seg_alloc + seg_amt). 
+ * Returns the allocation address and updates the allocator's values by + * reference. Throws on error. */ +static void *bump_zalloc(size_t amt, size_t align, uintptr_t *seg_alloc, + size_t *seg_amt) +{ + size_t align_diff; + void *ret; + + align_diff = ALIGN(*seg_alloc, align) - *seg_alloc; + if (*seg_amt < amt + align_diff) + error(ENOMEM, "Out of space in mefs SB"); + *seg_amt -= align_diff; + *seg_alloc += align_diff; + ret = (void*)*seg_alloc; + *seg_alloc += amt; + *seg_amt -= amt; + memset(ret, 0, amt); + return ret; +} + +/* Creates a superblock and adds the memory segment. The SB will be at the + * beginning of the segment. */ +struct mefs_superblock *mefs_super_create(uintptr_t init_seg, size_t size) +{ + struct mefs_superblock *sb; + struct mefs_btag *bt; + uintptr_t seg_alloc = init_seg; + size_t seg_amt = size; + + sb = bump_zalloc(sizeof(*sb), __alignof__(*sb), &seg_alloc, &seg_amt); + memcpy(sb->magic, MEFS_MAGIC, sizeof(sb->magic)); + BSD_LIST_INIT(&sb->all_segs); + BSD_LIST_INIT(&sb->unused_btags); + for (int i = 0; i < MEFS_NR_FREE_LISTS; i++) + BSD_LIST_INIT(&sb->free_segs[i]); + + bt = bump_zalloc(sizeof(*bt), __alignof__(*bt), &seg_alloc, &seg_amt); + BSD_LIST_INSERT_HEAD(&sb->unused_btags, bt, misc_link); + + seg_alloc = ALIGN(seg_alloc, MEFS_QUANTUM); + seg_amt = ALIGN_DOWN(seg_amt, MEFS_QUANTUM); + + mefs_super_add(sb, seg_alloc, seg_amt); + + return sb; +} + +/* Ignoring size for now. Could use it for sanity checks. 
*/ +struct mefs_superblock *mefs_super_attach(uintptr_t init_seg, size_t size) +{ + struct mefs_superblock *sb; + + init_seg = ALIGN(init_seg, sizeof(*sb)); + sb = (struct mefs_superblock*)init_seg; + if (strcmp(sb->magic, MEFS_MAGIC)) + return NULL; + return sb; +} + +void mefs_super_add(struct mefs_superblock *sb, uintptr_t seg, size_t size) +{ + struct mefs_btag *bt; + + __ensure_some_btags(sb); + bt = __get_btag(sb); + bt->start = seg; + bt->size = size; + sb->amt_total_segs += size; + __track_free_seg(sb, bt); + __insert_btag(&sb->all_segs, bt, NULL); +} + +void mefs_super_destroy(struct mefs_superblock *sb) +{ + memset(sb->magic, 0xa, sizeof(sb->magic)); +} + +void mefs_super_dump(struct mefs_superblock *sb) +{ + struct mefs_btag *i; + + printk("All segs\n"); + BSD_LIST_FOREACH(i, &sb->all_segs, all_link) + printk("bt %p, start %p, +%lu, %s\n", i, i->start, i->size, + i->status == MEFS_BTAG_FREE ? "free" : "alloc"); +} + +#include <time.h> + +static inline void kb_wait(void) +{ + int i; + + for (i = 0; i < 0x10000; i++) { + if ((inb(0x64) & 0x02) == 0) + break; + udelay(2); + } +} + +void food() +{ + outb(0xcf9, 0x6); + printk("ACPI cf9 FAILED\n"); +} + +void foot() +{ + for (int i = 0; i < 10; i++) { + kb_wait(); + udelay(50); + outb(0x64, 0xfe); /* Pulse reset low */ + udelay(50); + } + + printk("KBD FAILED\n"); +}
diff --git a/kern/drivers/fs/mefs/mefs.c b/kern/drivers/fs/mefs/mefs.c new file mode 100644 index 0000000..83a23e3 --- /dev/null +++ b/kern/drivers/fs/mefs/mefs.c
/* Copyright (c) 2018 Google Inc
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * #mefs: Memory Extent Filesystem
 *
 * It's designed to run on memory segments, supporting a small number of files
 * whose sizes are bimodal - either small, or potentially very large.  Small
 * files are O(PGSIZE).  Large files are O(TB).
 *
 * We're not designing for persistence in the face of failures, hardcore
 * performance, or anything like that.  I'd like it to be simple, yet capable of
 * handling very large files.
 *
 * There's only one instance of mefs, similar to KFS and unlike tmpfs.  All
 * attaches get the same FS.
 */

#include <ns.h>
#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <tree_file.h>
#include <pmap.h>

#include "mefs.h"

struct dev mefs_devtab;

/* The singleton FS instance: the generic tree_filesystem must be the first
 * member (we cast back and forth between struct mefs * and struct
 * tree_filesystem *), plus the extent allocator's superblock and a counter
 * used to hand out unique qid paths. */
struct mefs {
	struct tree_filesystem		tfs;
	struct mefs_superblock		*sb;
	atomic_t			qid;
};

static struct mefs mefs[1];

/* Returns the next unique qid path for this FS (atomically incremented). */
static uint64_t mefs_get_qid_path(struct mefs *mefs)
{
	return atomic_fetch_and_add(&mefs->qid, 1);
}

static char *devname(void)
{
	return mefs_devtab.name;
}

static void mefs_tf_free(struct tree_file *tf)
{
	/* We have nothing special hanging off the TF */
}

static void mefs_tf_unlink(struct tree_file *parent, struct tree_file *child)
{
	/* This is the "+1 for existing" ref. */
	tf_kref_put(child);
}

/* Common TF setup: init the dir, assign a fresh qid, and take the "+1 for
 * existing" kref (see below). */
static void __mefs_tf_init(struct tree_file *tf, int dir_type, int dir_dev,
			   struct username *user, int perm)
{
	struct dir *dir = &tf->file.dir;

	fs_file_init_dir(&tf->file, dir_type, dir_dev, user, perm);
	/* The TFS pointer is really our struct mefs (tfs is its first member). */
	dir->qid.path = mefs_get_qid_path((struct mefs*)tf->tfs);
	dir->qid.vers = 0;
	/* This is the "+1 for existing" ref.  There is no backing store for the
	 * FS, such as a disk or 9p, so we can't get rid of a file until it is
	 * unlinked and decreffed.  Note that KFS doesn't use pruners or anything
	 * else. */
	__kref_get(&tf->kref, 1);
}

/* Note: If your TFS doesn't support symlinks, you need to error out */
static void mefs_tf_create(struct tree_file *parent, struct tree_file *child,
			   int perm)
{
	__mefs_tf_init(child, parent->file.dir.type, parent->file.dir.dev, &eve,
	               perm);
}

static void mefs_tf_rename(struct tree_file *tf, struct tree_file *old_parent,
                           struct tree_file *new_parent, const char *name,
                           int flags)
{
	/* We don't have a backend, so we don't need to do anything additional
	 * for rename. */
}

static bool mefs_tf_has_children(struct tree_file *parent)
{
	/* The tree_file parent list is complete and not merely a cache for a
	 * real backend. */
	return !list_empty(&parent->children);
}

struct tree_file_ops mefs_tf_ops = {
	.free = mefs_tf_free,
	.unlink = mefs_tf_unlink,
	.lookup = NULL,
	.create = mefs_tf_create,
	.rename = mefs_tf_rename,
	.has_children = mefs_tf_has_children,
};

/* Fills page with its contents from its backing store file.  For mefs (as for
 * KFS), that means we're creating or extending a file, and the contents are 0.
 * Note the page/offset might be beyond the current file length, based on the
 * current pagemap code. */
static int mefs_pm_readpage(struct page_map *pm, struct page *pg)
{
	memset(page2kva(pg), 0, PGSIZE);
	atomic_or(&pg->pg_flags, PG_UPTODATE);
	/* Pretend that we blocked while filling this page.  This catches a lot
	 * of bugs.  It does slightly slow down the kernel, but it's only when
	 * filling the page cache, and considering we are using a RAMFS, you
	 * shouldn't measure things that actually rely on KFS's performance. */
	kthread_usleep(1);
	return 0;
}

/* Meant to take the page from PM and flush to backing store.  There is no
 * backing store, so it's a no-op. */
static int mefs_pm_writepage(struct page_map *pm, struct page *pg)
{
	return 0;
}

/* No-op: nothing to release below the page cache (see the trailing notes on
 * hole-punching with extent-granularity storage). */
static void mefs_fs_punch_hole(struct fs_file *f, off64_t begin, off64_t end)
{
}

static bool mefs_fs_can_grow_to(struct fs_file *f, size_t len)
{
	/* TODO: implement some sort of memory limit */
	return true;
}

struct fs_file_ops mefs_fs_ops = {
	.readpage = mefs_pm_readpage,
	.writepage = mefs_pm_writepage,
	.punch_hole = mefs_fs_punch_hole,
	.can_grow_to = mefs_fs_can_grow_to,
};

/* Recovers our struct mefs from a chan (tfs is the first member of mefs). */
static struct mefs *chan_to_mefs(struct chan *c)
{
	struct tree_file *tf = chan_to_tree_file(c);

	return (struct mefs*)(tf->tfs);
}

/* Set by the arch's multiboot memory parsing (kern/arch/x86/page_alloc.c). */
extern physaddr_t mefs_start;
extern size_t mefs_size;

/* Devtab init: attach to (or create) the superblock in the reserved memory
 * segment, then bring up the tree_filesystem.  Throws are caught locally so a
 * bad segment doesn't take down boot. */
static void mefs_init(void)
{
	ERRSTACK(1);
	struct tree_filesystem *tfs = (struct tree_filesystem*)mefs;
	struct mefs_superblock *sb;

	if (waserror()) {
		printk("#mefs threw %s\n", current_errstr());
		poperror();
		return;
	}
	if (!mefs_start)
		error(ENOENT, "Couldn't find mefs_start, aborting");
	sb = mefs_super_attach(mefs_start, mefs_size);
	if (sb) {
		printk("Found existing mefs sb at %p, reconnecting.\n", sb);
	} else {
		sb = mefs_super_create(mefs_start, mefs_size);
		printk("Created new mefs sb at %p\n", sb);

		/* NOTE(review): these alloc/free calls look like temporary
		 * allocator smoke tests (they leak extents on purpose) -
		 * presumably to be removed before this lands.  Confirm. */
		mefs_ext_alloc(sb, PGSIZE << 0);
		mefs_ext_alloc(sb, PGSIZE << 0);
		void * x = mefs_ext_alloc(sb, PGSIZE << 10);
		mefs_ext_alloc(sb, PGSIZE << 5);
		mefs_ext_alloc(sb, PGSIZE << 1);
		mefs_ext_free(sb, x);
		mefs_ext_alloc(sb, PGSIZE << 7);
	}
	mefs_super_dump(sb);

	mefs->sb = sb;
	// XXX

	/* This gives us one ref on root, which we'll never drop. */
	tfs_init(tfs);
	tfs->tf_ops = mefs_tf_ops;
	tfs->fs_ops = mefs_fs_ops;

	// XXX
	/* This gives us an extra refcnt on tfs->root.  This is "+1 for
	 * existing."  It is decreffed during the purge CB.  The dir_type is
	 * our index in devtab. */
	__mefs_tf_init(tfs->root, &mefs_devtab - devtab, 0, &eve, DMDIR | 0777);
	poperror();
}

/* All attaches share the one FS: every chan starts at the same root. */
static struct chan *mefs_attach(char *spec)
{
	struct tree_filesystem *tfs = (struct tree_filesystem*)mefs;

	return tree_file_alloc_chan(tfs->root, &mefs_devtab, "#mefs");
}

/* CCTL_SYNC is a no-op (no backing store); everything else is rejected. */
static unsigned long mefs_chan_ctl(struct chan *c, int op, unsigned long a1,
				   unsigned long a2, unsigned long a3,
				   unsigned long a4)
{
	switch (op) {
	case CCTL_SYNC:
		return 0;
	default:
		error(EINVAL, "%s does not support %d", __func__, op);
	}
}

struct dev mefs_devtab __devtab = {
	.name = "mefs",
	.reset = devreset,
	.init = mefs_init,
	.shutdown = devshutdown,
	.attach = mefs_attach,
	.walk = tree_chan_walk,
	.stat = tree_chan_stat,
	.open = tree_chan_open,
	.create = tree_chan_create,
	.close = tree_chan_close,
	.read = tree_chan_read,
	.bread = devbread,
	.write = tree_chan_write,
	.bwrite = devbwrite,
	.remove = tree_chan_remove,
	.rename = tree_chan_rename,
	.wstat = tree_chan_wstat,
	.power = devpower,
	.chaninfo = devchaninfo,
	.mmap = tree_chan_mmap,
	.chan_ctl = mefs_chan_ctl,
};

/* Author's WIP notes (kept for context): */
// XXX
//
// syslinux or something didn't work - the segment was zeroed.
//	might need a kexec
//	device teardown? none of that shit was tested. (NICs)
//	k, it's a large ball.
//		need that ball to not be in the 'overwrite' spot
//		the new one defines the size of the overwrite spot too (elf
//		parse, etc)
//	need a chunk of code, running on its own protected page tables
//		need that to also not be in the overwrite spot
//		protected gdt too, and stack page. can disable irqs...
//		memcpy to the final location, jump to it.
//		basically the elf parser, similar to loadelf.c
//		ah, but can't use any external code either.
//	maybe kexec is a super-slim OS
//		actually, we can bundle it with the target OS image.
//		set up its PT in advance?
//			need to do it at runtime, since we need the paddr
//
//
// will want to destroy the super aggressively. or at least have commands for
// it, so that if we e.g. barcher a new kernel, we're not stuck with the bugs
//
// init is hokey. would like to grow and shrink, and need to sync btw the base
// arena, mefs, and whatever we do to communicate to our future self.
//	actually, mefs will describe itself
//	but the future self / multiboot memory detection is trickier
//	handing segments back is a little trickier (can make a yank function,
//	then arena add. though that fragments the space slightly)
//
// don't forget some way to sync, if necessary (since we don't sync on unmount)
//	btw, should unmount.c also sync?
//
// btw, for hole-punching, we might not be able to free the intermediate data
// easily. would need to break it up.
//	issue is that we don't have individual blocks - we have a large
//	structure. and the arena code won't take something that didn't have a
//	btag
diff --git a/kern/drivers/fs/mefs/mefs.h b/kern/drivers/fs/mefs/mefs.h new file mode 100644 index 0000000..465817c --- /dev/null +++ b/kern/drivers/fs/mefs/mefs.h
/* Copyright (c) 2018 Google Inc
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Internal interface between the mefs device (mefs.c) and its extent
 * allocator: boundary tags, the on-segment superblock, and the allocator API.
 */

#pragma once

#include <sys/types.h>
#include <ros/arch/mmu.h>
#include <sys/queue.h>

/* Values for mefs_btag.status */
#define MEFS_BTAG_FREE 1
#define MEFS_BTAG_ALLOC 2

/* Boundary tag describing one extent [start, start + size).  all_link is
 * rooted at all_segs in the SB.  misc_link is used for the unused_btags list
 * (btag cache) or the free seg list. */
struct mefs_btag {
	BSD_LIST_ENTRY(mefs_btag)	all_link;
	BSD_LIST_ENTRY(mefs_btag)	misc_link;
	uintptr_t			start;
	size_t				size;
	int				status;
};
BSD_LIST_HEAD(mefs_btag_list, mefs_btag);

/* 64 is the most powers of two we can express with 64 bits. */
#define MEFS_NR_FREE_LISTS 64
/* Minimum allocation granularity for extents. */
#define MEFS_QUANTUM PGSIZE
/* 7 chars + NUL, matching the 8-byte magic[] below. */
#define MEFS_MAGIC "MEFS001"

/* Lives at the start of the memory segment, so a later boot can re-attach.
 * all_segs is the sorted list of all btags that cover the memory space.  i.e.
 * not the unused btags, but all the btags for the allocated and free memory.
 * free_segs is segregated by power-of-two size class. */
struct mefs_superblock {
	char				magic[8];
	struct mefs_btag_list		all_segs;
	struct mefs_btag_list		unused_btags;
	struct mefs_btag_list		free_segs[MEFS_NR_FREE_LISTS];
	size_t				amt_total_segs;
	size_t				amt_alloc_segs;
};

/* Builds a fresh superblock at the start of [init_seg, init_seg + size). */
struct mefs_superblock *mefs_super_create(uintptr_t init_seg, size_t size);
/* Finds an existing superblock at init_seg; NULL if the magic is missing. */
struct mefs_superblock *mefs_super_attach(uintptr_t init_seg, size_t size);
/* Donates [seg, seg + size) to the superblock as free space. */
void mefs_super_add(struct mefs_superblock *sb, uintptr_t seg, size_t size);
/* Clobbers the magic so future attaches fail. */
void mefs_super_destroy(struct mefs_superblock *sb);
/* Debug dump of all boundary tags. */
void mefs_super_dump(struct mefs_superblock *sb);
/* Extent alloc/free, in multiples of MEFS_QUANTUM. */
struct mefs_btag *mefs_ext_alloc(struct mefs_superblock *sb, size_t size);
void mefs_ext_free(struct mefs_superblock *sb, struct mefs_btag *bt);
diff --git a/kern/src/arena.c b/kern/src/arena.c index 26a5991..050004f 100644 --- a/kern/src/arena.c +++ b/kern/src/arena.c
@@ -1060,6 +1060,9 @@ struct rb_node *rb_p, *rb_n; struct btag *bt_p, *bt_n; + // XXX this could merge more. if we succeed on a merge, then we might move + // to a higher list, and then be able to merge again with both left and + // right rb_n = rb_next(&bt->all_link); if (rb_n) { bt_n = container_of(rb_n, struct btag, all_link);
diff --git a/kern/src/ns/tree_file.c b/kern/src/ns/tree_file.c index 14e5f8d..b7c2dc5 100644 --- a/kern/src/ns/tree_file.c +++ b/kern/src/ns/tree_file.c
@@ -87,6 +87,11 @@ struct tree_filesystem *tfs = tf->tfs; tf->tfs->tf_ops.free(tf); + + // XXX + printk("FREEING %s\n", tree_file_to_name(tf)); + + if (tf->flags & TF_F_IS_ROOT) { assert(tfs->root == tf); assert(!parent);
diff --git a/kern/src/rcu.c b/kern/src/rcu.c index 23a8a6c..71c1d31 100644 --- a/kern/src/rcu.c +++ b/kern/src/rcu.c
@@ -91,7 +91,7 @@ extern int rcu_num_lvls; /* Controls whether we skip cores when we expedite, which forces tardy cores. */ -static bool rcu_debug_tardy; +static bool rcu_debug_tardy = true; /* Externed in rcu_tree_helper.c */ struct rcu_state rcu_state; @@ -610,3 +610,55 @@ rpi->booted = true; } } + +// XXX +struct bar { + int x; + struct rcu_head h; +}; + +void foo() +{ + rcu_dump_rcu_node_tree(&rcu_state); + printk("gp num %d, completed %d\n", rcu_state.gpnum, rcu_state.completed); + struct bar *bar = kmalloc(sizeof(struct bar), MEM_WAIT); + + kfree_rcu(bar, h); +} + + +static void increment(struct rcu_head *head) +{ + struct bar *b = container_of(head, struct bar, h); + + WRITE_ONCE(b->x, b->x + 1); +} + +static void __torture(uint32_t srcid, long a0, long a1, long a2) +{ + struct bar *bars = kzmalloc(sizeof(struct bar) * 1000, MEM_WAIT); + + #define NR_CBS 50 + + for (int i = 0; i < NR_CBS; i++) { + bars[i].x = i; + call_rcu(&bars[i].h, increment); + } + udelay(1000); + /* We know the CBs have not run yet, since this CPU hasn't had a QS */ + for (int i = 0; i < NR_CBS; i++) + assert(bars[i].x == i); + rcu_barrier(); /* might hurt. could imagine a local barrier */ + for (int i = 0; i < NR_CBS; i++) + assert(bars[i].x == i + 1); + kfree(bars); +} + +void torture(void) +{ + /* Most all of this time is spent on core 0 (mpstat) */ + for_each_core(i) { + for (int j = 0; j < 20; j++) + send_kernel_message(i, __torture, 0, 0, 0, KMSG_ROUTINE); + } +}
diff --git a/kern/src/rcu_tree_helper.c b/kern/src/rcu_tree_helper.c index 5c99032..3b36aa6 100644 --- a/kern/src/rcu_tree_helper.c +++ b/kern/src/rcu_tree_helper.c
@@ -46,7 +46,7 @@ /* Number of cores RCU thinks exist. Set to 0 or nothing to use 'num_cores'. * The extra cores are called 'fake cores' in rcu.c, and are used for testing * the tree. */ -int rcu_num_cores; +int rcu_num_cores = 78; /* in rcu.c */ extern struct rcu_state rcu_state;