| /* Copyright (c) 2018 Google Inc |
| * Barret Rhoden <brho@cs.berkeley.edu> |
| * See LICENSE for details. |
| * |
| * fs_file: structs and helpers for files for 9ns devices |
| */ |
| |
| #include <fs_file.h> |
| #include <kmalloc.h> |
| #include <string.h> |
| #include <stdio.h> |
| #include <assert.h> |
| #include <error.h> |
| #include <umem.h> |
| #include <pmap.h> |
| |
| /* Initializes a zalloced fs_file. The caller is responsible for filling in |
| * dir, except for name. Most fields are fine with being zeroed. Note the kref |
| * == 0 too. */ |
| void fs_file_init(struct fs_file *f, const char *name, struct fs_file_ops *ops) |
| { |
| qlock_init(&f->qlock); |
| fs_file_set_basename(f, name); |
| f->ops = ops; |
| /* TODO: consider holding off on initializing the PM, since only walked |
| * and opened entries could use it. pm == NULL means no PM yet. |
| * Negative entries will never be used in this manner. Doing it now |
| * avoids races, though it's mostly zeroing cache-hot fields. */ |
| f->pm = &f->static_pm; |
| pm_init(f->pm, (struct page_map_operations*)ops, f); |
| } |
| |
| void fs_file_set_basename(struct fs_file *f, const char *name) |
| { |
| size_t name_len = strlen(name) + 1; |
| |
| if (name_len > KNAMELEN) |
| f->dir.name = kzmalloc(name_len, MEM_WAIT); |
| else |
| f->dir.name = f->static_name; |
| memcpy(f->dir.name, name, name_len); |
| } |
| |
| /* Technically, a reader could see the old string pointer and read it. That |
| * memory could be realloced and used for something else. But thanks to the |
| * seqctr, the reader will retry. Otherwise, we might not need the seqctr, |
| * since we never change_basename when a file is in a tree. So far. |
| * |
| * The only reader that races with setting the name is stat. Regular lookups |
| * won't see the file, since it was removed from the HT, and readdirs won't see |
| * it due to the parent's qlock. */ |
| void fs_file_change_basename(struct fs_file *f, const char *name) |
| { |
| char *old_name = NULL; |
| char *new_name = NULL; |
| size_t name_len = strlen(name) + 1; |
| |
| if (name_len > KNAMELEN) |
| new_name = kzmalloc(name_len, MEM_WAIT); |
| qlock(&f->qlock); |
| if (f->dir.name != f->static_name) |
| old_name = f->dir.name; |
| if (new_name) |
| f->dir.name = new_name; |
| else |
| f->dir.name = f->static_name; |
| memcpy(f->dir.name, name, name_len); |
| /* TODO: if we store the hash of the name in the file, do so here. */ |
| qunlock(&f->qlock); |
| kfree(old_name); |
| } |
| |
| /* Helper for building a dir. Caller sets qid path and vers. YMMV. */ |
| void fs_file_init_dir(struct fs_file *f, int dir_type, int dir_dev, |
| struct username *user, int perm) |
| { |
| struct dir *dir = &f->dir; |
| |
| if (perm & DMDIR) |
| dir->qid.type |= QTDIR; |
| if (perm & DMAPPEND) |
| dir->qid.type |= QTAPPEND; |
| if (perm & DMEXCL) |
| dir->qid.type |= QTEXCL; |
| if (perm & DMSYMLINK) |
| dir->qid.type |= QTSYMLINK; |
| /* dir->mode stores all the DM bits, but note that userspace can only |
| * affect the permissions (S_PMASK) bits. */ |
| dir->mode = perm; |
| __set_acmtime(f, FSF_ATIME | FSF_BTIME | FSF_MTIME | FSF_CTIME); |
| dir->length = 0; |
| /* TODO: this is a mess if you use anything other than eve. If you use |
| * a process, that memory is sitting in the proc struct, but we have |
| * weak refs on it. What happens when that proc exits? Disaster. */ |
| assert(user == &eve); |
| dir->uid = user->name; |
| dir->gid = user->name; |
| dir->muid = user->name; |
| } |
| |
| static char *copy_str(const char *s) |
| { |
| char *ret; |
| size_t sz; |
| |
| if (!s) |
| return NULL; |
| sz = strlen(s) + 1; |
| ret = kmalloc(sz, MEM_WAIT); |
| memcpy(ret, s, sz); |
| return ret; |
| } |
| |
| /* Deep copies the contents of dir into the fs_file's dir. */ |
| void fs_file_copy_from_dir(struct fs_file *f, struct dir *dir) |
| { |
| memcpy(&f->dir, dir, sizeof(struct dir)); |
| fs_file_set_basename(f, dir->name); |
| /* TODO: sort out usernames. Not only are these just eve, but they are |
| * not struct user or something and they ignore whatever the name was |
| * from the remote end. */ |
| f->dir.uid = eve.name; |
| f->dir.gid = eve.name; |
| f->dir.muid = eve.name; |
| f->dir.ext = copy_str(dir->ext); |
| } |
| |
| void cleanup_fs_file(struct fs_file *f) |
| { |
| if (f->dir.name != f->static_name) |
| kfree(f->dir.name); |
| /* TODO: Not sure if these will be refcounted objects in the future. |
| * Keep this in sync with other code that manages/sets uid/gid/muid. */ |
| f->dir.uid = NULL; |
| f->dir.gid = NULL; |
| f->dir.muid = NULL; |
| if (f->dir.ext) |
| kfree(f->dir.ext); |
| f->dir.ext = NULL; |
| pm_destroy(f->pm); |
| /* Might share mappings in the future. Catch it here. */ |
| assert(f->pm == &f->static_pm); |
| } |
| |
| void __set_acmtime_to(struct fs_file *f, int which, struct timespec *t) |
| { |
	/* WRITE_ONCE, due to lockless peekers */
| if (which & FSF_ATIME) { |
| WRITE_ONCE(f->dir.atime.tv_sec, t->tv_sec); |
| WRITE_ONCE(f->dir.atime.tv_nsec, t->tv_nsec); |
| } |
| if (which & FSF_BTIME) { |
| WRITE_ONCE(f->dir.btime.tv_sec, t->tv_sec); |
| WRITE_ONCE(f->dir.btime.tv_nsec, t->tv_nsec); |
| } |
| if (which & FSF_CTIME) { |
| WRITE_ONCE(f->dir.ctime.tv_sec, t->tv_sec); |
| WRITE_ONCE(f->dir.ctime.tv_nsec, t->tv_nsec); |
| } |
| if (which & FSF_MTIME) { |
| WRITE_ONCE(f->dir.mtime.tv_sec, t->tv_sec); |
| WRITE_ONCE(f->dir.mtime.tv_nsec, t->tv_nsec); |
| } |
| } |
| |
| /* Caller should hold f's qlock */ |
| void __set_acmtime(struct fs_file *f, int which) |
| { |
| struct timespec now = nsec2timespec(epoch_nsec()); |
| |
| __set_acmtime_to(f, which, &now); |
| } |
| |
| /* Recall that the frontend always has the most up-to-date info. This gets |
| * synced to the backend when we flush or fsync. */ |
| void set_acmtime_to(struct fs_file *f, int which, struct timespec *t) |
| { |
| ERRSTACK(1); |
| |
| qlock(&f->qlock); |
| if (waserror()) { |
| qunlock(&f->qlock); |
| nexterror(); |
| } |
| if ((which & FSF_ATIME) && !caller_has_file_perms(f, O_READ)) |
| error(EPERM, "insufficient perms to set atime"); |
| if ((which & FSF_BTIME) && !caller_is_username(f->dir.uid)) |
| error(EPERM, "insufficient perms to set btime"); |
| if ((which & FSF_CTIME) && !caller_has_file_perms(f, O_WRITE)) |
| error(EPERM, "insufficient perms to set ctime"); |
| if ((which & FSF_MTIME) && !caller_has_file_perms(f, O_WRITE)) |
| error(EPERM, "insufficient perms to set mtime"); |
| __set_acmtime_to(f, which, t); |
| qunlock(&f->qlock); |
| poperror(); |
| } |
| |
| void set_acmtime_noperm(struct fs_file *f, int which) |
| { |
| struct timespec now = nsec2timespec(epoch_nsec()); |
| |
| /* <3 atime. We'll go with an hour resolution, like NTFS. */ |
| if (which == FSF_ATIME) { |
| if (now.tv_sec < ACCESS_ONCE(f->dir.atime.tv_sec) + 3600) |
| return; |
| } |
| qlock(&f->qlock); |
| __set_acmtime_to(f, which, &now); |
| qunlock(&f->qlock); |
| } |
| |
| size_t fs_file_stat(struct fs_file *f, uint8_t *m_buf, size_t m_buf_sz) |
| { |
| size_t ret; |
| |
| qlock(&f->qlock); |
| ret = convD2M(&f->dir, m_buf, m_buf_sz); |
| qunlock(&f->qlock); |
| if (ret <= BIT16SZ) |
| error(EINVAL, "buffer too small for stat"); |
| return ret; |
| } |
| |
| /* Helper: update file metadata after a write */ |
| static void write_metadata(struct fs_file *f, off64_t offset, |
| bool always_update_len) |
| { |
| qlock(&f->qlock); |
| f->flags |= FSF_DIRTY; |
| if (always_update_len || (offset > f->dir.length)) |
| WRITE_ONCE(f->dir.length, offset); |
| __set_acmtime(f, FSF_MTIME | FSF_CTIME); |
| qunlock(&f->qlock); |
| } |
| |
| /* Punches a hole from begin to end. Pages completely in the hole will be |
| * removed. Otherwise, the edges will be zeroed. |
| * |
| * Concurrent truncates with reads and writes can lead to weird data. |
| * truncate()/punch_hole will attempt to remove data in page-sized chunks. |
| * Concurrent users (with a PM refcnt, under the current code) will prevent |
| * removal. punch_hole will memset those areas to 0, similar to a concurrent |
| * write. |
| * |
| * read() will return data that is up to the file's length. write() and |
| * punch_hole() will add or remove data and set the length. When adding data |
| * (write), we add it first, then set the len. When removing data, we set the |
| * len, then remove it. If you mix those ops, the len could be set above an |
| * area where the data is still being mucked with. read/write/mmap all grab |
| * references to the PM's page slot, locking the page in the page cache for a |
 * little while. Truncate often won't remove those pages, but it will try to
 * zero them. Reads and mmaps will check the length on their own, while it is
 * being changed by other ops.
| * |
| * A few examples: |
| * - Trunc to 0 during write to N. A reader might get zeros instead of the data |
| * written (trunc was dropping/zeroing the pages after write wrote them). |
| * - Trunc to 0, trunc back to N, with concurrent reads/mmaps of the area in |
| * between: a reader might see the old data or tears in the data. |
| * - mmaps of pages in a region that gets hole-punched and faults at the same |
| * time might not get a SIGBUS / ESPIPE. That length check is best effort. |
| * - After we remove hole pages from the page cache, but before we tell the |
| * backend, a read/write/mmap-fault to a page in the hole could fetch the old |
| * data from the backend before the FS op removes the data from the backend, |
| * and we'd end up with some old data. The root issue here is that the |
| * frontend is a cache, and it has the most recent version of the data. In |
| * the case of hole punching, we want there to be an absence of data. |
| * Technically, we could have zeroed pages, but we don't want the overhead of |
| * that. So we drop the pages - that situation looks the same as not having |
| * the data in the cache/frontend. |
| * |
| * To prevent these things, we could qlock the entire file during all ops, or |
| * even just for trunc, write, and loading pages into the PM for read. That was |
| * my first version of this. But you can imagine backends that don't require |
| * this sort of serialization (e.g. ramfs, future #mnts, etc), and it |
| * complicates some mmap / pagemap code. If you want the qlock to protect the |
 * invariants (one of which is that the file's contents below len are always
| * valid; another is that hole punches don't keep old data), then we can add |
| * some sort of file locking. |
| */ |
| static void fs_file_punch_hole(struct fs_file *f, off64_t begin, off64_t end) |
| { |
| size_t first_pg_idx, last_pg_idx, nr_pages, zero_amt; |
| struct page *page; |
| int error; |
| |
| /* Caller should check for this */ |
| assert((long)begin >= 0); |
| assert((long)end >= 0); |
| if (end <= begin) |
| return; |
| /* We're punching for the range [begin, end), but inclusive for the |
| * pages: [first_pg_idx, last_pg_idx]. */ |
| first_pg_idx = LA2PPN(begin); |
| last_pg_idx = LA2PPN(ROUNDUP(end, PGSIZE)) - 1; |
| nr_pages = last_pg_idx - first_pg_idx + 1; |
| if (PGOFF(begin)) { |
| error = pm_load_page(f->pm, first_pg_idx, &page); |
| if (error) |
| error(-error, "punch_hole pm_load_page failed"); |
| zero_amt = MIN(PGSIZE - PGOFF(begin), end - begin); |
| memset(page2kva(page) + PGOFF(begin), 0, zero_amt); |
| atomic_or(&page->pg_flags, PG_DIRTY); |
| pm_put_page(page); |
| first_pg_idx++; |
| nr_pages--; |
| if (!nr_pages) |
| return; |
| } |
| if (PGOFF(end)) { |
		/* If this unaligned end is beyond the EOF, we might pull in a
		 * page of zeros, then zero the first part of it. */
| error = pm_load_page(f->pm, last_pg_idx, &page); |
| if (error) |
| error(-error, "punch_hole pm_load_page failed"); |
| memset(page2kva(page), 0, PGOFF(end)); |
| atomic_or(&page->pg_flags, PG_DIRTY); |
| pm_put_page(page); |
| last_pg_idx--; |
| nr_pages--; |
| if (!nr_pages) |
| return; |
| } |
| pm_remove_or_zero_pages(f->pm, first_pg_idx, nr_pages); |
| /* After we removed the pages from the PM, but before we tell the |
| * backend, someone could load a backend page. Note that we only tell |
| * the backend about the intermediate pages - we already dealt with the |
	 * edge pages above, and the PM has the latest, dirty version of them. */
| f->ops->punch_hole(f, first_pg_idx << PGSHIFT, |
| (first_pg_idx + nr_pages) << PGSHIFT); |
| } |
| |
| void fs_file_truncate(struct fs_file *f, off64_t to) |
| { |
| off64_t old_len = fs_file_get_length(f); |
| |
| fs_file_perm_check(f, O_WRITE); |
| if ((to > old_len) && !f->ops->can_grow_to(f, to)) |
| error(EINVAL, "can't grow file to %lu bytes", to); |
| write_metadata(f, to, true); |
| if (to < old_len) { |
| /* Round up the old_len to avoid making an unnecessary partial |
| * page of zeros at the end of the file. */ |
| fs_file_punch_hole(f, to, ROUNDUP(old_len, PGSIZE)); |
| } |
| } |
| |
| /* Standard read. We sync with write, in that once the length is set, we'll |
| * attempt to read those bytes. */ |
| size_t fs_file_read(struct fs_file *f, uint8_t *buf, size_t count, |
| off64_t offset) |
| { |
| ERRSTACK(1); |
| struct page *page; |
| size_t copy_amt, pg_off, pg_idx, total_remaining; |
| volatile size_t so_far = 0; /* volatile for waserror */ |
| const uint8_t *buf_end = buf + count; |
| int error; |
| |
| /* These errors should have been caught by higher level code */ |
| if ((uintptr_t)buf + count < (uintptr_t)buf) |
| panic("Bad buf %p + count %p", buf, count); |
| if (offset + count < offset) |
| panic("Bad offset %p + count %p", offset, count); |
| if (waserror()) { |
| if (so_far) { |
| poperror(); |
| return so_far; |
| } |
| nexterror(); |
| } |
| while (buf < buf_end) { |
| /* Check early, so we don't load pages beyond length needlessly. |
| * The PM/FSF op might just create zeroed pages when asked. */ |
| if (offset + so_far >= fs_file_get_length(f)) |
| break; |
| pg_off = PGOFF(offset + so_far); |
| pg_idx = LA2PPN(offset + so_far); |
| error = pm_load_page(f->pm, pg_idx, &page); |
| if (error) |
| error(-error, "read pm_load_page failed"); |
| copy_amt = MIN(PGSIZE - pg_off, buf_end - buf); |
		/* Lockless peek. Check the len so we don't read beyond EOF.
| * We have a page, but we don't necessarily have access to all |
| * of it. */ |
| total_remaining = fs_file_get_length(f) - (offset + so_far); |
| if (copy_amt > total_remaining) { |
| copy_amt = total_remaining; |
| buf_end = buf + copy_amt; |
| } |
| memcpy_to_safe(buf, page2kva(page) + pg_off, copy_amt); |
| buf += copy_amt; |
| so_far += copy_amt; |
| pm_put_page(page); |
| } |
| if (so_far) |
| set_acmtime_noperm(f, FSF_ATIME); |
| poperror(); |
| return so_far; |
| } |
| |
| size_t fs_file_write(struct fs_file *f, const uint8_t *buf, size_t count, |
| off64_t offset) |
| { |
| ERRSTACK(1); |
| struct page *page; |
| size_t copy_amt, pg_off, pg_idx; |
| volatile size_t so_far = 0; /* volatile for waserror */ |
| const uint8_t *buf_end = buf + count; |
| int error; |
| |
| /* These errors should have been caught by higher level code */ |
| if ((uintptr_t)buf + count < (uintptr_t)buf) |
| panic("Bad buf %p + count %p", buf, count); |
| if (offset + count < offset) |
| panic("Bad offset %p + count %p", offset, count); |
| if (waserror()) { |
| if (so_far) { |
| write_metadata(f, offset + so_far, false); |
| poperror(); |
| return so_far; |
| } |
| nexterror(); |
| }; |
| if (offset + count > fs_file_get_length(f)) { |
| if (!f->ops->can_grow_to(f, offset + count)) |
| error(EINVAL, "can't write file to %lu bytes", offset + |
| count); |
| } |
| while (buf < buf_end) { |
| pg_off = PGOFF(offset + so_far); |
| pg_idx = LA2PPN(offset + so_far); |
| error = pm_load_page(f->pm, pg_idx, &page); |
| if (error) |
| error(-error, "write pm_load_page failed"); |
| copy_amt = MIN(PGSIZE - pg_off, buf_end - buf); |
| /* TODO: If you ask the kernel to write from a user address that |
| * will page fault, the memcpy will fail and we'll move on to |
| * the next region. To avoid leaving a chunk of uninitialized |
| * memory, we'll zero it out in the page cache. Otherwise the |
| * user could come back and read old kernel data. |
| * |
| * The real fix will be to have the kernel throw an error if it |
| * was a segfault or block if it wasn't. Note that when the |
| * kernel attempts to access the user's page, it does so with a |
| * handle_page_fault_nofile, which won't attempt to handle |
| * file-backed VMRs *even if* the file is in the page cache. |
| * Yikes! */ |
| if (memcpy_from_safe(page2kva(page) + pg_off, buf, copy_amt)) |
| memset(page2kva(page) + pg_off, 0, copy_amt); |
| buf += copy_amt; |
| so_far += copy_amt; |
| atomic_or(&page->pg_flags, PG_DIRTY); |
| pm_put_page(page); |
| } |
| assert(buf == buf_end); |
| assert(count == so_far); |
| /* We set the len *after* writing for our lockless reads. If we set len |
| * before, then read() could start as soon as we loaded the page (all |
| * zeros), but before we wrote the actual data. They'd get zeros |
| * instead of what we added. */ |
| write_metadata(f, offset + so_far, false); |
| poperror(); |
| return so_far; |
| } |
| |
| static void wstat_mode(struct fs_file *f, int new_mode) |
| { |
| ERRSTACK(1); |
| int mode; |
| |
| qlock(&f->qlock); |
| if (waserror()) { |
| qunlock(&f->qlock); |
| nexterror(); |
| } |
| if (!caller_is_username(f->dir.uid)) |
| error(EPERM, "wrong user for wstat, need %s", f->dir.uid); |
| /* Only allowing changes in permissions, not random stuff like whether |
| * it is a directory or symlink. */ |
| static_assert(!(DMMODE_BITS & S_PMASK)); |
| mode = (f->dir.mode & ~S_PMASK) | (new_mode & S_PMASK); |
| WRITE_ONCE(f->dir.mode, mode); |
| __set_acmtime(f, FSF_CTIME); |
| qunlock(&f->qlock); |
| poperror(); |
| } |
| |
| size_t fs_file_wstat(struct fs_file *f, uint8_t *m_buf, size_t m_buf_sz) |
| { |
	ERRSTACK(1);
	struct dir *m_dir;
| size_t m_sz; |
| |
	/* Common trick in wstats: we want the dir and any strings in the M.
	 * The strings are smaller than the entire M (which is the strings
	 * plus the real dir M). The strings will be placed right after the
	 * dir (dir[1]). */
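	/* Rough layout of the buffer we're about to allocate (string sizes
	 * are illustrative):
	 *
	 *	[ struct dir ][ name\0 uid\0 gid\0 muid\0 ... ]
	 *	^ &m_dir[0]   ^ (char*)&m_dir[1]
	 *
	 * convM2D() fills in the struct and points its char * members into
	 * the string area after it, so one kfree() releases everything. */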
	m_dir = kzmalloc(sizeof(struct dir) + m_buf_sz, MEM_WAIT);
	m_sz = convM2D(m_buf, m_buf_sz, &m_dir[0], (char*)&m_dir[1]);
	if (!m_sz) {
		kfree(m_dir);
		error(ENODATA, "couldn't convM2D");
	}
	if (waserror()) {
		kfree(m_dir);
		nexterror();
	}
	/* We'll probably have similar issues for all of the strings. At that
	 * point, we might not even bother reading the strings in. */
	if (!emptystr(m_dir->name))
		error(EINVAL, "do not rename with wstat");
	if (m_dir->mode != -1)
		wstat_mode(f, m_dir->mode);
	if (m_dir->length != -1)
		fs_file_truncate(f, m_dir->length);
	if ((int64_t)m_dir->atime.tv_sec != -1)
		set_acmtime_to(f, FSF_ATIME, &m_dir->atime);
	if ((int64_t)m_dir->btime.tv_sec != -1)
		set_acmtime_to(f, FSF_BTIME, &m_dir->btime);
	if ((int64_t)m_dir->ctime.tv_sec != -1)
		set_acmtime_to(f, FSF_CTIME, &m_dir->ctime);
	if ((int64_t)m_dir->mtime.tv_sec != -1)
		set_acmtime_to(f, FSF_MTIME, &m_dir->mtime);
	/* TODO: handle uid/gid/muid changes */
	poperror();
	kfree(m_dir);
	return m_sz;
}