| /* Copyright (c) 2018 Google Inc |
| * Barret Rhoden <brho@cs.berkeley.edu> |
| * See LICENSE for details. |
| * |
| * tree_file: structs and helpers for a tree-based filesystem for 9ns devices. |
| */ |
| |
| #include <tree_file.h> |
| #include <kmalloc.h> |
| #include <string.h> |
| #include <stdio.h> |
| #include <assert.h> |
| #include <error.h> |
| |
| /* Adds to the LRU if it was not on it. |
| * |
| * Caller holds the TF lock or o/w knows it has the only ref to tf. */ |
| static void __add_to_lru(struct tree_file *tf) |
| { |
| struct walk_cache *wc = &tf->tfs->wc; |
| |
| if (tf->flags & TF_F_ON_LRU) |
| return; |
| tf->flags |= TF_F_ON_LRU; |
| spin_lock(&wc->lru_lock); |
| list_add_tail(&tf->lru, &wc->lru); |
| spin_unlock(&wc->lru_lock); |
| } |
| |
| /* Removes from the LRU if it was on it. |
| * |
| * Caller holds the TF lock or o/w knows it has the only ref to tf. */ |
| static void __remove_from_lru(struct tree_file *tf) |
| { |
| struct walk_cache *wc = &tf->tfs->wc; |
| |
| if (!(tf->flags & TF_F_ON_LRU)) |
| return; |
| assert(kref_refcnt(&tf->kref) == 0); |
| tf->flags &= ~TF_F_ON_LRU; |
| spin_lock(&wc->lru_lock); |
| list_del(&tf->lru); |
| spin_unlock(&wc->lru_lock); |
| } |
| |
| /* Caller holds the parent's qlock. Separate helpers here in case we track |
| * nr_children. */ |
| static void __add_to_parent_list(struct tree_file *parent, |
| struct tree_file *child) |
| { |
| list_add(&child->siblings, &parent->children); |
| } |
| |
| /* Caller holds the parent's qlock */ |
| static void __remove_from_parent_list(struct tree_file *parent, |
| struct tree_file *child) |
| { |
| list_del(&child->siblings); |
| } |
| |
| /* Safely grabs a kref on TF, possibly resurrecting from 0, at which point the |
| * file would be on an LRU list. This syncs with tree removers, including |
| * unlinking from the tree on remove and LRU cache pruners. Returns true if we |
| * got the ref, false if we lost and the file was disconnected. */ |
| bool tf_kref_get(struct tree_file *tf) |
| { |
| spin_lock(&tf->lifetime); |
| if (tf->flags & TF_F_DISCONNECTED) { |
| spin_unlock(&tf->lifetime); |
| return false; |
| } |
| __remove_from_lru(tf); |
| __kref_get(&tf->kref, 1); |
| tf->flags |= TF_F_HAS_BEEN_USED; |
| spin_unlock(&tf->lifetime); |
| return true; |
| } |
| |
| void tf_kref_put(struct tree_file *tf) |
| { |
| kref_put(&tf->kref); |
| } |
| |
| static void __tf_free(struct tree_file *tf) |
| { |
| struct tree_file *parent = tf->parent; |
| struct tree_filesystem *tfs = tf->tfs; |
| |
| tf->tfs->tf_ops.free(tf); |
| if (tf->flags & TF_F_IS_ROOT) { |
| assert(tfs->root == tf); |
| assert(!parent); |
| } |
| cleanup_fs_file((struct fs_file*)tf); |
| kfree(tf); |
	/* The reason for decreffing the parent now is for convenience on
	 * releasing. When we unlink the child in the LRU pruner, we don't
	 * want to release the parent immediately while we hold the parent's
	 * qlock (and other locks). */
| if (parent) |
| tf_kref_put(parent); |
| } |
| |
| static void __tf_free_rcu(struct rcu_head *head) |
| { |
| struct tree_file *tf = container_of(head, struct tree_file, rcu); |
| |
| __tf_free(tf); |
| } |
| |
| static void tf_release(struct kref *kref) |
| { |
| struct tree_file *tf = container_of(kref, struct tree_file, kref); |
| |
| assert(!(tf->flags & TF_F_NEGATIVE)); |
| |
| spin_lock(&tf->lifetime); |
| if (kref_refcnt(&tf->kref) > 0) { |
| /* Someone resurrected after we decreffed to 0. */ |
| assert(!(tf->flags & TF_F_ON_LRU)); |
| spin_unlock(&tf->lifetime); |
| return; |
| } |
| if (!(tf->flags & (TF_F_DISCONNECTED | TF_F_IS_ROOT))) { |
| /* It's possible that we paused before locking, then another |
| * thread upped, downed, and put it on the LRU list already. |
| * The helper deals with that. */ |
| __add_to_lru(tf); |
| spin_unlock(&tf->lifetime); |
| return; |
| } |
| spin_unlock(&tf->lifetime); |
| /* Need RCU, since we could have had a reader who saw the object and |
| * still needs to try to kref (and fail). call_rcu, since we can't |
| * block. */ |
| call_rcu(&tf->rcu, __tf_free_rcu); |
| } |
| |
| static unsigned long hash_string(const char *name) |
| { |
| unsigned long hash = 5381; |
| |
| for (const char *p = name; *p; p++) { |
| /* hash * 33 + c, djb2's technique */ |
| hash = ((hash << 5) + hash) + *p; |
| } |
| return hash; |
| } |
| |
| static void wc_init(struct walk_cache *wc) |
| { |
| spinlock_init(&wc->lru_lock); |
| INIT_LIST_HEAD(&wc->lru); |
| spinlock_init(&wc->ht_lock); |
| wc->ht = wc->static_ht; |
| hash_init_hh(&wc->hh); |
| for (int i = 0; i < wc->hh.nr_hash_lists; i++) |
| INIT_HLIST_HEAD(&wc->ht[i]); |
| } |
| |
| static void wc_destroy(struct walk_cache *wc) |
| { |
| assert(list_empty(&wc->lru)); |
| for (int i = 0; i < wc->hh.nr_hash_lists; i++) |
| assert(hlist_empty(&wc->ht[i])); |
| if (wc->ht != wc->static_ht) |
| kfree(wc->ht); |
| } |
| |
| /* Looks up the child of parent named 'name' in the walk cache hash table. |
| * Caller needs to hold an rcu read lock. */ |
| static struct tree_file *wc_lookup_child(struct tree_file *parent, |
| const char *name) |
| { |
| struct walk_cache *wc = &parent->tfs->wc; |
| unsigned long hash_val = hash_string(name); |
| struct hlist_head *bucket; |
| struct tree_file *i; |
| |
	bucket = &wc->ht[hash_val % wc->hh.nr_hash_lists];
| hlist_for_each_entry_rcu(i, bucket, hash) { |
| /* Note 'i' is an rcu protected pointer. That deref is safe. |
| * i->parent is also a pointer that in general we want to |
| * protect. In this case, even though we don't dereference it, |
| * we want a pointer that is good enough to dereference so we |
| * can do the comparison. */ |
| if (rcu_dereference(i->parent) != parent) |
| continue; |
| /* The file's name should never change while it is in the table, |
| * so no need for a seq-reader. Can't assert though, since |
| * there are valid reasons for other seq lockers. */ |
| if (!strcmp(tree_file_to_name(i), name)) |
| return i; |
| } |
| return NULL; |
| } |
| |
| /* Caller should hold the parent's qlock */ |
| static void wc_insert_child(struct tree_file *parent, struct tree_file *child) |
| { |
| struct walk_cache *wc = &parent->tfs->wc; |
| unsigned long hash_val = hash_string(tree_file_to_name(child)); |
| struct hlist_head *bucket; |
| |
| assert(child->parent == parent); /* catch bugs from our callers */ |
| /* TODO: consider bucket locks and/or growing the HT. Prob need a |
| * seq_ctr in the WC, used on the read side during resizing. Removal |
| * probably would need something other than the bucket lock too |
| * (confusion about which bucket during the op). */ |
| spin_lock(&wc->ht_lock); |
	bucket = &wc->ht[hash_val % wc->hh.nr_hash_lists];
| hlist_add_head_rcu(&child->hash, bucket); |
| spin_unlock(&wc->ht_lock); |
| } |
| |
| /* Caller should hold the parent's qlock */ |
| static void wc_remove_child(struct tree_file *parent, struct tree_file *child) |
| { |
| struct walk_cache *wc = &parent->tfs->wc; |
| |
| assert(child->parent == parent); /* catch bugs from our callers */ |
| spin_lock(&wc->ht_lock); |
| hlist_del_rcu(&child->hash); |
| spin_unlock(&wc->ht_lock); |
| } |
| |
/* Helper: returns a refcounted pointer to the potential parent, or NULL.
| * |
| * Caller needs to qlock the parent and recheck tf->parent. Callers always need |
| * to get a kref on the parent. We can rcu-read the parent and *attempt* to |
| * qlock under rcu, but we might block. Then, say, the TF and the parent got |
| * removed, and *then* we get the qlock. We're too late. */ |
| static struct tree_file *__tf_get_potential_parent(struct tree_file *tf) |
| { |
| struct tree_file *parent; |
| |
| rcu_read_lock(); |
| parent = rcu_dereference(tf->parent); |
| if (!parent) { |
| /* the root of the tree has no parent */ |
| rcu_read_unlock(); |
| return NULL; |
| } |
| if (!tf_kref_get(parent)) |
| parent = NULL; |
| rcu_read_unlock(); |
| return parent; |
| } |
| |
| /* Returns a refcounted and qlocked parent for child. NULL on failure. */ |
| static struct tree_file *get_locked_and_kreffed_parent(struct tree_file *child) |
| { |
| struct tree_file *parent; |
| |
| parent = __tf_get_potential_parent(child); |
| if (!parent) |
| return NULL; |
| qlock(&parent->file.qlock); |
	/* Checking that parent == child->parent isn't enough here. That works
| * for rename, but not removal/unlink. Older versions of TF code |
| * cleared child->parent, but now that's dealt with in tf_free. |
| * |
| * We're doing a lockless peek at child's flags. We hold the potential |
| * parent's lock, so if they are ours, no one will be messing with the |
| * disconnected flag. If they are messing with it, then parent != |
	 * child->parent. Also, once disconnected is set, it is never cleared. */
| if ((child->flags & TF_F_DISCONNECTED) || (parent != child->parent)) { |
| qunlock(&parent->file.qlock); |
| tf_kref_put(parent); |
| return NULL; |
| } |
| return parent; |
| } |
| |
| static bool __mark_disconnected(struct tree_file *tf) |
| { |
| bool need_to_free; |
| |
| spin_lock(&tf->lifetime); |
| tf->flags |= TF_F_DISCONNECTED; |
| __remove_from_lru(tf); |
| need_to_free = kref_refcnt(&tf->kref) == 0; |
| spin_unlock(&tf->lifetime); |
| return need_to_free; |
| } |
| |
| /* Disconnects child from the in-memory tree structure (i.e. only the front |
| * end). Caller holds the parent qlock. Assumes child is a child of parent. |
| * Once you disconnect, you can't touch the child object. |
| * |
| * Racing with concurrent lookups, who might grab a ref if they get in before |
| * DISCONNECTED, and racing with release (after ref = 0), who might free, if it |
| * was already unlinked. */ |
| static void __disconnect_child(struct tree_file *parent, |
| struct tree_file *child) |
| { |
| bool need_to_free; |
| |
| need_to_free = __mark_disconnected(child); |
| /* Note child->parent is still set. We clear that in __tf_free. */ |
| __remove_from_parent_list(parent, child); |
| wc_remove_child(parent, child); |
| if (need_to_free) |
| call_rcu(&child->rcu, __tf_free_rcu); |
| } |
| |
| /* Backend will need to fill in dir, except for name. Note this has a kref == |
| * 0, but is not on the LRU yet. */ |
| struct tree_file *tree_file_alloc(struct tree_filesystem *tfs, |
| struct tree_file *parent, const char *name) |
| { |
| struct tree_file *tf; |
| |
| tf = kzmalloc(sizeof(struct tree_file), MEM_WAIT); |
| fs_file_init((struct fs_file*)tf, name, &tfs->fs_ops); |
| kref_init(&tf->kref, tf_release, 0); |
| spinlock_init(&tf->lifetime); |
| /* Need to set the parent early on, even if the child isn't linked yet, |
| * so that the TFS ops know who the parent is. */ |
| tf->parent = parent; |
| if (parent) |
| kref_get(&parent->kref, 1); |
| INIT_LIST_HEAD(&tf->children); |
| tf->can_have_children = true; |
| tf->tfs = tfs; |
| return tf; |
| } |
| |
| /* Callers must hold the parent's qlock. */ |
| static void __link_child(struct tree_file *parent, struct tree_file *child) |
| { |
| /* Devices may have already increffed ("+1 for existing"). Those that |
| * don't need to be on the LRU. We haven't linked to the parent yet, so |
| * we hold the only ref. Once we unlock in __add_to_lru, we're |
| * discoverable via that list, even though we're not linked. The lru |
| * pruner is careful to not muck with the parent's or wc linkage without |
| * qlocking the parent, which we currently hold. */ |
| if (kref_refcnt(&child->kref) == 0) |
| __add_to_lru(child); |
| /* This was set in tree_file_alloc */ |
| assert(child->parent == parent); |
| __add_to_parent_list(parent, child); |
| wc_insert_child(parent, child); |
| } |
| |
| /* Hold the dir's qlock. This probably works with any directory, though we only |
| * use it when we remove a directory. The normal way to drop negative entries |
| * involves working directly with the WC (tfs_lru_prune_neg). */ |
| static void __prune_dir_negatives(struct tree_file *dir) |
| { |
| struct tree_file *child, *temp; |
| |
| list_for_each_entry_safe(child, temp, &dir->children, siblings) { |
| if (!tree_file_is_negative(child)) |
| continue; |
| spin_lock(&child->lifetime); |
| assert(kref_refcnt(&child->kref) == 0); |
| /* This mark prevents new lookups; will disconnect it shortly */ |
| child->flags |= TF_F_DISCONNECTED; |
| spin_unlock(&child->lifetime); |
| assert(child->parent == dir); |
| __disconnect_child(dir, child); |
| } |
| } |
| |
| static void neuter_directory(struct tree_file *dir) |
| { |
| qlock(&dir->file.qlock); |
| if (dir->tfs->tf_ops.has_children(dir)) { |
| qunlock(&dir->file.qlock); |
| error(ENOTEMPTY, "can't remove dir with children"); |
| } |
| dir->can_have_children = false; |
| /* Even if we don't have real children, we might have some negatives. |
	 * Those aren't a reason to abort an rmdir; we just need to drop them. */
| __prune_dir_negatives(dir); |
| qunlock(&dir->file.qlock); |
| } |
| |
| /* Unlink a child from the tree. Last ref will clean it up, which will not be |
| * us. Caller should hold krefs on the child and parent. The child's kref will |
| * actually keep the parent's alive, but all practical callers will have the |
| * parent kreff'ed and qlocked - which is required to ensure the child is still |
| * a child of parent. */ |
| static void __unlink_child(struct tree_file *parent, struct tree_file *child) |
| { |
| /* Need to make sure concurrent creates/renames do not add children to |
| * directories that are unlinked. Note we don't undo the neutering if |
| * the backend fails. */ |
| if (tree_file_is_dir(child)) |
| neuter_directory(child); |
| /* The ramfs backend will probably decref the "+1 for existing" ref. |
| * This is OK. If that was the last ref, the child will briefly be on |
| * the LRU list (which ramfs ignores). When we disconnect, we'll yank |
| * the child back off the list and then free it (after rcu). */ |
| parent->tfs->tf_ops.unlink(parent, child); |
| __disconnect_child(parent, child); |
| } |
| |
| /* Talks to the backend and ensures a tree_file for the child exists, either |
| * positive or negative. Throws an error; doesn't return NULL. |
| * |
| * This returns with an rcu read lock sufficient to protect the returned TF, but |
| * it is *not* kreffed. It could be a negative entry, which is not kreffed. |
| * |
| * It is possible that at the moment we qunlock, someone comes along and removes |
| * the entry (LRU prune or file removal (for positive entries)). That's fine - |
| * we have an rcu read lock, just like during a regular walk, when the removal |
| * could happen anyways. */ |
| static struct tree_file *lookup_child_entry(struct tree_file *parent, |
| const char *name) |
| { |
| ERRSTACK(1); |
| struct tree_file *child; |
| |
| qlock(&parent->file.qlock); |
| child = wc_lookup_child(parent, name); |
| if (child) { |
| /* Since we last looked, but before we qlocked, someone else |
| * added our entry. */ |
| rcu_read_lock(); |
| qunlock(&parent->file.qlock); |
| return child; |
| } |
| if (!parent->can_have_children) { |
| qunlock(&parent->file.qlock); |
| error(ENOENT, "lookup failed, parent dir being removed"); |
| } |
| child = tree_file_alloc(parent->tfs, parent, name); |
| if (waserror()) { |
| /* child wasn't fully created, so freeing it may be tricky, esp |
| * on the device ops side (might see something they never |
| * created). */ |
| __tf_free(child); |
| qunlock(&parent->file.qlock); |
| nexterror(); |
| } |
| parent->tfs->tf_ops.lookup(parent, child); |
| poperror(); |
| __link_child(parent, child); |
| rcu_read_lock(); |
| qunlock(&parent->file.qlock); |
| return child; |
| } |
| |
/* Walks a tree filesystem from 'from' for an array of null-terminated names.
| * Devices will often use tree_chan_walk(), but can use this directly if they |
| * want more control. |
| * |
| * Returns the WQ of qids. On complete/successful walks, we hang a refcnted TF |
| * on wq->clone. |
| * |
| * Walks can return intermediate results, which is when there is an error but we |
| * have walked at least once. The partial return is useful for namec(), which |
| * can succeed if something was mounted on an intermediate QID. Walks that have |
| * no results set error and return NULL, which is what namec() expects. */ |
| struct walkqid *tree_file_walk(struct tree_file *from, char **name, |
| unsigned int nname) |
| { |
| ERRSTACK(2); |
| struct tree_file *at, *next; |
| struct tree_filesystem *tfs = from->tfs; |
| struct walkqid *wq; |
| |
| wq = kzmalloc(sizeof(struct walkqid) + nname * sizeof(struct qid), |
| MEM_WAIT); |
| /* A walk with zero names means "make me a copy." If we go through the |
| * regular walker, our usual tf_kref_get will fail - similar to failing |
| * if we got a walk for "foo/../" during a concurrent removal of |
| * ourselves. We'll let a walk of zero names work, but if you provide |
| * any names, the actual walk must happen. |
| * |
| * This is tricky, and confused me a little. We're returning a *TF* |
| * through wq->clone, not a chan, and that is refcounted. Normally for |
| * chans that end up with wq->clone == c, we do not incref the object |
| * hanging off the chan (see k/d/d/eventfd.c), since there is just one |
| * chan with a kreffed object hanging off e.g. c->aux. But here, |
| * wq->clone is considered a distinct refcnt to some TF, and it might be |
| * 'from.' Our *caller* needs to deal with the "wq->clone == |
| * from_chan", since they deal with chans. We deal with tree files. */ |
| if (!nname) { |
| kref_get(&from->kref, 1); |
| wq->clone = (struct chan*)from; |
| return wq; |
| } |
| if (waserror()) { |
| kfree(wq); |
| poperror(); |
| return NULL; |
| } |
| rcu_read_lock(); |
| at = from; |
| for (int i = 0; i < nname; i++) { |
| /* Walks end if we reach a regular file, i.e. you can't walk |
| * through a file, only a dir. But even if there are more |
| * names, the overall walk might succeed. E.g. a directory |
| * could be mounted on top of the current file we're at. That's |
| * just not our job. */ |
| if (tree_file_is_file(at)) { |
| if (i == 0) |
| error(ENOTDIR, "initial walk from a file"); |
| break; |
| } |
| /* Normally, symlinks stop walks, and namec's walk() will deal |
| * with it. We allow walks 'through' symlinks, but only for .. |
| * and only for the first name. This is for relative lookups so |
| * we can find the parent of a symlink. */ |
| if (tree_file_is_symlink(at)) { |
| if (i != 0) |
| break; |
| if (strcmp(name[i], "..")) |
| error(ELOOP, |
| "walk from a symlink that wasn't .."); |
| } |
| if (!caller_has_tf_perms(at, O_READ)) { |
| if (i == 0) |
| error(EACCES, "missing perm for lookup"); |
| break; |
| } |
| if (!strcmp(name[i], ".")) { |
| wq->qid[wq->nqid++] = tree_file_to_qid(at); |
| continue; |
| } |
| if (!strcmp(name[i], "..")) { |
| next = rcu_dereference(at->parent); |
| if (!next) { |
| if (tree_file_is_root(at)) { |
| wq->qid[wq->nqid++] = |
| tree_file_to_qid(at); |
| /* I think namec should never give us |
| * DOTDOT that isn't at the end of the |
| * names array. Though devwalk() seems |
| * to expect it. */ |
| if (i != nname - 1) |
| warn("namec DOTDOT bug?"); |
| continue; |
| } |
| /* We lost our parent due to a removal/rename. |
| * We might have walked enough for our walk to |
| * succeed (e.g. there's a mount point in the |
| * WQ), so we can return what we have. Though |
| * if we've done nothing, it's a failure. |
| * |
| * Note the removal could have happened a long |
| * time ago: consider an O_PATH open, then |
| * namec_from(). |
| * |
| * We also could walk up and see the *new* |
| * parent during rename(). For instance, |
| * /src/x/../y could get the old /src/y or the |
| * new /dst/y. If we want to avoid that, then |
| * we'll need some sort of sync with rename to |
| * make sure we don't get the new one. Though |
| * this can happen with namec_from(), so I'm not |
| * sure I care. */ |
| if (i == 0) |
| error(ENOENT, |
| "file lost its parent during lookup"); |
| break; |
| } |
| at = next; |
| wq->qid[wq->nqid++] = tree_file_to_qid(at); |
| continue; |
| } |
| next = wc_lookup_child(at, name[i]); |
| if (!next) { |
| /* TFSs with no backend have the entire tree in the WC |
| * HT. */ |
| if (!tfs->tf_ops.lookup) { |
| if (i == 0) |
| error(ENOENT, "file does not exist"); |
| break; |
| } |
| /* Need to hold a kref on 'at' before we |
| * rcu_read_unlock(). Our TFS op might block. */ |
| if (!tf_kref_get(at)) { |
| if (i == 0) |
| error(ENOENT, |
| "file was removed during lookup"); |
| /* WQ has a qid for 'at' from a previous loop. |
| * We can't grab a ref, so it's being |
| * removed/renamed. Yet it's still OK to report |
| * this as part of the wq, since we could fail |
| * for other reasons (thus not getting nnames) |
| * and have the same race, which we handle below |
| * (i.e. we only get a ref when it was the last |
| * name). Here we're just *noticing* the race. |
| */ |
| break; |
| } |
| rcu_read_unlock(); |
| /* propagate the error only on the first name. Note we |
| * run the 'else' case, and run the poperror case for |
| * non-errors and non-name=0-errors. */ |
| if (waserror()) { |
| tf_kref_put(at); |
| if (i == 0) |
| nexterror(); |
| /* In this case, we're discarding the waserror, |
| * same as in the other i != 0 cases. */ |
| poperror(); |
| break; |
| } |
| /* returns with an rcu_read_lock protecting 'next' */ |
| next = lookup_child_entry(at, name[i]); |
| tf_kref_put(at); |
| poperror(); |
| assert(next); |
| } |
| if (tree_file_is_negative(next)) { |
| /* lockless peek. other flag users aren't atomic. */ |
| if (!(next->flags & TF_F_HAS_BEEN_USED)) { |
| spin_lock(&next->lifetime); |
| next->flags |= TF_F_HAS_BEEN_USED; |
| spin_unlock(&next->lifetime); |
| } |
| if (i == 0) |
| error(ENOENT, "file does not exist"); |
| break; |
| } |
| at = next; |
| wq->qid[wq->nqid++] = tree_file_to_qid(at); |
| } |
| if (wq->nqid == nname || tree_file_is_symlink(at)) { |
| if (!tf_kref_get(at)) { |
| /* need to undo our last result as if we never saw it.*/ |
| wq->nqid--; |
| if (wq->nqid == 0) |
| error(ENOENT, "file was removed during lookup"); |
| } else { |
| /* Hanging the refcounted TF off the wq->clone, which is |
| * cheating. Our walker shim knows to expect this. */ |
| wq->clone = (struct chan*)at; |
| } |
| } |
| rcu_read_unlock(); |
| poperror(); |
| return wq; |
| } |
| |
| /* Most tree devices will use this for their walk op, similar to devwalk. */ |
| struct walkqid *tree_chan_walk(struct chan *c, struct chan *nc, char **name, |
| unsigned int nname) |
| { |
| struct tree_file *from, *to; |
| struct walkqid *wq; |
| |
| from = chan_to_tree_file(c); |
| wq = tree_file_walk(from, name, nname); |
| if (!wq) |
| return NULL; |
| if (!wq->clone) |
| return wq; |
| if (!nc) |
| nc = devclone(c); |
| /* Not sure if callers that specify nc must have a chan from our device. |
| */ |
| assert(nc->type == c->type); |
| to = (struct tree_file*)wq->clone; |
| nc->qid = tree_file_to_qid(to); |
| chan_set_tree_file(nc, to); |
| wq->clone = nc; |
| /* We might be returning the same chan, so there's actually just one |
| * ref. */ |
| if (wq->clone == c) |
| tf_kref_put(chan_to_tree_file(c)); |
| return wq; |
| } |
| |
/* Creates a tree file under parent with the given name, perm, and ext.
 * Returns a kreffed tree_file for the new child. Throws on error. */
| struct tree_file *tree_file_create(struct tree_file *parent, const char *name, |
| uint32_t perm, char *ext) |
| { |
| ERRSTACK(2); |
| struct tree_file *child; |
| bool got_ref; |
| struct timespec now; |
| |
| qlock(&parent->file.qlock); |
| if (waserror()) { |
| qunlock(&parent->file.qlock); |
| nexterror(); |
| } |
| if (!caller_has_tf_perms(parent, O_WRITE)) |
| error(EACCES, "missing create permission on dir"); |
| if (!parent->can_have_children) |
| error(ENOENT, "create failed, parent dir being removed"); |
| child = wc_lookup_child(parent, name); |
| if (child) { |
| /* The create(5) message fails if the file exists, which differs |
| * from the syscall. namec() handles this. */ |
| if (!tree_file_is_negative(child)) |
| error(EEXIST, "file exists"); |
| /* future lookups that find no entry will qlock. concurrent |
| * ones that see the child, even if disconnected, will see it |
| * was negative and fail. */ |
| __disconnect_child(parent, child); |
| } |
| child = tree_file_alloc(parent->tfs, parent, name); |
| if (waserror()) { |
| __tf_free(child); |
| nexterror(); |
| } |
| /* Backend will need to know the ext for its create. This gets cleaned |
| * up on error. */ |
| if (perm & DMSYMLINK) |
| kstrdup(&child->file.dir.ext, ext); |
| /* Backend will need to fill in dir, except for name. */ |
| parent->tfs->tf_ops.create(parent, child, perm); |
| now = nsec2timespec(epoch_nsec()); |
| __set_acmtime_to(&child->file, |
| FSF_ATIME | FSF_BTIME | FSF_CTIME | FSF_MTIME, &now); |
| poperror(); |
| /* At this point, the child is visible, so it must be ready to go */ |
| __link_child(parent, child); |
| got_ref = tf_kref_get(child); |
| assert(got_ref); /* we hold the qlock, no one should have removed */ |
| __set_acmtime_to(&parent->file, FSF_CTIME | FSF_MTIME, &now); |
| qunlock(&parent->file.qlock); |
| poperror(); |
| return child; |
| } |
| |
| /* Most tree devices will use this for their create op. */ |
| void tree_chan_create(struct chan *c, char *name, int omode, uint32_t perm, |
| char *ext) |
| { |
| struct tree_file *parent, *child; |
| |
| parent = chan_to_tree_file(c); |
| child = tree_file_create(parent, name, perm, ext); |
| c->qid = tree_file_to_qid(child); |
| c->mode = openmode(omode); |
| chan_set_tree_file(c, child); |
| tf_kref_put(parent); |
| } |
| |
| struct chan *tree_chan_open(struct chan *c, int omode) |
| { |
| struct tree_file *tf = chan_to_tree_file(c); |
| |
| if ((c->qid.type & QTDIR) && (omode & O_WRITE)) |
| error(EISDIR, "can't open a dir for writing"); |
| if (c->qid.type & QTSYMLINK) |
| error(ELOOP, "can't open a symlink"); |
| tree_file_perm_check(tf, omode); |
| /* TODO: if we want to support DMEXCL on dir.mode, we'll need to |
| * lock/sync on the fs_file (have a flag for FSF_IS_OPEN, handle in |
| * close). We'll also need a way to pass it in to the dir.mode during |
| * create/wstat/etc. */ |
| if (omode & O_TRUNC) |
| fs_file_truncate(&tf->file, 0); |
| c->mode = openmode(omode); |
| c->flag |= COPEN; |
| c->offset = 0; |
| return c; |
| } |
| |
| void tree_chan_close(struct chan *c) |
| { |
| struct tree_file *tf = chan_to_tree_file(c); |
| |
| tf_kref_put(tf); |
| } |
| |
| void tree_file_remove(struct tree_file *child) |
| { |
| ERRSTACK(1); |
| struct tree_file *parent; |
| |
| parent = get_locked_and_kreffed_parent(child); |
| if (!parent) |
| error(ENOENT, "%s had no parent", tree_file_to_name(child)); |
| if (waserror()) { |
| qunlock(&parent->file.qlock); |
| tf_kref_put(parent); |
| nexterror(); |
| } |
| if (!caller_has_tf_perms(parent, O_WRITE)) |
| error(EACCES, "missing remove perm for dir"); |
| __unlink_child(parent, child); |
| __set_acmtime(&parent->file, FSF_CTIME | FSF_MTIME); |
| poperror(); |
| qunlock(&parent->file.qlock); |
| tf_kref_put(parent); |
| } |
| |
| void tree_chan_remove(struct chan *c) |
| { |
| ERRSTACK(1); |
| struct tree_file *tf = chan_to_tree_file(c); |
| |
| /* sysremove expects a chan that is disconnected from the device, |
| * regardless of whether or not we fail. See sysremove(); it will clear |
| * type, ensuring our close is never called. */ |
| if (waserror()) { |
| chan_set_tree_file(c, NULL); |
| tf_kref_put(tf); /* The ref from the original walk */ |
| nexterror(); |
| } |
| tree_file_remove(tf); |
| chan_set_tree_file(c, NULL); |
| tf_kref_put(tf); /* The ref from the original walk */ |
| poperror(); |
| } |
| |
| static bool is_descendant_of(struct tree_file *descendant, |
| struct tree_file *ancestor) |
| { |
| if (!tree_file_is_dir(ancestor)) |
| return false; |
| rcu_read_lock(); |
| for (struct tree_file *i = rcu_dereference(descendant->parent); |
| i; |
| i = rcu_dereference(i->parent)) { |
| if (i == ancestor) { |
| rcu_read_unlock(); |
| return true; |
| } |
| } |
| rcu_read_unlock(); |
| return false; |
| } |
| |
| /* Caller should hold the rename mutex. This qlocks a and b, which can be the |
| * same file, and this handles lock ordering. Recall the rule: parents before |
| * children, and otherwise only the renamer can lock. */ |
| static void qlock_tree_files(struct tree_file *a, struct tree_file *b) |
| { |
| if (a == b) { |
| qlock(&a->file.qlock); |
| return; |
| } |
| if (is_descendant_of(a, b)) { |
| qlock(&b->file.qlock); |
| qlock(&a->file.qlock); |
| } else { |
| qlock(&a->file.qlock); |
| qlock(&b->file.qlock); |
| } |
| } |
| |
| static void qunlock_tree_files(struct tree_file *a, struct tree_file *b) |
| { |
| if (a != b) |
| qunlock(&a->file.qlock); |
| qunlock(&b->file.qlock); |
| } |
| |
| /* Higher layers (namec) ensure that tf and new_parent are in the same device. |
| * The device (e.g. #mnt) ensures that they are in the same instance. |
| * new_parent could be the parent of tf; everything should still work if the |
| * move is within the same directory. */ |
| void tree_file_rename(struct tree_file *tf, struct tree_file *new_parent, |
| const char *name, int flags) |
| { |
| ERRSTACK(1); |
| struct tree_file *old_parent; |
| struct tree_file *prev_dst; |
| struct timespec now; |
| |
| old_parent = __tf_get_potential_parent(tf); |
| if (!old_parent) |
| error(ENOENT, "renamed file had no parent"); |
| /* global mtx helps with a variety of weird races, including the "can't |
| * move to a subdirectory of yourself" case and the lock ordering of |
| * parents (locks flow from parent->child). */ |
| qlock(&tf->tfs->rename_mtx); |
| qlock_tree_files(old_parent, new_parent); |
| if (waserror()) { |
| qunlock_tree_files(old_parent, new_parent); |
| qunlock(&tf->tfs->rename_mtx); |
| tf_kref_put(old_parent); |
| nexterror(); |
| }; |
| if (old_parent != tf->parent) |
| error(ENOENT, "renamed file lost its parent"); |
| /* Probably a namec bug (that code isn't written yet). */ |
| assert(tree_file_is_dir(new_parent)); |
| if (!new_parent->can_have_children) |
| error(ENOENT, "target dir being removed"); |
| if (!caller_has_tf_perms(old_parent, O_WRITE)) |
| error(EACCES, "missing remove perm for src dir"); |
| if (!caller_has_tf_perms(new_parent, O_WRITE)) |
| error(EACCES, "missing create perm for dst dir"); |
| /* can't move tf to one of its subdirs */ |
| if (is_descendant_of(new_parent, tf)) |
| error(EINVAL, "can't rename to a child directory"); |
| /* We hold new_parent's qlock, so there's no worry about prev_dst |
| * disappearing, so no need for an rcu read lock. */ |
| prev_dst = wc_lookup_child(new_parent, name); |
| if (prev_dst) { |
| if (tree_file_is_dir(prev_dst)) { |
| if (!tree_file_is_dir(tf)) |
| error(EISDIR, "can't rename a file onto a dir"); |
| /* We need to ensure prev_dst is childless and remains |
| * so. That requires grabbing its qlock, but there's a |
| * potential lock ordering issue with old_parent. We |
| * could have this: new_parent/dst/x/y/z/old_parent/src. |
| * That will fail, but we need to check for that case |
| * instead of grabbing prev_dst's qlock. */ |
			if (is_descendant_of(old_parent, prev_dst))
| error(ENOTEMPTY, |
| "old_parent descends from dst"); |
| neuter_directory(prev_dst); |
| } else { |
| if (tree_file_is_dir(tf)) |
| error(ENOTDIR, |
| "can't rename a dir onto a file"); |
| } |
| } |
| /* We check with the backend first, so that it has a chance to fail |
| * early. Once we make the changes to the front end, lookups can see |
| * the effects of the change, which we can't roll back. Since we hold |
| * the parents' qlocks, no one should be able to get the info from the |
| * backend either (lookups that require the backend, readdir, etc). */ |
| tf->tfs->tf_ops.rename(tf, old_parent, new_parent, name, flags); |
| /* Similar to __disconnect_child, we don't clear tf->parent. rcu |
| * readers at TF will be able to walk up (with ..). Same with |
| * namec_from an FD. If we atomically replace tf->parent, we should be |
| * good. See tree_file_walk(). |
| * |
| * Further, we don't mark the tf disconnected. Someone might lookup |
| * from the old location, and that's fine. We just don't want issues |
| * with decrefs. */ |
| __remove_from_parent_list(old_parent, tf); |
| wc_remove_child(old_parent, tf); |
| synchronize_rcu(); |
| /* Now, no new lookups will find it at the old location. That change is |
| * not atomic wrt src, but it will be wrt dst. Importantly, no one will |
| * see /path/to/old_parent/new_basename */ |
| fs_file_change_basename((struct fs_file*)tf, name); |
| /* We're clobbering the old_parent ref, which we'll drop later */ |
| rcu_assign_pointer(tf->parent, new_parent); |
| kref_get(&new_parent->kref, 1); |
| __add_to_parent_list(new_parent, tf); |
| wc_insert_child(new_parent, tf); |
| /* Now both the prev_dst (if it existed) or the tf file are in the walk |
| * cache / HT and either could have been looked up by a concurrent |
| * reader. Readers will always get one or the other, but never see |
| * nothing. This is the atomic guarantee of rename. */ |
| if (prev_dst) { |
| __remove_from_parent_list(new_parent, prev_dst); |
| wc_remove_child(new_parent, prev_dst); |
| synchronize_rcu(); |
| /* Now no one can find prev_dst. Someone might still have a |
| * ref, or it might be on the LRU list (if kref == 0). Now we |
| * can mark disconnected. Had we disconnected earlier, then |
| * lookup code would see that and treat it as a failure. Using |
| * rcu and putting the complexity in rename was easier and |
| * simpler than changing lookup. |
| * |
| * We still need RCU here for freeing the prev_dst. We could |
| * have had an LRU pruner, etc, looking. The synchronize_rcu |
| * above only dealt with lookups via parent in this function. */ |
| if (__mark_disconnected(prev_dst)) |
| call_rcu(&prev_dst->rcu, __tf_free_rcu); |
| } |
| now = nsec2timespec(epoch_nsec()); |
| __set_acmtime_to(&old_parent->file, FSF_CTIME | FSF_MTIME, &now); |
| __set_acmtime_to(&new_parent->file, FSF_CTIME | FSF_MTIME, &now); |
	/* Can we unlock earlier? No. We need to at least hold new_parent's
	 * qlock, which was the parent of prev_dst, until prev_dst is marked
	 * disconnected. Even though prev_dst is removed from new_parent's HT,
	 * it is still in the LRU list. */
| qunlock_tree_files(old_parent, new_parent); |
| qunlock(&tf->tfs->rename_mtx); |
| poperror(); |
| tf_kref_put(old_parent); /* the original tf->parent ref we clobbered */ |
| tf_kref_put(old_parent); /* the one we grabbed when we started */ |
| } |
| |
| void tree_chan_rename(struct chan *c, struct chan *new_p_c, const char *name, |
| int flags) |
| { |
| struct tree_file *tf = chan_to_tree_file(c); |
| struct tree_file *new_parent = chan_to_tree_file(new_p_c); |
| |
| tree_file_rename(tf, new_parent, name, flags); |
| } |
| |
/* dri is a pointer to the chan->dri, which is a count of how many directory
 * entries have been read from this chan so far. 9ns handles it; we just need
 * to increment it for every successful entry. Note we ignore offset. */
| ssize_t tree_file_readdir(struct tree_file *parent, void *ubuf, size_t n, |
| off64_t offset, int *dri) |
| { |
| ERRSTACK(1); |
| struct tree_file *i; |
| size_t dir_amt, so_far = 0; |
| uint8_t *write_pos = ubuf; |
| int child_nr = 0; |
| |
| qlock(&parent->file.qlock); |
| if (waserror()) { |
| qunlock(&parent->file.qlock); |
| nexterror(); |
| } |
| list_for_each_entry(i, &parent->children, siblings) { |
| if (child_nr++ < *dri) |
| continue; |
| qlock(&i->file.qlock); |
| dir_amt = convD2M(&i->file.dir, write_pos, n - so_far); |
| qunlock(&i->file.qlock); |
| if (dir_amt <= BIT16SZ) { |
| if (!so_far) |
| error(EINVAL, "buffer to small for readdir"); |
| break; |
| } |
| write_pos += dir_amt; |
| so_far += dir_amt; |
		assert(so_far <= n);
| (*dri)++; |
| } |
| /* If we care about directory atime, we can do that here. (if so_far) */ |
| qunlock(&parent->file.qlock); |
| poperror(); |
| return so_far; |
| } |
| |
| /* Note this only works for backend-less TFSs. It calls tree_file_readdir, |
| * which only looks at the frontend's tree. */ |
| size_t tree_chan_read(struct chan *c, void *ubuf, size_t n, off64_t offset) |
| { |
| struct tree_file *tf = chan_to_tree_file(c); |
| |
| if (tree_file_is_dir(tf)) |
| return tree_file_readdir(tf, ubuf, n, offset, &c->dri); |
| return fs_file_read(&tf->file, ubuf, n, offset); |
| } |
| |
| size_t tree_chan_write(struct chan *c, void *ubuf, size_t n, off64_t offset) |
| { |
| struct tree_file *tf = chan_to_tree_file(c); |
| |
| /* sysfile.c:rwrite checked the chan's type. */ |
| assert(!tree_file_is_dir(tf)); |
| return fs_file_write(&tf->file, ubuf, n, offset); |
| } |
| |
| size_t tree_chan_stat(struct chan *c, uint8_t *m_buf, size_t m_buf_sz) |
| { |
| struct tree_file *tf = chan_to_tree_file(c); |
| |
| return fs_file_stat(&tf->file, m_buf, m_buf_sz); |
| } |
| |
| size_t tree_chan_wstat(struct chan *c, uint8_t *m_buf, size_t m_buf_sz) |
| { |
| struct tree_file *tf = chan_to_tree_file(c); |
| |
| return fs_file_wstat(&tf->file, m_buf, m_buf_sz); |
| } |
| |
| struct fs_file *tree_chan_mmap(struct chan *c, struct vm_region *vmr, int prot, |
| int flags) |
| { |
| struct fs_file *f = &chan_to_tree_file(c)->file; |
| |
| /* TODO: In the future, we'll check the prot, establish hooks with the |
| * VMR, and other things, mostly in something like fs_file_mmap, which |
| * will be able to handle mmaping something that doesn't use the page |
| * cache. For now, I'm aggressively qlocking to catch bugs. */ |
| qlock(&f->qlock); |
| if ((prot & PROT_WRITE) && (flags & MAP_SHARED)) |
| f->flags |= FSF_DIRTY; |
| qunlock(&f->qlock); |
| return f; |
| } |
| |
| unsigned long tree_chan_ctl(struct chan *c, int op, unsigned long a1, |
| unsigned long a2, unsigned long a3, |
| unsigned long a4) |
| { |
| switch (op) { |
| case CCTL_DEBUG: |
| print_lock(); |
| printk("Dumping tree_file info (frontend), dev %s chan %s:\n\n", |
| chan_dev_name(c), channame(c)); |
| __tfs_dump_tf(chan_to_tree_file(c)); |
| print_unlock(); |
| return 0; |
| default: |
| error(EINVAL, "%s does not support chanctl %d", |
| chan_dev_name(c), op); |
| } |
| } |
| |
| /* Given a tree file, construct a chan that points to the TF for the given |
| * device. Careful with this - it's for bootstrapping. */ |
| struct chan *tree_file_alloc_chan(struct tree_file *tf, struct dev *dev, |
| char *name) |
| { |
| struct chan *c; |
| |
| c = newchan(); |
| c->type = dev - devtab; |
| c->name = newcname(name); |
| kref_get(&tf->kref, 1); |
| chan_set_tree_file(c, tf); |
| c->qid = tree_file_to_qid(tf); |
| return c; |
| } |
| |
/* Caller needs to set its customizable fields: tf_ops, pm_ops, etc. root is
 * created with a ref of 1, but needs to be filled in by the particular TFS. */
| void tfs_init(struct tree_filesystem *tfs) |
| { |
| wc_init(&tfs->wc); |
| qlock_init(&tfs->rename_mtx); |
| tfs->root = tree_file_alloc(tfs, NULL, "."); |
| tfs->root->flags |= TF_F_IS_ROOT; |
| assert(!(tfs->root->flags & TF_F_ON_LRU)); |
| __kref_get(&tfs->root->kref, 1); |
| } |
| |
| void tfs_destroy(struct tree_filesystem *tfs) |
| { |
| tfs->root = NULL; /* was just freed in __tf_free() */ |
| wc_destroy(&tfs->wc); |
| } |
| |
| /* For every file except root, we hold our parent's qlock (root has no parent). |
| * This prevents TF's removal/rename and any changes to tf->parent. */ |
| static void tf_dfs_cb(struct tree_file *tf, void (*cb)(struct tree_file *tf)) |
| { |
| struct tree_file *child; |
| |
| if (tree_file_is_dir(tf)) { |
| qlock(&tf->file.qlock); |
| /* Note we don't have a kref on our child's TF - we have a weak |
| * reference (the list membership). We hold the parent's qlock, |
| * which prevents removal/unlinking/disconnecting/etc. The |
| * child's membership on the LRU list can change repeatedly. |
| * |
| * If we want to avoid holding the parent's qlock, we could grab |
| * a kref on the child. However, our list walk would be in |
| * jeopardy - both child and temp could be removed from the |
| * list. So we'd need to qlock the parent, grab krefs on all |
| * children, and put them on a local list. Also, grabbing krefs |
| * on our children will muck with the LRU list; when we're done, |
| * it would be like we sorted the LRU list in the DFS order. */ |
| list_for_each_entry(child, &tf->children, siblings) |
| tf_dfs_cb(child, cb); |
| qunlock(&tf->file.qlock); |
| } |
| if (!tree_file_is_negative(tf)) |
| cb(tf); |
| } |
| |
| /* Runs CB on all in-memory, non-negative TFs from the TFS in a depth-first |
| * search. Compared to the other CB walkers (purge, LRU, etc), this never |
| * removes/prunes/disconnects a TF from the tree. You can use it for syncing FS |
| * trees or pruning page maps (i.e. discarding non-dirty pages). |
| * |
| * One thing to note is that it qlocks the tree as part of its DFS. We can |
| * change that, but at a slight cost in complexity and tampering with the LRU |
| * list. Translation: we can do it if there's a performance problem. |
| * |
 * The tfs->root reference is also weak. It's up to the user to make sure
 * there is no concurrent tfs_destroy. Typically we're fine when called on a
 * mounted device (e.g. #tmpfs) or on a device that is never destroyed (e.g.
 * #kfs). */
| void tfs_frontend_for_each(struct tree_filesystem *tfs, |
| void (*cb)(struct tree_file *tf)) |
| { |
| tf_dfs_cb(tfs->root, cb); |
| } |
| |
| /* We should be single-user, so no need for concurrency protections. I keep |
| * them around for paranoia/future use/documentation. */ |
| static void tf_dfs_purge(struct tree_file *tf, void (*cb)(struct tree_file *tf)) |
| { |
| struct tree_file *child, *temp; |
| |
| if (tree_file_is_dir(tf)) { |
| qlock(&tf->file.qlock); |
| /* Note we don't have a kref on TF, and one of our children |
| * should decref *us* to 0. We aren't disconnected yet, and we |
| * can't be removed (parent's qlock is held), so we'll just end |
| * up on the LRU list, which is OK. |
| * |
| * Our child should remove itself from our list, so we need the |
| * _safe. |
| * |
| * Also note that the child decrefs us in a call_rcu. CB can |
| * block, and technically so can qlock, so we might run RCU |
| * callbacks while qlocked. We'll need to rcu_barrier so that |
| * our children's decrefs occur before we remove ourselves from |
| * our parent. */ |
| list_for_each_entry_safe(child, temp, &tf->children, siblings) |
| tf_dfs_purge(child, cb); |
| qunlock(&tf->file.qlock); |
| } |
| rcu_barrier(); |
| /* ramfs will drop the "+1 for existing" ref here */ |
| if (!tree_file_is_negative(tf)) |
| cb(tf); |
| spin_lock(&tf->lifetime); |
| /* should be unused, with no children. we have a ref on root, to keep |
| * the TFS around while we destroy the tree. */ |
| assert(kref_refcnt(&tf->kref) == 0 || tree_file_is_root(tf)); |
| /* This mark prevents new lookups. We'll disconnect it shortly. */ |
| tf->flags |= TF_F_DISCONNECTED; |
| spin_unlock(&tf->lifetime); |
| if (tf->parent) |
| __disconnect_child(tf->parent, tf); |
| } |
| |
| /* Purges all in-memory TFs from the TFS in a depth-first search, both positive |
| * and negative. We run CB on all non-negative TFs. It's up to the caller to |
| * ensure there is no concurrency. |
| * |
| * The caller must make sure they have an extra ref on tfs->root to keep the |
| * TFS around while it gets destroyed. The root TF will get freed when it is |
| * released, unlike other TFs that are still connected. |
| * |
| * This is for devices that want to destroy themselves, such as an unmounted |
| * #tmpfs or #gtfs/#mnt, that want to do some extra work (e.g. sync). |
| * Typically, those devices will call this after they have no users (e.g. mounts |
| * or open chans). */ |
| void tfs_frontend_purge(struct tree_filesystem *tfs, |
| void (*cb)(struct tree_file *tf)) |
| { |
| assert(kref_refcnt(&tfs->root->kref) > 0); |
| tf_dfs_purge(tfs->root, cb); |
| } |
| |
| static void print_tf(struct tree_file *tf) |
| { |
| if (tree_file_is_negative(tf)) { |
| printk("%-10s: Negative\n", tree_file_to_name(tf)); |
| return; |
| } |
| printk("%-10s: Q: %5d, R: %2d, U %s, %c%o %s\n", |
| tree_file_to_name(tf), |
| tf->file.dir.qid.path, |
| kref_refcnt(&tf->kref), |
| tf->file.dir.uid, |
| tree_file_is_dir(tf) ? 'd' |
| : tf->file.dir.mode & DMSYMLINK ? 'l' |
| : '-', |
| tf->file.dir.mode & S_PMASK, |
| tf->file.dir.mode & DMSYMLINK ? tf->file.dir.ext : "" |
| ); |
| } |
| |
| static void dump_tf(struct tree_file *tf, int tabs) |
| { |
| struct tree_file *child; |
| |
| if (!!(tf->file.dir.mode & DMSYMLINK) != |
| !!(tf->file.dir.qid.type & QTSYMLINK)) |
| warn("%s has differing symlink bits", tree_file_to_name(tf)); |
| |
| print_lock(); |
| for (int i = 0; i < tabs; i++) |
| printk(" "); |
| print_tf(tf); |
| if (tree_file_is_dir(tf)) { |
| for (int i = 0; i < tabs; i++) |
| printk(" "); |
| printk("---------------------\n"); |
| list_for_each_entry(child, &tf->children, siblings) |
| dump_tf(child, tabs + 1); |
| } |
| print_unlock(); |
| } |
| |
| void __tfs_dump(struct tree_filesystem *tfs) |
| { |
| dump_tf(tfs->root, 0); |
| } |
| |
| void __tfs_dump_tf(struct tree_file *tf) |
| { |
| dump_tf(tf, 0); |
| } |
| |
| /* Runs a callback on every non-negative TF on the LRU list, for a given |
| * snapshot of the LRU list. The CB returns true if it wants us to attempt to |
| * free the TF. One invariant is that we can never remove a TF from the tree |
| * while it is dirty; it is the job of the CB to maintain that. Note the CB can |
| * run on a TF as soon as that TF was linked to the parent (see lookup). |
| * |
| * The work list is a list of strong refs. We need to keep one in case the file |
| * is disconnected while we're running our CBs. Since we incref, we yank from |
| * the LRU list. We could design the rest of the TF code so that we stay on the |
| * LRU list throughout, but I like the invariant of "kref == 0 IFF on LRU". |
| * |
| * Since we're only on one list at a time ('wc->lru' or 'work'), we can use the |
| * lru list_head in the TF. We know that so long as we hold our kref on a TF, |
| * no one will attempt to put it back on the LRU list. */ |
| void tfs_lru_for_each(struct tree_filesystem *tfs, bool cb(struct tree_file *), |
| size_t max_tfs) |
| { |
| struct list_head work = LIST_HEAD_INIT(work); |
| struct walk_cache *wc = &tfs->wc; |
| struct tree_file *tf, *temp, *parent; |
| size_t nr_tfs = 0; |
| |
| /* We can have multiple LRU workers in flight, though a given TF will be |
| * on only one CB list at a time. */ |
| spin_lock(&wc->lru_lock); |
| list_for_each_entry_safe(tf, temp, &wc->lru, lru) { |
		/* lockless peek at the flag. once it's NEGATIVE, it never goes
		 * back */
| if (tree_file_is_negative(tf)) |
| continue; |
| /* Normal lock order is TF -> LRU. |
| * Best effort is fine for LRU. */ |
| if (!spin_trylock(&tf->lifetime)) |
| continue; |
| /* Can't be disconnected and on LRU */ |
| assert(!(tf->flags & TF_F_DISCONNECTED)); |
| assert((tf->flags & TF_F_ON_LRU)); |
| tf->flags &= ~TF_F_ON_LRU; |
| list_del(&tf->lru); |
| __kref_get(&tf->kref, 1); |
		/* The 'used' bit is what allows us to detect a user in
| * between our callback and the disconnection/freeing. It's a |
| * moot point if the CB returns false. */ |
| tf->flags &= ~TF_F_HAS_BEEN_USED; |
| spin_unlock(&tf->lifetime); |
| list_add_tail(&tf->lru, &work); |
| if (++nr_tfs >= max_tfs) |
| break; |
| } |
| spin_unlock(&wc->lru_lock); |
| |
| /* We have a snapshot of the LRU list. As soon as we unlocked a file, |
| * someone could incref it (e.g. to something > 1), and they'll set the |
| * used bit. That won't abort the CB. New TFs could be added to the |
| * LRU list. Those are ignored for this pass. */ |
| list_for_each_entry_safe(tf, temp, &work, lru) { |
| if (!cb(tf)) { |
| list_del(&tf->lru); |
| tf_kref_put(tf); |
| continue; |
| } |
| } |
| |
| /* Now we have a list of victims to be removed, so long as they haven't |
| * been used since. */ |
| list_for_each_entry_safe(tf, temp, &work, lru) { |
| parent = get_locked_and_kreffed_parent(tf); |
| if (!parent) { |
| list_del(&tf->lru); |
| tf_kref_put(tf); |
| continue; |
| } |
| spin_lock(&tf->lifetime); |
| if (tf->flags & TF_F_HAS_BEEN_USED) { |
| spin_unlock(&tf->lifetime); |
| qunlock(&parent->file.qlock); |
| tf_kref_put(parent); |
| list_del(&tf->lru); |
| tf_kref_put(tf); |
| continue; |
| } |
| tf->flags |= TF_F_DISCONNECTED; |
| /* We hold a ref, so it shouldn't have found its way back on |
| * LRU. */ |
| assert(!(tf->flags & TF_F_ON_LRU)); |
| spin_unlock(&tf->lifetime); |
| __remove_from_parent_list(parent, tf); |
| wc_remove_child(parent, tf); |
| /* If we get tired of unlocking and relocking, we could see if |
| * the next parent is the current parent before unlocking. */ |
| qunlock(&parent->file.qlock); |
| tf_kref_put(parent); |
| } |
| /* Now we have a list of refs that are all disconnected, kref == 1 |
| * (because no one used them since we increffed them when they were LRU, |
| * which was when the refcnt was 0). Each TF has a ref on its parent |
| * btw, so parents will never be on the LRU list. (leaves only). |
| * |
| * We need to synchronize_rcu() too, since we could have had lockless |
| * lookups that have pointers to TF and are waiting to notice that it is |
| * disconnected. */ |
| synchronize_rcu(); |
| list_for_each_entry_safe(tf, temp, &work, lru) { |
| assert(kref_refcnt(&tf->kref) == 1); |
| list_del(&tf->lru); |
| /* We could decref, but instead we can directly free. We know |
| * the ref == 1 and it is disconnected. Directly freeing |
| * bypasses call_rcu. */ |
| __tf_free(tf); |
| } |
| } |
| |
/* Does a one-cycle 'clock' algorithm to detect use. On a given pass, we either
 * clear HAS_BEEN_USED xor we remove it. For negative entries, the bit is set
 * when we look at an entry (use it); for positive entries, it is set when we
 * grab a reference. (We never get refs on negatives.) */
| void tfs_lru_prune_neg(struct tree_filesystem *tfs) |
| { |
| struct list_head work = LIST_HEAD_INIT(work); |
| struct walk_cache *wc = &tfs->wc; |
| struct tree_file *tf, *temp, *parent; |
| |
| spin_lock(&wc->lru_lock); |
| list_for_each_entry_safe(tf, temp, &wc->lru, lru) { |
| if (!tree_file_is_negative(tf)) |
| continue; |
| if (!spin_trylock(&tf->lifetime)) |
| continue; |
| if (tf->flags & TF_F_HAS_BEEN_USED) { |
| tf->flags &= ~TF_F_HAS_BEEN_USED; |
| spin_unlock(&tf->lifetime); |
| continue; |
| } |
| rcu_read_lock(); /* holding a spinlock, but just to be clear. */ |
| parent = rcu_dereference(tf->parent); |
| /* Again, inverting the lock order, so best effort trylock */ |
| if (!canqlock(&parent->file.qlock)) { |
| rcu_read_unlock(); |
| spin_unlock(&tf->lifetime); |
| continue; |
| } |
| __remove_from_parent_list(parent, tf); |
| wc_remove_child(parent, tf); |
| qunlock(&parent->file.qlock); |
| /* We're off the list, but our kref == 0 still. We can break |
| * that invariant since we have the only ref and are about to |
| * free the TF. */ |
| tf->flags &= ~TF_F_ON_LRU; |
| list_del(&tf->lru); |
| spin_unlock(&tf->lifetime); |
| list_add_tail(&tf->lru, &work); |
| } |
| spin_unlock(&wc->lru_lock); |
| /* Now we have a list of refs that are all unlinked (but actually not |
| * flagged DISCONNECTED; that's only necessary for positives), kref == 0 |
| * (because they are negatives). |
| * |
| * We need to synchronize_rcu() too, since we could have had lockless |
| * lookups that have pointers to TF, and they may even mark |
| * HAS_BEEN_USED. Too late. */ |
| synchronize_rcu(); |
| list_for_each_entry_safe(tf, temp, &work, lru) { |
| assert(kref_refcnt(&tf->kref) == 0); |
| list_del(&tf->lru); |
| __tf_free(tf); |
| } |
| } |