/* Copyright (c) 2018 Google Inc
* Barret Rhoden <brho@cs.berkeley.edu>
* See LICENSE for details.
*
* tree_file: structs and helpers for a tree-based filesystem for 9ns devices.
*/
#include <tree_file.h>
#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
/* Adds to the LRU if it was not on it.
*
* Caller holds the TF lock or o/w knows it has the only ref to tf. */
static void __add_to_lru(struct tree_file *tf)
{
struct walk_cache *wc = &tf->tfs->wc;
if (tf->flags & TF_F_ON_LRU)
return;
tf->flags |= TF_F_ON_LRU;
spin_lock(&wc->lru_lock);
list_add_tail(&tf->lru, &wc->lru);
spin_unlock(&wc->lru_lock);
}
/* Removes from the LRU if it was on it.
*
* Caller holds the TF lock or o/w knows it has the only ref to tf. */
static void __remove_from_lru(struct tree_file *tf)
{
struct walk_cache *wc = &tf->tfs->wc;
if (!(tf->flags & TF_F_ON_LRU))
return;
assert(kref_refcnt(&tf->kref) == 0);
tf->flags &= ~TF_F_ON_LRU;
spin_lock(&wc->lru_lock);
list_del(&tf->lru);
spin_unlock(&wc->lru_lock);
}
/* Caller holds the parent's qlock. Separate helpers here in case we track
* nr_children. */
static void __add_to_parent_list(struct tree_file *parent,
struct tree_file *child)
{
list_add(&child->siblings, &parent->children);
}
/* Caller holds the parent's qlock */
static void __remove_from_parent_list(struct tree_file *parent,
struct tree_file *child)
{
list_del(&child->siblings);
}
/* Safely grabs a kref on TF, possibly resurrecting from 0, at which point the
* file would be on an LRU list. This syncs with tree removers, including
* unlinking from the tree on remove and LRU cache pruners. Returns true if we
* got the ref, false if we lost and the file was disconnected. */
bool tf_kref_get(struct tree_file *tf)
{
spin_lock(&tf->lifetime);
if (tf->flags & TF_F_DISCONNECTED) {
spin_unlock(&tf->lifetime);
return false;
}
__remove_from_lru(tf);
__kref_get(&tf->kref, 1);
tf->flags |= TF_F_HAS_BEEN_USED;
spin_unlock(&tf->lifetime);
return true;
}
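/* Illustrative sketch of the intended usage pattern (it mirrors
* __tf_get_potential_parent() and tree_file_walk() below); 'some_tf_ptr' is a
* stand-in for any RCU-protected TF pointer:
*
*	rcu_read_lock();
*	tf = rcu_dereference(some_tf_ptr);
*	if (tf && !tf_kref_get(tf))
*		tf = NULL;	// lost the race with a remover / LRU pruner
*	rcu_read_unlock();
*	if (tf) {
*		// safe to block and use tf; tf_kref_put() when done
*	}
*/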
void tf_kref_put(struct tree_file *tf)
{
kref_put(&tf->kref);
}
static void __tf_free(struct tree_file *tf)
{
struct tree_file *parent = tf->parent;
struct tree_filesystem *tfs = tf->tfs;
tf->tfs->tf_ops.free(tf);
if (tf->flags & TF_F_IS_ROOT) {
assert(tfs->root == tf);
assert(!parent);
}
cleanup_fs_file((struct fs_file*)tf);
kfree(tf);
/* the reason for decreffing the parent now is for convenience on
* releasing. When we unlink the child from the LRU pruner, we don't
* want to release the parent immediately while we hold the parent's
* qlock (and other locks). */
if (parent)
tf_kref_put(parent);
}
static void __tf_free_rcu(struct rcu_head *head)
{
struct tree_file *tf = container_of(head, struct tree_file, rcu);
__tf_free(tf);
}
static void tf_release(struct kref *kref)
{
struct tree_file *tf = container_of(kref, struct tree_file, kref);
assert(!(tf->flags & TF_F_NEGATIVE));
spin_lock(&tf->lifetime);
if (kref_refcnt(&tf->kref) > 0) {
/* Someone resurrected after we decreffed to 0. */
assert(!(tf->flags & TF_F_ON_LRU));
spin_unlock(&tf->lifetime);
return;
}
if (!(tf->flags & (TF_F_DISCONNECTED | TF_F_IS_ROOT))) {
/* It's possible that we paused before locking, then another
* thread upped, downed, and put it on the LRU list already.
* The helper deals with that. */
__add_to_lru(tf);
spin_unlock(&tf->lifetime);
return;
}
spin_unlock(&tf->lifetime);
/* Need RCU, since we could have had a reader who saw the object and
* still needs to try to kref (and fail). call_rcu, since we can't
* block. */
call_rcu(&tf->rcu, __tf_free_rcu);
}
static unsigned long hash_string(const char *name)
{
unsigned long hash = 5381;
for (const char *p = name; *p; p++) {
/* hash * 33 + c, djb2's technique */
hash = ((hash << 5) + hash) + *p;
}
return hash;
}
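/* For example, hash("ab") folds as ((5381 * 33 + 'a') * 33) + 'b'
* = (5381 * 33 + 97) * 33 + 98 = 5863208. */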
static void wc_init(struct walk_cache *wc)
{
spinlock_init(&wc->lru_lock);
INIT_LIST_HEAD(&wc->lru);
spinlock_init(&wc->ht_lock);
wc->ht = wc->static_ht;
hash_init_hh(&wc->hh);
for (int i = 0; i < wc->hh.nr_hash_lists; i++)
INIT_HLIST_HEAD(&wc->ht[i]);
}
static void wc_destroy(struct walk_cache *wc)
{
assert(list_empty(&wc->lru));
for (int i = 0; i < wc->hh.nr_hash_lists; i++)
assert(hlist_empty(&wc->ht[i]));
if (wc->ht != wc->static_ht)
kfree(wc->ht);
}
/* Looks up the child of parent named 'name' in the walk cache hash table.
* Caller needs to hold an rcu read lock. */
static struct tree_file *wc_lookup_child(struct tree_file *parent,
const char *name)
{
struct walk_cache *wc = &parent->tfs->wc;
unsigned long hash_val = hash_string(name);
struct hlist_head *bucket;
struct tree_file *i;
bucket = &wc->ht[hash_val % wc->hh.nr_hash_bits];
hlist_for_each_entry_rcu(i, bucket, hash) {
/* Note 'i' is an rcu protected pointer. That deref is safe.
* i->parent is also a pointer that in general we want to
* protect. In this case, even though we don't dereference it,
* we want a pointer that is good enough to dereference so we
* can do the comparison. */
if (rcu_dereference(i->parent) != parent)
continue;
/* The file's name should never change while it is in the table,
* so no need for a seq-reader. Can't assert though, since
* there are valid reasons for other seq lockers. */
if (!strcmp(tree_file_to_name(i), name))
return i;
}
return NULL;
}
/* Caller should hold the parent's qlock */
static void wc_insert_child(struct tree_file *parent, struct tree_file *child)
{
struct walk_cache *wc = &parent->tfs->wc;
unsigned long hash_val = hash_string(tree_file_to_name(child));
struct hlist_head *bucket;
assert(child->parent == parent); /* catch bugs from our callers */
/* TODO: consider bucket locks and/or growing the HT. Prob need a
* seq_ctr in the WC, used on the read side during resizing. Removal
* probably would need something other than the bucket lock too
* (confusion about which bucket during the op). */
spin_lock(&wc->ht_lock);
bucket = &wc->ht[hash_val % wc->hh.nr_hash_bits];
hlist_add_head_rcu(&child->hash, bucket);
spin_unlock(&wc->ht_lock);
}
/* Caller should hold the parent's qlock */
static void wc_remove_child(struct tree_file *parent, struct tree_file *child)
{
struct walk_cache *wc = &parent->tfs->wc;
assert(child->parent == parent); /* catch bugs from our callers */
spin_lock(&wc->ht_lock);
hlist_del_rcu(&child->hash);
spin_unlock(&wc->ht_lock);
}
/* Helper: returns a refcounted pointer to the potential parent. May return 0.
*
* Caller needs to qlock the parent and recheck tf->parent. Callers always need
* to get a kref on the parent. We can rcu-read the parent and *attempt* to
* qlock under rcu, but we might block. Then, say, the TF and the parent got
* removed, and *then* we get the qlock. We're too late. */
static struct tree_file *__tf_get_potential_parent(struct tree_file *tf)
{
struct tree_file *parent;
rcu_read_lock();
parent = rcu_dereference(tf->parent);
if (!parent) {
/* the root of the tree has no parent */
rcu_read_unlock();
return NULL;
}
if (!tf_kref_get(parent))
parent = NULL;
rcu_read_unlock();
return parent;
}
/* Returns a refcounted and qlocked parent for child. NULL on failure. */
static struct tree_file *get_locked_and_kreffed_parent(struct tree_file *child)
{
struct tree_file *parent;
parent = __tf_get_potential_parent(child);
if (!parent)
return NULL;
qlock(&parent->file.qlock);
/* Checking the parent == child->parent isn't enough here. That works
* for rename, but not removal/unlink. Older versions of TF code
* cleared child->parent, but now that's dealt with in tf_free.
*
* We're doing a lockless peek at child's flags. We hold the potential
* parent's lock, so if they are ours, no one will be messing with the
* disconnected flag. If they are messing with it, then parent !=
* child->parent. Also, once disconnected is set, it is never clear. */
if ((child->flags & TF_F_DISCONNECTED) || (parent != child->parent)) {
qunlock(&parent->file.qlock);
tf_kref_put(parent);
return NULL;
}
return parent;
}
static bool __mark_disconnected(struct tree_file *tf)
{
bool need_to_free;
spin_lock(&tf->lifetime);
tf->flags |= TF_F_DISCONNECTED;
__remove_from_lru(tf);
need_to_free = kref_refcnt(&tf->kref) == 0;
spin_unlock(&tf->lifetime);
return need_to_free;
}
/* Disconnects child from the in-memory tree structure (i.e. only the front
* end). Caller holds the parent qlock. Assumes child is a child of parent.
* Once you disconnect, you can't touch the child object.
*
* Racing with concurrent lookups, who might grab a ref if they get in before
* DISCONNECTED, and racing with release (after ref = 0), who might free, if it
* was already unlinked. */
static void __disconnect_child(struct tree_file *parent,
struct tree_file *child)
{
bool need_to_free;
need_to_free = __mark_disconnected(child);
/* Note child->parent is still set. We clear that in __tf_free. */
__remove_from_parent_list(parent, child);
wc_remove_child(parent, child);
if (need_to_free)
call_rcu(&child->rcu, __tf_free_rcu);
}
/* Backend will need to fill in dir, except for name. Note this has a kref ==
* 0, but is not on the LRU yet. */
struct tree_file *tree_file_alloc(struct tree_filesystem *tfs,
struct tree_file *parent, const char *name)
{
struct tree_file *tf;
tf = kzmalloc(sizeof(struct tree_file), MEM_WAIT);
fs_file_init((struct fs_file*)tf, name, &tfs->fs_ops);
kref_init(&tf->kref, tf_release, 0);
spinlock_init(&tf->lifetime);
/* Need to set the parent early on, even if the child isn't linked yet,
* so that the TFS ops know who the parent is. */
tf->parent = parent;
if (parent)
kref_get(&parent->kref, 1);
INIT_LIST_HEAD(&tf->children);
tf->can_have_children = true;
tf->tfs = tfs;
return tf;
}
/* Callers must hold the parent's qlock. */
static void __link_child(struct tree_file *parent, struct tree_file *child)
{
/* Devices may have already increffed ("+1 for existing"). Those that
* don't need to be on the LRU. We haven't linked to the parent yet, so
* we hold the only ref. Once we unlock in __add_to_lru, we're
* discoverable via that list, even though we're not linked. The lru
* pruner is careful to not muck with the parent's or wc linkage without
* qlocking the parent, which we currently hold. */
if (kref_refcnt(&child->kref) == 0)
__add_to_lru(child);
/* This was set in tree_file_alloc */
assert(child->parent == parent);
__add_to_parent_list(parent, child);
wc_insert_child(parent, child);
}
/* Hold the dir's qlock. This probably works with any directory, though we only
* use it when we remove a directory. The normal way to drop negative entries
* involves working directly with the WC (tfs_lru_prune_neg). */
static void __prune_dir_negatives(struct tree_file *dir)
{
struct tree_file *child, *temp;
list_for_each_entry_safe(child, temp, &dir->children, siblings) {
if (!tree_file_is_negative(child))
continue;
spin_lock(&child->lifetime);
assert(kref_refcnt(&child->kref) == 0);
/* This mark prevents new lookups; will disconnect it shortly */
child->flags |= TF_F_DISCONNECTED;
spin_unlock(&child->lifetime);
assert(child->parent == dir);
__disconnect_child(dir, child);
}
}
static void neuter_directory(struct tree_file *dir)
{
qlock(&dir->file.qlock);
if (dir->tfs->tf_ops.has_children(dir)) {
qunlock(&dir->file.qlock);
error(ENOTEMPTY, "can't remove dir with children");
}
dir->can_have_children = false;
/* Even if we don't have real children, we might have some negatives.
* Those aren't a reason to abort an rmdir; we just need to drop them. */
__prune_dir_negatives(dir);
qunlock(&dir->file.qlock);
}
/* Unlink a child from the tree. Last ref will clean it up, which will not be
* us. Caller should hold krefs on the child and parent. The child's kref will
* actually keep the parent's alive, but all practical callers will have the
* parent kreff'ed and qlocked - which is required to ensure the child is still
* a child of parent. */
static void __unlink_child(struct tree_file *parent, struct tree_file *child)
{
/* Need to make sure concurrent creates/renames do not add children to
* directories that are unlinked. Note we don't undo the neutering if
* the backend fails. */
if (tree_file_is_dir(child))
neuter_directory(child);
/* The ramfs backend will probably decref the "+1 for existing" ref.
* This is OK. If that was the last ref, the child will briefly be on
* the LRU list (which ramfs ignores). When we disconnect, we'll yank
* the child back off the list and then free it (after rcu). */
parent->tfs->tf_ops.unlink(parent, child);
__disconnect_child(parent, child);
}
/* Talks to the backend and ensures a tree_file for the child exists, either
* positive or negative. Throws an error; doesn't return NULL.
*
* This returns with an rcu read lock sufficient to protect the returned TF, but
* it is *not* kreffed. It could be a negative entry, which is not kreffed.
*
* It is possible that at the moment we qunlock, someone comes along and removes
* the entry (LRU prune or file removal (for positive entries)). That's fine -
* we have an rcu read lock, just like during a regular walk, when the removal
* could happen anyways. */
static struct tree_file *lookup_child_entry(struct tree_file *parent,
const char *name)
{
ERRSTACK(1);
struct tree_file *child;
qlock(&parent->file.qlock);
child = wc_lookup_child(parent, name);
if (child) {
/* Since we last looked, but before we qlocked, someone else
* added our entry. */
rcu_read_lock();
qunlock(&parent->file.qlock);
return child;
}
if (!parent->can_have_children) {
qunlock(&parent->file.qlock);
error(ENOENT, "lookup failed, parent dir being removed");
}
child = tree_file_alloc(parent->tfs, parent, name);
if (waserror()) {
/* child wasn't fully created, so freeing it may be tricky, esp
* on the device ops side (might see something they never
* created). */
__tf_free(child);
qunlock(&parent->file.qlock);
nexterror();
}
parent->tfs->tf_ops.lookup(parent, child);
poperror();
__link_child(parent, child);
rcu_read_lock();
qunlock(&parent->file.qlock);
return child;
}
/* Walks a tree filesystem from 'from' for array of null-terminated names.
* Devices will often use tree_chan_walk(), but can use this directly if they
* want more control.
*
* Returns the WQ of qids. On complete/successful walks, we hang a refcnted TF
* on wq->clone.
*
* Walks can return intermediate results, which is when there is an error but we
* have walked at least once. The partial return is useful for namec(), which
* can succeed if something was mounted on an intermediate QID. Walks that have
* no results set error and return NULL, which is what namec() expects. */
struct walkqid *tree_file_walk(struct tree_file *from, char **name,
unsigned int nname)
{
ERRSTACK(2);
struct tree_file *at, *next;
struct tree_filesystem *tfs = from->tfs;
struct walkqid *wq;
wq = kzmalloc(sizeof(struct walkqid) + nname * sizeof(struct qid),
MEM_WAIT);
/* A walk with zero names means "make me a copy." If we go through the
* regular walker, our usual tf_kref_get will fail - similar to failing
* if we got a walk for "foo/../" during a concurrent removal of
* ourselves. We'll let a walk of zero names work, but if you provide
* any names, the actual walk must happen.
*
* This is tricky, and confused me a little. We're returning a *TF*
* through wq->clone, not a chan, and that is refcounted. Normally for
* chans that end up with wq->clone == c, we do not incref the object
* hanging off the chan (see k/d/d/eventfd.c), since there is just one
* chan with a kreffed object hanging off e.g. c->aux. But here,
* wq->clone is considered a distinct refcnt to some TF, and it might be
* 'from.' Our *caller* needs to deal with the "wq->clone ==
* from_chan", since they deal with chans. We deal with tree files. */
if (!nname) {
kref_get(&from->kref, 1);
wq->clone = (struct chan*)from;
return wq;
}
if (waserror()) {
kfree(wq);
poperror();
return NULL;
}
rcu_read_lock();
at = from;
for (int i = 0; i < nname; i++) {
/* Walks end if we reach a regular file, i.e. you can't walk
* through a file, only a dir. But even if there are more
* names, the overall walk might succeed. E.g. a directory
* could be mounted on top of the current file we're at. That's
* just not our job. */
if (tree_file_is_file(at)) {
if (i == 0)
error(ENOTDIR, "initial walk from a file");
break;
}
/* Normally, symlinks stop walks, and namec's walk() will deal
* with it. We allow walks 'through' symlinks, but only for ..
* and only for the first name. This is for relative lookups so
* we can find the parent of a symlink. */
if (tree_file_is_symlink(at)) {
if (i != 0)
break;
if (strcmp(name[i], ".."))
error(ELOOP,
"walk from a symlink that wasn't ..");
}
if (!caller_has_tf_perms(at, O_READ)) {
if (i == 0)
error(EACCES, "missing perm for lookup");
break;
}
if (!strcmp(name[i], ".")) {
wq->qid[wq->nqid++] = tree_file_to_qid(at);
continue;
}
if (!strcmp(name[i], "..")) {
next = rcu_dereference(at->parent);
if (!next) {
if (tree_file_is_root(at)) {
wq->qid[wq->nqid++] =
tree_file_to_qid(at);
/* I think namec should never give us
* DOTDOT that isn't at the end of the
* names array. Though devwalk() seems
* to expect it. */
if (i != nname - 1)
warn("namec DOTDOT bug?");
continue;
}
/* We lost our parent due to a removal/rename.
* We might have walked enough for our walk to
* succeed (e.g. there's a mount point in the
* WQ), so we can return what we have. Though
* if we've done nothing, it's a failure.
*
* Note the removal could have happened a long
* time ago: consider an O_PATH open, then
* namec_from().
*
* We also could walk up and see the *new*
* parent during rename(). For instance,
* /src/x/../y could get the old /src/y or the
* new /dst/y. If we want to avoid that, then
* we'll need some sort of sync with rename to
* make sure we don't get the new one. Though
* this can happen with namec_from(), so I'm not
* sure I care. */
if (i == 0)
error(ENOENT,
"file lost its parent during lookup");
break;
}
at = next;
wq->qid[wq->nqid++] = tree_file_to_qid(at);
continue;
}
next = wc_lookup_child(at, name[i]);
if (!next) {
/* TFSs with no backend have the entire tree in the WC
* HT. */
if (!tfs->tf_ops.lookup) {
if (i == 0)
error(ENOENT, "file does not exist");
break;
}
/* Need to hold a kref on 'at' before we
* rcu_read_unlock(). Our TFS op might block. */
if (!tf_kref_get(at)) {
if (i == 0)
error(ENOENT,
"file was removed during lookup");
/* WQ has a qid for 'at' from a previous loop.
* We can't grab a ref, so it's being
* removed/renamed. Yet it's still OK to report
* this as part of the wq, since we could fail
* for other reasons (thus not getting nnames)
* and have the same race, which we handle below
* (i.e. we only get a ref when it was the last
* name). Here we're just *noticing* the race.
*/
break;
}
rcu_read_unlock();
/* Propagate the error (nexterror) only on the first name. For
* i != 0, we swallow the error (poperror) and break; on success,
* we fall through to the poperror below the lookup. */
if (waserror()) {
tf_kref_put(at);
if (i == 0)
nexterror();
/* In this case, we're discarding the waserror,
* same as in the other i != 0 cases. */
poperror();
break;
}
/* returns with an rcu_read_lock protecting 'next' */
next = lookup_child_entry(at, name[i]);
tf_kref_put(at);
poperror();
assert(next);
}
if (tree_file_is_negative(next)) {
/* lockless peek. other flag users aren't atomic. */
if (!(next->flags & TF_F_HAS_BEEN_USED)) {
spin_lock(&next->lifetime);
next->flags |= TF_F_HAS_BEEN_USED;
spin_unlock(&next->lifetime);
}
if (i == 0)
error(ENOENT, "file does not exist");
break;
}
at = next;
wq->qid[wq->nqid++] = tree_file_to_qid(at);
}
if (wq->nqid == nname || tree_file_is_symlink(at)) {
if (!tf_kref_get(at)) {
/* need to undo our last result as if we never saw it.*/
wq->nqid--;
if (wq->nqid == 0)
error(ENOENT, "file was removed during lookup");
} else {
/* Hanging the refcounted TF off the wq->clone, which is
* cheating. Our walker shim knows to expect this. */
wq->clone = (struct chan*)at;
}
}
rcu_read_unlock();
poperror();
return wq;
}
/* Most tree devices will use this for their walk op, similar to devwalk. */
struct walkqid *tree_chan_walk(struct chan *c, struct chan *nc, char **name,
unsigned int nname)
{
struct tree_file *from, *to;
struct walkqid *wq;
from = chan_to_tree_file(c);
wq = tree_file_walk(from, name, nname);
if (!wq)
return NULL;
if (!wq->clone)
return wq;
if (!nc)
nc = devclone(c);
/* Not sure if callers that specify nc must have a chan from our device.
*/
assert(nc->type == c->type);
to = (struct tree_file*)wq->clone;
nc->qid = tree_file_to_qid(to);
chan_set_tree_file(nc, to);
wq->clone = nc;
/* We might be returning the same chan, so there's actually just one
* ref. */
if (wq->clone == c)
tf_kref_put(chan_to_tree_file(c));
return wq;
}
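/* Illustrative sketch of how a tree device typically wires these generic chan
* ops into its devtab entry. The struct dev field names and the dev* default
* helpers are assumptions based on the usual 9ns devtab convention, not
* anything defined in this file:
*
*	struct dev mydevtab __devtab = {
*		.name = "mydev",
*		.walk = tree_chan_walk,
*		.open = tree_chan_open,
*		.create = tree_chan_create,
*		.close = tree_chan_close,
*		.read = tree_chan_read,
*		.write = tree_chan_write,
*		.remove = tree_chan_remove,
*		.stat = tree_chan_stat,
*		.wstat = tree_chan_wstat,
*		.mmap = tree_chan_mmap,
*		.chan_ctl = tree_chan_ctl,
*		// attach, bread/bwrite, etc. come from the device or the
*		// generic dev* helpers
*	};
*/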
/* Creates a tree file under parent with the given name, perm, and optional
* symlink ext. Returns a kreffed tree_file for the new child. Throws on
* error. */
struct tree_file *tree_file_create(struct tree_file *parent, const char *name,
uint32_t perm, char *ext)
{
ERRSTACK(2);
struct tree_file *child;
bool got_ref;
struct timespec now;
qlock(&parent->file.qlock);
if (waserror()) {
qunlock(&parent->file.qlock);
nexterror();
}
if (!caller_has_tf_perms(parent, O_WRITE))
error(EACCES, "missing create permission on dir");
if (!parent->can_have_children)
error(ENOENT, "create failed, parent dir being removed");
child = wc_lookup_child(parent, name);
if (child) {
/* The create(5) message fails if the file exists, which differs
* from the syscall. namec() handles this. */
if (!tree_file_is_negative(child))
error(EEXIST, "file exists");
/* future lookups that find no entry will qlock. concurrent
* ones that see the child, even if disconnected, will see it
* was negative and fail. */
__disconnect_child(parent, child);
}
child = tree_file_alloc(parent->tfs, parent, name);
if (waserror()) {
__tf_free(child);
nexterror();
}
/* Backend will need to know the ext for its create. This gets cleaned
* up on error. */
if (perm & DMSYMLINK)
kstrdup(&child->file.dir.ext, ext);
/* Backend will need to fill in dir, except for name. */
parent->tfs->tf_ops.create(parent, child, perm);
now = nsec2timespec(epoch_nsec());
__set_acmtime_to(&child->file,
FSF_ATIME | FSF_BTIME | FSF_CTIME | FSF_MTIME, &now);
poperror();
/* At this point, the child is visible, so it must be ready to go */
__link_child(parent, child);
got_ref = tf_kref_get(child);
assert(got_ref); /* we hold the qlock, no one should have removed */
__set_acmtime_to(&parent->file, FSF_CTIME | FSF_MTIME, &now);
qunlock(&parent->file.qlock);
poperror();
return child;
}
/* Most tree devices will use this for their create op. */
void tree_chan_create(struct chan *c, char *name, int omode, uint32_t perm,
char *ext)
{
struct tree_file *parent, *child;
parent = chan_to_tree_file(c);
child = tree_file_create(parent, name, perm, ext);
c->qid = tree_file_to_qid(child);
c->mode = openmode(omode);
chan_set_tree_file(c, child);
tf_kref_put(parent);
}
struct chan *tree_chan_open(struct chan *c, int omode)
{
struct tree_file *tf = chan_to_tree_file(c);
if ((c->qid.type & QTDIR) && (omode & O_WRITE))
error(EISDIR, "can't open a dir for writing");
if (c->qid.type & QTSYMLINK)
error(ELOOP, "can't open a symlink");
tree_file_perm_check(tf, omode);
/* TODO: if we want to support DMEXCL on dir.mode, we'll need to
* lock/sync on the fs_file (have a flag for FSF_IS_OPEN, handle in
* close). We'll also need a way to pass it in to the dir.mode during
* create/wstat/etc. */
if (omode & O_TRUNC)
fs_file_truncate(&tf->file, 0);
c->mode = openmode(omode);
c->flag |= COPEN;
c->offset = 0;
return c;
}
void tree_chan_close(struct chan *c)
{
struct tree_file *tf = chan_to_tree_file(c);
tf_kref_put(tf);
}
void tree_file_remove(struct tree_file *child)
{
ERRSTACK(1);
struct tree_file *parent;
parent = get_locked_and_kreffed_parent(child);
if (!parent)
error(ENOENT, "%s had no parent", tree_file_to_name(child));
if (waserror()) {
qunlock(&parent->file.qlock);
tf_kref_put(parent);
nexterror();
}
if (!caller_has_tf_perms(parent, O_WRITE))
error(EACCES, "missing remove perm for dir");
__unlink_child(parent, child);
__set_acmtime(&parent->file, FSF_CTIME | FSF_MTIME);
poperror();
qunlock(&parent->file.qlock);
tf_kref_put(parent);
}
void tree_chan_remove(struct chan *c)
{
ERRSTACK(1);
struct tree_file *tf = chan_to_tree_file(c);
/* sysremove expects a chan that is disconnected from the device,
* regardless of whether or not we fail. See sysremove(); it will clear
* type, ensuring our close is never called. */
if (waserror()) {
chan_set_tree_file(c, NULL);
tf_kref_put(tf); /* The ref from the original walk */
nexterror();
}
tree_file_remove(tf);
chan_set_tree_file(c, NULL);
tf_kref_put(tf); /* The ref from the original walk */
poperror();
}
static bool is_descendant_of(struct tree_file *descendant,
struct tree_file *ancestor)
{
if (!tree_file_is_dir(ancestor))
return false;
rcu_read_lock();
for (struct tree_file *i = rcu_dereference(descendant->parent);
i;
i = rcu_dereference(i->parent)) {
if (i == ancestor) {
rcu_read_unlock();
return true;
}
}
rcu_read_unlock();
return false;
}
/* Caller should hold the rename mutex. This qlocks a and b, which can be the
* same file, and this handles lock ordering. Recall the rule: parents before
* children, and otherwise only the renamer can lock. */
static void qlock_tree_files(struct tree_file *a, struct tree_file *b)
{
if (a == b) {
qlock(&a->file.qlock);
return;
}
if (is_descendant_of(a, b)) {
qlock(&b->file.qlock);
qlock(&a->file.qlock);
} else {
qlock(&a->file.qlock);
qlock(&b->file.qlock);
}
}
static void qunlock_tree_files(struct tree_file *a, struct tree_file *b)
{
if (a != b)
qunlock(&a->file.qlock);
qunlock(&b->file.qlock);
}
/* Higher layers (namec) ensure that tf and new_parent are in the same device.
* The device (e.g. #mnt) ensures that they are in the same instance.
* new_parent could be the parent of tf; everything should still work if the
* move is within the same directory. */
void tree_file_rename(struct tree_file *tf, struct tree_file *new_parent,
const char *name, int flags)
{
ERRSTACK(1);
struct tree_file *old_parent;
struct tree_file *prev_dst;
struct timespec now;
old_parent = __tf_get_potential_parent(tf);
if (!old_parent)
error(ENOENT, "renamed file had no parent");
/* global mtx helps with a variety of weird races, including the "can't
* move to a subdirectory of yourself" case and the lock ordering of
* parents (locks flow from parent->child). */
qlock(&tf->tfs->rename_mtx);
qlock_tree_files(old_parent, new_parent);
if (waserror()) {
qunlock_tree_files(old_parent, new_parent);
qunlock(&tf->tfs->rename_mtx);
tf_kref_put(old_parent);
nexterror();
}
if (old_parent != tf->parent)
error(ENOENT, "renamed file lost its parent");
/* Probably a namec bug (that code isn't written yet). */
assert(tree_file_is_dir(new_parent));
if (!new_parent->can_have_children)
error(ENOENT, "target dir being removed");
if (!caller_has_tf_perms(old_parent, O_WRITE))
error(EACCES, "missing remove perm for src dir");
if (!caller_has_tf_perms(new_parent, O_WRITE))
error(EACCES, "missing create perm for dst dir");
/* can't move tf to one of its subdirs */
if (is_descendant_of(new_parent, tf))
error(EINVAL, "can't rename to a child directory");
/* We hold new_parent's qlock, so there's no worry about prev_dst
* disappearing, so no need for an rcu read lock. */
prev_dst = wc_lookup_child(new_parent, name);
if (prev_dst) {
if (tree_file_is_dir(prev_dst)) {
if (!tree_file_is_dir(tf))
error(EISDIR, "can't rename a file onto a dir");
/* We need to ensure prev_dst is childless and remains
* so. That requires grabbing its qlock, but there's a
* potential lock ordering issue with old_parent. We
* could have this: new_parent/dst/x/y/z/old_parent/src.
* That will fail, but we need to check for that case
* instead of grabbing prev_dst's qlock. */
if (is_descendant_of(prev_dst, old_parent))
error(ENOTEMPTY,
"old_parent descends from dst");
neuter_directory(prev_dst);
} else {
if (tree_file_is_dir(tf))
error(ENOTDIR,
"can't rename a dir onto a file");
}
}
/* We check with the backend first, so that it has a chance to fail
* early. Once we make the changes to the front end, lookups can see
* the effects of the change, which we can't roll back. Since we hold
* the parents' qlocks, no one should be able to get the info from the
* backend either (lookups that require the backend, readdir, etc). */
tf->tfs->tf_ops.rename(tf, old_parent, new_parent, name, flags);
/* Similar to __disconnect_child, we don't clear tf->parent. rcu
* readers at TF will be able to walk up (with ..). Same with
* namec_from an FD. If we atomically replace tf->parent, we should be
* good. See tree_file_walk().
*
* Further, we don't mark the tf disconnected. Someone might lookup
* from the old location, and that's fine. We just don't want issues
* with decrefs. */
__remove_from_parent_list(old_parent, tf);
wc_remove_child(old_parent, tf);
synchronize_rcu();
/* Now, no new lookups will find it at the old location. That change is
* not atomic wrt src, but it will be wrt dst. Importantly, no one will
* see /path/to/old_parent/new_basename */
fs_file_change_basename((struct fs_file*)tf, name);
/* We're clobbering the old_parent ref, which we'll drop later */
rcu_assign_pointer(tf->parent, new_parent);
kref_get(&new_parent->kref, 1);
__add_to_parent_list(new_parent, tf);
wc_insert_child(new_parent, tf);
/* Now both the prev_dst (if it existed) or the tf file are in the walk
* cache / HT and either could have been looked up by a concurrent
* reader. Readers will always get one or the other, but never see
* nothing. This is the atomic guarantee of rename. */
if (prev_dst) {
__remove_from_parent_list(new_parent, prev_dst);
wc_remove_child(new_parent, prev_dst);
synchronize_rcu();
/* Now no one can find prev_dst. Someone might still have a
* ref, or it might be on the LRU list (if kref == 0). Now we
* can mark disconnected. Had we disconnected earlier, then
* lookup code would see that and treat it as a failure. Using
* rcu and putting the complexity in rename was easier and
* simpler than changing lookup.
*
* We still need RCU here for freeing the prev_dst. We could
* have had an LRU pruner, etc, looking. The synchronize_rcu
* above only dealt with lookups via parent in this function. */
if (__mark_disconnected(prev_dst))
call_rcu(&prev_dst->rcu, __tf_free_rcu);
}
now = nsec2timespec(epoch_nsec());
__set_acmtime_to(&old_parent->file, FSF_CTIME | FSF_MTIME, &now);
__set_acmtime_to(&new_parent->file, FSF_CTIME | FSF_MTIME, &now);
/* Can we unlock earlier? No. We need to at least hold new_parent's
* qlock, which was the parent of prev_dst, until prev_dst is marked
* disconnected. Even though prev_dst is removed from new_parent's HT,
* it is still in the LRU list. */
qunlock_tree_files(old_parent, new_parent);
qunlock(&tf->tfs->rename_mtx);
poperror();
tf_kref_put(old_parent); /* the original tf->parent ref we clobbered */
tf_kref_put(old_parent); /* the one we grabbed when we started */
}
void tree_chan_rename(struct chan *c, struct chan *new_p_c, const char *name,
int flags)
{
struct tree_file *tf = chan_to_tree_file(c);
struct tree_file *new_parent = chan_to_tree_file(new_p_c);
tree_file_rename(tf, new_parent, name, flags);
}
/* dri is a pointer to the chan->dri, which is a count of how many directory
* entries have been read from this chan so far. 9ns handles it; we just
* need to increment it for every successful entry. Note we ignore offset. */
ssize_t tree_file_readdir(struct tree_file *parent, void *ubuf, size_t n,
off64_t offset, int *dri)
{
ERRSTACK(1);
struct tree_file *i;
size_t dir_amt, so_far = 0;
uint8_t *write_pos = ubuf;
int child_nr = 0;
qlock(&parent->file.qlock);
if (waserror()) {
qunlock(&parent->file.qlock);
nexterror();
}
list_for_each_entry(i, &parent->children, siblings) {
if (child_nr++ < *dri)
continue;
qlock(&i->file.qlock);
dir_amt = convD2M(&i->file.dir, write_pos, n - so_far);
qunlock(&i->file.qlock);
if (dir_amt <= BIT16SZ) {
if (!so_far)
error(EINVAL, "buffer to small for readdir");
break;
}
write_pos += dir_amt;
so_far += dir_amt;
assert(so_far <= n);
(*dri)++;
}
/* If we care about directory atime, we can do that here. (if so_far) */
qunlock(&parent->file.qlock);
poperror();
return so_far;
}
/* Note this only works for backend-less TFSs. It calls tree_file_readdir,
* which only looks at the frontend's tree. */
size_t tree_chan_read(struct chan *c, void *ubuf, size_t n, off64_t offset)
{
struct tree_file *tf = chan_to_tree_file(c);
if (tree_file_is_dir(tf))
return tree_file_readdir(tf, ubuf, n, offset, &c->dri);
return fs_file_read(&tf->file, ubuf, n, offset);
}
size_t tree_chan_write(struct chan *c, void *ubuf, size_t n, off64_t offset)
{
struct tree_file *tf = chan_to_tree_file(c);
/* sysfile.c:rwrite checked the chan's type. */
assert(!tree_file_is_dir(tf));
return fs_file_write(&tf->file, ubuf, n, offset);
}
size_t tree_chan_stat(struct chan *c, uint8_t *m_buf, size_t m_buf_sz)
{
struct tree_file *tf = chan_to_tree_file(c);
return fs_file_stat(&tf->file, m_buf, m_buf_sz);
}
size_t tree_chan_wstat(struct chan *c, uint8_t *m_buf, size_t m_buf_sz)
{
struct tree_file *tf = chan_to_tree_file(c);
return fs_file_wstat(&tf->file, m_buf, m_buf_sz);
}
struct fs_file *tree_chan_mmap(struct chan *c, struct vm_region *vmr, int prot,
int flags)
{
struct fs_file *f = &chan_to_tree_file(c)->file;
/* TODO: In the future, we'll check the prot, establish hooks with the
* VMR, and other things, mostly in something like fs_file_mmap, which
* will be able to handle mmaping something that doesn't use the page
* cache. For now, I'm aggressively qlocking to catch bugs. */
qlock(&f->qlock);
if ((prot & PROT_WRITE) && (flags & MAP_SHARED))
f->flags |= FSF_DIRTY;
qunlock(&f->qlock);
return f;
}
unsigned long tree_chan_ctl(struct chan *c, int op, unsigned long a1,
unsigned long a2, unsigned long a3,
unsigned long a4)
{
switch (op) {
case CCTL_DEBUG:
print_lock();
printk("Dumping tree_file info (frontend), dev %s chan %s:\n\n",
chan_dev_name(c), channame(c));
__tfs_dump_tf(chan_to_tree_file(c));
print_unlock();
return 0;
default:
error(EINVAL, "%s does not support chanctl %d",
chan_dev_name(c), op);
}
}
/* Given a tree file, construct a chan that points to the TF for the given
* device. Careful with this - it's for bootstrapping. */
struct chan *tree_file_alloc_chan(struct tree_file *tf, struct dev *dev,
char *name)
{
struct chan *c;
c = newchan();
c->type = dev - devtab;
c->name = newcname(name);
kref_get(&tf->kref, 1);
chan_set_tree_file(c, tf);
c->qid = tree_file_to_qid(tf);
return c;
}
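/* Illustrative sketch: a device's attach op handing out a chan on its TFS
* root. 'my_tfs' and 'mydevtab' are hypothetical names for the example:
*
*	static struct chan *my_attach(char *spec)
*	{
*		return tree_file_alloc_chan(my_tfs.root, &mydevtab, "#mydev");
*	}
*/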
/* Caller needs to set its customizable fields: tf_ops, pm_ops, etc. root is
* created with a ref of 1, but needs to be filled in by the particular TFS. */
void tfs_init(struct tree_filesystem *tfs)
{
wc_init(&tfs->wc);
qlock_init(&tfs->rename_mtx);
tfs->root = tree_file_alloc(tfs, NULL, ".");
tfs->root->flags |= TF_F_IS_ROOT;
assert(!(tfs->root->flags & TF_F_ON_LRU));
__kref_get(&tfs->root->kref, 1);
}
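/* Illustrative sketch of a device bringing up its TFS. 'my_tf_ops' and
* 'my_fs_ops' are hypothetical op structs for the example; the ordering just
* reflects the comment above (set customizable fields, then fill in root):
*
*	static struct tree_filesystem my_tfs;
*
*	static void my_tfs_startup(void)
*	{
*		my_tfs.tf_ops = my_tf_ops;	// lookup/create/unlink/rename/...
*		my_tfs.fs_ops = my_fs_ops;	// fs_file / page-map hooks
*		tfs_init(&my_tfs);
*		// tfs_init() made root (".", one ref); the device fills in the
*		// rest of root's dir: qid, mode, uid, atimes, etc.
*	}
*/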
void tfs_destroy(struct tree_filesystem *tfs)
{
tfs->root = NULL; /* was just freed in __tf_free() */
wc_destroy(&tfs->wc);
}
/* For every file except root, we hold our parent's qlock (root has no parent).
* This prevents TF's removal/rename and any changes to tf->parent. */
static void tf_dfs_cb(struct tree_file *tf, void (*cb)(struct tree_file *tf))
{
struct tree_file *child;
if (tree_file_is_dir(tf)) {
qlock(&tf->file.qlock);
/* Note we don't have a kref on our child's TF - we have a weak
* reference (the list membership). We hold the parent's qlock,
* which prevents removal/unlinking/disconnecting/etc. The
* child's membership on the LRU list can change repeatedly.
*
* If we want to avoid holding the parent's qlock, we could grab
* a kref on the child. However, our list walk would be in
* jeopardy - both child and temp could be removed from the
* list. So we'd need to qlock the parent, grab krefs on all
* children, and put them on a local list. Also, grabbing krefs
* on our children will muck with the LRU list; when we're done,
* it would be like we sorted the LRU list in the DFS order. */
list_for_each_entry(child, &tf->children, siblings)
tf_dfs_cb(child, cb);
qunlock(&tf->file.qlock);
}
if (!tree_file_is_negative(tf))
cb(tf);
}
/* Runs CB on all in-memory, non-negative TFs from the TFS in a depth-first
* search. Compared to the other CB walkers (purge, LRU, etc), this never
* removes/prunes/disconnects a TF from the tree. You can use it for syncing FS
* trees or pruning page maps (i.e. discarding non-dirty pages).
*
* One thing to note is that it qlocks the tree as part of its DFS. We can
* change that, but at a slight cost in complexity and tampering with the LRU
* list. Translation: we can do it if there's a performance problem.
*
* The tfs->root reference is also weak. It's up to the user to make sure
* there is no concurrent tfs_destroy. Typically we're fine when called on a
* mounted device (e.g. #tmpfs) or on a device that is never destroyed
* (e.g. #kfs). */
void tfs_frontend_for_each(struct tree_filesystem *tfs,
void (*cb)(struct tree_file *tf))
{
tf_dfs_cb(tfs->root, cb);
}
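/* Illustrative sketch of a sync-style callback for the walker above.
* 'my_writeback' is a hypothetical backend helper:
*
*	static void my_sync_cb(struct tree_file *tf)
*	{
*		if (tf->file.flags & FSF_DIRTY)
*			my_writeback(&tf->file);
*	}
*
*	tfs_frontend_for_each(tfs, my_sync_cb);
*/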
/* We should be single-user, so no need for concurrency protections. I keep
* them around for paranoia/future use/documentation. */
static void tf_dfs_purge(struct tree_file *tf, void (*cb)(struct tree_file *tf))
{
struct tree_file *child, *temp;
if (tree_file_is_dir(tf)) {
qlock(&tf->file.qlock);
/* Note we don't have a kref on TF, and one of our children
* should decref *us* to 0. We aren't disconnected yet, and we
* can't be removed (parent's qlock is held), so we'll just end
* up on the LRU list, which is OK.
*
* Our child should remove itself from our list, so we need the
* _safe.
*
* Also note that the child decrefs us in a call_rcu. CB can
* block, and technically so can qlock, so we might run RCU
* callbacks while qlocked. We'll need to rcu_barrier so that
* our children's decrefs occur before we remove ourselves from
* our parent. */
list_for_each_entry_safe(child, temp, &tf->children, siblings)
tf_dfs_purge(child, cb);
qunlock(&tf->file.qlock);
}
rcu_barrier();
/* ramfs will drop the "+1 for existing" ref here */
if (!tree_file_is_negative(tf))
cb(tf);
spin_lock(&tf->lifetime);
/* should be unused, with no children. we have a ref on root, to keep
* the TFS around while we destroy the tree. */
assert(kref_refcnt(&tf->kref) == 0 || tree_file_is_root(tf));
/* This mark prevents new lookups. We'll disconnect it shortly. */
tf->flags |= TF_F_DISCONNECTED;
spin_unlock(&tf->lifetime);
if (tf->parent)
__disconnect_child(tf->parent, tf);
}
/* Purges all in-memory TFs from the TFS in a depth-first search, both positive
* and negative. We run CB on all non-negative TFs. It's up to the caller to
* ensure there is no concurrency.
*
* The caller must make sure they have an extra ref on tfs->root to keep the
* TFS around while it gets destroyed. The root TF will get freed when it is
* released, unlike other TFs that are still connected.
*
* This is for devices that want to destroy themselves, such as an unmounted
* #tmpfs or #gtfs/#mnt, that want to do some extra work (e.g. sync).
* Typically, those devices will call this after they have no users (e.g. mounts
* or open chans). */
void tfs_frontend_purge(struct tree_filesystem *tfs,
void (*cb)(struct tree_file *tf))
{
assert(kref_refcnt(&tfs->root->kref) > 0);
tf_dfs_purge(tfs->root, cb);
}
static void print_tf(struct tree_file *tf)
{
if (tree_file_is_negative(tf)) {
printk("%-10s: Negative\n", tree_file_to_name(tf));
return;
}
printk("%-10s: Q: %5d, R: %2d, U %s, %c%o %s\n",
tree_file_to_name(tf),
tf->file.dir.qid.path,
kref_refcnt(&tf->kref),
tf->file.dir.uid,
tree_file_is_dir(tf) ? 'd'
: tf->file.dir.mode & DMSYMLINK ? 'l'
: '-',
tf->file.dir.mode & S_PMASK,
tf->file.dir.mode & DMSYMLINK ? tf->file.dir.ext : ""
);
}
static void dump_tf(struct tree_file *tf, int tabs)
{
struct tree_file *child;
if (!!(tf->file.dir.mode & DMSYMLINK) !=
!!(tf->file.dir.qid.type & QTSYMLINK))
warn("%s has differing symlink bits", tree_file_to_name(tf));
print_lock();
for (int i = 0; i < tabs; i++)
printk(" ");
print_tf(tf);
if (tree_file_is_dir(tf)) {
for (int i = 0; i < tabs; i++)
printk(" ");
printk("---------------------\n");
list_for_each_entry(child, &tf->children, siblings)
dump_tf(child, tabs + 1);
}
print_unlock();
}
void __tfs_dump(struct tree_filesystem *tfs)
{
dump_tf(tfs->root, 0);
}
void __tfs_dump_tf(struct tree_file *tf)
{
dump_tf(tf, 0);
}
/* Runs a callback on every non-negative TF on the LRU list, for a given
* snapshot of the LRU list. The CB returns true if it wants us to attempt to
* free the TF. One invariant is that we can never remove a TF from the tree
* while it is dirty; it is the job of the CB to maintain that. Note the CB can
* run on a TF as soon as that TF was linked to the parent (see lookup).
*
* The work list is a list of strong refs. We need to keep one in case the file
* is disconnected while we're running our CBs. Since we incref, we yank from
* the LRU list. We could design the rest of the TF code so that we stay on the
* LRU list throughout, but I like the invariant of "kref == 0 IFF on LRU".
*
* Since we're only on one list at a time ('wc->lru' or 'work'), we can use the
* lru list_head in the TF. We know that so long as we hold our kref on a TF,
* no one will attempt to put it back on the LRU list. */
void tfs_lru_for_each(struct tree_filesystem *tfs, bool cb(struct tree_file *),
size_t max_tfs)
{
struct list_head work = LIST_HEAD_INIT(work);
struct walk_cache *wc = &tfs->wc;
struct tree_file *tf, *temp, *parent;
size_t nr_tfs = 0;
/* We can have multiple LRU workers in flight, though a given TF will be
* on only one CB list at a time. */
spin_lock(&wc->lru_lock);
list_for_each_entry_safe(tf, temp, &wc->lru, lru) {
/* lockless peek at the flag. Once it's NEGATIVE, it never goes
* back */
if (tree_file_is_negative(tf))
continue;
/* Normal lock order is TF -> LRU.
* Best effort is fine for LRU. */
if (!spin_trylock(&tf->lifetime))
continue;
/* Can't be disconnected and on LRU */
assert(!(tf->flags & TF_F_DISCONNECTED));
assert((tf->flags & TF_F_ON_LRU));
tf->flags &= ~TF_F_ON_LRU;
list_del(&tf->lru);
__kref_get(&tf->kref, 1);
/* The 'used' bit is what allows us to detect a user in
* between our callback and the disconnection/freeing. It's a
* moot point if the CB returns false. */
tf->flags &= ~TF_F_HAS_BEEN_USED;
spin_unlock(&tf->lifetime);
list_add_tail(&tf->lru, &work);
if (++nr_tfs >= max_tfs)
break;
}
spin_unlock(&wc->lru_lock);
/* We have a snapshot of the LRU list. As soon as we unlocked a file,
* someone could incref it (e.g. to something > 1), and they'll set the
* used bit. That won't abort the CB. New TFs could be added to the
* LRU list. Those are ignored for this pass. */
list_for_each_entry_safe(tf, temp, &work, lru) {
if (!cb(tf)) {
list_del(&tf->lru);
tf_kref_put(tf);
continue;
}
}
/* Now we have a list of victims to be removed, so long as they haven't
* been used since. */
list_for_each_entry_safe(tf, temp, &work, lru) {
parent = get_locked_and_kreffed_parent(tf);
if (!parent) {
list_del(&tf->lru);
tf_kref_put(tf);
continue;
}
spin_lock(&tf->lifetime);
if (tf->flags & TF_F_HAS_BEEN_USED) {
spin_unlock(&tf->lifetime);
qunlock(&parent->file.qlock);
tf_kref_put(parent);
list_del(&tf->lru);
tf_kref_put(tf);
continue;
}
tf->flags |= TF_F_DISCONNECTED;
/* We hold a ref, so it shouldn't have found its way back on
* LRU. */
assert(!(tf->flags & TF_F_ON_LRU));
spin_unlock(&tf->lifetime);
__remove_from_parent_list(parent, tf);
wc_remove_child(parent, tf);
/* If we get tired of unlocking and relocking, we could see if
* the next parent is the current parent before unlocking. */
qunlock(&parent->file.qlock);
tf_kref_put(parent);
}
/* Now we have a list of refs that are all disconnected, kref == 1
* (because no one used them since we increffed them when they were LRU,
* which was when the refcnt was 0). Each TF has a ref on its parent
* btw, so parents will never be on the LRU list. (leaves only).
*
* We need to synchronize_rcu() too, since we could have had lockless
* lookups that have pointers to TF and are waiting to notice that it is
* disconnected. */
synchronize_rcu();
list_for_each_entry_safe(tf, temp, &work, lru) {
assert(kref_refcnt(&tf->kref) == 1);
list_del(&tf->lru);
/* We could decref, but instead we can directly free. We know
* the ref == 1 and it is disconnected. Directly freeing
* bypasses call_rcu. */
__tf_free(tf);
}
}
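/* Illustrative sketch of an LRU callback. Per the contract above, the CB must
* not let a dirty file be removed; 'my_writeback' is a hypothetical helper:
*
*	static bool my_lru_cb(struct tree_file *tf)
*	{
*		if (tf->file.flags & FSF_DIRTY)
*			my_writeback(&tf->file);	// or return false
*		return true;	// ask the pruner to try to free this TF
*	}
*
*	tfs_lru_for_each(tfs, my_lru_cb, 1000);
*/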
/* Does a one-cycle 'clock' algorithm to detect use. On a given pass, we either
* clear HAS_BEEN_USED or we remove the entry, never both. For negative entries,
* that bit is set when we look at (use) an entry; for positive entries, it is
* set when we get a reference. (We never get refs on negatives.) */
void tfs_lru_prune_neg(struct tree_filesystem *tfs)
{
struct list_head work = LIST_HEAD_INIT(work);
struct walk_cache *wc = &tfs->wc;
struct tree_file *tf, *temp, *parent;
spin_lock(&wc->lru_lock);
list_for_each_entry_safe(tf, temp, &wc->lru, lru) {
if (!tree_file_is_negative(tf))
continue;
if (!spin_trylock(&tf->lifetime))
continue;
if (tf->flags & TF_F_HAS_BEEN_USED) {
tf->flags &= ~TF_F_HAS_BEEN_USED;
spin_unlock(&tf->lifetime);
continue;
}
rcu_read_lock(); /* holding a spinlock, but just to be clear. */
parent = rcu_dereference(tf->parent);
/* Again, inverting the lock order, so best effort trylock */
if (!canqlock(&parent->file.qlock)) {
rcu_read_unlock();
spin_unlock(&tf->lifetime);
continue;
}
__remove_from_parent_list(parent, tf);
wc_remove_child(parent, tf);
qunlock(&parent->file.qlock);
/* We're off the list, but our kref == 0 still. We can break
* that invariant since we have the only ref and are about to
* free the TF. */
tf->flags &= ~TF_F_ON_LRU;
list_del(&tf->lru);
spin_unlock(&tf->lifetime);
list_add_tail(&tf->lru, &work);
}
spin_unlock(&wc->lru_lock);
/* Now we have a list of refs that are all unlinked (but actually not
* flagged DISCONNECTED; that's only necessary for positives), kref == 0
* (because they are negatives).
*
* We need to synchronize_rcu() too, since we could have had lockless
* lookups that have pointers to TF, and they may even mark
* HAS_BEEN_USED. Too late. */
synchronize_rcu();
list_for_each_entry_safe(tf, temp, &work, lru) {
assert(kref_refcnt(&tf->kref) == 0);
list_del(&tf->lru);
__tf_free(tf);
}
}