/* Copyright (c) 2018 Google Inc
* Barret Rhoden <brho@cs.berkeley.edu>
* See LICENSE for details.
*
* tree_file: structs and helpers for a tree-based filesystem for 9ns devices.
*/
#include <tree_file.h>
#include <kmalloc.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
/* Adds to the LRU if it was not on it.
*
* Caller holds the TF lock or o/w knows it has the only ref to tf. */
static void __add_to_lru(struct tree_file *tf)
{
struct walk_cache *wc = &tf->tfs->wc;
if (tf->flags & TF_F_ON_LRU)
return;
tf->flags |= TF_F_ON_LRU;
spin_lock(&wc->lru_lock);
list_add_tail(&tf->lru, &wc->lru);
spin_unlock(&wc->lru_lock);
}
/* Removes from the LRU if it was on it.
*
* Caller holds the TF lock or o/w knows it has the only ref to tf. */
static void __remove_from_lru(struct tree_file *tf)
{
struct walk_cache *wc = &tf->tfs->wc;
if (!(tf->flags & TF_F_ON_LRU))
return;
assert(kref_refcnt(&tf->kref) == 0);
tf->flags &= ~TF_F_ON_LRU;
spin_lock(&wc->lru_lock);
list_del(&tf->lru);
spin_unlock(&wc->lru_lock);
}
/* Caller holds the parent's qlock. Separate helpers here in case we track
* nr_children. */
static void __add_to_parent_list(struct tree_file *parent,
struct tree_file *child)
{
list_add(&child->siblings, &parent->children);
}
/* Caller holds the parent's qlock */
static void __remove_from_parent_list(struct tree_file *parent,
struct tree_file *child)
{
list_del(&child->siblings);
}
/* Safely grabs a kref on TF, possibly resurrecting from 0, at which point the
* file would be on an LRU list. This syncs with tree removers, including
* unlinking from the tree on remove and LRU cache pruners. Returns true if we
* got the ref, false if we lost and the file was disconnected. */
bool tf_kref_get(struct tree_file *tf)
{
spin_lock(&tf->lifetime);
if (tf->flags & TF_F_DISCONNECTED) {
spin_unlock(&tf->lifetime);
return false;
}
__remove_from_lru(tf);
__kref_get(&tf->kref, 1);
tf->flags |= TF_F_HAS_BEEN_USED;
spin_unlock(&tf->lifetime);
return true;
}
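/* Illustrative sketch of the intended usage pattern (it mirrors
* __tf_get_potential_parent() and tree_file_walk() below); 'some_tf_ptr' is a
* stand-in for any RCU-protected TF pointer:
*
*	rcu_read_lock();
*	tf = rcu_dereference(some_tf_ptr);
*	if (tf && !tf_kref_get(tf))
*		tf = NULL;	// lost the race with a remover / LRU pruner
*	rcu_read_unlock();
*	if (tf) {
*		// safe to block and use tf; tf_kref_put() when done
*	}
*/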
void tf_kref_put(struct tree_file *tf)
{
kref_put(&tf->kref);
}
static void __tf_free(struct tree_file *tf)
{
struct tree_file *parent = tf->parent;
struct tree_filesystem *tfs = tf->tfs;
tf->tfs->tf_ops.free(tf);
if (tf->flags & TF_F_IS_ROOT) {
assert(tfs->root == tf);
assert(!parent);
}
cleanup_fs_file((struct fs_file*)tf);
kfree(tf);
/* the reason for decreffing the parent now is for convenience on
* releasing. When we unlink the child from the LRU pruner, we don't
* want to release the parent immediately while we hold the parent's
* qlock (and other locks). */
if (parent)
tf_kref_put(parent);
}
static void __tf_free_rcu(struct rcu_head *head)
{
struct tree_file *tf = container_of(head, struct tree_file, rcu);
__tf_free(tf);
}
static void tf_release(struct kref *kref)
{
struct tree_file *tf = container_of(kref, struct tree_file, kref);
assert(!(tf->flags & TF_F_NEGATIVE));
spin_lock(&tf->lifetime);
if (kref_refcnt(&tf->kref) > 0) {
/* Someone resurrected after we decreffed to 0. */
assert(!(tf->flags & TF_F_ON_LRU));
spin_unlock(&tf->lifetime);
return;
}
if (!(tf->flags & (TF_F_DISCONNECTED | TF_F_IS_ROOT))) {
/* It's possible that we paused before locking, then another
* thread upped, downed, and put it on the LRU list already.
* The helper deals with that. */
__add_to_lru(tf);
spin_unlock(&tf->lifetime);
return;
}
spin_unlock(&tf->lifetime);
/* Need RCU, since we could have had a reader who saw the object and
* still needs to try to kref (and fail). call_rcu, since we can't
* block. */
call_rcu(&tf->rcu, __tf_free_rcu);
}
static unsigned long hash_string(const char *name)
{
unsigned long hash = 5381;
for (const char *p = name; *p; p++) {
/* hash * 33 + c, djb2's technique */
hash = ((hash << 5) + hash) + *p;
}
return hash;
}
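/* For example, hash("ab") folds as ((5381 * 33 + 'a') * 33) + 'b'
* = (5381 * 33 + 97) * 33 + 98 = 5863208. */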
static void wc_init(struct walk_cache *wc)
{
spinlock_init(&wc->lru_lock);
INIT_LIST_HEAD(&wc->lru);
spinlock_init(&wc->ht_lock);
wc->ht = wc->static_ht;
hash_init_hh(&wc->hh);
for (int i = 0; i < wc->hh.nr_hash_lists; i++)
INIT_HLIST_HEAD(&wc->ht[i]);
}
static void wc_destroy(struct walk_cache *wc)
{
assert(list_empty(&wc->lru));
for (int i = 0; i < wc->hh.nr_hash_lists; i++)
assert(hlist_empty(&wc->ht[i]));
if (wc->ht != wc->static_ht)
kfree(wc->ht);
}
/* Looks up the child of parent named 'name' in the walk cache hash table.
* Caller needs to hold an rcu read lock. */
static struct tree_file *wc_lookup_child(struct tree_file *parent,
const char *name)
{
struct walk_cache *wc = &parent->tfs->wc;
unsigned long hash_val = hash_string(name);
struct hlist_head *bucket;
struct tree_file *i;
bucket = &wc->ht[hash_val % wc->hh.nr_hash_bits];
hlist_for_each_entry_rcu(i, bucket, hash) {
/* Note 'i' is an rcu protected pointer. That deref is safe.
* i->parent is also a pointer that in general we want to
* protect. In this case, even though we don't dereference it,
* we want a pointer that is good enough to dereference so we
* can do the comparison. */
if (rcu_dereference(i->parent) != parent)
continue;
/* The file's name should never change while it is in the table,
* so no need for a seq-reader. Can't assert though, since
* there are valid reasons for other seq lockers. */
if (!strcmp(tree_file_to_name(i), name))
return i;
}
return NULL;
}
/* Caller should hold the parent's qlock */
static void wc_insert_child(struct tree_file *parent, struct tree_file *child)
{
struct walk_cache *wc = &parent->tfs->wc;
unsigned long hash_val = hash_string(tree_file_to_name(child));
struct hlist_head *bucket;
assert(child->parent == parent); /* catch bugs from our callers */
/* TODO: consider bucket locks and/or growing the HT. Prob need a
* seq_ctr in the WC, used on the read side during resizing. Removal
* probably would need something other than the bucket lock too
* (confusion about which bucket during the op). */
spin_lock(&wc->ht_lock);
bucket = &wc->ht[hash_val % wc->hh.nr_hash_bits];
hlist_add_head_rcu(&child->hash, bucket);
spin_unlock(&wc->ht_lock);
}
/* Caller should hold the parent's qlock */
static void wc_remove_child(struct tree_file *parent, struct tree_file *child)
{
struct walk_cache *wc = &parent->tfs->wc;
assert(child->parent == parent); /* catch bugs from our callers */
spin_lock(&wc->ht_lock);
hlist_del_rcu(&child->hash);
spin_unlock(&wc->ht_lock);
}
/* Helper: returns a refcounted pointer to the potential parent. May return 0.
*
* Caller needs to qlock the parent and recheck tf->parent. Callers always need
* to get a kref on the parent. We can rcu-read the parent and *attempt* to
* qlock under rcu, but we might block. Then, say, the TF and the parent got
* removed, and *then* we get the qlock. We're too late. */
static struct tree_file *__tf_get_potential_parent(struct tree_file *tf)
{
struct tree_file *parent;
rcu_read_lock();
parent = rcu_dereference(tf->parent);
if (!parent) {
/* the root of the tree has no parent */
rcu_read_unlock();
return NULL;
}
if (!tf_kref_get(parent))
parent = NULL;
rcu_read_unlock();
return parent;
}
/* Returns a refcounted and qlocked parent for child. NULL on failure. */
static struct tree_file *get_locked_and_kreffed_parent(struct tree_file *child)
{
struct tree_file *parent;
parent = __tf_get_potential_parent(child);
if (!parent)
return NULL;
qlock(&parent->file.qlock);
/* Checking the parent == child->parent isn't enough here. That works
* for rename, but not removal/unlink. Older versions of TF code
* cleared child->parent, but now that's dealt with in tf_free.
*
* We're doing a lockless peek at child's flags. We hold the potential
* parent's lock, so if they are ours, no one will be messing with the
* disconnected flag. If they are messing with it, then parent !=
* child->parent. Also, once disconnected is set, it is never clear. */
if ((child->flags & TF_F_DISCONNECTED) || (parent != child->parent)) {
qunlock(&parent->file.qlock);
tf_kref_put(parent);
return NULL;
}
return parent;
}
static bool __mark_disconnected(struct tree_file *tf)
{
bool need_to_free;
spin_lock(&tf->lifetime);
tf->flags |= TF_F_DISCONNECTED;
__remove_from_lru(tf);
need_to_free = kref_refcnt(&tf->kref) == 0;
spin_unlock(&tf->lifetime);
return need_to_free;
}
/* Disconnects child from the in-memory tree structure (i.e. only the front
* end). Caller holds the parent qlock. Assumes child is a child of parent.
* Once you disconnect, you can't touch the child object.
*
* Racing with concurrent lookups, who might grab a ref if they get in before
* DISCONNECTED, and racing with release (after ref = 0), who might free, if it
* was already unlinked. */
static void __disconnect_child(struct tree_file *parent,
struct tree_file *child)
{
bool need_to_free;
need_to_free = __mark_disconnected(child);
/* Note child->parent is still set. We clear that in __tf_free. */
__remove_from_parent_list(parent, child);
wc_remove_child(parent, child);
if (need_to_free)
call_rcu(&child->rcu, __tf_free_rcu);
}
/* Backend will need to fill in dir, except for name. Note this has a kref ==
* 0, but is not on the LRU yet. */
struct tree_file *tree_file_alloc(struct tree_filesystem *tfs,
struct tree_file *parent, const char *name)
{
struct tree_file *tf;
tf = kzmalloc(sizeof(struct tree_file), MEM_WAIT);
fs_file_init((struct fs_file*)tf, name, &tfs->fs_ops);
kref_init(&tf->kref, tf_release, 0);
spinlock_init(&tf->lifetime);
/* Need to set the parent early on, even if the child isn't linked yet,
* so that the TFS ops know who the parent is. */
tf->parent = parent;
if (parent)
kref_get(&parent->kref, 1);
INIT_LIST_HEAD(&tf->children);
tf->can_have_children = true;
tf->tfs = tfs;
return tf;
}
/* Callers must hold the parent's qlock. */
static void __link_child(struct tree_file *parent, struct tree_file *child)
{
/* Devices may have already increffed ("+1 for existing"). Those that
* don't need to be on the LRU. We haven't linked to the parent yet, so
* we hold the only ref. Once we unlock in __add_to_lru, we're
* discoverable via that list, even though we're not linked. The lru
* pruner is careful to not muck with the parent's or wc linkage without
* qlocking the parent, which we currently hold. */
if (kref_refcnt(&child->kref) == 0)
__add_to_lru(child);
/* This was set in tree_file_alloc */
assert(child->parent == parent);
__add_to_parent_list(parent, child);
wc_insert_child(parent, child);
}
/* Hold the dir's qlock. This probably works with any directory, though we only
* use it when we remove a directory. The normal way to drop negative entries
* involves working directly with the WC (tfs_lru_prune_neg). */
static void __prune_dir_negatives(struct tree_file *dir)
{
struct tree_file *child, *temp;
list_for_each_entry_safe(child, temp, &dir->children, siblings) {
if (!tree_file_is_negative(child))
continue;
spin_lock(&child->lifetime);
assert(kref_refcnt(&child->kref) == 0);
/* This mark prevents new lookups; will disconnect it shortly */
child->flags |= TF_F_DISCONNECTED;
spin_unlock(&child->lifetime);
assert(child->parent == dir);
__disconnect_child(dir, child);
}
}
static void neuter_directory(struct tree_file *dir)
{
qlock(&dir->file.qlock);
if (dir->tfs->tf_ops.has_children(dir)) {
qunlock(&dir->file.qlock);
error(ENOTEMPTY, "can't remove dir with children");
}
dir->can_have_children = false;
/* Even if we don't have real children, we might have some negatives.
* Those aren't a reason to abort an rmdir; we just need to drop them. */
__prune_dir_negatives(dir);
qunlock(&dir->file.qlock);
}
/* Unlink a child from the tree. Last ref will clean it up, which will not be
* us. Caller should hold krefs on the child and parent. The child's kref will
* actually keep the parent's alive, but all practical callers will have the
* parent kreff'ed and qlocked - which is required to ensure the child is still
* a child of parent. */
static void __unlink_child(struct tree_file *parent, struct tree_file *child)
{
/* Need to make sure concurrent creates/renames do not add children to
* directories that are unlinked. Note we don't undo the neutering if
* the backend fails. */
if (tree_file_is_dir(child))
neuter_directory(child);
/* The ramfs backend will probably decref the "+1 for existing" ref.
* This is OK. If that was the last ref, the child will briefly be on
* the LRU list (which ramfs ignores). When we disconnect, we'll yank
* the child back off the list and then free it (after rcu). */
parent->tfs->tf_ops.unlink(parent, child);
__disconnect_child(parent, child);
}
/* Talks to the backend and ensures a tree_file for the child exists, either
* positive or negative. Throws an error; doesn't return NULL.
*
* This returns with an rcu read lock sufficient to protect the returned TF, but
* it is *not* kreffed. It could be a negative entry, which is not kreffed.
*
* It is possible that at the moment we qunlock, someone comes along and removes
* the entry (LRU prune or file removal (for positive entries)). That's fine -
* we have an rcu read lock, just like during a regular walk, when the removal
* could happen anyways. */
static struct tree_file *lookup_child_entry(struct tree_file *parent,
const char *name)
{
ERRSTACK(1);
struct tree_file *child;
qlock(&parent->file.qlock);
child = wc_lookup_child(parent, name);
if (child) {
/* Since we last looked, but before we qlocked, someone else
* added our entry. */
rcu_read_lock();
qunlock(&parent->file.qlock);
return child;
}
if (!parent->can_have_children) {
qunlock(&parent->file.qlock);
error(ENOENT, "lookup failed, parent dir being removed");
}
child = tree_file_alloc(parent->tfs, parent, name);
if (waserror()) {
/* child wasn't fully created, so freeing it may be tricky, esp
* on the device ops side (might see something they never
* created). */
__tf_free(child);
qunlock(&parent->file.qlock);
nexterror();
}
parent->tfs->tf_ops.lookup(parent, child);
poperror();
__link_child(parent, child);
rcu_read_lock();
qunlock(&parent->file.qlock);
return child;
}
/* Walks a tree filesystem from 'from' for array of null-terminated names.
* Devices will often use tree_chan_walk(), but can use this directly if they
* want more control.
*
* Returns the WQ of qids. On complete/successful walks, we hang a refcnted TF
* on wq->clone.
*
* Walks can return intermediate results, which is when there is an error but we
* have walked at least once. The partial return is useful for namec(), which
* can succeed if something was mounted on an intermediate QID. Walks that have
* no results set error and return NULL, which is what namec() expects. */
struct walkqid *tree_file_walk(struct tree_file *from, char **name,
unsigned int nname)
{
ERRSTACK(2);
struct tree_file *at, *next;
struct tree_filesystem *tfs = from->tfs;
struct walkqid *wq;
wq = kzmalloc(sizeof(struct walkqid) + nname * sizeof(struct qid),
MEM_WAIT);
/* A walk with zero names means "make me a copy." If we go through the
* regular walker, our usual tf_kref_get will fail - similar to failing
* if we got a walk for "foo/../" during a concurrent removal of
* ourselves. We'll let a walk of zero names work, but if you provide
* any names, the actual walk must happen.
*
* This is tricky, and confused me a little. We're returning a *TF*
* through wq->clone, not a chan, and that is refcounted. Normally for
* chans that end up with wq->clone == c, we do not incref the object
* hanging off the chan (see k/d/d/eventfd.c), since there is just one
* chan with a kreffed object hanging off e.g. c->aux. But here,
* wq->clone is considered a distinct refcnt to some TF, and it might be
* 'from.' Our *caller* needs to deal with the "wq->clone ==
* from_chan", since they deal with chans. We deal with tree files. */
if (!nname) {
kref_get(&from->kref, 1);
wq->clone = (struct chan*)from;
return wq;
}
if (waserror()) {
kfree(wq);
poperror();
return NULL;
}
rcu_read_lock();
at = from;
for (int i = 0; i < nname; i++) {
/* Walks end if we reach a regular file, i.e. you can't walk
* through a file, only a dir. But even if there are more
* names, the overall walk might succeed. E.g. a directory
* could be mounted on top of the current file we're at. That's
* just not our job. */
if (tree_file_is_file(at)) {
if (i == 0)
error(ENOTDIR, "initial walk from a file");
break;
}
/* Normally, symlinks stop walks, and namec's walk() will deal
* with it. We allow walks 'through' symlinks, but only for ..
* and only for the first name. This is for relative lookups so
* we can find the parent of a symlink. */
if (tree_file_is_symlink(at)) {
if (i != 0)
break;
if (strcmp(name[i], ".."))
error(ELOOP,
"walk from a symlink that wasn't ..");
}
if (!caller_has_tf_perms(at, O_READ)) {
if (i == 0)
error(EACCES, "missing perm for lookup");
break;
}
if (!strcmp(name[i], ".")) {
wq->qid[wq->nqid++] = tree_file_to_qid(at);
continue;
}
if (!strcmp(name[i], "..")) {
next = rcu_dereference(at->parent);
if (!next) {
if (tree_file_is_root(at)) {
wq->qid[wq->nqid++] =
tree_file_to_qid(at);
/* I think namec should never give us
* DOTDOT that isn't at the end of the
* names array. Though devwalk() seems
* to expect it. */
if (i != nname - 1)
warn("namec DOTDOT bug?");
continue;
}
/* We lost our parent due to a removal/rename.
* We might have walked enough for our walk to
* succeed (e.g. there's a mount point in the
* WQ), so we can return what we have. Though
* if we've done nothing, it's a failure.
*
* Note the removal could have happened a long
* time ago: consider an O_PATH open, then
* namec_from().
*
* We also could walk up and see the *new*
* parent during rename(). For instance,
* /src/x/../y could get the old /src/y or the
* new /dst/y. If we want to avoid that, then
* we'll need some sort of sync with rename to
* make sure we don't get the new one. Though
* this can happen with namec_from(), so I'm not
* sure I care. */
if (i == 0)
error(ENOENT,
"file lost its parent during lookup");
break;
}
at = next;
wq->qid[wq->nqid++] = tree_file_to_qid(at);
continue;
}
next = wc_lookup_child(at, name[i]);
if (!next) {
/* TFSs with no backend have the entire tree in the WC
* HT. */
if (!tfs->tf_ops.lookup) {
if (i == 0)
error(ENOENT, "file does not exist");
break;
}
/* Need to hold a kref on 'at' before we
* rcu_read_unlock(). Our TFS op might block. */
if (!tf_kref_get(at)) {
if (i == 0)
error(ENOENT,
"file was removed during lookup");
/* WQ has a qid for 'at' from a previous loop.
* We can't grab a ref, so it's being
* removed/renamed. Yet it's still OK to report
* this as part of the wq, since we could fail
* for other reasons (thus not getting nnames)
* and have the same race, which we handle below
* (i.e. we only get a ref when it was the last
* name). Here we're just *noticing* the race.
*/
break;
}
rcu_read_unlock();
/* Propagate the error (nexterror) only on the first name. For
* i != 0, we swallow the error (poperror) and break; on success,
* we fall through to the poperror below the lookup. */
if (waserror()) {
tf_kref_put(at);
if (i == 0)
nexterror();
/* In this case, we're discarding the waserror,
* same as in the other i != 0 cases. */
poperror();
break;
}
/* returns with an rcu_read_lock protecting 'next' */
next = lookup_child_entry(at, name[i]);
tf_kref_put(at);
poperror();
assert(next);
}
if (tree_file_is_negative(next)) {
/* lockless peek. other flag users aren't atomic. */
if (!(next->flags & TF_F_HAS_BEEN_USED)) {
spin_lock(&next->lifetime);
next->flags |= TF_F_HAS_BEEN_USED;
spin_unlock(&next->lifetime);
}
if (i == 0)
error(ENOENT, "file does not exist");
break;
}
at = next;
wq->qid[wq->nqid++] = tree_file_to_qid(at);
}
if (wq->nqid == nname || tree_file_is_symlink(at)) {
if (!tf_kref_get(at)) {
/* need to undo our last result as if we never saw it.*/
wq->nqid--;
if (wq->nqid == 0)
error(ENOENT, "file was removed during lookup");
} else {
/* Hanging the refcounted TF off the wq->clone, which is
* cheating. Our walker shim knows to expect this. */
wq->clone = (struct chan*)at;
}
}
rcu_read_unlock();
poperror();
return wq;
}
/* Most tree devices will use this for their walk op, similar to devwalk. */
struct walkqid *tree_chan_walk(struct chan *c, struct chan *nc, char **name,
unsigned int nname)
{
struct tree_file *from, *to;
struct walkqid *wq;
from = chan_to_tree_file(c);
wq = tree_file_walk(from, name, nname);
if (!wq)
return NULL;
if (!wq->clone)
return wq;
if (!nc)
nc = devclone(c);
/* Not sure if callers that specify nc must have a chan from our device.
*/
assert(nc->type == c->type);
to = (struct tree_file*)wq->clone;
nc->qid = tree_file_to_qid(to);
chan_set_tree_file(nc, to);
wq->clone = nc;
/* We might be returning the same chan, so there's actually just one
* ref. */
if (wq->clone == c)
tf_kref_put(chan_to_tree_file(c));
return wq;
}
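/* Illustrative sketch of how a tree device typically wires these generic chan
* ops into its devtab entry. The struct dev field names and the dev* default
* helpers are assumptions based on the usual 9ns devtab convention, not
* anything defined in this file:
*
*	struct dev mydevtab __devtab = {
*		.name = "mydev",
*		.walk = tree_chan_walk,
*		.open = tree_chan_open,
*		.create = tree_chan_create,
*		.close = tree_chan_close,
*		.read = tree_chan_read,
*		.write = tree_chan_write,
*		.remove = tree_chan_remove,
*		.stat = tree_chan_stat,
*		.wstat = tree_chan_wstat,
*		.mmap = tree_chan_mmap,
*		.chan_ctl = tree_chan_ctl,
*		// attach, bread/bwrite, etc. come from the device or the
*		// generic dev* helpers
*	};
*/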
/* Creates a tree file under parent with the given name, perm, and optional
* symlink ext. Returns a kreffed tree_file for the new child. Throws on
* error. */
struct tree_file *tree_file_create(struct tree_file *parent, const char *name,
uint32_t perm, char *ext)
{
ERRSTACK(2);
struct tree_file *child;
bool got_ref;
struct timespec now;
qlock(&parent->file.qlock);
if (waserror()) {
qunlock(&parent->file.qlock);
nexterror();
}
if (!caller_has_tf_perms(parent, O_WRITE))
error(EACCES, "missing create permission on dir");
if (!parent->can_have_children)
error(ENOENT, "create failed, parent dir being removed");
child = wc_lookup_child(parent, name);
if (child) {
/* The create(5) message fails if the file exists, which differs
* from the syscall. namec() handles this. */
if (!tree_file_is_negative(child))
error(EEXIST, "file exists");
/* future lookups that find no entry will qlock. concurrent
* ones that see the child, even if disconnected, will see it
* was negative and fail. */
__disconnect_child(parent, child);
}
child = tree_file_alloc(parent->tfs, parent, name);
if (waserror()) {
__tf_free(child);
nexterror();
}
/* Backend will need to know the ext for its create. This gets cleaned
* up on error. */
if (perm & DMSYMLINK)
kstrdup(&child->file.dir.ext, ext);
/* Backend will need to fill in dir, except for name. */
parent->tfs->tf_ops.create(parent, child, perm);
now = nsec2timespec(epoch_nsec());
__set_acmtime_to(&child->file,
FSF_ATIME | FSF_BTIME | FSF_CTIME | FSF_MTIME, &now);
poperror();
/* At this point, the child is visible, so it must be ready to go */
__link_child(parent, child);
got_ref = tf_kref_get(child);
assert(got_ref); /* we hold the qlock, no one should have removed */
__set_acmtime_to(&parent->file, FSF_CTIME | FSF_MTIME, &now);
qunlock(&parent->file.qlock);
poperror();
return child;
}
/* Most tree devices will use this for their create op. */
void tree_chan_create(struct chan *c, char *name, int omode, uint32_t perm,
char *ext)
{
struct tree_file *parent, *child;
parent = chan_to_tree_file(c);
child = tree_file_create(parent, name, perm, ext);
c->qid = tree_file_to_qid(child);
c->mode = openmode(omode);
chan_set_tree_file(c, child);
tf_kref_put(parent);
}
struct chan *tree_chan_open(struct chan *c, int omode)
{
struct tree_file *tf = chan_to_tree_file(c);
if ((c->qid.type & QTDIR) && (omode & O_WRITE))
error(EISDIR, "can't open a dir for writing");
if (c->qid.type & QTSYMLINK)
error(ELOOP, "can't open a symlink");
tree_file_perm_check(tf, omode);
/* TODO: if we want to support DMEXCL on dir.mode, we'll need to
* lock/sync on the fs_file (have a flag for FSF_IS_OPEN, handle in
* close). We'll also need a way to pass it in to the dir.mode during
* create/wstat/etc. */
if (omode & O_TRUNC)
fs_file_truncate(&tf->file, 0);
c->mode = openmode(omode);
c->flag |= COPEN;
c->offset = 0;
return c;
}
void tree_chan_close(struct chan *c)
{
struct tree_file *tf = chan_to_tree_file(c);
tf_kref_put(tf);
}
void tree_file_remove(struct tree_file *child)
{
ERRSTACK(1);
struct tree_file *parent;
parent = get_locked_and_kreffed_parent(child);
if (!parent)
error(ENOENT, "%s had no parent", tree_file_to_name(child));
if (waserror()) {
qunlock(&parent->file.qlock);
tf_kref_put(parent);
nexterror();
}
if (!caller_has_tf_perms(parent, O_WRITE))
error(EACCES, "missing remove perm for dir");
__unlink_child(parent, child);
__set_acmtime(&parent->file, FSF_CTIME | FSF_MTIME);
poperror();
qunlock(&parent->file.qlock);
tf_kref_put(parent);
}
void tree_chan_remove(struct chan *c)
{
ERRSTACK(1);
struct tree_file *tf = chan_to_tree_file(c);
/* sysremove expects a chan that is disconnected from the device,
* regardless of whether or not we fail. See sysremove(); it will clear
* type, ensuring our close is never called. */
if (waserror()) {
chan_set_tree_file(c, NULL);
tf_kref_put(tf); /* The ref from the original walk */
nexterror();
}
tree_file_remove(tf);
chan_set_tree_file(c, NULL);
tf_kref_put(tf); /* The ref from the original walk */
poperror();
}
static bool is_descendant_of(struct tree_file *descendant,
struct tree_file *ancestor)
{
if (!tree_file_is_dir(ancestor))
return false;
rcu_read_lock();
for (struct tree_file *i = rcu_dereference(descendant->parent);
i;
i = rcu_dereference(i->parent)) {
if (i == ancestor) {
rcu_read_unlock();
return true;
}
}
rcu_read_unlock();
return false;
}
/* Caller should hold the rename mutex. This qlocks a and b, which can be the
* same file, and this handles lock ordering. Recall the rule: parents before
* children, and otherwise only the renamer can lock. */
static void qlock_tree_files(struct tree_file *a, struct tree_file *b)
{
if (a == b) {
qlock(&a->file.qlock);
return;
}
if (is_descendant_of(a, b)) {
qlock(&b->file.qlock);
qlock(&a->file.qlock);
} else {
qlock(&a->file.qlock);
qlock(&b->file.qlock);
}
}
static void qunlock_tree_files(struct tree_file *a, struct tree_file *b)
{
if (a != b)
qunlock(&a->file.qlock);
qunlock(&b->file.qlock);
}
/* Higher layers (namec) ensure that tf and new_parent are in the same device.
* The device (e.g. #mnt) ensures that they are in the same instance.
* new_parent could be the parent of tf; everything should still work if the
* move is within the same directory. */
void tree_file_rename(struct tree_file *tf, struct tree_file *new_parent,
const char *name, int flags)
{
ERRSTACK(1);
struct tree_file *old_parent;
struct tree_file *prev_dst;
struct timespec now;
old_parent = __tf_get_potential_parent(tf);
if (!old_parent)
error(ENOENT, "renamed file had no parent");
/* global mtx helps with a variety of weird races, including the "can't
* move to a subdirectory of yourself" case and the lock ordering of
* parents (locks flow from parent->child). */
qlock(&tf->tfs->rename_mtx);
qlock_tree_files(old_parent, new_parent);
if (waserror()) {
qunlock_tree_files(old_parent, new_parent);
qunlock(&tf->tfs->rename_mtx);
tf_kref_put(old_parent);
nexterror();
}
if (old_parent != tf->parent)
error(ENOENT, "renamed file lost its parent");
/* Probably a namec bug (that code isn't written yet). */
assert(tree_file_is_dir(new_parent));
if (!new_parent->can_have_children)
error(ENOENT, "target dir being removed");
if (!caller_has_tf_perms(old_parent, O_WRITE))
error(EACCES, "missing remove perm for src dir");
if (!caller_has_tf_perms(new_parent, O_WRITE))
error(EACCES, "missing create perm for dst dir");
/* can't move tf to one of its subdirs */
if (is_descendant_of(new_parent, tf))
error(EINVAL, "can't rename to a child directory");
/* We hold new_parent's qlock, so there's no worry about prev_dst
* disappearing, so no need for an rcu read lock. */
prev_dst = wc_lookup_child(new_parent, name);
if (prev_dst) {
if (tree_file_is_dir(prev_dst)) {
if (!tree_file_is_dir(tf))
error(EISDIR, "can't rename a file onto a dir");
/* We need to ensure prev_dst is childless and remains
* so. That requires grabbing its qlock, but there's a
* potential lock ordering issue with old_parent. We
* could have this: new_parent/dst/x/y/z/old_parent/src.
* That will fail, but we need to check for that case
* instead of grabbing prev_dst's qlock. */
if (is_descendant_of(prev_dst, old_parent))
error(ENOTEMPTY,
"old_parent descends from dst");
neuter_directory(prev_dst);
} else {
if (tree_file_is_dir(tf))
error(ENOTDIR,
"can't rename a dir onto a file");
}
}
/* We check with the backend first, so that it has a chance to fail
* early. Once we make the changes to the front end, lookups can see
* the effects of the change, which we can't roll back. Since we hold
* the parents' qlocks, no one should be able to get the info from the
* backend either (lookups that require the backend, readdir, etc). */
tf->tfs->tf_ops.rename(tf, old_parent, new_parent, name, flags);
/* Similar to __disconnect_child, we don't clear tf->parent. rcu
* readers at TF will be able to walk up (with ..). Same with
* namec_from an FD. If we atomically replace tf->parent, we should be
* good. See tree_file_walk().
*
* Further, we don't mark the tf disconnected. Someone might lookup
* from the old location, and that's fine. We just don't want issues
* with decrefs. */
__remove_from_parent_list(old_parent, tf);
wc_remove_child(old_parent, tf);
synchronize_rcu();
/* Now, no new lookups will find it at the old location. That change is
* not atomic wrt src, but it will be wrt dst. Importantly, no one will
* see /path/to/old_parent/new_basename */
fs_file_change_basename((struct fs_file*)tf, name);
/* We're clobbering the old_parent ref, which we'll drop later */
rcu_assign_pointer(tf->parent, new_parent);
kref_get(&new_parent->kref, 1);
__add_to_parent_list(new_parent, tf);
wc_insert_child(new_parent, tf);
/* Now both the prev_dst (if it existed) or the tf file are in the walk
* cache / HT and either could have been looked up by a concurrent
* reader. Readers will always get one or the other, but never see
* nothing. This is the atomic guarantee of rename. */
if (prev_dst) {
__remove_from_parent_list(new_parent, prev_dst);
wc_remove_child(new_parent, prev_dst);
synchronize_rcu();
/* Now no one can find prev_dst. Someone might still have a
* ref, or it might be on the LRU list (if kref == 0). Now we
* can mark disconnected. Had we disconnected earlier, then
* lookup code would see that and treat it as a failure. Using
* rcu and putting the complexity in rename was easier and
* simpler than changing lookup.
*
* We still need RCU here for freeing the prev_dst. We could
* have had an LRU pruner, etc, looking. The synchronize_rcu
* above only dealt with lookups via parent in this function. */
if (__mark_disconnected(prev_dst))
call_rcu(&prev_dst->rcu, __tf_free_rcu);
}
now = nsec2timespec(epoch_nsec());
__set_acmtime_to(&old_parent->file, FSF_CTIME | FSF_MTIME, &now);
__set_acmtime_to(&new_parent->file, FSF_CTIME | FSF_MTIME, &now);
/* Can we unlock earlier? No. We need to at least hold new_parent's
* qlock, which was the parent of prev_dst, until prev_dst is marked
* disconnected. Even though prev_dst is removed from new_parent's HT,
* it is still in the LRU list. */
qunlock_tree_files(old_parent, new_parent);
qunlock(&tf->tfs->rename_mtx);
poperror();
tf_kref_put(old_parent); /* the original tf->parent ref we clobbered */
tf_kref_put(old_parent); /* the one we grabbed when we started */
}
void tree_chan_rename(struct chan *c, struct chan *new_p_c, const char *name,
int flags)
{
struct tree_file *tf = chan_to_tree_file(c);
struct tree_file *new_parent = chan_to_tree_file(new_p_c);
tree_file_rename(tf, new_parent, name, flags);
}
/* dri is a pointer to the chan->dri, which is a count of how many directory
* entries have been read from this chan so far. 9ns handles it; we just
* need to increment it for every successful entry. Note we ignore offset. */
ssize_t tree_file_readdir(struct tree_file *parent, void *ubuf, size_t n,
off64_t offset, int *dri)
{
ERRSTACK(1);
struct tree_file *i;
size_t dir_amt, so_far = 0;
uint8_t *write_pos = ubuf;
int child_nr = 0;
qlock(&parent->file.qlock);
if (waserror()) {
qunlock(&parent->file.qlock);
nexterror();
}
list_for_each_entry(i, &parent->children, siblings) {
if (child_nr++ < *dri)
continue;
qlock(&i->file.qlock);
dir_amt = convD2M(&i->file.dir, write_pos, n - so_far);
qunlock(&i->file.qlock);
if (dir_amt <= BIT16SZ) {
if (!so_far)
error(EINVAL, "buffer to small for readdir");
break;
}
write_pos += dir_amt;
so_far += dir_amt;
assert(so_far <= n);
(*dri)++;
}
/* If we care about directory atime, we can do that here. (if so_far) */
qunlock(&parent->file.qlock);
poperror();
return so_far;
}
/* Note this only works for backend-less TFSs. It calls tree_file_readdir,
* which only looks at the frontend's tree. */
size_t tree_chan_read(struct chan *c, void *ubuf, size_t n, off64_t offset)
{
struct tree_file *tf = chan_to_tree_file(c);
if (tree_file_is_dir(tf))
return tree_file_readdir(tf, ubuf, n, offset, &c->dri);
return fs_file_read(&tf->file, ubuf, n, offset);
}
size_t tree_chan_write(struct chan *c, void *ubuf, size_t n, off64_t offset)
{
struct tree_file *tf = chan_to_tree_file(c);
/* sysfile.c:rwrite checked the chan's type. */
assert(!tree_file_is_dir(tf));
return fs_file_write(&tf->file, ubuf, n, offset);
}
size_t tree_chan_stat(struct chan *c, uint8_t *m_buf, size_t m_buf_sz)
{
struct tree_file *tf = chan_to_tree_file(c);
return fs_file_stat(&tf->file, m_buf, m_buf_sz);
}
size_t tree_chan_wstat(struct chan *c, uint8_t *m_buf, size_t m_buf_sz)
{
struct tree_file *tf = chan_to_tree_file(c);
return fs_file_wstat(&tf->file, m_buf, m_buf_sz);
}
struct fs_file *tree_chan_mmap(struct chan *c, struct vm_region *vmr, int prot,
int flags)
{
struct fs_file *f = &chan_to_tree_file(c)->file;
/* TODO: In the future, we'll check the prot, establish hooks with the
* VMR, and other things, mostly in something like fs_file_mmap, which
* will be able to handle mmaping something that doesn't use the page
* cache. For now, I'm aggressively qlocking to catch bugs. */
qlock(&f->qlock);
if ((prot & PROT_WRITE) && (flags & MAP_SHARED))
f->flags |= FSF_DIRTY;
qunlock(&f->qlock);
return f;
}
unsigned long tree_chan_ctl(struct chan *c, int op, unsigned long a1,
unsigned long a2, unsigned long a3,
unsigned long a4)
{
switch (op) {
case CCTL_DEBUG:
print_lock();
printk("Dumping tree_file info (frontend), dev %s chan %s:\n\n",
chan_dev_name(c), channame(c));
__tfs_dump_tf(chan_to_tree_file(c));
print_unlock();
return 0;
default:
error(EINVAL, "%s does not support chanctl %d",
chan_dev_name(c), op);
}
}
/* Given a tree file, construct a chan that points to the TF for the given
* device. Careful with this - it's for bootstrapping. */
struct chan *tree_file_alloc_chan(struct tree_file *tf, struct dev *dev,
char *name)
{
struct chan *c;
c = newchan();
c->type = dev - devtab;
c->name = newcname(name);
kref_get(&tf->kref, 1);
chan_set_tree_file(c, tf);
c->qid = tree_file_to_qid(tf);
return c;
}
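/* Illustrative sketch: a device's attach op handing out a chan on its TFS
* root. 'my_tfs' and 'mydevtab' are hypothetical names for the example:
*
*	static struct chan *my_attach(char *spec)
*	{
*		return tree_file_alloc_chan(my_tfs.root, &mydevtab, "#mydev");
*	}
*/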
/* Caller needs to set its customizable fields: tf_ops, pm_ops, etc. root is
* created with a ref of 1, but needs to be filled in by the particular TFS. */
void tfs_init(struct tree_filesystem *tfs)
{
wc_init(&tfs->wc);
qlock_init(&tfs->rename_mtx);
tfs->root = tree_file_alloc(tfs, NULL, ".");
tfs->root->flags |= TF_F_IS_ROOT;
assert(!(tfs->root->flags & TF_F_ON_LRU));
__kref_get(&tfs->root->kref, 1);
}
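/* Illustrative sketch of a device bringing up its TFS. 'my_tf_ops' and
* 'my_fs_ops' are hypothetical op structs for the example; the ordering just
* reflects the comment above (set customizable fields, then fill in root):
*
*	static struct tree_filesystem my_tfs;
*
*	static void my_tfs_startup(void)
*	{
*		my_tfs.tf_ops = my_tf_ops;	// lookup/create/unlink/rename/...
*		my_tfs.fs_ops = my_fs_ops;	// fs_file / page-map hooks
*		tfs_init(&my_tfs);
*		// tfs_init() made root (".", one ref); the device fills in the
*		// rest of root's dir: qid, mode, uid, atimes, etc.
*	}
*/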
void tfs_destroy(struct tree_filesystem *tfs)
{
tfs->root = NULL; /* was just freed in __tf_free() */
wc_destroy(&tfs->wc);
}
/* For every file except root, we hold our parent's qlock (root has no parent).
* This prevents TF's removal/rename and any changes to tf->parent. */
static void tf_dfs_cb(struct tree_file *tf, void (*cb)(struct tree_file *tf))
{
struct tree_file *child;
if (tree_file_is_dir(tf)) {
qlock(&tf->file.qlock);
/* Note we don't have a kref on our child's TF - we have a weak
* reference (the list membership). We hold the parent's qlock,
* which prevents removal/unlinking/disconnecting/etc. The
* child's membership on the LRU list can change repeatedly.
*
* If we want to avoid holding the parent's qlock, we could grab
* a kref on the child. However, our list walk would be in
* jeopardy - both child and temp could be removed from the
* list. So we'd need to qlock the parent, grab krefs on all
* children, and put them on a local list. Also, grabbing krefs
* on our children will muck with the LRU list; when we're done,
* it would be like we sorted the LRU list in the DFS order. */
list_for_each_entry(child, &tf->children, siblings)
tf_dfs_cb(child, cb);
qunlock(&tf->file.qlock);
}
if (!tree_file_is_negative(tf))
cb(tf);
}
/* Runs CB on all in-memory, non-negative TFs from the TFS in a depth-first
* search. Compared to the other CB walkers (purge, LRU, etc), this never
* removes/prunes/disconnects a TF from the tree. You can use it for syncing FS
* trees or pruning page maps (i.e. discarding non-dirty pages).
*
* One thing to note is that it qlocks the tree as part of its DFS. We can
* change that, but at a slight cost in complexity and tampering with the LRU
* list. Translation: we can do it if there's a performance problem.
*
* The tfs->root reference is also weak. It's up to the user to make sure
* there is no concurrent tfs_destroy. Typically we're fine when called on a
* mounted device (e.g. #tmpfs) or on a device that is never destroyed
* (e.g. #kfs). */
void tfs_frontend_for_each(struct tree_filesystem *tfs,
void (*cb)(struct tree_file *tf))
{
tf_dfs_cb(tfs->root, cb);
}
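/* Illustrative sketch of a sync-style callback for the walker above.
* 'my_writeback' is a hypothetical backend helper:
*
*	static void my_sync_cb(struct tree_file *tf)
*	{
*		if (tf->file.flags & FSF_DIRTY)
*			my_writeback(&tf->file);
*	}
*
*	tfs_frontend_for_each(tfs, my_sync_cb);
*/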
/* We should be single-user, so no need for concurrency protections. I keep
* them around for paranoia/future use/documentation. */
static void tf_dfs_purge(struct tree_file *tf, void (*cb)(struct tree_file *tf))
{
struct tree_file *child, *temp;
if (tree_file_is_dir(tf)) {
qlock(&tf->file.qlock);
/* Note we don't have a kref on TF, and one of our children
* should decref *us* to 0. We aren't disconnected yet, and we
* can't be removed (parent's qlock is held), so we'll just end
* up on the LRU list, which is OK.
*
* Our child should remove itself from our list, so we need the
* _safe.
*
* Also note that the child decrefs us in a call_rcu. CB can
* block, and technically so can qlock, so we might run RCU
* callbacks while qlocked. We'll need to rcu_barrier so that
* our children's decrefs occur before we remove ourselves from
* our parent. */
list_for_each_entry_safe(child, temp, &tf->children, siblings)
tf_dfs_purge(child, cb);
qunlock(&tf->file.qlock);
}
rcu_barrier();
/* ramfs will drop the "+1 for existing" ref here */
if (!tree_file_is_negative(tf))
cb(tf);
spin_lock(&tf->lifetime);
/* should be unused, with no children. we have a ref on root, to keep
* the TFS around while we destroy the tree. */
assert(kref_refcnt(&tf->kref) == 0 || tree_file_is_root(tf));
/* This mark prevents new lookups. We'll disconnect it shortly. */
tf->flags |= TF_F_DISCONNECTED;
spin_unlock(&tf->lifetime);
if (tf->parent)
__disconnect_child(tf->parent, tf);
}
/* Purges all in-memory TFs from the TFS in a depth-first search, both positive
* and negative. We run CB on all non-negative TFs. It's up to the caller to
* ensure there is no concurrency.
*
* The caller must make sure they have an extra ref on tfs->root to keep the
* TFS around while it gets destroyed. The root TF will get freed when it is
* released, unlike other TFs that are still connected.
*
* This is for devices that want to destroy themselves, such as an unmounted
* #tmpfs or #gtfs/#mnt, that want to do some extra work (e.g. sync).
* Typically, those devices will call this after they have no users (e.g. mounts
* or open chans). */
void tfs_frontend_purge(struct tree_filesystem *tfs,
void (*cb)(struct tree_file *tf))
{
assert(kref_refcnt(&tfs->root->kref) > 0);
tf_dfs_purge(tfs->root, cb);
}
static void print_tf(struct tree_file *tf)
{
if (tree_file_is_negative(tf)) {
printk("%-10s: Negative\n", tree_file_to_name(tf));
return;
}
printk("%-10s: Q: %5d, R: %2d, U %s, %c%o %s\n",
tree_file_to_name(tf),
tf->file.dir.qid.path,
kref_refcnt(&tf->kref),
tf->file.dir.uid,
tree_file_is_dir(tf) ? 'd'
: tf->file.dir.mode & DMSYMLINK ? 'l'
: '-',
tf->file.dir.mode & S_PMASK,
tf->file.dir.mode & DMSYMLINK ? tf->file.dir.ext : ""
);
}
static void dump_tf(struct tree_file *tf, int tabs)
{
struct tree_file *child;
if (!!(tf->file.dir.mode & DMSYMLINK) !=
!!(tf->file.dir.qid.type & QTSYMLINK))
warn("%s has differing symlink bits", tree_file_to_name(tf));
print_lock();
for (int i = 0; i < tabs; i++)
printk(" ");
print_tf(tf);
if (tree_file_is_dir(tf)) {
for (int i = 0; i < tabs; i++)
printk(" ");
printk("---------------------\n");
list_for_each_entry(child, &tf->children, siblings)
dump_tf(child, tabs + 1);
}
print_unlock();
}
void __tfs_dump(struct tree_filesystem *tfs)
{
dump_tf(tfs->root, 0);
}
void __tfs_dump_tf(struct tree_file *tf)
{
dump_tf(tf, 0);
}
/* Runs a callback on every non-negative TF on the LRU list, for a given
* snapshot of the LRU list. The CB returns true if it wants us to attempt to
* free the TF. One invariant is that we can never remove a TF from the tree
* while it is dirty; it is the job of the CB to maintain that. Note the CB can
* run on a TF as soon as that TF was linked to the parent (see lookup).
*
* The work list is a list of strong refs. We need to keep one in case the file
* is disconnected while we're running our CBs. Since we incref, we yank from
* the LRU list. We could design the rest of the TF code so that we stay on the
* LRU list throughout, but I like the invariant of "kref == 0 IFF on LRU".
*
* Since we're only on one list at a time ('wc->lru' or 'work'), we can use the
* lru list_head in the TF. We know that so long as we hold our kref on a TF,
* no one will attempt to put it back on the LRU list. */
void tfs_lru_for_each(struct tree_filesystem *tfs, bool cb(struct tree_file *),
size_t max_tfs)
{
struct list_head work = LIST_HEAD_INIT(work);
struct walk_cache *wc = &tfs->wc;
struct tree_file *tf, *temp, *parent;
size_t nr_tfs = 0;
/* We can have multiple LRU workers in flight, though a given TF will be
* on only one CB list at a time. */
spin_lock(&wc->lru_lock);
list_for_each_entry_safe(tf, temp, &wc->lru, lru) {
/* lockless peek at the flag. Once it's NEGATIVE, it never goes
* back */
if (tree_file_is_negative(tf))
continue;
/* Normal lock order is TF -> LRU.
* Best effort is fine for LRU. */
if (!spin_trylock(&tf->lifetime))
continue;
/* Can't be disconnected and on LRU */
assert(!(tf->flags & TF_F_DISCONNECTED));
assert((tf->flags & TF_F_ON_LRU));
tf->flags &= ~TF_F_ON_LRU;
list_del(&tf->lru);
__kref_get(&tf->kref, 1);
/* The 'used' bit is what allows us to detect a user in
* between our callback and the disconnection/freeing. It's a
* moot point if the CB returns false. */
tf->flags &= ~TF_F_HAS_BEEN_USED;
spin_unlock(&tf->lifetime);
list_add_tail(&tf->lru, &work);
if (++nr_tfs >= max_tfs)
break;
}
spin_unlock(&wc->lru_lock);
/* We have a snapshot of the LRU list. As soon as we unlocked a file,
* someone could incref it (e.g. to something > 1), and they'll set the
* used bit. That won't abort the CB. New TFs could be added to the
* LRU list. Those are ignored for this pass. */
list_for_each_entry_safe(tf, temp, &work, lru) {
if (!cb(tf)) {
list_del(&tf->lru);
tf_kref_put(tf);
continue;
}
}
/* Now we have a list of victims to be removed, so long as they haven't
* been used since. */
list_for_each_entry_safe(tf, temp, &work, lru) {
parent = get_locked_and_kreffed_parent(tf);
if (!parent) {
list_del(&tf->lru);
tf_kref_put(tf);
continue;
}
spin_lock(&tf->lifetime);
if (tf->flags & TF_F_HAS_BEEN_USED) {
spin_unlock(&tf->lifetime);
qunlock(&parent->file.qlock);
tf_kref_put(parent);
list_del(&tf->lru);
tf_kref_put(tf);
continue;
}
tf->flags |= TF_F_DISCONNECTED;
/* We hold a ref, so it shouldn't have found its way back on
* LRU. */
assert(!(tf->flags & TF_F_ON_LRU));
spin_unlock(&tf->lifetime);
__remove_from_parent_list(parent, tf);
wc_remove_child(parent, tf);
/* If we get tired of unlocking and relocking, we could see if
* the next parent is the current parent before unlocking. */
qunlock(&parent->file.qlock);
tf_kref_put(parent);
}
/* Now we have a list of refs that are all disconnected, kref == 1
* (because no one used them since we increffed them when they were LRU,
* which was when the refcnt was 0). Each TF has a ref on its parent
* btw, so parents will never be on the LRU list. (leaves only).
*
* We need to synchronize_rcu() too, since we could have had lockless
* lookups that have pointers to TF and are waiting to notice that it is
* disconnected. */
synchronize_rcu();
list_for_each_entry_safe(tf, temp, &work, lru) {
assert(kref_refcnt(&tf->kref) == 1);
list_del(&tf->lru);
/* We could decref, but instead we can directly free. We know
* the ref == 1 and it is disconnected. Directly freeing
* bypasses call_rcu. */
__tf_free(tf);
}
}
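/* Illustrative sketch of an LRU callback. Per the contract above, the CB must
* not let a dirty file be removed; 'my_writeback' is a hypothetical helper:
*
*	static bool my_lru_cb(struct tree_file *tf)
*	{
*		if (tf->file.flags & FSF_DIRTY)
*			my_writeback(&tf->file);	// or return false
*		return true;	// ask the pruner to try to free this TF
*	}
*
*	tfs_lru_for_each(tfs, my_lru_cb, 1000);
*/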
/* Does a one-cycle 'clock' algorithm to detect use. On a given pass, we either
* clear HAS_BEEN_USED or we remove the entry, never both. For negative entries,
* that bit is set when we look at (use) an entry; for positive entries, it is
* set when we get a reference. (We never get refs on negatives.) */
void tfs_lru_prune_neg(struct tree_filesystem *tfs)
{
struct list_head work = LIST_HEAD_INIT(work);
struct walk_cache *wc = &tfs->wc;
struct tree_file *tf, *temp, *parent;
spin_lock(&wc->lru_lock);
list_for_each_entry_safe(tf, temp, &wc->lru, lru) {
if (!tree_file_is_negative(tf))
continue;
if (!spin_trylock(&tf->lifetime))
continue;
if (tf->flags & TF_F_HAS_BEEN_USED) {
tf->flags &= ~TF_F_HAS_BEEN_USED;
spin_unlock(&tf->lifetime);
continue;
}
rcu_read_lock(); /* holding a spinlock, but just to be clear. */
parent = rcu_dereference(tf->parent);
/* Again, inverting the lock order, so best effort trylock */
if (!canqlock(&parent->file.qlock)) {
rcu_read_unlock();
spin_unlock(&tf->lifetime);
continue;
}
__remove_from_parent_list(parent, tf);
wc_remove_child(parent, tf);
qunlock(&parent->file.qlock);
/* We're off the list, but our kref == 0 still. We can break
* that invariant since we have the only ref and are about to
* free the TF. */
tf->flags &= ~TF_F_ON_LRU;
list_del(&tf->lru);
spin_unlock(&tf->lifetime);
list_add_tail(&tf->lru, &work);
}
spin_unlock(&wc->lru_lock);
/* Now we have a list of refs that are all unlinked (but actually not
* flagged DISCONNECTED; that's only necessary for positives), kref == 0
* (because they are negatives).
*
* We need to synchronize_rcu() too, since we could have had lockless
* lookups that have pointers to TF, and they may even mark
* HAS_BEEN_USED. Too late. */
synchronize_rcu();
list_for_each_entry_safe(tf, temp, &work, lru) {
assert(kref_refcnt(&tf->kref) == 0);
list_del(&tf->lru);
__tf_free(tf);
}
}