/* kern/src/rcu.c - akaros - Git at Google */

 /* Copyright (c) 2018 Google Inc
  * Barret Rhoden <brho@cs.berkeley.edu>
  * See LICENSE for details.
  *
  * RCU.  We borrow a few things from Linux - mostly the header bits and the
  * tree-rcu structure.
  *
  * Acronyms/definitions:
  * - CB: RCU callbacks (call_rcu)
  * - QS: quiescent state - a time when we know a core isn't in an RCU read-side
  *   critical section.
  * - GP: grace period.  Some quotes from Linux/Paul:
  *   - "A time period during which all such pre-existing readers complete is
  *   called a 'grace period'."
  *   - "Anything outside of an RCU read-side critical section is a quiescent
  *   state, and a grace period is any time period in which every CPU (or task,
  *   for preemptible RCU) passes through at least one quiescent state."
  * - gpnum: number of the current grace period we are working on
  * - completed: number of the grace periods completed
  *
  * We differ in a few ways from Linux's implementation:
  *
  * - Callbacks run on management cores (a.k.a, LL cores, e.g. core 0).  This way
  *   we don't have to kick idle or user space cores to run their CBs, and those
  *   CBs don't interfere with a possibly unrelated process.
  *
  * - Our RCU is most similar to rcu_sched (classic RCU), and not the preemptible
  *   RCU.  Our kthreads don't get preempted, so we don't need to worry about
  *   read side critical sections being interrupted.
  *
  * - There is no softirq processing to note the passing of GPs or to run CBs.
  *
  * - Our tree uses atomic ops to trace grace periods within the rcu_nodes.
  *   Linux's tree-rcu uses locks.  They need the locks since under some
  *   circumstances, a QS would be marked during a read-side critical section,
  *   and the QS marking needed to track the gpnum to keep the QS matched to the
  *   GP.  See
  *   https://www.kernel.org/doc/Documentation/RCU/Design/Data-Structures/Data-Structures.html
  *   and grep "Come on".  We don't need to worry about this since we only mark a
  *   QS under two situations:
  *
  *   - The core knows it does not hold an rcu_read_lock, so we can always
  *   mark QS.
  *   - The GP kthread saw the core either idle or in userspace after the gp
  *   started.  That means we know that core had a QS after the GP started.
  *
  *   So any time we mark a QS is actually a QS.  I think Linux has times where
  *   they note a QS for an older GP, and set a note to mark that QS *for that
  *   GP* in the future.  Their locks make sure they are marking for the right
  *   gpnum.  There might be some element of the rnps not knowing about the
  *   latest GP yet too.
  *
  * - We do use locking at the per-core level to decide whether or not to
  *   mark a QS for a given GP.  (lock, compare gp_acked to gpnum, etc).  This
  *   ensures only one thread (the core or the GP kth) marks the core for a given
  *   GP.  We actually could handle it if they both did, (make the trickle-up
  *   idempotent, which we do for the interior nodes) but we could run into
  *   situations where a core checks in for a GP before the global gpnum was set.
  *   This could happen when the GP kth is resetting the tree for the next GP.
  *   I think it'd be OK, but not worth the hassle and confusion.
  *
  * - We have a kthread for GP management, like Linux.  Callbacks are enqueued
  *   locally (on the core that calls call_rcu), like Linux.  We have a kthread
  *   per management core to process the callbacks, and these threads will handle
  *   the callbacks of *all* cores.  Each core has a specific mgmt kthread that
  *   will run its callbacks.  It is important that a particular core's callbacks
  *   are processed by the same thread - I rely on this to implement rcu_barrier
  *   easily.  In that case, we just need to schedule a CB on every core that has
  *   CBs, and when those N CBs are done, our barrier passed.  This relies on CBs
  *   being processed in order for a given core.  We could do the barrier in
  *   other ways, but it doesn't seem like a big deal.
  *
  * - I kept around some seq counter and locking stuff in rcu_helper.h.  We might
  *   use that in the future.
  */

 #include <rcu.h>
 #include <kthread.h>
 #include <smp.h>
 #include <kmalloc.h>

 /* How many CBs to queue up before we trigger a GP.  __call_rcu() wakes the GP
  * ktask once a core's queued-CB count exceeds this. */
 #define RCU_CB_THRESH 10
 /* How long (usec) we wait between running a GP if we weren't triggered.  Used
  * as the sleep timeout of the GP ktask's main loop. */
 #define RCU_GP_MIN_PERIOD 25000
 /* How long (usec) we wait for cores to check in.  Used as the sleep timeout
  * between checks of the tardy cores in rcu_run_gp(). */
 #define RCU_GP_TARDY_PERIOD 1000

 /* In rcu_tree_helper.c */
 extern int rcu_num_cores;
 extern int rcu_num_lvls;

 /* Controls whether we skip cores when we expedite, which forces tardy cores.
  * When set, odd-numbered cores are skipped by the fake-core and remote-core
  * reporting passes. */
 static bool rcu_debug_tardy;

 /* Externed in rcu_tree_helper.c */
 struct rcu_state rcu_state;


 /* Per-core RCU state; accessed via PERCPU_VAR*() / _PERCPU_VARPTR(). */
 DEFINE_PERCPU(struct rcu_pcpui, rcu_pcpui);

 /* Pairs an rcu_head with the semaphore that __sync_cb() ups when the callback
  * runs.  Used by synchronize_rcu() and rcu_barrier(). */
 struct sync_cb_blob {
 	struct rcu_head h;		/* embedded head; see container_of in __sync_cb */
 	struct semaphore *sem;		/* semaphore to up once the CB executes */
 };

 /* RCU callback used by synchronize_rcu() and rcu_barrier(): finds the
  * sync_cb_blob that embeds 'head' and ups the semaphore it points to. */
 static void __sync_cb(struct rcu_head *head)
 {
 	sem_up(container_of(head, struct sync_cb_blob, h)->sem);
 }

 void synchronize_rcu(void)
 {
 	struct sync_cb_blob b[1];
 	struct semaphore sem[1];

 	if (!can_block(this_pcpui_ptr()))
 		panic("Attempted %s() from an unblockable context!", __func__);
 	if (is_rcu_ktask(current_kthread))
 		panic("Attempted %s() from an RCU thread!", __func__);
 	sem_init(sem, 0);
 	init_rcu_head_on_stack(&b->h);
 	b->sem = sem;
 	call_rcu(&b->h, __sync_cb);
 	sem_down(sem);
 }

 /* Returns true when gpnum and completed differ, i.e. a grace period has been
  * advertised but not yet marked completed.  Asserts the two counters are at
  * most one apart. */
 static inline bool gp_in_progress(struct rcu_state *rsp)
 {
 	unsigned long completed, gpnum;

 	/* Sample 'completed' first, then 'gpnum' */
 	completed = READ_ONCE(rsp->completed);
 	gpnum = READ_ONCE(rsp->gpnum);
 	assert(gpnum - completed <= 1);
 	return gpnum != completed;
 }

 /* Pokes the GP kthread so that it runs a grace period.  Without 'force', this
  * is a no-op when a GP is already in progress.
  *
  * With 'force', we ask for a fresh GP, which is meant to catch all CBs
  * registered before this call.  That's not 100% true: on some non-x86
  * architectures the writes that wake the ktask might be reordered before the
  * read of gpnum that our caller made, so the caller's CB could land in a
  * later GP.  Worst case, they'll wait an extra GP timeout. */
 static void wake_gp_ktask(struct rcu_state *rsp, bool force)
 {
 	if (force || !gp_in_progress(rsp)) {
 		rsp->gp_ktask_ctl = 1;
 		rendez_wakeup(&rsp->gp_ktask_rv);
 	}
 }

 /* Executes one callback.  When head->func is really a kfree offset (see
  * kfree_call_rcu()), the enclosing object is freed by subtracting that offset
  * from 'head'; otherwise head->func is invoked on 'head'. */
 static void rcu_exec_cb(struct rcu_head *head)
 {
 	unsigned long off = (unsigned long)head->func;

 	if (!__is_kfree_rcu_offset(off)) {
 		head->func(head);
 		return;
 	}
 	kfree((void*)head - off);
 }

 /* Path for call_rcu() on a core not yet marked 'booted' (see
  * __call_rcu_rpi()).  Asserts that we are still booting and on core 0, then
  * hands the callback to run_as_rkm() for execution via rcu_exec_cb(). */
 void __early_call_rcu(struct rcu_head *head)
 {
 	extern bool booting;

 	assert(booting);
 	assert(core_id() == 0);
 	run_as_rkm(rcu_exec_cb, head);
 }

 /* Queues 'head' with callback 'func' on the CB list of 'rpi'.  'rpi' may
  * belong to a remote core, e.g. when called from rcu_barrier().
  *
  * If the target core is not marked booted, the callback is handed to
  * __early_call_rcu() and 0 is returned.  Otherwise returns the number of CBs
  * enqueued on 'rpi', including the one passed in. */
 static int __call_rcu_rpi(struct rcu_state *rsp, struct rcu_pcpui *rpi,
                            struct rcu_head *head, rcu_callback_t func)
 {
 	unsigned int queued;

 	head->func = func;
 	if (!rpi->booted) {
 		__early_call_rcu(head);
 		return 0;
 	}

 	/* rsp->gpnum is either the GP being worked on (if > completed) or the
 	 * one already done.  Either way, some core may have ACKed it during a
 	 * QS and then entered a read-side critical section that must finish
 	 * before this CB runs.  So the CB has to wait for the GP after it. */
 	head->gpnum = READ_ONCE(rsp->gpnum) + 1;

 	spin_lock_irqsave(&rpi->lock);
 	list_add_tail(&head->link, &rpi->cbs);
 	rpi->nr_cbs++;
 	queued = rpi->nr_cbs;
 	spin_unlock_irqsave(&rpi->lock);

 	/* rcu_barrier requires that the write to ->nr_cbs be visible before
 	 * any future writes.  The unlock orders the write inside, but by
 	 * definition does not stop later writes from moving in, hence the
 	 * explicit barrier. */
 	wmb();
 	return queued;
 }

 /* Common body of call_rcu() and kfree_call_rcu(), i.e. without the kfree
  * offset check.  Queues the CB on the calling core and, once more than
  * RCU_CB_THRESH CBs are queued there, wakes the GP ktask (unless a GP is
  * already in progress). */
 static void __call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	struct rcu_pcpui *rpi = PERCPU_VARPTR(rcu_pcpui);
 	unsigned int queued;

 	queued = __call_rcu_rpi(rpi->rsp, rpi, head, func);
 	if (queued <= RCU_CB_THRESH)
 		return;
 	wake_gp_ktask(rpi->rsp, false);
 }

 /* Registers 'func' to be run on 'head' via the RCU callback machinery.
  * 'func' must be a real function pointer: values that look like a kfree
  * offset are rejected by the assert (use kfree_call_rcu() for those). */
 void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	assert(!__is_kfree_rcu_offset((unsigned long)func));
 	__call_rcu(head, func);
 }

 /* Waits for the callbacks that were queued (on any core) before this call.
  *
  * For every core with a nonzero nr_cbs, we queue one __sync_cb on that core's
  * list; since a core's CBs are processed in order, our CB running implies
  * the earlier ones on that core have run.  We then force a GP and sleep until
  * all of our CBs have upped the semaphore.
  *
  * Panics if the caller cannot block or is itself an RCU ktask. */
 void rcu_barrier(void)
 {
 	struct rcu_state *rsp = PERCPU_VAR(rcu_pcpui).rsp;
 	struct semaphore sem[1];
 	struct sync_cb_blob *b;
 	int nr_sent = 0;

 	if (!can_block(this_pcpui_ptr()))
 		panic("Attempted %s() from an unblockable context!", __func__);
 	if (is_rcu_ktask(current_kthread))
 		panic("Attempted %s() from an RCU thread!", __func__);
 	/* TODO: if we have concurrent rcu_barriers, we might be able to share
 	 * the CBs.  Say we have 1 CB on a core, then N rcu_barriers.  We'll
 	 * have N call_rcus in flight, though we could share.  Linux does this
 	 * with a mtx and some accounting, I think. */
 	b = kzmalloc(sizeof(*b) * num_cores, MEM_WAIT);
 	/* Remember, you block when sem is <= 0.  We'll get nr_sent ups, and
 	 * we'll down 1 for each.  This is just like the synchronize_rcu() case;
 	 * there, nr_sent == 1. */
 	sem_init(sem, 0);
 	/* Order any signal we received from someone who called call_rcu()
 	 * before our rpi->nr_cbs reads. */
 	rmb();
 	for_each_core(i) {
 		struct rcu_pcpui *rpi = _PERCPU_VARPTR(rcu_pcpui, i);

 		/* Lockless peek at nr_cbs.  Two things to note here:
 		 * - We look at nr_cbs and not the list, since there could be
 		 *   CBs on the stack-local work list or that have blocked.
 		 * - The guarantee is that we wait for any CBs from call_rcus
 		 *   that can be proved to happen before rcu_barrier.  That
 		 *   means call_rcu had to return, which means it had to set the
 		 *   nr_cbs. */
 		if (!rpi->nr_cbs)
 			continue;
 		init_rcu_head_on_stack(&b[i].h);
 		b[i].sem = sem;
 		__call_rcu_rpi(rsp, rpi, &b[i].h, __sync_cb);
 		nr_sent++;
 	}
 	if (!nr_sent) {
 		kfree(b);
 		return;
 	}
 	/* Use the rsp we already hold rather than reaching through the loop
 	 * cursor, which is only meaningful inside the loop. */
 	wake_gp_ktask(rsp, true);
 	/* sem_down_bulk is currently slow.  Even with some fixes, we actually
 	 * want a barrier, which you could imagine doing with a tree.
 	 * sem_down_bulk() doesn't have the info that we have: that the wakeups
 	 * are coming from N cores on the leaves of the tree. */
 	sem_down_bulk(sem, nr_sent);
 	kfree(b);
 }

 /* Forces the GP ktask to wake and run a grace period.  Does not wait for it;
  * it's unclear if we want to block until the QS has passed. */
 void rcu_force_quiescent_state(void)
 {
 	struct rcu_state *rsp = PERCPU_VAR(rcu_pcpui).rsp;

 	wake_gp_ktask(rsp, true);
 }

 /* Like call_rcu(), but skips the kfree-offset assert: 'off' is passed through
  * as the callback value, and rcu_exec_cb() treats a kfree offset as "kfree the
  * object that contains 'head' at that offset". */
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t off)
 {
 	__call_rcu(head, off);
 }

 /* Clears the bit(s) in 'grpmask' from rnp's qsmask and trickles up towards
  * the root: whenever a node's qsmask drops to zero, that node's own grpmask
  * is cleared in its parent.  A 1 in qsmask means "hasn't checked in yet",
  * like a todo list.  Whoever empties the root kicks the GP kthread. */
 static void __mark_qs(struct rcu_state *rsp, struct rcu_node *rnp,
                       unsigned long grpmask)
 {
 	unsigned long remaining;

 	for (;;) {
 		remaining = __sync_and_and_fetch(&rnp->qsmask, ~grpmask);
 		/* We need some form of transitive barrier across the entire
 		 * tree.  Linux gets this from lock/unlock; our equivalent is
 		 * the atomic op. */
 		smp_mb__after_unlock_lock();
 		/* Only the last one to check in at this node sees 0 */
 		if (remaining)
 			return;
 		if (!rnp->parent)
 			break;
 		/* Climb: report this node as done to its parent */
 		grpmask = rnp->grpmask;
 		rnp = rnp->parent;
 	}
 	rendez_wakeup(&rsp->gp_ktask_rv);
 }

 /* Reports a quiescent state for the core described by 'rpi', at most once per
  * grace period.  Called both by the core itself (rcu_report_qs()) and on a
  * core's behalf (rcu_report_qs_remote_core()). */
 static void rcu_report_qs_rpi(struct rcu_state *rsp, struct rcu_pcpui *rpi)
 {
 	/* Note we don't check ->completed == ->gpnum (gp_in_progress()).  We
 	 * only care if our core hasn't reported in for a GP.  This time is a
 	 * subset of gp_in_progress. */
 	if (rpi->gp_acked == READ_ONCE(rsp->gpnum)) {
 		/* Lockless fast path: already reported for the current gpnum.
 		 * If a GP starts right afterwards, oh well.  Catch it next
 		 * time. */
 		return;
 	}
 	/* Lock ensures we only report a QS once per GP. */
 	spin_lock_irqsave(&rpi->lock);
 	/* Recheck under the lock: another thread may have reported for this
 	 * core between the peek above and taking the lock. */
 	if (rpi->gp_acked == READ_ONCE(rsp->gpnum)) {
 		spin_unlock_irqsave(&rpi->lock);
 		return;
 	}
 	/* A gp can start concurrently, but once started, we should never be
 	 * behind by more than 1. */
 	assert(rpi->gp_acked + 1 == READ_ONCE(rsp->gpnum));
 	/* Up our gp_acked before actually marking it.  I don't want to hold the
 	 * lock too long (e.g. some debug code in rendez_wakeup() calls
 	 * call_rcu).  So we've unlocked, but haven't actually checked in yet -
 	 * that's fine.  No one else will attempt to check in until the next GP,
 	 * which can't happen until after we check in for this GP. */
 	rpi->gp_acked++;
 	spin_unlock_irqsave(&rpi->lock);
 	/* Clear this core's bit in its leaf node and trickle up the tree */
 	__mark_qs(rsp, rpi->my_node, rpi->grpmask);
 }

 /* Cores advertise when they are in QSs.  If the core already reported in, or if
  * we're not in a GP, this is a quick check (given a global read of ->gpnum).
  * Reports on behalf of the calling core, against the global rcu_state. */
 void rcu_report_qs(void)
 {
 	rcu_report_qs_rpi(&rcu_state, PERCPU_VARPTR(rcu_pcpui));
 }

 /* For debugging checks on large trees: marks a QS on behalf of every "fake"
  * core, i.e. the tree slots numbered num_cores .. rcu_num_cores - 1.  Keep
  * this in sync with rcu_init_fake_cores().  When rcu_debug_tardy is set,
  * odd-numbered fake cores are skipped here. */
 static void rcu_report_qs_fake_cores(struct rcu_state *rsp)
 {
 	struct rcu_node *rnp;

 	rnp = rsp->level[rcu_num_lvls - 1];
 	for (int i = num_cores; i < rcu_num_cores; i++) {
 		/* Advance to the leaf whose [grplo, grphi] range holds i */
 		while (i > rnp->grphi)
 			rnp++;
 		if (rcu_debug_tardy && (i % 2))
 			continue;
 		/* The mask is an unsigned long; shifting a plain int 1 would
 		 * overflow for bit positions at or beyond the width of int. */
 		__mark_qs(rsp, rnp, 1UL << (i - rnp->grplo));
 	}
 }

 /* Reports a QS on behalf of core 'coreid' if a lockless peek at its cpu_state
  * shows it idle or in userspace.  If we ever saw them idle/user after a GP
  * started, we know they had a QS, and we know we're still in the original
  * GP. */
 static void rcu_report_qs_remote_core(struct rcu_state *rsp, int coreid)
 {
 	int cpu_state = READ_ONCE(pcpui_var(coreid, cpu_state));

 	if (cpu_state != CPU_STATE_IDLE && cpu_state != CPU_STATE_USER)
 		return;
 	rcu_report_qs_rpi(rsp, _PERCPU_VARPTR(rcu_pcpui, coreid));
 }

 /* Checks every core, remotely via the cpu state, to see if it is in a QS.
  * This is like an expedited grace period.  With rcu_debug_tardy set, the
  * odd-numbered cores are left alone. */
 static void rcu_report_qs_remote_cores(struct rcu_state *rsp)
 {
 	for_each_core(i) {
 		if (!rcu_debug_tardy || !(i % 2))
 			rcu_report_qs_remote_core(rsp, i);
 	}
 }

 /* Walks every leaf node and, for each bit still set in a snapshot of its
  * qsmask, either marks the QS directly (fake cores, i.e. core numbers >=
  * num_cores) or re-checks the real core via rcu_report_qs_remote_core(). */
 static void rcu_report_qs_tardy_cores(struct rcu_state *rsp)
 {
 	struct rcu_node *rnp;
 	unsigned long qsmask;
 	int i;

 	rcu_for_each_leaf_node(rsp, rnp) {
 		qsmask = READ_ONCE(rnp->qsmask);
 		if (!qsmask)
 			continue;
 		for_each_set_bit(i, &qsmask, BITS_PER_LONG) {
 			/* Fake cores */
 			if (i + rnp->grplo >= num_cores) {
 				/* i ranges up to BITS_PER_LONG - 1, so the
 				 * mask must be built from an unsigned long,
 				 * not an int. */
 				__mark_qs(rsp, rnp, 1UL << i);
 				continue;
 			}
 			rcu_report_qs_remote_core(rsp, i + rnp->grplo);
 		}
 	}
 }

 /* Rendez condition: returns 1 once the root node's qsmask reads as zero, else
  * 0.  'arg' is the struct rcu_state. */
 static int root_qsmask_empty(void *arg)
 {
 	struct rcu_state *rsp = arg;

 	return !READ_ONCE(rsp->node[0].qsmask);
 }

 /* Runs one grace period: resets the tree's qsmasks, advertises the new gpnum,
  * collects QSs (expediting via remote checks), and finally posts
  * 'completed'.  Must be entered with no GP in progress. */
 static void rcu_run_gp(struct rcu_state *rsp)
 {
 	struct rcu_node *rnp;

 	assert(rsp->gpnum == rsp->completed);
 	/* Initialize the tree for accumulating QSs.  We know there are no users
 	 * on the tree.  The only time a core looks at the tree is when
 	 * reporting a QS for a GP.  The previous GP is done, thus all cores
 	 * reported their GP already (for the previous GP), and they won't try
 	 * again until we advertise the next GP. */
 	rcu_for_each_node_breadth_first(rsp, rnp)
 		rnp->qsmask = rnp->qsmaskinit;
 	/* Need the tree set for reporting QSs before advertising the GP */
 	wmb();
 	WRITE_ONCE(rsp->gpnum, rsp->gpnum + 1);
 	/* At this point, the cores can start reporting in. */
 	/* Fake cores help test a tree larger than num_cores. */
 	rcu_report_qs_fake_cores(rsp);
 	/* Expediting aggressively.  We could also wait briefly and then check
 	 * the tardy cores. */
 	rcu_report_qs_remote_cores(rsp);
 	/* Note that even when we expedite the GP by checking remote cores,
 	 * there's a race where a core halted but we didn't see it.  (They
 	 * report QS, decide to halt, pause, we start GP, see they haven't
 	 * halted, etc.  They could report the QS after setting the state, but
 	 * that isn't done here.)  So sleep until the root's qsmask empties or
 	 * the tardy period elapses, re-check the stragglers, and repeat. */
 	do {
 		rendez_sleep_timeout(&rsp->gp_ktask_rv, root_qsmask_empty, rsp,
 		                     RCU_GP_TARDY_PERIOD);
 		rcu_report_qs_tardy_cores(rsp);
 	} while (!root_qsmask_empty(rsp));
 	/* Not sure if we need any barriers here.  Once we post 'completed', the
 	 * CBs can start running.  But no one should touch the tree til gpnum is
 	 * incremented. */
 	WRITE_ONCE(rsp->completed, rsp->gpnum);
 }

 /* Rendez condition: 'arg' points to an int control word.  Returns 1 if the
  * word is nonzero, 0 otherwise. */
 static int should_wake_ctl(void *arg)
 {
 	const int *ctl = arg;

 	if (*ctl == 0)
 		return 0;
 	return 1;
 }

 /* Wakes the callback-processing (mgmt) ktask.  Only core 0's mgmt ktask is
  * poked for now. */
 static void wake_mgmt_ktasks(struct rcu_state *rsp)
 {
 	/* TODO: For each mgmt core */
 	struct rcu_pcpui *core0 = _PERCPU_VARPTR(rcu_pcpui, 0);

 	core0->mgmt_ktask_ctl = 1;
 	rendez_wakeup(&core0->mgmt_ktask_rv);
 }

 /* Body of the grace-period ktask.  'arg' is the struct rcu_state.  Forever:
  * sleep until poked (gp_ktask_ctl != 0) or RCU_GP_MIN_PERIOD elapses, run a
  * GP, then wake the mgmt ktask(s) so they can run callbacks. */
 static void rcu_gp_ktask(void *arg)
 {
 	struct rcu_state *rsp = arg;

 	current_kthread->flags |= KTH_IS_RCU_KTASK;
 	for (;;) {
 		rendez_sleep_timeout(&rsp->gp_ktask_rv, should_wake_ctl,
 		                     &rsp->gp_ktask_ctl, RCU_GP_MIN_PERIOD);
 		rsp->gp_ktask_ctl = 0;
 		/* Our write of 0 must happen before starting the GP.  If
 		 * rcu_barrier's CBs miss the start of the GP (and thus are in
 		 * an unscheduled GP), their write of 1 must happen after our
 		 * write of 0 so that we rerun.  This is the post-and-poke
 		 * pattern.  It's not a huge deal, since we'll catch it after
 		 * the GP period timeout. */
 		wmb();
 		rcu_run_gp(rsp);
 		wake_mgmt_ktasks(rsp);
 	}
 }

 /* Runs the callbacks queued on core 'coreid' whose gpnum is not later than
  * the 'completed' value sampled on entry.  Eligible CBs are cut from the
  * core's list onto a local work list under the lock, executed with the lock
  * dropped, and only afterwards subtracted from the core's nr_cbs. */
 static void run_rcu_cbs(struct rcu_state *rsp, int coreid)
 {
 	struct rcu_pcpui *rpi = _PERCPU_VARPTR(rcu_pcpui, coreid);
 	struct list_head work = LIST_HEAD_INIT(work);
 	struct rcu_head *head, *temp, *last_for_gp = NULL;
 	int nr_cbs = 0;
 	unsigned long completed;

 	/* We'll run the CBs for any GP completed so far, but not any GP that
 	 * could be completed concurrently.  "CBs for a GP" means callbacks that
 	 * must wait for that GP to complete. */
 	completed = READ_ONCE(rsp->completed);

 	/* This lockless peek is an optimization.  We're guaranteed to not miss
 	 * the CB for the given GP: If the core had a CB for this GP, it must
 	 * have put it on the list before checking in, before the GP completes,
 	 * and before we run. */
 	if (list_empty(&rpi->cbs))
 		return;

 	spin_lock_irqsave(&rpi->lock);
 	/* The list is scanned from the front; stop at the first CB that still
 	 * needs a GP later than 'completed'. */
 	list_for_each_entry(head, &rpi->cbs, link) {
 		if (ULONG_CMP_LT(completed, head->gpnum))
 			break;
 		nr_cbs++;
 		last_for_gp = head;
 	}
 	/* Move everything up to and including last_for_gp onto 'work' */
 	if (last_for_gp)
 		list_cut_position(&work, &rpi->cbs, &last_for_gp->link);
 	spin_unlock_irqsave(&rpi->lock);

 	if (!nr_cbs) {
 		assert(list_empty(&work));
 		return;
 	}
 	/* When we're in an RCU callback, we can't block.  In our non-preemptive
 	 * world, not blocking also means our kthread won't migrate from this core,
 	 * such that the pcpui pointer (and thus the specific __ctx_depth) won't
 	 * change. */
 	set_cannot_block(this_pcpui_ptr());
 	list_for_each_entry_safe(head, temp, &work, link) {
 		list_del(&head->link);
 		rcu_exec_cb(head);
 	}
 	clear_cannot_block(this_pcpui_ptr());

 	/* We kept nr_cbs in place until the CBs completed.  This allows other
 	 * readers (rcu_barrier()) of our pcpui to tell if we have any CBs
 	 * pending.  This relies on us being the only consumer/runner of CBs
 	 * for this core. */
 	spin_lock_irqsave(&rpi->lock);
 	rpi->nr_cbs -= nr_cbs;
 	spin_unlock_irqsave(&rpi->lock);
 }

 static void rcu_mgmt_ktask(void *arg)
 {
 	struct rcu_pcpui *rpi = arg;
 	struct rcu_state *rsp = rpi->rsp;

 	current_kthread->flags |= KTH_IS_RCU_KTASK;
 	while (1) {
 		rendez_sleep(&rpi->mgmt_ktask_rv, should_wake_ctl,
 		             &rpi->mgmt_ktask_ctl);
 		rpi->mgmt_ktask_ctl = 0;
 		/* TODO: given the number of mgmt kthreads, we need to assign
 		 * cores */
 		for_each_core(i)
 			run_rcu_cbs(rsp, i);
 	};
 }

 /* Initializes the per-core RCU state 'rpi' for core 'coreid'.  rpi->my_node
  * must already point to the leaf node covering 'coreid' (asserted via
  * grplo/grphi).  Also sets this core's bit in the leaf's qsmaskinit. */
 void rcu_init_pcpui(struct rcu_state *rsp, struct rcu_pcpui *rpi, int coreid)
 {
 	struct rcu_node *rnp = rpi->my_node;

 	rpi->rsp = rsp;
 	assert(rnp->grplo <= coreid);
 	assert(coreid <= rnp->grphi);
 	rpi->coreid = coreid;
 	rpi->grpnum = coreid - rnp->grplo;
 	/* Build the mask as an unsigned long: an int shift would overflow for
 	 * group positions at or beyond the width of int. */
 	rpi->grpmask = 1UL << rpi->grpnum;
 	rpi->booted = false;

 	/* We're single threaded now, so this is OK. */
 	rnp->qsmaskinit |= rpi->grpmask;

 	spinlock_init_irqsave(&rpi->lock);
 	INIT_LIST_HEAD(&rpi->cbs);
 	rpi->nr_cbs = 0;
 	/* Start out as having ACKed every GP completed so far */
 	rpi->gp_acked = rsp->completed;

 	/* TODO: For each mgmt core only */
 	if (coreid == 0) {
 		rendez_init(&rpi->mgmt_ktask_rv);
 		rpi->mgmt_ktask_ctl = 0;
 	}
 }

 /* Initializes the fake cores (tree slots num_cores .. rcu_num_cores - 1) by
  * setting their bits in the covering leaf's qsmaskinit.  Works with
  * rcu_report_qs_fake_cores(). */
 static void rcu_init_fake_cores(struct rcu_state *rsp)
 {
 	struct rcu_node *rnp;

 	rnp = rsp->level[rcu_num_lvls - 1];
 	for (int i = num_cores; i < rcu_num_cores; i++) {
 		/* Advance to the leaf whose [grplo, grphi] range holds i */
 		while (i > rnp->grphi)
 			rnp++;
 		/* Unsigned long shift, so high bit positions are valid */
 		rnp->qsmaskinit |= 1UL << (i - rnp->grplo);
 	}
 }

 /* One-time RCU setup: builds the node tree, registers the fake cores, starts
  * the GP ktask and core 0's mgmt ktask, and finally marks every core as
  * booted so that call_rcu() starts queueing instead of taking the early
  * path. */
 void rcu_init(void)
 {
 	struct rcu_state *rsp = &rcu_state;
 	struct rcu_pcpui *rpi;

 	rcu_init_geometry();
 	rcu_init_one(rsp);
 	rcu_init_fake_cores(rsp);
 	rcu_dump_rcu_node_tree(rsp);

 	ktask("rcu_gp", rcu_gp_ktask, rsp);
 	/* TODO: For each mgmt core */
 	ktask("rcu_mgmt_0", rcu_mgmt_ktask, _PERCPU_VARPTR(rcu_pcpui, 0));

 	/* If we have a call_rcu before percpu_init, we might be using the spot
 	 * in the actual __percpu .section.  We'd be core 0, so that'd be OK,
 	 * since all we're using it for is reading 'booted'. */
 	for_each_core(i) {
 		rpi = _PERCPU_VARPTR(rcu_pcpui, i);
 		rpi->booted = true;
 	}
 }
	/* Copyright (c) 2018 Google Inc
	* Barret Rhoden <brho@cs.berkeley.edu>
	* See LICENSE for details.
	*
	* RCU. We borrow a few things from Linux - mostly the header bits and the
	* tree-rcu structure.
	*
	* Acronyms/definitions:
	* - CB: RCU callbacks (call_rcu)
	* - QS: quiescent state - a time when we know a core isn't in an RCU read-side
	* critical section.
	* - GP: grace period. Some quotes from Linux/Paul:
	* - "A time period during which all such pre-existing readers complete is
	* called a 'grace period'."
	* - "Anything outside of an RCU read-side critical section is a quiescent
	* state, and a grace period is any time period in which every CPU (or task,
	* for preemptible RCU) passes through at least one quiescent state."
	* - gpnum: number of the current grace period we are working on
	* - completed: number of the grace periods completed
	*
	* We differ in a few ways from Linux's implementation:
	*
	* - Callbacks run on management cores (a.k.a, LL cores, e.g. core 0). This way
	* we don't have to kick idle or user space cores to run their CBs, and those
	* CBs don't interfere with a possibly unrelated process.
	*
	* - Our RCU is most similar to rcu_sched (classic RCU), and not the preemptible
	* RCU. Our kthreads don't get preempted, so we don't need to worry about
	* read side critical sections being interrupted.
	*
	* - There is no softirq processing to note the passing of GPs or to run CBs.
	*
	* - Our tree uses atomic ops to trace grace periods within the rcu_nodes.
	* Linux's tree-rcu uses locks. They need the locks since under some
	* circumstances, a QS would be marked during a read-side critical section,
	* and the QS marking needed to track the gpnum to keep the QS matched to the
	* GP. See
	* https://www.kernel.org/doc/Documentation/RCU/Design/Data-Structures/Data-Structures.html
	* and grep "Come on". We don't need to worry about this since we only mark a
	* QS under two situations:
	*
	* - The core knows it does not hold an rcu_read_lock, so we can always
	* mark QS.
	* - The GP kthread saw the core either idle or in userspace after the gp
	* started. That means we know that core had a QS after the GP started.
	*
	* So any time we mark a QS is actually a QS. I think Linux has times where
	* they note a QS for an older GP, and set a note to mark that QS *for that
	* GP* in the future. Their locks make sure they are marking for the right
	* gpnum. There might be some element of the rnps not knowing about the
	* latest GP yet too.
	*
	* - We do use locking at the per-core level to decide whether or not to
	* mark a QS for a given GP. (lock, compare gp_acked to gpnum, etc). This
	* ensures only one thread (the core or the GP kth) marks the core for a given
	* GP. We actually could handle it if they both did, (make the trickle-up
	* idempotent, which we do for the interior nodes) but we could run into
	* situations where a core checks in for a GP before the global gpnum was set.
	* This could happen when the GP kth is resetting the tree for the next GP.
	* I think it'd be OK, but not worth the hassle and confusion.
	*
	* - We have a kthread for GP management, like Linux. Callbacks are enqueued
	* locally (on the core that calls call_rcu), like Linux. We have a kthread
	* per management core to process the callbacks, and these threads will handle
	* the callbacks of all cores. Each core has a specific mgmt kthread that
	* will run its callbacks. It is important that a particular core's callbacks
	* are processed by the same thread - I rely on this to implement rcu_barrier
	* easily. In that case, we just need to schedule a CB on every core that has
	* CBs, and when those N CBs are done, our barrier passed. This relies on CBs
	* being processed in order for a given core. We could do the barrier in
	* other ways, but it doesn't seem like a big deal.
	*
	* - I kept around some seq counter and locking stuff in rcu_helper.h. We might
	* use that in the future.
	*/

	#include <rcu.h>
	#include <kthread.h>
	#include <smp.h>
	#include <kmalloc.h>

	/* How many CBs to queue up before we trigger a GP.  __call_rcu() wakes the
	 * GP ktask once a core's queued-CB count exceeds this. */
	#define RCU_CB_THRESH 10
	/* How long (usec) we wait between running a GP if we weren't triggered. */
	#define RCU_GP_MIN_PERIOD 25000
	/* How long (usec) we wait for cores to check in. */
	#define RCU_GP_TARDY_PERIOD 1000

	/* In rcu_tree_helper.c */
	extern int rcu_num_cores;
	extern int rcu_num_lvls;

	/* Controls whether we skip cores when we expedite, which forces tardy cores. */
	static bool rcu_debug_tardy;

	/* Externed in rcu_tree_helper.c */
	struct rcu_state rcu_state;


	/* Per-core RCU state; accessed via PERCPU_VAR*() / _PERCPU_VARPTR(). */
	DEFINE_PERCPU(struct rcu_pcpui, rcu_pcpui);

	/* Pairs an rcu_head with the semaphore that __sync_cb() ups when the
	 * callback runs.  Used by synchronize_rcu() and rcu_barrier(). */
	struct sync_cb_blob {
	struct rcu_head h;
	struct semaphore *sem;
	};

	/* RCU callback used by synchronize_rcu() and rcu_barrier(): finds the
	 * sync_cb_blob that embeds 'head' and ups the semaphore it points to. */
	static void __sync_cb(struct rcu_head *head)
	{
		sem_up(container_of(head, struct sync_cb_blob, h)->sem);
	}

	void synchronize_rcu(void)
	{
	struct sync_cb_blob b[1];
	struct semaphore sem[1];

	if (!can_block(this_pcpui_ptr()))
	panic("Attempted %s() from an unblockable context!", __func__);
	if (is_rcu_ktask(current_kthread))
	panic("Attempted %s() from an RCU thread!", __func__);
	sem_init(sem, 0);
	init_rcu_head_on_stack(&b->h);
	b->sem = sem;
	call_rcu(&b->h, __sync_cb);
	sem_down(sem);
	}

	/* Returns true when gpnum and completed differ, i.e. a grace period has
	 * been advertised but not yet marked completed.  Asserts the two counters
	 * are at most one apart. */
	static inline bool gp_in_progress(struct rcu_state *rsp)
	{
		unsigned long completed, gpnum;

		/* Sample 'completed' first, then 'gpnum' */
		completed = READ_ONCE(rsp->completed);
		gpnum = READ_ONCE(rsp->gpnum);
		assert(gpnum - completed <= 1);
		return gpnum != completed;
	}

	/* Pokes the GP kthread so that it runs a grace period.  Without 'force',
	 * this is a no-op when a GP is already in progress.
	 *
	 * With 'force', we ask for a fresh GP, which is meant to catch all CBs
	 * registered before this call.  That's not 100% true: on some non-x86
	 * architectures the writes that wake the ktask might be reordered before
	 * the read of gpnum that our caller made, so the caller's CB could land in
	 * a later GP.  Worst case, they'll wait an extra GP timeout. */
	static void wake_gp_ktask(struct rcu_state *rsp, bool force)
	{
		if (force || !gp_in_progress(rsp)) {
			rsp->gp_ktask_ctl = 1;
			rendez_wakeup(&rsp->gp_ktask_rv);
		}
	}

	/* Executes one callback.  When head->func is really a kfree offset, the
	 * enclosing object is freed by subtracting that offset from 'head';
	 * otherwise head->func is invoked on 'head'. */
	static void rcu_exec_cb(struct rcu_head *head)
	{
		unsigned long off = (unsigned long)head->func;

		if (!__is_kfree_rcu_offset(off)) {
			head->func(head);
			return;
		}
		kfree((void*)head - off);
	}

	/* Path for call_rcu() on a core not yet marked 'booted'.  Asserts that we
	 * are still booting and on core 0, then hands the callback to
	 * run_as_rkm() for execution via rcu_exec_cb(). */
	void __early_call_rcu(struct rcu_head *head)
	{
		extern bool booting;

		assert(booting);
		assert(core_id() == 0);
		run_as_rkm(rcu_exec_cb, head);
	}

	/* Queues 'head' with callback 'func' on the CB list of 'rpi'.  This could
	 * be called for a remote core, e.g. from rcu_barrier().
	 *
	 * If the target core is not marked booted, the callback is handed to
	 * __early_call_rcu() and 0 is returned.  Otherwise returns the number of
	 * enqueued CBs, including the one we pass in.
	 *
	 * 'rsp' and 'rpi' are pointers: the body dereferences both with '->' and
	 * every caller passes pointers. */
	static int __call_rcu_rpi(struct rcu_state *rsp, struct rcu_pcpui *rpi,
	                          struct rcu_head *head, rcu_callback_t func)
	{
		unsigned int nr_cbs;

		head->func = func;

		if (!rpi->booted) {
			__early_call_rcu(head);
			return 0;
		}
		/* rsp->gpnum is the one we're either working on (if > completed)
		 * or the one we already did.  Either way, it's a GP that may have
		 * already been ACKed during a core's QS, and that core could have
		 * started a read-side critical section that must complete before
		 * CB runs.  That requires another GP. */
		head->gpnum = READ_ONCE(rsp->gpnum) + 1;
		spin_lock_irqsave(&rpi->lock);
		list_add_tail(&head->link, &rpi->cbs);
		nr_cbs = ++rpi->nr_cbs;
		spin_unlock_irqsave(&rpi->lock);
		/* rcu_barrier requires that the write to ->nr_cbs be visible
		 * before any future writes.  unlock orders the write inside, but
		 * doesn't prevent other writes from moving in.  Technically, our
		 * lock implementations do that, but it's not part of our
		 * definition.  Maybe it should be.  Til then: */
		wmb();
		return nr_cbs;
	}

	/* Common body of call_rcu() and kfree_call_rcu(), i.e. without the kfree
	 * offset check.  Queues the CB on the calling core and, once more than
	 * RCU_CB_THRESH CBs are queued there, wakes the GP ktask (unless a GP is
	 * already in progress). */
	static void __call_rcu(struct rcu_head *head, rcu_callback_t func)
	{
		struct rcu_pcpui *rpi = PERCPU_VARPTR(rcu_pcpui);
		unsigned int queued;

		queued = __call_rcu_rpi(rpi->rsp, rpi, head, func);
		if (queued <= RCU_CB_THRESH)
			return;
		wake_gp_ktask(rpi->rsp, false);
	}

	/* Registers 'func' to be run on 'head' via the RCU callback machinery.
	 * Values of 'func' that look like a kfree offset are rejected by the
	 * assert. */
	void call_rcu(struct rcu_head *head, rcu_callback_t func)
	{
		assert(!__is_kfree_rcu_offset((unsigned long)func));
		__call_rcu(head, func);
	}

	/* Waits for the callbacks that were queued (on any core) before this
	 * call.  For every core with a nonzero nr_cbs, we queue one __sync_cb on
	 * that core's list, force a GP, and sleep until all of our CBs have upped
	 * the semaphore.
	 *
	 * Panics if the caller cannot block or is itself an RCU ktask. */
	void rcu_barrier(void)
	{
		struct rcu_state *rsp = PERCPU_VAR(rcu_pcpui).rsp;
		struct semaphore sem[1];
		struct sync_cb_blob *b;
		int nr_sent = 0;

		if (!can_block(this_pcpui_ptr()))
			panic("Attempted %s() from an unblockable context!", __func__);
		if (is_rcu_ktask(current_kthread))
			panic("Attempted %s() from an RCU thread!", __func__);
		/* TODO: if we have concurrent rcu_barriers, we might be able to
		 * share the CBs.  Say we have 1 CB on a core, then N rcu_barriers.
		 * We'll have N call_rcus in flight, though we could share.  Linux
		 * does this with a mtx and some accounting, I think. */
		b = kzmalloc(sizeof(*b) * num_cores, MEM_WAIT);
		/* Remember, you block when sem is <= 0.  We'll get nr_sent ups,
		 * and we'll down 1 for each.  This is just like the
		 * synchronize_rcu() case; there, nr_sent == 1. */
		sem_init(sem, 0);
		/* Order any signal we received from someone who called call_rcu()
		 * before our rpi->nr_cbs reads. */
		rmb();
		for_each_core(i) {
			struct rcu_pcpui *rpi = _PERCPU_VARPTR(rcu_pcpui, i);

			/* Lockless peek at nr_cbs.  Two things to note here:
			 * - We look at nr_cbs and not the list, since there could
			 *   be CBs on the stack-local work list or that have
			 *   blocked.
			 * - The guarantee is that we wait for any CBs from
			 *   call_rcus that can be proved to happen before
			 *   rcu_barrier.  That means call_rcu had to return, which
			 *   means it had to set the nr_cbs. */
			if (!rpi->nr_cbs)
				continue;
			init_rcu_head_on_stack(&b[i].h);
			b[i].sem = sem;
			__call_rcu_rpi(rsp, rpi, &b[i].h, __sync_cb);
			nr_sent++;
		}
		if (!nr_sent) {
			kfree(b);
			return;
		}
		/* Use the rsp we already hold rather than reaching through the
		 * loop cursor, which is only meaningful inside the loop. */
		wake_gp_ktask(rsp, true);
		/* sem_down_bulk is currently slow.  Even with some fixes, we
		 * actually want a barrier, which you could imagine doing with a
		 * tree.  sem_down_bulk() doesn't have the info that we have: that
		 * the wakeups are coming from N cores on the leaves of the tree. */
		sem_down_bulk(sem, nr_sent);
		kfree(b);
	}

	/* Forces the GP ktask to wake and run a grace period.  Does not wait for
	 * it; it's unclear if we want to block until the QS has passed. */
	void rcu_force_quiescent_state(void)
	{
		struct rcu_state *rsp = PERCPU_VAR(rcu_pcpui).rsp;

		wake_gp_ktask(rsp, true);
	}

	/* Like call_rcu(), but skips the kfree-offset assert: 'off' is passed
	 * through as the callback value. */
	void kfree_call_rcu(struct rcu_head *head, rcu_callback_t off)
	{
		__call_rcu(head, off);
	}

	/* Clears the bits core(s) in grpmask present in rnp, trickling up to the root.
	* Note that a 1 in qsmask means you haven't checked in - like a todo list.
	* Last one out kicks the GP kthread. */
	static void __mark_qs(struct rcu_state rsp, struct rcu_node rnp,
	unsigned long grpmask)
	{
	unsigned long new_qsm;

	new_qsm = __sync_and_and_fetch(&rnp->qsmask, ~grpmask);
	/* I don't fully understand this, but we need some form of transitive
	* barrier across the entire tree. Linux does this when they
	* lock/unlock. Our equivalent is the atomic op. */
	smp_mb__after_unlock_lock();
	/* Only one thread will get 0 back - the last one to check in */
	if (new_qsm)
	return;
	if (rnp->parent)
	__mark_qs(rsp, rnp->parent, rnp->grpmask);
	else
	rendez_wakeup(&rsp->gp_ktask_rv);
	}

	static void rcu_report_qs_rpi(struct rcu_state rsp, struct rcu_pcpui rpi)
	{
	/* Note we don't check ->completed == ->gpnum (gp_in_progress()). We
	* only care if our core hasn't reported in for a GP. This time is a
	* subset of gp_in_progress. */
	if (rpi->gp_acked == READ_ONCE(rsp->gpnum)) {
	/* If a GP starts right afterwards, oh well. Catch it next
	* time. */
	return;
	}
	/* Lock ensures we only report a QS once per GP. */
	spin_lock_irqsave(&rpi->lock);
	if (rpi->gp_acked == READ_ONCE(rsp->gpnum)) {
	spin_unlock_irqsave(&rpi->lock);
	return;
	}
	/* A gp can start concurrently, but once started, we should never be
	* behind by more than 1. */
	assert(rpi->gp_acked + 1 == READ_ONCE(rsp->gpnum));
	/* Up our gp_acked before actually marking it. I don't want to hold the
	* lock too long (e.g. some debug code in rendez_wakeup() calls
	* call_rcu). So we've unlocked, but haven't actually checked in yet -
	* that's fine. No one else will attempt to check in until the next GP,
	* which can't happen until after we check in for this GP. */
	rpi->gp_acked++;
	spin_unlock_irqsave(&rpi->lock);
	__mark_qs(rsp, rpi->my_node, rpi->grpmask);
	}

	/* Advertises that the calling core is in a quiescent state.  When the core
	 * has already reported for the current gpnum, this is just a quick check
	 * (a global read of ->gpnum) inside rcu_report_qs_rpi(). */
	void rcu_report_qs(void)
	{
		struct rcu_pcpui *my_rpi = PERCPU_VARPTR(rcu_pcpui);

		rcu_report_qs_rpi(&rcu_state, my_rpi);
	}

	/* For debugging checks on large trees: marks a QS for every fake core, i.e.
	 * core ids in [num_cores, rcu_num_cores).  Keep this in sync with
	 * rcu_init_fake_cores().
	 *
	 * When rcu_debug_tardy is set, odd-numbered fake cores are skipped here so
	 * that they get picked up later as tardy cores. */
	static void rcu_report_qs_fake_cores(struct rcu_state *rsp)
	{
		struct rcu_node *rnp;

		/* Start at the first leaf; leaves are walked in core-id order. */
		rnp = rsp->level[rcu_num_lvls - 1];
		for (int i = num_cores; i < rcu_num_cores; i++) {
			while (i > rnp->grphi)
				rnp++;
			if (rcu_debug_tardy && (i % 2))
				continue;
			/* The mask is an unsigned long, so shift an unsigned
			 * long: '1 << n' is an int shift and is undefined once
			 * n reaches the width of int. */
			__mark_qs(rsp, rnp, 1UL << (i - rnp->grplo));
		}
	}

	static void rcu_report_qs_remote_core(struct rcu_state *rsp, int coreid)
	{
	int cpu_state = READ_ONCE(pcpui_var(coreid, cpu_state));
	struct rcu_pcpui *rpi = _PERCPU_VARPTR(rcu_pcpui, coreid);

	/* Lockless peek. If we ever saw them idle/user after a GP started, we
	* know they had a QS, and we know we're still in the original GP. */
	if (cpu_state == CPU_STATE_IDLE \|\| cpu_state == CPU_STATE_USER)
	rcu_report_qs_rpi(rsp, rpi);
	}

	/* Walks all cores and, via each core's advertised cpu state, tries to report
	 * a QS for it remotely.  This is like an expedited grace period.
	 *
	 * With rcu_debug_tardy set, only even-numbered cores are examined. */
	static void rcu_report_qs_remote_cores(struct rcu_state *rsp)
	{
		for_each_core(i) {
			if (!rcu_debug_tardy || !(i % 2))
				rcu_report_qs_remote_core(rsp, i);
		}
	}

	/* Scans every leaf node for cores that have not checked in yet (bits still
	 * set in qsmask).  Fake cores (id >= num_cores) are marked directly; real
	 * cores are checked remotely via their cpu state. */
	static void rcu_report_qs_tardy_cores(struct rcu_state *rsp)
	{
		struct rcu_node *rnp;
		unsigned long qsmask;
		int i;

		rcu_for_each_leaf_node(rsp, rnp) {
			/* Work on a snapshot; bits may be cleared concurrently. */
			qsmask = READ_ONCE(rnp->qsmask);
			if (!qsmask)
				continue;
			for_each_set_bit(i, &qsmask, BITS_PER_LONG) {
				/* Fake cores */
				if (i + rnp->grplo >= num_cores) {
					/* i ranges up to BITS_PER_LONG - 1, so
					 * the shift must be done on an unsigned
					 * long, not an int. */
					__mark_qs(rsp, rnp, 1UL << i);
					continue;
				}
				rcu_report_qs_remote_core(rsp, i + rnp->grplo);
			}
		}
	}

	/* Rendez condition: nonzero (1) once the root node's qsmask has no bits
	 * left, 0 otherwise.  'arg' is the struct rcu_state to inspect. */
	static int root_qsmask_empty(void *arg)
	{
		struct rcu_state *state = arg;

		return !READ_ONCE(state->node[0].qsmask);
	}

	/* Runs a single grace period for rsp.
	 *
	 * Steps, in order: reset every node's qsmask from qsmaskinit, advertise
	 * the new GP by bumping gpnum, report QSs for fake cores and for any remote
	 * core seen in a QS, then sleep on gp_ktask_rv (waking at least every
	 * RCU_GP_TARDY_PERIOD to re-check tardy cores) until the root's qsmask is
	 * empty.  Finally publish completion by setting completed = gpnum.
	 *
	 * Must be entered with no GP in flight (gpnum == completed). */
	static void rcu_run_gp(struct rcu_state *rsp)
	{
	struct rcu_node *rnp;

	assert(rsp->gpnum == rsp->completed);
	/* Initialize the tree for accumulating QSs.  We know there are no users
	 * on the tree.  The only time a core looks at the tree is when
	 * reporting a QS for a GP.  The previous GP is done, thus all cores
	 * reported their QS already (for the previous GP), and they won't try
	 * again until we advertise the next GP. */
	rcu_for_each_node_breadth_first(rsp, rnp)
	rnp->qsmask = rnp->qsmaskinit;
	/* Need the tree set for reporting QSs before advertising the GP */
	wmb();
	WRITE_ONCE(rsp->gpnum, rsp->gpnum + 1);
	/* At this point, the cores can start reporting in. */
	/* Fake cores help test a tree larger than num_cores. */
	rcu_report_qs_fake_cores(rsp);
	/* Expediting aggressively.  We could also wait briefly and then check
	 * the tardy cores. */
	rcu_report_qs_remote_cores(rsp);
	/* Note that even when we expedite the GP by checking remote cores,
	 * there's a race where a core halted but we didn't see it.  (They
	 * report QS, decide to halt, pause, we start GP, see they haven't
	 * halted, etc.)  They could report the QS after setting the state, but
	 * that isn't done; instead the loop below periodically re-checks the
	 * tardy cores until the root's qsmask drains. */
	do {
	rendez_sleep_timeout(&rsp->gp_ktask_rv, root_qsmask_empty, rsp,
	RCU_GP_TARDY_PERIOD);
	rcu_report_qs_tardy_cores(rsp);
	} while (!root_qsmask_empty(rsp));
	/* Not sure if we need any barriers here.  Once we post 'completed', the
	 * CBs can start running.  But no one should touch the tree til gpnum is
	 * incremented. */
	WRITE_ONCE(rsp->completed, rsp->gpnum);
	}

	/* Rendez condition for the RCU ktasks: 'arg' points to an int control
	 * word; returns 1 when that word is nonzero and 0 when it is zero. */
	static int should_wake_ctl(void *arg)
	{
		const int *ctl_word = arg;

		if (*ctl_word == 0)
			return 0;
		return 1;
	}

	/* Posts work for the management ktask and pokes it: sets core 0's
	 * mgmt_ktask_ctl and then wakes its rendez.  'rsp' is currently unused. */
	static void wake_mgmt_ktasks(struct rcu_state *rsp)
	{
		/* TODO: For each mgmt core */
		struct rcu_pcpui *mgmt_rpi = _PERCPU_VARPTR(rcu_pcpui, 0);

		/* Set the control word before the wakeup so the sleeper's
		 * condition check sees it. */
		mgmt_rpi->mgmt_ktask_ctl = 1;
		rendez_wakeup(&mgmt_rpi->mgmt_ktask_rv);
	}

	/* Body of the grace-period kthread; 'arg' is the struct rcu_state to drive.
	 *
	 * Loops forever: sleeps until gp_ktask_ctl is set or RCU_GP_MIN_PERIOD
	 * elapses, clears the control word, runs one grace period, and then wakes
	 * the management ktask(s) so they can run callbacks.  Never returns. */
	static void rcu_gp_ktask(void *arg)
	{
		struct rcu_state *rsp = arg;

		current_kthread->flags |= KTH_IS_RCU_KTASK;
		while (1) {
			rendez_sleep_timeout(&rsp->gp_ktask_rv, should_wake_ctl,
					     &rsp->gp_ktask_ctl,
					     RCU_GP_MIN_PERIOD);
			rsp->gp_ktask_ctl = 0;
			/* Our write of 0 must happen before starting the GP.  If
			 * rcu_barrier's CBs miss the start of the GP (and thus
			 * are in an unscheduled GP), their write of 1 must
			 * happen after our write of 0 so that we rerun.  This is
			 * the post-and-poke pattern.  It's not a huge deal,
			 * since we'll catch it after the GP period timeout. */
			wmb();
			rcu_run_gp(rsp);
			wake_mgmt_ktasks(rsp);
		}
	}

	/* Runs the callbacks queued on core 'coreid' whose grace period has already
	 * completed.
	 *
	 * Eligible callbacks are cut from the core's list onto a stack-local work
	 * list under rpi->lock, executed with the lock dropped, and only then
	 * subtracted from rpi->nr_cbs.
	 *
	 * @rsp: RCU state supplying the 'completed' GP number
	 * @coreid: core whose callback list is processed */
	static void run_rcu_cbs(struct rcu_state *rsp, int coreid)
	{
		struct rcu_pcpui *rpi = _PERCPU_VARPTR(rcu_pcpui, coreid);
		struct list_head work = LIST_HEAD_INIT(work);
		/* All three are cursors into the CB list, so all are pointers. */
		struct rcu_head *head, *temp, *last_for_gp = NULL;
		int nr_cbs = 0;
		unsigned long completed;

		/* We'll run the CBs for any GP completed so far, but not any GP
		 * that could be completed concurrently.  "CBs for a GP" means
		 * callbacks that must wait for that GP to complete. */
		completed = READ_ONCE(rsp->completed);

		/* This lockless peek is an optimization.  We're guaranteed to
		 * not miss the CB for the given GP: If the core had a CB for
		 * this GP, it must have put it on the list before checking in,
		 * before the GP completes, and before we run. */
		if (list_empty(&rpi->cbs))
			return;

		spin_lock_irqsave(&rpi->lock);
		/* Stop at the first CB that waits on a GP newer than
		 * 'completed'; everything before it is runnable. */
		list_for_each_entry(head, &rpi->cbs, link) {
			if (ULONG_CMP_LT(completed, head->gpnum))
				break;
			nr_cbs++;
			last_for_gp = head;
		}
		if (last_for_gp)
			list_cut_position(&work, &rpi->cbs, &last_for_gp->link);
		spin_unlock_irqsave(&rpi->lock);

		if (!nr_cbs) {
			assert(list_empty(&work));
			return;
		}
		/* When we're in an RCU callback, we can't block.  In our
		 * non-preemptive world, not blocking also means our kthread
		 * won't migrate from this core, such that the pcpui pointer (and
		 * thus the specific __ctx_depth) won't change. */
		set_cannot_block(this_pcpui_ptr());
		list_for_each_entry_safe(head, temp, &work, link) {
			list_del(&head->link);
			rcu_exec_cb(head);
		}
		clear_cannot_block(this_pcpui_ptr());

		/* We kept nr_cbs in place until the CBs, which could block,
		 * completed.  This allows other readers (rcu_barrier()) of our
		 * pcpui to tell if we have any CBs pending.  This relies on us
		 * being the only consumer/runner of CBs for this core. */
		spin_lock_irqsave(&rpi->lock);
		rpi->nr_cbs -= nr_cbs;
		spin_unlock_irqsave(&rpi->lock);
	}

	/* Body of a management kthread; 'arg' is the struct rcu_pcpui of the
	 * management core it serves.
	 *
	 * Loops forever: sleeps until mgmt_ktask_ctl is set, clears it, and then
	 * runs the ready callbacks of every core.  Never returns. */
	static void rcu_mgmt_ktask(void *arg)
	{
		struct rcu_pcpui *rpi = arg;
		struct rcu_state *rsp = rpi->rsp;

		current_kthread->flags |= KTH_IS_RCU_KTASK;
		while (1) {
			rendez_sleep(&rpi->mgmt_ktask_rv, should_wake_ctl,
				     &rpi->mgmt_ktask_ctl);
			rpi->mgmt_ktask_ctl = 0;
			/* TODO: given the number of mgmt kthreads, we need to
			 * assign cores */
			for_each_core(i)
				run_rcu_cbs(rsp, i);
		}
	}

	void rcu_init_pcpui(struct rcu_state rsp, struct rcu_pcpui rpi, int coreid)
	{
	struct rcu_node *rnp = rpi->my_node;

	rpi->rsp = rsp;
	assert(rnp->grplo <= coreid);
	assert(coreid <= rnp->grphi);
	rpi->coreid = coreid;
	rpi->grpnum = coreid - rnp->grplo;
	rpi->grpmask = 1 << rpi->grpnum;
	rpi->booted = false;

	/* We're single threaded now, so this is OK. */
	rnp->qsmaskinit \|= rpi->grpmask;

	spinlock_init_irqsave(&rpi->lock);
	INIT_LIST_HEAD(&rpi->cbs);
	rpi->nr_cbs = 0;
	rpi->gp_acked = rsp->completed;

	/* TODO: For each mgmt core only */
	if (coreid == 0) {
	rendez_init(&rpi->mgmt_ktask_rv);
	rpi->mgmt_ktask_ctl = 0;
	}
	}

	/* Initializes the fake cores, i.e. core ids in [num_cores, rcu_num_cores),
	 * by setting their bits in the owning leaf node's qsmaskinit.  Works with
	 * rcu_report_qs_fake_cores(). */
	static void rcu_init_fake_cores(struct rcu_state *rsp)
	{
		struct rcu_node *rnp;

		/* Start at the first leaf; leaves are walked in core-id order. */
		rnp = rsp->level[rcu_num_lvls - 1];
		for (int i = num_cores; i < rcu_num_cores; i++) {
			while (i > rnp->grphi)
				rnp++;
			/* Unsigned long shift to match the mask's type. */
			rnp->qsmaskinit |= 1UL << (i - rnp->grplo);
		}
	}

	/* Boot-time RCU setup: computes the tree geometry, initializes the global
	 * rcu_state (including fake cores), dumps the node tree, starts the GP and
	 * management ktasks, and finally marks every core's rcu_pcpui as booted. */
	void rcu_init(void)
	{
		struct rcu_state *state = &rcu_state;

		rcu_init_geometry();
		rcu_init_one(state);
		rcu_init_fake_cores(state);
		rcu_dump_rcu_node_tree(state);

		ktask("rcu_gp", rcu_gp_ktask, state);
		/* TODO: For each mgmt core */
		ktask("rcu_mgmt_0", rcu_mgmt_ktask, _PERCPU_VARPTR(rcu_pcpui, 0));

		/* If we have a call_rcu before percpu_init, we might be using
		 * the spot in the actual __percpu .section.  We'd be core 0, so
		 * that'd be OK, since all we're using it for is reading
		 * 'booted'. */
		for_each_core(i) {
			struct rcu_pcpui *core_rpi = _PERCPU_VARPTR(rcu_pcpui, i);

			core_rpi->booted = true;
		}
	}