alarm: Handle the tchain in RKM context

The root issue is that we can't touch the alarm waiter after it has
finished, yet we also want to know when it has completed.  The way to do
a layer of indirection!  (sort of - just with a pointer).  We can keep a
pointer that tracks the handler we're working on.

Don't forget - we want to run the handlers without holding the lock too,
which makes setting an alarm from a handler easier.  And it might avoid
deadlocks, such as the ones we saw in userspace with futexes.

To do all of this, we need to synchronize on that pointer, and it's
basically part of the tchain.  We want to manipulate the handlers and
the tchain while holding the lock, and run with the pointer set.  This
couples the execution of the handlers with the processing of the tchain.
The old code just fired off a bunch of RKMs and forgot about them - we
can't forget now.

The answer here is to run the tchain handler as an RKM itself.  The
easiest way is to have __trigger_tchain send the core an RKM to
run_tchain.  Initially, this used a per-core ktask that looped and slept
on a CV, which the IRQ handler kicked.  Just using the RKM directly is a little
faster and simpler.  It also means we won't have to worry about it if we
ever do special things with ktasks, like scheduling, priority, affinity,
etc.  The one slight downside is we might get multiple RKMs on the same
core to run the tchain.  That's pretty minor, and harmless from a
correctness perspective.

Also, since we're not touching the lock in IRQ ctx, we probably can make
it a non-irqsave one.

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/include/alarm.h b/kern/include/alarm.h
index c9f6197..d0514a2 100644
--- a/kern/include/alarm.h
+++ b/kern/include/alarm.h
@@ -1,4 +1,5 @@
 /* Copyright (c) 2011 The Regents of the University of California
+ * Copyright (c) 2018 Google Inc.
  * Barret Rhoden <brho@cs.berkeley.edu>
  * See LICENSE for details.
  *
@@ -72,9 +73,6 @@
 	void						*data;
 	TAILQ_ENTRY(alarm_waiter)	next;
 	bool						on_tchain;
-	bool						is_running;
-	bool						no_rearm;
-	struct cond_var				done_cv;
 };
 TAILQ_HEAD(awaiters_tailq, alarm_waiter);		/* ideally not a LL */
 
@@ -86,8 +84,10 @@
 struct timer_chain {
 	spinlock_t					lock;
 	struct awaiters_tailq		waiters;
+	struct alarm_waiter			*running;
 	uint64_t					earliest_time;
 	uint64_t					latest_time;
+	struct cond_var				cv;
 	void (*set_interrupt)(struct timer_chain *);
 };
 
diff --git a/kern/src/alarm.c b/kern/src/alarm.c
index d8fd247..cc3008d 100644
--- a/kern/src/alarm.c
+++ b/kern/src/alarm.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2011 The Regents of the University of California
+ * Copyright (c) 2018 Google Inc.
  * Barret Rhoden <brho@cs.berkeley.edu>
  * See LICENSE for details.
  *
@@ -43,6 +44,7 @@
 	TAILQ_INIT(&tchain->waiters);
 	tchain->set_interrupt = set_interrupt;
 	reset_tchain_times(tchain);
+	cv_init_irqsave_with_lock(&tchain->cv, &tchain->lock);
 }
 
 void init_awaiter(struct alarm_waiter *waiter,
@@ -52,9 +54,6 @@
 	waiter->func = func;
 	waiter->wake_up_time = ALARM_POISON_TIME;
 	waiter->on_tchain = false;
-	waiter->is_running = false;
-	waiter->no_rearm = false;
-	cv_init_irqsave(&waiter->done_cv);
 }
 
 /* Give this the absolute time.  For now, abs_time is the TSC time that you want
@@ -104,73 +103,59 @@
 	}
 }
 
-static void __finish_awaiter(struct alarm_waiter *waiter)
+static void __run_tchain(uint32_t srcid, long a0, long a1, long a2)
 {
-	int8_t irq_state = 0;
+	struct timer_chain *tchain = (struct timer_chain*)a0;
+	struct alarm_waiter *i;
 
-	/* Syncing with unset_alarm.  They are waiting for us to tell them the
-	 * waiter is not running.
-	 *
-	 * 'is_running' is set true under the tchain lock.  It's checked and cleared
-	 * under the cv_lock, but not necessarily with the tchain lock. */
-	cv_lock_irqsave(&waiter->done_cv, &irq_state);
-	waiter->is_running = false;
-	/* broadcast, instead of signal.  This allows us to have multiple unsetters
-	 * concurrently.  (only one of which will succeed, so YMMV.) */
-	__cv_broadcast(&waiter->done_cv);
-	cv_unlock_irqsave(&waiter->done_cv, &irq_state);
-}
+	spin_lock_irqsave(&tchain->lock);
+	/* It's possible we have multiple contexts running a single tchain.  It
+	 * shouldn't be possible for per-core tchains, but it is possible
+	 * otherwise.  In that case, we can just abort, treating the event/IRQ
+	 * that woke us up as a 'poke'. */
+	if (tchain->running) {
+		spin_unlock_irqsave(&tchain->lock);
+		return;
+	}
+	while ((i = TAILQ_FIRST(&tchain->waiters))) {
+		/* TODO: Could also do something in cases where it's close to
+		 * expiring. */
+		if (i->wake_up_time > read_tsc())
+			break;
+		TAILQ_REMOVE(&tchain->waiters, i, next);
+		i->on_tchain = false;
+		tchain->running = i;
 
-static void __run_awaiter(uint32_t srcid, long a0, long a1, long a2)
-{
-	struct alarm_waiter *waiter = (struct alarm_waiter*)a0;
+		/* Need the tchain times (earliest/latest) in sync when
+		 * unlocked. */
+		reset_tchain_times(tchain);
 
-	set_cannot_block(this_pcpui_ptr());
-	waiter->func(waiter);
-	clear_cannot_block(this_pcpui_ptr());
-	__finish_awaiter(waiter);
-}
+		spin_unlock_irqsave(&tchain->lock);
 
-static void wake_awaiter(struct alarm_waiter *waiter,
-                         struct hw_trapframe *hw_tf)
-{
-	send_kernel_message(core_id(), __run_awaiter, (long)waiter,
-	                    0, 0, KMSG_ROUTINE);
+		/* Don't touch the waiter after running it, since the memory can
+		 * be used immediately (e.g. after a kthread unwinds). */
+		set_cannot_block(this_pcpui_ptr());
+		i->func(i);
+		clear_cannot_block(this_pcpui_ptr());
+
+		spin_lock_irqsave(&tchain->lock);
+		tchain->running = NULL;
+
+		/* There should only be at most one blocked unsetter, since only
+		 * one alarm can run at a time (per tchain). */
+		__cv_signal(&tchain->cv);
+		warn_on(tchain->cv.nr_waiters);
+	}
+	reset_tchain_interrupt(tchain);
+	spin_unlock_irqsave(&tchain->lock);
 }
 
 /* This is called when an interrupt triggers a tchain, and needs to wake up
  * everyone whose time is up.  Called from IRQ context. */
 void __trigger_tchain(struct timer_chain *tchain, struct hw_trapframe *hw_tf)
 {
-	struct alarm_waiter *i, *temp;
-	uint64_t now = read_tsc();
-	struct awaiters_tailq to_wake = TAILQ_HEAD_INITIALIZER(to_wake);
-
-	spin_lock_irqsave(&tchain->lock);
-	TAILQ_FOREACH_SAFE(i, &tchain->waiters, next, temp) {
-		printd("Trying to wake up %p who is due at %llu and now is %llu\n",
-		       i, i->wake_up_time, now);
-		/* TODO: Could also do something in cases where we're close to now */
-		if (i->wake_up_time > now)
-			break;
-		/* At this point, unset must wait until it has finished */
-		i->on_tchain = false;
-		i->is_running = true;
-		TAILQ_REMOVE(&tchain->waiters, i, next);
-		TAILQ_INSERT_TAIL(&to_wake, i, next);
-	}
-	reset_tchain_times(tchain);
-	reset_tchain_interrupt(tchain);
-	spin_unlock_irqsave(&tchain->lock);
-
-	TAILQ_FOREACH_SAFE(i, &to_wake, next, temp) {
-		/* Don't touch the waiter after waking it, since it could be in use on
-		 * another core (and the waiter can be clobbered as the kthread unwinds
-		 * its stack).  Or it could be kfreed.  Technically, the waiter hasn't
-		 * finished until we cleared is_running and unlocked the cv lock. */
-		TAILQ_REMOVE(&to_wake, i, next);
-		wake_awaiter(i, hw_tf);
-	}
+	send_kernel_message(core_id(), __run_tchain, (long)tchain, 0, 0,
+	                    KMSG_ROUTINE);
 }
 
 /* Helper, inserts the waiter into the tchain, returning TRUE if we still need
@@ -225,13 +210,6 @@
 	assert(!waiter->on_tchain);
 
 	spin_lock_irqsave(&tchain->lock);
-	if (waiter->no_rearm) {
-		/* no_rearm exists to prevent alarm handlers from perpetually rearming
-		 * when another thread is trying to unset the alarm.  We could return an
-		 * error / false, but I don't have a use for that yet. */
-		spin_unlock_irqsave(&tchain->lock);
-		return;
-	}
 	if (__insert_awaiter(tchain, waiter))
 		reset_tchain_interrupt(tchain);
 	spin_unlock_irqsave(&tchain->lock);
@@ -270,27 +248,26 @@
-	int8_t irq_state = 0;
-
 	spin_lock_irqsave(&tchain->lock);
-	if (waiter->on_tchain) {
-		if (__remove_awaiter(tchain, waiter))
-			reset_tchain_interrupt(tchain);
-		spin_unlock_irqsave(&tchain->lock);
-		return true;
+	for (;;) {
+		if (waiter->on_tchain) {
+			if (__remove_awaiter(tchain, waiter))
+				reset_tchain_interrupt(tchain);
+			spin_unlock_irqsave(&tchain->lock);
+			return true;
+		}
+		if (tchain->running != waiter) {
+			spin_unlock_irqsave(&tchain->lock);
+			return false;
+		}
+		/* It's running.  We'll need to try again.  Note the alarm could
+		 * have resubmitted itself, so ideally the caller can tell it to
+		 * not resubmit.
+		 *
+		 *
+		 * Arguably by using a CV we're slowing down the common case for
+		 * run_tchain (no race on unset) ever so slightly.  The
+		 * alternative here would be to busy-wait with unlock/yield/lock
+		 * (more of a cv_spin). */
+		cv_wait(&tchain->cv);
 	}
-
-	/* no_rearm is set and checked under the tchain lock.  It is cleared when
-	 * unset completes, outside the lock.  That is safe since we know the alarm
-	 * service is no longer aware of waiter (either the handler ran or we
-	 * stopped it). */
-	waiter->no_rearm = true;
-	spin_unlock_irqsave(&tchain->lock);
-
-	cv_lock_irqsave(&waiter->done_cv, &irq_state);
-	while (waiter->is_running)
-		cv_wait(&waiter->done_cv);
-	cv_unlock_irqsave(&waiter->done_cv, &irq_state);
-
-	waiter->no_rearm = false;
-	return false;
 }
 
 bool reset_alarm_abs(struct timer_chain *tchain, struct alarm_waiter *waiter,
diff --git a/kern/src/smp.c b/kern/src/smp.c
index 934c516..f9dbe2b 100644
--- a/kern/src/smp.c
+++ b/kern/src/smp.c
@@ -120,8 +120,7 @@
 	STAILQ_INIT(&per_cpu_info[coreid].immed_amsgs);
 	spinlock_init_irqsave(&per_cpu_info[coreid].routine_amsg_lock);
 	STAILQ_INIT(&per_cpu_info[coreid].routine_amsgs);
-	/* Initialize the per-core timer chain */
-	init_timer_chain(&per_cpu_info[coreid].tchain, set_pcpu_alarm_interrupt);
+	init_timer_chain(&this_pcpui_var(tchain), set_pcpu_alarm_interrupt);
 	/* Init generic tracing ring */
 	trace_buf = kpage_alloc_addr();
 	assert(trace_buf);