Fix issues with unset_alarm() [1/2]
Previously, unset_alarm() would return before an RKM alarm would fire. It
was up to users (e.g. devalarm) to deal with it. That's a real pain.
Now, unset_alarm() will block until we're sure the alarm is done. This
gets tricky for alarms that rearm themselves.
As a side-effect of this, unset_alarm() now has a waits-on dependency on
the handler for RKM alarms. (All of this is much simpler with IRQ alarms).
In the process, I cleaned up the reset_* family, which were just unset/set
helpers.
Odds are, there are more problems with this that will pop up later.
Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/include/alarm.h b/kern/include/alarm.h
index b0bb52c..1a7e387 100644
--- a/kern/include/alarm.h
+++ b/kern/include/alarm.h
@@ -25,15 +25,15 @@
* irqsave locks.
*
* Another important difference between IRQ and RKM alarms comes when cancelling
- * or unsetting an alarm. When you cancel (unset or reset) an alarm, the alarm
- * is yanked off the tchain. If the waiter was on the chain, then it will not
- * fire for both IRQ and RKM alarms. If the waiter was not on the chain, then
- * for IRQ alarms, this means that the alarm has already fired. However, for
- * RKM alarms, the alarm may have already fired or it may still be waiting to
- * fire (sitting in an RKM queue). It will fire at some point, but perhaps it
- * has not fired yet. It is also possibly (though extremely unlikely) that if
- * you reset an RKM alarm that the new alarm actually happens before the old one
- * (if the new RKM was sent to another core).
+ * or unsetting an alarm. When you cancel (unset or reset) an alarm, you may
+ * need to block until the RKM has run. IRQ alarms run with the tchain lock
+ * held, so once the canceller grabs the lock, it has either run already or will
+ * not at all. With RKMs, the handler runs outside of the lock. Thus you may
+ * have to wait until the RKM has run, and the RKM might be waiting to run on
+ * your core.
+ *
+ * Note that RKM unset_alarm() has a waits-on dependency with the actual alarm
+ * handler, so be careful of deadlock.
*
* To use an IRQ alarm, init the waiter with init_awaiter_irq().
*
@@ -78,6 +78,8 @@
bool on_tchain;
bool irq_ok;
bool holds_tchain_lock;
+ bool rkm_pending;
+ struct cond_var rkm_cv;
};
TAILQ_HEAD(awaiters_tailq, alarm_waiter); /* ideally not a LL */
@@ -107,9 +109,14 @@
void set_awaiter_abs(struct alarm_waiter *waiter, uint64_t abs_time);
void set_awaiter_rel(struct alarm_waiter *waiter, uint64_t usleep);
void set_awaiter_inc(struct alarm_waiter *waiter, uint64_t usleep);
-/* Arms/disarms the alarm. */
+/* Arms/disarms the alarm. Can be called from within a handler.*/
void set_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter);
+/* Unset and reset may block if the alarm is not IRQ. Do not call from within a
+ * handler. Returns TRUE if you stopped the alarm from firing. */
bool unset_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter);
+/* Convenience wrappers for unset, then set. Slower, but easier than just
+ * setting, since you don't need to know if it fired. Returns TRUE if the alarm
+ * did not fire before your reset. */
bool reset_alarm_abs(struct timer_chain *tchain, struct alarm_waiter *waiter,
uint64_t abs_time);
bool reset_alarm_rel(struct timer_chain *tchain, struct alarm_waiter *waiter,
diff --git a/kern/src/alarm.c b/kern/src/alarm.c
index df1efba..6ca48ac 100644
--- a/kern/src/alarm.c
+++ b/kern/src/alarm.c
@@ -59,6 +59,7 @@
assert(func);
waiter->func = func;
__init_awaiter(waiter);
+ cv_init(&waiter->rkm_cv);
}
void init_awaiter_irq(struct alarm_waiter *waiter,
@@ -121,7 +122,16 @@
static void __run_awaiter(uint32_t srcid, long a0, long a1, long a2)
{
struct alarm_waiter *waiter = (struct alarm_waiter*)a0;
+
waiter->func(waiter);
+ cv_lock(&waiter->rkm_cv);
+ /* This completes the alarm's function. We don't need to sync with
+ * wake_waiter, we happen after. We do need to sync with unset_alarm. */
+ waiter->rkm_pending = FALSE;
+ /* broadcast, instead of signal. This allows us to have multiple unsetters
+ * concurrently. (only one of which will succeed, so YMMV.) */
+ __cv_broadcast(&waiter->rkm_cv);
+ cv_unlock(&waiter->rkm_cv);
}
static void wake_awaiter(struct alarm_waiter *waiter,
@@ -132,6 +142,9 @@
waiter->func_irq(waiter, hw_tf);
waiter->holds_tchain_lock = FALSE;
} else {
+ /* The alarm is in limbo and is uncancellable from now (IRQ ctx, tchain
+ * lock held) until it finishes. */
+ waiter->rkm_pending = TRUE;
send_kernel_message(core_id(), __run_awaiter, (long)waiter,
0, 0, KMSG_ROUTINE);
}
@@ -223,14 +236,15 @@
static void __set_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter)
{
+ assert(!waiter->on_tchain);
if (__insert_awaiter(tchain, waiter))
reset_tchain_interrupt(tchain);
}
-/* Sets the alarm. If it is a kthread-style alarm (func == 0), sleep on it
- * later. */
-void set_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter)
+static void __set_alarm_irq(struct timer_chain *tchain,
+ struct alarm_waiter *waiter)
{
+ /* holds_tchain_lock is set when we're called from an alarm handler */
if (waiter->holds_tchain_lock) {
__set_alarm(tchain, waiter);
} else {
@@ -240,6 +254,24 @@
}
}
+static void __set_alarm_rkm(struct timer_chain *tchain,
+ struct alarm_waiter *waiter)
+{
+ spin_lock_irqsave(&tchain->lock);
+ __set_alarm(tchain, waiter);
+ spin_unlock_irqsave(&tchain->lock);
+}
+
+/* Sets the alarm. If it is a kthread-style alarm (func == 0), sleep on it
+ * later. */
+void set_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter)
+{
+ if (waiter->irq_ok)
+ return __set_alarm_irq(tchain, waiter);
+ else
+ return __set_alarm_rkm(tchain, waiter);
+}
+
/* Helper, rips the waiter from the tchain, knowing that it is on the list.
* Returns TRUE if the tchain interrupt needs to be reset. Callers hold the
* lock. */
@@ -264,64 +296,88 @@
return reset_int;
}
+static bool __unset_alarm_irq(struct timer_chain *tchain,
+ struct alarm_waiter *waiter)
+{
+ bool was_on_chain = FALSE;
+
+ /* We need to lock the tchain before looking at on_tchain. At one point, I
+ * thought we could do the check-signal-check again style (lockless peek).
+ * The reason we can't is that on_tchain isn't just set FALSE. A handler
+ * could reset the alarm and set it TRUE while we're looking. We could
+ * briefly peek when it is off the chain but about to run its handler.
+ *
+ * I was tempted to assert(!waiter->holds_tchain_lock), to catch people who
+ * try to unset from a handler. That won't work, since you can validly
+ * unset while the alarm is going off. In that case, you might see
+ * holds_tchain_lock set briefly. */
+ spin_lock_irqsave(&tchain->lock);
+ if (waiter->on_tchain) {
+ was_on_chain = TRUE;
+ if (__remove_awaiter(tchain, waiter))
+ reset_tchain_interrupt(tchain);
+ }
+ spin_unlock_irqsave(&tchain->lock);
+ /* IRQ alarms run under the tchain lock. If we ripped it off the chain, it
+ * won't fire again. Alarms that rearm may have fired multiple times before
+ * we locked, but once we locked, it was done. */
+ return was_on_chain;
+}
+
+static bool __unset_alarm_rkm(struct timer_chain *tchain,
+ struct alarm_waiter *waiter)
+{
+ bool was_on_chain, was_pending;
+
+ cv_lock(&waiter->rkm_cv);
+ while (1) {
+ spin_lock_irqsave(&tchain->lock);
+ was_on_chain = waiter->on_tchain;
+ /* I think we can safely check pending outside the tchain lock, but it's
+ * not worth the hassle and this is probably safer. Basically,
+ * rkm_pending will be set only if on_tchain is FALSE, and it won't get
+ * cleared until someone grabs the cv_lock (which we hold). */
+ was_pending = waiter->rkm_pending;
+ if (was_on_chain) {
+ /* The only way we ever stop repeating alarms permanently (i.e. they
+ * rearm) is if we yank it off the tchain */
+ if (__remove_awaiter(tchain, waiter))
+ reset_tchain_interrupt(tchain);
+ spin_unlock_irqsave(&tchain->lock);
+ cv_unlock(&waiter->rkm_cv);
+ return TRUE;
+ }
+ spin_unlock_irqsave(&tchain->lock);
+ if (!was_pending) {
+ /* wasn't on the chain and wasn't pending: it executed and did not
+ * get rearmed */
+ cv_unlock(&waiter->rkm_cv);
+ return FALSE;
+ }
+ /* Wait til it executes and then try again. */
+ cv_wait(&waiter->rkm_cv);
+ }
+}
+
/* Removes waiter from the tchain before it goes off. Returns TRUE if we
- * disarmed before the alarm went off, FALSE if it already fired. */
+ * disarmed before the alarm went off, FALSE if it already fired. May block for
+ * non-IRQ / RKM alarms, since the handler may be running asynchronously. */
bool unset_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter)
{
- assert(!waiter->holds_tchain_lock); /* Don't call from within a handler */
- spin_lock_irqsave(&tchain->lock);
- bool ret = waiter->on_tchain;
- if (ret && __remove_awaiter(tchain, waiter))
- reset_tchain_interrupt(tchain);
-
- /* if alarm had already gone off then its not on this tchain's list, though
- * the concurrent change to on_tchain (specifically, the setting of it to
- * FALSE), happens under the tchain's lock. */
- spin_unlock_irqsave(&tchain->lock);
- return ret;
-}
-
-/* waiter may be on the tchain, or it might have fired already and be off the
- * tchain. Either way, this will put the waiter on the list, set to go off at
- * abs_time. If you know the alarm has fired, don't call this. Just set the
- * awaiter, and then set_alarm() */
-static bool __reset_alarm_abs(struct timer_chain *tchain,
- struct alarm_waiter *waiter, uint64_t abs_time)
-{
- /* The tchain's lock is held */
- bool ret = waiter->on_tchain;
- /* We only need to remove/unset when the alarm has not fired yet (is still
- * on the tchain). If it has fired, it's like a fresh insert. We must also
- * check if we need to reset the interrupt. */
- bool reset_int = ret && __remove_awaiter(tchain, waiter);
- set_awaiter_abs(waiter, abs_time);
- /* regardless, we need to be reinserted */
- if (__insert_awaiter(tchain, waiter) || reset_int)
- reset_tchain_interrupt(tchain);
- return ret;
-}
-
-static bool __reset_alarm_rel(struct timer_chain *tchain,
- struct alarm_waiter *waiter, uint64_t usleep)
-{
- uint64_t now, then;
- now = read_tsc();
- then = now + usec2tsc(usleep);
- assert(now <= then);
- return __reset_alarm_abs(tchain, waiter, then);
+ if (waiter->irq_ok)
+ return __unset_alarm_irq(tchain, waiter);
+ else
+ return __unset_alarm_rkm(tchain, waiter);
}
bool reset_alarm_abs(struct timer_chain *tchain, struct alarm_waiter *waiter,
uint64_t abs_time)
{
bool ret;
- if (waiter->holds_tchain_lock) {
- ret = __reset_alarm_abs(tchain, waiter, abs_time);
- } else {
- spin_lock_irqsave(&tchain->lock);
- ret = __reset_alarm_abs(tchain, waiter, abs_time);
- spin_unlock_irqsave(&tchain->lock);
- }
+
+ ret = unset_alarm(tchain, waiter);
+ set_awaiter_abs(waiter, abs_time);
+ set_alarm(tchain, waiter);
return ret;
}
@@ -329,13 +385,10 @@
uint64_t usleep)
{
bool ret;
- if (waiter->holds_tchain_lock) {
- ret =__reset_alarm_rel(tchain, waiter, usleep);
- } else {
- spin_lock_irqsave(&tchain->lock);
- ret =__reset_alarm_rel(tchain, waiter, usleep);
- spin_unlock_irqsave(&tchain->lock);
- }
+
+ ret = unset_alarm(tchain, waiter);
+ set_awaiter_rel(waiter, usleep);
+ set_alarm(tchain, waiter);
return ret;
}
diff --git a/kern/src/rendez.c b/kern/src/rendez.c
index 80c29fa..576eaf6 100644
--- a/kern/src/rendez.c
+++ b/kern/src/rendez.c
@@ -80,11 +80,10 @@
}
cv_unlock_irqsave(&rv->cv, &irq_state);
/* The handler will call rendez_wake, but won't mess with the condition
- * state. It's enough to break us out of cv_wait() to see .on_tchain. But
- * for this to work, we need to be an IRQ alarm handler. If an RKM fired,
- * then we'd be off the tchain, but it didn't actually run yet. Also, since
- * all we're doing is poking a rendez, we might as well just do it from IRQ
- * ctx instead of mucking with an extra RKM. */
+ * state. It's enough to break us out of cv_wait() to see .on_tchain.
+ * Since all we're doing is poking a rendez, we might as well just do it
+ * from IRQ ctx instead of mucking with an extra RKM. It also avoids issues
+ * with unset_alarm blocking. */
init_awaiter_irq(&awaiter, rendez_alarm_handler);
awaiter.data = rv;
set_awaiter_rel(&awaiter, usec);