Fix issues with unset_alarm() [1/2] Previously, unset_alarm() would return before an RKM alarm would fire. It was up to users (e.g. devalarm) to deal with it. That's a real pain. Now, unset_alarm() will block until we're sure the alarm is done. This gets tricky for alarms that rearm themselves. As a side-effect of this, unset_alarm() now has a waits-on dependency on the handler for RKM alarms. (All of this is much simpler with IRQ alarms). In the process, I cleaned up the reset_* family, which were just unset/set helpers. Odds are, there are more problems with this that will pop up later. Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/include/alarm.h b/kern/include/alarm.h index b0bb52c..1a7e387 100644 --- a/kern/include/alarm.h +++ b/kern/include/alarm.h
@@ -25,15 +25,15 @@ * irqsave locks. * * Another important difference between IRQ and RKM alarms comes when cancelling - * or unsetting an alarm. When you cancel (unset or reset) an alarm, the alarm - * is yanked off the tchain. If the waiter was on the chain, then it will not - * fire for both IRQ and RKM alarms. If the waiter was not on the chain, then - * for IRQ alarms, this means that the alarm has already fired. However, for - * RKM alarms, the alarm may have already fired or it may still be waiting to - * fire (sitting in an RKM queue). It will fire at some point, but perhaps it - * has not fired yet. It is also possibly (though extremely unlikely) that if - * you reset an RKM alarm that the new alarm actually happens before the old one - * (if the new RKM was sent to another core). + * or unsetting an alarm. When you cancel (unset or reset) an alarm, you may + * need to block until the RKM has run. IRQ alarms run with the tchain lock + * held, so once the canceller grabs the lock, it has either run already or will + * not at all. With RKMs, the handler runs outside of the lock. Thus you may + * have to wait until the RKM has run, and the RKM might be waiting to run on + * your core. + * + * Note that RKM unset_alarm() has a waits-on dependency with the actual alarm + * handler, so be careful of deadlock. * * To use an IRQ alarm, init the waiter with init_awaiter_irq(). * @@ -78,6 +78,8 @@ bool on_tchain; bool irq_ok; bool holds_tchain_lock; + bool rkm_pending; + struct cond_var rkm_cv; }; TAILQ_HEAD(awaiters_tailq, alarm_waiter); /* ideally not a LL */ @@ -107,9 +109,14 @@ void set_awaiter_abs(struct alarm_waiter *waiter, uint64_t abs_time); void set_awaiter_rel(struct alarm_waiter *waiter, uint64_t usleep); void set_awaiter_inc(struct alarm_waiter *waiter, uint64_t usleep); -/* Arms/disarms the alarm. */ +/* Arms/disarms the alarm. Can be called from within a handler.*/ void set_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter); +/* Unset and reset may block if the alarm is not IRQ. Do not call from within a + * handler. Returns TRUE if you stopped the alarm from firing. */ bool unset_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter); +/* Convenience wrappers for unset, then set. Slower, but easier than just + * setting, since you don't need to know if it fired. Returns TRUE if the alarm + * did not fire before your reset. */ bool reset_alarm_abs(struct timer_chain *tchain, struct alarm_waiter *waiter, uint64_t abs_time); bool reset_alarm_rel(struct timer_chain *tchain, struct alarm_waiter *waiter,
diff --git a/kern/src/alarm.c b/kern/src/alarm.c index df1efba..6ca48ac 100644 --- a/kern/src/alarm.c +++ b/kern/src/alarm.c
@@ -59,6 +59,7 @@ assert(func); waiter->func = func; __init_awaiter(waiter); + cv_init(&waiter->rkm_cv); } void init_awaiter_irq(struct alarm_waiter *waiter, @@ -121,7 +122,16 @@ static void __run_awaiter(uint32_t srcid, long a0, long a1, long a2) { struct alarm_waiter *waiter = (struct alarm_waiter*)a0; + waiter->func(waiter); + cv_lock(&waiter->rkm_cv); + /* This completes the alarm's function. We don't need to sync with + * wake_waiter, we happen after. We do need to sync with unset_alarm. */ + waiter->rkm_pending = FALSE; + /* broadcast, instead of signal. This allows us to have multiple unsetters + * concurrently. (only one of which will succeed, so YMMV.) */ + __cv_broadcast(&waiter->rkm_cv); + cv_unlock(&waiter->rkm_cv); } static void wake_awaiter(struct alarm_waiter *waiter, @@ -132,6 +142,9 @@ waiter->func_irq(waiter, hw_tf); waiter->holds_tchain_lock = FALSE; } else { + /* The alarm is in limbo and is uncancellable from now (IRQ ctx, tchain + * lock held) until it finishes. */ + waiter->rkm_pending = TRUE; send_kernel_message(core_id(), __run_awaiter, (long)waiter, 0, 0, KMSG_ROUTINE); } @@ -223,14 +236,15 @@ static void __set_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter) { + assert(!waiter->on_tchain); if (__insert_awaiter(tchain, waiter)) reset_tchain_interrupt(tchain); } -/* Sets the alarm. If it is a kthread-style alarm (func == 0), sleep on it - * later. */ -void set_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter) +static void __set_alarm_irq(struct timer_chain *tchain, + struct alarm_waiter *waiter) { + /* holds_tchain_lock is set when we're called from an alarm handler */ if (waiter->holds_tchain_lock) { __set_alarm(tchain, waiter); } else { @@ -240,6 +254,24 @@ } } +static void __set_alarm_rkm(struct timer_chain *tchain, + struct alarm_waiter *waiter) +{ + spin_lock_irqsave(&tchain->lock); + __set_alarm(tchain, waiter); + spin_unlock_irqsave(&tchain->lock); +} + +/* Sets the alarm. If it is a kthread-style alarm (func == 0), sleep on it + * later. */ +void set_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter) +{ + if (waiter->irq_ok) + return __set_alarm_irq(tchain, waiter); + else + return __set_alarm_rkm(tchain, waiter); +} + /* Helper, rips the waiter from the tchain, knowing that it is on the list. * Returns TRUE if the tchain interrupt needs to be reset. Callers hold the * lock. */ @@ -264,64 +296,88 @@ return reset_int; } +static bool __unset_alarm_irq(struct timer_chain *tchain, + struct alarm_waiter *waiter) +{ + bool was_on_chain = FALSE; + + /* We need to lock the tchain before looking at on_tchain. At one point, I + * thought we could do the check-signal-check again style (lockless peek). + * The reason we can't is that on_tchain isn't just set FALSE. A handler + * could reset the alarm and set it TRUE while we're looking. We could + * briefly peek when it is off the chain but about to run its handler. + * + * I was tempted to assert(!waiter->holds_tchain_lock), to catch people who + * try to unset from a handler. That won't work, since you can validly + * unset while the alarm is going off. In that case, you might see + * holds_tchain_lock set briefly. */ + spin_lock_irqsave(&tchain->lock); + if (waiter->on_tchain) { + was_on_chain = TRUE; + if (__remove_awaiter(tchain, waiter)) + reset_tchain_interrupt(tchain); + } + spin_unlock_irqsave(&tchain->lock); + /* IRQ alarms run under the tchain lock. If we ripped it off the chain, it + * won't fire again. Alarms that rearm may have fired multiple times before + * we locked, but once we locked, it was done. */ + return was_on_chain; +} + +static bool __unset_alarm_rkm(struct timer_chain *tchain, + struct alarm_waiter *waiter) +{ + bool was_on_chain, was_pending; + + cv_lock(&waiter->rkm_cv); + while (1) { + spin_lock_irqsave(&tchain->lock); + was_on_chain = waiter->on_tchain; + /* I think we can safely check pending outside the tchain lock, but it's + * not worth the hassle and this is probably safer. Basically, + * rkm_pending will be set only if on_tchain is FALSE, and it won't get + * cleared until someone grabs the cv_lock (which we hold). */ + was_pending = waiter->rkm_pending; + if (was_on_chain) { + /* The only way we ever stop repeating alarms permanently (i.e. they + * rearm) is if we yank it off the tchain */ + if (__remove_awaiter(tchain, waiter)) + reset_tchain_interrupt(tchain); + spin_unlock_irqsave(&tchain->lock); + cv_unlock(&waiter->rkm_cv); + return TRUE; + } + spin_unlock_irqsave(&tchain->lock); + if (!was_pending) { + /* wasn't on the chain and wasn't pending: it executed and did not + * get rearmed */ + cv_unlock(&waiter->rkm_cv); + return FALSE; + } + /* Wait til it executes and then try again. */ + cv_wait(&waiter->rkm_cv); + } +} + /* Removes waiter from the tchain before it goes off. Returns TRUE if we - * disarmed before the alarm went off, FALSE if it already fired. */ + * disarmed before the alarm went off, FALSE if it already fired. May block for + * non-IRQ / RKM alarms, since the handler may be running asynchronously. */ bool unset_alarm(struct timer_chain *tchain, struct alarm_waiter *waiter) { - assert(!waiter->holds_tchain_lock); /* Don't call from within a handler */ - spin_lock_irqsave(&tchain->lock); - bool ret = waiter->on_tchain; - if (ret && __remove_awaiter(tchain, waiter)) - reset_tchain_interrupt(tchain); - - /* if alarm had already gone off then its not on this tchain's list, though - * the concurrent change to on_tchain (specifically, the setting of it to - * FALSE), happens under the tchain's lock. */ - spin_unlock_irqsave(&tchain->lock); - return ret; -} - -/* waiter may be on the tchain, or it might have fired already and be off the - * tchain. Either way, this will put the waiter on the list, set to go off at - * abs_time. If you know the alarm has fired, don't call this. Just set the - * awaiter, and then set_alarm() */ -static bool __reset_alarm_abs(struct timer_chain *tchain, - struct alarm_waiter *waiter, uint64_t abs_time) -{ - /* The tchain's lock is held */ - bool ret = waiter->on_tchain; - /* We only need to remove/unset when the alarm has not fired yet (is still - * on the tchain). If it has fired, it's like a fresh insert. We must also - * check if we need to reset the interrupt. */ - bool reset_int = ret && __remove_awaiter(tchain, waiter); - set_awaiter_abs(waiter, abs_time); - /* regardless, we need to be reinserted */ - if (__insert_awaiter(tchain, waiter) || reset_int) - reset_tchain_interrupt(tchain); - return ret; -} - -static bool __reset_alarm_rel(struct timer_chain *tchain, - struct alarm_waiter *waiter, uint64_t usleep) -{ - uint64_t now, then; - now = read_tsc(); - then = now + usec2tsc(usleep); - assert(now <= then); - return __reset_alarm_abs(tchain, waiter, then); + if (waiter->irq_ok) + return __unset_alarm_irq(tchain, waiter); + else + return __unset_alarm_rkm(tchain, waiter); } bool reset_alarm_abs(struct timer_chain *tchain, struct alarm_waiter *waiter, uint64_t abs_time) { bool ret; - if (waiter->holds_tchain_lock) { - ret = __reset_alarm_abs(tchain, waiter, abs_time); - } else { - spin_lock_irqsave(&tchain->lock); - ret = __reset_alarm_abs(tchain, waiter, abs_time); - spin_unlock_irqsave(&tchain->lock); - } + + ret = unset_alarm(tchain, waiter); + set_awaiter_abs(waiter, abs_time); + set_alarm(tchain, waiter); return ret; } @@ -329,13 +385,10 @@ uint64_t usleep) { bool ret; - if (waiter->holds_tchain_lock) { - ret =__reset_alarm_rel(tchain, waiter, usleep); - } else { - spin_lock_irqsave(&tchain->lock); - ret =__reset_alarm_rel(tchain, waiter, usleep); - spin_unlock_irqsave(&tchain->lock); - } + + ret = unset_alarm(tchain, waiter); + set_awaiter_rel(waiter, usleep); + set_alarm(tchain, waiter); return ret; }
diff --git a/kern/src/rendez.c b/kern/src/rendez.c index 80c29fa..576eaf6 100644 --- a/kern/src/rendez.c +++ b/kern/src/rendez.c
@@ -80,11 +80,10 @@ } cv_unlock_irqsave(&rv->cv, &irq_state); /* The handler will call rendez_wake, but won't mess with the condition - * state. It's enough to break us out of cv_wait() to see .on_tchain. But - * for this to work, we need to be an IRQ alarm handler. If an RKM fired, - * then we'd be off the tchain, but it didn't actually run yet. Also, since - * all we're doing is poking a rendez, we might as well just do it from IRQ - * ctx instead of mucking with an extra RKM. */ + * state. It's enough to break us out of cv_wait() to see .on_tchain. + * Since all we're doing is poking a rendez, we might as well just do it + * from IRQ ctx instead of mucking with an extra RKM. It also avoids issues + * with unset_alarm blocking. */ init_awaiter_irq(&awaiter, rendez_alarm_handler); awaiter.data = rv; set_awaiter_rel(&awaiter, usec);