Add a simple watchdog
This watchdog will reboot the machine if core 0 is sufficiently wedged.
Typically this happens when core 0 is stuck in IRQ context, such that
you cannot interact with the machine remotely.  At that point, unless
you have decent remote reset capabilities (which I don't), there's no
way to get the machine back.
The watchdog uses the HPET timer and an NMI, so it can still fire when
IRQs are disabled.
A reboot-worthy delay is defined as the watchdog ktask not running for
at least the time you requested.  This means that if you are stuck in
another kthread (including a non-blocking syscall) for long enough,
such as spinning on a non-irqsave spinlock or doing a long computation,
you could trigger the watchdog.  Recall that our kthread scheduler is
non-preemptive.
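For example, with a 120 second timeout, the watchdog ktask arms the
HPET comparator for 240 seconds (2 * timeout) out, sleeps for 120
seconds, and re-arms.  If the kick is late enough for the comparator to
expire, meaning core 0 was wedged for somewhere between 120 and 240
seconds, the NMI fires and we reboot.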
To set a watchdog that will wait at least 120 seconds before rebooting:
echo on 120 > \#watchdog/ctl
To turn it off, if you want:
echo off > \#watchdog/ctl
To see the status (on or off):
cat \#watchdog/ctl
If you try to set it for longer than the timer/driver can handle, we'll
throttle it down to an appropriate time and print a warning.
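(The cap comes from the HPET timer's reach: __init_timer_once() in the
driver throttles the request to half the 32 bit reach.)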
If you're working on a remote machine, I recommend putting something in
your init.sh.
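For example, a line like the following (the 300 second value is only an
illustration; pick whatever suits your setup):
echo on 300 > \#watchdog/ctl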
Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/arch/x86/trap.c b/kern/arch/x86/trap.c
index a365d70..39f4ae8 100644
--- a/kern/arch/x86/trap.c
+++ b/kern/arch/x86/trap.c
@@ -347,6 +347,11 @@
static void do_nmi_work(struct hw_trapframe *hw_tf)
{
assert(!irq_is_enabled());
+
+ extern void __watchdog_nmi_handler(struct hw_trapframe *hw_tf);
+
+ __watchdog_nmi_handler(hw_tf);
+
/* It's mostly harmless to snapshot the TF, and we can send a spurious
* PCINT interrupt. perfmon.c just uses the interrupt to tell it to
* check its counters for overflow. Note that the PCINT interrupt is
diff --git a/kern/drivers/dev/Kbuild b/kern/drivers/dev/Kbuild
index 1b129df..77e8b78 100644
--- a/kern/drivers/dev/Kbuild
+++ b/kern/drivers/dev/Kbuild
@@ -24,3 +24,4 @@
obj-y += tmpfs.o
obj-y += version.o
obj-$(CONFIG_DEVVARS) += vars.o
+obj-y += watchdog.o
diff --git a/kern/drivers/dev/watchdog.c b/kern/drivers/dev/watchdog.c
new file mode 100644
index 0000000..42b98d1
--- /dev/null
+++ b/kern/drivers/dev/watchdog.c
@@ -0,0 +1,310 @@
+/* Copyright (c) 2020 Google Inc
+ * Barret Rhoden <brho@cs.berkeley.edu>
+ * See LICENSE for details.
+ *
+ * #watchdog
+ */
+
+#include <ns.h>
+#include <kmalloc.h>
+#include <string.h>
+#include <assert.h>
+#include <error.h>
+
+#include <stdio.h>
+#include <arch/console.h>
+
+/* The usage of the HPET is so hokey that I don't want it in a header in
+ * include/ */
+#include "../timers/hpet.h"
+
+/* Primitive ktask control. Probably want better general support for this
+ * stuff, maybe including rendez or something to kick us out of a sleep.
+ * kthread_usleep() has a built-in rendez already, so it's almost there. */
+struct wd_ctl {
+ bool should_exit;
+};
+
+/* lock-protected invariants
+ * ----------
+ * creation and manipulation of the hpet_timer ht
+ *
+ * if enabled is set:
+ * - cur_wd is set (the ktask responsible for updating the hpet)
+ * - timeout is set once and unchanged
+ * - there may be an old ktask with its own ctl, but it is set to
+ * should_exit.
+ * - ht was already created and initialized
+ * if disabled:
+ * - global cur_wd is NULL
+ * - timeout is zero
+ * - any previously running ktask's should_exit is true
+ *
+ * on the edges:
+ * ----------
+ * disabled->enabled: ktask is kicked, it'll turn on the timer
+ * enabled->disabled: ktask is told to die, we turn off the timer
+ */
+static spinlock_t lock = SPINLOCK_INITIALIZER;
+static bool enabled;
+static struct wd_ctl *cur_wd;
+static uint64_t timeout;
+static struct hpet_timer *ht;
+
+struct dev watchdog_devtab;
+
+static char *devname(void)
+{
+ return watchdog_devtab.name;
+}
+
+enum {
+ Qdir,
+ Qctl,
+};
+
+static struct dirtab wd_dir[] = {
+ {".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555},
+ {"ctl", {Qctl, 0, QTFILE}, 0, 0666},
+};
+
+static struct chan *wd_attach(char *spec)
+{
+ return devattach(devname(), spec);
+}
+
+static struct walkqid *wd_walk(struct chan *c, struct chan *nc, char **name,
+ unsigned int nname)
+{
+ return devwalk(c, nc, name, nname, wd_dir, ARRAY_SIZE(wd_dir),
+ devgen);
+}
+
+static size_t wd_stat(struct chan *c, uint8_t *db, size_t n)
+{
+ return devstat(c, db, n, wd_dir, ARRAY_SIZE(wd_dir), devgen);
+}
+
+static struct chan *wd_open(struct chan *c, int omode)
+{
+ return devopen(c, omode, wd_dir, ARRAY_SIZE(wd_dir), devgen);
+}
+
+static void wd_close(struct chan *c)
+{
+ if (!(c->flag & COPEN))
+ return;
+}
+
+static size_t wd_read(struct chan *c, void *ubuf, size_t n, off64_t offset)
+{
+ switch (c->qid.path) {
+ case Qdir:
+ return devdirread(c, ubuf, n, wd_dir, ARRAY_SIZE(wd_dir),
+ devgen);
+ case Qctl:
+ if (READ_ONCE(enabled))
+ return readstr(offset, ubuf, n, "on");
+ else
+ return readstr(offset, ubuf, n, "off");
+ default:
+ panic("Bad Qid %p!", c->qid.path);
+ }
+ return -1;
+}
+
+/* do_nmi_work() calls this directly. We don't have IRQ handlers for NMIs, and
+ * this will get called on *every* NMI, since we're basically muxing in SW. */
+void __watchdog_nmi_handler(struct hw_trapframe *hw_tf)
+{
+ /* It's not enough to check 'enabled', since we get the spurious IRQ at
+ * some point after we call hpet_timer_enable(). We could attempt to
+ * deal with this by enabling the timer, waiting a bit in case the IRQ
+ * fires (which it might not, so we don't know how long to wait), and
+ * *then* setting enabled. With barriers. Fun. */
+ if (!READ_ONCE(enabled))
+ return;
+ if (hpet_check_spurious_64(ht))
+ return;
+
+ /* This is real hokey, and could easily trigger another deadlock. */
+ panic_skip_console_lock = true;
+ panic_skip_print_lock = true;
+ print_trapframe(hw_tf);
+ backtrace_hwtf(hw_tf);
+
+ printk("Watchdog forcing a reboot in 10 sec!\n");
+ udelay(10000000);
+
+ reboot();
+}
+
+/* Attempts to set up a timer. Returns 0 on failure. Otherwise, returns the
+ * actual timeout to use, which may be throttled by the timer's reach. */
+static uint64_t __init_timer_once(uint64_t sec_timeout)
+{
+ uint64_t max;
+
+ if (!ht) {
+ ht = hpet_get_magic_timer();
+ if (!ht)
+ return 0;
+ /* NMI mode. Vector is ignored, but passing 2 for clarity. If
+ * you try a regular vector/IRQ, you'll need to hook up an
+ * irq_handler. (EOIs, handlers, etc). */
+ hpet_magic_timer_setup(ht, 2, 0x4);
+ }
+ /* We use a 64 bit counter, so the reach32 is a little excessive.
+ * However, we need some limit to avoid wraparound. Might as well use
+ * the 32 bit one, in case we ever sort out the HPET spurious crap. */
+ max = ht->hpb->reach32 / 2;
+ if (max < sec_timeout) {
+ trace_printk("Watchdog request for %lu, throttled to %lu\n",
+ sec_timeout, max);
+ return max;
+ }
+ return sec_timeout;
+}
+
+static void __shutoff_timer(void)
+{
+ hpet_timer_disable(ht);
+}
+
+static void __increment_timer(uint64_t two_x_timeout)
+{
+ hpet_timer_increment_comparator(ht, two_x_timeout * 1000000000);
+ hpet_timer_enable(ht);
+}
+
+/* Our job is to kick the watchdog by periodically adjusting the interrupt
+ * deadline in the timer into the future. When we execute, we set it for
+ * 2 * timeout more time, based on whatever it is at - not based on our runtime.
+ * We'll sleep for timeout. If we get delayed by another timeout and fail to
+ * reset it, the IRQ will fire and we'll reboot. Technically we could be held
+ * up for 2 * timeout before kicking, but we were held up for at least one
+ * timeout.
+ *
+ * It's mostly OK to have multiple of these ktasks running - that can happen if
+ * you do multiple off-ons quickly. (i.e. start a new one before the old one
+ * had a chance to shut down). Each thread has its own control structure, so
+ * that's fine. They will stop (if instructed) before doing anything. These
+ * threads will sit around though, until their timeout. We don't have any easy
+ * support for kicking a ktask to make it wake up faster. */
+static void wd_ktask(void *arg)
+{
+ struct wd_ctl *ctl = arg;
+ uint64_t sleep_usec;
+
+ while (1) {
+ spin_lock(&lock);
+ if (ctl->should_exit) {
+ spin_unlock(&lock);
+ break;
+ }
+ if (!timeout) {
+ /* We should have been told to exit already. */
+ warn("WD saw timeout == 0!");
+ spin_unlock(&lock);
+ break;
+ }
+ __increment_timer(timeout * 2);
+ sleep_usec = timeout * 1000000;
+ spin_unlock(&lock);
+ kthread_usleep(sleep_usec);
+ }
+ kfree(ctl);
+}
+
+#define WD_CTL_USAGE "on SEC_TIMEOUT | off"
+
+static void wd_ctl_cmd(struct chan *c, struct cmdbuf *cb)
+{
+ struct wd_ctl *ctl;
+ unsigned long sec_timeout;
+
+ if (cb->nf < 1)
+ error(EFAIL, WD_CTL_USAGE);
+
+ if (!strcmp(cb->f[0], "on")) {
+ if (cb->nf < 2)
+ error(EFAIL, WD_CTL_USAGE);
+ sec_timeout = strtoul(cb->f[1], 0, 0);
+ if (!sec_timeout)
+ error(EFAIL, "need a non-zero timeout");
+ ctl = kzmalloc(sizeof(struct wd_ctl), MEM_WAIT);
+ spin_lock(&lock);
+ if (enabled) {
+ spin_unlock(&lock);
+ kfree(ctl);
+ error(EFAIL, "watchdog already running; stop it first");
+ }
+ sec_timeout = __init_timer_once(sec_timeout);
+ if (!sec_timeout) {
+ spin_unlock(&lock);
+ kfree(ctl);
+ error(EFAIL, "unable to get an appropriate timer");
+ }
+ timeout = sec_timeout;
+ WRITE_ONCE(enabled, true);
+ cur_wd = ctl;
+ ktask("watchdog", wd_ktask, cur_wd);
+ spin_unlock(&lock);
+ } else if (!strcmp(cb->f[0], "off")) {
+ spin_lock(&lock);
+ if (!enabled) {
+ spin_unlock(&lock);
+ error(EFAIL, "watchdog was not on");
+ }
+ WRITE_ONCE(enabled, false);
+ timeout = 0;
+ cur_wd->should_exit = true;
+ cur_wd = NULL;
+ __shutoff_timer();
+ spin_unlock(&lock);
+ } else {
+ error(EFAIL, WD_CTL_USAGE);
+ }
+}
+
+static size_t wd_write(struct chan *c, void *ubuf, size_t n, off64_t unused)
+{
+ ERRSTACK(1);
+ struct cmdbuf *cb = parsecmd(ubuf, n);
+
+ if (waserror()) {
+ kfree(cb);
+ nexterror();
+ }
+ switch (c->qid.path) {
+ case Qctl:
+ wd_ctl_cmd(c, cb);
+ break;
+ default:
+ error(EFAIL, "Unable to write to %s", devname());
+ }
+ kfree(cb);
+ poperror();
+ return n;
+}
+
+struct dev watchdog_devtab __devtab = {
+ .name = "watchdog",
+ .reset = devreset,
+ .init = devinit,
+ .shutdown = devshutdown,
+ .attach = wd_attach,
+ .walk = wd_walk,
+ .stat = wd_stat,
+ .open = wd_open,
+ .create = devcreate,
+ .close = wd_close,
+ .read = wd_read,
+ .bread = devbread,
+ .write = wd_write,
+ .bwrite = devbwrite,
+ .remove = devremove,
+ .wstat = devwstat,
+ .power = devpower,
+ .chaninfo = devchaninfo,
+};