Add a simple watchdog This watchdog will reboot the machine if core 0 is sufficiently wedged. Typically, this happens when Core 0 is stuck in IRQ context, such that you cannot interact with the machine remotely. Then you are unable to do anything, and unless you have decent remote reset capabilities (which I don't), you can't get the machine back. The watchdog uses the HPET timer and an NMI, ensuring responsiveness when IRQs are disabled. A reboot-worthy delay is defined as the watchdog ktask not running for a long period: at least the time you requested. This means if you are stuck in another kthread (to include a non-blocking syscall) for long enough, such as in a non-irqsave spinlock, or just a long computation, you could trigger the watchdog. Recall that our kthread scheduler is non-preemptive. To set a watchdog that will wait at least 120 seconds before rebooting: echo on 120 > \#watchdog/ctl To turn it off, if you want: echo off > \#watchdog/ctl To see the status (on or off): cat \#watchdog/ctl If you try to set it for longer than the timer / driver can handle, we'll adjust it down to an appropriate time and send you a warning. If you're working on a remote machine, I recommend putting something in your init.sh. Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/arch/x86/trap.c b/kern/arch/x86/trap.c index a365d70..39f4ae8 100644 --- a/kern/arch/x86/trap.c +++ b/kern/arch/x86/trap.c
@@ -347,6 +347,11 @@ static void do_nmi_work(struct hw_trapframe *hw_tf) { assert(!irq_is_enabled()); + + extern void __watchdog_nmi_handler(struct hw_trapframe *hw_tf); + + __watchdog_nmi_handler(hw_tf); + /* It's mostly harmless to snapshot the TF, and we can send a spurious * PCINT interrupt. perfmon.c just uses the interrupt to tell it to * check its counters for overflow. Note that the PCINT interrupt is
diff --git a/kern/drivers/dev/Kbuild b/kern/drivers/dev/Kbuild index 1b129df..77e8b78 100644 --- a/kern/drivers/dev/Kbuild +++ b/kern/drivers/dev/Kbuild
@@ -24,3 +24,4 @@ obj-y += tmpfs.o obj-y += version.o obj-$(CONFIG_DEVVARS) += vars.o +obj-y += watchdog.o
diff --git a/kern/drivers/dev/watchdog.c b/kern/drivers/dev/watchdog.c new file mode 100644 index 0000000..42b98d1 --- /dev/null +++ b/kern/drivers/dev/watchdog.c
@@ -0,0 +1,310 @@ +/* Copyright (c) 2020 Google Inc + * Barret Rhoden <brho@cs.berkeley.edu> + * See LICENSE for details. + * + * #watchdog + */ + +#include <ns.h> +#include <kmalloc.h> +#include <string.h> +#include <assert.h> +#include <error.h> + +#include <stdio.h> +#include <arch/console.h> + +/* The usage of the HPET is so hokey that I don't want it in a header in + * include/ */ +#include "../timers/hpet.h" + +/* Primitive ktask control. Probably want better general support for this + * stuff, maybe including rendez or something to kick us out of a sleep. + * kthread_usleep() has a built-in rendez already, so it's almost there. */ +struct wd_ctl { + bool should_exit; +}; + +/* lock-protected invariants + * ---------- + * creation and manipulation of the hpet_timer ht + * + * if enabled is set: + * - cur_wd is set (the ktask responsible for updating the hpet) + * - timeout is set once and unchanged + * - there may be an old ktask with their own ctl, but it is set to + * should_exit. + * - ht was already created and initialized + * if disabled: + * - global cur_wd is NULL + * - timeout is zero + * - any previously running ktask's should_exit is true + * + * on the edges: + * ---------- + * disabled->enabled: ktask is kicked, it'll turn on the timer + * enabled->disabled: ktask is told to die, we turn off the timer + */ +static spinlock_t lock = SPINLOCK_INITIALIZER; +static bool enabled; +static struct wd_ctl *cur_wd; +static uint64_t timeout; +static struct hpet_timer *ht; + +struct dev watchdog_devtab; + +static char *devname(void) +{ + return watchdog_devtab.name; +} + +enum { + Qdir, + Qctl, +}; + +static struct dirtab wd_dir[] = { + {".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555}, + {"ctl", {Qctl, 0, QTFILE}, 0, 0666}, +}; + +static struct chan *wd_attach(char *spec) +{ + return devattach(devname(), spec); +} + +static struct walkqid *wd_walk(struct chan *c, struct chan *nc, char **name, + unsigned int nname) +{ + return devwalk(c, nc, name, nname, wd_dir, ARRAY_SIZE(wd_dir), + devgen); +} + +static size_t wd_stat(struct chan *c, uint8_t *db, size_t n) +{ + return devstat(c, db, n, wd_dir, ARRAY_SIZE(wd_dir), devgen); +} + +static struct chan *wd_open(struct chan *c, int omode) +{ + return devopen(c, omode, wd_dir, ARRAY_SIZE(wd_dir), devgen); +} + +static void wd_close(struct chan *c) +{ + if (!(c->flag & COPEN)) + return; +} + +static size_t wd_read(struct chan *c, void *ubuf, size_t n, off64_t offset) +{ + switch (c->qid.path) { + case Qdir: + return devdirread(c, ubuf, n, wd_dir, ARRAY_SIZE(wd_dir), + devgen); + case Qctl: + if (READ_ONCE(enabled)) + return readstr(offset, ubuf, n, "on"); + else + return readstr(offset, ubuf, n, "off"); + default: + panic("Bad Qid %p!", c->qid.path); + } + return -1; +} + +/* do_nmi_work() call this directly. We don't have IRQ handlers for NMIs, and + * this will get called on *every* NMI, since we're basically muxing in SW. */ +void __watchdog_nmi_handler(struct hw_trapframe *hw_tf) +{ + /* It's not enough to check 'enabled', since we get the spurious IRQ at + * some point after we call hpet_timer_enable(). We could attempt to + * deal with this by enabling the timer, waiting a bit in case the IRQ + * fires (which it might not, so we don't know how long to wait), and + * *then* setting enabled. With barriers. Fun. */ + if (!READ_ONCE(enabled)) + return; + if (hpet_check_spurious_64(ht)) + return; + + /* This is real hokey, and could easily trigger another deadlock. */ + panic_skip_console_lock = true; + panic_skip_print_lock = true; + print_trapframe(hw_tf); + backtrace_hwtf(hw_tf); + + printk("Watchdog forcing a reboot in 10 sec!\n"); + udelay(10000000); + + reboot(); +} + +/* Attempts to set up a timer. Returns 0 on failure. Returns the actual + * timeout to use. i.e. if we're limited by the timer's reach. */ +static uint64_t __init_timer_once(uint64_t sec_timeout) +{ + uint64_t max; + + if (!ht) { + ht = hpet_get_magic_timer(); + if (!ht) + return 0; + /* NMI mode. Vector is ignored, but passing 2 for clarity. If + * you try a regular vector/IRQ, you'll need to hook up an + * irq_handler. (EOIs, handlers, etc). */ + hpet_magic_timer_setup(ht, 2, 0x4); + } + /* We use a 64 bit counter, so the reach32 is a little excessive. + * However, we need some limit to avoid wraparound. Might as well use + * the 32 bit one, in case we ever sort out the HPET spurious crap. */ + max = ht->hpb->reach32 / 2; + if (max < sec_timeout) { + trace_printk("Watchdog request for %d, throttled to %d\n", + sec_timeout, max); + return max; + } + return sec_timeout; +} + +static void __shutoff_timer(void) +{ + hpet_timer_disable(ht); +} + +static void __increment_timer(uint64_t two_x_timeout) +{ + hpet_timer_increment_comparator(ht, two_x_timeout * 1000000000); + hpet_timer_enable(ht); +} + +/* Our job is to kick the watchdog by periodically adjusting the interrupt + * deadline in the timer into the future. When we execute, we set it for + * 2 * timeout more time, based on whatever it is at - not based on our runtime. + * We'll sleep for timeout. If we get delayed by another timeout and fail to + * reset it, the IRQ will fire and we'll reboot. Technically we could be held + * up for 2 * timeout before kicking, but we were held up for at least one + * timeout. + * + * It's mostly OK to have multiple of these ktasks running - that can happen if + * you do multiple off-ons quickly. (i.e. start a new one before the old one + * had a chance to shut down). Each thread has its own control structure, so + * that's fine. They will stop (if instructed) before doing anything. These + * threads will sit around though, until their timeout. We don't have any easy + * support for kicking a ktask to make it wake up faster. */ +static void wd_ktask(void *arg) +{ + struct wd_ctl *ctl = arg; + uint64_t sleep_usec; + + while (1) { + spin_lock(&lock); + if (ctl->should_exit) { + spin_unlock(&lock); + break; + } + if (!timeout) { + /* We should have been told to exit already. */ + warn("WD saw timeout == 0!"); + spin_unlock(&lock); + break; + } + __increment_timer(timeout * 2); + sleep_usec = timeout * 1000000; + spin_unlock(&lock); + kthread_usleep(sleep_usec); + } + kfree(ctl); +} + +#define WD_CTL_USAGE "on SEC_TIMEOUT | off" + +static void wd_ctl_cmd(struct chan *c, struct cmdbuf *cb) +{ + struct wd_ctl *ctl; + unsigned long sec_timeout; + + if (cb->nf < 1) + error(EFAIL, WD_CTL_USAGE); + + if (!strcmp(cb->f[0], "on")) { + if (cb->nf < 2) + error(EFAIL, WD_CTL_USAGE); + sec_timeout = strtoul(cb->f[1], 0, 0); + if (!sec_timeout) + error(EFAIL, "need a non-zero timeout"); + ctl = kzmalloc(sizeof(struct wd_ctl), MEM_WAIT); + spin_lock(&lock); + if (enabled) { + spin_unlock(&lock); + kfree(ctl); + error(EFAIL, "watchdog already running; stop it first"); + } + sec_timeout = __init_timer_once(sec_timeout); + if (!sec_timeout) { + spin_unlock(&lock); + kfree(ctl); + error(EFAIL, "unable to get an appropriate timer"); + } + timeout = sec_timeout; + WRITE_ONCE(enabled, true); + cur_wd = ctl; + ktask("watchdog", wd_ktask, cur_wd); + spin_unlock(&lock); + } else if (!strcmp(cb->f[0], "off")) { + spin_lock(&lock); + if (!enabled) { + spin_unlock(&lock); + error(EFAIL, "watchdog was not on"); + } + WRITE_ONCE(enabled, false); + timeout = 0; + cur_wd->should_exit = true; + cur_wd = NULL; + __shutoff_timer(); + spin_unlock(&lock); + } else { + error(EFAIL, WD_CTL_USAGE); + } +} + +static size_t wd_write(struct chan *c, void *ubuf, size_t n, off64_t unused) +{ + ERRSTACK(1); + struct cmdbuf *cb = parsecmd(ubuf, n); + + if (waserror()) { + kfree(cb); + nexterror(); + } + switch (c->qid.path) { + case Qctl: + wd_ctl_cmd(c, cb); + break; + default: + error(EFAIL, "Unable to write to %s", devname()); + } + kfree(cb); + poperror(); + return n; +} + +struct dev watchdog_devtab __devtab = { + .name = "watchdog", + .reset = devreset, + .init = devinit, + .shutdown = devshutdown, + .attach = wd_attach, + .walk = wd_walk, + .stat = wd_stat, + .open = wd_open, + .create = devcreate, + .close = wd_close, + .read = wd_read, + .bread = devbread, + .write = wd_write, + .bwrite = devbwrite, + .remove = devremove, + .wstat = devwstat, + .power = devpower, + .chaninfo = devchaninfo, +};