Add a benchmark for TLB flushes and shootdowns

To use, make sure you set the C-state to 0, so processors don't sleep deeply:

$ echo 0 > \#arch/c-state

Then issue the various commands.  To run all 7 tests, between core 0 (where
the shell runs) and core 7:

$ for i in `seq 1 7`; do echo tlb 7 $i > \#regress/monctl; done

To run the POKE test, you'll need to manually change the ASM handler.

One note on concurrency: this assumes that send_ipi() and
send_kernel_message() provide the appropriate memory barriers.  Specifically,
that the main thread's write to __tlb_bench_x appears before the message is
sent (or at least that the handler's clobber happens after).  Safe to say
that's a guarantee of those functions, at least conceptually and in their x86
implementations (wrmsr and LOCK xchg).

Also, using both READ_ONCE() and cpu_relax() is overkill.  The newer Linux
style would be to use READ_ONCE(), I think.

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/drivers/dev/regress.c b/kern/drivers/dev/regress.c
index 1565d57..c433000 100644
--- a/kern/drivers/dev/regress.c
+++ b/kern/drivers/dev/regress.c
@@ -138,6 +138,129 @@
 	return n;
 }
 
+int __tlb_bench_x;
+
+static void __tlb_s(void)
+{
+	tlbflush();
+	cmb();	/* tlbflush is asm volatile, but it can still be reordered. */
+	WRITE_ONCE(__tlb_bench_x, 1);
+}
+
+static void __tlb_s_ipi(struct hw_trapframe *hw_tf, void *data)
+{
+	__tlb_s();
+}
+
+static void __tlb_s_kmsg(uint32_t srcid, long a0, long a1, long a2)
+{
+	__tlb_s();
+}
+
+/* Times ITERS runs of test 'mode' (1-7) against target_core, then prints them.
+ * This runs from the calling core, typically core 0 if you use the shell.  If
+ * you run from another core, note that deregister_irq() will synchronize_rcu,
+ * which moves this thread to core 0 at the end of the function. */
+static void __tlb_shootdown_bench(int target_core, int mode)
+{
+	ERRSTACK(1);
+	uint64_t s, *d;
+	const char *str = NULL;
+	struct irq_handler *irqh;
+	int tbdf = MKBUS(BusIPI, 0, 0, 0);
+	#define ITERS 10
+
+	if (target_core == core_id())
+		error(EINVAL, "TLB bench: Aborting, we are core %d",
+		      target_core);
+	if (target_core < 0 || target_core >= num_cores)
+		error(EINVAL,
+		      "TLB bench: Aborting, target_core %d out of range",
+		      target_core);
+	irqh = register_irq(I_TESTING, __tlb_s_ipi, NULL, tbdf);
+	if (!irqh)
+		error(EFAIL,
+		      "TLB bench: Oh crap, we couldn't register the IRQ!");
+	d = kmalloc(sizeof(uint64_t) * ITERS, MEM_WAIT);
+	if (waserror()) {
+		deregister_irq(irqh->apic_vector, tbdf);
+		kfree(d);
+		nexterror();
+	}
+	for (int i = 0; i < ITERS; i++) {
+		__tlb_bench_x = 0;
+		s = start_timing();
+		switch (mode) {
+		case 1:
+			str = "NOOP";
+			__tlb_bench_x = 1;
+			break;
+		case 2:
+			tlbflush();
+			str = "LOCAL";
+			__tlb_bench_x = 1;
+			break;
+		case 3:
+			/* To run this test, you need to have hacked this into
+			 * POKE_HANDLER.  If not, you'll wedge the machine.
+				mov %cr3,%rax;\
+				mov %rax,%cr3;\
+				incl __tlb_bench_x;\
+			 * And comment out the error(). */
+			error(EFAIL, "TLB bench: hack the POKE_HANDLER");
+
+			send_ipi(target_core, I_POKE_CORE);
+			str = "POKE";
+			while (!READ_ONCE(__tlb_bench_x))
+				cpu_relax();
+			break;
+		case 4:
+			send_ipi(target_core, I_TESTING);
+			str = "IPI";
+			while (!READ_ONCE(__tlb_bench_x))
+				cpu_relax();
+			break;
+		case 5:
+			send_kernel_message(target_core, __tlb_s_kmsg, 0, 0, 0,
+					    KMSG_IMMEDIATE);
+			str = "KMSG";
+			while (!READ_ONCE(__tlb_bench_x))
+				cpu_relax();
+			break;
+		case 6:
+			send_kernel_message(target_core, __tlb_s_kmsg, 0, 0, 0,
+					    KMSG_IMMEDIATE);
+			str = "NOACK-KMSG";
+			break;
+		case 7:
+			send_ipi(target_core, I_TESTING);
+			str = "NOACK-IPI";
+			break;
+		default:
+			error(EINVAL, "TLB bench: bad mode %d", mode);
+		}
+		d[i] = stop_timing(s);
+		/* The NOACKs still need to wait, so we don't race with the
+		 * remote core and our *next* loop. */
+		while (!READ_ONCE(__tlb_bench_x))
+			cpu_relax();
+		/* The remote core has signalled it did the TLB flush, but it
+		 * takes a little while for it to halt or otherwise get back to
+		 * idle.  Wait a little to get a more stable measurement.
+		 * Without this delay (or something similar), I've seen extra
+		 * delays of close to 400ns.  Note that in real usage, the
+		 * remote core won't always be ready to handle the IRQ, so this
+		 * test is best case. */
+		udelay(1000);
+	}
+	for (int i = 0; i < ITERS; i++)
+		printk("%02d: TLB %s shootdown: %llu ns\n", i, str,
+		       tsc2nsec(d[i]));
+	deregister_irq(irqh->apic_vector, tbdf);
+	kfree(d);
+	poperror();
+}
+
 static size_t regresswrite(struct chan *c, void *a, size_t n, off64_t unused)
 {
 	ERRSTACK(1);
@@ -157,6 +280,12 @@
 			      ctlcommands);
 		if (!strcmp(cb->f[0], "ktest")) {
 			run_registered_ktest_suites();
+		} else if (!strcmp(cb->f[0], "tlb")) {
+			if (cb->nf < 3)
+				error(EFAIL,
+				      "TLB bench: need core and mode (ints)");
+			__tlb_shootdown_bench(strtol(cb->f[1], NULL, 10),
+					      strtol(cb->f[2], NULL, 10));
 		} else {
 			error(EFAIL, "regresswrite: only commands are %s",
 			      ctlcommands);