| /* Copyright (c) 2013, 2014 The Regents of the University of California |
| * Barret Rhoden <brho@cs.berkeley.edu> |
| * See LICENSE for details. |
| * |
| * lock_test: microbenchmark to measure different styles of spinlocks. |
| * |
| * to build on linux: (hacky) |
| * $ gcc -O2 -std=gnu99 -fno-stack-protector -g tests/lock_test.c -lpthread \ |
| * -lm -o linux_lock_test */ |
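| |
| /* Example invocation (for the linux build above; -w/-l/-t are the argp |
| * options described below, values here just for illustration): |
| * $ ./linux_lock_test -w 31 -l 10000 -t mcs */ |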
| |
| #define _GNU_SOURCE /* pthread_yield */ |
| |
| #include <stdio.h> |
| #include <pthread.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <sys/time.h> |
| #include <math.h> |
| #include <argp.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <fcntl.h> |
| #include <assert.h> |
| #include <string.h> |
| |
| /* OS dependent #incs */ |
| #ifdef __ros__ |
| |
| #include <parlib/parlib.h> |
| #include <parlib/stdio.h> |
| #include <parlib/vcore.h> |
| #include <parlib/timing.h> |
| #include <parlib/spinlock.h> |
| #include <parlib/mcs.h> |
| #include <parlib/arch/arch.h> |
| #include <parlib/event.h> |
| |
| #include <parlib/tsc-compat.h> |
| #include <benchutil/measure.h> |
| |
| #else |
| |
| #include "../user/parlib/include/parlib/tsc-compat.h" |
| #include "misc-compat.h" |
| #include "linux-lock-hacks.h" /* TODO: have a build system and lib / C file */ |
| |
| #include "../user/benchutil/include/benchutil/measure.h" |
| #include "../user/benchutil/measure.c" |
| |
| static void os_prep_work(pthread_t *worker_threads, int nr_threads) |
| { |
| if (nr_threads > num_vcores()) |
| printf("WARNING: %d threads requested, but only %d cores available\n", |
| nr_threads, num_vcores()); |
| } |
| |
| static void os_post_work(pthread_t *worker_threads, int nr_threads) |
| { |
| if (nr_threads > num_vcores()) |
| return; |
| /* assuming we're taking cores 0..nr_threads, and we never move. */ |
| for (int i = 0; i < nr_threads; i++) { |
| cpu_set_t cpuset; |
| CPU_ZERO(&cpuset); |
| CPU_SET(i, &cpuset); |
| pthread_setaffinity_np(worker_threads[i], sizeof(cpu_set_t), |
| &cpuset); |
| } |
| } |
| |
| #define print_preempt_trace(args...) {} |
| |
| __thread int __vcore_context = 0; |
| |
| #endif |
| |
| /* TODO: There's a lot of work to do still, both on this program and on locking |
| * and vcore code. For some of the issues, I'll leave in the discussion / |
| * answers, in case it comes up in the future (like when I read this in 8 |
| * months). |
| * |
| * BUGS / COMMENTARY |
| * Occasional deadlocks when preempting and not giving back! |
| * - with the new PDRs style, though that doesn't mean the older styles |
| * don't have this problem |
| * - shouldn't be any weaker than PDR. they all check pred_vc to see |
| * if they are running, and if not, they make sure someone runs |
| * - could be weaker if we have an old value for the lockholder, |
| * someone outside the chain, and we made sure they ran, and they do |
| * nothing (spin in the 2LS or something?) |
| * no, they should have gotten a msg about us being preempted, |
| * since whoever we turn into gets the message about us swapping. |
| * - anyway, it's not clear if this is with MCSPDR, event delivery, |
| * preemption handling, or just an artifact of the test (less likely) |
| * why aren't MCS locks in uth_ctx getting dealt with? |
| * - because the event is handled, but the lock holder isn't run. the |
| * preemption was dealt with, but nothing saved the lock holder |
| * - any uthread_ctx lockholder that gets preempted will get |
| * interrupted, and other cores will handle the preemption. but that |
| * uthread won't run again without 2LS support. either all spinners |
| * need to be aware of the 'lockholder' (PDR-style), or the 2LS needs |
| * to know when a uthread becomes a 'lockholder' to make sure it runs |
| * via user-level preempts. If the latter, this needs to happen |
| * atomically with grabbing the lock, or else be able to handle lots of |
| * fake 'lockholders' (like round-robin among all of them) |
| * why is the delay more than the expected delay? |
| * because it takes ~2ms to spawn and run a process |
| * could do this in a separate process, instead of a script |
| * could also consider not using pth_test and changing prov, but |
| * driving it by yields and requests. would also test the |
| * alarm/wakeup code (process sets alarm, goes to sleep, wakes up |
| * and requests X cores) |
| * why do we get occasional preempt-storms? (lots of change_tos) |
| * due to the MCS-PDR chain, which i tried fixing by adjusting the number |
| * of workers down to the number of vcores |
| * why isn't the worker adaptation working? |
| * - it actually was working, and nr_workers == nr_vcores. that |
| * just wasn't the root cause. |
| * - was expecting it to cut down on PDR kernel traffic |
| * - still get periods of low perf |
| * like O(100) preempt msgs per big preempt/prov |
| * does it really take that much to work out an MCS-PDR? |
| * - one thing is that if we fake vc ctx, we never receive preemption |
| * events. might be a bad idea. |
| * - in general, yeah. faking VC and turning off events can really |
| * muck with things |
| * - these events aren't necessarily delivered to a VC who will |
| * check events any time soon (might be the last one in the chain) |
| * - the core of the issue is that we have the right amount of |
| * workers and vcores, but that the system isn't given a chance to |
| * stabilize itself. also, if we have some VCs that are just |
| * sitting around, spinning in the 2LS, if those get preempted, no |
| * one notices or cares (when faking vc_ctx / getting no events) |
| * - there is a slight race where we might make someone run who isn't a |
| * lockholder. logically, it's okay. worst case, it would act like an |
| * extra preempt and different startcore, which shouldn't be too bad. |
| * |
| * sanity check: does throughput match latency? (2.5GHz TSC, MCS lock) |
| * ex: 5000 locks/ms = 5 locks/us = 200ns/lock = 500 ticks / lock |
| * 500 ticks * 31 workers (queue) = 15000 ticks |
| * avg acquire time was around 14K. seems fine.. |
| * when our MCSPDR throughput tanks (during preempts), it's around |
| * 400-500 locks/ms, which is around 2us/lock. |
| * when the locker on a preempted chain shows up, it needs to |
| * change to the next one in line. |
| * - though that should be in parallel with the other |
| * lockholders letting go. shouldn't be that bad |
| * - no, it is probably at the head of the chain very soon, |
| * such that it is the bottleneck for the actual lock. 2us |
| * seems possible |
| * |
| * what does it take to get out of a preemption with (old) MCS-PDR? |
| * - these are now called pdro locks (old) |
| * - for a single preempt, it will take 1..n-1 changes. avg n/2 |
| * - for multiple preempts, it's nr_pre * that (avg np/2, worst np) |
| * - for every unlock/reacquire cycle (someone unlocks, then rejoins |
| * the list), it's nr_preempts (aka, nr_workers - nr_vcores) |
| * - if we need to have a specific worker get out of the chain, on |
| * average, it'd take n/2 cycles (p*n/2 changes) worst: np |
| * - if we want to get multiple workers out, the worst case is still |
| * np, but as p increases, we're more likely to approach n cycles |
| * - so the current model is np for the initial hit (to move the |
| * offline VCs to the end of the chain) and another np to get our |
| * specific workers out of the chain and yielding (2np) |
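| * - e.g., with n = 31 workers and a single preempt (p = 1), that model |
| * predicts a worst case of about 2np = 62 changes; the observation below |
| * (80-200 changes per preempt) already overshoots it |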
| * |
| * - but even with 1 preempt, we're getting 80-200 changes per preempt |
| * |
| * - it shouldn't matter that the sys_change_to is really slow, should |
| * be the same amount of changes. however, the preempted ones are |
| * never really at the tail end of the chain - they should end up right |
| * before the lockholder often. while the sys_change_tos are slowly |
| * moving towards the back of the chain, the locking code is quickly |
| * removing (online) nodes from the head and putting them on the back. |
| * |
| * - end result: based on lock hold time and lock delay time, a |
| * preempted VC stays in the MCS chain (swaps btw VC/nodes), and when |
| * it is inside the chain, someone is polling to make them run. with |
| * someone polling, it is extremely unlikely that someone outside the |
| * chain will win the race and be able to change_to before the in-chain |
| * poller. to clarify: |
| * - hold time and delay time matter, since the longer they are, |
| * the greater the amount of time the change_to percolation has to |
| * get the preempted VCs to the end of the chain (where no one |
| * polls them). |
| * - at least one vcore is getting the event to handle the |
| * preemption of the in-chain, offline VC. we could change it so |
| * every VC polls the preempt_evq, or just wait til whoever is |
| * getting the messages eventually checks their messages (VC0) |
| * - if there is an in-chain poller, they will notice the instant |
| * the VC map changes, and then immediately change_to (and spin on |
| * the proclock in the kernel). there's almost no chance of a |
| * normal preempt event handler doing that faster. (would require |
| * some IRQ latency or something serious). |
| * - adding in any hold time trashes our microbenchmark's perf, but a |
| * little delay time actually helps: (all with no preempts going on) |
| * - mcspdr, no delay: 4200-4400 (-w31 -l10000, no faking, etc) |
| * - mcspdr, d = 1: 4400-4800 |
| * - mcspdr, d = 2: 4200-5200 |
| * - as you add delay, it cuts down on contention for the |
| * lock->lock cacheline. but if you add in too much, you'll tank |
| * throughput (since there is no contention at all). |
| * - as we increase the delay, we cut down on the chance of the |
| * preempt storm / preempt-stuck-in-the-chain, though it can still |
| * happen, even with a delay of 10us |
| * - maybe add in the lockholder again? (removed in 73701d6bfb) |
| * - massively cuts performance, like 2x throughput, without |
| * preempts |
| * - its ability to help depends on impl: |
| * - in one version (old style), it didn't help much at all |
| * - in another (optimized lockholder setting), i can't even see |
| * the throughput hit, it recovered right away, with O(5) messages |
| * - the diff was having the lockholder assign the vcoreid before |
| * passing off to the next in the chain, so that there is less time |
| * with having "no lockholder". (there's a brief period where the |
| * lockholder says it is the next person, who still spins. they'll |
| * have to make sure their pred runs) |
| * - adj_workers doesn't matter either... |
| * - the 2LS and preemption handling might be doing this |
| * automatically, when handle_vc_preempt() does a |
| * thread_paused() on its current_uthread. |
| * - adj_workers isn't critical if we're using some locks |
| * that check notif_pending. eventually someone hears |
| * about preempted VCs (assuming we can keep up) |
| * |
| * What about delays? both hold and delay should make it easier to get |
| * the preempted vcore to the end of the chain. but do they have to be |
| * unreasonably big to work? |
| * - yes. hold doesn't really help much until everything is |
| * slower. even with a hold of around 1.2us, we still have the |
| * change_to-storms and lowered throughput. |
| * - doing a combo helps too. if you hold for 1ns (quite a bit |
| * more actually, due to the overhead of ndelay, but sufficient to |
| * be "doing work"), and delaying for around 7us before rejoining, |
| * there's only about a 1/5 chance of a single preempt messing us |
| * up |
| * - though having multiple preempts outstanding makes this less |
| * likely to work. |
| * - and it seems like once we get into the storm scenario, we |
| * rarely get out: either we get out quickly or we never do. |
| * depending on the workload, this could be a matter of luck |
| * |
| * So we could try tracking the lockholder, but only looking at it when |
| * we know someone was preempted in the chain - specifically, when our |
| * pred is offline. when that happens, we don't change to them, we |
| * make sure the lockholder is running. |
| * - tracking takes us from 4200->2800 throughput or so for MCS |
| * - 5200 -> 3700 or so for MCS in vc_ctx (__MCSPDR) |
| * - main spike seems to be in the hold time. bimodal distrib, |
| * with most below 91 (the usual is everything packed below 70) and |
| * a big spike around 320 |
| * |
| * Summary: |
| * |
| * So we need to have someone outside the chain change_to the one in the |
| * chain; otherwise, someone will always be stuck in the chain. Right now, |
| * it's always the next in line who is doing the changing, so a preempted |
| * vcore is always still in the chain. |
| * |
| * If the locking workload has some delaying, such as while holding the |
| * lock or before reacquiring, the "change_to" storm might not be a |
| * problem. If it is, the only alternative I have so far is to check the |
| * lockholder (which prevents a chain member from always ensuring their |
| * pred runs). This hurts the lock's scalability/performance when we |
| * aren't being preempted. On the other hand, based on what you're doing |
| * with the lock, one more cache miss might not be as big of a deal as it |
| * is in lock_test - especially since, if you do get stormed, your |
| * throughput could be terrible and never recover. |
| * |
| * Similar point: you can use spinpdr locks. They have the PDR-benefits, |
| * and won't induce the storm of change_tos. However, this isn't much |
| * better for contended locks. They perform 2-3x worse (on c89) without |
| * preemption. Arguably, if you were worried about the preempt storms and |
| * want scalability, you might want to use mcspdr with lockholders. |
| * |
| * The MCSPDRS (now just called MCSPDR; these are the default) locks can avoid |
| * the storm, but at the cost of a little more in performance. mcspdrs |
| * style is about the same when not getting preempted from uth ctx compared |
| * to mcspdr (slight drop). When in vc ctx, it's about 10-20% perf hit |
| * (PDRS gains little from --vc_ctx). |
| * |
| * Turns out there is a perf hit to PDRS (and any non-stack based qnode) |
| * when running on c89. The issue is that after shuffling the vcores |
| * around, they are no longer mapped nicely to pcores (VC0->PC1, VC1->PC2). |
| * This is due to some 'false sharing' of the cachelines, caused mostly by |
| * aggressive prefetching (notably the intel adjacent cacheline prefetcher, |
| * which grabs two CLs at a time!). Basically, stack-based qnodes are |
| * qnodes that are very far apart in memory. Cranking up the padding in |
| * qnodes in the "qnodes-in-locks" style replicates this. |
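| * |
| * The "crank up the padding" fix amounts to something like the sketch below |
| * (not the actual parlib definition): size and align each qnode to two cache |
| * lines, so the adjacent-line prefetcher can't pull in a neighbor's qnode: |
| * |
| * 	struct padded_qnode { |
| * 		struct padded_qnode *next; |
| * 		int locked; |
| * 	} __attribute__((aligned(128))); // 2 x 64B cache lines |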
| * |
| * For some info on the prefetching: |
| * http://software.intel.com/en-us/articles/optimizing-application-performance-on-intel-coret-microarchitecture-using-hardware-implemented-prefetchers/ |
| * http://software.intel.com/en-us/forums/topic/341769 |
| * |
| * Here's some rough numbers of the different styles for qnodes on c89. |
| * 'in order' is VCn->PC(n+1) (0->1, 1->2). Worst order is with even VCs |
| * on one socket, odds on the other. the number of CLs is the size of a |
| * qnode. mcspdr is the new style (called mcspdrs in some places in this |
| * document), with lock-based qnodes. mcspdr2 is the same, but with |
| * stack-based qnodes. mcspdro is the old style (bad at recovery), stack |
| * based, sometimes just called mcs-pdr |
| * |
| * with prefetchers disabled (MCS and DCU) |
| * mcspdr 1CL 4.8-5.4 in order, 3.8-4.2 worst order |
| * mcspdr 2CL in order, worst order |
| * mcspdr 4CL 5.2-6.0 in order, 4.7-5.3 worst order |
| * mcspdr 8CL 5.4-6.7 in order, 5.2-6.2 worst order |
| * mcspdr 16CL 5.1-5.8 in order, 5.2-6.8 worst order |
| * mcspdr2 stck in order, worst order |
| * mcspdro stck 4-3.4.3 in order, 4.2-4.5 worst order |
| * mcspdro-vcctx 4.8-7.0 in order, 5.3-6.7 worst order |
| * can we see the 2 humps? |
| * mcspdr 1CL yes but less, varied, etc |
| * mcspdr2 no |
| * |
| * test again with worst order with prefetchers enabled |
| * mcspdr 1CL 3.8-4.0 in order, 2.6-2.7 worst order |
| * mcspdr 2CL 4.2-4.4 in order, 3.8-3.9 worst order |
| * mcspdr 4CL 4.5-5.2 in order, 4.0-4.2 worst order |
| * mcspdr 8CL 4.4-5.1 in order, 4.3-4.7 worst order |
| * mcspdr 16CL 4.4-4.8 in order, 4.4-5.3 worst order |
| * mcspdr2 stck 3.0-3.0 in order, 2.9-3.0 worst order |
| * mcspdro stck 4.2-4.3 in order, 4.2-4.4 worst order |
| * mcspdro-vcctx 5.2-6.4 in order, 5.0-5.9 worst order |
| * can we see the 2 humps? |
| * mcspdrs 1CL yes, clearly |
| * mcspdr2 no |
| * |
| * PROGRAM FEATURES |
| * - verbosity? vcoremap, preempts, the throughput and latency histograms? |
| * - have a max workers option (0?) == max vcores |
| * - would like to randomize (within bounds) the hold/delay times |
| * - help avoid convoys with MCS locks |
| * |
| * PERFORMANCE: |
| * |
| * pcore control? (hyperthreading, core 0, cross socket?) |
| * want some options for controlling which threads run where, or |
| * which vcores are even used (like turning off hyperthreading)? |
| * implement ticket spinlocks? (more fair, more effects of preempts) |
| * no simple way to do PDR either, other than 'check everyone' |
| * (a non-PDR sketch is at the end of this PERFORMANCE section) |
| * MCS vs MCSPDR vs __MCSPDR |
| * MCS seems slightly better than __MCSPDR (and it should) |
| * MCSPDR is a bit worse than __MCSPDR |
| * - the uth_disable/enable code seems to make a |
| * difference. |
| * - i see why the latencies are worse, since they have |
| * extra work to do, but the internal part that contends |
| * with other cores shouldn't be affected, unless there's |
| * some other thing going on. Or perhaps there isn't |
| * always someone waiting for the lock? |
| * - faking VC ctx mostly negates the cost of MCSPDR vs __MCSPDR |
| * things that made a big diff: CL aligning the qnodes, putting qnodes on |
| * stacks, reading in the vcoreid once before ensuring() |
| * both MCS CAS unlocks could use some branch prediction work |
| * spinpdr locks are 2-3x faster than spinlocks... |
| * test, test&set vs the existing test&set, plus lots of asserts |
| * |
| * some delay (like 10us) lowers latency while maintaining throughput |
| * - makes sense esp with MCS. if you join the queue at the last |
| * second, you'll measure lower latency than attempting right away |
| * - also true for spinlocks |
| * - we can probably figure out the max throughput (TP = f(delay)) |
| * for each lock type |
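| * - back-of-envelope: with per-acquire cost c (hold + handoff) and n |
| * workers, throughput saturates near 1/c so long as delay < (n-1)*c |
| * (the queue never drains); past that, it falls toward n/(delay + c) |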
| * |
| * hard to get steady numbers with MCS - different runs of the same test |
| * will vary in throughput by around 15-30% (e.g., MCS varying from 3k-4k |
| * L/ms) |
| * - happens on c89 (NUMA) and hossin (UMA) |
| * - spinlocks seem a little steadier. |
| * - for MCS locks, the order in which they line up across the pcores |
| * will matter. like if on one run, i regularly hand off between cores |
| * in the same socket and only do one cross-socket step |
| * - run a lot of shorter ones to get a trend, for now |
| * - might be correlated with spikes in held times (last bin) |
| * - can't turn off legacy USB on c89 (SMM) - interferes with PXE |
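| * |
| * For the ticket-lock item above, a minimal non-PDR sketch (GCC builtins |
| * for the atomics; cpu_relax() assumed to be the usual pause hint): |
| * |
| * 	struct ticket_lock { |
| * 		uint32_t next; // next ticket to hand out |
| * 		uint32_t owner; // ticket currently being served |
| * 	}; |
| * |
| * 	static void ticket_lock(struct ticket_lock *tl) |
| * 	{ |
| * 		uint32_t ticket = __sync_fetch_and_add(&tl->next, 1); |
| * |
| * 		while (__atomic_load_n(&tl->owner, __ATOMIC_ACQUIRE) != ticket) |
| * 			cpu_relax(); |
| * 	} |
| * |
| * 	static void ticket_unlock(struct ticket_lock *tl) |
| * 	{ |
| * 		__atomic_store_n(&tl->owner, tl->owner + 1, __ATOMIC_RELEASE); |
| * 	} |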
| * |
| * PREEMPTS: |
| * better preempt record tracking? |
| * i just hacked some event-intercept and timestamp code together |
| * maybe put it in the event library? |
| * the timestamps definitely helped debugging |
| * |
| * is it true that if uthread code never spins outside a PDR lock, then it |
| * doesn't need preemption IPIs? (just someone checks the event at some |
| * point). |
| * think so: so long as you make progress and when you aren't, you |
| * check events (like if a uthread blocks on something and enters VC |
| * ctx) |
| * adjusting the number of workers, whether vcores or uthreads |
| * - if you have more lockers than cores: |
| * - spinpdr a worker will get starved (akaros) (without 2LS support) |
| * - running this from uth context will cause a handle_events |
| * - mcspdr will require the kernel to switch (akaros) |
| * - spin (akaros) might DL (o/w nothing), (linux) poor perf |
| * - mcs (akaros) will DL, (linux) poor perf |
| * - poor perf (latency spikes) comes from running the wrong thread |
| * sometimes |
| * - deadlock comes from the lack of kernel-level context switching |
| * - if we scale workers down to the number of active vcores: |
| * - two things: the initial hit, and the steady state. during the |
| * initial hit, we can still deadlock, since we have more lockers than |
| * cores |
| * - non-pdr (akaros) could deadlock in the initial hit |
| * - (akaros) steady state, everything is normal (just fewer cores) |
| * - how can we adjust this in linux? |
| * - if you know how many cores you have, futex-wait the others (a |
| * sketch is at the end of this block) |
| * - need some way to wake them back up |
| * - if you do this in userspace, you might need something PDR-like |
| * to handle when the "2LS" code gets preempted |
| * - as mentioned above, the problem in akaros is that the lock/unlock |
| * might be happening too fast to get into the steady-state and recover |
| * from the initial preemption |
| * - one of our benefits is that we can adapt in userspace, with userspace |
| * knowledge, under any circumstance. |
| * - we have the deadlock windows (forcing PDR). |
| * - in return, we can do this adaptation in userspace |
| * - and (arguably) anyone who does this in userspace will need PDR |
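| * |
| * Rough linux-only sketch of the futex-parking idea above (hypothetical |
| * helpers, not part of this test; needs <linux/futex.h>, <sys/syscall.h>, |
| * and <limits.h>): |
| * |
| * 	static uint32_t nr_slots; // how many workers may run right now |
| * |
| * 	static void park_if_excess(uint32_t my_id) |
| * 	{ |
| * 		uint32_t cur; |
| * |
| * 		while ((cur = __atomic_load_n(&nr_slots, __ATOMIC_ACQUIRE)) <= my_id) |
| * 			syscall(SYS_futex, &nr_slots, FUTEX_WAIT, cur, NULL, NULL, 0); |
| * 	} |
| * |
| * 	static void set_nr_slots(uint32_t new_nr) |
| * 	{ |
| * 		__atomic_store_n(&nr_slots, new_nr, __ATOMIC_RELEASE); |
| * 		syscall(SYS_futex, &nr_slots, FUTEX_WAKE, INT_MAX, NULL, NULL, 0); |
| * 	} |
| * |
| * whoever calls set_nr_slots() is the "2LS"-ish piece that can itself get |
| * preempted, which is the PDR-like problem mentioned above. |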
| * |
| * MEASUREMENT (user/parlib/measure.c) |
| * extract into its own library, for linux apps |
| * print out raw TSC times? might help sync up diff timelines |
| * Need more latency bins, spinlocks vary too much |
| * maybe we need better high/low too, since this hist looks bad too |
| * or not center on the average? |
| * for printing, it's hard to know without already binning. |
| * maybe bin once (latency?), then use that to adjust the hist? |
| * |
| * Had this on a spinlock: |
| * [ 32 - 35656] 1565231: |
| * (less than 200 intermediate) |
| * [ 286557 - 20404788] 65298: * |
| * |
| * Samples per dot: 34782 |
| * Total samples: 1640606 |
| * Avg time : 96658 |
| * Stdev time : 604064.440882 |
| * Coef Var : 6.249503 |
| * High coeff of var with serious outliers, adjusted bins |
| * 50/75/90/99: 33079 / 33079 / 33079 / 290219 (-<860) |
| * Min / Max : 32 / 20404788 |
| * was 50/75/90 really within 860 of each other? |
| * |
| * when we are preempted and don't even attempt anything, say for 10ms, it |
| * actually doesn't hurt our 50/75/90/99 too much. we have a ridiculous |
| * stddev and max, and high average, but there aren't any additional |
| * attempts at locking to mess with the attempt-latency. Only nr_vcores |
| * requests are in flight during the preemption, but we can spit out around |
| * 5000 per ms when we aren't preempted. |
| * |
| */ |
| |
| const char *argp_program_version = "lock_test v0.1475263"; |
| const char *argp_program_bug_address = "<akaros+subscribe@googlegroups.com>"; |
| |
| static char doc[] = "lock_test -- spinlock benchmarking"; |
| static char args_doc[] = "-w NUM -l NUM -t LOCK"; |
| |
| #define OPT_VC_CTX 1 |
| #define OPT_ADJ_WORKERS 2 |
| |
| static struct argp_option options[] = { |
| {"workers", 'w', "NUM", OPTION_NO_USAGE, "Number of threads/cores"}, |
| {0, 0, 0, 0, ""}, |
| {"loops", 'l', "NUM", OPTION_NO_USAGE, "Number of loops per worker"}, |
| {0, 0, 0, 0, ""}, |
| {"type", 't', "LOCK",OPTION_NO_USAGE, "Type of lock to use. " |
| "Options:\n" |
| "\tmcs\n" |
| "\tmcscas\n" |
| "\tmcspdr\n" |
| "\tmcspdro\n" |
| "\t__mcspdro\n" |
| "\tspin\n" |
| "\tspinpdr"}, |
| {0, 0, 0, 0, "Other options (not mandatory):"}, |
| {"adj_workers", OPT_ADJ_WORKERS, 0, 0, |
| "Adjust workers such that the " |
| "number of workers equals the " |
| "number of vcores"}, |
| {"vc_ctx", OPT_VC_CTX, 0, 0, "Run threads in mock-vcore context"}, |
| {0, 0, 0, 0, ""}, |
| {"hold", 'h', "NSEC", 0, "nsec to hold the lock"}, |
| {"delay", 'd', "NSEC", 0, "nsec to delay between grabs"}, |
| {"print", 'p', "ROWS", 0, "Print ROWS of optional measurements"}, |
| {"outfile", 'o', "FILE", 0, "Print ROWS of optional measurements"}, |
| { 0 } |
| }; |
| |
| struct prog_args { |
| int nr_threads; |
| int nr_loops; |
| int hold_time; |
| int delay_time; |
| int nr_print_rows; |
| bool fake_vc_ctx; |
| bool adj_workers; |
| char *outfile_path; |
| void *(*lock_type)(void *arg); |
| }; |
| struct prog_args pargs = {0}; |
| |
| /* Globals */ |
| struct time_stamp { |
| uint64_t pre; |
| uint64_t acq; |
| uint64_t un; |
| bool valid; |
| }; |
| struct time_stamp **times; |
| bool run_locktest = TRUE; |
| pthread_barrier_t start_test; |
| |
| /* Locking functions. Define globals here, init them in main (if possible), and |
| * use the lock_func() macro to make your thread func. */ |
| |
| #define lock_func(lock_name, lock_cmd, unlock_cmd) \ |
| void *lock_name##_thread(void *arg) \ |
| { \ |
| long thread_id = (long)arg; \ |
| int hold_time = ACCESS_ONCE(pargs.hold_time); \ |
| int delay_time = ACCESS_ONCE(pargs.delay_time); \ |
| int nr_loops = ACCESS_ONCE(pargs.nr_loops); \ |
| bool fake_vc_ctx = ACCESS_ONCE(pargs.fake_vc_ctx); \ |
| bool adj_workers = ACCESS_ONCE(pargs.adj_workers); \ |
| uint64_t pre_lock, acq_lock, un_lock; \ |
| struct time_stamp *this_time; \ |
| struct mcs_lock_qnode mcs_qnode = MCS_QNODE_INIT; \ |
| struct mcs_pdro_qnode pdro_qnode = MCSPDRO_QNODE_INIT; \ |
| int i; \ |
| /* guessing a unique vcoreid for the __mcspdr test. if the program gets |
| * preempted for that test, things may go nuts */ \ |
| pdro_qnode.vcoreid = (thread_id + 1) % pargs.nr_threads; \ |
| /* Wait til all threads are created. Ideally, I'd like to busywait |
| * unless absolutely critical to yield */ \ |
| pthread_barrier_wait(&start_test); \ |
| if (fake_vc_ctx) { \ |
| /* tells the kernel / other vcores we're in vc ctx */ \ |
| uth_disable_notifs(); \ |
| /* tricks ourselves into believing we're in vc ctx */ \ |
| __vcore_context = TRUE; \ |
| } \ |
| for (i = 0; i < nr_loops; i++) { \ |
| if (!run_locktest) \ |
| break; \ |
| pre_lock = read_tsc_serialized(); \ |
| \ |
| lock_cmd \ |
| \ |
| acq_lock = read_tsc_serialized(); \ |
| if (hold_time) \ |
| ndelay(hold_time); \ |
| \ |
| unlock_cmd \ |
| \ |
| un_lock = read_tsc_serialized(); \ |
| this_time = ×[thread_id][i]; \ |
| this_time->pre = pre_lock; \ |
| this_time->acq = acq_lock; \ |
| this_time->un = un_lock; \ |
| /* Can turn these on/off to control which samples we gather */ \ |
| this_time->valid = TRUE; \ |
| /* this_time->valid = (num_vcores() == max_vcores()); */ \ |
| \ |
| if (delay_time) \ |
| ndelay(delay_time); \ |
| /* worker thread ids are 0..n-1. if we're one of the threads |
| * that's beyond the VC count, we yield. */ \ |
| if (adj_workers && num_vcores() < thread_id + 1) { \ |
| if (fake_vc_ctx) { \ |
| __vcore_context = FALSE; \ |
| uth_enable_notifs(); \ |
| } \ |
| /* we'll come back up once we have enough VCs running*/\ |
| pthread_yield(); \ |
| if (fake_vc_ctx) { \ |
| uth_disable_notifs(); \ |
| __vcore_context = TRUE; \ |
| } \ |
| } \ |
| cmb(); \ |
| } \ |
| /* First thread to finish stops the test */ \ |
| run_locktest = FALSE; \ |
| if (fake_vc_ctx) { \ |
| __vcore_context = FALSE; \ |
| uth_enable_notifs(); \ |
| } \ |
| return (void*)(long)i; \ |
| } |
| |
| #define fake_lock_func(lock_name, x1, x2) \ |
| void *lock_name##_thread(void *arg) \ |
| { \ |
| printf("Lock " #lock_name " not supported!\n"); \ |
| exit(-1); \ |
| } |
| |
| spinlock_t spin_lock = SPINLOCK_INITIALIZER; |
| struct mcs_lock mcs_lock = MCS_LOCK_INIT; |
| |
| /* Defines locking funcs like "mcs_thread" */ |
| lock_func(mcs, |
| mcs_lock_lock(&mcs_lock, &mcs_qnode);, |
| mcs_lock_unlock(&mcs_lock, &mcs_qnode);) |
| lock_func(mcscas, |
| mcs_lock_lock(&mcs_lock, &mcs_qnode);, |
| mcs_lock_unlock_cas(&mcs_lock, &mcs_qnode);) |
| lock_func(spin, |
| spinlock_lock(&spin_lock);, |
| spinlock_unlock(&spin_lock);) |
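| |
| /* Adding another flavor is just another lock_func() instantiation (plus a |
| * case in parse_opt() to select it). For instance, a hypothetical pthread |
| * mutex variant for comparison on Linux could look like this (not wired up, |
| * hence the #if 0): */ |
| #if 0 |
| pthread_mutex_t pth_mutex = PTHREAD_MUTEX_INITIALIZER; |
| |
| lock_func(mutex, |
| 	  pthread_mutex_lock(&pth_mutex);, |
| 	  pthread_mutex_unlock(&pth_mutex);) |
| #endif |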
| |
| #ifdef __ros__ |
| struct spin_pdr_lock spdr_lock = SPINPDR_INITIALIZER; |
| struct mcs_pdr_lock mcspdr_lock; |
| struct mcs_pdro_lock mcspdro_lock = MCSPDRO_LOCK_INIT; |
| |
| lock_func(mcspdr, |
| mcs_pdr_lock(&mcspdr_lock);, |
| mcs_pdr_unlock(&mcspdr_lock);) |
| lock_func(mcspdro, |
| mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);, |
| mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);) |
| lock_func(__mcspdro, |
| __mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);, |
| __mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);) |
| lock_func(spinpdr, |
| spin_pdr_lock(&spdr_lock);, |
| spin_pdr_unlock(&spdr_lock);) |
| #else |
| |
| fake_lock_func(mcspdr, 0, 0); |
| fake_lock_func(mcspdro, 0, 0); |
| fake_lock_func(__mcspdro, 0, 0); |
| fake_lock_func(spinpdr, 0, 0); |
| |
| #endif |
| |
| static int get_acq_latency(void **data, int i, int j, uint64_t *sample) |
| { |
| struct time_stamp **times = (struct time_stamp**)data; |
| /* 0 for initial time means we didn't measure */ |
| if (times[i][j].pre == 0) |
| return -1; |
| /* can optionally throw out invalid times (keep this in sync with the |
| * lock_test macro, based on what you want to measure). */ |
| #if 0 |
| if (!times[i][j].valid) |
| return -1; |
| #endif |
| *sample = times[i][j].acq - times[i][j].pre - get_tsc_overhead(); |
| return 0; |
| } |
| |
| static int get_hld_latency(void **data, int i, int j, uint64_t *sample) |
| { |
| struct time_stamp **times = (struct time_stamp**)data; |
| /* 0 for initial time means we didn't measure */ |
| if (times[i][j].pre == 0) |
| return -1; |
| *sample = times[i][j].un - times[i][j].acq - get_tsc_overhead(); |
| return 0; |
| } |
| |
| static int get_acq_timestamp(void **data, int i, int j, uint64_t *sample) |
| { |
| struct time_stamp **times = (struct time_stamp**)data; |
| /* 0 for initial time means we didn't measure */ |
| if (times[i][j].pre == 0) |
| return -1; |
| *sample = times[i][j].acq; |
| return 0; |
| } |
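| |
| /* A hypothetical extra metric is just another callback in this style, e.g. |
| * the full attempt-to-unlock cycle, handed to compute_stats() from main(). |
| * Not wired up: */ |
| #if 0 |
| static int get_cycle_latency(void **data, int i, int j, uint64_t *sample) |
| { |
| 	struct time_stamp **times = (struct time_stamp**)data; |
| |
| 	/* 0 for initial time means we didn't measure */ |
| 	if (times[i][j].pre == 0) |
| 		return -1; |
| 	*sample = times[i][j].un - times[i][j].pre - get_tsc_overhead(); |
| 	return 0; |
| } |
| #endif |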
| |
| #ifdef __ros__ |
| |
| /* Lousy event intercept. build something similar in the event library? */ |
| #define MAX_NR_EVENT_TRACES 1000 |
| uint64_t preempts[MAX_NR_EVENT_TRACES] = {0}; |
| uint64_t indirs[MAX_NR_EVENT_TRACES] = {0}; |
| atomic_t preempt_idx; |
| atomic_t indir_idx; |
| atomic_t preempt_cnt; |
| atomic_t indir_cnt; |
| |
| static void trace_preempt(struct event_msg *ev_msg, unsigned int ev_type, |
| void *data) |
| { |
| unsigned long my_slot = atomic_fetch_and_add(&preempt_idx, 1); |
| |
| if (my_slot < MAX_NR_EVENT_TRACES) |
| preempts[my_slot] = read_tsc(); |
| atomic_inc(&preempt_cnt); |
| } |
| |
| static void trace_indir(struct event_msg *ev_msg, unsigned int ev_type, |
| void *data) |
| { |
| |
| unsigned long my_slot = atomic_fetch_and_add(&indir_idx, 1); |
| if (my_slot < MAX_NR_EVENT_TRACES) |
| indirs[my_slot] = read_tsc(); |
| atomic_inc(&indir_cnt); |
| } |
| |
| /* Helper, prints out the preempt trace */ |
| static void print_preempt_trace(uint64_t starttsc, int nr_print_rows) |
| { |
| /* reusing nr_print_rows for the nr preempt/indirs rows as well */ |
| |
| int preempt_rows = MIN(MAX_NR_EVENT_TRACES, nr_print_rows); |
| if (pargs.fake_vc_ctx) { |
| printf("No preempt trace available when faking vc ctx\n"); |
| return; |
| } |
| printf("\n"); |
| printf("Nr Preempts: %d\n", atomic_read(&preempt_cnt)); |
| printf("Nr Indirs : %d\n", atomic_read(&indir_cnt)); |
| if (preempt_rows) |
| printf("Preempt/Indir events:\n-----------------\n"); |
| for (int i = 0; i < preempt_rows; i++) { |
| if (preempts[i]) |
| printf("Preempt %3d at %6llu\n", |
| i, tsc2msec(preempts[i] - starttsc)); |
| } |
| for (int i = 0; i < preempt_rows; i++) { |
| if (indirs[i]) |
| printf("Indir %3d at %6llu\n", |
| i, tsc2msec(indirs[i] - starttsc)); |
| } |
| } |
| |
| /* Make sure we have enough VCs for nr_threads, pref 1:1 at the start */ |
| static void os_prep_work(pthread_t *worker_threads, int nr_threads) |
| { |
| if (nr_threads > max_vcores()) { |
| printf("Too many threads (%d) requested, can't get more than %d vc\n", |
| nr_threads, max_vcores()); |
| exit(-1); |
| } |
| atomic_init(&preempt_idx, 0); |
| atomic_init(&indir_idx, 0); |
| atomic_init(&preempt_cnt, 0); |
| atomic_init(&indir_cnt, 0); |
| parlib_never_yield = TRUE; |
| pthread_need_tls(FALSE); |
| pthread_mcp_init(); /* gives us one vcore */ |
| register_ev_handler(EV_VCORE_PREEMPT, trace_preempt, 0); |
| register_ev_handler(EV_CHECK_MSGS, trace_indir, 0); |
| if (pargs.fake_vc_ctx) { |
| /* need to disable events when faking vc ctx. since we're |
| * looping and not handling events, we could run OOM */ |
| clear_kevent_q(EV_VCORE_PREEMPT); |
| clear_kevent_q(EV_CHECK_MSGS); |
| } |
| vcore_request_total(nr_threads); |
| parlib_never_vc_request = TRUE; |
| for (int i = 0; i < nr_threads; i++) { |
| printd("Vcore %d mapped to pcore %d\n", i, |
| __procinfo.vcoremap[i].pcoreid); |
| } |
| } |
| |
| static void os_post_work(pthread_t *worker_threads, int nr_threads) |
| { |
| } |
| |
| #endif |
| |
| /* Argument parsing */ |
| static error_t parse_opt (int key, char *arg, struct argp_state *state) |
| { |
| struct prog_args *pargs = state->input; |
| |
| switch (key) { |
| case 'w': |
| pargs->nr_threads = atoi(arg); |
| if (pargs->nr_threads < 0) { |
| printf("Negative nr_threads...\n\n"); |
| argp_usage(state); |
| } |
| break; |
| case 'l': |
| pargs->nr_loops = atoi(arg); |
| if (pargs->nr_loops < 0) { |
| printf("Negative nr_loops...\n\n"); |
| argp_usage(state); |
| } |
| break; |
| case OPT_ADJ_WORKERS: |
| pargs->adj_workers = TRUE; |
| break; |
| case OPT_VC_CTX: |
| pargs->fake_vc_ctx = TRUE; |
| break; |
| case 'h': |
| pargs->hold_time = atoi(arg); |
| if (pargs->hold_time < 0) { |
| printf("Negative hold_time...\n\n"); |
| argp_usage(state); |
| } |
| break; |
| case 'd': |
| pargs->delay_time = atoi(arg); |
| if (pargs->delay_time < 0) { |
| printf("Negative delay_time...\n\n"); |
| argp_usage(state); |
| } |
| break; |
| case 'o': |
| pargs->outfile_path = arg; |
| break; |
| case 'p': |
| pargs->nr_print_rows = atoi(arg); |
| if (pargs->nr_print_rows < 0) { |
| printf("Negative print_rows...\n\n"); |
| argp_usage(state); |
| } |
| break; |
| case 't': |
| if (!strcmp("mcs", arg)) { |
| pargs->lock_type = mcs_thread; |
| break; |
| } |
| if (!strcmp("mcscas", arg)) { |
| pargs->lock_type = mcscas_thread; |
| break; |
| } |
| if (!strcmp("mcspdr", arg)) { |
| pargs->lock_type = mcspdr_thread; |
| break; |
| } |
| if (!strcmp("mcspdro", arg)) { |
| pargs->lock_type = mcspdro_thread; |
| break; |
| } |
| if (!strcmp("__mcspdro", arg)) { |
| pargs->lock_type = __mcspdro_thread; |
| break; |
| } |
| if (!strcmp("spin", arg)) { |
| pargs->lock_type = spin_thread; |
| break; |
| } |
| if (!strcmp("spinpdr", arg)) { |
| pargs->lock_type = spinpdr_thread; |
| break; |
| } |
| printf("Unknown locktype %s\n\n", arg); |
| argp_usage(state); |
| break; |
| case ARGP_KEY_ARG: |
| printf("Warning, extra argument %s ignored\n\n", arg); |
| break; |
| case ARGP_KEY_END: |
| if (!pargs->nr_threads) { |
| printf("Must select a number of threads.\n\n"); |
| argp_usage(state); |
| break; |
| } |
| if (!pargs->nr_loops) { |
| printf("Must select a number of loops.\n\n"); |
| argp_usage(state); |
| break; |
| } |
| if (!pargs->lock_type) { |
| printf("Must select a type of lock.\n\n"); |
| argp_usage(state); |
| break; |
| } |
| break; |
| default: |
| return ARGP_ERR_UNKNOWN; |
| } |
| return 0; |
| } |
| |
| static struct argp argp = {options, parse_opt, args_doc, doc}; |
| |
| int main(int argc, char** argv) |
| { |
| pthread_t *worker_threads; |
| void **loops_done; |
| struct timeval start_tv = {0}; |
| struct timeval end_tv = {0}; |
| long usec_diff, total_loops = 0; |
| uint64_t starttsc; |
| int nr_threads, nr_loops; |
| FILE *outfile; |
| struct sample_stats acq_stats, hld_stats; |
| |
| argp_parse(&argp, argc, argv, 0, 0, &pargs); |
| nr_threads = pargs.nr_threads; |
| nr_loops = pargs.nr_loops; |
| #ifdef __ros__ |
| mcs_pdr_init(&mcspdr_lock); |
| #endif |
| |
| if (pargs.outfile_path) { |
| /* RDWR, CREAT, TRUNC, O666 */ |
| outfile = fopen(pargs.outfile_path, "w+"); |
| if (!outfile) { |
| perror("outfile"); |
| exit(-1); |
| } |
| } |
| worker_threads = malloc(sizeof(pthread_t) * nr_threads); |
| if (!worker_threads) { |
| perror("pthread_t malloc failed:"); |
| exit(-1); |
| } |
| loops_done = malloc(sizeof(void*) * nr_threads); |
| if (!loops_done) { |
| perror("loops_done malloc failed"); |
| exit(-1); |
| } |
| printf("Making %d workers, %d loops each, %sadapting workers to vcores, and %sfaking vcore context\n", |
| nr_threads, nr_loops, |
| pargs.adj_workers ? "" : "not ", |
| pargs.fake_vc_ctx ? "" : "not "); |
| pthread_barrier_init(&start_test, NULL, nr_threads); |
| |
| times = malloc(sizeof(struct time_stamp *) * nr_threads); |
| assert(times); |
| for (int i = 0; i < nr_threads; i++) { |
| times[i] = malloc(sizeof(struct time_stamp) * nr_loops); |
| if (!times[i]) { |
| perror("Record keeping malloc"); |
| exit(-1); |
| } |
| memset(times[i], 0, sizeof(struct time_stamp) * nr_loops); |
| } |
| printf("Record tracking takes %ld bytes of memory\n", |
| nr_threads * nr_loops * sizeof(struct time_stamp)); |
| os_prep_work(worker_threads, nr_threads);/* ensure we have enough VCs */ |
| /* Doing this in MCP ctx, so we might have been getting a few preempts |
| * already. Want to read start before the threads pass their barrier */ |
| starttsc = read_tsc(); |
| /* create and join on yield */ |
| for (long i = 0; i < nr_threads; i++) { |
| if (pthread_create(&worker_threads[i], NULL, pargs.lock_type, |
| (void*)i)) |
| perror("pth_create failed"); |
| } |
| os_post_work(worker_threads, nr_threads); |
| if (gettimeofday(&start_tv, 0)) |
| perror("Start time error..."); |
| for (int i = 0; i < nr_threads; i++) { |
| pthread_join(worker_threads[i], &loops_done[i]); |
| } |
| if (gettimeofday(&end_tv, 0)) |
| perror("End time error..."); |
| |
| printf("Acquire times (TSC Ticks)\n---------------------------\n"); |
| acq_stats.get_sample = get_acq_latency; |
| compute_stats((void**)times, nr_threads, nr_loops, &acq_stats); |
| |
| printf("Held times (from acq til rel done) (TSC Ticks)\n------\n"); |
| hld_stats.get_sample = get_hld_latency; |
| compute_stats((void**)times, nr_threads, nr_loops, &hld_stats); |
| |
| usec_diff = (end_tv.tv_sec - start_tv.tv_sec) * 1000000 + |
| (end_tv.tv_usec - start_tv.tv_usec); |
| printf("Time to run: %ld usec\n", usec_diff); |
| |
| printf("\nLock throughput:\n-----------------\n"); |
| /* throughput for the entire duration (in ms), 1ms steps. print as many |
| * steps as they ask for (up to the end of the run). */ |
| print_throughput((void**)times, usec_diff / 1000 + 1, msec2tsc(1), |
| pargs.nr_print_rows, |
| starttsc, nr_threads, |
| nr_loops, get_acq_timestamp); |
| print_preempt_trace(starttsc, pargs.nr_print_rows); |
| |
| for (int i = 0; i < nr_threads; i++) { |
| total_loops += (long)loops_done[i]; |
| if (!loops_done[i]) |
| printf("WARNING: thread %d performed 0 loops!\n", i); |
| } |
| printf("Average number of loops done, per thread: %ld\n", |
| total_loops / nr_threads); |
| for (int i = 0; i < nr_threads; i++) |
| printf("\tThread %d performed %lu loops\n", |
| i, (long)loops_done[i]); |
| |
| if (pargs.outfile_path) { |
| fprintf(outfile, "#"); |
| for (char **arg = argv; *arg; arg++) |
| fprintf(outfile, " %s", *arg); |
| fprintf(outfile, "\n"); |
| fprintf(outfile, "# thread_id attempt pre acq(uire) un(lock) " |
| "tsc_overhead\n"); |
| fprintf(outfile, |
| "# acquire latency: acq - pre - tsc_overhead\n"); |
| fprintf(outfile, "# hold time: un - acq - tsc_overhead\n"); |
| fprintf(outfile, "# tsc_frequency %llu\n", get_tsc_freq()); |
| fprintf(outfile, |
| "# tsc_overhead is 0 on linux, hard code it with a value from akaros\n"); |
| for (int i = 0; i < nr_threads; i++) { |
| for (int j = 0; j < nr_loops; j++) { |
| struct time_stamp *ts = ×[i][j]; |
| if (!ts->pre) |
| break; /* empty record */ |
| fprintf(outfile, "%d %d %llu %llu %llu %llu\n", |
| i, j, ts->pre, ts->acq, ts->un, |
| get_tsc_overhead()); |
| } |
| } |
| fclose(outfile); |
| } |
| printf("Done, exiting\n"); |
| } |