| /* Barret Rhoden |
| * |
| * Code heavily ported from "How to Benchmark Code Execution Times on Intel(R) |
| * IA-32 and IA-64 Instruction Set Architectures" for Linux, except for |
| * check_timing_stability(). |
| * |
| * The idea behind this was that the traditional style of using rdtsc was to |
| * call: |
| * cpuid; |
| * rdtsc; |
| * since rdtsc does no serialization (meaning later instructions can get |
| * executed before it, or vice versa). While this first cpuid isn't a big deal, |
| * doing this in pairs means reading the end time also measures cpuid. This is |
| * a problem since cpuid can vary quite a bit. |
| * |
| * If we use rdtscp for the end call, we can put the cpuid after rdtscp, thereby |
| * not including cpuid's overhead (and variability) in our measurement. That's |
| * where the intel doc ends. For more info, check out: |
| * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf |
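| * |
| * Spelled out, the Intel-recommended pairing (case 1 below) looks like this, |
| * in the shorthand of this comment (the real inline-asm versions are in the |
| * #if 0 blocks in filltimes()): |
| * cpuid; |
| * rdtsc; // start measurement; cpuid keeps earlier work out |
| * ... code under test ... |
| * rdtscp; // end measurement; waits for the code above to finish |
| * cpuid; // keeps later instructions out of the measured region |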
| * |
| * Note that the Intel SDM says you can serialize rdtsc with lfence, such as: |
| * lfence; |
| * rdtsc; |
| * Linux uses this (mfence on AMD, lfence on Intel). For more info: |
| * https://lkml.org/lkml/2008/1/2/353 |
| * Note that this use of lfence before rdtsc supposedly serializes any |
| * instruction, not just loads. Some stranger on the internet suggested that |
| * while lfence only serializes memory (and not arbitrary instructions), in |
| * actual hardware there is no point in reordering non-memory instructions |
| * around rdtsc: |
| * http://stackoverflow.com/questions/12631856/difference-between-rdtscp-rdtsc-memory-and-cpuid-rdtsc |
| * (look for janneb's response to questions about his comment) |
| * |
| * It's not clear from what anyone writes whether or not you need to serialize |
| * below rdtsc. Supposedly, you'd need cpuid/lfence on both sides of rdtsc to |
| * prevent reordering in both directions. Andi Kleen does this in a few |
| * places: |
| * https://lkml.org/lkml/2008/1/7/276 |
| * though other places in the kernel suggest it is unnecessary (at least for |
| * loads): |
| * http://lxr.linux.no/#linux+v3.8.2/arch/x86/kvm/x86.c#L1258 |
| * The Intel docs don't mention it (otherwise we would be told to use |
| * lfence;rdtsc;lfence). The howto this file is based on doesn't mention it |
| * either, other than to say rdtscp needs to serialize from below. AFAIK, |
| * rdtscp is like rdtsc, except that it serializes from above (and also |
| * returns the CPU id). If rdtscp needs to serialize from below, then so |
| * should rdtsc. |
| * |
| * That being said, if these rdtsc(p) calls do not need serialization from |
| * below, then rdtscp (which provides serialization from above) should not need |
| * any additional serialization (lfence or cpuid). |
| * |
| * I tried out a few options for the assembly of the start and end time |
| * measurements, using the Intel benchmark. The benchmark reports variance, |
| * max deviation, and minimum per inner loop (line), as well as an overall |
| * variance, max dev, and the variance of the variances/mins. |
| * |
| * CASE START ASM END ASM |
| * --------------------------------------------------- |
| * case 0: cpuid;rdtsc; cpuid;rdtscp; |
| * case 1: cpuid;rdtsc; rdtscp;cpuid; (or rdtscp;lfence) |
| * case 2: lfence;rdtsc; rdtscp;cpuid; (or rdtscp;lfence) |
| * case 3: rdtscp; rdtscp;cpuid; (or rdtscp;lfence) |
| * case 4: rdtscp; rdtscp; |
| * case 5: lfence;rdtsc; lfence;rdtsc; |
| * case 6: lfence;rdtsc;lfence; lfence;rdtsc;lfence; |
| * |
| * Note I only ran these a couple of times, with 1000x10000 (loop_bound x |
| * stat_size), and I did notice some slight variation between runs (on cases 3 |
| * and 4). |
| * |
| * case 0: wildly variable; the variance of variances wasn't 0, etc. (as |
| * reported by the Intel benchmark). |
| * case 0: some lines: 0 var, 0-8 max dev, 420 min |
| * case 0: other lines: 50-60 var, 20 max dev, 388 min |
| * |
| * For all of the other cases, variance of variances and of minvalues was 0. |
| * |
| * case 1: most lines 2-3 var, 4 max dev, 44 min, 2 var 4 max dev overall |
| * case 2: most lines 2-3 var, 4 max dev, 44 min, 2 var 4 max dev overall |
| * case 3: most lines 0 var, 0 max dev, 32 min, 0 var 0 max dev overall |
| * case 4: most lines 0 var, 0 max dev, 32 min, 0 var 4 max dev overall |
| * case 5: most lines 3 var, 4 max dev, 28 min, 2 var 4 max dev overall |
| * case 6: most lines 3 var, 4 max dev, 44 min, 2 var 4 max dev overall |
| * |
| * cases 1-3: cpuid vs lfence: both seem to work the same and have no effect |
| * (since they are outside the loop) |
| * |
| * So running with rdtscp for both start and stop (cases 3 and 4) had the |
| * least variance (both per line and total). When these cases did have any |
| * deviation, it was because one run had a min of 28 but was otherwise 32 (1 |
| * out of 10,000,000, often the first run). |
| * |
| * All the others have a little deviation, but with a more stable min. Again, |
| * this is taken mostly from a small number of runs (of 1kx10k). |
| * |
| * Note that cases 5 and 6 have lfences inside the measurement area, and this |
| * does not seem to cause problems the same way cpuid does. However, lfences |
| * inside the critical section (especially after whatever code we are |
| * measuring) probably will have an effect on real code that has made memory |
| * accesses (keep in mind we would need an mfence on AMD here). |
| * |
| * All that being said, it's not clear which option to use. Ideally, we want |
| * an isolated region of code to be measured, with very little variance and |
| * max deviation. If cases 1-6 are all the same in terms of protection (which |
| * I'm not sure about), then 3-4 look nice. However, the fact that sometimes |
| * the min is less than 'normal' means that we could get negative numbers for |
| * some measurements (the goal is to determine the overhead and subtract that |
| * from our total measurement, and if we think the overhead is 32 but it was |
| * actually 28 for a run, we could have issues). |
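| * |
| * Concretely, the overhead subtraction would look like this (a sketch, where |
| * 'overhead' is the measured floor, e.g. 32): |
| * diff = end - start; |
| * elapsed = (diff > overhead) ? diff - overhead : 0; |
| * With unsigned math, a 28-cycle reading minus a 32-cycle overhead would |
| * wrap around rather than go negative. |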
| * |
| * But wait, there's more: |
| * |
| * When we add code around (and inside) the measurement, things get even worse: |
| * - If we put 'variable = j + i;' in the loop (variable is volatile, btw), |
| * there's no real change. I checked cases 1, 4, and 5: 1 being the Intel |
| * recommendation, 4 being the one with the best variance with no code, and 5 |
| * being a good symmetric choice (same on start and end). Case 1 had no |
| * change at all. 4 and 5 had little change (min was the same, with |
| * occasional deviation). Note that case 5 doesn't use rdtscp at the end |
| * either. |
| * - If we put in 'variable = i;' as well, the minimum is still unaffected, |
| * and there is a little more variance. For example, for case 4, the min is |
| * still 32, and sometimes you get a 36. |
| * |
| * If we add more code (like a for loop that grows in length with each outer |
| * loop), eventually we can detect the existence of the instructions. The |
| * Intel author talks about this in section 3.3, where he finds the resolution |
| * of the benchmark. |
| * |
| * My hunch is that the rdtsc(p) calls hide the latency of some previous |
| * instructions, regardless of serialization commands. We see this 'hiding' of |
| * the cost of instructions regardless of whether or not the first or last |
| * commands are rdtscp (I'm more concerned with the end time call, which is |
| * where this hiding may be happening). Perhaps the pipeline needs to be |
| * drained (or something), and it takes a certain amount of time to do so, |
| * regardless of a few extra instructions squeezed in. Meaning we can't tell |
| * the difference between 0 and a few cycles, and probably a few cycles are |
| * 'free' / hidden by the rdtsc call. |
| * |
| * Bottom line? Our measurements are inexact, despite the stable minimum and |
| * low variance. Everything will be +/- our max deviation, as well as |
| * potentially underestimating by a few cycles/ticks. One thing we can do is |
| * try to see what the resolution is of the different methods. |
| * |
| * case 1: cpuid;rdtsc; rdtscp;cpuid; (or rdtscp;lfence) |
| * ------------------- |
| * loop_size:0 >>>> variance(cycles): 3; max_deviation: 8; min time: 44 |
| * loop_size:1 >>>> variance(cycles): 6; max_deviation: 28; min time: 44 |
| * loop_size:2 >>>> variance(cycles): 4; max_deviation: 16; min time: 44 |
| * loop_size:3 >>>> variance(cycles): 12; max_deviation: 44; min time: 44 |
| * loop_size:4 >>>> variance(cycles): 10; max_deviation: 32; min time: 44 |
| * loop_size:5 >>>> variance(cycles): 10; max_deviation: 32; min time: 44 |
| * loop_size:6 >>>> variance(cycles): 12; max_deviation: 36; min time: 44 |
| * loop_size:7 >>>> variance(cycles): 5; max_deviation: 32; min time: 48 |
| * loop_size:8 >>>> variance(cycles): 16; max_deviation: 52; min time: 48 |
| * loop_size:9 >>>> variance(cycles): 13; max_deviation: 48; min time: 52 |
| * loop_size:10 >>>> variance(cycles): 9; max_deviation: 36; min time: 52 |
| * loop_size:11 >>>> variance(cycles): 16; max_deviation: 64; min time: 56 |
| * |
| * case 4: rdtscp; rdtscp; |
| * ------------------- |
| * loop_size:0 >>>> variance(cycles): 1; max_deviation: 20; min time: 32 |
| * loop_size:1 >>>> variance(cycles): 12; max_deviation: 36; min time: 36 |
| * loop_size:2 >>>> variance(cycles): 13; max_deviation: 32; min time: 36 |
| * loop_size:3 >>>> variance(cycles): 7; max_deviation: 32; min time: 40 |
| * loop_size:4 >>>> variance(cycles): 1; max_deviation: 16; min time: 44 |
| * loop_size:5 >>>> variance(cycles): 4; max_deviation: 28; min time: 44 |
| * loop_size:6 >>>> variance(cycles): 12; max_deviation: 48; min time: 44 |
| * loop_size:7 >>>> variance(cycles): 8; max_deviation: 32; min time: 44 |
| * loop_size:8 >>>> variance(cycles): 10; max_deviation: 48; min time: 48 |
| * |
| * case 5: lfence;rdtsc; lfence;rdtsc; |
| * ------------------- |
| * loop_size:0 >>>> variance(cycles): 3; max_deviation: 12; min time: 28 |
| * loop_size:1 >>>> variance(cycles): 8; max_deviation: 28; min time: 32 |
| * loop_size:2 >>>> variance(cycles): 8; max_deviation: 28; min time: 32 |
| * loop_size:3 >>>> variance(cycles): 6; max_deviation: 28; min time: 32 |
| * loop_size:4 >>>> variance(cycles): 2; max_deviation: 24; min time: 36 |
| * loop_size:5 >>>> variance(cycles): 6; max_deviation: 28; min time: 36 |
| * loop_size:6 >>>> variance(cycles): 11; max_deviation: 44; min time: 36 |
| * loop_size:7 >>>> variance(cycles): 7; max_deviation: 32; min time: 36 |
| * loop_size:8 >>>> variance(cycles): 1; max_deviation: 16; min time: 40 |
| * |
| * For cases 4 and 5, we notice the extra code quite quickly. The for loop |
| * itself has some overhead (probably more than our simple stores and adds). |
| * So the resolution of these methods is a little more than a loop's overhead. |
| * For case 1, we need about 7 loops, in addition to the overhead, until we |
| * can reliably detect the additional instructions. Note the deviation and |
| * variation increase for all cases. |
| * |
| * |
| * What about extra code before the measurement? I reran the test cases with |
| * some extra tsc-related code above the measurement (an accidental asm |
| * insertion of lfence;rdtsc above reading the start time) and with no work in |
| * between: |
| * case 1: no effect |
| * case 2: no effect |
| * These both had some form of serialization (cpuid or lfence) above the rdtsc |
| * command. But when we try using just rdtscp (with no extra serialization): |
| * case 3, normal: lines 0 var, 0 max dev, 32 min, 0 var 0 max dev |
| * case 3, extras: lines 2-3 var, 4 max dev, 28 min, 2 var 4 max dev |
| * Similar deal with case 4. Lots of 28s and deviation. It looks like |
| * sometimes the rdtsc diff is only 28, and other times 32 (hence the |
| * deviation of 4). |
| * Note this means the measurement interval is *lower*, which means the code was |
| * *faster*. Was the rdtscp not serializing instructions from above (which |
| * doesn't make sense, since anything sneaking in from above should make the |
| * code *slower*)? Or is it because the previous command was rdtsc, which might |
| * 'speed up' subsequent rdtscs. I tried it again, with a little work between |
| * the unused TSC read and the start tsc read: |
| * case 3, more crap: lines 2-3 var, 4 max dev, 28 min, 2 var 4 max dev |
| * So no real change from adding minor code in between. What about adding an |
| * lfence above the rdtscp (so it is almost exactly like case 2)? |
| * Our assembly code now looks like: |
| * lfence; |
| * rdtsc; |
| * mov %edx, (memory); // these get overwritten |
| * mov %eax, (memory); // these get overwritten |
| * |
| * mov (memory), %eax; // misc work (variable = i + j) |
| * add %esi, %eax; // misc work (variable = i + j) |
| * mov %eax, (memory); // misc work (variable = i + j) |
| * |
| * lfence; |
| * rdtscp; // this is the real start measurement |
| * mov %edx, (memory); |
| * mov %eax, (memory); |
| * |
| * // no extra work here |
| * |
| * rdtscp; // this is the real end measurement |
| * mov %edx, (memory); |
| * mov %eax, (memory); |
| * cpuid; // this is case 3, with sync after |
| * |
| * Even with this extra lfence, case 3-style still shows numbers like: |
| * case 3, added crap: lines 2-3 var, 4 max dev, 28 min, 2 var 4 max dev |
| * So either rdtscp is somehow faster due to internal processor caching (a |
| * previous rdtsc sometimes makes the next rdtscp somewhat faster, even after |
| * some instructions and an lfence), or the baseline case of no variation |
| * is "wrong", and we really should expect between 28 and 32. FWIW, the Intel |
| * author also had a max deviation of 4 (per line). And remember, on rare |
| * occasions we get a 28 for cases 3 and 4 (the other 9,999,999 times it is |
| * 32). |
| * |
| * Note how the modified case 3 is pretty much the same *in performance* as |
| * case 5, even though its code is nearly identical to case 2's. If you |
| * change the start measurement's rdtscp to an rdtsc, the min goes from 28 -> |
| * 44 (this is case 2). And if you change the end measurement's rdtscp to an |
| * lfence;rdtscp, we go from 44 -> 48 (which matches none of the cases). Then |
| * if you change that rdtscp to an rdtsc, we drop from 48 -> 28 (this is case |
| * 5). Based on this, it looks like |
| * the different types of rdtsc take their time measurement at different points |
| * within their execution. rdtsc probably takes its measurement earlier in the |
| * instruction (~16-20 cycles/ticks earlier perhaps?), based on the 48->28 |
| * back-side step and the front-side 28->44 step. |
| * |
| * Anyway, what matters is a relatively stable method without a lot of variance |
| * that has a solid floor/min that we can detect at runtime (to run tests on a |
| * given machine). Using rdtscp for the start measurement seems unreliable |
| * (when run alone we get 32; when run with other code we get 28, on the |
| * corei7). So even though cases 3 and 4 had nice low variances and |
| * deviations, I don't trust them, and would rather go with something that |
| * always gives me the same (and low) result. So case 5 will be my go-to for |
| * now. |
| * It should have the same protection as the others (perhaps 6 is better), it is |
| * stable, and it has a low overhead and low resolution (less capacity to hide |
| * instruction latency). Finally, the start and end measurements use the same |
| * code, which is very convenient. |
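| * |
| * As a C helper, case 5 boils down to something like this (a sketch; compare |
| * with read_tsc_serialized(), used later in this file, whose definition |
| * lives outside this file): |
| * static inline uint64_t case5_tsc(void) |
| * { |
| * uint32_t lo, hi; |
| * |
| * asm volatile("lfence; rdtsc" : "=a"(lo), "=d"(hi)); |
| * return ((uint64_t)hi << 32) | lo; |
| * } |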
| * |
| * This isn't conclusive - we'd need to do more tests with different workloads |
| * on different machines, and probably talk to an Intel architect. |
| * |
| * Still reading? There's one more thing: System Management Mode! This is an |
| * interrupt context that is invisible to the OS, but we can see its effects in |
| * our measurements. If you run this code with the default settings, you often |
| * won't see it (unless you have some loops). However, if you run with |
| * 1024x16384 (0x400 by 0x4000), you are likely to see very large max |
| * deviations, such as 100, 600, or even 1500000. From what I can tell, the |
| * likelihood depends on how long the inner loop runs. Using case 5 at 0x400 |
| * by 0x4000, each of 3-4 runs had about one line out of the 1024 that was |
| * much higher than the rest; across those runs, three such lines were 112 |
| * and one was 1659260. AFAIK, this is system management |
| * mode kicking in. You can mitigate this by disabling all types of USB legacy |
| * support in the BIOS. Specifically, faking USB keyboards and mice (making |
| * them look like PS/2) and USB mass storage (making them look like an HDD) |
| * all lead to an increase in SMIs. For more info, check out: |
| * https://rt.wiki.kernel.org/index.php/HOWTO:_Build_an_RT-application |
| * It is not sufficient to merely avoid using things like USB mass storage; |
| * the support needs to be disabled in the BIOS. At least, this is true on my |
| * nehalem. A |
| * while back, we had an issue with microbenchmarks taking 10% longer if you |
| * held down a key on the keyboard, even if the code was running on a core that |
| * did not receive the keyboard IRQ. Turns out this was due to a USB keyboard |
| * in legacy mode. The real root of this problem was SMM, which forces all |
| * cores to enter SMM whenever any core enters SMM (hence the cross-core |
| * interference). |
| * |
| * So finally, disable anything that may lead to SMM interference. I have some |
| * code that runs at startup that tries to determine the min time for the given |
| * approved method of measurement (i.e., case 5), and also tries to detect SMIs |
| * via massive latency spikes. */ |
| |
| #include <ros/common.h> |
| #include <arch/arch.h> |
| #include <stdio.h> |
| #include <kmalloc.h> |
| #include <time.h> |
| #include <smp.h> |
| #include <ros/procinfo.h> |
| |
| #define STAT_SIZE_DEF 10000 |
| #define LOOP_BOUND_DEF 1000 |
| |
| /* Fills in **times with the results of the double-loop measurement. There |
| * are many options for the start and end time measurements, all inside #if 0 |
| * #endif blocks. Copy/paste whichever you'd like to test out. */ |
| static inline void filltimes(uint64_t **times, unsigned int loop_bound, |
| unsigned int stat_size) |
| { |
| int i, j; |
| uint64_t start, end; |
| unsigned int start_low, start_high, end_low, end_high; |
| unsigned int dummy_low, dummy_high; |
| volatile int variable = 0; |
| |
| /* A variety of warmups, recommended for cpuid. */ |
| asm volatile ("cpuid\n\t" |
| "rdtsc\n\t" |
| "cpuid\n\t" |
| "rdtsc\n\t" |
| "cpuid\n\t" |
| "rdtsc\n\t" |
| "mov %%edx, %0\n\t" |
| "mov %%eax, %1\n\t": "=m" (dummy_high), "=m" (dummy_low):: |
| "%eax", "%ebx", "%ecx", "%edx"); |
| for (j = 0; j < loop_bound; j++) { |
| for (i = 0; i < stat_size; i++) { |
| variable = 0; |
| /* Starting side: I want to make sure we always copy out |
| * to memory (the stack), instead of sometimes using |
| * registers (and other times not). If you use =a, for |
| * instance, with no work in between, the compiler will use |
| * esi and edi to store start_high and start_low. |
| * |
| * The same concern is probably unnecessary at the end, |
| * but it might keep the compiler from reserving the use |
| * of those registers. */ |
| |
| #if 0 /* extra crap before the measurement code */ |
| asm volatile ( |
| "lfence;" |
| "rdtsc;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (dummy_high), "=m" (dummy_low) |
| : |
| : "%eax", "%edx"); |
| |
| variable = i + j; |
| #endif |
| |
| asm volatile ( |
| "lfence;" |
| "rdtsc;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (start_high), "=m" (start_low) |
| : |
| : "%eax", "%edx"); |
| #if 0 /* types of start time measurements */ |
| asm volatile ( |
| "cpuid;" |
| "rdtsc;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (start_high), "=m" (start_low) |
| : |
| : "%eax", "%ebx", "%ecx", "%edx"); |
| asm volatile ( |
| "lfence;" |
| "rdtsc;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (start_high), "=m" (start_low) |
| : |
| : "%eax", "%edx"); |
| asm volatile ( |
| "lfence;" |
| "rdtsc;" |
| "lfence;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (start_high), "=m" (start_low) |
| : |
| : "%eax", "%edx"); |
| |
| asm volatile( |
| "rdtscp;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (start_high), "=m" (start_low) |
| : |
| : "%eax", "%ecx", "%edx"); |
| #endif |
| |
| /* call the function to measure here */ |
| |
| #if 0 /* some options for code to measure */ |
| variable = j; |
| |
| variable = i + j; |
| |
| for (int k = 0; k < j; k++) |
| variable = k; |
| #endif |
| |
| asm volatile("lfence;" |
| "rdtsc;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (end_high), "=m" (end_low) |
| : |
| : "%eax", "%edx"); |
| #if 0 /* types of end time measurements */ |
| asm volatile("cpuid;" |
| "rdtsc;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (end_high), "=m" (end_low) |
| : |
| : "%eax", "%ebx", "%ecx", "%edx"); |
| asm volatile("lfence;" |
| "rdtsc;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (end_high), "=m" (end_low) |
| : |
| : "%eax", "%edx"); |
| asm volatile("lfence;" |
| "rdtsc;" |
| "lfence;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (end_high), "=m" (end_low) |
| : |
| : "%eax", "%edx"); |
| |
| asm volatile( |
| "rdtscp;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (end_high), "=m" (end_low) |
| : |
| : "%eax", "%ecx", "%edx"); |
| asm volatile( |
| "rdtscp;" |
| "lfence;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| : "=m" (end_high), "=m" (end_low) |
| : |
| : "%eax", "%ecx", "%edx"); |
| asm volatile( |
| "rdtscp;" |
| "mov %%edx, %0;" |
| "mov %%eax, %1;" |
| "cpuid;" |
| : "=m" (end_high), "=m" (end_low) |
| : |
| : "%eax", "%ebx", "%ecx", "%edx"); |
| #endif |
| |
| start = ( ((uint64_t)start_high << 32) | start_low ); |
| end = ( ((uint64_t)end_high << 32) | end_low ); |
| |
| if ( (int64_t)(end - start) < 0) { |
| printk("CRITICAL ERROR IN TAKING THE TIME!!!!\n" |
| "loop(%d) stat(%d) start = %llu, " |
| "end = %llu, variable = %u\n", j, i, |
| start, end, variable); |
| times[j][i] = 0; |
| } else { |
| times[j][i] = end - start; |
| } |
| } |
| } |
| } |
| |
| /* Population variance, per |
| * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance: |
| * var = (N * sum(x_i^2) - (sum(x_i))^2) / N^2 |
| * i.e., the textbook formula multiplied through by N/N. Note we do not check |
| * for overflow of size*size. */ |
| uint64_t var_calc(uint64_t *inputs, int size) |
| { |
| int i; |
| uint64_t acc = 0, previous = 0, temp_var = 0; |
| |
| for (i = 0; i < size; i++) { |
| if (acc < previous) |
| goto overflow; |
| previous = acc; |
| acc += inputs[i]; |
| } |
| acc = acc * acc; |
| if (acc < previous) |
| goto overflow; |
| previous = 0; |
| for (i = 0; i < size; i++) { |
| if (temp_var < previous) |
| goto overflow; |
| previous = temp_var; |
| temp_var += inputs[i] * inputs[i]; |
| } |
| temp_var = temp_var * size; |
| if (temp_var < previous) |
| goto overflow; |
| temp_var = (temp_var - acc)/(((uint64_t)(size))*((uint64_t)(size))); |
| return (temp_var); |
| overflow: |
| printk("CRITICAL OVERFLOW ERROR IN var_calc!!!!!!\n\n"); |
| return -1; |
| } |
| |
| int test_rdtsc(unsigned int loop_bound, unsigned int stat_size) |
| { |
| int8_t state = 0; |
| |
| int i = 0, j = 0, spurious = 0, k = 0; |
| uint64_t **times; |
| uint64_t *variances; |
| uint64_t *min_values; |
| uint64_t max_dev = 0, min_time = 0, max_time = 0, prev_min = 0; |
| uint64_t tot_var = 0, max_dev_all = 0, var_of_vars = 0, var_of_mins = 0; |
| |
| loop_bound = loop_bound ?: LOOP_BOUND_DEF; |
| stat_size = stat_size ?: STAT_SIZE_DEF; |
| |
| printk("Running rdtsc tests...\n"); |
| |
| times = kmalloc(loop_bound * sizeof(uint64_t*), 0); |
| if (!times) { |
| printk("unable to allocate memory for times\n"); |
| return 0; |
| } |
| |
| for (j = 0; j < loop_bound; j++) { |
| times[j] = kmalloc(stat_size * sizeof(uint64_t), 0); |
| if (!times[j]) { |
| printk("unable to allocate memory for times[%d]\n", j); |
| for (k = 0; k < j; k++) |
| kfree(times[k]); |
| kfree(times); |
| return 0; |
| } |
| } |
| |
| variances = kmalloc(loop_bound * sizeof(uint64_t), 0); |
| if (!variances) { |
| printk("unable to allocate memory for variances\n"); |
| // not bothering to free **times |
| return 0; |
| } |
| |
| min_values = kmalloc(loop_bound * sizeof(uint64_t), 0); |
| if (!min_values) { |
| printk("unable to allocate memory for min_values\n"); |
| // not bothering to free **times or variances |
| return 0; |
| } |
| |
| disable_irqsave(&state); |
| |
| filltimes(times, loop_bound, stat_size); |
| |
| enable_irqsave(&state); |
| |
| for (j = 0; j < loop_bound; j++) { |
| max_dev = 0; |
| min_time = 0; |
| max_time = 0; |
| |
| for (i = 0; i < stat_size; i++) { |
| if ((min_time == 0) || (min_time > times[j][i])) |
| min_time = times[j][i]; |
| if (max_time < times[j][i]) |
| max_time = times[j][i]; |
| } |
| max_dev = max_time - min_time; |
| min_values[j] = min_time; |
| if ((prev_min != 0) && (prev_min > min_time)) |
| spurious++; |
| if (max_dev > max_dev_all) |
| max_dev_all = max_dev; |
| variances[j] = var_calc(times[j], stat_size); |
| tot_var += variances[j]; |
| |
| printk("loop_size:%d >>>> variance(cycles): %llu; " |
| "max_deviation: %llu; min time: %llu\n", j, variances[j], |
| max_dev, min_time); |
| prev_min = min_time; |
| } |
| |
| var_of_vars = var_calc(variances, loop_bound); |
| var_of_mins = var_calc(min_values, loop_bound); |
| |
| printk("total number of spurious min values = %d\n", spurious); |
| /* tot_var/loop_bound is the average of the per-line variances */ |
| printk("average variance = %llu\n", tot_var / loop_bound); |
| printk("absolute max deviation = %llu\n", max_dev_all); |
| printk("variance of variances = %llu\n", var_of_vars); |
| printk("variance of minimum values = %llu\n", var_of_mins); |
| |
| for (j = 0; j < loop_bound; j++) { |
| kfree(times[j]); |
| } |
| kfree(times); |
| kfree(variances); |
| kfree(min_values); |
| return 0; |
| } |
| |
| |
| /* Crude SMI or other TSC-instability detection. */ |
| bool check_timing_stability(void) |
| { |
| uint64_t min_overhead = UINT64_MAX; |
| uint64_t max_overhead = 0; |
| uint64_t start, end, diff; |
| uint32_t edx; |
| int8_t irq_state = 0; |
| volatile int dummy = 0; |
| |
| /* Don't even bother if we don't have an invariant TSC */ |
| cpuid(0x80000007, 0x0, 0, 0, 0, &edx); |
| if (!(edx & (1 << 8))) { |
| printk("Invariant TSC not present. Do not benchmark!\n"); |
| return FALSE; |
| } |
| disable_irqsave(&irq_state); |
| /* 2mil detected an SMI about 95% of the time on my nehalem. */ |
| for (int i = 0; i < 3000000; i++) { |
| start = read_tsc_serialized(); |
| for (int j = 0; j < 500; j++) |
| dummy = j; |
| end = read_tsc_serialized(); |
| if ((int64_t)(end - start) < 0) { |
| printk("TSC stability overflow error!\n"); |
| enable_irqsave(&irq_state); |
| return FALSE; |
| } |
| diff = end - start; |
| min_overhead = MIN(min_overhead, diff); |
| max_overhead = MAX(max_overhead, diff); |
| } |
| enable_irqsave(&irq_state); |
| if (max_overhead - min_overhead > 50) { |
| printk("Test TSC overhead unstable (Min: %llu, Max: %llu). " |
| "Do not benchmark!\n", min_overhead, max_overhead); |
| return FALSE; |
| } |
| return TRUE; |
| } |
| |
| void test_tsc_cycles(void) |
| { |
| uint64_t start, end; |
| int8_t irq_state = 0; |
| |
| disable_irqsave(&irq_state); |
| start = read_tsc_serialized(); |
| /* 1000 iterations of 100 dependent adds each: 100,000 adds total. The |
| * string-pasting macros below expand to the same flat run of addl |
| * instructions as writing them out longhand. */ |
| #define ADDL_X1 "addl $1, %%eax;" |
| #define ADDL_X10 ADDL_X1 ADDL_X1 ADDL_X1 ADDL_X1 ADDL_X1 ADDL_X1 ADDL_X1 ADDL_X1 ADDL_X1 ADDL_X1 |
| #define ADDL_X100 ADDL_X10 ADDL_X10 ADDL_X10 ADDL_X10 ADDL_X10 ADDL_X10 ADDL_X10 ADDL_X10 ADDL_X10 ADDL_X10 |
| for (int i = 0; i < 1000; i++) |
| asm volatile (ADDL_X100 : : : "eax", "cc"); |
| #undef ADDL_X1 |
| #undef ADDL_X10 |
| #undef ADDL_X100 |
| end = read_tsc_serialized(); |
| end = end - start - __proc_global_info.tsc_overhead; |
| printk("%llu (100,000) ticks passed, run twice to load the icache\n", |
| end); |
| |
| enable_irqsave(&irq_state); |
| } |
| |
| static inline __attribute__((always_inline)) |
| uint64_t pmc_cycles(void) |
| { |
| unsigned int a = 0, d = 0; |
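| /* ECX picks the counter: bit 30 selects the fixed-function PMCs, and index |
| * 1 is the fixed counter for unhalted core cycles on Intel. The lfence |
| * orders rdpmc with respect to earlier instructions, in the same spirit as |
| * lfence;rdtsc above. */ |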
| int ecx = (1 << 30) + 1; |
| |
| asm volatile("lfence; rdpmc" : "=a"(a), "=d"(d) : "c"(ecx)); |
| return ((uint64_t)a) | (((uint64_t)d) << 32); |
| } |
| |
| /* run with $ perf stat m kfunc wrmsr_test 0xMSR 100000 */ |
| void wrmsr_test(unsigned int msr, int loops) |
| { |
| uint64_t start_cycles, diff_cycles; |
| uint64_t start_time, diff_time; |
| uint64_t msrval = read_msr(msr); |
| |
| loops = MAX(loops, 1); |
| start_cycles = pmc_cycles(); |
| start_time = start_timing(); |
| |
| for (int i = 0; i < loops; i++) |
| write_msr(msr, msrval); |
| |
| diff_cycles = pmc_cycles() - start_cycles; |
| diff_time = stop_timing(start_time); |
| |
| printk("msr 0x%x, cycles per: %llu, nsec per: %llu\n", msr, |
| diff_cycles / loops, tsc2nsec(diff_time) / loops); |
| } |
| |
| /* Does a basic test for interference. You should kfunc this, often after |
| * starting the monitor on another core. You can spam it with ipi_spam(). |
| * You'll also need the PMCs to run. Easiest way is with: |
| * $ perf stat -e cycles sleep 9999999. */ |
| void interference_test(void) |
| { |
| #define THRESHOLD 200 |
| uint64_t low_samples[THRESHOLD] = {0}; |
| uint64_t deadline = sec2tsc(5); /* assumes TSC and cycles are close */ |
| uint64_t start, diff; |
| size_t nr_below_thresh = 0; |
| size_t nr_over_thresh = 0; |
| size_t total = 0; |
| size_t max = 0; |
| |
| deadline += pmc_cycles(); |
| enable_irq(); |
| do { |
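| /* Two back-to-back counter reads: diff is the bare read overhead, so |
| * anything well above that is interference (IRQs, SMIs, etc.). */ |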
| total++; |
| start = pmc_cycles(); |
| diff = pmc_cycles() - start; |
| if (diff < COUNT_OF(low_samples)) |
| low_samples[diff]++; |
| max = diff > max ? diff : max; |
| if (diff < THRESHOLD) |
| nr_below_thresh++; |
| else |
| nr_over_thresh++; |
| if (!start) { |
| warn("rdpmc got 0, is perf stat -e cycles running? (aborting)"); |
| break; |
| } |
| } while (start < deadline); |
| disable_irq(); |
| |
| printk("\nCore %d\n--------\n", core_id()); |
| for (int i = 0; i < COUNT_OF(low_samples); i++) { |
| if (low_samples[i]) |
| printk("\t[ %2d ] : %lu\n", i, low_samples[i]); |
| } |
| printk("Total loops %lu, threshold %u\n", total, THRESHOLD); |
| printk("Nr over thresh %lu\n", nr_over_thresh); |
| printk("Nr below thresh %lu\n", nr_below_thresh); |
| printk("Max %lu\n", max); |
| } |
| |
| /* Kfunc this to spam a core with IPIs */ |
| void ipi_spam(int coreid) |
| { |
| for (int i = 0; i < 1000; i++) { |
| send_ipi(coreid, I_POKE_CORE); |
| udelay(1000); |
| } |
| } |
| |
| /* Kfunc this to halt with IRQs off. Note this doesn't fully work as |
| * advertised. Keyboard and NIC IRQs still wake it up, but LAPIC timers don't |
| * seem to. */ |
| void superhalt(void) |
| { |
| unsigned int x86_cstate = X86_MWAIT_C2; |
| |
| disable_irq(); |
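| /* monitor arms a write-watch on the address in eax (KERNBASE here; ecx/edx |
| * are extensions/hints), and mwait then halts until a write to that range |
| * or another wakeup event arrives, with the target C-state hint in eax. */ |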
| asm volatile("monitor" : : "a"(KERNBASE), "c"(0), "d"(0)); |
| asm volatile("mwait" : : "c"(0x0), "a"(x86_cstate) : "memory"); |
| printk("Core %d woke from superhalt!\n", core_id()); |
| } |