| /* Copyright (c) 2013 The Regents of the University of California |
| * Barret Rhoden <brho@cs.berkeley.edu> |
| * See LICENSE for details. |
| * |
| * Basic perf test for small functions. Will run them in a loop and give you |
| * the average cost per iteration. It'll run them both as an SCP and an MCP. |
| * |
| * To use this, define a function of the form: |
| * |
| * void my_test(unsigned long nr_loops) |
| * |
| * Which does some computation you wish to measure inside a loop that runs |
| * nr_loops times. Then in microb_test(), add your function in a line such as: |
| * |
| * test_time_ns(my_test, 100000); |
| * |
| * This macro will run your test and print the results. Pick a loop amount that |
| * is reasonable for your operation. You can also use test_time_us() for longer |
| * operations. |
| * |
| * Notes: |
| * - I went with this style so you could do some prep work before and after the |
| * loop (instead of having a macro build the loop). It's what I needed. |
| * - Be sure to double check the ASM inside the loop to make sure the compiler |
| * isn't optimizing out your main work. |
| * - Make sure your function isn't static. If it is static (and even if it is |
| * __attribute__((noinline))), if the function is called only once, the |
| * compiler will compile it differently (specifically, it will hardcode the |
| * number of loops into the function, instead of taking a parameter). |
| * Suddenly, the existence of a second test of the same function could change |
| * the performance of *both* test runs. Incidentally, when this happened to |
| * me, the tests were *better* when this optimization didn't happen. The way |
| * to avoid the optimization completely is to have extern functions, since the |
| * compiler can't assume it is only called once. Though technically they |
| * still could do some optimizations, and the only really safe way is to put |
| * the tests in another .c file. */ |
| |
| #include <stdio.h> |
| #include <pthread.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <sys/time.h> |
| |
| /* OS dependent #incs */ |
| #include <parlib/parlib.h> |
| #include <parlib/vcore.h> |
| #include <parlib/timing.h> |
| #include <parlib/stdio.h> |
| |
| static uint32_t __get_pcoreid(void) |
| { |
| return __procinfo.vcoremap[vcore_id()].pcoreid; |
| } |
| |
| /* Testing functions here */ |
| |
/* Benchmark body: nr_loops times, writes a segment descriptor for the vcore's
 * VCPD TLS area into this vcore's LDT slot and reloads %gs with the selector
 * for that slot.  The caller's original TLS descriptor is restored afterwards.
 *
 * Only does work when built for i386; on other targets it is an empty
 * function, so the reported cost is meaningless there. */
void set_tlsdesc_test(unsigned long nr_loops)
{
#ifdef __i386__
	uint32_t vcoreid = vcore_id();
	void *mytls = get_tls_desc();
	void *vctls = get_vcpd_tls_desc(vcoreid);
	segdesc_t tmp = SEG(STA_W, (uint32_t)vctls, 0xffffffff, 3);
	/* Selector: index = vcoreid, low three bits set (x86: TI = LDT,
	 * RPL = 3). */
	uint32_t gs = (vcoreid << 3) | 0x07;

	/* The counter has the same type as nr_loops: an int counter would be
	 * compared as unsigned and overflow for counts above INT_MAX. */
	for (unsigned long i = 0; i < nr_loops; i++) {
		__procdata.ldt[vcoreid] = tmp;
		/* Keep the LDT store ahead of the segment register load */
		cmb();
		asm volatile("movl %0,%%gs" : : "r" (gs) : "memory");
	}
	/* Put back the TLS descriptor we had on entry */
	set_tls_desc(mytls);
#else
	(void)nr_loops;
#endif
}
| |
| /* Internal test infrastructure */ |
| |
/* Empty benchmark body used to estimate the cost of the loop itself.  Each
 * iteration only issues cmb() (presumably a compiler barrier - it is what the
 * real tests use too), which keeps the loop from being optimized away. */
void loop_overhead(unsigned long nr_loops)
{
	/* The counter has the same type as nr_loops: an int counter would be
	 * compared as unsigned and overflow for counts above INT_MAX. */
	for (unsigned long i = 0; i < nr_loops; i++)
		cmb();
}
| |
/* Runs func(loops) once and evaluates to the wall-clock time it took, in usec,
 * as a long long.
 *
 * The time is taken with gettimeofday().  If either call fails, the error is
 * reported with perror() and the affected timestamp stays zeroed, so the
 * result is not meaningful in that case.
 *
 * The arithmetic is done in long long so the seconds-to-usec conversion (and
 * any later scaling by the caller) cannot overflow where long is 32 bits. */
#define __test_time_us(func, loops) \
({ \
	struct timeval tt_start_ = {0}; \
	struct timeval tt_end_ = {0}; \
	\
	if (gettimeofday(&tt_start_, NULL)) \
		perror("Start time error..."); \
	(func)((loops)); \
	if (gettimeofday(&tt_end_, NULL)) \
		perror("End time error..."); \
	((long long)(tt_end_.tv_sec - tt_start_.tv_sec) * 1000000LL + \
	 (long long)(tt_end_.tv_usec - tt_start_.tv_usec)); \
})
| |
/* Runs func(loops) once and evaluates to the elapsed time in nsec, as a long
 * long.  The resolution is still that of __test_time_us(): the usec result is
 * simply scaled by 1000.  The scaling is done in long long, since a 32-bit
 * multiplication would overflow once about 2.1 seconds have elapsed. */
#define __test_time_ns(func, loops) \
	((long long)__test_time_us((func), (loops)) * 1000LL)
| |
/* Runs func(loops), subtracts the loop overhead, and prints the total and
 * per-iteration cost in usec.  Evaluates to printf()'s return value.
 *
 * Requires 'nsec_per_loop' (loop overhead per iteration, in nsec) to be in
 * scope where the macro is used.  'loops' is evaluated exactly once.  If the
 * measured time is not larger than the estimated overhead, or loops is 0, the
 * affected figure is reported as 0 rather than wrapping around to a huge
 * unsigned value or dividing by zero. */
#define test_time_us(func, loops) \
({ \
	unsigned long tt_loops_ = (loops); \
	long long tt_raw_us_ = __test_time_us((func), tt_loops_); \
	unsigned long long tt_ovh_us_ = nsec_per_loop * tt_loops_ / 1000; \
	unsigned long long usec_diff = 0; \
	\
	if (tt_raw_us_ > 0 && (unsigned long long)tt_raw_us_ > tt_ovh_us_) \
		usec_diff = (unsigned long long)tt_raw_us_ - tt_ovh_us_; \
	printf("\"%s\" total: %lluus, per iteration: %lluus\n", #func, \
	       usec_diff, \
	       tt_loops_ ? usec_diff / tt_loops_ : 0ULL); \
})
| |
/* Runs func(loops), subtracts the loop overhead, and prints the total and
 * per-iteration cost in nsec.  Evaluates to printf()'s return value.
 *
 * Requires 'nsec_per_loop' (loop overhead per iteration, in nsec) to be in
 * scope where the macro is used.  'loops' is evaluated exactly once.  If the
 * measured time is not larger than the estimated overhead, or loops is 0, the
 * affected figure is reported as 0 rather than wrapping around to a huge
 * unsigned value or dividing by zero. */
#define test_time_ns(func, loops) \
({ \
	unsigned long tt_loops_ = (loops); \
	long long tt_raw_ns_ = __test_time_ns((func), tt_loops_); \
	unsigned long long tt_ovh_ns_ = nsec_per_loop * tt_loops_; \
	unsigned long long nsec_diff = 0; \
	\
	if (tt_raw_ns_ > 0 && (unsigned long long)tt_raw_ns_ > tt_ovh_ns_) \
		nsec_diff = (unsigned long long)tt_raw_ns_ - tt_ovh_ns_; \
	printf("\"%s\" total: %lluns, per iteration: %lluns\n", #func, \
	       nsec_diff, \
	       tt_loops_ ? nsec_diff / tt_loops_ : 0ULL); \
})
| |
| static void microb_test(void) |
| { |
| unsigned long long nsec_per_loop; |
| |
| printf("We are %sin MCP mode, running on vcore %d, pcore %d\n", |
| (in_multi_mode() ? "" : "not "), vcore_id(), |
| __get_pcoreid()); |
| /* We divide the overhead by loops, and later we multiply again, which |
| * drops off some accuracy at the expense of usability (can do different |
| * iterations for different tests without worrying about recomputing the |
| * loop overhead). */ |
| nsec_per_loop = __test_time_ns(loop_overhead, 100000) / 100000; |
| printd("Loop overhead per loop: %lluns\n", nsec_per_loop); |
| |
| /* Add your tests here. Func name, number of loops */ |
| test_time_ns(set_tlsdesc_test , 100000); |
| } |
| |
/* Thread entry point handed to pthread_create(): runs the benchmark pass on
 * the new thread.  The argument is ignored and the thread's result is always
 * NULL. */
void *worker_thread(void* arg)
{
	(void)arg;

	microb_test();
	return NULL;
}
| |
| int main(int argc, char** argv) |
| { |
| pthread_t child; |
| void *child_ret; |
| |
| microb_test(); |
| printf("Spawning worker thread, etc...\n"); |
| pthread_create(&child, NULL, &worker_thread, NULL); |
| pthread_join(child, &child_ret); |
| } |