blob: fb076bf12460e5eaa1f7b7f918286f8143e5303a [file] [log] [blame]
/* Copyright (c) 2013 The Regents of the University of California
* Barret Rhoden <brho@cs.berkeley.edu>
* See LICENSE for details.
*
* Basic perf test for small functions. Will run them in a loop and give you
* the average cost per iteration. It'll run them both as an SCP and an MCP.
*
* To use this, define a function of the form:
*
* void my_test(unsigned long nr_loops)
*
* Which does some computation you wish to measure inside a loop that run
* nr_loops times. Then in microb_test(), add your function in a line such as:
*
* test_time_ns(my_test, 100000);
*
* This macro will run your test and print the results. Pick a loop amount that
* is reasonable for your operation. You can also use test_time_us() for longer
* operations.
*
* Notes:
* - I went with this style so you could do some prep work before and after the
* loop (instead of having a macro build the loop). It's what I needed.
* - Be sure to double check the ASM inside the loop to make sure the compiler
* isn't optimizing out your main work.
* - Make sure your function isn't static. If it is static (and even if it is
* __attribute__((noinline))), if the function is called only once, the
* compiler will compile it differently (specifically, it will hardcode the
* number of loops into the function, instead of taking a parameter).
* Suddenly, the existence of a second test of the same function could change
* the performance of *both* test runs. Incidentally, when this happened to
* me, the tests were *better* when this optimization didn't happen. The way
* to avoid the optimization completely is to have extern functions, since the
* compiler can't assume it is only called once. Though technically they
* still could do some optimizations, and the only really safe way is to put
* the tests in another .c file. */
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
/* OS dependent #incs */
#include <parlib/parlib.h>
#include <parlib/vcore.h>
#include <parlib/timing.h>
#include <parlib/stdio.h>
static uint32_t __get_pcoreid(void)
{
return __procinfo.vcoremap[vcore_id()].pcoreid;
}
/* Testing functions here */
void set_tlsdesc_test(unsigned long nr_loops)
{
#ifdef __i386__
uint32_t vcoreid = vcore_id();
void *mytls = get_tls_desc();
void *vctls = get_vcpd_tls_desc(vcoreid);
segdesc_t tmp = SEG(STA_W, (uint32_t)vctls, 0xffffffff, 3);
uint32_t gs = (vcoreid << 3) | 0x07;
for (int i = 0; i < nr_loops; i++) {
__procdata.ldt[vcoreid] = tmp;
cmb();
asm volatile("movl %0,%%gs" : : "r" (gs) : "memory");
}
set_tls_desc(mytls);
#endif
}
/* Internal test infrastructure */
void loop_overhead(unsigned long nr_loops)
{
for (int i = 0; i < nr_loops; i++) {
cmb();
}
}
/* Runs func(loops) and returns the usec elapsed */
#define __test_time_us(func, loops) \
({ \
struct timeval start_tv = {0}; \
struct timeval end_tv = {0}; \
\
if (gettimeofday(&start_tv, 0)) \
perror("Start time error..."); \
(func)((loops)); \
if (gettimeofday(&end_tv, 0)) \
perror("End time error..."); \
((end_tv.tv_sec - start_tv.tv_sec) * 1000000 + \
(end_tv.tv_usec - start_tv.tv_usec)); \
})
/* Runs func(loops) and returns the nsec elapsed */
#define __test_time_ns(func, loops) \
({ \
(__test_time_us((func), (loops)) * 1000); \
})
/* Runs func(loops), subtracts the loop overhead, and prints the result */
#define test_time_us(func, loops) \
({ \
unsigned long long usec_diff; \
\
usec_diff = __test_time_us((func), \
(loops)) - nsec_per_loop * loops / 1000; \
printf("\"%s\" total: %lluus, per iteration: %lluus\n", #func, \
usec_diff, \
usec_diff / (loops)); \
})
/* Runs func(loops), subtracts the loop overhead, and prints the result */
#define test_time_ns(func, loops) \
({ \
unsigned long long nsec_diff; \
\
nsec_diff = __test_time_ns((func), (loops)) - nsec_per_loop * (loops); \
printf("\"%s\" total: %lluns, per iteration: %lluns\n", #func, \
nsec_diff, \
nsec_diff / (loops)); \
})
static void microb_test(void)
{
unsigned long long nsec_per_loop;
printf("We are %sin MCP mode, running on vcore %d, pcore %d\n",
(in_multi_mode() ? "" : "not "), vcore_id(),
__get_pcoreid());
/* We divide the overhead by loops, and later we multiply again, which
* drops off some accuracy at the expense of usability (can do different
* iterations for different tests without worrying about recomputing the
* loop overhead). */
nsec_per_loop = __test_time_ns(loop_overhead, 100000) / 100000;
printd("Loop overhead per loop: %lluns\n", nsec_per_loop);
/* Add your tests here. Func name, number of loops */
test_time_ns(set_tlsdesc_test , 100000);
}
void *worker_thread(void* arg)
{
microb_test();
return 0;
}
int main(int argc, char** argv)
{
pthread_t child;
void *child_ret;
microb_test();
printf("Spawning worker thread, etc...\n");
pthread_create(&child, NULL, &worker_thread, NULL);
pthread_join(child, &child_ret);
}