/* Copyright (c) 2013 The Regents of the University of California
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Basic perf test for small functions.  Will run them in a loop and give you
 * the average cost per iteration.  It'll run them both as an SCP and an MCP.
 *
 * To use this, define a function of the form:
 *
 * 		void my_test(unsigned long nr_loops)
 *
 * Which does some computation you wish to measure inside a loop that runs
 * nr_loops times.  Then in microb_test(), add your function in a line such as:
 *
 * 		test_time_ns(my_test, 100000);
 *
 * This macro will run your test and print the results.  Pick a loop amount that
 * is reasonable for your operation.  You can also use test_time_us() for longer
 * operations.
 *
 * Notes:
 * - I went with this style so you could do some prep work before and after the
 *   loop (instead of having a macro build the loop).  It's what I needed.
 * - Be sure to double check the ASM inside the loop to make sure the compiler
 *   isn't optimizing out your main work. 
 * - Make sure your function isn't static.  If it is static (and even if it is
 *   __attribute__((noinline))), if the function is called only once, the
 *   compiler will compile it differently (specifically, it will hardcode the
 *   number of loops into the function, instead of taking a parameter).
 *   Suddenly, the existence of a second test of the same function could change
 *   the performance of *both* test runs.  Incidentally, when this happened to
 *   me, the tests were *better* when this optimization didn't happen.  The way
 *   to avoid the optimization completely is to have extern functions, since the
 *   compiler can't assume they are only called once.  Technically the compiler
 *   could still do some optimizations, so the only really safe way is to put
 *   the tests in another .c file. */

#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>

/* OS dependent #incs */
#include <parlib/parlib.h>
#include <parlib/vcore.h>
#include <parlib/timing.h>
#include <parlib/stdio.h>

/* Looks up the calling vcore's entry in __procinfo.vcoremap and returns the
 * pcoreid stored there. */
static uint32_t __get_pcoreid(void)
{
	uint32_t vcoreid = vcore_id();

	return __procinfo.vcoremap[vcoreid].pcoreid;
}

/* Testing functions here */

/* Measures the cost of switching the TLS descriptor by hand on 32-bit x86.
 * Every iteration stores a segment descriptor built from this vcore's VCPD TLS
 * descriptor into __procdata.ldt[vcoreid] and then reloads %gs with the
 * matching selector.  The descriptor returned by get_tls_desc() before the
 * loop is handed back to set_tls_desc() once the loop is done.
 *
 * On any architecture other than i386 this function does nothing.
 *
 * nr_loops: number of store + %gs reload iterations to run. */
void set_tlsdesc_test(unsigned long nr_loops)
{
#ifdef __i386__
	uint32_t vcoreid = vcore_id();
	void *mytls = get_tls_desc();
	void *vctls = get_vcpd_tls_desc(vcoreid);
	segdesc_t tmp = SEG(STA_W, (uint32_t)vctls, 0xffffffff, 3);
	/* x86 selector layout: table index in bits 3 and up; the low three bits
	 * (0x07) select the LDT with requested privilege level 3. */
	uint32_t gs = (vcoreid << 3) | 0x07;

	/* The counter has the same type as nr_loops: an int counter would be
	 * compared as signed-vs-unsigned and overflow for counts above INT_MAX. */
	for (unsigned long i = 0; i < nr_loops; i++) {
		__procdata.ldt[vcoreid] = tmp;
		/* cmb() is defined outside this file - presumably a compiler barrier
		 * keeping the store ahead of the segment reload. */
		cmb();
		asm volatile("movl %0,%%gs" : : "r" (gs) : "memory");
	}
	set_tls_desc(mytls);
#else
	(void)nr_loops;
#endif
}

/* Internal test infrastructure */

/* Baseline for the timing macros: a loop of nr_loops iterations whose body is
 * only cmb().  Timing it gives the per-iteration cost of the loop itself,
 * which the test_time_*() macros subtract from a real test's measurement.
 *
 * cmb() is defined outside this file; presumably it is a compiler barrier that
 * stops the otherwise empty loop from being optimized away.
 *
 * nr_loops: number of iterations to spin. */
void loop_overhead(unsigned long nr_loops)
{
	/* The counter has the same type as nr_loops: an int counter would be
	 * compared as signed-vs-unsigned and overflow for counts above INT_MAX. */
	for (unsigned long i = 0; i < nr_loops; i++) {
		cmb();
	}
}

/* Calls func(loops) once, bracketed by two gettimeofday() samples, and
 * evaluates to the elapsed wall-clock time in microseconds.  If either sample
 * fails, the error is reported with perror() and the measurement carries on
 * with whatever the (zero-initialized) timeval holds. */
#define __test_time_us(func, loops)                                            \
({                                                                             \
	struct timeval t_begin = {0}, t_finish = {0};                              \
                                                                               \
	if (gettimeofday(&t_begin, NULL) != 0)                                     \
		perror("Start time error...");                                         \
	(func)((loops));                                                           \
	if (gettimeofday(&t_finish, NULL) != 0)                                    \
		perror("End time error...");                                           \
	((t_finish.tv_usec - t_begin.tv_usec) +                                    \
	 (t_finish.tv_sec - t_begin.tv_sec) * 1000000);                            \
})

/* Runs func(loops) once and evaluates to the elapsed time in nanoseconds, as a
 * long long.  The measurement itself comes from __test_time_us(), so the
 * resolution is still one microsecond; the value is only rescaled.
 *
 * The usec count is widened to long long before the multiply: where long is
 * 32 bits, scaling in plain long would overflow for any run longer than about
 * 2.1 seconds. */
#define __test_time_ns(func, loops)                                            \
({                                                                             \
	((long long)__test_time_us((func), (loops)) * 1000LL);                     \
})

/* Runs func(loops), subtracts the estimated loop overhead, and prints the
 * total and per-iteration cost in microseconds.
 *
 * Requirements and behavior:
 * - An unsigned long long named 'nsec_per_loop' (cost of one empty loop
 *   iteration, in nsec) must be in scope at the point of use.
 * - 'loops' is evaluated exactly once, so it may be any expression.
 * - If the measured time does not exceed the estimated overhead, 0 is reported
 *   rather than letting the unsigned subtraction wrap around.
 * - A loop count of 0 reports a per-iteration cost of 0 instead of dividing by
 *   zero. */
#define test_time_us(func, loops)                                              \
({                                                                             \
	unsigned long nr_loops_ = (loops);                                         \
	unsigned long long usec_ovh_ = nsec_per_loop * nr_loops_ / 1000;           \
	long long usec_raw_ = __test_time_us((func), nr_loops_);                   \
	unsigned long long usec_diff = 0;                                          \
                                                                               \
	if (usec_raw_ > 0 && (unsigned long long)usec_raw_ > usec_ovh_)            \
		usec_diff = (unsigned long long)usec_raw_ - usec_ovh_;                 \
	printf("\"%s\" total: %lluus, per iteration: %lluus\n", #func, usec_diff,  \
	       nr_loops_ ? usec_diff / nr_loops_ : 0ULL);                          \
})

/* Runs func(loops), subtracts the estimated loop overhead, and prints the
 * total and per-iteration cost in nanoseconds.
 *
 * Requirements and behavior:
 * - An unsigned long long named 'nsec_per_loop' (cost of one empty loop
 *   iteration, in nsec) must be in scope at the point of use.
 * - 'loops' is evaluated exactly once, so it may be any expression.
 * - If the measured time does not exceed the estimated overhead (easy to hit
 *   with a very cheap test body), 0 is reported rather than letting the
 *   unsigned subtraction wrap around to a huge value.
 * - A loop count of 0 reports a per-iteration cost of 0 instead of dividing by
 *   zero. */
#define test_time_ns(func, loops)                                              \
({                                                                             \
	unsigned long nr_loops_ = (loops);                                         \
	unsigned long long nsec_ovh_ = nsec_per_loop * nr_loops_;                  \
	long long nsec_raw_ = __test_time_ns((func), nr_loops_);                   \
	unsigned long long nsec_diff = 0;                                          \
                                                                               \
	if (nsec_raw_ > 0 && (unsigned long long)nsec_raw_ > nsec_ovh_)            \
		nsec_diff = (unsigned long long)nsec_raw_ - nsec_ovh_;                 \
	printf("\"%s\" total: %lluns, per iteration: %lluns\n", #func, nsec_diff,  \
	       nr_loops_ ? nsec_diff / nr_loops_ : 0ULL);                          \
})

/* Runs the benchmark suite once from the calling context: prints whether we
 * are in MCP mode and which vcore/pcore we are on, calibrates the empty-loop
 * overhead, then times each registered test. */
static void microb_test(void)
{
	/* Iterations used to calibrate the empty-loop cost */
	enum { calib_loops = 100000 };
	/* This name is fixed: the test_time_*() macros read it from this scope */
	unsigned long long nsec_per_loop;
	const char *mcp_prefix = in_multi_mode() ? "" : "not ";

	printf("We are %sin MCP mode, running on vcore %d, pcore %d\n",
	       mcp_prefix, vcore_id(), __get_pcoreid());

	/* The overhead is divided down to a per-iteration figure here and
	 * multiplied back up inside the macros.  That trades a little accuracy
	 * for usability: each test can pick its own iteration count without the
	 * loop overhead having to be measured again. */
	nsec_per_loop = __test_time_ns(loop_overhead, calib_loops) / calib_loops;
	printd("Loop overhead per loop: %lluns\n", nsec_per_loop);

	/* Add your tests here.  Func name, number of loops */
	test_time_ns(set_tlsdesc_test, 100000);
}

/* Thread start routine handed to pthread_create() by main(): runs the
 * benchmark suite once.  The argument is unused and the result is always
 * NULL. */
void *worker_thread(void *arg)
{
	(void)arg;

	microb_test();
	return NULL;
}

/* Runs the benchmark suite twice: first directly from main(), then again from
 * a newly created pthread that main() waits for.  (Per the file header, the
 * intent is to get numbers both as an SCP and as an MCP.)
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the worker thread could not be
 * created or joined. */
int main(int argc, char **argv)
{
	pthread_t child;
	void *child_ret;
	int rc;

	(void)argc;
	(void)argv;

	microb_test();
	printf("Spawning worker thread, etc...\n");

	/* pthread functions return the error number directly (errno is not set) */
	rc = pthread_create(&child, NULL, &worker_thread, NULL);
	if (rc != 0) {
		/* 'child' was never filled in, so it must not be joined */
		fprintf(stderr, "pthread_create failed: error %d\n", rc);
		return EXIT_FAILURE;
	}

	rc = pthread_join(child, &child_ret);
	if (rc != 0) {
		fprintf(stderr, "pthread_join failed: error %d\n", rc);
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}
