| /* Copyright (c) 2013, 2014 The Regents of the University of California | 
 |  * Barret Rhoden <brho@cs.berkeley.edu> | 
 |  * See LICENSE for details. | 
 |  * | 
 |  * lock_test: microbenchmark to measure different styles of spinlocks. | 
 |  * | 
 |  * to build on linux: (hacky) | 
 |  * $ gcc -O2 -std=gnu99 -fno-stack-protector -g tests/lock_test.c -lpthread \ | 
 |  *    -lm -o linux_lock_test */ | 
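
/* example run (flag values here are arbitrary; see --help for the full set):
 * $ ./linux_lock_test -w 4 -l 10000 -t mcs -p 20 -o samples.dat */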
 |  | 
 | #define _GNU_SOURCE /* pthread_yield */ | 
 |  | 
 | #include <stdio.h> | 
 | #include <pthread.h> | 
 | #include <stdlib.h> | 
 | #include <unistd.h> | 
 | #include <sys/time.h> | 
 | #include <math.h> | 
 | #include <argp.h> | 
 | #include <sys/types.h> | 
 | #include <sys/stat.h> | 
 | #include <fcntl.h> | 
 | #include <assert.h> | 
 | #include <string.h> | 
 |  | 
 | /* OS dependent #incs */ | 
 | #ifdef __ros__ | 
 |  | 
 | #include <parlib/parlib.h> | 
 | #include <parlib/stdio.h> | 
 | #include <parlib/vcore.h> | 
 | #include <parlib/timing.h> | 
 | #include <parlib/spinlock.h> | 
 | #include <parlib/mcs.h> | 
 | #include <parlib/arch/arch.h> | 
 | #include <parlib/event.h> | 
 |  | 
 | #include <parlib/tsc-compat.h> | 
 | #include <benchutil/measure.h> | 
 |  | 
 | #else | 
 |  | 
 | #include "../user/parlib/include/parlib/tsc-compat.h" | 
 | #include "misc-compat.h" | 
 | #include "linux-lock-hacks.h" /* TODO: have a build system and lib / C file */ | 
 |  | 
 | #include "../user/benchutil/include/benchutil/measure.h" | 
 | #include "../user/benchutil/measure.c" | 
 |  | 
 | static void os_prep_work(pthread_t *worker_threads, int nr_threads) | 
 | { | 
 | 	if (nr_threads > num_vcores()) | 
 | 		printf("WARNING: %d threads requested, but only %d cores available\n", | 
 | 		       nr_threads, num_vcores()); | 
 | } | 
 |  | 
 | static void os_post_work(pthread_t *worker_threads, int nr_threads) | 
 | { | 
 | 	if (nr_threads > num_vcores()) | 
 | 		return; | 
 | 	/* assuming we're taking cores 0..nr_threads, and we never move. */ | 
 | 	for (int i = 0; i < nr_threads; i++) { | 
 | 		cpu_set_t cpuset; | 
 | 		CPU_ZERO(&cpuset); | 
 | 		CPU_SET(i, &cpuset); | 
 | 		pthread_setaffinity_np(worker_threads[i], sizeof(cpu_set_t), | 
 | 				       &cpuset); | 
 | 	} | 
 | } | 
 |  | 
 | #define print_preempt_trace(args...) {} | 
 |  | 
 | __thread int __vcore_context = 0; | 
 |  | 
 | #endif | 
 |  | 
/* TODO: There's a lot of work to do still, both on this program and on locking
 |  * and vcore code.  For some of the issues, I'll leave in the discussion / | 
 |  * answers, in case it comes up in the future (like when I read this in 8 | 
 |  * months). | 
 |  * | 
 |  * BUGS / COMMENTARY | 
 |  * Occasional deadlocks when preempting and not giving back! | 
 |  * 	- with the new PDRs style, though that doesn't mean the older styles | 
 |  * 	don't have this problem | 
 |  * 	- shouldn't be any weaker than PDR.  they all check pred_vc to see | 
 |  * 	if they are running, and if not, they make sure someone runs | 
 |  * 	- could be weaker if we have an old value for the lockholder, | 
 |  * 	someone outside the chain, and we made sure they ran, and they do | 
 |  * 	nothing (spin in the 2LS or something?) | 
 |  * 		no, they should have gotten a msg about us being preempted, | 
 |  * 		since whoever we turn into gets the message about us swapping. | 
 |  * 	- anyway, it's not clear if this is with MCSPDR, event delivery, | 
 |  * 	preemption handling, or just an artifact of the test (less likely) | 
 |  * why aren't MCS locks in uth_ctx getting dealt with? | 
 |  * 	- because the event is handled, but the lock holder isn't run.  the | 
 |  * 	preemption was dealt with, but nothing saved the lock holder | 
 |  * 	- any uthread_ctx lockholder that gets preempted will get | 
 |  * 	interrupted, and other cores will handle the preemption.  but that | 
 |  * 	uthread won't run again without 2LS support.  either all spinners | 
 |  * 	need to be aware of the 'lockholder' (PDR-style), or the 2LS needs | 
 |  * 	to know when a uthread becomes a 'lockholder' to make sure it runs | 
 |  * 	via user-level preempts.  If the latter, this needs to happen | 
 |  * 	atomically with grabbing the lock, or else be able to handle lots of | 
 |  * 	fake 'lockholders' (like round-robin among all of them) | 
 |  * why is the delay more than the expected delay? | 
 |  * 	because it takes ~2ms to spawn and run a process | 
 |  * 	could do this in a separate process, instead of a script | 
 |  * 		could also consider not using pth_test and changing prov, but | 
 |  * 		driving it by yields and requests.  would also test the | 
 |  * 		alarm/wakeup code (process sets alarm, goes to sleep, wakes up | 
 |  * 		and requests X cores) | 
 |  * why do we get occasional preempt-storms? (lots of change_tos) | 
 |  * 	due to the MCS-PDR chain, which i tried fixing by adjusting the number | 
 |  * 	of workers down to the number of vcores | 
 |  * why isn't the worker adaptation working? | 
 |  * 		- it actually was working, and nr_workers == nr_vcores.  that | 
 |  * 		just wasn't the root cause. | 
 |  * 		- was expecting it to cut down on PDR kernel traffic | 
 |  * 	- still get periods of low perf | 
 |  * 		like O(100) preempt msgs per big preempt/prov | 
 |  * 		does it really take that much to work out an MCS-PDR? | 
 |  * 	- one thing is that if we fake vc ctx, we never receive preemption | 
 |  * 	events.  might be a bad idea. | 
 |  * 		- in general, yeah.  faking VC and turning off events can really | 
 |  * 		muck with things | 
 |  * 		- these events aren't necessarily delivered to a VC who will | 
 |  * 		check events any time soon (might be the last one in the chain) | 
 |  * 		- the core of the issue is that we have the right amount of | 
 |  * 		workers and vcores, but that the system isn't given a chance to | 
 |  * 		stabilize itself.  also, if we have some VCs that are just | 
 |  * 		sitting around, spinning in the 2LS, if those get preempted, no | 
 |  * 		one notices or cares (when faking vc_ctx / getting no events) | 
 |  * 	- there is a slight race where we might make someone run who isn't a | 
 * 	lockholder.  logically, it's okay.  worst case, it would act like an
 |  * 	extra preempt and different startcore, which shouldn't be too bad. | 
 |  * | 
 |  * sanity check: does throughput match latency? (2.5GHz TSC, MCS lock) | 
 |  * 	ex: 5000 locks/ms = 5 locks/us = 200ns/lock = 500 ticks / lock | 
 |  * 	500 ticks * 31 workers (queue) = 15000 ticks | 
 |  * 	avg acquire time was around 14K.  seems fine.. | 
 |  * 	when our MCSPDR throughput tanks (during preempts), it's around | 
 |  * 	400-500 locks/ms, which is around 2us/lock.   | 
 |  * 		when the locker on a preempted chain shows up, it needs to | 
 |  * 		change to the next one in line.  | 
 |  * 			- though that should be in parallel with the other | 
 |  * 			lockholders letting go.  shouldn't be that bad | 
 |  * 			- no, it is probably at the head of the chain very soon, | 
 |  * 			such that it is the bottleneck for the actual lock.  2us | 
 |  * 			seems possible | 
 |  * | 
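 * 	same check on the degraded case (rough numbers, same 2.5GHz TSC):
 * 		400-500 locks/ms = 2.0-2.5us/lock = 5000-6250 ticks/lock
 * 		that's roughly a 10x hit vs the ~500 ticks/lock above, which
 * 		lines up with one slow change_to sitting at the head of the
 * 		chain
 *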
 |  * what does it take to get out of a preemption with (old) MCS-PDR? | 
 |  * 	- these are now called pdro locks (old) | 
 |  * 	- for a single preempt, it will take 1..n-1 changes.  avg n/2 | 
 |  * 	- for multiple preempts, it's nr_pre * that (avg np/2, worst np) | 
 |  * 	- for every unlock/reacquire cycle (someone unlocks, then rejoins | 
 * 	the list), it's nr_preempts (aka, nr_workers - nr_vcores)
 |  * 	- if we need to have a specific worker get out of the chain, on | 
 |  * 	average, it'd take n/2 cycles (p*n/2 changes)  worst: np | 
 |  * 	- if we want to get multiple workers out, the worst case is still | 
 |  * 	np, but as p increases, we're more likely to approach n cycles | 
 |  * 	- so the current model is np for the initial hit (to move the | 
 |  * 	offline VCs to the end of the chain) and another np to get our | 
 |  * 	specific workers out of the chain and yielding (2np) | 
 |  * | 
 * 	- but even with 1 preempt, we're getting 80-200 changes per preempt
 |  * | 
 |  * 	- it shouldn't matter that the sys_change_to is really slow, should | 
 |  * 	be the same amount of changes.  however, the preempted ones are | 
 |  * 	never really at the tail end of the chain - they should end up right | 
 |  * 	before the lockholder often.  while the sys_change_tos are slowly | 
 |  * 	moving towards the back of the chain, the locking code is quickly | 
 |  * 	removing (online) nodes from the head and putting them on the back. | 
 |  * | 
 |  * 	- end result: based on lock hold time and lock delay time, a | 
 |  * 	preempted VC stays in the MCS chain (swaps btw VC/nodes), and when | 
 |  * 	it is inside the chain, someone is polling to make them run.  with | 
 |  * 	someone polling, it is extremely unlikely that someone outside the | 
 |  * 	chain will win the race and be able to change_to before the in-chain | 
 |  * 	poller.  to clarify: | 
 |  * 		- hold time and delay time matter, since the longer they are, | 
 |  * 		the greater the amount of time the change_to percolation has to | 
 |  * 		get the preempted VCs to the end of the chain (where no one | 
 |  * 		polls them). | 
 |  * 		- at least one vcore is getting the event to handle the | 
 |  * 		preemption of the in-chain, offline VC.  we could change it so | 
 |  * 		every VC polls the preempt_evq, or just wait til whoever is | 
 |  * 		getting the messages eventually checks their messages (VC0) | 
 |  * 		- if there is an in-chain poller, they will notice the instant | 
 |  * 		the VC map changes, and then immediately change_to (and spin on | 
 |  * 		the proclock in the kernel).  there's almost no chance of a | 
 |  * 		normal preempt event handler doing that faster.  (would require | 
 |  * 		some IRQ latency or something serious). | 
 |  * - adding in any hold time trashes our microbenchmark's perf, but a | 
 |  * little delay time actually helps: (all with no preempts going on) | 
 |  * 	- mcspdr, no delay: 4200-4400 (-w31 -l10000, no faking, etc) | 
 |  * 	- mcspdr, d = 1: 4400-4800 | 
 |  * 	- mcspdr, d = 2: 4200-5200 | 
 |  * 	- as you add delay, it cuts down on contention for the | 
 |  * 	lock->lock cacheline.  but if you add in too much, you'll tank | 
 |  * 	throughput (since there is no contention at all). | 
 |  * 	- as we increase the delay, we cut down on the chance of the | 
 |  * 	preempt storm / preempt-stuck-in-the-chain, though it can still | 
 |  * 	happen, even with a delay of 10us | 
 |  * - maybe add in the lockholder again? (removed in 73701d6bfb) | 
 |  * 	- massively cuts performance, like 2x throughput, without | 
 |  * 	preempts | 
 * 	- its ability to help depends on the impl:
 * 		- in one version (old style), it didn't help much at all
 |  * 		- in another (optimized lockholder setting), i can't | 
 |  * 		even see the throughput hit, it recovered right away, | 
 |  * 		with O(5) messages | 
 |  * 		- the diff was having the lockholder assign the vcoreid | 
 |  * 		before passing off to the next in the chain, so that | 
 |  * 		there is less time with having "no lockholder". | 
 * 		(there's a brief period where the lockholder says it is
 * 		the next person, who still spins.  they'll have to make
 * 		sure their pred runs)
 * - adj_workers doesn't matter either...
 |  * 	- the 2LS and preemption handling might be doing this | 
 |  * 	automatically, when handle_vc_preempt() does a | 
 |  * 	thread_paused() on its current_uthread. | 
 |  * 	- adj_workers isn't critical if we're using some locks | 
 |  * 	that check notif_pending.  eventually someone hears | 
 |  * 	about preempted VCs (assuming we can keep up) | 
 |  * | 
 |  * What about delays?  both hold and delay should make it easier to get | 
 |  * the preempted vcore to the end of the chain.  but do they have to be | 
 |  * too big to be reasonable? | 
 |  * 	- yes.  hold doesn't really help much until everything is | 
 |  * 	slower.  even with a hold of around 1.2us, we still have the | 
 |  * 	change_to-storms and lowered throughput. | 
 |  * 	- doing a combo helps too.  if you hold for 1ns (quite a bit | 
 |  * 	more actually, due to the overhead of ndelay, but sufficient to | 
 * 	be "doing work"), and delay for around 7us before rejoining,
 |  * 	there's only about a 1/5 chance of a single preempt messing us | 
 |  * 	up | 
 * 		- though having multiple preempts outstanding makes this less
 * 		likely to work.
 * 		- and it seems like if we get into the storm scenario, we
 * 		never really get out.  either we get out quickly or never do.
 |  * 		depending on the workload, this could be a matter of luck | 
 |  * | 
 |  * So we could try tracking the lockholder, but only looking at it when | 
 |  * we know someone was preempted in the chain - specifically, when our | 
 |  * pred is offline.  when that happens, we don't change to them, we | 
 |  * make sure the lockholder is running. | 
 |  * 	- tracking takes us from 4200->2800 throughput or so for MCS | 
 |  * 	- 5200 -> 3700 or so for MCS in vc_ctx (__MCSPDR) | 
 |  * 	- main spike seems to be in the hold time.  bimodal distrib, | 
 |  * 	with most below 91 (the usual is everything packed below 70) and | 
 |  * 	a big spike around 320 | 
 |  * | 
 |  * Summary: | 
 |  * | 
 |  * So we need to have someone outside the chain change_to the one in the | 
 * chain; o/w, someone will always be in the chain.  Right now, it's always
 |  * the next in line who is doing the changing, so a preempted vcore is | 
 |  * always still in the chain.  | 
 |  * | 
 |  * If the locking workload has some delaying, such as while holding the | 
 |  * lock or before reacquiring, the "change_to" storm might not be a | 
 |  * problem.  If it is, the only alternative I have so far is to check the | 
 |  * lockholder (which prevents a chain member from always ensuring their | 
 |  * pred runs).  This hurts the lock's scalability/performance when we | 
 * aren't being preempted.  On the other hand, based on what you're doing
 * with the lock, one more cache miss might not be as big of a deal as in
 * lock_test.  Especially since, when you do get stormed, your throughput
 * can be terrible and never recover.
 |  * | 
 |  * Similar point: you can use spinpdr locks.  They have the PDR-benefits, | 
 |  * and won't induce the storm of change_tos.  However, this isn't much | 
 |  * better for contended locks.  They perform 2-3x worse (on c89) without | 
 |  * preemption.  Arguably, if you were worried about the preempt storms and | 
 |  * want scalability, you might want to use mcspdr with lockholders. | 
 |  * | 
 * The MCSPDRS (now just called MCSPDR; these are the default) locks can avoid
 |  * the storm, but at the cost of a little more in performance.  mcspdrs | 
 |  * style is about the same when not getting preempted from uth ctx compared | 
 |  * to mcspdr (slight drop).  When in vc ctx, it's about 10-20% perf hit | 
 |  * (PDRS gains little from --vc_ctx).  | 
 |  * | 
 |  * Turns out there is a perf hit to PDRS (and any non-stack based qnode) | 
 |  * when running on c89.  The issue is that after shuffling the vcores | 
 |  * around, they are no longer mapped nicely to pcores (VC0->PC1, VC1->PC2). | 
 |  * This is due to some 'false sharing' of the cachelines, caused mostly by | 
 |  * aggressive prefetching (notably the intel adjacent cacheline prefetcher, | 
 |  * which grabs two CLs at a time!).  Basically, stack-based qnodes are | 
 |  * qnodes that are very far apart in memory.  Cranking up the padding in | 
 |  * qnodes in the "qnodes-in-locks" style replicates this. | 
 |  * | 
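 * A rough sketch of what "cranking up the padding" means (this is not the
 * parlib qnode; the 128 below is an assumption: 64B lines, fetched in
 * adjacent pairs by the prefetcher):
 *
 *	struct padded_qnode {
 *		struct padded_qnode *next;
 *		int locked;
 *	} __attribute__((aligned(128)));
 *
 * aligned(128) also rounds sizeof() up to 128, so qnodes embedded in an
 * array (the "qnodes-in-locks" style) no longer share a prefetched pair.
 *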
 |  * For some info on the prefetching: | 
 |  * 	http://software.intel.com/en-us/articles/optimizing-application-performance-on-intel-coret-microarchitecture-using-hardware-implemented-prefetchers/ | 
 |  * 	http://software.intel.com/en-us/forums/topic/341769 | 
 |  * | 
 |  * Here's some rough numbers of the different styles for qnodes on c89. | 
 |  * 'in order' is VCn->PC(n+1) (0->1, 1->2).  Worst order is with even VCs | 
 |  * on one socket, odds on the other.  the number of CLs is the size of a | 
 |  * qnode.  mcspdr is the new style (called mcspdrs in some places in this | 
 |  * document), with lock-based qnodes.  mcspdr2 is the same, but with | 
 * stack-based qnodes.  mcspdro is the old style (bad at recovery), stack
 |  * based, sometimes just called mcs-pdr | 
 |  * | 
 * 	with prefetchers disabled (MLC and DCU)
 |  * 		mcspdr   1CL  4.8-5.4 in order, 3.8-4.2 worst order | 
 |  * 		mcspdr   2CL          in order,         worst order | 
 |  * 		mcspdr   4CL  5.2-6.0 in order, 4.7-5.3 worst order | 
 |  * 		mcspdr   8CL  5.4-6.7 in order, 5.2-6.2 worst order | 
 |  * 		mcspdr  16CL  5.1-5.8 in order, 5.2-6.8 worst order | 
 |  * 		mcspdr2 stck          in order,         worst order | 
 |  * 		mcspdro stck  4-3.4.3 in order, 4.2-4.5 worst order | 
 |  * 		mcspdro-vcctx 4.8-7.0 in order, 5.3-6.7 worst order | 
 |  * 		can we see the 2 humps?  | 
 |  * 			mcspdr 1CL yes but less, varied, etc | 
 |  * 			mcspdr2 no | 
 |  * | 
 |  * 	test again with worst order with prefetchers enabled | 
 |  * 		mcspdr   1CL  3.8-4.0 in order, 2.6-2.7 worst order | 
 |  * 		mcspdr   2CL  4.2-4.4 in order, 3.8-3.9 worst order | 
 |  * 		mcspdr   4CL  4.5-5.2 in order, 4.0-4.2 worst order | 
 |  * 		mcspdr   8CL  4.4-5.1 in order, 4.3-4.7 worst order | 
 |  * 		mcspdr  16CL  4.4-4.8 in order, 4.4-5.3 worst order | 
 |  * 		mcspdr2 stck  3.0-3.0 in order, 2.9-3.0 worst order | 
 |  * 		mcspdro stck  4.2-4.3 in order, 4.2-4.4 worst order | 
 |  * 		mcspdro-vcctx 5.2-6.4 in order, 5.0-5.9 worst order | 
 |  * 		can we see the 2 humps? | 
 |  * 			mcspdrs 1CL yes, clearly | 
 |  * 			mcspdr2 no | 
 |  * | 
 |  * PROGRAM FEATURES | 
 |  * 	- verbosity?  vcoremap, preempts, the throughput and latency histograms? | 
 |  * 	- have a max workers option (0?) == max vcores | 
 |  * 	- would like to randomize (within bounds) the hold/delay times | 
 |  * 		- help avoid convoys with MCS locks | 
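 * 		- a rough sketch of the randomization (rand_r() and a
 * 		per-thread 'seed' are assumptions, not existing code):
 * 			int jit = delay_time ? rand_r(&seed) % delay_time : 0;
 * 			ndelay(delay_time / 2 + jit);
 * 		- i.e. uniform in [d/2, 3d/2): same average delay, but the
 * 		reacquires stop lining up in lock-step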
 |  * | 
 |  * PERFORMANCE: | 
 |  * | 
 |  *	pcore control?  (hyperthreading, core 0, cross socket?) | 
 |  *		want some options for controlling which threads run where, or | 
 |  *		which vcores are even used (like turning off hyperthreading)? | 
 *	implement ticket spinlocks?  (more fair, more effects of preempts)
 *		no simple way to do PDR either, other than 'check everyone'
 *		(see the sketch at the end of this section)
 |  *	MCS vs MCSPDR vs __MCSPDR | 
 |  *		MCS seems slightly better than __MCSPDR (and it should) | 
 |  *		MCSPDR is a bit worse than __MCSPDR | 
 |  *			- the uth_disable/enable code seems to make a | 
 |  *			difference. | 
 |  *			- i see why the latencies are worse, since they have | 
 |  *			extra work to do, but the internal part that contends | 
 |  *			with other cores shouldn't be affected, unless there's | 
 |  *			some other thing going on.  Or perhaps there isn't | 
 |  *			always someone waiting for the lock? | 
 *			- faking VC ctx mostly negates the cost of MCSPDR vs
 *			__MCSPDR
 *		things that made a big diff: CL aligning the qnodes, putting
 *		qnodes on stacks, reading in the vcoreid once before ensuring()
 |  *	both MCS CAS unlocks could use some branch prediction work | 
 |  *	spinpdr locks are 2-3x faster than spinlocks... | 
 |  *		test, test&set  vs the existing test&set, plus lots of asserts | 
 |  * | 
 |  *	some delay (like 10us) lowers latency while maintaining throughput | 
 |  *		- makes sense esp with MCS.  if you join the queue at the last | 
 |  *		second, you'll measure lower latency than attempting right away | 
 |  *		- also true for spinlocks | 
 |  *		- we can probably figure out the max throughput (TP = f(delay)) | 
 |  *		for each lock type | 
 |  * | 
 |  *	hard to get steady numbers with MCS - different runs of the same test | 
 |  *	will vary in throughput by around 15-30% (e.g., MCS varying from 3k-4k | 
 |  *	L/ms) | 
 |  *		- happens on c89 (NUMA) and hossin (UMA) | 
 |  *		- spinlocks seem a little steadier. | 
 *		- for MCS locks, the order in which they line up across the
 *		pcores will matter.  like if on one run, i regularly hand off
 *		between cores in the same socket and only do one cross-socket
 *		step
 |  *		- run a lot of shorter ones to get a trend, for now | 
 *		- might be correlated with spikes in held times (last bin)
 |  *		- can't turn off legacy USB on c89 (SMM) - interferes with PXE | 
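 *
 *	the ticket spinlock sketch referenced above (a scoping sketch only,
 *	using gcc builtins and the ACCESS_ONCE/cpu_relax helpers assumed to
 *	come from the headers above; untested):
 *		struct ticket_lock {
 *			uint32_t next;
 *			uint32_t owner;
 *		};
 *		static void ticket_lock(struct ticket_lock *l)
 *		{
 *			uint32_t t = __sync_fetch_and_add(&l->next, 1);
 *
 *			while (ACCESS_ONCE(l->owner) != t)
 *				cpu_relax();
 *		}
 *		static void ticket_unlock(struct ticket_lock *l)
 *		{
 *			// plain store is ordered on x86; add a release
 *			// barrier for other archs
 *			l->owner++;
 *		}
 *	the PDR problem is visible here: with no per-waiter qnode, a
 *	preempted waiter can only be found by checking every vcore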
 |  * | 
 |  * PREEMPTS: | 
 |  * better preempt record tracking? | 
 |  * 	i just hacked some event-intercept and timestamp code together | 
 |  * 	maybe put it in the event library? | 
 |  * 	the timestamps definitely helped debugging | 
 |  * | 
 |  * is it true that if uthread code never spins outside a PDR lock, then it | 
 |  * doesn't need preemption IPIs?  (just someone checks the event at some | 
 |  * point).  | 
 |  * 	think so: so long as you make progress and when you aren't, you | 
 |  * 	check events (like if a uthread blocks on something and enters VC | 
 |  * 	ctx) | 
 |  * adjusting the number of workers, whether vcores or uthreads | 
 |  * - if you have more lockers than cores: | 
 * 	- spinpdr: a worker will get starved (akaros) (without 2LS support)
 * 		- running this from uth context will cause a handle_events() call
 |  * 	- mcspdr will require the kernel to switch (akaros) | 
 |  * 	- spin (akaros) might DL (o/w nothing), (linux) poor perf | 
 |  * 	- mcs (akaros) will DL, (linux) poor perf | 
 |  * 	- poor perf (latency spikes) comes from running the wrong thread | 
 |  * 	sometimes | 
 |  * 	- deadlock comes from the lack of kernel-level context switching | 
 |  * - if we scale workers down to the number of active vcores: | 
 |  * 	- two things: the initial hit, and the steady state.  during the | 
 |  * 	initial hit, we can still deadlock, since we have more lockers than | 
 |  * 	cores | 
 |  * 		- non-pdr (akaros) could deadlock in the initial hit | 
 |  * 		- (akaros) steady state, everything is normal (just fewer cores) | 
 * 	- how can we adjust this in linux?
 * 		- if you know how many cores you have, then futex wait the
 * 		others (see the sketch at the end of this section)
 |  * 		- need some way to wake them back up | 
 |  * 		- if you do this in userspace, you might need something PDR-like | 
 |  * 		to handle when the "2LS" code gets preempted | 
 |  * 	- as mentioned above, the problem in akaros is that the lock/unlock | 
 |  * 	might be happening too fast to get into the steady-state and recover | 
 |  * 	from the initial preemption | 
 |  * - one of our benefits is that we can adapt in userspace, with userspace | 
 |  * knowledge, under any circumstance. | 
 |  * 	- we have the deadlock windows (forcing PDR). | 
 |  * 	- in return, we can do this adaptation in userspace | 
 |  * 	- and (arguably) anyone who does this in userspace will need PDR | 
 |  * | 
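 * a rough sketch of the futex idea from above (the helpers, nr_active, and
 * gen are made up for illustration; needs sys/syscall.h, linux/futex.h,
 * and limits.h):
 * 	static int nr_active;	// nr of workers allowed to run
 * 	static int gen;		// futex word, bumped on each wakeup
 *
 * 	static void park_if_excess(int id)
 * 	{
 * 		while (id >= ACCESS_ONCE(nr_active)) {
 * 			int g = ACCESS_ONCE(gen);
 *
 * 			if (id < ACCESS_ONCE(nr_active))
 * 				break;
 * 			syscall(SYS_futex, &gen, FUTEX_WAIT, g, NULL, NULL, 0);
 * 		}
 * 	}
 *
 * 	static void set_active(int n)
 * 	{
 * 		nr_active = n;
 * 		__sync_fetch_and_add(&gen, 1);
 * 		syscall(SYS_futex, &gen, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
 * 	}
 * whoever calls set_active() is effectively the "2LS" here, and as noted
 * above would itself need something PDR-like if it can get preempted
 *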
 |  * MEASUREMENT (user/parlib/measure.c) | 
 |  * 	extract into its own library, for linux apps | 
 |  * 	print out raw TSC times?  might help sync up diff timelines | 
 |  * 	Need more latency bins, spinlocks vary too much | 
 |  * 	maybe we need better high/low too, since this hist looks bad too | 
 |  * 		or not center on the average? | 
 * 		for printing, it's hard to know without already binning.
 |  * 		maybe bin once (latency?), then use that to adjust the hist? | 
 |  * | 
 |  * 	Had this on a spinlock: | 
 |  * 	[      32 -    35656] 1565231: | 
 |  * 	(less than 200 intermediate) | 
 |  *      [  286557 - 20404788]   65298: * | 
 |  * | 
 |  * 	Samples per dot: 34782 | 
 |  * 	Total samples: 1640606 | 
 |  * 	Avg time   : 96658 | 
 |  * 	Stdev time : 604064.440882 | 
 |  * 	Coef Var   : 6.249503 | 
 |  * 		High coeff of var with serious outliers, adjusted bins | 
 |  * 		50/75/90/99: 33079 / 33079 / 33079 / 290219 (-<860) | 
 |  * 		Min / Max  : 32 / 20404788 | 
 |  * 	was 50/75/90 really within 860 of each other? | 
 |  * | 
 |  * 	when we are preempted and don't even attempt anything, say for 10ms, it | 
 |  * 	actually doesn't hurt our 50/75/90/99 too much.  we have a ridiculous | 
 |  * 	stddev and max, and high average, but there aren't any additional | 
 |  * 	attempts at locking to mess with the attempt-latency.  Only nr_vcores | 
 |  * 	requests are in flight during the preemption, but we can spit out around | 
 |  * 	5000 per ms when we aren't preempted. | 
 |  * | 
 |  */ | 
 |  | 
 | const char *argp_program_version = "lock_test v0.1475263"; | 
 | const char *argp_program_bug_address = "<akaros+subscribe@googlegroups.com>"; | 
 |  | 
 | static char doc[] = "lock_test -- spinlock benchmarking"; | 
 | static char args_doc[] = "-w NUM -l NUM -t LOCK"; | 
 |  | 
 | #define OPT_VC_CTX 1 | 
 | #define OPT_ADJ_WORKERS 2 | 
 |  | 
 | static struct argp_option options[] = { | 
 | 	{"workers",	'w', "NUM",	OPTION_NO_USAGE, "Number of threads/cores"}, | 
 | 	{0, 0, 0, 0, ""}, | 
 | 	{"loops",	'l', "NUM",	OPTION_NO_USAGE, "Number of loops per worker"}, | 
 | 	{0, 0, 0, 0, ""}, | 
 | 	{"type",	't', "LOCK",OPTION_NO_USAGE, "Type of lock to use.  " | 
 | 	                                             "Options:\n" | 
 | 	                                             "\tmcs\n" | 
 | 	                                             "\tmcscas\n" | 
 | 	                                             "\tmcspdr\n" | 
 | 	                                             "\tmcspdro\n" | 
 | 	                                             "\t__mcspdro\n" | 
 | 	                                             "\tspin\n" | 
 | 	                                             "\tspinpdr"}, | 
 | 	{0, 0, 0, 0, "Other options (not mandatory):"}, | 
 | 	{"adj_workers",	OPT_ADJ_WORKERS, 0,	0, | 
 | 	                                       "Adjust workers such that the " | 
 | 	                                       "number of workers equals the " | 
 | 	                                       "number of vcores"}, | 
 | 	{"vc_ctx",	OPT_VC_CTX, 0,	0, "Run threads in mock-vcore context"}, | 
 | 	{0, 0, 0, 0, ""}, | 
 | 	{"hold",	'h', "NSEC",	0, "nsec to hold the lock"}, | 
 | 	{"delay",	'd', "NSEC",	0, "nsec to delay between grabs"}, | 
 | 	{"print",	'p', "ROWS",	0, "Print ROWS of optional measurements"}, | 
 | 	{"outfile",	'o', "FILE",	0, "Print ROWS of optional measurements"}, | 
 | 	{ 0 } | 
 | }; | 
 |  | 
 | struct prog_args { | 
 | 	int			nr_threads; | 
 | 	int			nr_loops; | 
 | 	int			hold_time; | 
 | 	int			delay_time; | 
 | 	int			nr_print_rows; | 
 | 	bool			fake_vc_ctx; | 
 | 	bool			adj_workers; | 
 | 	char			*outfile_path; | 
 | 	void *(*lock_type)(void *arg); | 
 | }; | 
 | struct prog_args pargs = {0}; | 
 |  | 
 | /* Globals */ | 
 | struct time_stamp { | 
 | 	uint64_t pre; | 
 | 	uint64_t acq; | 
 | 	uint64_t un; | 
 | 	bool valid; | 
 | }; | 
 | struct time_stamp **times; | 
 | bool run_locktest = TRUE; | 
 | pthread_barrier_t start_test; | 
 |  | 
 | /* Locking functions.  Define globals here, init them in main (if possible), and | 
 |  * use the lock_func() macro to make your thread func. */ | 
 |  | 
 | #define lock_func(lock_name, lock_cmd, unlock_cmd)                             \ | 
 | void *lock_name##_thread(void *arg)                                            \ | 
 | {                                                                              \ | 
 | 	long thread_id = (long)arg;                                            \ | 
 | 	int hold_time = ACCESS_ONCE(pargs.hold_time);                          \ | 
 | 	int delay_time = ACCESS_ONCE(pargs.delay_time);                        \ | 
 | 	int nr_loops = ACCESS_ONCE(pargs.nr_loops);                            \ | 
 | 	bool fake_vc_ctx = ACCESS_ONCE(pargs.fake_vc_ctx);                     \ | 
 | 	bool adj_workers = ACCESS_ONCE(pargs.adj_workers);                     \ | 
 | 	uint64_t pre_lock, acq_lock, un_lock;                                  \ | 
 | 	struct time_stamp *this_time;                                          \ | 
 | 	struct mcs_lock_qnode mcs_qnode = MCS_QNODE_INIT;                      \ | 
 | 	struct mcs_pdro_qnode pdro_qnode = MCSPDRO_QNODE_INIT;                 \ | 
 | 	int i;                                                                 \ | 
	/* guessing a unique vcoreid for the __mcspdro test.  if the
	 * program gets preempted for that test, things may go nuts */         \
	pdro_qnode.vcoreid = (thread_id + 1) % pargs.nr_threads;               \
 | 	/* Wait til all threads are created.  Ideally, I'd like to busywait | 
 | 	 * unless absolutely critical to yield */                              \ | 
 | 	pthread_barrier_wait(&start_test);                                     \ | 
 | 	if (fake_vc_ctx) {                                                     \ | 
 | 		/* tells the kernel / other vcores we're in vc ctx */          \ | 
 | 		uth_disable_notifs();                                          \ | 
 | 		/* tricks ourselves into believing we're in vc ctx */          \ | 
 | 		__vcore_context = TRUE;                                        \ | 
 | 	}                                                                      \ | 
 | 	for (i = 0; i < nr_loops; i++) {                                       \ | 
 | 		if (!run_locktest)                                             \ | 
 | 			break;                                                 \ | 
 | 		pre_lock = read_tsc_serialized();                              \ | 
 |                                                                                \ | 
 | 		lock_cmd                                                       \ | 
 |                                                                                \ | 
 | 		acq_lock = read_tsc_serialized();                              \ | 
 | 		if (hold_time)                                                 \ | 
 | 			ndelay(hold_time);                                     \ | 
 |                                                                                \ | 
 | 		unlock_cmd                                                     \ | 
 |                                                                                \ | 
 | 		un_lock = read_tsc_serialized();                               \ | 
		this_time = &times[thread_id][i];                              \
 | 		this_time->pre = pre_lock;                                     \ | 
 | 		this_time->acq = acq_lock;                                     \ | 
 | 		this_time->un = un_lock;                                       \ | 
 | 		/* Can turn these on/off to control which samples we gather */ \ | 
 | 		this_time->valid = TRUE;                                       \ | 
 | 		/* this_time->valid = (num_vcores() == max_vcores());  */      \ | 
 |                                                                                \ | 
 | 		if (delay_time)                                                \ | 
 | 			ndelay(delay_time);                                    \ | 
 | 		/* worker thread ids are 0..n-1.  if we're one of the threads | 
 | 		 * that's beyond the VC count, we yield. */                    \ | 
 | 		if (adj_workers && num_vcores() < thread_id + 1) {             \ | 
 | 			if (fake_vc_ctx) {                                     \ | 
 | 				__vcore_context = FALSE;                       \ | 
 | 				uth_enable_notifs();                           \ | 
 | 			}                                                      \ | 
 | 			/* we'll come back up once we have enough VCs running*/\ | 
 | 			pthread_yield();                                       \ | 
 | 			if (fake_vc_ctx) {                                     \ | 
 | 				uth_disable_notifs();                          \ | 
 | 				__vcore_context = TRUE;                        \ | 
 | 			}                                                      \ | 
 | 		}                                                              \ | 
 | 		cmb();                                                         \ | 
 | 	}                                                                      \ | 
 | 	/* First thread to finish stops the test */                            \ | 
 | 	run_locktest = FALSE;                                                  \ | 
 | 	if (fake_vc_ctx) {                                                     \ | 
 | 		__vcore_context = FALSE;                                       \ | 
 | 		uth_enable_notifs();                                           \ | 
 | 	}                                                                      \ | 
 | 	return (void*)(long)i;                                                 \ | 
 | } | 
 |  | 
 | #define fake_lock_func(lock_name, x1, x2)                                      \ | 
 | void *lock_name##_thread(void *arg)                                            \ | 
 | {                                                                              \ | 
 | 	printf("Lock " #lock_name " not supported!\n");                        \ | 
 | 	exit(-1);                                                              \ | 
 | } | 
 |  | 
 | spinlock_t spin_lock = SPINLOCK_INITIALIZER; | 
 | struct mcs_lock mcs_lock = MCS_LOCK_INIT; | 
 |  | 
 | /* Defines locking funcs like "mcs_thread" */ | 
 | lock_func(mcs, | 
 |           mcs_lock_lock(&mcs_lock, &mcs_qnode);, | 
 |           mcs_lock_unlock(&mcs_lock, &mcs_qnode);) | 
 | lock_func(mcscas, | 
 |           mcs_lock_lock(&mcs_lock, &mcs_qnode);, | 
 |           mcs_lock_unlock_cas(&mcs_lock, &mcs_qnode);) | 
 | lock_func(spin, | 
 |           spinlock_lock(&spin_lock);, | 
 |           spinlock_unlock(&spin_lock);) | 
 |  | 
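/* To add a new lock type: define its globals here, instantiate the thread
 * function with lock_func(), and add a matching case to parse_opt()'s 't'
 * handler (plus a line in the --type help text).  A hypothetical example,
 * for a ticket lock that is not implemented in this file:
 *
 *	struct ticket_lock tkt_lock = TICKET_LOCK_INIT;
 *	lock_func(ticket,
 *	          ticket_lock(&tkt_lock);,
 *	          ticket_unlock(&tkt_lock);)
 *
 * parse_opt() would then map "-t ticket" to ticket_thread. */
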
 | #ifdef __ros__ | 
 | struct spin_pdr_lock spdr_lock = SPINPDR_INITIALIZER; | 
 | struct mcs_pdr_lock mcspdr_lock; | 
 | struct mcs_pdro_lock mcspdro_lock = MCSPDRO_LOCK_INIT; | 
 |  | 
 | lock_func(mcspdr, | 
 |           mcs_pdr_lock(&mcspdr_lock);, | 
 |           mcs_pdr_unlock(&mcspdr_lock);) | 
 | lock_func(mcspdro, | 
 |           mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);, | 
 |           mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);) | 
 | lock_func(__mcspdro, | 
 |           __mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);, | 
 |           __mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);) | 
 | lock_func(spinpdr, | 
 |           spin_pdr_lock(&spdr_lock);, | 
 |           spin_pdr_unlock(&spdr_lock);) | 
 | #else | 
 |  | 
 | fake_lock_func(mcspdr, 0, 0); | 
 | fake_lock_func(mcspdro, 0, 0); | 
 | fake_lock_func(__mcspdro, 0, 0); | 
 | fake_lock_func(spinpdr, 0, 0); | 
 |  | 
 | #endif | 
 |  | 
 | static int get_acq_latency(void **data, int i, int j, uint64_t *sample) | 
 | { | 
 | 	struct time_stamp **times = (struct time_stamp**)data; | 
 | 	/* 0 for initial time means we didn't measure */ | 
 | 	if (times[i][j].pre == 0) | 
 | 		return -1; | 
	/* can optionally throw out invalid times (keep this in sync with the
	 * lock_func macro, based on what you want to measure). */
 | 	#if 0 | 
 | 	if (!times[i][j].valid) | 
 | 		return -1; | 
 | 	#endif | 
 | 	*sample = times[i][j].acq - times[i][j].pre - get_tsc_overhead(); | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int get_hld_latency(void **data, int i, int j, uint64_t *sample) | 
 | { | 
 | 	struct time_stamp **times = (struct time_stamp**)data; | 
 | 	/* 0 for initial time means we didn't measure */ | 
 | 	if (times[i][j].pre == 0) | 
 | 		return -1; | 
 | 	*sample = times[i][j].un - times[i][j].acq - get_tsc_overhead(); | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int get_acq_timestamp(void **data, int i, int j, uint64_t *sample) | 
 | { | 
 | 	struct time_stamp **times = (struct time_stamp**)data; | 
 | 	/* 0 for initial time means we didn't measure */ | 
 | 	if (times[i][j].pre == 0) | 
 | 		return -1; | 
 | 	*sample = times[i][j].acq; | 
 | 	return 0; | 
 | } | 
 |  | 
 | #ifdef __ros__ | 
 |  | 
 | /* Lousy event intercept.  build something similar in the event library? */ | 
 | #define MAX_NR_EVENT_TRACES 1000 | 
 | uint64_t preempts[MAX_NR_EVENT_TRACES] = {0}; | 
 | uint64_t indirs[MAX_NR_EVENT_TRACES] = {0}; | 
 | atomic_t preempt_idx; | 
 | atomic_t indir_idx; | 
 | atomic_t preempt_cnt; | 
 | atomic_t indir_cnt; | 
 |  | 
 | static void trace_preempt(struct event_msg *ev_msg, unsigned int ev_type, | 
 |                           void *data) | 
 | { | 
 | 	unsigned long my_slot = atomic_fetch_and_add(&preempt_idx, 1); | 
 |  | 
 | 	if (my_slot < MAX_NR_EVENT_TRACES) | 
 | 		preempts[my_slot] = read_tsc(); | 
 | 	atomic_inc(&preempt_cnt); | 
 | } | 
 |  | 
 | static void trace_indir(struct event_msg *ev_msg, unsigned int ev_type, | 
 |                         void *data) | 
 | { | 
	unsigned long my_slot = atomic_fetch_and_add(&indir_idx, 1);

	if (my_slot < MAX_NR_EVENT_TRACES)
 | 		indirs[my_slot] = read_tsc(); | 
 | 	atomic_inc(&indir_cnt); | 
 | } | 
 |  | 
 | /* Helper, prints out the preempt trace */ | 
 | static void print_preempt_trace(uint64_t starttsc, int nr_print_rows) | 
 | { | 
 | 	/* reusing nr_print_rows for the nr preempt/indirs rows as well */ | 
 |  | 
 | 	int preempt_rows = MIN(MAX_NR_EVENT_TRACES, nr_print_rows); | 
 | 	if (pargs.fake_vc_ctx) { | 
 | 		printf("No preempt trace available when faking vc ctx\n"); | 
 | 		return; | 
 | 	} | 
 | 	printf("\n"); | 
 | 	printf("Nr Preempts: %d\n", atomic_read(&preempt_cnt)); | 
 | 	printf("Nr Indirs  : %d\n", atomic_read(&indir_cnt)); | 
 | 	if (preempt_rows) | 
 | 		printf("Preempt/Indir events:\n-----------------\n"); | 
 | 	for (int i = 0; i < preempt_rows; i++) { | 
 | 		if (preempts[i]) | 
 | 			printf("Preempt %3d at %6llu\n", | 
 | 			       i, tsc2msec(preempts[i] - starttsc)); | 
 | 	} | 
 | 	for (int i = 0; i < preempt_rows; i++) { | 
 | 		if (indirs[i]) | 
 | 			printf("Indir   %3d at %6llu\n", | 
 | 			       i, tsc2msec(indirs[i] - starttsc)); | 
 | 	} | 
 | } | 
 |  | 
 | /* Make sure we have enough VCs for nr_threads, pref 1:1 at the start */ | 
 | static void os_prep_work(pthread_t *worker_threads, int nr_threads) | 
 | { | 
 | 	if (nr_threads > max_vcores()) { | 
 | 		printf("Too many threads (%d) requested, can't get more than %d vc\n", | 
 | 		       nr_threads, max_vcores()); | 
 | 		exit(-1); | 
 | 	} | 
 | 	atomic_init(&preempt_idx, 0); | 
 | 	atomic_init(&indir_idx, 0); | 
 | 	atomic_init(&preempt_cnt, 0); | 
 | 	atomic_init(&indir_cnt, 0); | 
 | 	parlib_never_yield = TRUE; | 
 | 	pthread_need_tls(FALSE); | 
 | 	pthread_mcp_init();		/* gives us one vcore */ | 
 | 	register_ev_handler(EV_VCORE_PREEMPT, trace_preempt, 0); | 
 | 	register_ev_handler(EV_CHECK_MSGS, trace_indir, 0); | 
 | 	if (pargs.fake_vc_ctx) { | 
 | 		/* need to disable events when faking vc ctx.  since we're | 
 | 		 * looping and not handling events, we could run OOM */ | 
 | 		clear_kevent_q(EV_VCORE_PREEMPT); | 
 | 		clear_kevent_q(EV_CHECK_MSGS); | 
 | 	} | 
 | 	vcore_request_total(nr_threads); | 
 | 	parlib_never_vc_request = TRUE; | 
 | 	for (int i = 0; i < nr_threads; i++) { | 
 | 		printd("Vcore %d mapped to pcore %d\n", i, | 
 | 		       __procinfo.vcoremap[i].pcoreid); | 
 | 	} | 
 | } | 
 |  | 
 | static void os_post_work(pthread_t *worker_threads, int nr_threads) | 
 | { | 
 | } | 
 |  | 
 | #endif | 
 |  | 
 | /* Argument parsing */ | 
 | static error_t parse_opt (int key, char *arg, struct argp_state *state) | 
 | { | 
 | 	struct prog_args *pargs = state->input; | 
 |  | 
 | 	switch (key) { | 
 | 	case 'w': | 
 | 		pargs->nr_threads = atoi(arg); | 
 | 		if (pargs->nr_threads < 0) { | 
 | 			printf("Negative nr_threads...\n\n"); | 
 | 			argp_usage(state); | 
 | 		} | 
 | 		break; | 
 | 	case 'l': | 
 | 		pargs->nr_loops = atoi(arg); | 
 | 		if (pargs->nr_loops < 0) { | 
 | 			printf("Negative nr_loops...\n\n"); | 
 | 			argp_usage(state); | 
 | 		} | 
 | 		break; | 
 | 	case OPT_ADJ_WORKERS: | 
 | 		pargs->adj_workers = TRUE; | 
 | 		break; | 
 | 	case OPT_VC_CTX: | 
 | 		pargs->fake_vc_ctx = TRUE; | 
 | 		break; | 
 | 	case 'h': | 
 | 		pargs->hold_time = atoi(arg); | 
 | 		if (pargs->hold_time < 0) { | 
 | 			printf("Negative hold_time...\n\n"); | 
 | 			argp_usage(state); | 
 | 		} | 
 | 		break; | 
 | 	case 'd': | 
 | 		pargs->delay_time = atoi(arg); | 
 | 		if (pargs->delay_time < 0) { | 
 | 			printf("Negative delay_time...\n\n"); | 
 | 			argp_usage(state); | 
 | 		} | 
 | 		break; | 
 | 	case 'o': | 
 | 		pargs->outfile_path = arg; | 
 | 		break; | 
 | 	case 'p': | 
 | 		pargs->nr_print_rows = atoi(arg); | 
 | 		if (pargs->nr_print_rows < 0) { | 
 | 			printf("Negative print_rows...\n\n"); | 
 | 			argp_usage(state); | 
 | 		} | 
 | 		break; | 
 | 	case 't': | 
 | 		if (!strcmp("mcs", arg)) { | 
 | 			pargs->lock_type = mcs_thread; | 
 | 			break; | 
 | 		} | 
 | 		if (!strcmp("mcscas", arg)) { | 
 | 			pargs->lock_type = mcscas_thread; | 
 | 			break; | 
 | 		} | 
 | 		if (!strcmp("mcspdr", arg)) { | 
 | 			pargs->lock_type = mcspdr_thread; | 
 | 			break; | 
 | 		} | 
 | 		if (!strcmp("mcspdro", arg)) { | 
 | 			pargs->lock_type = mcspdro_thread; | 
 | 			break; | 
 | 		} | 
 | 		if (!strcmp("__mcspdro", arg)) { | 
 | 			pargs->lock_type = __mcspdro_thread; | 
 | 			break; | 
 | 		} | 
 | 		if (!strcmp("spin", arg)) { | 
 | 			pargs->lock_type = spin_thread; | 
 | 			break; | 
 | 		} | 
 | 		if (!strcmp("spinpdr", arg)) { | 
 | 			pargs->lock_type = spinpdr_thread; | 
 | 			break; | 
 | 		} | 
 | 		printf("Unknown locktype %s\n\n", arg); | 
 | 		argp_usage(state); | 
 | 		break; | 
 | 	case ARGP_KEY_ARG: | 
 | 		printf("Warning, extra argument %s ignored\n\n", arg); | 
 | 		break; | 
 | 	case ARGP_KEY_END: | 
 | 		if (!pargs->nr_threads) { | 
 | 			printf("Must select a number of threads.\n\n"); | 
 | 			argp_usage(state); | 
 | 			break; | 
 | 		} | 
 | 		if (!pargs->nr_loops) { | 
 | 			printf("Must select a number of loops.\n\n"); | 
 | 			argp_usage(state); | 
 | 			break; | 
 | 		} | 
 | 		if (!pargs->lock_type) { | 
 | 			printf("Must select a type of lock.\n\n"); | 
 | 			argp_usage(state); | 
 | 			break; | 
 | 		} | 
 | 		break; | 
 | 	default: | 
 | 		return ARGP_ERR_UNKNOWN; | 
 | 	} | 
 | 	return 0; | 
 | } | 
 |  | 
 | static struct argp argp = {options, parse_opt, args_doc, doc}; | 
 |  | 
 | int main(int argc, char** argv) | 
 | { | 
 | 	pthread_t *worker_threads; | 
 | 	void **loops_done; | 
 | 	struct timeval start_tv = {0}; | 
 | 	struct timeval end_tv = {0}; | 
 | 	long usec_diff, total_loops = 0; | 
 | 	uint64_t starttsc; | 
 | 	int nr_threads, nr_loops; | 
 | 	FILE *outfile; | 
 | 	struct sample_stats acq_stats, hld_stats; | 
 |  | 
 | 	argp_parse(&argp, argc, argv, 0, 0, &pargs); | 
 | 	nr_threads = pargs.nr_threads; | 
 | 	nr_loops = pargs.nr_loops; | 
#ifdef __ros__
	mcs_pdr_init(&mcspdr_lock);
#endif
 |  | 
 | 	if (pargs.outfile_path) { | 
 | 		/* RDWR, CREAT, TRUNC, O666 */ | 
 | 		outfile = fopen(pargs.outfile_path, "w+"); | 
 | 		if (!outfile) { | 
 | 			perror("outfile"); | 
 | 			exit(-1); | 
 | 		} | 
 | 	} | 
 | 	worker_threads = malloc(sizeof(pthread_t) * nr_threads); | 
 | 	if (!worker_threads) { | 
 | 		perror("pthread_t malloc failed:"); | 
 | 		exit(-1); | 
 | 	} | 
 | 	loops_done = malloc(sizeof(void*) * nr_threads); | 
 | 	if (!loops_done) { | 
 | 		perror("loops_done malloc failed"); | 
 | 		exit(-1); | 
 | 	} | 
 | 	printf("Making %d workers, %d loops each, %sadapting workers to vcores, and %sfaking vcore context\n", | 
 | 	       nr_threads, nr_loops, | 
 | 	       pargs.adj_workers ? "" : "not ", | 
 | 	       pargs.fake_vc_ctx ? "" : "not "); | 
 | 	pthread_barrier_init(&start_test, NULL, nr_threads); | 
 |  | 
 | 	times = malloc(sizeof(struct time_stamp *) * nr_threads); | 
 | 	assert(times); | 
 | 	for (int i = 0; i < nr_threads; i++) { | 
 | 		times[i] = malloc(sizeof(struct time_stamp) * nr_loops); | 
 | 		if (!times[i]) { | 
 | 			perror("Record keeping malloc"); | 
 | 			exit(-1); | 
 | 		} | 
 | 		memset(times[i], 0, sizeof(struct time_stamp) * nr_loops); | 
 | 	} | 
 | 	printf("Record tracking takes %ld bytes of memory\n", | 
 | 	       nr_threads * nr_loops * sizeof(struct time_stamp)); | 
 | 	os_prep_work(worker_threads, nr_threads);/* ensure we have enough VCs */ | 
 | 	/* Doing this in MCP ctx, so we might have been getting a few preempts | 
 | 	 * already.  Want to read start before the threads pass their barrier */ | 
 | 	starttsc = read_tsc(); | 
 | 	/* create and join on yield */ | 
 | 	for (long i = 0; i < nr_threads; i++) { | 
 | 		if (pthread_create(&worker_threads[i], NULL, pargs.lock_type, | 
 | 		                   (void*)i)) | 
 | 			perror("pth_create failed"); | 
 | 	} | 
 | 	os_post_work(worker_threads, nr_threads); | 
 | 	if (gettimeofday(&start_tv, 0)) | 
 | 		perror("Start time error..."); | 
 | 	for (int i = 0; i < nr_threads; i++) { | 
 | 		pthread_join(worker_threads[i], &loops_done[i]); | 
 | 	} | 
 | 	if (gettimeofday(&end_tv, 0)) | 
 | 		perror("End time error..."); | 
 |  | 
 | 	printf("Acquire times (TSC Ticks)\n---------------------------\n"); | 
 | 	acq_stats.get_sample = get_acq_latency; | 
 | 	compute_stats((void**)times, nr_threads, nr_loops, &acq_stats); | 
 |  | 
 | 	printf("Held times (from acq til rel done) (TSC Ticks)\n------\n"); | 
 | 	hld_stats.get_sample = get_hld_latency; | 
 | 	compute_stats((void**)times, nr_threads, nr_loops, &hld_stats); | 
 |  | 
 | 	usec_diff = (end_tv.tv_sec - start_tv.tv_sec) * 1000000 + | 
 | 	            (end_tv.tv_usec - start_tv.tv_usec); | 
 | 	printf("Time to run: %ld usec\n", usec_diff); | 
 |  | 
 | 	printf("\nLock throughput:\n-----------------\n"); | 
 | 	/* throughput for the entire duration (in ms), 1ms steps.  print as many | 
 | 	 * steps as they ask for (up to the end of the run). */ | 
 | 	print_throughput((void**)times, usec_diff / 1000 + 1, msec2tsc(1), | 
 | 	                 pargs.nr_print_rows, | 
 | 	                 starttsc, nr_threads, | 
 | 	                 nr_loops, get_acq_timestamp); | 
 | 	print_preempt_trace(starttsc, pargs.nr_print_rows); | 
 |  | 
 | 	for (int i = 0; i < nr_threads; i++) { | 
 | 		total_loops += (long)loops_done[i]; | 
 | 		if (!loops_done[i]) | 
 | 			printf("WARNING: thread %d performed 0 loops!\n", i); | 
 | 	} | 
 | 	printf("Average number of loops done, per thread: %ld\n", | 
 | 	       total_loops / nr_threads); | 
 | 	for (int i = 0; i < nr_threads; i++) | 
 | 		printf("\tThread %d performed %lu loops\n", | 
 | 		       i, (long)loops_done[i]); | 
 |  | 
 | 	if (pargs.outfile_path) { | 
 | 		fprintf(outfile, "#"); | 
 | 		for (char **arg = argv; *arg; arg++) | 
 | 			fprintf(outfile, " %s", *arg); | 
 | 		fprintf(outfile, "\n"); | 
 | 		fprintf(outfile, "# thread_id attempt pre acq(uire) un(lock) " | 
 | 		                 "tsc_overhead\n"); | 
 | 		fprintf(outfile, | 
 | 			"# acquire latency: acq - pre - tsc_overhead\n"); | 
 | 		fprintf(outfile, "# hold time: un - acq - tsc_overhead\n"); | 
 | 		fprintf(outfile, "# tsc_frequency %llu\n", get_tsc_freq()); | 
 | 		fprintf(outfile, | 
 | 			"# tsc_overhead is 0 on linux, hard code it with a value from akaros\n"); | 
 | 		for (int i = 0; i < nr_threads; i++) { | 
 | 			for (int j = 0; j < nr_loops; j++) { | 
				struct time_stamp *ts = &times[i][j];
 | 				if (!ts->pre) | 
 | 					break; /* empty record */ | 
 | 				fprintf(outfile, "%d %d %llu %llu %llu %llu\n", | 
 | 					i, j, ts->pre, ts->acq, ts->un, | 
 | 					get_tsc_overhead()); | 
 | 			} | 
 | 		} | 
 | 		fclose(outfile); | 
 | 	} | 
 | 	printf("Done, exiting\n"); | 
 | } |