user/benchutil/measure.c - upstream - Git at Google

 /* Copyright (c) 2013 The Regents of the University of California
  * Barret Rhoden <brho@cs.berkeley.edu>
  * See LICENSE for details.
  *
  * Userspace functions for various measurements.
  *
  * For now, this is built into parlib.  We can pull it out in the future.  Many
  * of the larger functions are in flux. */

 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/param.h>
 #ifdef __ros__
 #include <parlib/tsc-compat.h>
 #include <benchutil/measure.h>
 #endif /* __ros__ */

 /* Basic stats computation and printing.
  *
  * All of these expect a 2D collection of samples, where the first array is an
  * array of arrays of samples.  The first array's members are something like
  * per-thread samples, where each thread fills in its
  * data[thread_id][sample_number].  The samples should be ordered in
  * chronological order.  Ultimately, the sample needs to produce a uint64_t
  * (e.g. TSC tick). */

 static void init_stats(struct sample_stats *stats)
 {
 	stats->avg_time = 0;
 	stats->var_time = 0;
 	stats->max_time = 0;
 	stats->min_time = UINT64_MAX;
 	stats->lat_50 = 0;
 	stats->lat_75 = 0;
 	stats->lat_90 = 0;
 	stats->lat_99 = 0;
 	stats->total_samples = 0;
 }

 /* Could have options for printing for how many rows we want, how much we want
  * to trim the max/min, and how many samples per bin. */
 void compute_stats(void **data, int nr_i, int nr_j, struct sample_stats *stats)
 {
 	uint64_t sample_time, hist_max_time, hist_min_time,
 	         lat_max_time, lat_min_time;
 	size_t hist_idx, lat_idx, hist_bin_sz, lat_bin_sz;
 	float accum_samples = 0.0, coef_var;
 	unsigned int *hist_times;
 	unsigned int *lat_times;
 	unsigned int nr_hist_bins = 75;	/* looks reasonable when printed */
 	unsigned int nr_lat_bins = 500;	/* affects granularity of lat perc */
 	unsigned int max_dots_per_row = 45; /* looks good with 80-wide col */
 	unsigned int max_hist_bin = 0;
 	#define HI_COEF_VAR 1.0

 	init_stats(stats);
 	/* First pass, figure out the min, max, avg, etc. */
 	for (int i = 0; i < nr_i; i++) {
 		for (int j = 0; j < nr_j; j++) {
 			/* get_sample returns 0 on success.  o/w, skip the
 			 * sample */
 			/* depending on semantics, we could break */
 			if (stats->get_sample(data, i, j, &sample_time))
 				continue;
 			stats->total_samples++;
 			stats->avg_time += sample_time;
 			stats->max_time = sample_time > stats->max_time ?
 				sample_time : stats->max_time;
 			stats->min_time = sample_time < stats->min_time ?
 				sample_time : stats->min_time;
 		}
 	}
 	if (stats->total_samples < 2) {
 		printf("Not enough samples (%llu) for avg and var\n",
 		       stats->total_samples);
 		return;
 	}
 	stats->avg_time /= stats->total_samples;
 	/* Second pass, compute the variance.  Want to do this before the
 	 * histograms, so we can trim the serious outliers */
 	for (int i = 0; i < nr_i; i++) {
 		for (int j = 0; j < nr_j; j++) {
 			if (stats->get_sample(data, i, j, &sample_time))
 				continue;
 			/* var: (sum_i=1..n { (x_i - xbar)^2 }) / (n - 1) */
 			stats->var_time += (sample_time - stats->avg_time) *
 			                   (sample_time - stats->avg_time);
 		}
 	}
 	stats->var_time /= stats->total_samples - 1;
 	/* We have two histogram structures.  The main one is for printing, and
 	 * the other is for computing latency percentiles.  The only real diff
 	 * btw the two is the number of bins.  The latency one has a lot more,
 	 * for finer granularity, and the regular one has fewer for better
 	 * printing.
 	 *
 	 * Both have the same max and min bin values.  Any excesses get put in
 	 * the smallest or biggest bin.  This keeps the granularity reasonable
 	 * in the face of very large outliers.  Normally, I trim off anything
 	 * outside 3 stddev.
 	 *
 	 * High variation will throw off our histogram bins, so we adjust.  A
 	 * coef_var > 1 is considered high variance.  The numbers I picked are
 	 * just heuristics to catch SMM interference and make the output look
 	 * nice. */
 	coef_var = sqrt(stats->var_time) / stats->avg_time;
 	if (coef_var > HI_COEF_VAR) {
 		hist_max_time = stats->avg_time * 3;
 		hist_min_time = stats->avg_time / 3;
 	} else {	/* 'normal' data */
 		/* trimming the printable hist at 3 stddevs, which for normal
 		 * data is 99.7% of the data.  For most any data, it gets 89%
 		 * (Chebyshev's inequality) */
 		hist_max_time = stats->avg_time + 3 * sqrt(stats->var_time);
 		hist_min_time = stats->avg_time - 3 * sqrt(stats->var_time);
 		if (hist_min_time > hist_max_time)
 			hist_min_time = 0;
 	}
 	lat_max_time = hist_max_time;
 	lat_min_time = hist_min_time;
 	hist_bin_sz = (hist_max_time - hist_min_time) / nr_hist_bins + 1;
 	lat_bin_sz = (lat_max_time - lat_min_time) / nr_lat_bins + 1;
 	hist_times = malloc(sizeof(unsigned int) * nr_hist_bins);
 	lat_times = malloc(sizeof(unsigned int) * nr_lat_bins);
 	if (!hist_times || !lat_times) {
 		perror("compute_stats failed to alloc hist/lat arrays:");
 		free(hist_times);
 		free(lat_times);
 		return;
 	}
 	memset(hist_times, 0, sizeof(unsigned int) * nr_hist_bins);
 	memset(lat_times, 0, sizeof(unsigned int) * nr_lat_bins);
 	/* third pass, fill the bins for the histogram and latencies */
 	for (int i = 0; i < nr_i; i++) {
 		for (int j = 0; j < nr_j; j++) {
 			if (stats->get_sample(data, i, j, &sample_time))
 				continue;
 			/* need to shift, offset by min_time.  anything too
 			 * small is 0 and will go into the first bin.  anything
 			 * too large will go into the last bin. */
 			lat_idx = sample_time < lat_min_time
 			          ? 0
 			          : (sample_time - lat_min_time) / lat_bin_sz;
 			lat_idx = MIN(lat_idx, nr_lat_bins - 1);
 			lat_times[lat_idx]++;
 			hist_idx = sample_time < hist_min_time
 			           ? 0
 				   : (sample_time - hist_min_time) /
 				     hist_bin_sz;
 			hist_idx = MIN(hist_idx, nr_hist_bins - 1);
 			hist_times[hist_idx]++;
 			/* useful for formatting the ***s */
 			max_hist_bin = (hist_times[hist_idx] > max_hist_bin)
 			               ? hist_times[hist_idx]
 			               : max_hist_bin;
 		}
 	}
 	/* Compute latency percentiles */
 	for (int i = 0; i < nr_lat_bins; i++) {
 		accum_samples += lat_times[i];
 		/* (i + 1), since we've just accumulated one bucket's worth */
 		if (!stats->lat_50 &&
 		    accum_samples / stats->total_samples > 0.50)
 			stats->lat_50 = (i + 1) * lat_bin_sz + lat_min_time;
 		if (!stats->lat_75 &&
 		    accum_samples / stats->total_samples > 0.75)
 			stats->lat_75 = (i + 1) * lat_bin_sz + lat_min_time;
 		if (!stats->lat_90 &&
 		    accum_samples / stats->total_samples > 0.90)
 			stats->lat_90 = (i + 1) * lat_bin_sz + lat_min_time;
 		if (!stats->lat_99 &&
 		    accum_samples / stats->total_samples > 0.99)
 			stats->lat_99 = (i + 1) * lat_bin_sz + lat_min_time;
 	}
 	for (int i = 0; i < nr_hist_bins; i++) {
 		uint64_t interval_start = i * hist_bin_sz + hist_min_time;
 		uint64_t interval_end = (i + 1) * hist_bin_sz + hist_min_time;

 		/* customize the first and last entries */
 		if (i == 0)
 			interval_start = MIN(interval_start, stats->min_time);
 		if (i == nr_hist_bins - 1) {
 			interval_end = MAX(interval_end, stats->max_time);
 			/* but not at the sake of formatting! (8 spaces) */
 			interval_end = MIN(interval_end, 99999999);
 		}
 		printf("    [%8llu - %8llu] %7d: ", interval_start,
 		       interval_end, hist_times[i]);
 		/* nr_dots = hist_times[i] * nr_dots_per_sample
 		 *         = hist_times[i] * (max_num_dots / max_hist_bin) */
 		int nr_dots = hist_times[i] * max_dots_per_row / max_hist_bin;

 		for (int j = 0; j < nr_dots; j++)
 			printf("*");
 		printf("\n");
 	}
 	printf("\n");
 	printf("Samples per dot: %d\n", max_hist_bin / max_dots_per_row);
 	printf("Total samples: %llu\n", stats->total_samples);
 	printf("Avg time   : %llu\n", stats->avg_time);
 	printf("Stdev time : %f\n", sqrt(stats->var_time));
 	printf("Coef Var   : %f\n", coef_var);
 	if (coef_var > HI_COEF_VAR)
 		printf(
 		  "\tHigh coeff of var with serious outliers, adjusted bins\n");
 	/* numbers are overestimates by at most a lat bin */
 	printf("50/75/90/99: %d / %d / %d / %d (-<%ld)\n", stats->lat_50,
 	       stats->lat_75, stats->lat_90, stats->lat_99, lat_bin_sz);
 	printf("Min / Max  : %llu / %llu\n", stats->min_time, stats->max_time);
 	printf("\n");
 	free(hist_times);
 	free(lat_times);
 }

 /* Prints the throughput of certain events over nr_steps of interval time.  Will
  * print the overall throughput of the entire time (total events / steps),
  * and print out each step up to nr_print_steps.
  *
  * Assumes a 2D data structure, where the events in each data[i][] (for a
  * specific i) are in order.  The 'nr_i'es are typically threads or something
  * similar.  nr_j would be how many samples per thread.  The func ptr should
  * return the time of the data[i][j]'th event via *sample and return 0 on
  * success, and any other value for 'no data'.  If start_time is 0, we'll start
  * the clock right before the first event. */
 void print_throughput(void **data, unsigned int nr_steps, uint64_t interval,
                       unsigned int nr_print_steps, uint64_t start_time,
                       int nr_i, int nr_j,
                       int (*get_sample)(void **data, int i, int j,
                                         uint64_t *sample))
 {
 	uint64_t time_now, sample;
 	/* next_sample[] tracks each thread's next lock that was acquired */
 	unsigned int *next_sample;
 	unsigned int *step_events;
 	unsigned int most_step_events = 1;
 	unsigned int max_dots_per_row = 45; /* looks good with 80-wide col */
 	unsigned int total_events = 0;

 	if (!nr_steps)
 		return;
 	nr_print_steps = MIN(nr_print_steps, nr_steps);
 	next_sample = malloc(sizeof(unsigned int) * nr_i);
 	step_events = malloc(sizeof(unsigned int) * nr_steps);
 	if (!next_sample || !step_events) {
 		perror("print_throughput failed alloc:");
 		free(next_sample);
 		free(step_events);
 		return;
 	}
 	memset(next_sample, 0, sizeof(unsigned int) * nr_i);
 	memset(step_events, 0, sizeof(unsigned int) * nr_steps);
 	if (start_time) {
 		time_now = start_time;
 	} else {
 		time_now = UINT64_MAX;
 		/* Set the replay to start right before the first event */
 		for (int i = 0; i < nr_i; i++) {
 			if (get_sample(data, i, 0, &sample))
 				continue;
 			time_now = MIN(time_now, sample);
 		}
 		if (time_now != 0)
 			time_now--;
 	}
 	for (int k = 0; k < nr_steps; k++) {
 		time_now += interval;
 		/* for every 'thread', we'll figure out how many events
 		 * occurred, and advance next_sample to track the next one to
 		 * consider */
 		for (int i = 0; i < nr_i; i++) {
 			/* count nr locks that have happened, advance per thread
 			 * tracker */
 			for ( ; next_sample[i] < nr_j; next_sample[i]++) {
 				/* skip this thread if it has no more data */
 				if (get_sample(data, i, next_sample[i],
 					       &sample))
 					continue;
 				/* break when we found one that hasn't happened
 				 * yet */
 				if (!(sample <= time_now))
 					break;
 				step_events[k]++;
 			}
 		}
 		total_events += step_events[k];
 		/* for dynamically scaling the *'s */
 		most_step_events = MAX(most_step_events, step_events[k]);
 	}
 	if (nr_print_steps)
 		printf("Events per dot: %d\n",
 		       most_step_events / max_dots_per_row);
 	for (int k = 0; k < nr_print_steps; k++) {
 		/* Last step isn't accurate, will only be partially full */
 		if (k == nr_steps - 1)
 			break;
 		printf("%6d: ", k);
 		printf("%6d ", step_events[k]);
 		/* nr_dots = step_events[k] * nr_dots_per_event
 		 *   = step_events[k] * (max_dots_per_row / most_step_events) */
 		int nr_dots = step_events[k] * max_dots_per_row /
 			      most_step_events;
 		for (int i = 0; i < nr_dots; i++)
 			printf("*");
 		printf("\n");
 	}
 	printf("Total events: %d, Avg events/step: %d\n", total_events,
 	       total_events / nr_steps);
 	free(next_sample);
 	free(step_events);
 }
	/* Copyright (c) 2013 The Regents of the University of California
	* Barret Rhoden <brho@cs.berkeley.edu>
	* See LICENSE for details.
	*
	* Userspace functions for various measurements.
	*
	* For now, this is built into parlib. We can pull it out in the future. Many
	* of the larger functions are in flux. */

	#include <math.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/param.h>
	#ifdef __ros__
	#include <parlib/tsc-compat.h>
	#include <benchutil/measure.h>
	#endif /* __ros__ */

	/* Basic stats computation and printing.
	*
	* All of these expect a 2D collection of samples, where the first array is an
	* array of arrays of samples. The first array's members are something like
	* per-thread samples, where each thread fills in its
	* data[thread_id][sample_number]. The samples should be ordered in
	* chronological order. Ultimately, the sample needs to produce a uint64_t
	* (e.g. TSC tick). */

	static void init_stats(struct sample_stats *stats)
	{
	stats->avg_time = 0;
	stats->var_time = 0;
	stats->max_time = 0;
	stats->min_time = UINT64_MAX;
	stats->lat_50 = 0;
	stats->lat_75 = 0;
	stats->lat_90 = 0;
	stats->lat_99 = 0;
	stats->total_samples = 0;
	}

	/* Could have options for printing for how many rows we want, how much we want
	* to trim the max/min, and how many samples per bin. */
	void compute_stats(void *data, int nr_i, int nr_j, struct sample_stats stats)
	{
	uint64_t sample_time, hist_max_time, hist_min_time,
	lat_max_time, lat_min_time;
	size_t hist_idx, lat_idx, hist_bin_sz, lat_bin_sz;
	float accum_samples = 0.0, coef_var;
	unsigned int *hist_times;
	unsigned int *lat_times;
	unsigned int nr_hist_bins = 75; /* looks reasonable when printed */
	unsigned int nr_lat_bins = 500; /* affects granularity of lat perc */
	unsigned int max_dots_per_row = 45; /* looks good with 80-wide col */
	unsigned int max_hist_bin = 0;
	#define HI_COEF_VAR 1.0

	init_stats(stats);
	/* First pass, figure out the min, max, avg, etc. */
	for (int i = 0; i < nr_i; i++) {
	for (int j = 0; j < nr_j; j++) {
	/* get_sample returns 0 on success. o/w, skip the
	* sample */
	/* depending on semantics, we could break */
	if (stats->get_sample(data, i, j, &sample_time))
	continue;
	stats->total_samples++;
	stats->avg_time += sample_time;
	stats->max_time = sample_time > stats->max_time ?
	sample_time : stats->max_time;
	stats->min_time = sample_time < stats->min_time ?
	sample_time : stats->min_time;
	}
	}
	if (stats->total_samples < 2) {
	printf("Not enough samples (%llu) for avg and var\n",
	stats->total_samples);
	return;
	}
	stats->avg_time /= stats->total_samples;
	/* Second pass, compute the variance. Want to do this before the
	* histograms, so we can trim the serious outliers */
	for (int i = 0; i < nr_i; i++) {
	for (int j = 0; j < nr_j; j++) {
	if (stats->get_sample(data, i, j, &sample_time))
	continue;
	/* var: (sum_i=1..n { (x_i - xbar)^2 }) / (n - 1) */
	stats->var_time += (sample_time - stats->avg_time) *
	(sample_time - stats->avg_time);
	}
	}
	stats->var_time /= stats->total_samples - 1;
	/* We have two histogram structures. The main one is for printing, and
	* the other is for computing latency percentiles. The only real diff
	* btw the two is the number of bins. The latency one has a lot more,
	* for finer granularity, and the regular one has fewer for better
	* printing.
	*
	* Both have the same max and min bin values. Any excesses get put in
	* the smallest or biggest bin. This keeps the granularity reasonable
	* in the face of very large outliers. Normally, I trim off anything
	* outside 3 stddev.
	*
	* High variation will throw off our histogram bins, so we adjust. A
	* coef_var > 1 is considered high variance. The numbers I picked are
	* just heuristics to catch SMM interference and make the output look
	* nice. */
	coef_var = sqrt(stats->var_time) / stats->avg_time;
	if (coef_var > HI_COEF_VAR) {
	hist_max_time = stats->avg_time * 3;
	hist_min_time = stats->avg_time / 3;
	} else { /* 'normal' data */
	/* trimming the printable hist at 3 stddevs, which for normal
	* data is 99.7% of the data. For most any data, it gets 89%
	* (Chebyshev's inequality) */
	hist_max_time = stats->avg_time + 3 * sqrt(stats->var_time);
	hist_min_time = stats->avg_time - 3 * sqrt(stats->var_time);
	if (hist_min_time > hist_max_time)
	hist_min_time = 0;
	}
	lat_max_time = hist_max_time;
	lat_min_time = hist_min_time;
	hist_bin_sz = (hist_max_time - hist_min_time) / nr_hist_bins + 1;
	lat_bin_sz = (lat_max_time - lat_min_time) / nr_lat_bins + 1;
	hist_times = malloc(sizeof(unsigned int) * nr_hist_bins);
	lat_times = malloc(sizeof(unsigned int) * nr_lat_bins);
	if (!hist_times \|\| !lat_times) {
	perror("compute_stats failed to alloc hist/lat arrays:");
	free(hist_times);
	free(lat_times);
	return;
	}
	memset(hist_times, 0, sizeof(unsigned int) * nr_hist_bins);
	memset(lat_times, 0, sizeof(unsigned int) * nr_lat_bins);
	/* third pass, fill the bins for the histogram and latencies */
	for (int i = 0; i < nr_i; i++) {
	for (int j = 0; j < nr_j; j++) {
	if (stats->get_sample(data, i, j, &sample_time))
	continue;
	/* need to shift, offset by min_time. anything too
	* small is 0 and will go into the first bin. anything
	* too large will go into the last bin. */
	lat_idx = sample_time < lat_min_time
	? 0
	: (sample_time - lat_min_time) / lat_bin_sz;
	lat_idx = MIN(lat_idx, nr_lat_bins - 1);
	lat_times[lat_idx]++;
	hist_idx = sample_time < hist_min_time
	? 0
	: (sample_time - hist_min_time) /
	hist_bin_sz;
	hist_idx = MIN(hist_idx, nr_hist_bins - 1);
	hist_times[hist_idx]++;
	/* useful for formatting the **s /
	max_hist_bin = (hist_times[hist_idx] > max_hist_bin)
	? hist_times[hist_idx]
	: max_hist_bin;
	}
	}
	/* Compute latency percentiles */
	for (int i = 0; i < nr_lat_bins; i++) {
	accum_samples += lat_times[i];
	/* (i + 1), since we've just accumulated one bucket's worth */
	if (!stats->lat_50 &&
	accum_samples / stats->total_samples > 0.50)
	stats->lat_50 = (i + 1) * lat_bin_sz + lat_min_time;
	if (!stats->lat_75 &&
	accum_samples / stats->total_samples > 0.75)
	stats->lat_75 = (i + 1) * lat_bin_sz + lat_min_time;
	if (!stats->lat_90 &&
	accum_samples / stats->total_samples > 0.90)
	stats->lat_90 = (i + 1) * lat_bin_sz + lat_min_time;
	if (!stats->lat_99 &&
	accum_samples / stats->total_samples > 0.99)
	stats->lat_99 = (i + 1) * lat_bin_sz + lat_min_time;
	}
	for (int i = 0; i < nr_hist_bins; i++) {
	uint64_t interval_start = i * hist_bin_sz + hist_min_time;
	uint64_t interval_end = (i + 1) * hist_bin_sz + hist_min_time;

	/* customize the first and last entries */
	if (i == 0)
	interval_start = MIN(interval_start, stats->min_time);
	if (i == nr_hist_bins - 1) {
	interval_end = MAX(interval_end, stats->max_time);
	/* but not at the sake of formatting! (8 spaces) */
	interval_end = MIN(interval_end, 99999999);
	}
	printf(" [%8llu - %8llu] %7d: ", interval_start,
	interval_end, hist_times[i]);
	/* nr_dots = hist_times[i] * nr_dots_per_sample
	* = hist_times[i] * (max_num_dots / max_hist_bin) */
	int nr_dots = hist_times[i] * max_dots_per_row / max_hist_bin;

	for (int j = 0; j < nr_dots; j++)
	printf("*");
	printf("\n");
	}
	printf("\n");
	printf("Samples per dot: %d\n", max_hist_bin / max_dots_per_row);
	printf("Total samples: %llu\n", stats->total_samples);
	printf("Avg time : %llu\n", stats->avg_time);
	printf("Stdev time : %f\n", sqrt(stats->var_time));
	printf("Coef Var : %f\n", coef_var);
	if (coef_var > HI_COEF_VAR)
	printf(
	"\tHigh coeff of var with serious outliers, adjusted bins\n");
	/* numbers are overestimates by at most a lat bin */
	printf("50/75/90/99: %d / %d / %d / %d (-<%ld)\n", stats->lat_50,
	stats->lat_75, stats->lat_90, stats->lat_99, lat_bin_sz);
	printf("Min / Max : %llu / %llu\n", stats->min_time, stats->max_time);
	printf("\n");
	free(hist_times);
	free(lat_times);
	}

	/* Prints the throughput of certain events over nr_steps of interval time. Will
	* print the overall throughput of the entire time (total events / steps),
	* and print out each step up to nr_print_steps.
	*
	* Assumes a 2D data structure, where the events in each data[i][] (for a
	* specific i) are in order. The 'nr_i'es are typically threads or something
	* similar. nr_j would be how many samples per thread. The func ptr should
	* return the time of the data[i][j]'th event via *sample and return 0 on
	* success, and any other value for 'no data'. If start_time is 0, we'll start
	* the clock right before the first event. */
	void print_throughput(void **data, unsigned int nr_steps, uint64_t interval,
	unsigned int nr_print_steps, uint64_t start_time,
	int nr_i, int nr_j,
	int (get_sample)(void *data, int i, int j,
	uint64_t *sample))
	{
	uint64_t time_now, sample;
	/* next_sample[] tracks each thread's next lock that was acquired */
	unsigned int *next_sample;
	unsigned int *step_events;
	unsigned int most_step_events = 1;
	unsigned int max_dots_per_row = 45; /* looks good with 80-wide col */
	unsigned int total_events = 0;

	if (!nr_steps)
	return;
	nr_print_steps = MIN(nr_print_steps, nr_steps);
	next_sample = malloc(sizeof(unsigned int) * nr_i);
	step_events = malloc(sizeof(unsigned int) * nr_steps);
	if (!next_sample \|\| !step_events) {
	perror("print_throughput failed alloc:");
	free(next_sample);
	free(step_events);
	return;
	}
	memset(next_sample, 0, sizeof(unsigned int) * nr_i);
	memset(step_events, 0, sizeof(unsigned int) * nr_steps);
	if (start_time) {
	time_now = start_time;
	} else {
	time_now = UINT64_MAX;
	/* Set the replay to start right before the first event */
	for (int i = 0; i < nr_i; i++) {
	if (get_sample(data, i, 0, &sample))
	continue;
	time_now = MIN(time_now, sample);
	}
	if (time_now != 0)
	time_now--;
	}
	for (int k = 0; k < nr_steps; k++) {
	time_now += interval;
	/* for every 'thread', we'll figure out how many events
	* occurred, and advance next_sample to track the next one to
	* consider */
	for (int i = 0; i < nr_i; i++) {
	/* count nr locks that have happened, advance per thread
	* tracker */
	for ( ; next_sample[i] < nr_j; next_sample[i]++) {
	/* skip this thread if it has no more data */
	if (get_sample(data, i, next_sample[i],
	&sample))
	continue;
	/* break when we found one that hasn't happened
	* yet */
	if (!(sample <= time_now))
	break;
	step_events[k]++;
	}
	}
	total_events += step_events[k];
	/* for dynamically scaling the 's /
	most_step_events = MAX(most_step_events, step_events[k]);
	}
	if (nr_print_steps)
	printf("Events per dot: %d\n",
	most_step_events / max_dots_per_row);
	for (int k = 0; k < nr_print_steps; k++) {
	/* Last step isn't accurate, will only be partially full */
	if (k == nr_steps - 1)
	break;
	printf("%6d: ", k);
	printf("%6d ", step_events[k]);
	/* nr_dots = step_events[k] * nr_dots_per_event
	* = step_events[k] * (max_dots_per_row / most_step_events) */
	int nr_dots = step_events[k] * max_dots_per_row /
	most_step_events;
	for (int i = 0; i < nr_dots; i++)
	printf("*");
	printf("\n");
	}
	printf("Total events: %d, Avg events/step: %d\n", total_events,
	total_events / nr_steps);
	free(next_sample);
	free(step_events);
	}