/* Copyright (c) 2013 The Regents of the University of California
* Barret Rhoden <brho@cs.berkeley.edu>
* See LICENSE for details.
*
* Userspace functions for various measurements.
*
* For now, this is built into parlib. We can pull it out in the future. Many
* of the larger functions are in flux. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
#ifdef __ros__
#include <parlib/tsc-compat.h>
#include <benchutil/measure.h>
#endif /* __ros__ */
/* Basic stats computation and printing.
*
* All of these expect a 2D collection of samples, where the first array is an
* array of arrays of samples. The first array's members are something like
* per-thread samples, where each thread fills in its
* data[thread_id][sample_number]. The samples should be ordered in
* chronological order. Ultimately, the sample needs to produce a uint64_t
* (e.g. TSC tick). */
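/* As a purely hypothetical example (not part of this file), a caller with
 * per-thread arrays of TSC timestamps could provide the get_sample callback
 * sketched below, where 'data' is really a uint64_t **:
 *
 *	static int get_tsc_sample(void **data, int i, int j, uint64_t *sample)
 *	{
 *		uint64_t **times = (uint64_t**)data;
 *
 *		if (!times[i][j])
 *			return -1;	// no data recorded for this slot
 *		*sample = times[i][j];
 *		return 0;
 *	}
 *
 * The caller would set stats->get_sample = get_tsc_sample before calling
 * compute_stats(data, nr_threads, samples_per_thread, &stats). */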
static void init_stats(struct sample_stats *stats)
{
stats->avg_time = 0;
stats->var_time = 0;
stats->max_time = 0;
stats->min_time = UINT64_MAX;
stats->lat_50 = 0;
stats->lat_75 = 0;
stats->lat_90 = 0;
stats->lat_99 = 0;
stats->total_samples = 0;
}
/* Could have options for how many rows we want to print, how much we want to
 * trim the max/min, and how many samples per bin. */
void compute_stats(void **data, int nr_i, int nr_j, struct sample_stats *stats)
{
uint64_t sample_time, hist_max_time, hist_min_time,
lat_max_time, lat_min_time;
size_t hist_idx, lat_idx, hist_bin_sz, lat_bin_sz;
float accum_samples = 0.0, coef_var;
unsigned int *hist_times;
unsigned int *lat_times;
unsigned int nr_hist_bins = 75; /* looks reasonable when printed */
unsigned int nr_lat_bins = 500; /* affects granularity of lat perc */
unsigned int max_dots_per_row = 45; /* looks good with 80-wide col */
unsigned int max_hist_bin = 0;
#define HI_COEF_VAR 1.0
init_stats(stats);
/* First pass, figure out the min, max, avg, etc. */
for (int i = 0; i < nr_i; i++) {
for (int j = 0; j < nr_j; j++) {
/* get_sample returns 0 on success; otherwise, skip the
 * sample. Depending on the semantics, we could break
 * instead. */
if (stats->get_sample(data, i, j, &sample_time))
continue;
stats->total_samples++;
stats->avg_time += sample_time;
stats->max_time = sample_time > stats->max_time ?
sample_time : stats->max_time;
stats->min_time = sample_time < stats->min_time ?
sample_time : stats->min_time;
}
}
if (stats->total_samples < 2) {
printf("Not enough samples (%llu) for avg and var\n",
stats->total_samples);
return;
}
stats->avg_time /= stats->total_samples;
/* Second pass, compute the variance. Want to do this before the
* histograms, so we can trim the serious outliers */
for (int i = 0; i < nr_i; i++) {
for (int j = 0; j < nr_j; j++) {
if (stats->get_sample(data, i, j, &sample_time))
continue;
/* var: (sum_i=1..n { (x_i - xbar)^2 }) / (n - 1) */
stats->var_time += (sample_time - stats->avg_time) *
(sample_time - stats->avg_time);
}
}
stats->var_time /= stats->total_samples - 1;
/* We have two histogram structures. The main one is for printing, and
* the other is for computing latency percentiles. The only real diff
* btw the two is the number of bins. The latency one has a lot more,
* for finer granularity, and the regular one has fewer for better
* printing.
*
* Both have the same max and min bin values. Any excesses get put in
* the smallest or biggest bin. This keeps the granularity reasonable
* in the face of very large outliers. Normally, I trim off anything
* outside 3 stddev.
*
* High variation will throw off our histogram bins, so we adjust. A
* coef_var > 1 is considered high variance. The numbers I picked are
* just heuristics to catch SMM interference and make the output look
* nice. */
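/* A purely hypothetical example of the math below: with an avg of 1000
 * ticks and a stddev of 200, coef_var is 0.2 (under HI_COEF_VAR), so the
 * printable hist covers [400, 1600] (avg +/- 3 stddev), and with 75 bins
 * each bin is (1600 - 400) / 75 + 1 = 17 ticks wide. Samples outside that
 * range land in the first or last bin. */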
coef_var = sqrt(stats->var_time) / stats->avg_time;
if (coef_var > HI_COEF_VAR) {
hist_max_time = stats->avg_time * 3;
hist_min_time = stats->avg_time / 3;
} else { /* 'normal' data */
/* Trim the printable hist at 3 stddevs, which for normal
 * data covers 99.7% of the samples. For any distribution,
 * it covers at least 89% (Chebyshev's inequality). */
hist_max_time = stats->avg_time + 3 * sqrt(stats->var_time);
hist_min_time = stats->avg_time - 3 * sqrt(stats->var_time);
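/* if avg < 3 * stddev, the subtraction above goes negative and wraps
 * when stored; the check below catches that and clamps the min to 0 */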
if (hist_min_time > hist_max_time)
hist_min_time = 0;
}
lat_max_time = hist_max_time;
lat_min_time = hist_min_time;
hist_bin_sz = (hist_max_time - hist_min_time) / nr_hist_bins + 1;
lat_bin_sz = (lat_max_time - lat_min_time) / nr_lat_bins + 1;
hist_times = malloc(sizeof(unsigned int) * nr_hist_bins);
lat_times = malloc(sizeof(unsigned int) * nr_lat_bins);
if (!hist_times || !lat_times) {
perror("compute_stats failed to alloc hist/lat arrays:");
free(hist_times);
free(lat_times);
return;
}
memset(hist_times, 0, sizeof(unsigned int) * nr_hist_bins);
memset(lat_times, 0, sizeof(unsigned int) * nr_lat_bins);
/* third pass, fill the bins for the histogram and latencies */
for (int i = 0; i < nr_i; i++) {
for (int j = 0; j < nr_j; j++) {
if (stats->get_sample(data, i, j, &sample_time))
continue;
/* need to shift, offset by min_time. anything too
* small is 0 and will go into the first bin. anything
* too large will go into the last bin. */
lat_idx = sample_time < lat_min_time
? 0
: (sample_time - lat_min_time) / lat_bin_sz;
lat_idx = MIN(lat_idx, nr_lat_bins - 1);
lat_times[lat_idx]++;
hist_idx = sample_time < hist_min_time
? 0
: (sample_time - hist_min_time) /
hist_bin_sz;
hist_idx = MIN(hist_idx, nr_hist_bins - 1);
hist_times[hist_idx]++;
/* useful for formatting the ***s */
max_hist_bin = (hist_times[hist_idx] > max_hist_bin)
? hist_times[hist_idx]
: max_hist_bin;
}
}
/* Compute latency percentiles */
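/* Continuing the hypothetical numbers from above: with a [400, 1600] range
 * and 500 latency bins, lat_bin_sz would be 1200 / 500 + 1 = 3 ticks, so if
 * the running total first crosses 50% in bin i = 232, lat_50 is reported as
 * (232 + 1) * 3 + 400 = 1099, an overestimate of the true median by at most
 * one bin. */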
for (int i = 0; i < nr_lat_bins; i++) {
accum_samples += lat_times[i];
/* (i + 1), since we've just accumulated one bucket's worth */
if (!stats->lat_50 &&
accum_samples / stats->total_samples > 0.50)
stats->lat_50 = (i + 1) * lat_bin_sz + lat_min_time;
if (!stats->lat_75 &&
accum_samples / stats->total_samples > 0.75)
stats->lat_75 = (i + 1) * lat_bin_sz + lat_min_time;
if (!stats->lat_90 &&
accum_samples / stats->total_samples > 0.90)
stats->lat_90 = (i + 1) * lat_bin_sz + lat_min_time;
if (!stats->lat_99 &&
accum_samples / stats->total_samples > 0.99)
stats->lat_99 = (i + 1) * lat_bin_sz + lat_min_time;
}
for (int i = 0; i < nr_hist_bins; i++) {
uint64_t interval_start = i * hist_bin_sz + hist_min_time;
uint64_t interval_end = (i + 1) * hist_bin_sz + hist_min_time;
/* customize the first and last entries */
if (i == 0)
interval_start = MIN(interval_start, stats->min_time);
if (i == nr_hist_bins - 1) {
interval_end = MAX(interval_end, stats->max_time);
/* but not at the expense of formatting! (8-digit field) */
interval_end = MIN(interval_end, 99999999);
}
printf(" [%8llu - %8llu] %7d: ", interval_start,
interval_end, hist_times[i]);
/* nr_dots = hist_times[i] * nr_dots_per_sample
* = hist_times[i] * (max_num_dots / max_hist_bin) */
int nr_dots = hist_times[i] * max_dots_per_row / max_hist_bin;
for (int j = 0; j < nr_dots; j++)
printf("*");
printf("\n");
}
printf("\n");
printf("Samples per dot: %d\n", max_hist_bin / max_dots_per_row);
printf("Total samples: %llu\n", stats->total_samples);
printf("Avg time : %llu\n", stats->avg_time);
printf("Stdev time : %f\n", sqrt(stats->var_time));
printf("Coef Var : %f\n", coef_var);
if (coef_var > HI_COEF_VAR)
printf(
"\tHigh coeff of var with serious outliers, adjusted bins\n");
/* numbers are overestimates by at most a lat bin */
printf("50/75/90/99: %d / %d / %d / %d (-<%ld)\n", stats->lat_50,
stats->lat_75, stats->lat_90, stats->lat_99, lat_bin_sz);
printf("Min / Max : %llu / %llu\n", stats->min_time, stats->max_time);
printf("\n");
free(hist_times);
free(lat_times);
}
/* Prints the throughput of certain events over nr_steps intervals of
 * 'interval' time units each. Prints the overall throughput across the entire
 * run (total events / nr_steps) and prints each individual step, up to
 * nr_print_steps steps.
 *
 * Assumes a 2D data structure, where the events in each data[i][] (for a
 * specific i) are in chronological order. The nr_i dimension is typically
 * threads or something similar; nr_j is the number of samples per thread. The
 * func ptr should return the time of the data[i][j]'th event via *sample and
 * return 0 on success, and any other value for 'no data'. If start_time is 0,
 * we'll start the clock right before the first event. */
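/* A hypothetical usage sketch (not from this file): a caller with the same
 * uint64_t ** layout as the earlier example could replay its samples in 1000
 * one-millisecond steps with something like:
 *
 *	print_throughput(data, 1000, msec2tsc(1),
 *			 1000,			// print every step
 *			 0,			// start right before the 1st event
 *			 nr_threads, samples_per_thread,
 *			 get_tsc_sample);
 *
 * where msec2tsc() stands in for whatever msec-to-tick conversion the caller
 * has (e.g. from parlib's tsc-compat helpers on Akaros). */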
void print_throughput(void **data, unsigned int nr_steps, uint64_t interval,
unsigned int nr_print_steps, uint64_t start_time,
int nr_i, int nr_j,
int (*get_sample)(void **data, int i, int j,
uint64_t *sample))
{
uint64_t time_now, sample;
/* next_sample[] tracks each thread's next event to consider */
unsigned int *next_sample;
unsigned int *step_events;
unsigned int most_step_events = 1;
unsigned int max_dots_per_row = 45; /* looks good with 80-wide col */
unsigned int total_events = 0;
if (!nr_steps)
return;
nr_print_steps = MIN(nr_print_steps, nr_steps);
next_sample = malloc(sizeof(unsigned int) * nr_i);
step_events = malloc(sizeof(unsigned int) * nr_steps);
if (!next_sample || !step_events) {
perror("print_throughput failed alloc:");
free(next_sample);
free(step_events);
return;
}
memset(next_sample, 0, sizeof(unsigned int) * nr_i);
memset(step_events, 0, sizeof(unsigned int) * nr_steps);
if (start_time) {
time_now = start_time;
} else {
time_now = UINT64_MAX;
/* Set the replay to start right before the first event */
for (int i = 0; i < nr_i; i++) {
if (get_sample(data, i, 0, &sample))
continue;
time_now = MIN(time_now, sample);
}
if (time_now != 0)
time_now--;
}
for (int k = 0; k < nr_steps; k++) {
time_now += interval;
/* for every 'thread', we'll figure out how many events
* occurred, and advance next_sample to track the next one to
* consider */
for (int i = 0; i < nr_i; i++) {
/* count how many events (e.g. lock acquisitions) have
 * happened, advancing the per-thread tracker */
for ( ; next_sample[i] < nr_j; next_sample[i]++) {
/* skip this thread if it has no more data */
if (get_sample(data, i, next_sample[i],
&sample))
continue;
/* break when we find one that hasn't happened yet */
if (!(sample <= time_now))
break;
step_events[k]++;
}
}
total_events += step_events[k];
/* for dynamically scaling the *'s */
most_step_events = MAX(most_step_events, step_events[k]);
}
if (nr_print_steps)
printf("Events per dot: %d\n",
most_step_events / max_dots_per_row);
for (int k = 0; k < nr_print_steps; k++) {
/* Last step isn't accurate, will only be partially full */
if (k == nr_steps - 1)
break;
printf("%6d: ", k);
printf("%6d ", step_events[k]);
/* nr_dots = step_events[k] * nr_dots_per_event
* = step_events[k] * (max_dots_per_row / most_step_events) */
int nr_dots = step_events[k] * max_dots_per_row /
most_step_events;
for (int i = 0; i < nr_dots; i++)
printf("*");
printf("\n");
}
printf("Total events: %d, Avg events/step: %d\n", total_events,
total_events / nr_steps);
free(next_sample);
free(step_events);
}