|  | /* Copyright (c) 2013 The Regents of the University of California | 
|  | * Barret Rhoden <brho@cs.berkeley.edu> | 
|  | * See LICENSE for details. | 
|  | * | 
|  | * Userspace functions for various measurements. | 
|  | * | 
|  | * For now, this is built into parlib.  We can pull it out in the future.  Many | 
|  | * of the larger functions are in flux. */ | 
|  |  | 
|  | #include <math.h> | 
|  | #include <stdio.h> | 
|  | #include <stdlib.h> | 
|  | #include <sys/param.h> | 
|  | #ifdef __ros__ | 
|  | #include <parlib/tsc-compat.h> | 
|  | #include <benchutil/measure.h> | 
|  | #endif /* __ros__ */ | 
|  |  | 
|  | /* Basic stats computation and printing. | 
|  | * | 
|  | * All of these expect a 2D collection of samples, where the first array is an | 
|  | * array of arrays of samples.  The first array's members are something like | 
|  | * per-thread samples, where each thread fills in its | 
|  | * data[thread_id][sample_number].  The samples should be ordered in | 
|  | * chronological order.  Ultimately, the sample needs to produce a uint64_t | 
|  | * (e.g. TSC tick). */ | 
|  |  | 
|  | static void init_stats(struct sample_stats *stats) | 
|  | { | 
|  | stats->avg_time = 0; | 
|  | stats->var_time = 0; | 
|  | stats->max_time = 0; | 
|  | stats->min_time = UINT64_MAX; | 
|  | stats->lat_50 = 0; | 
|  | stats->lat_75 = 0; | 
|  | stats->lat_90 = 0; | 
|  | stats->lat_99 = 0; | 
|  | stats->total_samples = 0; | 
|  | } | 
|  |  | 
|  | /* Could have options for printing for how many rows we want, how much we want | 
|  | * to trim the max/min, and how many samples per bin. */ | 
|  | void compute_stats(void **data, int nr_i, int nr_j, struct sample_stats *stats) | 
|  | { | 
|  | uint64_t sample_time, hist_max_time, hist_min_time, | 
|  | lat_max_time, lat_min_time; | 
|  | size_t hist_idx, lat_idx, hist_bin_sz, lat_bin_sz; | 
|  | float accum_samples = 0.0, coef_var; | 
|  | unsigned int *hist_times; | 
|  | unsigned int *lat_times; | 
|  | unsigned int nr_hist_bins = 75;	/* looks reasonable when printed */ | 
|  | unsigned int nr_lat_bins = 500;	/* affects granularity of lat perc */ | 
|  | unsigned int max_dots_per_row = 45; /* looks good with 80-wide col */ | 
|  | unsigned int max_hist_bin = 0; | 
|  | #define HI_COEF_VAR 1.0 | 
|  |  | 
|  | init_stats(stats); | 
|  | /* First pass, figure out the min, max, avg, etc. */ | 
|  | for (int i = 0; i < nr_i; i++) { | 
|  | for (int j = 0; j < nr_j; j++) { | 
|  | /* get_sample returns 0 on success.  o/w, skip the | 
|  | * sample */ | 
|  | /* depending on semantics, we could break */ | 
|  | if (stats->get_sample(data, i, j, &sample_time)) | 
|  | continue; | 
|  | stats->total_samples++; | 
|  | stats->avg_time += sample_time; | 
|  | stats->max_time = sample_time > stats->max_time ? | 
|  | sample_time : stats->max_time; | 
|  | stats->min_time = sample_time < stats->min_time ? | 
|  | sample_time : stats->min_time; | 
|  | } | 
|  | } | 
|  | if (stats->total_samples < 2) { | 
|  | printf("Not enough samples (%llu) for avg and var\n", | 
|  | stats->total_samples); | 
|  | return; | 
|  | } | 
|  | stats->avg_time /= stats->total_samples; | 
|  | /* Second pass, compute the variance.  Want to do this before the | 
|  | * histograms, so we can trim the serious outliers */ | 
|  | for (int i = 0; i < nr_i; i++) { | 
|  | for (int j = 0; j < nr_j; j++) { | 
|  | if (stats->get_sample(data, i, j, &sample_time)) | 
|  | continue; | 
|  | /* var: (sum_i=1..n { (x_i - xbar)^2 }) / (n - 1) */ | 
|  | stats->var_time += (sample_time - stats->avg_time) * | 
|  | (sample_time - stats->avg_time); | 
|  | } | 
|  | } | 
|  | stats->var_time /= stats->total_samples - 1; | 
|  | /* We have two histogram structures.  The main one is for printing, and | 
|  | * the other is for computing latency percentiles.  The only real diff | 
|  | * btw the two is the number of bins.  The latency one has a lot more, | 
|  | * for finer granularity, and the regular one has fewer for better | 
|  | * printing. | 
|  | * | 
|  | * Both have the same max and min bin values.  Any excesses get put in | 
|  | * the smallest or biggest bin.  This keeps the granularity reasonable | 
|  | * in the face of very large outliers.  Normally, I trim off anything | 
|  | * outside 3 stddev. | 
|  | * | 
|  | * High variation will throw off our histogram bins, so we adjust.  A | 
|  | * coef_var > 1 is considered high variance.  The numbers I picked are | 
|  | * just heuristics to catch SMM interference and make the output look | 
|  | * nice. */ | 
|  | coef_var = sqrt(stats->var_time) / stats->avg_time; | 
|  | if (coef_var > HI_COEF_VAR) { | 
|  | hist_max_time = stats->avg_time * 3; | 
|  | hist_min_time = stats->avg_time / 3; | 
|  | } else {	/* 'normal' data */ | 
|  | /* trimming the printable hist at 3 stddevs, which for normal | 
|  | * data is 99.7% of the data.  For most any data, it gets 89% | 
|  | * (Chebyshev's inequality) */ | 
|  | hist_max_time = stats->avg_time + 3 * sqrt(stats->var_time); | 
|  | hist_min_time = stats->avg_time - 3 * sqrt(stats->var_time); | 
|  | if (hist_min_time > hist_max_time) | 
|  | hist_min_time = 0; | 
|  | } | 
|  | lat_max_time = hist_max_time; | 
|  | lat_min_time = hist_min_time; | 
|  | hist_bin_sz = (hist_max_time - hist_min_time) / nr_hist_bins + 1; | 
|  | lat_bin_sz = (lat_max_time - lat_min_time) / nr_lat_bins + 1; | 
|  | hist_times = malloc(sizeof(unsigned int) * nr_hist_bins); | 
|  | lat_times = malloc(sizeof(unsigned int) * nr_lat_bins); | 
|  | if (!hist_times || !lat_times) { | 
|  | perror("compute_stats failed to alloc hist/lat arrays:"); | 
|  | free(hist_times); | 
|  | free(lat_times); | 
|  | return; | 
|  | } | 
|  | memset(hist_times, 0, sizeof(unsigned int) * nr_hist_bins); | 
|  | memset(lat_times, 0, sizeof(unsigned int) * nr_lat_bins); | 
|  | /* third pass, fill the bins for the histogram and latencies */ | 
|  | for (int i = 0; i < nr_i; i++) { | 
|  | for (int j = 0; j < nr_j; j++) { | 
|  | if (stats->get_sample(data, i, j, &sample_time)) | 
|  | continue; | 
|  | /* need to shift, offset by min_time.  anything too | 
|  | * small is 0 and will go into the first bin.  anything | 
|  | * too large will go into the last bin. */ | 
|  | lat_idx = sample_time < lat_min_time | 
|  | ? 0 | 
|  | : (sample_time - lat_min_time) / lat_bin_sz; | 
|  | lat_idx = MIN(lat_idx, nr_lat_bins - 1); | 
|  | lat_times[lat_idx]++; | 
|  | hist_idx = sample_time < hist_min_time | 
|  | ? 0 | 
|  | : (sample_time - hist_min_time) / | 
|  | hist_bin_sz; | 
|  | hist_idx = MIN(hist_idx, nr_hist_bins - 1); | 
|  | hist_times[hist_idx]++; | 
|  | /* useful for formatting the ***s */ | 
|  | max_hist_bin = (hist_times[hist_idx] > max_hist_bin) | 
|  | ? hist_times[hist_idx] | 
|  | : max_hist_bin; | 
|  | } | 
|  | } | 
|  | /* Compute latency percentiles */ | 
|  | for (int i = 0; i < nr_lat_bins; i++) { | 
|  | accum_samples += lat_times[i]; | 
|  | /* (i + 1), since we've just accumulated one bucket's worth */ | 
|  | if (!stats->lat_50 && | 
|  | accum_samples / stats->total_samples > 0.50) | 
|  | stats->lat_50 = (i + 1) * lat_bin_sz + lat_min_time; | 
|  | if (!stats->lat_75 && | 
|  | accum_samples / stats->total_samples > 0.75) | 
|  | stats->lat_75 = (i + 1) * lat_bin_sz + lat_min_time; | 
|  | if (!stats->lat_90 && | 
|  | accum_samples / stats->total_samples > 0.90) | 
|  | stats->lat_90 = (i + 1) * lat_bin_sz + lat_min_time; | 
|  | if (!stats->lat_99 && | 
|  | accum_samples / stats->total_samples > 0.99) | 
|  | stats->lat_99 = (i + 1) * lat_bin_sz + lat_min_time; | 
|  | } | 
|  | for (int i = 0; i < nr_hist_bins; i++) { | 
|  | uint64_t interval_start = i * hist_bin_sz + hist_min_time; | 
|  | uint64_t interval_end = (i + 1) * hist_bin_sz + hist_min_time; | 
|  |  | 
|  | /* customize the first and last entries */ | 
|  | if (i == 0) | 
|  | interval_start = MIN(interval_start, stats->min_time); | 
|  | if (i == nr_hist_bins - 1) { | 
|  | interval_end = MAX(interval_end, stats->max_time); | 
|  | /* but not at the sake of formatting! (8 spaces) */ | 
|  | interval_end = MIN(interval_end, 99999999); | 
|  | } | 
|  | printf("    [%8llu - %8llu] %7d: ", interval_start, | 
|  | interval_end, hist_times[i]); | 
|  | /* nr_dots = hist_times[i] * nr_dots_per_sample | 
|  | *         = hist_times[i] * (max_num_dots / max_hist_bin) */ | 
|  | int nr_dots = hist_times[i] * max_dots_per_row / max_hist_bin; | 
|  |  | 
|  | for (int j = 0; j < nr_dots; j++) | 
|  | printf("*"); | 
|  | printf("\n"); | 
|  | } | 
|  | printf("\n"); | 
|  | printf("Samples per dot: %d\n", max_hist_bin / max_dots_per_row); | 
|  | printf("Total samples: %llu\n", stats->total_samples); | 
|  | printf("Avg time   : %llu\n", stats->avg_time); | 
|  | printf("Stdev time : %f\n", sqrt(stats->var_time)); | 
|  | printf("Coef Var   : %f\n", coef_var); | 
|  | if (coef_var > HI_COEF_VAR) | 
|  | printf( | 
|  | "\tHigh coeff of var with serious outliers, adjusted bins\n"); | 
|  | /* numbers are overestimates by at most a lat bin */ | 
|  | printf("50/75/90/99: %d / %d / %d / %d (-<%ld)\n", stats->lat_50, | 
|  | stats->lat_75, stats->lat_90, stats->lat_99, lat_bin_sz); | 
|  | printf("Min / Max  : %llu / %llu\n", stats->min_time, stats->max_time); | 
|  | printf("\n"); | 
|  | free(hist_times); | 
|  | free(lat_times); | 
|  | } | 
|  |  | 
|  | /* Prints the throughput of certain events over nr_steps of interval time.  Will | 
|  | * print the overall throughput of the entire time (total events / steps), | 
|  | * and print out each step up to nr_print_steps. | 
|  | * | 
|  | * Assumes a 2D data structure, where the events in each data[i][] (for a | 
|  | * specific i) are in order.  The 'nr_i'es are typically threads or something | 
|  | * similar.  nr_j would be how many samples per thread.  The func ptr should | 
|  | * return the time of the data[i][j]'th event via *sample and return 0 on | 
|  | * success, and any other value for 'no data'.  If start_time is 0, we'll start | 
|  | * the clock right before the first event. */ | 
|  | void print_throughput(void **data, unsigned int nr_steps, uint64_t interval, | 
|  | unsigned int nr_print_steps, uint64_t start_time, | 
|  | int nr_i, int nr_j, | 
|  | int (*get_sample)(void **data, int i, int j, | 
|  | uint64_t *sample)) | 
|  | { | 
|  | uint64_t time_now, sample; | 
|  | /* next_sample[] tracks each thread's next lock that was acquired */ | 
|  | unsigned int *next_sample; | 
|  | unsigned int *step_events; | 
|  | unsigned int most_step_events = 1; | 
|  | unsigned int max_dots_per_row = 45; /* looks good with 80-wide col */ | 
|  | unsigned int total_events = 0; | 
|  |  | 
|  | if (!nr_steps) | 
|  | return; | 
|  | nr_print_steps = MIN(nr_print_steps, nr_steps); | 
|  | next_sample = malloc(sizeof(unsigned int) * nr_i); | 
|  | step_events = malloc(sizeof(unsigned int) * nr_steps); | 
|  | if (!next_sample || !step_events) { | 
|  | perror("print_throughput failed alloc:"); | 
|  | free(next_sample); | 
|  | free(step_events); | 
|  | return; | 
|  | } | 
|  | memset(next_sample, 0, sizeof(unsigned int) * nr_i); | 
|  | memset(step_events, 0, sizeof(unsigned int) * nr_steps); | 
|  | if (start_time) { | 
|  | time_now = start_time; | 
|  | } else { | 
|  | time_now = UINT64_MAX; | 
|  | /* Set the replay to start right before the first event */ | 
|  | for (int i = 0; i < nr_i; i++) { | 
|  | if (get_sample(data, i, 0, &sample)) | 
|  | continue; | 
|  | time_now = MIN(time_now, sample); | 
|  | } | 
|  | if (time_now != 0) | 
|  | time_now--; | 
|  | } | 
|  | for (int k = 0; k < nr_steps; k++) { | 
|  | time_now += interval; | 
|  | /* for every 'thread', we'll figure out how many events | 
|  | * occurred, and advance next_sample to track the next one to | 
|  | * consider */ | 
|  | for (int i = 0; i < nr_i; i++) { | 
|  | /* count nr locks that have happened, advance per thread | 
|  | * tracker */ | 
|  | for ( ; next_sample[i] < nr_j; next_sample[i]++) { | 
|  | /* skip this thread if it has no more data */ | 
|  | if (get_sample(data, i, next_sample[i], | 
|  | &sample)) | 
|  | continue; | 
|  | /* break when we found one that hasn't happened | 
|  | * yet */ | 
|  | if (!(sample <= time_now)) | 
|  | break; | 
|  | step_events[k]++; | 
|  | } | 
|  | } | 
|  | total_events += step_events[k]; | 
|  | /* for dynamically scaling the *'s */ | 
|  | most_step_events = MAX(most_step_events, step_events[k]); | 
|  | } | 
|  | if (nr_print_steps) | 
|  | printf("Events per dot: %d\n", | 
|  | most_step_events / max_dots_per_row); | 
|  | for (int k = 0; k < nr_print_steps; k++) { | 
|  | /* Last step isn't accurate, will only be partially full */ | 
|  | if (k == nr_steps - 1) | 
|  | break; | 
|  | printf("%6d: ", k); | 
|  | printf("%6d ", step_events[k]); | 
|  | /* nr_dots = step_events[k] * nr_dots_per_event | 
|  | *   = step_events[k] * (max_dots_per_row / most_step_events) */ | 
|  | int nr_dots = step_events[k] * max_dots_per_row / | 
|  | most_step_events; | 
|  | for (int i = 0; i < nr_dots; i++) | 
|  | printf("*"); | 
|  | printf("\n"); | 
|  | } | 
|  | printf("Total events: %d, Avg events/step: %d\n", total_events, | 
|  | total_events / nr_steps); | 
|  | free(next_sample); | 
|  | free(step_events); | 
|  | } |