# scripts/lock_test.R - upstream - Git at Google

 # brho: 2014-10-13
 #
 # this is partly fleshed out.  to use, i've just been sourcing the script in R,
 # then overriding the tsc overhead and freq.  then just running various
 # functions directly, like print_stats, plot_densities, plot_tput, etc.  don't
 # expect any command line options to work.

 # library that includes the pwelch function
 suppressPackageStartupMessages(library(oce))
 # library for command line option parsing
 suppressPackageStartupMessages(library(optparse))

 # file format: thread_id attempt pre acq(uire) un(lock) tsc_overhead

 # TSC read overhead, in ticks.  acq_latency/hld_latency subtract this from
 # every sample when the data's own tsc_overhead column (V6) starts with 0.
 # meant to be overridden by hand after sourcing the script.
 g_tsc_overhead <- 0
 # TSC frequency, presumably in Hz (plot_tput divides it by 1e9 to scale
 # ticks - TODO confirm units).  plot_tput stops while this is still 0.
 g_tsc_frequency <- 0

 ######################################
 ### Functions
 ######################################

 # clamps outliers: every value above lim = q99 + 2 * (q99 - median) is
 # rounded down to lim, where q99 is the 99th percentile of vec.  the limit is
 # pretty arbitrary.  useful for not having ridiculously large graphs, but
 # still is lousy for various datasets.
 #
 # vec:     numeric vector (quantile() rejects NAs by default)
 # returns: numeric vector, same length as vec, each element capped at lim
 round_outlier <- function(vec)
 {
 	vec99 <- quantile(vec, .99)
 	lim <- vec99 + 2 * (vec99 - median(vec))
 	# pmin is vectorized and always hands back a numeric vector, even for
 	# empty input.  unname drops the "99%" label quantile() attached.
 	return(pmin(vec, unname(lim)))
 }

 # computes acquire latency: (acq - pre), i.e. V4 - V3, minus the TSC read
 # overhead.  the per-sample overhead from the data (V6) is used unless its
 # first entry is 0 or the column is missing; then the global g_tsc_overhead
 # is used for every sample.
 #
 # data:    data frame with columns V3 (pre), V4 (acq) and optionally V6
 # returns: numeric vector, one latency per row (empty for empty data)
 acq_latency <- function(data)
 {
 	tsc_overhead <- data$V6
 	# isTRUE() keeps an empty data set from feeding NA to if().  the scalar
 	# global recycles across the subtraction, so no need to replicate it.
 	if (is.null(tsc_overhead) || isTRUE(tsc_overhead[1] == 0))
 		tsc_overhead <- g_tsc_overhead
 	return (data$V4 - data$V3 - tsc_overhead)
 }

 # computes hold latency: (unlock - acq), i.e. V5 - V4, minus the TSC read
 # overhead.  the per-sample overhead from the data (V6) is used unless its
 # first entry is 0 or the column is missing; then the global g_tsc_overhead
 # is used for every sample.
 #
 # data:    data frame with columns V4 (acq), V5 (unlock) and optionally V6
 # returns: numeric vector, one latency per row (empty for empty data)
 hld_latency <- function(data)
 {
 	tsc_overhead <- data$V6
 	# isTRUE() keeps an empty data set from feeding NA to if().  the scalar
 	# global recycles across the subtraction, so no need to replicate it.
 	if (is.null(tsc_overhead) || isTRUE(tsc_overhead[1] == 0))
 		tsc_overhead <- g_tsc_overhead
 	return (data$V5 - data$V4 - tsc_overhead)
 }

 # histogram, bins based on percentiles, with limits of the graph based on the
 # outermost bins.
 #
 # with density and percentiles for bins, keep in mind the area of a rectangle
 # is the fraction of data points in the cell.  bins hold roughly the same
 # amount of data points (unless equal percentiles were merged, see below), so
 # taller cells show a denser concentration in a skinnier bin
 #
 # i don't actually like this much.  using a round_outlier with 20-bin hist or a
 # density plot look nicer.
 #
 # vec:     numeric vector to plot
 # returns: the "histogram" object produced by hist()
 quant_hist <- function(vec)
 {
 	vec_quant <- c(quantile(vec, probs=seq(0, 1, .01)))
 	print(vec_quant)
 	# repeated values in vec make neighboring percentiles equal.  a zero-width
 	# bin has a non-finite density and plotting dies with 'need finite ylim',
 	# so collapse duplicate break points.
 	bin_edges <- unique(unname(vec_quant))
 	# a single number would be read by hist() as a bin count, not as break
 	# points; fall back to the default rule when all the data is identical.
 	if (length(bin_edges) < 2)
 		bin_edges <- "Sturges"
 	# xlim spans the 1st..99th percentile.  keep the 2 and 100 in sync with
 	# the 0.01 above
 	hist(vec, breaks=bin_edges, xlim=c(vec_quant[2], vec_quant[100]))
 }

 # plots the kernel density estimates of several vectors on one set of axes,
 # one line type per vector.
 #
 # vecs:    list of numeric vectors, one curve per element (at least one)
 # names:   optional legend labels, one per element of vecs.  NULL (the
 #          default) draws no legend.
 # outfile: PDF file to write.  "" (the default) draws on the current device.
 # title:   main title of the plot
 # xlab:    x axis label
 # returns: NULL, invisibly
 plot_densities <- function(vecs, names=NULL, outfile="",
                            title="Lock Acquisition Latency",
                            xlab="TSC Ticks")
 {
 	nr_vecs <- length(vecs)
 	if (nr_vecs == 0)
 		stop("plot_densities: vecs must contain at least one vector")

 	densities <- lapply(vecs, density)
 	all_x <- unlist(lapply(densities, function(d) d$x))
 	all_y <- unlist(lapply(densities, function(d) d$y))
 	max_y <- max(0, all_y)
 	# use the real extent of the curves; starting max_x at 0 stretched the
 	# axis to 0 for all-negative data
 	min_x <- min(all_x)
 	max_x <- max(all_x)

 	# http://www.statmethods.net/graphs/line.html
 	linetype <- seq_len(nr_vecs)

 	# http://stackoverflow.com/questions/8929663/r-legend-placement-in-a-plot
 	# make room for the legend above the curves.  its height is measured on a
 	# throwaway pdf(NULL) device (no file is written), so the caller's current
 	# device is neither drawn on nor closed.
 	if (!is.null(names)) {
 		prev_dev <- dev.cur()
 		pdf(NULL)
 		legend_sz <- tryCatch({
 			plot(c(min_x, max_x), c(0, max_y), type="n", xaxt="n", yaxt="n")
 			legend("topright", legend=names, lty=linetype, plot=FALSE)
 		}, finally = {
 			invisible(dev.off())
 			# device 1 is the null device, i.e. nothing was open before
 			if (prev_dev > 1)
 				dev.set(prev_dev)
 		})
 		max_y <- 1.04 * (max_y + legend_sz$rect$h)
 	}

 	if (outfile != "") {
 		pdf(outfile)
 		# close the output file even if plotting fails half way
 		on.exit(invisible(dev.off()), add=TRUE)
 	}

 	plot(c(min_x, max_x), c(0, max_y), type="n", xlab=xlab, main=title,
 	     ylab="Density")

 	# too many points, so using "l" and no plot characters or colors
 	for (i in seq_len(nr_vecs))
 		lines(densities[[i]], type="l", lty=linetype[i], lwd=1.5)

 	if (!is.null(names))
 		legend("topright", legend=names, lty=linetype)

 	invisible(NULL)
 }


 # single-vector convenience wrapper: hands vec to plot_densities as a
 # one-element list, with no legend names.
 #
 # vec:     the vector whose density is plotted
 # outfile, title, xlab: passed through to plot_densities unchanged
 plot_density <- function(vec, outfile="",
                          title="Lock Acquisition Latency",
                          xlab="TSC Ticks")
 {
 	plot_densities(vecs=list(vec), outfile=outfile, title=title, xlab=xlab)
 }


 # strip chart of lock acquisition timestamps, one row per thread.
 #
 # data:    data frame with columns V1 (thread id) and V4 (acquire timestamp)
 # outfile: PDF file to write.  "" (the default) draws on the current device.
 # returns: NULL, invisibly
 plot_acq_times <- function(data, outfile="")
 {
 	if (outfile != "") {
 		pdf(outfile)
 		# close the output file even if plotting fails half way
 		on.exit(invisible(dev.off()), add=TRUE)
 	}

 	# all acquire times, timestamps starting at 0
 	time0 <- min(data$V4)

 	# threads in order of first appearance
 	threadid <- unique(data$V1)

 	# shift only the V4 column.  subtracting time0 from the whole data frame
 	# touched every column and failed on non-numeric ones.
 	acq_n <- lapply(threadid, function(id)
 		data$V4[which(data$V1 == id)] - time0)
 	names <- paste("Thread ", threadid)

 	# can adjust ylim, default are from 1..nr_items
 	stripchart(acq_n, group.names=names, pch='.', xlab="Time (TSC Ticks)",
 	           main="Lock Acquisition Timestamps")

 	invisible(NULL)
 }

 # prints summary statistics of vec, one print() call per line: a separator,
 # the mean and standard deviation (rounded to 4 places), the
 # 50/75/90/99/99.9th percentiles (rounded to whole numbers), and min/max.
 print_vec <- function(vec)
 {
 	print("---------------")
 	print(paste("Average: ", round(mean(vec), 4)))
 	print(paste("Stddev: ", round(sd(vec), 4)))
 	pct <- unname(round(quantile(vec, c(.5, .75, .9, .99, .999))))
 	print(paste("50/75/90/99/99.9: ", pct[1], pct[2], pct[3],
 	            pct[4], pct[5]))
 	print(paste("Min: ", min(vec), " Max: ", max(vec)))
 }

 # prints the summary statistics (via print_vec) of the acquire latencies and
 # then of the hold latencies computed from data.
 #
 # using something like the tables package to output latex booktab's would be
 # much nicer
 print_stats <- function(data)
 {
 	# both latencies are computed before anything is printed
 	lat <- list(acq=acq_latency(data), hld=hld_latency(data))

 	print("Acquire Latency")
 	print_vec(lat$acq)
 	print("")
 	print("Hold Latency")
 	print_vec(lat$hld)
 }

 # plots lock acquisitions per msec over the run.
 #
 # if you know how many msec there are, this is like doing:
 #     hist(total_acq/1000000, breaks=50)
 # except it gives you a line, with points being the top of the hist bars
 #
 # data:    data frame with column V4 (acquire timestamp, in TSC ticks)
 # title:   main title of the plot
 # outfile: PDF file to write.  "" (the default) draws on the current device.
 # returns: NULL, invisibly.  stops if the global g_tsc_frequency is still 0.
 plot_tput <- function(data, title="Lock Acquisition Throughput", outfile="")
 {
 	# check before opening the output device, so that bailing out does not
 	# leave a stray pdf device open
 	if (g_tsc_frequency == 0)
 		stop("WARNING: global TSC freq not set!")

 	if (outfile != "") {
 		pdf(outfile)
 		# close the output file even if plotting fails half way
 		on.exit(invisible(dev.off()), add=TRUE)
 	}

 	total_acq <- sort(data$V4 - min(data$V4))

 	# convert to nsec? XXX  (ticks / (ticks per nsec), if the freq is in Hz)
 	total_acq <- total_acq / (g_tsc_frequency / 1e9)

 	# rounds down all times to the nearest msec, then counts the timestamps
 	# that fall into each msec, as per:
 	# http://stackoverflow.com/questions/5034513/how-to-graph-requests-per-second-from-web-log-file-using-r
 	msec_times <- trunc(total_acq/1e6)

 	# tabulate keeps the absent values (msec where no timestamp happened) as
 	# zero counts.  its bins are numbered from 1, so msec 0 goes in bin 1.
 	tab <- tabulate(msec_times + 1, nbins=max(msec_times) + 1)

 	# a plain vector is plotted against its index, giving a line rather than
 	# a bunch of bars at each msec
 	plot(tab, type="o", main=title, xlab="Time (msec)", ylab="Locks per msec")

 	invisible(NULL)
 }


 # extract useful information from the raw data file
 #
 # filename: whitespace separated table; lines starting with '#' are skipped
 # returns:  list with
 #   work_amt         - column V2 as read
 #   time_steps_ns    - differences between consecutive entries of column V1
 #   N_entries        - number of rows
 #   avg_time_step_ns - mean of time_steps_ns (NaN when there are < 2 rows)
 # NOTE(review): the _ns names suggest column V1 holds nsec timestamps here -
 # confirm against the producer of the file.
 extract_data <- function(filename) {
 	mydata <- read.table(filename, comment.char="#")

 	work_amt <- mydata$V2

 	# calculate time steps and mean time step
 	times <- as.numeric(as.character(mydata$V1))
 	N_entries <- length(times)
 	# diff() is empty for a single entry, where the explicit
 	# times[2:N] - times[1:(N-1)] indexing came out as c(NA, 0)
 	time_steps_ns <- diff(times)
 	avg_time_step_ns <- mean(time_steps_ns)

 	return(list(work_amt=work_amt, time_steps_ns=time_steps_ns,
 		N_entries=N_entries, avg_time_step_ns=avg_time_step_ns))
 }


 ######################################
 ### Main
 ######################################

 ### collect command line arguments
 # establish optional arguments
 # "-h" and "--help" are automatically in the list
 # optparse option specs: input/output file names plus the x and y axis
 # limits of the output plot.  nothing parses these yet (see the commented
 # out parse_args call below).
 option_list <- list(
   # file names
   make_option(c("-i", "--input"), type="character",
     default="welch_input.dat", help="Input data file"),
   make_option(c("-o", "--output"), type="character",
     default="welch_plot.pdf", help="Output file for plotting"),
   # horizontal axis limits
   make_option("--xmin", type="double", default=0,
     help=paste0("Minimum frequency (horizontal axis) ",
       "in output plot [default %default]")),
   make_option("--xmax", type="double", default=40,
     help=paste0("Maximum frequency (horizontal axis) ",
       "in output plot [default %default]")),
   # vertical axis limits; the help text calls the -1 default "adaptive"
   make_option("--ymin", type="double", default=-1,
     help=paste0("Minimum spectrum (vertical axis) ",
       "in output plot [default adaptive]")),
   make_option("--ymax", type="double", default=-1,
     help=paste0("Maximum spectrum (vertical axis) ",
       "in output plot [default adaptive]"))
 )

 ## read command line
 #opt <- parse_args(OptionParser(option_list=option_list))
 #
 ##max_freq = as.numeric(as.character(args[3]))
 #
 #### read in data
 #mydata = extract_data(opt$input)

 #round_outlier <- function(vec)
 #acq_latency <- function(data)
 #hld_latency <- function(data)
 #plot_densities <- function(vecs, names=NULL, outfile="",
 #plot_density <- function(vec, outfile="",
 #plot_acq_times <- function(data, outfile="")
 #print_vec <- function(vec)
 #print_stats <- function(data)
 #plot_tput <- function(data)
 #mydata = read.table(filename, comment.char="#")
	# brho: 2014-10-13
	#
	# this is partly fleshed out. to use, i've just been sourcing the script in R,
	# then overriding the tsc overhead and freq. then just running various
	# functions directly, like print_stats, plot_densities, plot_tput, etc. don't
	# expect any command line options to work.

	# library that includes the pwelch function
	suppressPackageStartupMessages(library(oce))
	# library for command line option parsing
	suppressPackageStartupMessages(library(optparse))

	# file format: thread_id attempt pre acq(uire) un(lock) tsc_overhead

	# TSC read overhead, in ticks.  acq_latency/hld_latency subtract this from
	# every sample when the data's own tsc_overhead column (V6) starts with 0.
	# meant to be overridden by hand after sourcing the script.
	g_tsc_overhead <- 0
	# TSC frequency, presumably in Hz (plot_tput divides it by 1e9 to scale
	# ticks - TODO confirm units).  plot_tput stops while this is still 0.
	g_tsc_frequency <- 0

	######################################
	### Functions
	######################################

	# clamps outliers: every value above lim = q99 + 2 * (q99 - median) is
	# rounded down to lim, where q99 is the 99th percentile of vec. the limit is
	# pretty arbitrary. useful for not having ridiculously large graphs, but
	# still is lousy for various datasets.
	#
	# vec:     numeric vector (quantile() rejects NAs by default)
	# returns: numeric vector, same length as vec, each element capped at lim
	round_outlier <- function(vec)
	{
		vec99 <- quantile(vec, .99)
		lim <- vec99 + 2 * (vec99 - median(vec))
		# pmin is vectorized and always hands back a numeric vector, even for
		# empty input. unname drops the "99%" label quantile() attached.
		return(pmin(vec, unname(lim)))
	}

	# computes acquire latency: (acq - pre), i.e. V4 - V3, minus the TSC read
	# overhead. the per-sample overhead from the data (V6) is used unless its
	# first entry is 0 or the column is missing; then the global g_tsc_overhead
	# is used for every sample.
	#
	# data:    data frame with columns V3 (pre), V4 (acq) and optionally V6
	# returns: numeric vector, one latency per row (empty for empty data)
	acq_latency <- function(data)
	{
		tsc_overhead <- data$V6
		# isTRUE() keeps an empty data set from feeding NA to if(). the scalar
		# global recycles across the subtraction, so no need to replicate it.
		if (is.null(tsc_overhead) || isTRUE(tsc_overhead[1] == 0))
			tsc_overhead <- g_tsc_overhead
		return (data$V4 - data$V3 - tsc_overhead)
	}

	# computes hold latency: (unlock - acq), i.e. V5 - V4, minus the TSC read
	# overhead. the per-sample overhead from the data (V6) is used unless its
	# first entry is 0 or the column is missing; then the global g_tsc_overhead
	# is used for every sample.
	#
	# data:    data frame with columns V4 (acq), V5 (unlock) and optionally V6
	# returns: numeric vector, one latency per row (empty for empty data)
	hld_latency <- function(data)
	{
		tsc_overhead <- data$V6
		# isTRUE() keeps an empty data set from feeding NA to if(). the scalar
		# global recycles across the subtraction, so no need to replicate it.
		if (is.null(tsc_overhead) || isTRUE(tsc_overhead[1] == 0))
			tsc_overhead <- g_tsc_overhead
		return (data$V5 - data$V4 - tsc_overhead)
	}

	# histogram, bins based on percentiles, with limits of the graph based on the
	# outermost bins.
	#
	# with density and percentiles for bins, keep in mind the area of a rectangle
	# is the fraction of data points in the cell. bins hold roughly the same
	# amount of data points (unless equal percentiles were merged, see below), so
	# taller cells show a denser concentration in a skinnier bin
	#
	# i don't actually like this much. using a round_outlier with 20-bin hist or a
	# density plot look nicer.
	#
	# vec:     numeric vector to plot
	# returns: the "histogram" object produced by hist()
	quant_hist <- function(vec)
	{
		vec_quant <- c(quantile(vec, probs=seq(0, 1, .01)))
		print(vec_quant)
		# repeated values in vec make neighboring percentiles equal. a zero-width
		# bin has a non-finite density and plotting dies with 'need finite ylim',
		# so collapse duplicate break points.
		bin_edges <- unique(unname(vec_quant))
		# a single number would be read by hist() as a bin count, not as break
		# points; fall back to the default rule when all the data is identical.
		if (length(bin_edges) < 2)
			bin_edges <- "Sturges"
		# xlim spans the 1st..99th percentile. keep the 2 and 100 in sync with
		# the 0.01 above
		hist(vec, breaks=bin_edges, xlim=c(vec_quant[2], vec_quant[100]))
	}

	# plots the kernel density estimates of several vectors on one set of axes,
	# one line type per vector.
	#
	# vecs:    list of numeric vectors, one curve per element (at least one)
	# names:   optional legend labels, one per element of vecs. NULL (the
	#          default) draws no legend.
	# outfile: PDF file to write. "" (the default) draws on the current device.
	# title:   main title of the plot
	# xlab:    x axis label
	# returns: NULL, invisibly
	plot_densities <- function(vecs, names=NULL, outfile="",
	                           title="Lock Acquisition Latency",
	                           xlab="TSC Ticks")
	{
		nr_vecs <- length(vecs)
		if (nr_vecs == 0)
			stop("plot_densities: vecs must contain at least one vector")

		densities <- lapply(vecs, density)
		all_x <- unlist(lapply(densities, function(d) d$x))
		all_y <- unlist(lapply(densities, function(d) d$y))
		max_y <- max(0, all_y)
		# use the real extent of the curves; starting max_x at 0 stretched the
		# axis to 0 for all-negative data
		min_x <- min(all_x)
		max_x <- max(all_x)

		# http://www.statmethods.net/graphs/line.html
		linetype <- seq_len(nr_vecs)

		# http://stackoverflow.com/questions/8929663/r-legend-placement-in-a-plot
		# make room for the legend above the curves. its height is measured on a
		# throwaway pdf(NULL) device (no file is written), so the caller's current
		# device is neither drawn on nor closed.
		if (!is.null(names)) {
			prev_dev <- dev.cur()
			pdf(NULL)
			legend_sz <- tryCatch({
				plot(c(min_x, max_x), c(0, max_y), type="n", xaxt="n", yaxt="n")
				legend("topright", legend=names, lty=linetype, plot=FALSE)
			}, finally = {
				invisible(dev.off())
				# device 1 is the null device, i.e. nothing was open before
				if (prev_dev > 1)
					dev.set(prev_dev)
			})
			max_y <- 1.04 * (max_y + legend_sz$rect$h)
		}

		if (outfile != "") {
			pdf(outfile)
			# close the output file even if plotting fails half way
			on.exit(invisible(dev.off()), add=TRUE)
		}

		plot(c(min_x, max_x), c(0, max_y), type="n", xlab=xlab, main=title,
		     ylab="Density")

		# too many points, so using "l" and no plot characters or colors
		for (i in seq_len(nr_vecs))
			lines(densities[[i]], type="l", lty=linetype[i], lwd=1.5)

		if (!is.null(names))
			legend("topright", legend=names, lty=linetype)

		invisible(NULL)
	}


	# single-vector convenience wrapper: hands vec to plot_densities as a
	# one-element list, with no legend names.
	#
	# vec:     the vector whose density is plotted
	# outfile, title, xlab: passed through to plot_densities unchanged
	plot_density <- function(vec, outfile="",
	                         title="Lock Acquisition Latency",
	                         xlab="TSC Ticks")
	{
		plot_densities(vecs=list(vec), outfile=outfile, title=title, xlab=xlab)
	}


	# strip chart of lock acquisition timestamps, one row per thread.
	#
	# data:    data frame with columns V1 (thread id) and V4 (acquire timestamp)
	# outfile: PDF file to write. "" (the default) draws on the current device.
	# returns: NULL, invisibly
	plot_acq_times <- function(data, outfile="")
	{
		if (outfile != "") {
			pdf(outfile)
			# close the output file even if plotting fails half way
			on.exit(invisible(dev.off()), add=TRUE)
		}

		# all acquire times, timestamps starting at 0
		time0 <- min(data$V4)

		# threads in order of first appearance
		threadid <- unique(data$V1)

		# shift only the V4 column. subtracting time0 from the whole data frame
		# touched every column and failed on non-numeric ones.
		acq_n <- lapply(threadid, function(id)
			data$V4[which(data$V1 == id)] - time0)
		names <- paste("Thread ", threadid)

		# can adjust ylim, default are from 1..nr_items
		stripchart(acq_n, group.names=names, pch='.', xlab="Time (TSC Ticks)",
		           main="Lock Acquisition Timestamps")

		invisible(NULL)
	}

	# prints summary statistics of vec, one print() call per line: a separator,
	# the mean and standard deviation (rounded to 4 places), the
	# 50/75/90/99/99.9th percentiles (rounded to whole numbers), and min/max.
	print_vec <- function(vec)
	{
		print("---------------")
		print(paste("Average: ", round(mean(vec), 4)))
		print(paste("Stddev: ", round(sd(vec), 4)))
		pct <- unname(round(quantile(vec, c(.5, .75, .9, .99, .999))))
		print(paste("50/75/90/99/99.9: ", pct[1], pct[2], pct[3],
		            pct[4], pct[5]))
		print(paste("Min: ", min(vec), " Max: ", max(vec)))
	}

	# prints the summary statistics (via print_vec) of the acquire latencies and
	# then of the hold latencies computed from data.
	#
	# using something like the tables package to output latex booktab's would be
	# much nicer
	print_stats <- function(data)
	{
		# both latencies are computed before anything is printed
		lat <- list(acq=acq_latency(data), hld=hld_latency(data))

		print("Acquire Latency")
		print_vec(lat$acq)
		print("")
		print("Hold Latency")
		print_vec(lat$hld)
	}

	# plots lock acquisitions per msec over the run.
	#
	# if you know how many msec there are, this is like doing:
	#     hist(total_acq/1000000, breaks=50)
	# except it gives you a line, with points being the top of the hist bars
	#
	# data:    data frame with column V4 (acquire timestamp, in TSC ticks)
	# title:   main title of the plot
	# outfile: PDF file to write. "" (the default) draws on the current device.
	# returns: NULL, invisibly. stops if the global g_tsc_frequency is still 0.
	plot_tput <- function(data, title="Lock Acquisition Throughput", outfile="")
	{
		# check before opening the output device, so that bailing out does not
		# leave a stray pdf device open
		if (g_tsc_frequency == 0)
			stop("WARNING: global TSC freq not set!")

		if (outfile != "") {
			pdf(outfile)
			# close the output file even if plotting fails half way
			on.exit(invisible(dev.off()), add=TRUE)
		}

		total_acq <- sort(data$V4 - min(data$V4))

		# convert to nsec? XXX  (ticks / (ticks per nsec), if the freq is in Hz)
		total_acq <- total_acq / (g_tsc_frequency / 1e9)

		# rounds down all times to the nearest msec, then counts the timestamps
		# that fall into each msec, as per:
		# http://stackoverflow.com/questions/5034513/how-to-graph-requests-per-second-from-web-log-file-using-r
		msec_times <- trunc(total_acq/1e6)

		# tabulate keeps the absent values (msec where no timestamp happened) as
		# zero counts. its bins are numbered from 1, so msec 0 goes in bin 1.
		tab <- tabulate(msec_times + 1, nbins=max(msec_times) + 1)

		# a plain vector is plotted against its index, giving a line rather than
		# a bunch of bars at each msec
		plot(tab, type="o", main=title, xlab="Time (msec)", ylab="Locks per msec")

		invisible(NULL)
	}


	# extract useful information from the raw data file
	#
	# filename: whitespace separated table; lines starting with '#' are skipped
	# returns:  list with
	#   work_amt         - column V2 as read
	#   time_steps_ns    - differences between consecutive entries of column V1
	#   N_entries        - number of rows
	#   avg_time_step_ns - mean of time_steps_ns (NaN when there are < 2 rows)
	# NOTE(review): the _ns names suggest column V1 holds nsec timestamps here -
	# confirm against the producer of the file.
	extract_data <- function(filename) {
		mydata <- read.table(filename, comment.char="#")

		work_amt <- mydata$V2

		# calculate time steps and mean time step
		times <- as.numeric(as.character(mydata$V1))
		N_entries <- length(times)
		# diff() is empty for a single entry, where the explicit
		# times[2:N] - times[1:(N-1)] indexing came out as c(NA, 0)
		time_steps_ns <- diff(times)
		avg_time_step_ns <- mean(time_steps_ns)

		return(list(work_amt=work_amt, time_steps_ns=time_steps_ns,
			N_entries=N_entries, avg_time_step_ns=avg_time_step_ns))
	}


	######################################
	### Main
	######################################

	### collect command line arguments
	# establish optional arguments
	# "-h" and "--help" are automatically in the list
	# optparse option specs: input/output file names plus the x and y axis
	# limits of the output plot. nothing parses these yet (see the commented
	# out parse_args call below).
	option_list <- list(
		# file names
		make_option(c("-i", "--input"), type="character",
			default="welch_input.dat", help="Input data file"),
		make_option(c("-o", "--output"), type="character",
			default="welch_plot.pdf", help="Output file for plotting"),
		# horizontal axis limits
		make_option("--xmin", type="double", default=0,
			help=paste0("Minimum frequency (horizontal axis) ",
				"in output plot [default %default]")),
		make_option("--xmax", type="double", default=40,
			help=paste0("Maximum frequency (horizontal axis) ",
				"in output plot [default %default]")),
		# vertical axis limits; the help text calls the -1 default "adaptive"
		make_option("--ymin", type="double", default=-1,
			help=paste0("Minimum spectrum (vertical axis) ",
				"in output plot [default adaptive]")),
		make_option("--ymax", type="double", default=-1,
			help=paste0("Maximum spectrum (vertical axis) ",
				"in output plot [default adaptive]"))
	)

	## read command line
	#opt <- parse_args(OptionParser(option_list=option_list))
	#
	##max_freq = as.numeric(as.character(args[3]))
	#
	#### read in data
	#mydata = extract_data(opt$input)

	#round_outlier <- function(vec)
	#acq_latency <- function(data)
	#hld_latency <- function(data)
	#plot_densities <- function(vecs, names=NULL, outfile="",
	#plot_density <- function(vec, outfile="",
	#plot_acq_times <- function(data, outfile="")
	#print_vec <- function(vec)
	#print_stats <- function(data)
	#plot_tput <- function(data)
	#mydata = read.table(filename, comment.char="#")