#!/usr/bin/Rscript --vanilla
#
# Generate stacked chart of API latencies by API from a TSV data-set
#
# ariel faigon - Dec 2012
#
.libPaths(c('~/local/lib/R',
             '/usr/lib/R/library',
             '/usr/lib/R/site-library'
))

suppressPackageStartupMessages(library(ggplot2))
# grid lib needed for 'unit()':
suppressPackageStartupMessages(library(grid))

#
# Constants: width, height, resolution, font-colors and styles
# Adapt to taste
#
wh.ratio = 2
WIDTH = 8
HEIGHT = WIDTH / wh.ratio
DPI = 200
FONTSIZE = 11
MyGray = gray(0.5)

title.theme   = element_text(family="FreeSans", face="bold.italic",
                            size=FONTSIZE)
x.label.theme = element_text(family="FreeSans", face="bold.italic",
                            size=FONTSIZE-1, vjust=-0.1)
y.label.theme = element_text(family="FreeSans", face="bold.italic",
                           size=FONTSIZE-1, angle=90, vjust=0.2)
x.axis.theme  = element_text(family="FreeSans", face="bold",
                            size=FONTSIZE-1, colour=MyGray)
y.axis.theme  = element_text(family="FreeSans", face="bold",
                            size=FONTSIZE-1, colour=MyGray)


# eprintf: handy function for prints/debugging
eprintf <- function(...) cat(sprintf(...), sep='', file=stderr())

#
# Function generating well-spaced & well-labeled y-axis (count) breaks
#
yscale_breaks <- function(from.to) {
    from <- 0
    to <- from.to[2]
    # round to 10 ceiling
    to <- ceiling(to / 10.0) * 10
    # Count major breaks on 10^N boundaries, include the 0
    n.maj = 1 + ceiling(log(to) / log(10))
    # if major breaks are too few, add minor-breaks half-way between them
    n.breaks <- ifelse(n.maj < 5, max(5, n.maj*2+1), n.maj)
    breaks <- as.integer(seq(from, to, length.out=n.breaks))
    # eprintf("from=%g to=%g n.maj=%g breaks %g\n", from, to, n.maj, breaks)
    breaks
}

#
# -- main
#

# -- process the command line args:  [tsv_file [png_file]]
#    (use defaults if they aren't provided)
#
argv <- commandArgs(trailingOnly = TRUE)
if (is.null(argv) || (length(argv) < 1)) {
    argv <- c(Sys.glob('*api-lat.tsv')[1])
}
tsvfile <- argv[1]
stopifnot(! is.na(tsvfile))
pngfile <- ifelse(is.na(argv[2]), paste(tsvfile, '.png', sep=''), argv[2])

# -- Read the data from the TSV file into an internal data.frame d
d <- read.csv(tsvfile, sep='\t', head=F)

# -- Give each data column a human readable name
names(d) <- c('API', 'Latency')

#
# -- Convert microseconds Latency (our weblog resolution) to seconds
#
d <- transform(d, Latency=Latency/1e6)

#
# -- Trim the latency axis:
#       Drop the few 0.001% extreme-slowest outliers on the right
#       to prevent them from pushing the bulk of the data to the left
Max.Lat <- quantile(d$Latency, probs=0.99999)
d <- subset(d, Latency < Max.Lat)


#
# -- API factor pruning
#       Drop rows where the APIs is less than small % of total calls
#
Rare.APIs.pct <- 0.001
if (Rare.APIs.pct > 0.0) {
    d.N <- nrow(d)
    API.counts <- table(d$API)
    d <- transform(d, CallPct=100.0*API.counts[d$API]/d.N)
    d <- d[d$CallPct > Rare.APIs.pct, ]
    d.N.new <- nrow(d)

    # eprintf("Before %d line   after %d lines (diff: %d)\n",
    #   d.N, d.N.new, d.N - d.N.new)
}

#
# -- Adjust legend item-height &font-size
#    to the number of distinct APIs we have
#
API.count <- nlevels(as.factor(d$API))
Legend.LineSize <- ifelse(API.count < 20, 1.0, 20.0/API.count)
Legend.FontSize <- max(6, as.integer(Legend.LineSize * (FONTSIZE - 1)))
legend.theme  = element_text(family="FreeSans", face="bold.italic",
                            size=Legend.FontSize,
                            colour=gray(0.3))

# eprintf("API.count=%d  Legend.LineSize.Factor=%g Legend.FontSize=%g\n",
#       API.count, Legend.LineSize, Legend.FontSize)


# -- set latency (X-axis) breaks and labels (s.b made more generic)
lat.breaks <- c(0, 0.0001, 0.001, 0.01, 0.1, 1, 10)
lat.labels <- sprintf("%g", lat.breaks)

#
# -- Generate the chart using ggplot
#
p <- ggplot(data=d, aes(x=Latency, y=..count../1000.0, group=API, fill=API)) +
       geom_bar(binwidth=0.01) +
          scale_x_log10(breaks=lat.breaks, labels=lat.labels) +
          scale_y_continuous(breaks=yscale_breaks) +
          ggtitle('APIs Calls & Latency Distribution') +
          xlab('Latency in seconds - log(10) scale') +
          ylab('Call count (in 1000s)') +
          theme(
                plot.title=title.theme,
                axis.title.y=y.label.theme,
                axis.title.x=x.label.theme,
                axis.text.x=x.axis.theme,
                axis.text.y=y.axis.theme,
                legend.text=legend.theme,
                legend.key.height=unit(Legend.LineSize, "line")
          )

#
# -- Save the plot into the png file
#
ggsave(p, file=pngfile, width=WIDTH, height=HEIGHT, dpi=DPI)

