zeek/auxil/broker/caf/scripts/caf-prof

#!/usr/bin/env Rscript --vanilla
#
# Generates plots from CAF profiler output.
#
# The script takes a profiler log file on standard input and generates one plot
# according to the command line options given. In general, the script generates
# plots at the granularity of worker threads as well as individual actors.

suppressPackageStartupMessages(library(colorspace))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(grid))
suppressPackageStartupMessages(library(scales)) # pretty_breaks
suppressPackageStartupMessages(library(optparse))

# ----------------------------------------------------------------------------
#                              Helper Functions
# ----------------------------------------------------------------------------

# Picks `n` distinct colors following the "iwanthue" approach.
#   - http://tools.medialab.sciences-po.fr/iwanthue
#   - https://gist.github.com/johnbaums/45b49da5e260a9fc1cd7
#
# A fixed grid of LAB coordinates is optionally restricted to the requested
# hue (h), chroma (c) and luminance (l) window, grid points without a hex
# representation are dropped, and the remainder is clustered with k-means into
# `n` groups. The cluster centers, converted to hex strings, are the palette.
#   n:          number of colors, must be positive.
#   hmin, hmax: hue window, within [0, 360].
#   cmin, cmax: chroma window, within [0, 180].
#   lmin, lmax: luminance window, within [0, 100].
iwanthue <- function(n, hmin=0, hmax=360, cmin=0, cmax=180, lmin=0, lmax=100) {
  stopifnot(hmin >= 0, cmin >= 0, lmin >= 0,
            hmax <= 360, cmax <= 180, lmax <= 100,
            hmin <= hmax, cmin <= cmax, lmin <= lmax,
            n > 0)
  grid_points <- expand.grid(seq(0, 100, 1),
                             seq(-100, 100, 5),
                             seq(-110, 100, 5))
  candidates <- LAB(as.matrix(grid_points))
  # The polar conversion is only needed when the window is narrower than the
  # full range.
  full_range <- hmin == 0 && cmin == 0 && lmin == 0 &&
    hmax == 360 && cmax == 180 && lmax == 100
  if (!full_range) {
    polar <- as(candidates, 'polarLUV')
    pc <- coords(polar)
    inside <- pc[, 'H'] >= hmin & pc[, 'H'] <= hmax &
      pc[, 'C'] >= cmin & pc[, 'C'] <= cmax &
      pc[, 'L'] >= lmin & pc[, 'L'] <= lmax
    polar <- polar[which(inside), ]
    candidates <- as(polar, 'LAB')
  }
  # Keep only the points that hex() can represent.
  representable <- !is.na(hex(candidates))
  candidates <- candidates[which(representable), ]
  clusters <- kmeans(coords(candidates), n, iter.max=100, algorithm="MacQueen")
  hex(LAB(clusters$centers))
}

# Generates a palette with equally spaced hues around the color wheel. Hues
# start at 15 degrees and the chroma is fixed at 100.
#   n: number of colors; 0 yields an empty character vector.
#   l: luminance handed to hcl().
# Returns a character vector with `n` color strings.
color_hue <- function(n, l=65) {
  # n + 1 points over [15, 375]: the last hue coincides with the first one
  # (375 = 15 + 360) and is therefore dropped below.
  hues <- seq(15, 375, length.out=n + 1)
  # seq_len() instead of 1:n, which would be c(1, 0) for n == 0.
  hcl(h=hues, l=l, c=100)[seq_len(n)]
}

# Generates human-readable labels for ticks given in microseconds.
#   ticks: numeric vector of durations in microseconds.
#   sep:   separator placed between the number and its unit.
# Returns a character vector with one label per tick. Each value is expressed
# in the largest of us, ms, s, m, h that keeps it at or above 1 (values below
# 1000 stay in us), rounded to one decimal (whole numbers for us). An NA tick
# yields an NA label and empty input yields character(0).
make_usec_labels <- function(ticks=10^(0:9), sep="") {
  fuse <- function(x, u) paste(x, u, sep=sep)
  unitize <- function(us) {
    if (is.na(us)) {
      NA_character_
    } else if (us < 1e3) {
      fuse(round(us), "us")
    } else if (us < 1e6) {
      fuse(round(us / 1e3, 1), "ms")
    } else if (us < 60 * 1e6) {
      fuse(round(us / 1e6, 1), "s")
    } else if (us < 60 * 60 * 1e6) {
      fuse(round(us / (60 * 1e6), 1), "m")
    } else {
      fuse(round(us / (60 * 60 * 1e6), 1), "h")
    }
  }
  # vapply() pins the result type; sapply() would hand back an empty list for
  # empty input.
  vapply(ticks, unitize, character(1))
}

# FIXME: crappy hack to get a log transformation that keeps 0s as 0s instead of
# setting them to -Inf. This only "makes sense" because we have almost no
# values in [0,1]. The only reason why we need such a thing is to get prettier
# scatterplots where the points don't hang directly on top of the axes lines.
# Another annoying artifact of this hack is that annotation logticks
# between 0 and the first order of magnitude are wrong.
# Possible solution: coord_trans()
#
# How it works: log(1) and the replacement for log(0) would both be 0, so the
# forward transformation first moves 1 to a "magic" value slightly above 1. The
# inverse maps 0 back to 0 and the image of the magic value back to exactly 1.
#   base: base of the logarithm.
# Returns the transformation object built by trans_new().
log_magic_trans <- function(base=10) {
  magic <- 1.000000042
  # Image of the magic value under the forward transformation; the inverse
  # uses it to recognize a transformed 1.
  magic_log <- log(magic, base)
  trans <- function(x) {
    # The magic value itself must not occur in the data.
    stopifnot(! magic %in% x)
    r <- x
    r[r == 1] <- magic
    r <- log(r, base)
    r[is.infinite(r)] <- 0
    r
  }
  inv <- function(x) {
    r <- base^x
    r[r == 1] <- 0
    # Undo the 1 -> magic substitution of the forward transformation.
    r[x == magic_log] <- 1
    r
  }
  # `breaks` is passed by name so that it cannot be taken for another
  # positional parameter of trans_new().
  trans_new(paste0("log-magic-", format(base)), trans, inv,
            breaks=log_breaks(base=base), domain=c(0, Inf))
}

# Builds a time scale that adapts to the data: if the values span more than
# two orders of magnitude, the scale uses the zero-preserving logarithmic
# transformation with ticks at 0 and 10^1 .. 10^10; otherwise it stays linear
# with about ten pretty ticks. Tick labels come from make_usec_labels().
#   .data: numeric vector of the values shown on the axis.
#   fun:   scale constructor to call, e.g. scale_x_continuous.
scale_time <- function(.data, fun) {
  magnitude <- log10(diff(range(.data)))
  use_log <- magnitude > 2
  ticks <- if (use_log) c(0, 10^(1:10)) else pretty_breaks(10)(.data)
  transformation <- if (use_log) log_magic_trans() else identity_trans()
  fun(breaks=ticks, labels=make_usec_labels(ticks), trans=transformation)
}

# Adaptive time scale (see scale_time) for the x axis.
scale_x_time <- function(.data) {
  scale_time(.data, scale_x_continuous)
}
# Adaptive time scale (see scale_time) for the y axis.
scale_y_time <- function(.data) {
  scale_time(.data, scale_y_continuous)
}

# Adds the points of a profile to an existing plot. One geom_point layer is
# added per label, with (1) layers sorted by decreasing number of points and
# (2) an alpha value per label that shrinks as the label's point count grows.
#   p:     the ggplot object to extend.
#   .data: data frame with a `label` column.
# Returns the extended plot. Fails if there is no label at all.
add_points <- function(p, .data) {
  r <- p
  # Add one layer per label, sorted by per-label point frequency to avoid
  # hiding smaller point groups behind big point clouds.
  f <- table(.data$label)
  o <- names(rev(sort(f))) # order of layers
  gp <- function(x) { force(x); geom_point(data=subset(.data, label == x)) }
  r <- Reduce("+", lapply(o, gp), r)
  # Give each layer its own color and assign it a custom alpha value that
  # decreases with the point frequency.
  n <- length(f)
  stopifnot(n > 0)
  # Maps point counts to alpha values: a count of 1 yields 1, the largest
  # count yields amin / (1 + amin), and the counts in between are spaced
  # logarithmically.
  make_alpha <- function(x, amin=0.4) {
    stopifnot(amin > 0, amin < 1)
    lx <- log(x)
    peak <- max(lx)
    if (peak == 0) {
      # No label has more than one point, so the formula below would compute
      # 0 / 0. Nothing needs fading in that case: make everything opaque while
      # keeping the names of `x`.
      lx[] <- 1
      return(lx)
    }
    1 - lx / (peak + amin * peak)
  }
  r <- r + aes(color=label, alpha=label) +
    scale_color_manual(name="ID", values=as.vector(iwanthue(n))) +
    guides(color=guide_legend(override.aes=list(size=4)))
  # Turn each point into a shape and slightly increase alpha.
  if (n > 25) {
    write("** ignoring shapification: more than 25 unique labels", stderr())
    r <- r +
      scale_alpha_manual(values=as.list(make_alpha(f)), guide='none')
  } else {
    r <- r + aes(shape=label) +
      scale_shape_manual(name="ID", values=rev(order(f))) +
      scale_alpha_manual(values=as.list(make_alpha(f, 0.5)), guide='none')
  }
  r
}

# ----------------------------------------------------------------------------
#                               Plot Functions
# ----------------------------------------------------------------------------

# Plots the user and system CPU share (usr/time and sys/time) against the
# `clock` column, one facet per `id` with four facets per row. Each series is
# drawn as points connected by a line.
#   .data:     profile rows with `clock`, `usr`, `sys`, `time` and `id`.
#   color.sys: color of the system series.
#   color.usr: color of the user series.
plot_utilization_time <- function(.data, color.sys="red", color.usr="blue") {
  usr_share <- aes(y=usr/time)
  sys_share <- aes(y=sys/time)
  canvas <- ggplot(.data) + aes(x=clock)
  # Points first, lines afterwards.
  marks <- canvas +
    geom_point(usr_share, shape=4, color=color.usr) +
    geom_point(sys_share, shape=1, color=color.sys)
  curves <- marks +
    geom_line(usr_share, color=color.usr) +
    geom_line(sys_share, color=color.sys)
  curves +
    scale_y_continuous(breaks=seq(0, 1, length.out=5)) +
    labs(x="Time", y="CPU utilization") +
    theme(axis.ticks=element_blank(), axis.text.x=element_blank()) +
    facet_wrap(~ id, ncol=4)
}

# Scatterplot of CPU utilization: user share (usr/time) on the x axis, system
# share (sys/time) on the y axis. Both axes run from 0 to the largest share
# found in the data, and the point size is driven by `time`. The points
# themselves are added by add_points().
plot_utilization_scatter <- function(.data) {
  upper <- max(.data$usr / .data$time, .data$sys / .data$time)
  base_plot <- ggplot(.data, aes(x=usr/time, y=sys/time, size=time)) +
    scale_size(name="ID", range=c(2, 10), guide='none') +
    xlim(0, upper) +
    ylim(0, upper) +
    labs(x="User CPU utilization", y="System CPU utilization")
  add_points(base_plot, .data)
}

# Scatterplot of CPU time: user time on the x axis, system time on the y axis,
# both on the adaptive time scale. The point size is driven by the utilization
# (usr + sys) / time. The points themselves are added by add_points().
plot_time_scatter <- function(.data) {
  base_plot <- ggplot(.data, aes(x=usr, y=sys, size=(usr+sys)/time))
  base_plot <- base_plot +
    scale_size(name="Utilization", range=c(1, 6)) +
    scale_x_time(.data$usr) +
    scale_y_time(.data$sys)
  # TODO: re-enable annotation_logticks(color="grey") when fixing the magic
  # transformation.
  base_plot <- base_plot + labs(x="User CPU time", y="System CPU time")
  add_points(base_plot, .data)
}

# Boxplot of the `cpu` column per label. Labels are ordered by their total
# usr + sys time, largest first, and each box is filled according to the
# label's overall utilization sum(cpu) / sum(time).
plot_time_boxplot <- function(.data) {
  by_label <- group_by(.data, label)
  totals <- summarize(by_label, sort=sum(usr+sys))
  lvls <- rev(totals$label[order(totals$sort)])
  xs <- mutate(by_label, util=sum(cpu)/sum(time))
  # Do not drop zeros on log transformation.
  xs$cpu <- mapvalues(xs$cpu, 0, 1, warn_missing=FALSE)
  xs$label <- factor(xs$label, levels=lvls)
  # TODO: re-enable annotation_logticks(sides="lr", color="grey") when fixing
  # the magic transformation.
  boxes <- ggplot(xs) +
    aes(x=label, y=cpu, fill=util) +
    geom_boxplot(outlier.colour="grey")
  boxes +
    labs(x="ID", y="CPU time") +
    scale_y_time(xs$cpu) +
    scale_fill_gradient(name="Utilization", low="red", high="green",
                        limits=c(0, 1)) +
    theme(axis.text.x=element_text(angle=90, vjust=.5, hjust=1))
}

# Boxplot of the `util` column per label. Labels are ordered by their total
# `cpu`, largest first. The fill shows which mode dominates a label:
# sum(usr - sys) / sum(usr + sys), ranging from -1 (system only) to 1 (user
# only).
plot_utilization_boxplot <- function(.data) {
  by_label <- group_by(.data, label)
  totals <- summarize(by_label, sort=sum(cpu))
  lvls <- rev(totals$label[order(totals$sort)])
  xs <- mutate(by_label, dom=sum(usr-sys)/sum(usr+sys))
  xs$label <- factor(xs$label, levels=lvls)
  boxes <- ggplot(xs) +
    aes(x=label, y=util, fill=dom) +
    geom_boxplot()
  boxes +
    labs(x="ID", y="CPU utilization") +
    scale_fill_gradient2(name="Domination",
                         low="red", mid="white", high="green",
                         breaks=c(-.5, .5), labels=c("System", "User"),
                         limits=c(-1, 1)) +
    theme(axis.text.x=element_text(angle=90, vjust=.5, hjust=1))
}


# Bar chart of the summed CPU time per label, each bar split into its user and
# system part. Bars are ordered by total time, highest bar on the left, and
# the y axis is labelled in time units.
plot_time_barplot <- function(.data) {
  sums <- summarize(group_by(.data, label), usr=sum(usr), sys=sum(sys))
  total <- sums$usr + sums$sys
  molten <- melt(sums, .(label))
  # Rows of the later variable first (sys at bottom).
  molten <- molten[order(molten$variable, decreasing=TRUE), ]
  lvls <- rev(sums$label[order(total)])
  molten$label <- factor(molten$label, levels=lvls) # highest bar on the left
  ticks <- pretty_breaks(10)(total)
  bars <- ggplot(molten) +
    aes(x=label, y=value, fill=variable) +
    geom_bar(stat="identity")
  bars +
    xlab("ID") +
    scale_y_continuous(name="CPU time", breaks=ticks,
                       labels=make_usec_labels(ticks)) +
    scale_fill_manual(name="CPU time", values=rev(color_hue(2)),
                      labels=c("User", "System")) +
    theme(axis.text.x=element_text(angle=90, vjust=.5, hjust=1),
          legend.justification=c(1,1), legend.position=c(1,1))
}

# ----------------------------------------------------------------------------
#                            Command Line Parsing
# ----------------------------------------------------------------------------

# Command line options. The help option is declared here, as the last entry,
# instead of through add_help_option.
# NOTE(review): nothing in this script reads opt$help; presumably parse_args()
# itself prints the usage and exits for an option named `help` -- confirm.
option_list <- list(
  make_option(c("-a", "--actors"), action="store_true", default=FALSE,
              help="generate plots involving actors"),
  make_option(c("-w", "--workers"), action="store_true", default=FALSE,
              help="generate plots involving workers"),
  make_option(c("-l", "--labels"), default=NULL,
              help="two-column file mapping IDs to labels"),
  make_option(c("-f", "--font-size"), type="integer", default=12,
              metavar="number", help="font base size [%default]"),
  make_option(c("-s", "--squeeze"), action="store_true", default=FALSE,
              help="make plot edges as small as possible [%default]"),
  make_option(c("-o", "--output"), default="png",
              help="the image format of the output [%default]"),
  make_option(c("-r", "--read"), default="stdin",
              help="read CAF profile from file [%default]"),
  make_option(c("-h", "--help"), action="store_true", default=FALSE,
              help="display this help and exit")
  )

opt <- parse_args(OptionParser(option_list=option_list,
                               add_help_option=FALSE))

# If neither --actors nor --workers is given, enable both.
if (! opt$actors && ! opt$workers) {
  opt$actors <- TRUE
  opt$workers <- TRUE
}

# ----------------------------------------------------------------------------
#                               Plot Generation
# ----------------------------------------------------------------------------

# Global plot appearance: a large scipen so that numbers are printed in fixed
# rather than scientific notation, the black-and-white theme at the requested
# font size, and legend keys with a white border.
options(scipen=1000)
theme_set(theme_bw(base_size=opt[["font-size"]]))
theme_update(legend.key=element_rect(colour="white"))
# --squeeze: remove the plot margins and set the legend key width.
if (opt$squeeze) {
  theme_update(legend.key.width=unit(3, "lines"),
               plot.margin=unit(c(0, 0, 0, 0), "lines"))
}

# Renders one plot and saves it as "plot-<name>.<format>", where the format is
# the --output option. The target file name is announced on stderr first.
#   name: identifier of the plot, used in the file name.
#   fun:  plot function to call.
#   ...:  arguments forwarded to `fun`.
record <- function(name, fun, ...) {
  stem <- paste("plot", name, sep="-")
  target <- paste(stem, opt$output, sep=".")
  write(paste("-- generating", target), stderr())
  ggsave(target, plot=fun(...), height=10, width=10)
}

# Replaces the NaN entries of a numeric vector (e.g. the result of 0/0) with
# 0. NA and all other values, as well as names, are left untouched.
make_zero <- function(x) {
  x[is.nan(x)] <- 0
  x
}

# Reads a profile table (with header line) and prepares it for plotting:
# rows with a non-positive `time` are discarded, the rows are grouped by
# (id, type) and sorted by `time`, and three columns are derived:
#   cpu  = usr + sys
#   util = cpu / time
#   dom  = (usr - sys) / cpu, with NaN replaced by 0
#   filename: file name or connection handed to read.table().
make_profile <- function(filename) {
  raw <- read.table(filename, header=TRUE)
  positive <- filter(raw, time > 0)
  ordered <- arrange(group_by(positive, id, type), time)
  mutate(ordered, cpu=usr+sys, util=cpu/time, dom=make_zero((usr-sys)/cpu))
}

# Extracts the rows of type "worker" from a profile and labels each row with
# its id (as a factor).
make_workers <- function(prof) {
  workers <- filter(prof, type == "worker")
  workers[["label"]] <- factor(workers[["id"]])
  workers
}

# Extracts the rows of type "actor" from a profile and attaches a `label`
# factor to them.
#   prof:   profile as produced by make_profile().
#   labels: optional file with two columns (id, label), no header. Without
#           it every actor is labelled with its own id.
# Actors whose id does not occur in the label file are labelled "OTHER".
make_actors <- function(prof, labels=NULL) {
  x <- prof %>% filter(type == "actor")
  if (is.null(labels)) {
    x$label <- factor(x$id)
    return(x)
  }
  id_labels <- read.table(labels, col.names=c("id", "label"))
  x <- left_join(x, id_labels, "id")
  # factor() works whether read.table() delivered the labels as strings or as
  # a factor, and it keeps only the labels that actually occur.
  lbl <- factor(x$label)
  # Add the catch-all level only if some actor is in fact unlabelled, so that
  # no empty "OTHER" group is created.
  lbl <- addNA(lbl, ifany=TRUE)
  levels(lbl)[is.na(levels(lbl))] <- "OTHER"
  x$label <- lbl
  x
}

# Collapses each group of `x` into one row holding the summed usr, sys, cpu
# and time plus the derived util and dom values (NaN in dom replaced by 0).
#
# NOTE(review): the argument order matters. summarize() presumably evaluates
# its arguments in sequence and lets later ones see earlier results, in which
# case `util` and `dom` are computed from the already-summed usr/sys/cpu/time
# values rather than from the raw columns -- confirm before reordering.
summarize.prof <- function(x) {
  summarize(x, usr=sum(usr), sys=sum(sys), cpu=sum(cpu), time=sum(time),
            util=sum(cpu)/sum(time), dom=make_zero(sum(usr-sys)/sum(cpu)))
}

# Read the profile from the file given with --read.
prof <- make_profile(file(opt$read))

# Worker plots: each entry is (plot name, plot function, data), rendered in
# the listed order.
if (opt$workers) {
  workers <- make_workers(prof)
  workers.by_id <- summarize.prof(group_by(workers, id, label))
  worker_jobs <- list(
    list("worker-time-bar", plot_time_barplot, workers),
    list("worker-time-facets", plot_utilization_time, workers),
    list("worker-time-scatter", plot_time_scatter, workers),
    list("worker-time-scatter-id", plot_time_scatter, workers.by_id),
    list("worker-util-box", plot_utilization_boxplot, workers),
    list("worker-util-scatter", plot_utilization_scatter, workers),
    list("worker-util-scatter-id", plot_utilization_scatter, workers.by_id)
  )
  for (job in worker_jobs) {
    record(job[[1]], job[[2]], job[[3]])
  }
}

# Actor plots, at three granularities: raw rows, totals per id and totals per
# label.
if (opt$actors) {
  actors <- make_actors(prof, opt$labels)
  actors.by_id <- summarize.prof(group_by(actors, id, label))
  actors.by_label <- summarize.prof(group_by(actors.by_id, label))
  actor_jobs <- list(
    list("actor-time-bar", plot_time_barplot, actors.by_label),
    list("actor-time-scatter", plot_time_scatter, actors),
    list("actor-time-scatter-id", plot_time_scatter, actors.by_id),
    list("actor-time-scatter-label", plot_time_scatter, actors.by_label),
    list("actor-time-box", plot_time_boxplot, actors),
    list("actor-time-box-id", plot_time_boxplot, actors.by_id),
    list("actor-util-box", plot_utilization_boxplot, actors),
    list("actor-util-box-id", plot_utilization_boxplot, actors.by_id),
    list("actor-util-scatter-id", plot_utilization_scatter, actors.by_id),
    list("actor-util-scatter-label", plot_utilization_scatter,
         actors.by_label)
  )
  for (job in actor_jobs) {
    record(job[[1]], job[[2]], job[[3]])
  }
}