#!/usr/bin/env Rscript
library(ggplot2);
library(plyr);

# get __dirname and load ./_cli.R
args = commandArgs(trailingOnly = F);
dirname = dirname(sub("--file=", "", args[grep("--file", args)]));
source(paste0(dirname, '/_cli.R'), chdir=T);

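# _cli.R (loaded above) is assumed to parse the remaining command-line flags
# into args.options; the check below relies on a missing flag being NULL and
# on a bare --plot (given without a filename) being parsed as TRUE.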
if (!is.null(args.options$help) ||
    (!is.null(args.options$plot) && args.options$plot == TRUE)) {
  stop("usage: cat file.csv | Rscript compare.R
  --help           show this message
  --plot filename  save plot to filename");
}
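
# Typical invocation, following the usage text above (the file names here are
# hypothetical):
#   cat file.csv | Rscript compare.R --plot compare-plot.png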

plot.filename = args.options$plot;

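# The CSV on stdin must include a header row; read.csv takes the column names
# from it, and the code below expects binary, filename, configuration and rate
# among them. colClasses only fixes the types positionally (three character
# columns, then two numeric ones); the second numeric column is read but never
# referenced here. A hypothetical row, assuming that column order:
#   "old","some-benchmark.js","n=1024",1234.5,0.123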
dat = read.csv(
  file('stdin'),
  colClasses=c('character', 'character', 'character', 'numeric', 'numeric')
);
dat = data.frame(dat);

dat$nameTwoLines = paste0(dat$filename, '\n', dat$configuration);
dat$name = paste0(dat$filename, ' ', dat$configuration);

# Create a box plot
if (!is.null(plot.filename)) {
  p = ggplot(data=dat);
  p = p + geom_boxplot(aes(x=nameTwoLines, y=rate, fill=binary));
  p = p + ylab("rate of operations (higher is better)");
  p = p + xlab("benchmark");
  p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5));
  ggsave(plot.filename, p);
}

# Computes the shared standard error, as used in Welch's t-test.
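# It equals sqrt(var(old)/n_old + var(new)/n_new), the denominator of the
# Welch t statistic.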
welch.sd = function (old.rate, new.rate) {
  old.se.squared = var(old.rate) / length(old.rate)
  new.se.squared = var(new.rate) / length(new.rate)
  return(sqrt(old.se.squared + new.se.squared))
}

# Calculate the improvement confidence interval. The interval is divided by
# old.mu rather than new.mu because the improvement is measured relative to
# the old mean.
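# w$parameter holds the Welch-Satterthwaite degrees of freedom reported by
# t.test, so the half-width below is the t quantile for the given risk times
# the shared standard error, printed as a percentage of the old mean.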
confidence.interval = function (shared.se, old.mu, w, risk) {
  interval = qt(1 - (risk / 2), w$parameter) * shared.se;
  return(sprintf("±%.2f%%", (interval / old.mu) * 100))
}

# Calculate the statistics table.
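# ddply splits dat by benchmark name and applies the function below to each
# group, then binds the per-benchmark rows into a single data frame.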
statistics = ddply(dat, "name", function(subdat) {
  old.rate = subset(subdat, binary == "old")$rate;
  new.rate = subset(subdat, binary == "new")$rate;

  # Calculate improvement for the "new" binary compared with the "old" binary
  old.mu = mean(old.rate);
  new.mu = mean(new.rate);
  improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));

  r = list(
    confidence = "NA",
    improvement = improvement,
    "accuracy (*)" = "NA",
    "(**)" = "NA",
    "(***)" = "NA"
  );

  # Check if there is enough data to calculate the p-value.
  if (length(old.rate) > 1 && length(new.rate) > 1) {
    # Perform a statistical test to see if there actually is a difference in
    # performance.
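    # t.test defaults to var.equal=FALSE, i.e. Welch's unequal-variances
    # t-test, which matches the shared standard error from welch.sd above.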
    w = t.test(rate ~ binary, data=subdat);
    shared.se = welch.sd(old.rate, new.rate)

    # Add user-friendly stars to the table. There should be at least one star
    # before you can say that there is an improvement.
    confidence = '';
    if (w$p.value < 0.001) {
      confidence = '***';
    } else if (w$p.value < 0.01) {
      confidence = '**';
    } else if (w$p.value < 0.05) {
      confidence = '*';
    }

    r = list(
      confidence = confidence,
      improvement = improvement,
      "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
      "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
      "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
    );
  }

  return(data.frame(r, check.names=FALSE));
});

# Set the benchmark names as the row.names to left align them in the print.
row.names(statistics) = statistics$name;
statistics$name = NULL;

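# Widen the print output so the statistics table is not wrapped.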
options(width = 200);
print(statistics);
cat("\n")
cat(sprintf(
"Be aware that when doing many comparisons, the risk of a false-positive
result increases. In this case there are %d comparisons, so you can expect
the following number of false-positive results:
  %.2f false positives, when considering a   5%% risk acceptance (*, **, ***),
  %.2f false positives, when considering a   1%% risk acceptance (**, ***),
  %.2f false positives, when considering a 0.1%% risk acceptance (***)
",
nrow(statistics),
nrow(statistics) * 0.05,
nrow(statistics) * 0.01,
nrow(statistics) * 0.001))