#!/usr/bin/env Rscript
library(ggplot2);
library(plyr);

# get __dirname and load ./_cli.R
args = commandArgs(trailingOnly = F);
dirname = dirname(sub("--file=", "", args[grep("--file", args)]));
source(paste0(dirname, '/_cli.R'), chdir=T);

# Print the usage when --help is given, or when --plot is given without a
# filename (in which case the option parser leaves the option as TRUE).
if (!is.null(args.options$help) ||
    (!is.null(args.options$plot) && args.options$plot == TRUE)) {
  stop("usage: cat file.csv | Rscript compare.R
  --help show this message
  --plot filename save plot to filename");
}

plot.filename = args.options$plot;

dat = read.csv(
  file('stdin'),
  colClasses=c('character', 'character', 'character', 'numeric', 'numeric')
);
dat = data.frame(dat);

dat$nameTwoLines = paste0(dat$filename, '\n', dat$configuration);
dat$name = paste0(dat$filename, ' ', dat$configuration);

# Create a box plot
if (!is.null(plot.filename)) {
  p = ggplot(data=dat);
  p = p + geom_boxplot(aes(x=nameTwoLines, y=rate, fill=binary));
  p = p + ylab("rate of operations (higher is better)");
  p = p + xlab("benchmark");
  p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5));
  ggsave(plot.filename, p);
}

# Compute the shared standard error, as used in Welch's t-test.
welch.sd = function (old.rate, new.rate) {
  old.se.squared = var(old.rate) / length(old.rate)
  new.se.squared = var(new.rate) / length(new.rate)
  return(sqrt(old.se.squared + new.se.squared))
}

# Calculate the improvement confidence interval. The interval is divided by
# old.mu and not new.mu, because old.mu is what the mean improvement is
# calculated relative to. w$parameter holds the Welch-Satterthwaite degrees
# of freedom estimated by t.test.
confidence.interval = function (shared.se, old.mu, w, risk) {
  interval = qt(1 - (risk / 2), w$parameter) * shared.se;
  return(sprintf("±%.2f%%", (interval / old.mu) * 100))
}

# Calculate the statistics table.
statistics = ddply(dat, "name", function(subdat) {
  old.rate = subset(subdat, binary == "old")$rate;
  new.rate = subset(subdat, binary == "new")$rate;

  # Calculate the improvement of the "new" binary relative to the "old" binary.
  old.mu = mean(old.rate);
  new.mu = mean(new.rate);
  improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));

  r = list(
    confidence = "NA",
    improvement = improvement,
    "accuracy (*)" = "NA",
    "(**)" = "NA",
    "(***)" = "NA"
  );

  # Check if there is enough data to calculate the p-value.
  if (length(old.rate) > 1 && length(new.rate) > 1) {
    # Perform a statistical test to see if there actually is a difference in
    # performance.
    w = t.test(rate ~ binary, data=subdat);
    shared.se = welch.sd(old.rate, new.rate)

    # Add user-friendly stars to the table. At least one star is required
    # before the difference can be considered statistically significant.
    confidence = '';
    if (w$p.value < 0.001) {
      confidence = '***';
    } else if (w$p.value < 0.01) {
      confidence = '**';
    } else if (w$p.value < 0.05) {
      confidence = '*';
    }

    r = list(
      confidence = confidence,
      improvement = improvement,
      "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
      "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
      "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
    );
  }

  return(data.frame(r, check.names=FALSE));
});
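
# How to read the table: "improvement" is the relative change of the mean
# rate, the stars mark the p-value from Welch's t-test, and the "accuracy"
# columns give the half-width of the improvement confidence interval at the
# corresponding risk level. As a worked example with hypothetical numbers:
# for old.mu = 100, shared.se = 2 and 10 degrees of freedom, the 5%-risk
# half-width is qt(0.975, 10) * 2 / 100, roughly 4.46%, printed as "±4.46%".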

# Set the benchmark names as the row.names to left-align them in the print.
row.names(statistics) = statistics$name;
statistics$name = NULL;

options(width = 200);
print(statistics);
cat("\n")
cat(sprintf(
"Be aware that when doing many comparisons, the risk of a false-positive
result increases. There are %d comparisons here, so you can expect the
following number of false-positive results:
  %.2f false positives, when considering a 5%% risk acceptance (*, **, ***),
  %.2f false positives, when considering a 1%% risk acceptance (**, ***),
  %.2f false positives, when considering a 0.1%% risk acceptance (***)
",
nrow(statistics),
nrow(statistics) * 0.05,
nrow(statistics) * 0.01,
nrow(statistics) * 0.001))
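
# Example usage (file names are hypothetical; the input CSV has five columns,
# of which this script consumes binary, filename, configuration and rate,
# with binary taking the values "old" and "new"):
#
#   cat results.csv | Rscript compare.R
#   cat results.csv | Rscript compare.R --plot compare.png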