# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Do statistical tests on benchmark results.
# This script requires the libraries rjson, R.utils, ggplot2 and data.table.
# Install them prior to running.

# To use the script, first get some benchmark results, for example via
# tools/run_perf.py ../v8-perf/benchmarks/Octane2.1/Octane2.1-TF.json
#  --outdir=out/x64.release-on --outdir-no-patch=out/x64.release-off
# --json-test-results=results-on.json
# --json-test-results-no-patch=results-off.json
# then run this script
# Rscript statistics-for-json.R results-on.json results-off.json ~/SVG
# to produce graphs in the given output directory (here ~/SVG) and to print
# the results of the statistical tests to stdout.


# Third-party packages used below; startup messages are silenced so that
# stdout only carries the statistical report.
suppressMessages(library("rjson"))       # for fromJSON
suppressMessages(library("R.utils"))     # for printf
suppressMessages(library("ggplot2"))     # for plotting
suppressMessages(library("data.table"))  # less broken than data.frame

# NOTE: the workspace is deliberately not cleared with rm(list = ls()).
# Under Rscript the global environment normally starts out empty anyway, and
# clearing it would only destroy the caller's variables when this file is
# source()d from an interactive session.

# Print the outcome of a Shapiro-Wilk normality test for one set of scores.
#
# The Shapiro-Wilk test checks whether a sample is plausibly drawn from a
# normal distribution. The p-value is the probability of seeing a sample at
# least this non-normal if the data really were normally distributed, so a
# small p-value (here < 0.05) is evidence against normality.
# See [wikipedia:Shapiro-Wilk-Test].
#
# label:  word used in the message, e.g. "Patched".
# scores: numeric vector; shapiro.test accepts between 3 and 5000 values.
report_normality <- function(label, scores) {
  norm <- shapiro.test(scores)
  printf("  %s scores look %s distributed (W=%.4f, p=%.4f)\n",
         label,
         if (norm$p.value < 0.05) "not normally" else "normally",
         norm$statistic, norm$p.value)
}

# Save a 50-bin histogram of `scores` as a 7x7 inch image at `path`.
#
# scores: numeric vector of benchmark scores.
# label:  x-axis label (the benchmark name).
# path:   output file name; its extension selects the image format.
save_histogram <- function(scores, label, path) {
  histogram <- ggplot(data = data.frame(x = scores), aes(x)) +
    theme_bw() +
    geom_histogram(bins = 50) +
    ylab("Points") +
    xlab(label)
  ggsave(filename = path, plot = histogram, width = 7, height = 7)
}

# Entry point: compare patched and unpatched benchmark results.
#
# Command line arguments:
#   1. JSON results of the patched build
#   2. JSON results of the unpatched build
#   3. directory that receives the SVG graphs
args <- commandArgs(TRUE)
if (length(args) != 3) {
  printf(paste("usage: Rscript %%this_script patched-results.json",
               "unpatched-results.json output-directory\n"))
} else {
  patch <- fromJSON(file = args[1])
  nopatch <- fromJSON(file = args[2])
  outputPath <- args[3]
  # Make sure the graphs have somewhere to go; a no-op if it already exists.
  dir.create(outputPath, showWarnings = FALSE, recursive = TRUE)

  # shapiro.test refuses samples with fewer than 3 values.
  min_samples <- 3
  # One entry per analysed benchmark; combined into a table after the loop
  # instead of growing a data.table row by row.
  rows <- list()
  sample_sizes <- integer()

  # Traces are paired by position: trace i of the patched file is compared
  # with trace i of the unpatched file.
  for (i in seq_along(patch$traces)) {
    testName <- patch$traces[[i]]$graphs[[2]]
    printf("%s\n", testName)

    # NOTE(review): as.integer truncates fractional scores -- confirm that
    # all benchmarks of interest report integral results.
    patch_res <- as.integer(patch$traces[[i]]$results)
    nopatch_res <- if (i <= length(nopatch$traces)) {
      as.integer(nopatch$traces[[i]]$results)
    } else {
      integer()
    }

    too_few <- min(length(patch_res), length(nopatch_res)) < min_samples
    if (too_few) {
      printf("  Skipped: need at least %d results per build (got %d and %d)\n",
             min_samples, length(patch_res), length(nopatch_res))
      next
    }

    report_normality("Patched", patch_res)
    report_normality("Unpatched", nopatch_res)

    save_histogram(patch_res, testName,
                   file.path(outputPath, paste0(testName, ".svg")))
    save_histogram(nopatch_res, testName,
                   file.path(outputPath, paste0(testName, "-before.svg")))

    # The Wilcoxon rank-sum (Mann-Whitney U) test
    mww <- wilcox.test(patch_res, nopatch_res, conf.int = TRUE, exact = TRUE)
    printf(paste("  Wilcoxon U-test W=%.4f, p=%.4f,",
                 "confidence interval [%.1f, %.1f],",
                 "est. effect size %.1f \n"),
           mww$statistic, mww$p.value,
           mww$conf.int[1], mww$conf.int[2], mww$estimate)
    # L/R: lower/upper confidence bound, E: estimated effect size,
    # yL: benchmark name used as axis label.
    rows[[length(rows) + 1]] <- list(
      L = mww$conf.int[1], R = mww$conf.int[2],
      E = unname(mww$estimate), p.value = unname(mww$p.value),
      yL = testName, p.value.sig = mww$p.value < 0.05)
    sample_sizes <- c(sample_sizes, length(patch_res))

    # Welch's unequal-variance t-test
    welch <- t.test(patch_res, nopatch_res, paired = FALSE)
    printf(paste("  Welch t-test t=%.4f, df = %.2f, p=%.4f,",
                 "confidence interval [%.1f, %.1f], mean diff %.1f \n"),
           welch$statistic, welch$parameter, welch$p.value,
           welch$conf.int[1], welch$conf.int[2],
           welch$estimate[1] - welch$estimate[2])
  }

  if (length(rows) == 0) {
    printf("No benchmark had enough results; skipping the summary plot\n")
  } else {
    df <- rbindlist(rows)
    # Sort benchmarks by estimated effect size and number them left to right.
    df2 <- cbind(x = seq_len(nrow(df)), df[order(E), ])
    # An explicit two-level factor keeps colours and legend labels correct
    # even when all results fall on the same side of the threshold.
    df2$significance <- factor(df2$p.value.sig, levels = c(FALSE, TRUE),
                               labels = c("not significant", "significant"))
    runs <- paste(unique(sample_sizes), collapse = "/")

    speedup <- ggplot(df2, aes(x = x, y = E, colour = significance)) +
      geom_errorbar(aes(ymin = L, ymax = R), colour = "black") +
      geom_point(size = 4) +
      scale_x_discrete(limits = df2$yL,
                       name = paste("Benchmark, n=", runs)) +
      theme_bw() +
      geom_hline(yintercept = 0) +
      ylab("Est. Effect Size in Points") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
      scale_colour_manual(name = "Statistical Significance (MWW, p < 0.05)",
                          values = c("not significant" = "red",
                                     "significant" = "green"),
                          drop = FALSE) +
      theme(legend.justification = c(0, 1), legend.position = c(0, 1))
    print(speedup)
    ggsave(filename = file.path(outputPath, "speedup-estimates.svg"),
           plot = speedup, width = 7, height = 7)
  }
}
