#!/usr/bin/python
# encoding: utf-8

# Copyright 2017 Google Inc.
#
# Use of this source code is governed by a BSD-style license that can be found
# in the LICENSE file.
#
# This is an A/B test utility script used by calmbench.py
#
# For each bench, we get a distribution of min_ms measurements from nanobench.
# From that, we try to recover the 1/3 and 2/3 quantiles of the distribution.
# If the range (1/3 quantile, 2/3 quantile) is completely disjoint between A
# and B, we report that as a regression.
#
# The more measurements we have for a bench, the more accurate our quantiles
# are. However, taking more measurements is time-consuming. Hence we'll prune
# out benches and only take more measurements for benches whose current
# quantile ranges are disjoint.
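#
# For example, with FACTOR = 3 and 7 sorted measurements for a bench,
# get_lower_upper below keeps values[2] and values[4] as the (lower, upper)
# quantile pair; the bench becomes a suspect only when A's pair and B's pair
# are disjoint by more than the DIFF_T margin (see get_suspects and
# different_enough).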
#
# P.S. This script was mechanically translated from a Ruby script, so it may
# be ugly...

import re
import os
import sys
import time
import json
import subprocess
import shlex
import multiprocessing
import traceback
from argparse import ArgumentParser
from multiprocessing import Process
from threading import Thread
from threading import Lock
from pdb import set_trace


40HELP = """
41\033[31mPlease call calmbench.py to drive this script if you're not doing so.
42This script is not supposed to be used by itself. (At least, it's not easy to
43use by itself. The calmbench bots may use this script directly.)
44\033[0m
45"""
46
47FACTOR  = 3     # lower/upper quantile factor
48DIFF_T  = 0.99  # different enough threshold
49TERM    = 10    # terminate after this no. of iterations without suspect changes
50MAXTRY  = 30    # max number of nanobench tries to narrow down suspects
51
52UNITS   = "ns µs ms s".split()
53
54
55timesLock = Lock()
56timesA  = {}
57timesB  = {}
58
59
60def parse_args():
61  parser = ArgumentParser(description=HELP)
62
63  parser.add_argument('outdir', type=str, help="output directory")
64  parser.add_argument('a', type=str, help="name of A")
65  parser.add_argument('b', type=str, help="name of B")
66  parser.add_argument('nano_a', type=str, help="path to A's nanobench binary")
67  parser.add_argument('nano_b', type=str, help="path to B's nanobench binary")
68  parser.add_argument('arg_a', type=str, help="args for A's nanobench run")
69  parser.add_argument('arg_b', type=str, help="args for B's nanobench run")
70  parser.add_argument('repeat', type=int, help="number of initial runs")
71  parser.add_argument('skip_b', type=str, help=("whether to skip running B"
72                                                " ('true' or 'false')"))
  parser.add_argument('config', type=str, help="nanobench config")
  parser.add_argument('threads', type=int, help="number of threads to run")
  parser.add_argument('noinit', type=str, help=("whether to skip the initial"
                                                " runs ('true' or 'false')"))

  parser.add_argument('--concise', dest='concise', action="store_true",
      help="If set, no verbose thread info will be printed.")
  parser.set_defaults(concise=False)

  # Additional args for bots
  BHELP = "bot specific options"
  parser.add_argument('--githash', type=str, default="", help=BHELP)
  parser.add_argument('--keys', type=str, default=[], nargs='+', help=BHELP)

  args = parser.parse_args()
  args.skip_b = args.skip_b == "true"
  args.noinit = args.noinit == "true"

  if args.threads == -1:
    args.threads = 1
    if args.config in ["8888", "565"]: # multi-thread for CPU only
        args.threads = max(1, multiprocessing.cpu_count() / 2)

  return args

def append_dict_sorted_array(dict_array, key, value):
  if key not in dict_array:
    dict_array[key] = []
  dict_array[key].append(value)
  dict_array[key].sort()


def add_time(args, name, bench, t, unit):
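  """Normalize t (reported in ns, µs, ms, or s) to nanoseconds and record it
  under bench in timesA or timesB, depending on which run it came from."""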
  normalized_t = t * 1000 ** UNITS.index(unit)
  if name.startswith(args.a):
    append_dict_sorted_array(timesA, bench, normalized_t)
  else:
    append_dict_sorted_array(timesB, bench, normalized_t)


def append_times_from_file(args, name, filename):
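  """Parse a nanobench output file: for each result line whose config column
  (items[9]) matches args.config, extract the time and its unit from items[3]
  and record it under the bench named in items[10]."""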
  with open(filename) as f:
    lines = f.readlines()
  for line in lines:
    items = line.split()
    if len(items) > 10:
      bench = items[10]
      matches = re.search("([+-]?\d*\.?\d+)(s|ms|µs|ns)", items[3])
      if (not matches or items[9] != args.config):
        continue
      time_num = matches.group(1)
      time_unit = matches.group(2)
      add_time(args, name, bench, float(time_num), time_unit)


class ThreadWithException(Thread):
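  """Thread that remembers any exception raised by its target so that
  ThreadRunner.wait can collect and re-raise it after join()."""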
  def __init__(self, target):
    super(ThreadWithException, self).__init__(target = target)
    self.exception = None

  def run(self):
    try:
      self._Thread__target(*self._Thread__args, **self._Thread__kwargs)
    except BaseException as e:
      self.exception = e

  def join(self, timeout=None):
    super(ThreadWithException, self).join(timeout)


class ThreadRunner:
144  """Simplest and stupidiest threaded executer."""
  def __init__(self, args):
    self.concise = args.concise
    self.threads = []

  def add(self, args, fn):
    if len(self.threads) >= args.threads:
      self.wait()
    t = ThreadWithException(target = fn)
    t.daemon = True
    self.threads.append(t)
    t.start()

  def wait(self):
    def spin():
      i = 0
      spinners = [".  ", ".. ", "..."]
      while len(self.threads) > 0:
        timesLock.acquire()
        sys.stderr.write(
            "\r" + spinners[i % len(spinners)] +
            " (%d threads running)" % len(self.threads) +
            "           \r" # spaces for erasing characters
        )
        timesLock.release()
        time.sleep(0.5)
        i += 1

    if not self.concise:
      ts = Thread(target = spin)
      ts.start()

    for t in self.threads:
      t.join()

    exceptions = []
    for t in self.threads:
      if t.exception:
        exceptions.append(t.exception)

    self.threads = []

    if not self.concise:
      ts.join()

    if len(exceptions):
      for exc in exceptions:
        print exc
      raise exceptions[0]


def split_arg(arg):
  raw = shlex.split(arg)
  result = []
  for r in raw:
    if '~' in r:
      result.append(os.path.expanduser(r))
    else:
      result.append(r)
  return result


def run(args, threadRunner, name, nano, arg, i):
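  """Schedule one nanobench run of name's binary (with arg) on threadRunner.

  A positive i marks an initial run, which may be skipped via noinit/skip_b;
  i <= 0 marks a suspect-narrowing run, which always executes. Output goes to
  <outdir>/<name>.out<i> and is then parsed into timesA/timesB."""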
  def task():
    file_i = "%s/%s.out%d" % (args.outdir, name, i)

    should_run = not args.noinit and not (name == args.b and args.skip_b)
    if i <= 0:
      should_run = True # always run for suspects

    if should_run:
      if i > 0:
        timesLock.acquire()
        print "Init run %d for %s..." % (i, name)
        timesLock.release()
      subprocess.check_call(["touch", file_i])
      with open(file_i, 'w') as f:
        subprocess.check_call([nano] + split_arg(arg) +
                              ["--config", args.config], stderr=f, stdout=f)

    timesLock.acquire()
    append_times_from_file(args, name, file_i)
    timesLock.release()

  threadRunner.add(args, task)


def init_run(args):
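  """Kick off the initial runs: args.repeat runs each of A and B (more if
  there are idle threads to fill)."""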
  threadRunner = ThreadRunner(args)
  for i in range(1, max(args.repeat, args.threads / 2) + 1):
    run(args, threadRunner, args.a, args.nano_a, args.arg_a, i)
    run(args, threadRunner, args.b, args.nano_b, args.arg_b, i)
  threadRunner.wait()


def get_lower_upper(values):
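  """Return roughly the 1/FACTOR and (FACTOR-1)/FACTOR quantiles of a sorted
  list (i.e. the 1/3 and 2/3 quantiles with FACTOR = 3)."""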
  i = max(0, (len(values) - 1) / FACTOR)
  return values[i], values[-i - 1]


def different_enough(lower1, upper2):
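  """True iff upper2 is more than (1 - DIFF_T) below lower1, i.e. the two
  quantile ranges are disjoint with a small margin."""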
  return upper2 < DIFF_T * lower1


# TODO(liyuqian): we use this hacky criterion mainly because I didn't have
# time to study more rigorous statistical tests. We should adopt a more
# rigorous test in the future.
def get_suspects():
  suspects = []
  for bench in timesA.keys():
    if bench not in timesB:
      continue
    lowerA, upperA = get_lower_upper(timesA[bench])
    lowerB, upperB = get_lower_upper(timesB[bench])
    if different_enough(lowerA, upperB) or different_enough(lowerB, upperA):
      suspects.append(bench)
  return suspects


def process_bench_pattern(s):
264  if ".skp" in s: # skp bench won't match their exact names...
    return "^\"" + s[0:(s.index(".skp") + 3)] + "\""
  else:
    return "^\"" + s + "\"$"


def suspects_arg(suspects):
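  """Build the extra --match argument that restricts the next nanobench runs
  to the given suspect benches."""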
  patterns = map(process_bench_pattern, suspects)
  return " --match " + (" ".join(patterns))


def median(array):
  return array[len(array) / 2]


def regression(bench):
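  """Ratio of B's median time to A's median time for bench (1 if A's median
  is 0, i.e. a bad bench)."""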
  a = median(timesA[bench])
  b = median(timesB[bench])
  if (a == 0): # bad bench, just return no regression
    return 1
  return b / a


def percentage(x):
  return (x - 1) * 100


def format_r(r):
  return ('%6.2f' % percentage(r)) + "%"


def normalize_r(r):
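  """Map the raw ratio r = B/A to a signed regression: r - 1 (positive) when
  B is slower, 1 - 1/r (negative) when B is faster, and 0 when they tie."""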
  if r > 1.0:
    return r - 1.0
  else:
    return 1.0 - 1/r


def test():
  args = parse_args()

  init_run(args)
  last_unchanged_iter = 0
  last_suspect_number = -1
  tryCnt = 0
  it = 0
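  # Keep narrowing down: re-run nanobench on the current suspects until no
  # suspects remain, the suspect count has not changed for TERM iterations,
  # or MAXTRY narrowing runs have been used.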
  while tryCnt < MAXTRY:
    it += 1
    suspects = get_suspects()
    if len(suspects) != last_suspect_number:
      last_suspect_number = len(suspects)
      last_unchanged_iter = it
    if (len(suspects) == 0 or it - last_unchanged_iter >= TERM):
      break

    print "Number of suspects at iteration %d: %d" % (it, len(suspects))
    threadRunner = ThreadRunner(args)
    for j in range(1, max(1, args.threads / 2) + 1):
      run(args, threadRunner, args.a, args.nano_a,
          args.arg_a + suspects_arg(suspects), -j)
      run(args, threadRunner, args.b, args.nano_b,
          args.arg_b + suspects_arg(suspects), -j)
      tryCnt += 1
    threadRunner.wait()

  suspects = get_suspects()
  if len(suspects) == 0:
331    print ("%s and %s does not seem to have significant " + \
332           "performance differences.") % (args.a, args.b)
  else:
    suspects.sort(key = regression)
    print "%s (compared to %s) is likely" % (args.a, args.b)
    for suspect in suspects:
      r = regression(suspect)
      if r < 1:
        print "\033[31m  %s slower in %s\033[0m" % \
                (format_r(1/r), suspect)
      else:
        print "\033[32m  %s faster in %s\033[0m" % \
                (format_r(r), suspect)

  with open("%s/bench_%s_%s.json" % (args.outdir, args.a, args.b), 'w') as f:
    results = {}
    for bench in timesA:
      r = regression(bench) if bench in suspects else 1.0
      results[bench] = {
        args.config: {
          "signed_regression": normalize_r(r),
          "lower_quantile_ms": get_lower_upper(timesA[bench])[0] * 1e-6,
          "upper_quantile_ms": get_lower_upper(timesA[bench])[1] * 1e-6,
          "options": {
            # TODO(liyuqian): let ab.py call nanobench with --outResultsFile so
            # nanobench could generate the json for us that's exactly the same
            # as that being used by perf bots. Currently, we cannot guarantee
            # that bench is the name (e.g., bench may have additional resolution
            # information appended after name).
            "name": bench
          }
        }
      }

    output = {"results": results}
    if args.githash:
      output["gitHash"] = args.githash
    if args.keys:
      keys = {}
      for i in range(len(args.keys) / 2):
        keys[args.keys[i * 2]] = args.keys[i * 2 + 1]
      output["key"] = keys
    f.write(json.dumps(output, indent=4))
    print ("\033[36mJSON results available in %s\033[0m" % f.name)

  with open("%s/bench_%s_%s.csv" % (args.outdir, args.a, args.b), 'w') as out:
377    out.write(("bench, significant?, raw regresion, " +
378                   "%(A)s quantile (ns), %(B)s quantile (ns), " +
379                   "%(A)s (ns), %(B)s (ns)\n") % {'A': args.a, 'B': args.b})
    for bench in suspects + timesA.keys():
      if (bench not in timesA or bench not in timesB):
        continue
      ta = timesA[bench]
      tb = timesB[bench]
      out.write(
          "%s, %s, %f, " % (bench, bench in suspects, regression(bench)) +
          ' '.join(map(str, get_lower_upper(ta))) + ", " +
          ' '.join(map(str, get_lower_upper(tb))) + ", " +
          ("%s, %s\n" % (' '.join(map(str, ta)), ' '.join(map(str, tb))))
      )
    print (("\033[36m" +
           "Compared %d benches. " +
393           "%d of them seem to be significantly differrent." +
394           "\033[0m") %
395           (len([x for x in timesA if x in timesB]), len(suspects)))
396    print ("\033[36mPlease see detailed bench results in %s\033[0m" %
397            out.name)
398
399
400if __name__ == "__main__":
401  try:
402    test()
403  except Exception as e:
404    print e
405    print HELP
406    traceback.print_exc()
407    raise e
408