#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Google Inc.
#
# Use of this source code is governed by a BSD-style license that can be found
# in the LICENSE file.
#
# This is an A/B test utility script used by calmbench.py
#
# For each bench, we get a distribution of min_ms measurements from nanobench.
# From that, we try to recover the 1/3 and 2/3 quantiles of the distribution.
# If range (1/3 quantile, 2/3 quantile) is completely disjoint between A and B,
# we report that as a regression.
#
# The more measurements we have for a bench, the more accurate our quantiles
# are. However, taking more measurements is time consuming. Hence we'll prune
# out benches and only take more measurements for benches whose current
# quantile ranges are disjoint.
#
# P.S. The current script is brute forcely translated from a ruby script. So it
# may be ugly...

import re
import os
import sys
import time
import json
import subprocess
import shlex
import multiprocessing
import traceback
from argparse import ArgumentParser
from multiprocessing import Process
from threading import Thread
from threading import Lock
from pdb import set_trace


HELP = """
\033[31mPlease call calmbench.py to drive this script if you're not doing so.
This script is not supposed to be used by itself. (At least, it's not easy to
use by itself. The calmbench bots may use this script directly.)
\033[0m
"""

FACTOR = 3     # lower/upper quantile factor
DIFF_T = 0.99  # different enough threshold
TERM = 10      # terminate after this no. of iterations without suspect changes
MAXTRY = 30    # max number of nanobench tries to narrow down suspects

UNITS = "ns µs ms s".split()


# timesA/timesB map bench name -> sorted list of measurements normalized to
# ns. They are filled in concurrently by worker threads; all access is
# serialized through timesLock.
timesLock = Lock()
timesA = {}
timesB = {}


def parse_args():
  """Parse and post-process the command-line arguments from calmbench.py."""
  parser = ArgumentParser(description=HELP)

  parser.add_argument('outdir', type=str, help="output directory")
  parser.add_argument('a', type=str, help="name of A")
  parser.add_argument('b', type=str, help="name of B")
  parser.add_argument('nano_a', type=str, help="path to A's nanobench binary")
  parser.add_argument('nano_b', type=str, help="path to B's nanobench binary")
  parser.add_argument('arg_a', type=str, help="args for A's nanobench run")
  parser.add_argument('arg_b', type=str, help="args for B's nanobench run")
  parser.add_argument('repeat', type=int, help="number of initial runs")
  parser.add_argument('skip_b', type=str, help=("whether to skip running B"
                                                " ('true' or 'false')"))
  parser.add_argument('config', type=str, help="nanobench config")
  parser.add_argument('threads', type=int, help="number of threads to run")
  # BUG FIX: this help text used to be a copy-paste of skip_b's.
  parser.add_argument('noinit', type=str, help=("whether to skip the initial"
                                                " runs ('true' or 'false')"))

  parser.add_argument('--concise', dest='concise', action="store_true",
                      help="If set, no verbose thread info will be printed.")
  parser.set_defaults(concise=False)

  # Additional args for bots
  BHELP = "bot specific options"
  parser.add_argument('--githash', type=str, default="", help=BHELP)
  parser.add_argument('--keys', type=str, default=[], nargs='+', help=BHELP)

  args = parser.parse_args()
  args.skip_b = args.skip_b == "true"
  args.noinit = args.noinit == "true"

  if args.threads == -1:
    args.threads = 1
    if args.config in ["8888", "565"]:  # multi-thread for CPU only
      args.threads = max(1, multiprocessing.cpu_count() // 2)

  return args


def append_dict_sorted_array(dict_array, key, value):
  """Append value to dict_array[key] (creating it if needed), keeping the
  list sorted."""
  if key not in dict_array:
    dict_array[key] = []
  dict_array[key].append(value)
  dict_array[key].sort()


def add_time(args, name, bench, t, unit):
  """Record one measurement, normalized to ns, under A or B based on name."""
  normalized_t = t * 1000 ** UNITS.index(unit)
  if name.startswith(args.a):
    append_dict_sorted_array(timesA, bench, normalized_t)
  else:
    append_dict_sorted_array(timesB, bench, normalized_t)


# Matches a time like "12.3ms" in nanobench's min_ms column. Compiled once
# (hoisted out of the per-line loop). The '.' used to be unescaped, which
# would also have accepted garbage such as "12x3ms" and then crashed float().
TIME_RE = re.compile(r"([+-]?\d*\.?\d+)(s|ms|µs|ns)")


def append_times_from_file(args, name, filename):
  """Parse one nanobench output file and record all of its measurements.

  Only lines whose config column (items[9]) matches args.config are used;
  the bench name is expected in column 10 and the time in column 3.
  """
  with open(filename) as f:
    for line in f:
      items = line.split()
      if len(items) > 10:
        bench = items[10]
        matches = TIME_RE.search(items[3])
        if not matches or items[9] != args.config:
          continue
        time_num = matches.group(1)
        time_unit = matches.group(2)
        add_time(args, name, bench, float(time_num), time_unit)


class ThreadWithException(Thread):
  """A Thread that captures any exception from its target so that
  ThreadRunner.wait can re-raise it on the main thread."""

  def __init__(self, target):
    super().__init__(target=target)
    self.exception = None

  def run(self):
    try:
      # Thread.run() invokes the target; the old Python-2 private-name
      # mangling (self._Thread__target) no longer exists on Python 3.
      super().run()
    except BaseException as e:
      self.exception = e


class ThreadRunner:
  """Simplest and stupidiest threaded executer."""

  def __init__(self, args):
    self.concise = args.concise
    self.threads = []

  def add(self, args, fn):
    """Start fn on a new daemon thread, first draining the pool if the
    thread limit has been reached."""
    if len(self.threads) >= args.threads:
      self.wait()
    t = ThreadWithException(target=fn)
    t.daemon = True
    self.threads.append(t)
    t.start()

  def wait(self):
    """Join all running threads, then re-raise the first captured
    exception (after printing all of them)."""
    def spin():
      # Progress spinner on stderr; exits once the thread list is cleared.
      i = 0
      spinners = [".  ", ".. ", "..."]
      while len(self.threads) > 0:
        timesLock.acquire()
        sys.stderr.write(
            "\r" + spinners[i % len(spinners)] +
            " (%d threads running)" % len(self.threads) +
            "    \r"  # spaces for erasing characters
        )
        timesLock.release()
        time.sleep(0.5)
        i += 1

    if not self.concise:
      ts = Thread(target=spin)
      ts.start()

    for t in self.threads:
      t.join()

    exceptions = []
    for t in self.threads:
      if t.exception:
        exceptions.append(t.exception)

    self.threads = []

    if not self.concise:
      ts.join()

    if len(exceptions):
      for exc in exceptions:
        print(exc)
      raise exceptions[0]


def split_arg(arg):
  """shlex-split arg, expanding '~' in any token to the user's home dir."""
  raw = shlex.split(arg)
  result = []
  for r in raw:
    if '~' in r:
      result.append(os.path.expanduser(r))
    else:
      result.append(r)
  return result


def run(args, threadRunner, name, nano, arg, i):
  """Schedule one nanobench run on the thread runner.

  i > 0 marks an initial run; i <= 0 marks a suspect-narrowing run, which is
  always executed regardless of noinit/skip_b.
  """
  def task():
    file_i = "%s/%s.out%d" % (args.outdir, name, i)

    should_run = not args.noinit and not (name == args.b and args.skip_b)
    if i <= 0:
      should_run = True  # always run for suspects

    if should_run:
      if i > 0:
        timesLock.acquire()
        print("Init run %d for %s..." % (i, name))
        timesLock.release()
      # open(..., 'w') creates the file itself; the old non-portable
      # subprocess call to "touch" was redundant and has been removed.
      with open(file_i, 'w') as f:
        subprocess.check_call([nano] + split_arg(arg) +
                              ["--config", args.config], stderr=f, stdout=f)

    timesLock.acquire()
    append_times_from_file(args, name, file_i)
    timesLock.release()

  threadRunner.add(args, task)


def init_run(args):
  """Kick off the initial measurement runs for both A and B."""
  threadRunner = ThreadRunner(args)
  for i in range(1, max(args.repeat, args.threads // 2) + 1):
    run(args, threadRunner, args.a, args.nano_a, args.arg_a, i)
    run(args, threadRunner, args.b, args.nano_b, args.arg_b, i)
  threadRunner.wait()


def get_lower_upper(values):
  """Return the approximate 1/FACTOR and (FACTOR-1)/FACTOR quantiles of a
  sorted list."""
  i = max(0, (len(values) - 1) // FACTOR)
  return values[i], values[-i - 1]


def different_enough(lower1, upper2):
  """True iff upper2 is below lower1 by more than the DIFF_T threshold."""
  return upper2 < DIFF_T * lower1


# TODO(liyuqian): we used this hacky criteria mainly because that I didn't have
# time to study more rigorous statistical tests. We should adopt a more rigorous
# test in the future.
def get_suspects():
  """Return benches whose A and B quantile ranges are disjoint."""
  suspects = []
  for bench in timesA.keys():
    if bench not in timesB:
      continue
    lowerA, upperA = get_lower_upper(timesA[bench])
    lowerB, upperB = get_lower_upper(timesB[bench])
    if different_enough(lowerA, upperB) or different_enough(lowerB, upperA):
      suspects.append(bench)
  return suspects


def process_bench_pattern(s):
  """Turn a bench name into a regex for nanobench --match."""
  if ".skp" in s:  # skp bench won't match their exact names...
    # NOTE(review): index + 3 keeps the prefix only up to ".sk" (not ".skp"),
    # and the '.' is unescaped; preserved as-is since nanobench prefix
    # matching depends on this exact pattern.
    return "^\"" + s[0:(s.index(".skp") + 3)] + "\""
  else:
    return "^\"" + s + "\"$"


def suspects_arg(suspects):
  """Build the --match argument restricting nanobench to the suspects."""
  patterns = map(process_bench_pattern, suspects)
  return " --match " + (" ".join(patterns))


def median(array):
  """Return the upper median of a sorted list."""
  return array[len(array) // 2]


def regression(bench):
  """Return B's median time over A's for bench (> 1 means B is slower)."""
  a = median(timesA[bench])
  b = median(timesB[bench])
  if a == 0:  # bad bench, just return no regression
    return 1
  return b / a


def percentage(x):
  """Map a ratio to a percent change (1.5 -> 50)."""
  return (x - 1) * 100


def format_r(r):
  """Format a ratio as a fixed-width percent string."""
  return ('%6.2f' % percentage(r)) + "%"


def normalize_r(r):
  """Map a ratio to a signed relative change (positive iff r > 1)."""
  if r > 1.0:
    return r - 1.0
  else:
    return 1.0 - 1 / r


def main():
  """Drive the A/B test: initial runs, iterative suspect narrowing, then
  report results to stdout plus JSON and CSV files in args.outdir."""
  args = parse_args()

  init_run(args)
  last_unchanged_iter = 0
  last_suspect_number = -1
  tryCnt = 0
  it = 0
  while tryCnt < MAXTRY:
    it += 1
    suspects = get_suspects()
    if len(suspects) != last_suspect_number:
      last_suspect_number = len(suspects)
      last_unchanged_iter = it
    # Stop when nothing is suspicious, or the suspect set has been stable
    # for TERM iterations.
    if len(suspects) == 0 or it - last_unchanged_iter >= TERM:
      break

    print("Number of suspects at iteration %d: %d" % (it, len(suspects)))
    threadRunner = ThreadRunner(args)
    for j in range(1, max(1, args.threads // 2) + 1):
      run(args, threadRunner, args.a, args.nano_a,
          args.arg_a + suspects_arg(suspects), -j)
      run(args, threadRunner, args.b, args.nano_b,
          args.arg_b + suspects_arg(suspects), -j)
      tryCnt += 1
    threadRunner.wait()

  suspects = get_suspects()
  if len(suspects) == 0:
    print(("%s and %s do not seem to have significant "
           "performance differences.") % (args.a, args.b))
  else:
    suspects.sort(key=regression)
    print("%s (compared to %s) is likely" % (args.a, args.b))
    for suspect in suspects:
      r = regression(suspect)
      if r < 1:
        print("\033[31m  %s slower in %s\033[0m" %
              (format_r(1 / r), suspect))
      else:
        print("\033[32m  %s faster in %s\033[0m" %
              (format_r(r), suspect))

  with open("%s/bench_%s_%s.json" % (args.outdir, args.a, args.b), 'w') as f:
    results = {}
    for bench in timesA:
      r = regression(bench) if bench in suspects else 1.0
      results[bench] = {
          args.config: {
              "signed_regression": normalize_r(r),
              "lower_quantile_ms": get_lower_upper(timesA[bench])[0] * 1e-6,
              "upper_quantile_ms": get_lower_upper(timesA[bench])[1] * 1e-6,
              "options": {
                  # TODO(liyuqian): let ab.py call nanobench with
                  # --outResultsFile so nanobench could generate the json for
                  # us that's exactly the same as that being used by perf
                  # bots. Currently, we cannot guarantee that bench is the
                  # name (e.g., bench may have additional resolution
                  # information appended after name).
                  "name": bench
              }
          }
      }

    output = {"results": results}
    if args.githash:
      output["gitHash"] = args.githash
    if args.keys:
      # --keys is a flat list of alternating key/value strings.
      keys = {}
      for i in range(len(args.keys) // 2):
        keys[args.keys[i * 2]] = args.keys[i * 2 + 1]
      output["key"] = keys
    f.write(json.dumps(output, indent=4))
    print("\033[36mJSON results available in %s\033[0m" % f.name)

  with open("%s/bench_%s_%s.csv" % (args.outdir, args.a, args.b), 'w') as out:
    out.write(("bench, significant?, raw regression, "
               "%(A)s quantile (ns), %(B)s quantile (ns), "
               "%(A)s (ns), %(B)s (ns)\n") % {'A': args.a, 'B': args.b})
    for bench in suspects + list(timesA.keys()):
      if bench not in timesA or bench not in timesB:
        continue
      ta = timesA[bench]
      tb = timesB[bench]
      out.write(
          "%s, %s, %f, " % (bench, bench in suspects, regression(bench)) +
          ' '.join(map(str, get_lower_upper(ta))) + ", " +
          ' '.join(map(str, get_lower_upper(tb))) + ", " +
          ("%s, %s\n" % (' '.join(map(str, ta)), ' '.join(map(str, tb))))
      )
    print(("\033[36m"
           "Compared %d benches. "
           "%d of them seem to be significantly different."
           "\033[0m") %
          (len([x for x in timesA if x in timesB]), len(suspects)))
    print("\033[36mPlease see detailed bench results in %s\033[0m" %
          out.name)


if __name__ == "__main__":
  try:
    main()
  except Exception as e:
    print(e)
    print(HELP)
    traceback.print_exc()
    raise