#!/usr/bin/env python

"""
compare.py - versatile benchmark output compare tool
"""

import argparse
from argparse import ArgumentParser
import json
import os
import sys
import unittest

import gbench
from gbench import util, report
from gbench.util import *


def check_inputs(in1, in2, flags):
    """
    Perform checks on the user-provided inputs and diagnose any abnormalities.
    """
    in1_kind, in1_err = classify_input_file(in1)
    in2_kind, in2_err = classify_input_file(in2)
    output_file = find_benchmark_flag('--benchmark_out=', flags)
    output_type = find_benchmark_flag('--benchmark_out_format=', flags)
    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
        print(("WARNING: '--benchmark_out=%s' will be passed to both "
               "benchmarks, causing it to be overwritten") % output_file)
    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
        print("WARNING: passing optional flags has no effect since both "
              "inputs are JSON")
    if output_type is not None and output_type != 'json':
        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py' "
               "is not supported.") % output_type)
        sys.exit(1)


def create_parser():
    parser = ArgumentParser(
        description='versatile benchmark output compare tool')

    parser.add_argument(
        '-a',
        '--display_aggregates_only',
        dest='display_aggregates_only',
        action="store_true",
        help="If there are repetitions, by default we display everything: the "
             "actual runs and the computed aggregates. Sometimes it is "
             "desirable to only view the aggregates, e.g. when there are a lot "
             "of repetitions. Do note that only the display is affected; "
             "internally, all the actual runs are still used, e.g. for the U test.")

    parser.add_argument(
        '--no-color',
        dest='color',
        default=True,
        action="store_false",
        help="Do not use colors in the terminal output"
    )

    parser.add_argument(
        '-d',
        '--dump_to_json',
        dest='dump_to_json',
        help="Additionally, dump benchmark comparison output to this file in JSON format.")

    utest = parser.add_argument_group()
    utest.add_argument(
        '--no-utest',
        dest='utest',
        default=True,
        action="store_false",
        help="The tool can do a two-tailed Mann-Whitney U test with the null "
             "hypothesis that it is equally likely that a randomly selected "
             "value from one sample will be less than or greater than a "
             "randomly selected value from a second sample.\n"
             "WARNING: requires a **LARGE** (no less than {}) number of "
             "repetitions to be meaningful!\n"
             "The test is run by default if at least {} repetitions were done.\n"
             "This option disables the U test.".format(
                 report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS))
    alpha_default = 0.05
    utest.add_argument(
        "--alpha",
        dest='utest_alpha',
        default=alpha_default,
        type=float,
        help=("significance level alpha. If the calculated p-value is below "
              "this value, the result is said to be statistically significant "
              "and the null hypothesis is rejected.\n"
              "(default: %0.4f)") % alpha_default)

    subparsers = parser.add_subparsers(
        help='This tool has multiple modes of operation:',
        dest='mode')

    parser_a = subparsers.add_parser(
        'benchmarks',
        help='The simplest use-case: compare all the output of these two benchmarks')
    baseline = parser_a.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test_baseline',
        metavar='test_baseline',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    contender = parser_a.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'test_contender',
        metavar='test_contender',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    parser_a.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    parser_b = subparsers.add_parser(
        'filters', help='Compare filter one with filter two of a single benchmark')
    baseline = parser_b.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test',
        metavar='test',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    baseline.add_argument(
        'filter_baseline',
        metavar='filter_baseline',
        type=str,
        nargs=1,
        help='The first filter, that will be used as baseline')
    contender = parser_b.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'filter_contender',
        metavar='filter_contender',
        type=str,
        nargs=1,
        help='The second filter, that will be compared against the baseline')
    parser_b.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    parser_c = subparsers.add_parser(
        'benchmarksfiltered',
        help='Compare filter one of the first benchmark with filter two of the second benchmark')
    baseline = parser_c.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test_baseline',
        metavar='test_baseline',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    baseline.add_argument(
        'filter_baseline',
        metavar='filter_baseline',
        type=str,
        nargs=1,
        help='The first filter, that will be used as baseline')
    contender = parser_c.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'test_contender',
        metavar='test_contender',
        type=argparse.FileType('r'),
        nargs=1,
        help='The second benchmark executable or JSON output file, that will be '
             'compared against the baseline')
    contender.add_argument(
        'filter_contender',
        metavar='filter_contender',
        type=str,
        nargs=1,
        help='The second filter, that will be compared against the baseline')
    parser_c.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    return parser
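

# Illustrative example invocations of the three modes (names are placeholders;
# each 'test' positional may be a benchmark executable or a JSON output file,
# and any trailing arguments are forwarded to benchmark executables):
#
#   compare.py benchmarks <baseline> <contender> [benchmark options]...
#   compare.py filters <benchmark> <filter_baseline> <filter_contender>
#   compare.py benchmarksfiltered <baseline> <filter_baseline> <contender> <filter_contender>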


def main():
    # Parse the command line flags
    parser = create_parser()
    args, unknown_args = parser.parse_known_args()
    if args.mode is None:
        parser.print_help()
        sys.exit(1)
    assert not unknown_args
    benchmark_options = args.benchmark_options

    if args.mode == 'benchmarks':
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = ''
        filter_contender = ''

        # NOTE: if test_baseline == test_contender, you are analyzing the stdev

        description = 'Comparing %s to %s' % (test_baseline, test_contender)
    elif args.mode == 'filters':
        test_baseline = args.test[0].name
        test_contender = args.test[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if filter_baseline == filter_contender, you are analyzing the
        # stdev

        description = 'Comparing %s to %s (from %s)' % (
            filter_baseline, filter_contender, args.test[0].name)
    elif args.mode == 'benchmarksfiltered':
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if test_baseline == test_contender and
        # filter_baseline == filter_contender, you are analyzing the stdev

        description = 'Comparing %s (from %s) to %s (from %s)' % (
            filter_baseline, test_baseline, filter_contender, test_contender)
    else:
        # should never happen
        print("Unrecognized mode of operation: '%s'" % args.mode)
        parser.print_help()
        sys.exit(1)

    check_inputs(test_baseline, test_contender, benchmark_options)

    if args.display_aggregates_only:
        benchmark_options += ['--benchmark_display_aggregates_only=true']

    options_baseline = []
    options_contender = []

    if filter_baseline and filter_contender:
        options_baseline = ['--benchmark_filter=%s' % filter_baseline]
        options_contender = ['--benchmark_filter=%s' % filter_contender]

    # Run the benchmarks and report the results
    json1 = json1_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_baseline, benchmark_options + options_baseline))
    json2 = json2_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_contender, benchmark_options + options_contender))

    # Now, filter the benchmarks so that the difference report can work
    if filter_baseline and filter_contender:
        replacement = '[%s vs. %s]' % (filter_baseline, filter_contender)
        json1 = gbench.report.filter_benchmark(
            json1_orig, filter_baseline, replacement)
        json2 = gbench.report.filter_benchmark(
            json2_orig, filter_contender, replacement)

    diff_report = gbench.report.get_difference_report(
        json1, json2, args.utest)
    output_lines = gbench.report.print_difference_report(
        diff_report,
        args.display_aggregates_only,
        args.utest, args.utest_alpha, args.color)
    print(description)
    for ln in output_lines:
        print(ln)

    # Optionally, dump the comparison report to a JSON file as well
    if args.dump_to_json is not None:
        with open(args.dump_to_json, 'w') as f_json:
            json.dump(diff_report, f_json)
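

# Illustrative sketch (not used by the tool itself): the same comparison flow
# that main() performs, driven programmatically from two already-loaded
# Google Benchmark JSON results. The function name and its arguments are
# placeholders for demonstration purposes.
def _example_programmatic_comparison(json_baseline, json_contender):
    # Compute the per-benchmark difference report, with the U test enabled.
    diff_report = gbench.report.get_difference_report(
        json_baseline, json_contender, True)
    # Render the report (no aggregates-only filtering, default alpha of 0.05,
    # no color) and print it line by line, mirroring what main() does.
    output_lines = gbench.report.print_difference_report(
        diff_report, False, True, 0.05, False)
    for ln in output_lines:
        print(ln)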


class TestParser(unittest.TestCase):
    def setUp(self):
        self.parser = create_parser()
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'gbench',
            'Inputs')
        self.testInput0 = os.path.join(testInputs, 'test1_run1.json')
        self.testInput1 = os.path.join(testInputs, 'test1_run2.json')

    def test_benchmarks_basic(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest(self):
        parsed = self.parser.parse_args(
            ['--no-utest', 'benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.05)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_display_aggregates_only(self):
        parsed = self.parser.parse_args(
            ['-a', 'benchmarks', self.testInput0, self.testInput1])
        self.assertTrue(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ['--no-utest', '--alpha=0.314', 'benchmarks',
             self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_with_remainder(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1, 'd'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ['d'])

    def test_benchmarks_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ['e'])

    def test_filters_basic(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertFalse(parsed.benchmark_options)

    def test_filters_with_remainder(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd', 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertEqual(parsed.benchmark_options, ['e'])

    def test_filters_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd', '--', 'f'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertEqual(parsed.benchmark_options, ['f'])

    def test_benchmarksfiltered_basic(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarksfiltered_with_remainder(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c',
             self.testInput1, 'e', 'f'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertEqual(parsed.benchmark_options[0], 'f')

    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c',
             self.testInput1, 'e', '--', 'g'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertEqual(parsed.benchmark_options[0], 'g')


if __name__ == '__main__':
    # unittest.main()
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;