Lines Matching +full:upload +full:-perf +full:-stats

45 from scipy.stats import gmean, ttest_ind
267 # capture TORCH_COMPILE_DEBUG logs in CI runs and preserve them (i.e., for upload) if
288 "Invalid --only arguments. Check help message for the correct format"
329 writer.writerow(list(line) + ["0"] * (len(headers) - len(line)))
383 Sorts and de-dupes the graph breaks on the reason string. Note that this
385 miss some graph breaks because of de-duping. We can further refine this
499 time_total += t_iter_end - t_iter_begin
506 time_total += t_1 - t_0
510 def _normalize_bench_inputs(example_inputs) -> Tuple[Tuple[Any], Mapping[str, Any]]:
521 def _register_dataclass_output_as_pytree(example_outputs) -> None:
538 class Stats:
554 print(f"STATS {k}\n {lines}")
680 tolerance = args.xla_tolerance if args.trace_on_xla else 1e-4
739 f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy",
762 msg = f"{baseline_speedup:.3f}x -> {speedup:.3f}x [{speedup / baseline_speedup:.3f}x]"
800 output_filename[:-4] + "_compilation_metrics.csv",
821 … f"\ndynamic shapes experiments are slow, consider setting --repeat to less than {args.repeat}\n"
911 … 1. Creating IOBinding with OnnxModel if device is CUDA, which is essential for perf measurement.
936 # NOTE: Making perf comparison fair by moving out the i/o adapting part.
937 # 1. Pre-adapt `pt_inputs` to `onnx_inputs` here.
938 …# 2. Drop `onnx_outputs` to `pt_outputs` adapting. Output comparison is not part of perf measureme…
964 # Insert ONNX warm-up
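The notes at lines 936-938 describe keeping input/output adaptation out of the timed region. A minimal sketch of that pattern, assuming a harness-provided adapt_pt_inputs_to_onnx helper and an ONNX Runtime session:

    import time

    # Adapt PyTorch inputs to ORT feeds *before* timing, so the measured
    # region contains only session.run (the i/o adapting is untimed).
    onnx_inputs = adapt_pt_inputs_to_onnx(pt_inputs)
    session.run(None, onnx_inputs)                 # warm-up, also untimed
    t0 = time.perf_counter()
    onnx_outputs = session.run(None, onnx_inputs)  # timed region
    latency = time.perf_counter() - t0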
1012 f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy",
1039 output_filename[:-4] + "_compilation_metrics.csv",
1174 # copy.deepcopy is required to prevent any surprising side-effect,
1198 ) # type: ignore[arg-type]
1243 def wrapper(self, *args, **kwargs) -> Any:
1326 ) -> pathlib.Path:
1339 def format_pt_inputs(self, pt_inputs: Any) -> Sequence[torch.Tensor]:
1343 def format_pt_outputs(self, pt_outputs: Any) -> Sequence[torch.Tensor]:
1346 def adapt_pt_inputs_to_onnx(self, pt_inputs) -> Mapping[str, np.ndarray]:
1353 def adapt_onnx_outputs_to_pt(self, onnx_outputs: List[np.ndarray]) -> Any:
1384 def is_cpu(self) -> bool:
1387 def cpu(self) -> Self:
1463 # Otherwise perf number is inaccurate.
1478 cuda:1; however, ORT perf drops significantly.
1502 def _export(self, model, example_inputs, output_path: str, /, **kwargs) -> None:
1586 ) -> torch.onnx.ONNXProgram:
1617 ) -> torch.onnx.ONNXProgram:
1652 ) -> torch.onnx.ONNXProgram:
1682 """Patch non-tensor outputs to make them comparable with the correct result.
1773 def headers(self) -> List[str]:
1777 def row(self) -> List[str]:
1787 def _qualified_exception_class_name(self, exception: Exception) -> str:
1795 ) -> Generator[OnnxExportErrorRow, Any, Any]:
1810 def parse_exception(self, exception: Exception) -> OnnxExportErrorRow:
1831 ) -> Callable:
1851 output_error_filename = output_filename[:-4] + "_export_error.csv"
1869 # Due to this, this function is not and should not be used for perf measurement.
1925 elif batch_size == -1:
1968 A no-op experiment useful for making sure TorchBench alone works properly.
2016 "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"],
2017 "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"],
2053 # https://pytorch.org/blog/understanding-gpu-memory-1/
2103 # 1) Benchmark setup runs 2 iterations of fwd-bwd. So, not useful.
2297 out_batch_size = batch_size - 1
2321 for _ in range(n - 1):
2340 if self._args.partition_id < self._args.total_partitions - 1
2414 if self.args.devices[-1] == "cuda"
2608 # Workaround for ONNX for non-tensor outputs
2618 tolerance = 1e-2
2705 t = torch.abs(ref - res) / (1 + torch.abs(ref))
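Line 2705 is a blended error metric: it behaves like relative error when |ref| is large and like absolute error when |ref| is near zero. A minimal sketch of a tolerance check built on it (the function name is made up; ref and res are assumed tensors):

    import torch

    def within_tolerance(ref: torch.Tensor, res: torch.Tensor, tol: float) -> bool:
        # |ref - res| / (1 + |ref|): relative error for large |ref|,
        # absolute error near zero, so tiny references don't blow it up.
        t = torch.abs(ref - res) / (1 + torch.abs(ref))
        return bool(t.max() <= tol)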
2746 latency = t1 - t0
2755 return sys.exit(-1)
2777 ok, total = Stats.reset_counters()
2797 aot_compilation_time = t_1 - t_0
2834 compilation_time = dynamo_latency - eager_latency + aot_compilation_time
2860 ok, total = Stats.reset_counters()
2865 _, frames_second_pass = Stats.reset_counters() # should be 0
2868 _, frames_third_pass = Stats.reset_counters() # should be 0
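The comments at lines 2865 and 2868 encode a recompilation check: after the first compiled pass, later passes should compile zero new frames. A hedged sketch using torch._dynamo's frame counters directly (Stats.reset_counters is this script's wrapper around them; model and inputs are assumed to exist):

    import torch
    import torch._dynamo.utils as dynamo_utils

    def frames_since_last_reset() -> int:
        # counters["frames"]["total"] counts frames Dynamo has processed.
        total = dynamo_utils.counters["frames"]["total"]
        dynamo_utils.counters["frames"].clear()
        return total

    opt_model = torch.compile(model)
    opt_model(*inputs)                   # first pass compiles frames
    frames_first_pass = frames_since_last_reset()
    opt_model(*inputs)                   # second pass should hit the cache
    assert frames_since_last_reset() == 0, "unexpected recompilation"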
2980 stats = "STATS: "
2981 stats = stats + " | ".join(
2987 print(stats)
2988 stats = get_dynamo_stats()
2989 stats.subtract(start_stats)
2993 f"Dynamo produced {stats['unique_graphs']} graphs "
2994 f"covering {stats['calls_captured']} ops with "
2995 f"{stats['graph_breaks']} graph breaks ({stats['unique_graph_breaks']} unique)"
3016 if self.args.stats:
3017 Stats.print_summary()
3024 diff_branch_default = "DIFF-BRANCH-DEFAULT"
3034 "--filter", "-k", action="append", help="filter benchmarks with regexp"
3037 "--exclude", "-x", action="append", help="filter benchmarks with regexp"
3040 "--exclude-exact", action="append", help="filter benchmarks with exact match"
3043 "--total-partitions",
3050 "--partition-id",
3056 "--devices", "--device", "-d", action="append", help="cpu or cuda"
3058 parser.add_argument("--device-index", help="CUDA device index")
3060 "--repeat", "-n", type=int, default=30, help="number of timing runs"
3069 "--iterations-per-run", type=int, default=1, help=iterations_per_run_help
3072 "--randomize-input",
3077 "--threads",
3078 "-t",
3083 "--nopython", action="store_true", help="Turn graph breaks into errors"
3086 "--no-skip",
3091 "--prims-nvfuser", action="store_true", help="user prims + nvfuser backend"
3094 "--dump-raw-metrics",
3099 "--log-operator-inputs",
3104 "--channels-last",
3110 "--batch-size", "--batch_size", type=int, help="batch size for benchmarking"
3113 "--iterations", type=int, default=2, help="how many iterations to run"
3116 "--batch-size-file", type=str, help="String to load batch size from"
3118 parser.add_argument("--cosine", action="store_true", help="use cosine similarity")
3120 "--freezing", action="store_true", help="turn on freezing", default=False
3123 "--ci", action="store_true", help="Flag to tell that its a CI run"
3126 "--dashboard", action="store_true", help="Flag to tell that its a Dashboard run"
3129 "--skip-fp64-check", action="store_true", help="skip accuracy check using fp64"
3132 "--fast", "-f", action="store_true", help="skip slow benchmarks"
3135 "--only",
3138 --only=path:<MODEL_FILE_PATH>,class:<CLASS_NAME>
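A hypothetical invocation using the path:class form from the help text above (the file and class names are made up; the runner script is one of the suite entry points that share this parser):

    python benchmarks/dynamo/torchbench.py --performance --inference \
        --only=path:/path/to/my_model.py,class:MyModel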
3160 "--multiprocess",
3165 "--ddp",
3170 "--fsdp",
3177 "--optimize-ddp-mode",
3180 help="Specify the DDP optimization mode -- the value of torch._dynamo.config.optimize_ddp.",
3183 "--distributed-master-port",
3188 "--dynamic-shapes",
3193 "--propagate-real-tensors",
3198 "--dynamic-batch-only",
3200 help="Only assume batch dimension is dynamic. Implies --dynamic-shapes",
3203 "--specialize-int", action="store_true", help="Run with specialize_int=True."
3206 "--use-eval-mode",
3211 "--skip-accuracy-check",
3216 "--generate-aot-autograd-stats",
3218 help="Generates AOT Autograd stats like how many graphs are sent to AOT",
3221 "--inductor-settings",
3223 help="Use same settings as --inductor for baseline comparisons",
3226 "--suppress-errors",
3231 "--output",
3235 "--output-directory",
3239 "--disable-output",
3241 help="Disable writing of output files, e.g., for warm-up runs",
3244 "--baseline",
3245 help="Compare with a prior --output",
3248 "--part",
3253 "--export-profiler-trace",
3258 "--profiler-trace-name",
3259 "--profiler_trace_name",
3263 "--diff-branch",
3268 "--tag", default=None, help="Specify a tag to be included in csv files."
3271 "--explain",
3276 "--stats",
3278 help="print graph counter stats",
3281 "--use-warm-peak-memory",
3282 "--use_warm_peak_memory",
3287 "--print-memory",
3292 "--print-compilation-time",
3297 "--print-dataframe-summary",
3302 "--disable-cudagraphs",
3307 "--disable-split-reductions",
3312 "--disable-persistent-reductions",
3317 "--disable-divisible-by-16",
3322 "--inductor-compile-mode",
3327 "--print-graph-breaks",
3332 "--log-graph-breaks",
3337 "--trace-on-xla",
3342 "--xla-tolerance",
3344 default=1e-2,
3348 "--collect-outputs",
3355 "--enable-activation-checkpointing",
3359 parser.add_argument("--timing", action="store_true", help="Emits phase timing")
3362 "--progress",
3368 "--timeout",
3375 "--per_process_memory_fraction",
3378 … help="Set per-process GPU memory fraction (limit) for reducing usable size and reproducing OOMs",
3382 "--no-translation-validation",
3388 "--minify",
3394 "--compiled-autograd",
3400 "--profile_dynamo_cache_lookup",
3401 "--profile-dynamo-cache-lookup",
3407 "--snapshot-memory",
3408 "--snapshot_memory",
3410 … Memory Snapshot tool for memory deep dives: https://pytorch.org/blog/understanding-gpu-memory-1/",
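For reference, the Memory Snapshot workflow from the linked post follows this shape (a hedged sketch; the snapshot filename is made up):

    import torch

    torch.cuda.memory._record_memory_history(max_entries=100_000)  # start recording
    # ... run the model ...
    torch.cuda.memory._dump_snapshot("snapshot.pickle")      # view at pytorch.org/memory_viz
    torch.cuda.memory._record_memory_history(enabled=None)   # stop recording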
3415 "--cold-start-latency",
3416 "--cold_start_latency",
3418 help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
3421 "--warm-start-latency",
3422 "--warm_start_latency",
3428 # --nvfuser is now the default, keep the option to not break scripts
3429 group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
3430 group_fuser.add_argument("--nnc", action="store_true", help="enable NNC for GPUs")
3433 group_prec.add_argument("--float16", action="store_true", help="cast model to fp16")
3435 "--bfloat16", action="store_true", help="cast model to bf16"
3437 group_prec.add_argument("--float32", action="store_true", help="cast model to fp32")
3439 "--amp", action="store_true", help="use automatic mixed precision"
3442 "--amp-dtype",
3448 "--verbose", "-v", action="store_true", help="enable verbose debug printouts"
3451 "--quiet", "-q", action="store_true", help="suppress debug printouts"
3456 "--coverage", action="store_true", help="(default) " + help(coverage_experiment)
3459 "--overhead", action="store_true", help=help(overhead_experiment)
3462 "--speedup-dynamo-ts",
3467 "--speedup-fx2trt", action="store_true", help=help(speedup_experiment_fx2trt)
3470 "--speedup-fx2trt-fp16",
3475 "--print-fx",
3480 "--print-aten-ops",
3485 "--inductor",
3490 "--quantization",
3502 "--export",
3507 "--export-aot-inductor",
3512 "--xla", action="store_true", help="Compare TorchXLA to eager PyTorch"
3515 "--torchscript-onnx",
3516 "--torchscript_onnx",
3521 "--dynamo-onnx",
3522 "--dynamo_onnx",
3527 "--dynamo-onnx-aot-inline",
3528 "--dynamo_onnx_aot_inline",
3533 "--dynamo-onnx-aot-optimize",
3534 "--dynamo_onnx_aot_optimize",
3539 "--backend",
3543 group.add_argument("--nothing", action="store_true", help=help(null_experiment))
3545 "--log-conv-args",
3550 "--recompile-profiler",
3551 "--recompile_profiler",
3556 "--find-batch-sizes",
3563 "--accuracy",
3568 "--performance", action="store_true", help="Measures performance speedup"
3571 "--tolerance",
3577 "--training",
3582 "--inference", action="store_true", help="Performs inference"
3622 "--diff-branch called on dirty branch. Commit, stash, or reset."
3627 … f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?"
3647 # Warm start mode. Enable FX graph caching and perform back-to-back runs in
3652 cmd.remove("--warm-start-latency")
3654 print(f"Performing cold-start run for {args.only}")
3655 warmup_cmd = cmd + ["--repeat=1", "--disable-output"]
3658 print(f"Performing warm-start run for {args.only}")
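Lines 3647-3658 describe the warm-start protocol: strip the driving flag, do a throwaway cold run that writes no output, then rerun the same command so it can reuse the FX graph cache. A minimal sketch of that back-to-back pattern:

    import subprocess
    import sys

    cmd = [sys.executable] + sys.argv
    cmd.remove("--warm-start-latency")  # avoid recursing into warm-start mode
    subprocess.check_call(cmd + ["--repeat=1", "--disable-output"])  # cold run
    subprocess.check_call(cmd)          # warm run, cache is primed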
3723 assert args.training, "DDP benchmark requires --training mode"
3729 return sys.exit(-1)
3733 # TODO - Go through the failures for batch size = 2
3746 …# TODO - Using train mode for timm_models and HF models. Move to train mode for Torchbench as well.
3784 print("Cannot specify both --device_index and --multiprocess")
3785 return sys.exit(-1)
3794 return sys.exit(-1)
4068 # Adding diff-branch again to the args will override previous value
4070 [sys.executable] + sys.argv + [f"--diff-branch={diff_branch_default}"]
4073 subprocess.check_call(call_args + [f"--tag={main_branch}"])
4076 subprocess.check_call(call_args + [f"--tag={args.diff_branch}"])
4102 "--rank",
4104 "--world_size",
4182 # NB: Assumes only the first batch-like dimension is the batch
4240 *Stats.aot_summary(),
4264 [sys.executable] + sys.argv + [f"--only={name}"],
4281 # TODO - add option for coalescing inputs over multiple runs