Lines Matching +full:upload +full:-perf +full:-stats

45 from scipy.stats import gmean, ttest_ind
267 # capture TORCH_COMPILE_DEBUG logs in CI runs and preserve them (i.e., for upload) if
288 "Invalid --only arguments. Check help message for the correct format"
329 writer.writerow(list(line) + ["0"] * (len(headers) - len(line)))
383 Sorts and de-dupes the graph breaks on the reason string. Note that this
385 miss some graph breaks because of de-duping. We can further refine this
499 time_total += t_iter_end - t_iter_begin
506 time_total += t_1 - t_0
510 def _normalize_bench_inputs(example_inputs) -> Tuple[Tuple[Any], Mapping[str, Any]]:
521 def _register_dataclass_output_as_pytree(example_outputs) -> None:
538 class Stats:
554 print(f"STATS {k}\n {lines}")
680 tolerance = args.xla_tolerance if args.trace_on_xla else 1e-4
739 f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy",
762 msg = f"{baseline_speedup:.3f}x -> {speedup:.3f}x [{speedup / baseline_speedup:.3f}x]"
800 output_filename[:-4] + "_compilation_metrics.csv",
821 … f"\ndynamic shapes experiments are slow, consider setting --repeat to less than {args.repeat}\n"
911 … 1. Creating IOBinding with OnnxModel if device is CUDA, which is essential for perf measurement.
936 # NOTE: Making perf comparison fair by moving out the i/o adapting part.
937 # 1. Pre-adapt `pt_inputs` to `onnx_inputs` here.
938 …# 2. Drop `onnx_outputs` to `pt_outputs` adapting. Output comparison is not part of perf measureme…
964 # Insert ONNX warm-up
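The notes at lines 936-938 describe keeping input/output adaptation out of the timed region. A minimal sketch of that pattern, assuming a harness-provided adapt_pt_inputs_to_onnx helper and an ONNX Runtime session:

    import time

    # Adapt PyTorch inputs to ORT feeds *before* timing, so the measured
    # region contains only session.run (the i/o adapting is untimed).
    onnx_inputs = adapt_pt_inputs_to_onnx(pt_inputs)
    session.run(None, onnx_inputs)                 # warm-up, also untimed
    t0 = time.perf_counter()
    onnx_outputs = session.run(None, onnx_inputs)  # timed region
    latency = time.perf_counter() - t0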
1012 f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy",
1039 output_filename[:-4] + "_compilation_metrics.csv",
1174 # copy.deepcopy is required to prevent any surprising side-effect,
1198 ) # type: ignore[arg-type]
1243 def wrapper(self, *args, **kwargs) -> Any:
1326 ) -> pathlib.Path:
1339 def format_pt_inputs(self, pt_inputs: Any) -> Sequence[torch.Tensor]:
1343 def format_pt_outputs(self, pt_outputs: Any) -> Sequence[torch.Tensor]:
1346 def adapt_pt_inputs_to_onnx(self, pt_inputs) -> Mapping[str, np.ndarray]:
1353 def adapt_onnx_outputs_to_pt(self, onnx_outputs: List[np.ndarray]) -> Any:
1384 def is_cpu(self) -> bool:
1387 def cpu(self) -> Self:
1463 # Otherwise perf number is inaccurate.
1478 cuda:1; however, ORT perf drops significantly.
1502 def _export(self, model, example_inputs, output_path: str, /, **kwargs) -> None:
1586 ) -> torch.onnx.ONNXProgram:
1617 ) -> torch.onnx.ONNXProgram:
1652 ) -> torch.onnx.ONNXProgram:
1682 """Patch non-tensor outputs to make them comparable with the correct result.
1773 def headers(self) -> List[str]:
1777 def row(self) -> List[str]:
1787 def _qualified_exception_class_name(self, exception: Exception) -> str:
1795 ) -> Generator[OnnxExportErrorRow, Any, Any]:
1810 def parse_exception(self, exception: Exception) -> OnnxExportErrorRow:
1831 ) -> Callable:
1851 output_error_filename = output_filename[:-4] + "_export_error.csv"
1869 # Due to this, this function is not and should not be used for perf measurement.
1925 elif batch_size == -1:
1968 A no-op experiment useful for making sure TorchBench alone works properly.
2016 "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"],
2017 "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"],
2053 # https://pytorch.org/blog/understanding-gpu-memory-1/
2103 # 1) Benchmark setup runs 2 iterations of fwd-bwd. So, not useful.
2297 out_batch_size = batch_size - 1
2321 for _ in range(n - 1):
2340 if self._args.partition_id < self._args.total_partitions - 1
2414 if self.args.devices[-1] == "cuda"
2608 # Workaround for ONNX for non-tensor outputs
2618 tolerance = 1e-2
2705 t = torch.abs(ref - res) / (1 + torch.abs(ref))
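Line 2705 is a blended error metric: it behaves like relative error when |ref| is large and like absolute error when |ref| is near zero. A minimal sketch of a tolerance check built on it (the function name is made up; ref and res are assumed tensors):

    import torch

    def within_tolerance(ref: torch.Tensor, res: torch.Tensor, tol: float) -> bool:
        # |ref - res| / (1 + |ref|): relative error for large |ref|,
        # absolute error near zero, so tiny references don't blow it up.
        t = torch.abs(ref - res) / (1 + torch.abs(ref))
        return bool(t.max() <= tol)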
2746 latency = t1 - t0
2755 return sys.exit(-1)
2777 ok, total = Stats.reset_counters()
2797 aot_compilation_time = t_1 - t_0
2834 compilation_time = dynamo_latency - eager_latency + aot_compilation_time
2860 ok, total = Stats.reset_counters()
2865 _, frames_second_pass = Stats.reset_counters() # should be 0
2868 _, frames_third_pass = Stats.reset_counters() # should be 0
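The comments at lines 2865 and 2868 encode a recompilation check: after the first compiled pass, later passes should compile zero new frames. A hedged sketch using torch._dynamo's frame counters directly (Stats.reset_counters is this script's wrapper around them; model and inputs are assumed to exist):

    import torch
    import torch._dynamo.utils as dynamo_utils

    def frames_since_last_reset() -> int:
        # counters["frames"]["total"] counts frames Dynamo has processed.
        total = dynamo_utils.counters["frames"]["total"]
        dynamo_utils.counters["frames"].clear()
        return total

    opt_model = torch.compile(model)
    opt_model(*inputs)                   # first pass compiles frames
    frames_first_pass = frames_since_last_reset()
    opt_model(*inputs)                   # second pass should hit the cache
    assert frames_since_last_reset() == 0, "unexpected recompilation"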
2980 stats = "STATS: "
2981 stats = stats + " | ".join(
2987 print(stats)
2988 stats = get_dynamo_stats()
2989 stats.subtract(start_stats)
2993 f"Dynamo produced {stats['unique_graphs']} graphs "
2994 f"covering {stats['calls_captured']} ops with "
2995 f"{stats['graph_breaks']} graph breaks ({stats['unique_graph_breaks']} unique)"
3016 if self.args.stats:
3017 Stats.print_summary()
3024 diff_branch_default = "DIFF-BRANCH-DEFAULT"
3034 "--filter", "-k", action="append", help="filter benchmarks with regexp"
3037 "--exclude", "-x", action="append", help="filter benchmarks with regexp"
3040 "--exclude-exact", action="append", help="filter benchmarks with exact match"
3043 "--total-partitions",
3050 "--partition-id",
3056 "--devices", "--device", "-d", action="append", help="cpu or cuda"
3058 parser.add_argument("--device-index", help="CUDA device index")
3060 "--repeat", "-n", type=int, default=30, help="number of timing runs"
3069 "--iterations-per-run", type=int, default=1, help=iterations_per_run_help
3072 "--randomize-input",
3077 "--threads",
3078 "-t",
3083 "--nopython", action="store_true", help="Turn graph breaks into errors"
3086 "--no-skip",
3091 "--prims-nvfuser", action="store_true", help="user prims + nvfuser backend"
3094 "--dump-raw-metrics",
3099 "--log-operator-inputs",
3104 "--channels-last",
3110 "--batch-size", "--batch_size", type=int, help="batch size for benchmarking"
3113 "--iterations", type=int, default=2, help="how many iterations to run"
3116 "--batch-size-file", type=str, help="String to load batch size from"
3118 parser.add_argument("--cosine", action="store_true", help="use cosine similarity")
3120 "--freezing", action="store_true", help="turn on freezing", default=False
3123 "--ci", action="store_true", help="Flag to tell that its a CI run"
3126 "--dashboard", action="store_true", help="Flag to tell that its a Dashboard run"
3129 "--skip-fp64-check", action="store_true", help="skip accuracy check using fp64"
3132 "--fast", "-f", action="store_true", help="skip slow benchmarks"
3135 "--only",
3138 --only=path:<MODEL_FILE_PATH>,class:<CLASS_NAME>
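A hypothetical invocation using the path:class form from the help text above (the file and class names are made up; the runner script is one of the suite entry points that share this parser):

    python benchmarks/dynamo/torchbench.py --performance --inference \
        --only=path:/path/to/my_model.py,class:MyModel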
3160 "--multiprocess",
3165 "--ddp",
3170 "--fsdp",
3177 "--optimize-ddp-mode",
3180 help="Specify the DDP optimization mode -- the value of torch._dynamo.config.optimize_ddp.",
3183 "--distributed-master-port",
3188 "--dynamic-shapes",
3193 "--propagate-real-tensors",
3198 "--dynamic-batch-only",
3200 help="Only assume batch dimension is dynamic. Implies --dynamic-shapes",
3203 "--specialize-int", action="store_true", help="Run with specialize_int=True."
3206 "--use-eval-mode",
3211 "--skip-accuracy-check",
3216 "--generate-aot-autograd-stats",
3218 help="Generates AOT Autograd stats like how many graphs are sent to AOT",
3221 "--inductor-settings",
3223 help="Use same settings as --inductor for baseline comparisons",
3226 "--suppress-errors",
3231 "--output",
3235 "--output-directory",
3239 "--disable-output",
3241 help="Disable writing of output files, e.g., for warm-up runs",
3244 "--baseline",
3245 help="Compare with a prior --output",
3248 "--part",
3253 "--export-profiler-trace",
3258 "--profiler-trace-name",
3259 "--profiler_trace_name",
3263 "--diff-branch",
3268 "--tag", default=None, help="Specify a tag to be included in csv files."
3271 "--explain",
3276 "--stats",
3278 help="print graph counter stats",
3281 "--use-warm-peak-memory",
3282 "--use_warm_peak_memory",
3287 "--print-memory",
3292 "--print-compilation-time",
3297 "--print-dataframe-summary",
3302 "--disable-cudagraphs",
3307 "--disable-split-reductions",
3312 "--disable-persistent-reductions",
3317 "--disable-divisible-by-16",
3322 "--inductor-compile-mode",
3327 "--print-graph-breaks",
3332 "--log-graph-breaks",
3337 "--trace-on-xla",
3342 "--xla-tolerance",
3344 default=1e-2,
3348 "--collect-outputs",
3355 "--enable-activation-checkpointing",
3359 parser.add_argument("--timing", action="store_true", help="Emits phase timing")
3362 "--progress",
3368 "--timeout",
3375 "--per_process_memory_fraction",
3378 … help="Set per-process GPU memory fraction (limit) for reducing usable size and reproducing OOMs",
3382 "--no-translation-validation",
3388 "--minify",
3394 "--compiled-autograd",
3400 "--profile_dynamo_cache_lookup",
3401 "--profile-dynamo-cache-lookup",
3407 "--snapshot-memory",
3408 "--snapshot_memory",
3410 … Memory Snapshot tool for memory deep dives: https://pytorch.org/blog/understanding-gpu-memory-1/",
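For reference, the Memory Snapshot workflow from the linked post follows this shape (a hedged sketch; the snapshot filename is made up):

    import torch

    torch.cuda.memory._record_memory_history(max_entries=100_000)  # start recording
    # ... run the model ...
    torch.cuda.memory._dump_snapshot("snapshot.pickle")      # view at pytorch.org/memory_viz
    torch.cuda.memory._record_memory_history(enabled=None)   # stop recording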
3415 "--cold-start-latency",
3416 "--cold_start_latency",
3418 help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
3421 "--warm-start-latency",
3422 "--warm_start_latency",
3428 # --nvfuser is now the default, keep the option to not break scripts
3429 group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
3430 group_fuser.add_argument("--nnc", action="store_true", help="enable NNC for GPUs")
3433 group_prec.add_argument("--float16", action="store_true", help="cast model to fp16")
3435 "--bfloat16", action="store_true", help="cast model to bf16"
3437 group_prec.add_argument("--float32", action="store_true", help="cast model to fp32")
3439 "--amp", action="store_true", help="use automatic mixed precision"
3442 "--amp-dtype",
3448 "--verbose", "-v", action="store_true", help="enable verbose debug printouts"
3451 "--quiet", "-q", action="store_true", help="suppress debug printouts"
3456 "--coverage", action="store_true", help="(default) " + help(coverage_experiment)
3459 "--overhead", action="store_true", help=help(overhead_experiment)
3462 "--speedup-dynamo-ts",
3467 "--speedup-fx2trt", action="store_true", help=help(speedup_experiment_fx2trt)
3470 "--speedup-fx2trt-fp16",
3475 "--print-fx",
3480 "--print-aten-ops",
3485 "--inductor",
3490 "--quantization",
3502 "--export",
3507 "--export-aot-inductor",
3512 "--xla", action="store_true", help="Compare TorchXLA to eager PyTorch"
3515 "--torchscript-onnx",
3516 "--torchscript_onnx",
3521 "--dynamo-onnx",
3522 "--dynamo_onnx",
3527 "--dynamo-onnx-aot-inline",
3528 "--dynamo_onnx_aot_inline",
3533 "--dynamo-onnx-aot-optimize",
3534 "--dynamo_onnx_aot_optimize",
3539 "--backend",
3543 group.add_argument("--nothing", action="store_true", help=help(null_experiment))
3545 "--log-conv-args",
3550 "--recompile-profiler",
3551 "--recompile_profiler",
3556 "--find-batch-sizes",
3563 "--accuracy",
3568 "--performance", action="store_true", help="Measures performance speedup"
3571 "--tolerance",
3577 "--training",
3582 "--inference", action="store_true", help="Performs inference"
3622 "--diff-branch called on dirty branch. Commit, stash, or reset."
3627 … f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?"
3647 # Warm start mode. Enable FX graph caching and perform back-to-back runs in
3652 cmd.remove("--warm-start-latency")
3654 print(f"Performing cold-start run for {args.only}")
3655 warmup_cmd = cmd + ["--repeat=1", "--disable-output"]
3658 print(f"Performing warm-start run for {args.only}")
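Lines 3647-3658 describe the warm-start protocol: strip the driving flag, do a throwaway cold run that writes no output, then rerun the same command so it can reuse the FX graph cache. A minimal sketch of that back-to-back pattern:

    import subprocess
    import sys

    cmd = [sys.executable] + sys.argv
    cmd.remove("--warm-start-latency")  # avoid recursing into warm-start mode
    subprocess.check_call(cmd + ["--repeat=1", "--disable-output"])  # cold run
    subprocess.check_call(cmd)          # warm run, cache is primed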
3723 assert args.training, "DDP benchmark requires --training mode"
3729 return sys.exit(-1)
3733 # TODO - Go through the failures for batch size = 2
3746 …# TODO - Using train mode for timm_models and HF models. Move to train mode for Torchbench as well.
3784 print("Cannot specify both --device_index and --multiprocess")
3785 return sys.exit(-1)
3794 return sys.exit(-1)
4068 # Adding diff-branch again to the args will override previous value
4070 [sys.executable] + sys.argv + [f"--diff-branch={diff_branch_default}"]
4073 subprocess.check_call(call_args + [f"--tag={main_branch}"])
4076 subprocess.check_call(call_args + [f"--tag={args.diff_branch}"])
4102 "--rank",
4104 "--world_size",
4182 # NB: Assumes only the first batch-like dimension is the batch
4240 *Stats.aot_summary(),
4264 [sys.executable] + sys.argv + [f"--only={name}"],
4281 # TODO - add option for coalescing inputs over multiple runs