1#!/usr/bin/env python3 2# Copyright (c) Meta Platforms, Inc. and affiliates. 3# All rights reserved. 4# 5# This source code is licensed under the BSD-style license found in the 6# LICENSE file in the root directory of this source tree. 7 8import json 9import logging 10import os 11import re 12import zipfile 13from argparse import Action, ArgumentParser, Namespace 14from io import BytesIO 15from logging import info, warning 16from typing import Any, Dict, List, Optional 17from urllib import error, request 18 19 20logging.basicConfig(level=logging.INFO) 21 22 23BENCHMARK_RESULTS_FILENAME = "benchmark_results.json" 24ARTIFACTS_FILENAME_REGEX = re.compile(r"(android|ios)-artifacts-(?P<job_id>\d+).json") 25 26# iOS-related regexes and variables 27IOS_TEST_SPEC_REGEX = re.compile( 28 r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>[\w\+]+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+)," 29) 30IOS_TEST_NAME_REGEX = re.compile( 31 r"test_(?P<method>forward|load|generate)_(?P<model_name>[\w\+]+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)" 32) 33# The backend name could contain +, i.e. tinyllama_xnnpack+custom+qe_fp32 34IOS_MODEL_NAME_REGEX = re.compile( 35 r"(?P<model>[^_]+)_(?P<backend>[\w\+]+)_(?P<dtype>\w+)" 36) 37 38 39class ValidateArtifacts(Action): 40 def __call__( 41 self, 42 parser: ArgumentParser, 43 namespace: Namespace, 44 values: Any, 45 option_string: Optional[str] = None, 46 ) -> None: 47 if os.path.isfile(values) and values.endswith(".json"): 48 setattr(namespace, self.dest, values) 49 return 50 51 parser.error(f"{values} is not a valid JSON file (*.json)") 52 53 54class ValidateOutputDir(Action): 55 def __call__( 56 self, 57 parser: ArgumentParser, 58 namespace: Namespace, 59 values: Any, 60 option_string: Optional[str] = None, 61 ) -> None: 62 if os.path.isdir(values): 63 setattr(namespace, self.dest, values) 64 return 65 66 parser.error(f"{values} is not a valid directory") 67 68 69def parse_args() -> Any: 70 from argparse import ArgumentParser 71 72 parser = ArgumentParser("extract benchmark results from AWS Device Farm artifacts") 73 parser.add_argument( 74 "--artifacts", 75 type=str, 76 required=True, 77 action=ValidateArtifacts, 78 help="the list of artifacts from AWS in JSON format", 79 ) 80 parser.add_argument( 81 "--output-dir", 82 type=str, 83 required=True, 84 action=ValidateOutputDir, 85 help="the directory to keep the benchmark results", 86 ) 87 parser.add_argument( 88 "--repo", 89 type=str, 90 required=True, 91 help="which GitHub repo this workflow run belongs to", 92 ) 93 parser.add_argument( 94 "--head-branch", 95 type=str, 96 required=True, 97 help="the head branch that runs", 98 ) 99 parser.add_argument( 100 "--workflow-name", 101 type=str, 102 required=True, 103 help="the name of the benchmark workflow", 104 ) 105 parser.add_argument( 106 "--workflow-run-id", 107 type=int, 108 required=True, 109 help="the id of the benchmark workflow", 110 ) 111 parser.add_argument( 112 "--workflow-run-attempt", 113 type=int, 114 required=True, 115 help="which retry of the workflow this is", 116 ) 117 118 return parser.parse_args() 119 120 121def extract_android_benchmark_results( 122 job_name: str, artifact_type: str, artifact_s3_url: str 123) -> List: 124 """ 125 The benchmark results from Android have already been stored in CUSTOMER_ARTIFACT 126 artifact, so we will just need to get it 127 128 Return the list of benchmark results. 129 """ 130 if artifact_type != "CUSTOMER_ARTIFACT": 131 return [] 132 133 try: 134 with request.urlopen(artifact_s3_url) as data: 135 with zipfile.ZipFile(BytesIO(data.read())) as customer_artifact: 136 for name in customer_artifact.namelist(): 137 if BENCHMARK_RESULTS_FILENAME in name: 138 return json.loads(customer_artifact.read(name)) 139 140 except error.HTTPError: 141 warning(f"Fail to {artifact_type} {artifact_s3_url}") 142 return [] 143 except json.decoder.JSONDecodeError: 144 # This is to handle the case where there is no benchmark results 145 warning(f"Fail to load the benchmark results from {artifact_s3_url}") 146 return [] 147 148 149def initialize_ios_metadata(test_name: str) -> Dict[str, any]: 150 """ 151 Extract the benchmark metadata from the test name, for example: 152 test_forward_llama2_pte_iOS_17_2_1_iPhone15_4 153 test_load_resnet50_xnnpack_q8_pte_iOS_17_2_1_iPhone15_4 154 """ 155 m = IOS_TEST_NAME_REGEX.match(test_name) 156 if not m: 157 return {} 158 159 method = m.group("method") 160 model_name = m.group("model_name") 161 ios_ver = m.group("ios_ver").replace("_", ".") 162 iphone_ver = m.group("iphone_ver").replace("_", ".") 163 164 # The default backend and quantization dtype if the script couldn't extract 165 # them from the model name 166 backend = "" 167 quantization = "unknown" 168 169 m = IOS_MODEL_NAME_REGEX.match(model_name) 170 if m: 171 backend = m.group("backend") 172 quantization = m.group("dtype") 173 model_name = m.group("model") 174 175 return { 176 "benchmarkModel": { 177 "backend": backend, 178 "quantization": quantization, 179 "name": model_name, 180 }, 181 "deviceInfo": { 182 "arch": f"iPhone {iphone_ver}", 183 "device": f"iPhone {iphone_ver}", 184 "os": f"iOS {ios_ver}", 185 "availMem": 0, 186 "totalMem": 0, 187 }, 188 "method": method, 189 # These fields will be populated later by extract_ios_metric 190 "metric": "", 191 "actualValue": 0, 192 "targetValue": 0, 193 } 194 195 196def extract_ios_metric( 197 benchmark_result: Dict[str, Any], 198 test_name: str, 199 metric_name: str, 200 metric_value: float, 201) -> Dict[str, Any]: 202 """ 203 Map the metric name from iOS xcresult to the benchmark result 204 """ 205 method = benchmark_result.get("method", "") 206 if not method: 207 return benchmark_result 208 209 # NB: This looks brittle, but unless we can return iOS benchmark results in JSON 210 # format by the test, the mapping is needed to match with Android test 211 if method == "load": 212 if metric_name == "Clock Monotonic Time, s": 213 benchmark_result["metric"] = "model_load_time(ms)" 214 benchmark_result["actualValue"] = metric_value * 1000 215 216 elif metric_name == "Memory Peak Physical, kB": 217 # NB: Showing the value in mB is friendlier IMO 218 benchmark_result["metric"] = "peak_load_mem_usage(mb)" 219 benchmark_result["actualValue"] = metric_value / 1024 220 221 elif method == "forward": 222 if metric_name == "Clock Monotonic Time, s": 223 benchmark_result["metric"] = ( 224 "generate_time(ms)" 225 if "llama" in test_name 226 else "avg_inference_latency(ms)" 227 ) 228 benchmark_result["actualValue"] = metric_value * 1000 229 230 elif metric_name == "Memory Peak Physical, kB": 231 # NB: Showing the value in mB is friendlier IMO 232 benchmark_result["metric"] = "peak_inference_mem_usage(mb)" 233 benchmark_result["actualValue"] = metric_value / 1024 234 235 elif method == "generate" and metric_name == "Tokens Per Second, t/s": 236 benchmark_result["metric"] = "token_per_sec" 237 benchmark_result["actualValue"] = metric_value 238 239 return benchmark_result 240 241 242def extract_ios_benchmark_results( 243 job_name: str, artifact_type: str, artifact_s3_url: str 244) -> List: 245 """ 246 The benchmark results from iOS are currently from xcresult, which could either 247 be parsed from CUSTOMER_ARTIFACT or get from the test spec output. The latter 248 is probably easier to process 249 """ 250 if artifact_type != "TESTSPEC_OUTPUT": 251 return [] 252 253 try: 254 benchmark_results = [] 255 256 with request.urlopen(artifact_s3_url) as data: 257 current_test_name = "" 258 current_metric_name = "" 259 current_record = {} 260 261 for line in data.read().decode("utf8").splitlines(): 262 s = IOS_TEST_SPEC_REGEX.search(line) 263 if not s: 264 continue 265 266 test_name = s.group("test_name") 267 metric_name = s.group("metric") 268 metric_value = float(s.group("value")) 269 270 if test_name != current_test_name or metric_name != current_metric_name: 271 if current_record and current_record.get("metric", ""): 272 # Save the benchmark result in the same format used by Android 273 benchmark_results.append(current_record.copy()) 274 275 current_test_name = test_name 276 current_metric_name = metric_name 277 current_record = initialize_ios_metadata(current_test_name) 278 279 current_record = extract_ios_metric( 280 current_record, test_name, metric_name, metric_value 281 ) 282 283 if current_record and current_record.get("metric", ""): 284 benchmark_results.append(current_record.copy()) 285 286 return benchmark_results 287 288 except error.HTTPError: 289 warning(f"Fail to {artifact_type} {artifact_s3_url}") 290 return [] 291 292 293def extract_job_id(artifacts_filename: str) -> int: 294 """ 295 Extract the job id from the artifacts filename 296 """ 297 m = ARTIFACTS_FILENAME_REGEX.match(os.path.basename(artifacts_filename)) 298 if not m: 299 return 0 300 return int(m.group("job_id")) 301 302 303def transform( 304 app_type: str, 305 benchmark_results: List, 306 repo: str, 307 head_branch: str, 308 workflow_name: str, 309 workflow_run_id: int, 310 workflow_run_attempt: int, 311 job_name: str, 312 job_id: int, 313) -> List: 314 """ 315 Transform the benchmark results into the format writable into the benchmark database 316 """ 317 # Overwrite the device name here with the job name as it has more information about 318 # the device, i.e. Samsung Galaxy S22 5G instead of just Samsung 319 for r in benchmark_results: 320 r["deviceInfo"]["device"] = job_name 321 322 # TODO (huydhn): This is the current schema of the database oss_ci_benchmark_v2, 323 # and I'm trying to fit ET benchmark results into it, which is kind of awkward. 324 # However, the schema is going to be updated soon 325 return [ 326 { 327 # GH-info to identify where the benchmark is run 328 "repo": repo, 329 "head_branch": head_branch, 330 "workflow_id": workflow_run_id, 331 "run_attempt": workflow_run_attempt, 332 "job_id": job_id, 333 # The model 334 "name": f"{r['benchmarkModel']['name']} {r['benchmarkModel'].get('backend', '')}".strip(), 335 "dtype": ( 336 r["benchmarkModel"]["quantization"] 337 if r["benchmarkModel"]["quantization"] 338 else "unknown" 339 ), 340 # The metric value 341 "metric": r["metric"], 342 "actual": r["actualValue"], 343 "target": r["targetValue"], 344 # The device 345 "device": r["deviceInfo"]["device"], 346 "arch": r["deviceInfo"].get("os", ""), 347 # Not used here, just set it to something unique here 348 "filename": workflow_name, 349 "test_name": app_type, 350 "runner": job_name, 351 } 352 for r in benchmark_results 353 ] 354 355 356def main() -> None: 357 args = parse_args() 358 359 # Across all devices 360 all_benchmark_results = [] 361 362 with open(args.artifacts) as f: 363 for artifact in json.load(f): 364 app_type = artifact.get("app_type", "") 365 # We expect this to be set to either ANDROID_APP or IOS_APP 366 if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]: 367 info( 368 f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}" 369 ) 370 continue 371 372 job_name = artifact["job_name"] 373 artifact_type = artifact["type"] 374 artifact_s3_url = artifact["s3_url"] 375 376 if app_type == "ANDROID_APP": 377 benchmark_results = extract_android_benchmark_results( 378 job_name, artifact_type, artifact_s3_url 379 ) 380 381 if app_type == "IOS_APP": 382 benchmark_results = extract_ios_benchmark_results( 383 job_name, artifact_type, artifact_s3_url 384 ) 385 386 if benchmark_results: 387 benchmark_results = transform( 388 app_type, 389 benchmark_results, 390 args.repo, 391 args.head_branch, 392 args.workflow_name, 393 args.workflow_run_id, 394 args.workflow_run_attempt, 395 job_name, 396 extract_job_id(args.artifacts), 397 ) 398 all_benchmark_results.extend(benchmark_results) 399 400 if all_benchmark_results: 401 output_file = os.path.basename(args.artifacts) 402 with open(f"{args.output_dir}/{output_file}", "w") as f: 403 json.dump(all_benchmark_results, f) 404 405 406if __name__ == "__main__": 407 main() 408