#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import json
import logging
import os
import re
import zipfile
from argparse import Action, ArgumentParser, Namespace
from io import BytesIO
from logging import info, warning
from typing import Any, Dict, List, Optional
from urllib import error, request


logging.basicConfig(level=logging.INFO)


BENCHMARK_RESULTS_FILENAME = "benchmark_results.json"
ARTIFACTS_FILENAME_REGEX = re.compile(r"(android|ios)-artifacts-(?P<job_id>\d+).json")

# iOS-related regexes and variables
IOS_TEST_SPEC_REGEX = re.compile(
    r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>[\w\+]+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
)
IOS_TEST_NAME_REGEX = re.compile(
    r"test_(?P<method>forward|load|generate)_(?P<model_name>[\w\+]+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
)
# The backend name could contain +, e.g. tinyllama_xnnpack+custom+qe_fp32
IOS_MODEL_NAME_REGEX = re.compile(
    r"(?P<model>[^_]+)_(?P<backend>[\w\+]+)_(?P<dtype>\w+)"
)
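# For example, the model name tinyllama_xnnpack+custom+qe_fp32 mentioned above is
# expected to decompose into model="tinyllama", backend="xnnpack+custom+qe", and
# dtype="fp32"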


class ValidateArtifacts(Action):
    def __call__(
        self,
        parser: ArgumentParser,
        namespace: Namespace,
        values: Any,
        option_string: Optional[str] = None,
    ) -> None:
        if os.path.isfile(values) and values.endswith(".json"):
            setattr(namespace, self.dest, values)
            return

        parser.error(f"{values} is not a valid JSON file (*.json)")


class ValidateOutputDir(Action):
    def __call__(
        self,
        parser: ArgumentParser,
        namespace: Namespace,
        values: Any,
        option_string: Optional[str] = None,
    ) -> None:
        if os.path.isdir(values):
            setattr(namespace, self.dest, values)
            return

        parser.error(f"{values} is not a valid directory")

def parse_args() -> Any:
    parser = ArgumentParser(
        description="extract benchmark results from AWS Device Farm artifacts"
    )
    parser.add_argument(
        "--artifacts",
        type=str,
        required=True,
        action=ValidateArtifacts,
        help="the list of artifacts from AWS in JSON format",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        action=ValidateOutputDir,
        help="the directory to keep the benchmark results",
    )
    parser.add_argument(
        "--repo",
        type=str,
        required=True,
        help="which GitHub repo this workflow run belongs to",
    )
    parser.add_argument(
        "--head-branch",
        type=str,
        required=True,
        help="the head branch that runs",
    )
    parser.add_argument(
        "--workflow-name",
        type=str,
        required=True,
        help="the name of the benchmark workflow",
    )
    parser.add_argument(
        "--workflow-run-id",
        type=int,
        required=True,
        help="the id of the benchmark workflow",
    )
    parser.add_argument(
        "--workflow-run-attempt",
        type=int,
        required=True,
        help="which retry of the workflow this is",
    )

    return parser.parse_args()


def extract_android_benchmark_results(
    job_name: str, artifact_type: str, artifact_s3_url: str
) -> List:
    """
    The benchmark results from Android are already stored in the CUSTOMER_ARTIFACT
    artifact, so we just need to download and parse it.

    Return the list of benchmark results.
    """
    if artifact_type != "CUSTOMER_ARTIFACT":
        return []

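    # Download the zipped CUSTOMER_ARTIFACT and look for the benchmark results
    # JSON file anywhere inside it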
    try:
        with request.urlopen(artifact_s3_url) as data:
            with zipfile.ZipFile(BytesIO(data.read())) as customer_artifact:
                for name in customer_artifact.namelist():
                    if BENCHMARK_RESULTS_FILENAME in name:
                        return json.loads(customer_artifact.read(name))

    except error.HTTPError:
        warning(f"Failed to fetch {artifact_type} from {artifact_s3_url}")
        return []
    except json.decoder.JSONDecodeError:
        # This is to handle the case where there are no benchmark results
        warning(f"Failed to load the benchmark results from {artifact_s3_url}")
        return []

    # No benchmark results were found in the artifact
    return []


def initialize_ios_metadata(test_name: str) -> Dict[str, Any]:
    """
    Extract the benchmark metadata from the test name, for example:
        test_forward_llama2_pte_iOS_17_2_1_iPhone15_4
        test_load_resnet50_xnnpack_q8_pte_iOS_17_2_1_iPhone15_4
    """
    m = IOS_TEST_NAME_REGEX.match(test_name)
    if not m:
        return {}

    method = m.group("method")
    model_name = m.group("model_name")
    ios_ver = m.group("ios_ver").replace("_", ".")
    iphone_ver = m.group("iphone_ver").replace("_", ".")

    # The default backend and quantization dtype if the script couldn't extract
    # them from the model name
    backend = ""
    quantization = "unknown"

    m = IOS_MODEL_NAME_REGEX.match(model_name)
    if m:
        backend = m.group("backend")
        quantization = m.group("dtype")
        model_name = m.group("model")

    return {
        "benchmarkModel": {
            "backend": backend,
            "quantization": quantization,
            "name": model_name,
        },
        "deviceInfo": {
            "arch": f"iPhone {iphone_ver}",
            "device": f"iPhone {iphone_ver}",
            "os": f"iOS {ios_ver}",
            "availMem": 0,
            "totalMem": 0,
        },
        "method": method,
        # These fields will be populated later by extract_ios_metric
        "metric": "",
        "actualValue": 0,
        "targetValue": 0,
    }


def extract_ios_metric(
    benchmark_result: Dict[str, Any],
    test_name: str,
    metric_name: str,
    metric_value: float,
) -> Dict[str, Any]:
    """
    Map the metric name from iOS xcresult to the benchmark result
    """
    method = benchmark_result.get("method", "")
    if not method:
        return benchmark_result

    # NB: This looks brittle, but unless the test can return iOS benchmark results
    # in JSON format, this mapping is needed to match the Android results
    if method == "load":
        if metric_name == "Clock Monotonic Time, s":
            benchmark_result["metric"] = "model_load_time(ms)"
            benchmark_result["actualValue"] = metric_value * 1000

        elif metric_name == "Memory Peak Physical, kB":
            # NB: Showing the value in MB is friendlier IMO
            benchmark_result["metric"] = "peak_load_mem_usage(mb)"
            benchmark_result["actualValue"] = metric_value / 1024

    elif method == "forward":
        if metric_name == "Clock Monotonic Time, s":
            benchmark_result["metric"] = (
                "generate_time(ms)"
                if "llama" in test_name
                else "avg_inference_latency(ms)"
            )
            benchmark_result["actualValue"] = metric_value * 1000

        elif metric_name == "Memory Peak Physical, kB":
            # NB: Showing the value in MB is friendlier IMO
            benchmark_result["metric"] = "peak_inference_mem_usage(mb)"
            benchmark_result["actualValue"] = metric_value / 1024

    elif method == "generate" and metric_name == "Tokens Per Second, t/s":
        benchmark_result["metric"] = "token_per_sec"
        benchmark_result["actualValue"] = metric_value

    return benchmark_result


def extract_ios_benchmark_results(
    job_name: str, artifact_type: str, artifact_s3_url: str
) -> List:
    """
    The benchmark results from iOS currently come from xcresult, which can either
    be parsed from the CUSTOMER_ARTIFACT or taken from the test spec output. The
    latter is probably easier to process.
    """
    if artifact_type != "TESTSPEC_OUTPUT":
        return []

    try:
        benchmark_results = []

        with request.urlopen(artifact_s3_url) as data:
            current_test_name = ""
            current_metric_name = ""
            current_record = {}

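            # Each measurement line in the test spec output is expected to look
            # roughly like (placeholder names):
            #   Test Case '-[SomeTestClass test_name]' measured [Some Metric] average: 1.234, ...
            # as captured by IOS_TEST_SPEC_REGEX above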
            for line in data.read().decode("utf8").splitlines():
                s = IOS_TEST_SPEC_REGEX.search(line)
                if not s:
                    continue

                test_name = s.group("test_name")
                metric_name = s.group("metric")
                metric_value = float(s.group("value"))

                if test_name != current_test_name or metric_name != current_metric_name:
                    if current_record and current_record.get("metric", ""):
                        # Save the benchmark result in the same format used by Android
                        benchmark_results.append(current_record.copy())

                    current_test_name = test_name
                    current_metric_name = metric_name
                    current_record = initialize_ios_metadata(current_test_name)

                current_record = extract_ios_metric(
                    current_record, test_name, metric_name, metric_value
                )

            if current_record and current_record.get("metric", ""):
                benchmark_results.append(current_record.copy())

        return benchmark_results

    except error.HTTPError:
        warning(f"Failed to fetch {artifact_type} from {artifact_s3_url}")
        return []


def extract_job_id(artifacts_filename: str) -> int:
    """
    Extract the job id from the artifacts filename
    """
    m = ARTIFACTS_FILENAME_REGEX.match(os.path.basename(artifacts_filename))
    if not m:
        return 0
    return int(m.group("job_id"))


def transform(
    app_type: str,
    benchmark_results: List,
    repo: str,
    head_branch: str,
    workflow_name: str,
    workflow_run_id: int,
    workflow_run_attempt: int,
    job_name: str,
    job_id: int,
) -> List:
    """
    Transform the benchmark results into the format writable into the benchmark database
    """
    # Overwrite the device name here with the job name as it has more information about
    # the device, e.g. Samsung Galaxy S22 5G instead of just Samsung
    for r in benchmark_results:
        r["deviceInfo"]["device"] = job_name

    # TODO (huydhn): This is the current schema of the database oss_ci_benchmark_v2,
    # and I'm trying to fit ET benchmark results into it, which is kind of awkward.
    # However, the schema is going to be updated soon
    return [
        {
            # GH-info to identify where the benchmark is run
            "repo": repo,
            "head_branch": head_branch,
            "workflow_id": workflow_run_id,
            "run_attempt": workflow_run_attempt,
            "job_id": job_id,
            # The model
            "name": f"{r['benchmarkModel']['name']} {r['benchmarkModel'].get('backend', '')}".strip(),
            "dtype": (
                r["benchmarkModel"]["quantization"]
                if r["benchmarkModel"]["quantization"]
                else "unknown"
            ),
            # The metric value
            "metric": r["metric"],
            "actual": r["actualValue"],
            "target": r["targetValue"],
            # The device
            "device": r["deviceInfo"]["device"],
            "arch": r["deviceInfo"].get("os", ""),
            # Not used here, just set it to something unique
            "filename": workflow_name,
            "test_name": app_type,
            "runner": job_name,
        }
        for r in benchmark_results
    ]


def main() -> None:
    args = parse_args()

    # Across all devices
    all_benchmark_results = []

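    # Each artifact entry is expected to provide at least the app_type, job_name,
    # type, and s3_url fields used below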
    with open(args.artifacts) as f:
        for artifact in json.load(f):
            app_type = artifact.get("app_type", "")
            # We expect this to be set to either ANDROID_APP or IOS_APP
            if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]:
                info(
                    f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}"
                )
                continue

            job_name = artifact["job_name"]
            artifact_type = artifact["type"]
            artifact_s3_url = artifact["s3_url"]

            if app_type == "ANDROID_APP":
                benchmark_results = extract_android_benchmark_results(
                    job_name, artifact_type, artifact_s3_url
                )

            elif app_type == "IOS_APP":
                benchmark_results = extract_ios_benchmark_results(
                    job_name, artifact_type, artifact_s3_url
                )

            if benchmark_results:
                benchmark_results = transform(
                    app_type,
                    benchmark_results,
                    args.repo,
                    args.head_branch,
                    args.workflow_name,
                    args.workflow_run_id,
                    args.workflow_run_attempt,
                    job_name,
                    extract_job_id(args.artifacts),
                )
                all_benchmark_results.extend(benchmark_results)

    if all_benchmark_results:
        output_file = os.path.basename(args.artifacts)
        with open(f"{args.output_dir}/{output_file}", "w") as f:
            json.dump(all_benchmark_results, f)


if __name__ == "__main__":
    main()