#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2019 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Download profdata from different arches, merge them and upload to gs.

The script is used for updating the PGO profiles for LLVM. The workflow
is that the script will download profdata from different PGO builds, merge
them and then upload it to a gs location that LLVM can access.

The simplest way of using this script, is to run:
  ./merge_profdata_and_upload.py --all_latest_profiles
which will automatically grab profdata from latest PGO generate builders
for three different architectures and merge them. LLVM hash is also
detected automatically from the artifacts.

If you want to specify certain llvm hash, run it with:
  ./merge_profdata_and_upload.py --all_latest_profiles --llvm_hash LLVM_HASH
Note that hash checking will fail if the llvm hash you provided is not the
same as those in artifacts, or llvm hash in different artifacts are not the
same.

To only use profiles from buildbucket tasks for PGO generate, run it with:
  ./merge_profdata_and_upload.py -b amd64/bb_id1 -b arm/bb_id2 ...
The buildbucket id can be found using `bb ls` command after manually launched
builder finishes.

There is a chance that builders only succeeded partially, in this case, you
can run this script to merge both profdata from builder scheduled and manually
launched:
  ./merge_profdata_and_upload.py -l arm -l amd64 -b arm64/bb_id
In this example, the script will merge profdata from arm and amd64 builder, and
profdata from an arm64 buildbucket task.
"""

from __future__ import print_function

import argparse
import collections
import json
import os
import os.path
import shutil
import subprocess
import sys
import tempfile

_LLVM_PROFDATA = '/usr/bin/llvm-profdata'
_GS_PREFIX = 'gs://'
_PROFDATA_TARBALL_SUFFIX = '.llvm.profdata.tar.xz'
_METADATA_SUFFIX = '.llvm_metadata.json'

_LLVMMetadata = collections.namedtuple('_LLVMMetadata', ['head_sha'])


def _fetch_gs_artifact(remote_name, local_name):
  """Fetch single file from remote gs location to local.

  Args:
    remote_name: full gs location to the file.
    local_name: the name of local file to be copied to.

  Raises:
    subprocess.CalledProcessError: if `gsutil cp` exits with non-zero status.
  """
  assert remote_name.startswith(_GS_PREFIX)
  subprocess.check_call(['gsutil', 'cp', remote_name, local_name])


def _get_gs_profdata(remote_profdata, arch):
  """Fetch and extract profdata from remote gs location.

  The tarball is downloaded and unpacked in the current working directory,
  and the extracted profile is moved to `{arch}/llvm.profdata`.

  Args:
    remote_profdata: remote gs location of the profdata tarball.
    arch: directory named with arch to separate each profdata.

  Returns:
    Local location of the extracted profdata.

  Raises:
    RuntimeError: if the tarball does not appear to contain a profdata file.
  """
  tar = 'llvm_profdata.tar.xz'
  _fetch_gs_artifact(remote_profdata, tar)
  extract_cmd = ['tar', '-xvf', tar]

  # The output of the `tar` command should only contain one line of the
  # extracted profdata name.
  profdata_name = subprocess.check_output(
      extract_cmd, encoding='utf-8').strip()
  if '.llvm.profdata' not in profdata_name:
    raise RuntimeError('No profdata in the tarball: %s' % remote_profdata)

  os.mkdir(arch)
  profdata_loc = os.path.join(arch, 'llvm.profdata')
  os.rename(profdata_name, profdata_loc)
  print('Profdata extracted to: %s' % profdata_loc)
  return profdata_loc


def _get_gs_metadata(remote_metadata):
  """Fetch metadata from remote gs location and read the LLVM head_sha.

  Args:
    remote_metadata: remote gs location of the metadata json file.

  Returns:
    An _LLVMMetadata holding the `head_sha` read from the json file.
  """
  metadata_basename = 'llvm_metadata.json'
  _fetch_gs_artifact(remote_metadata, metadata_basename)

  with open(metadata_basename) as f:
    result = json.load(f)

  return _LLVMMetadata(head_sha=result['head_sha'])


def _select_latest_profdata_url(ls_lines):
  """Pick the newest profdata tarball URL out of `gsutil ls -l` output lines.

  Args:
    ls_lines: iterable of text lines as printed by `gsutil ls -l`.

  Returns:
    The URL of the profdata tarball with the greatest timestamp, or an empty
    string if no line names a profdata tarball. On equal timestamps the line
    that appears first wins.
  """
  # An example of the output of latest builder bucket:
  # pylint: disable=line-too-long
  # 5006528  2020-05-31T10:08:48Z  gs://chromeos-toolchain-artifacts/llvm-pgo/arm/llvm-11.0_pre387436_p20200403-r7-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm.profdata.tar.xz
  # 56  2020-05-31T10:08:48Z  gs://chromeos-toolchain-artifacts/llvm-pgo/arm/llvm-11.0_pre387436_p20200403-r7-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm_metadata.json
  # 5005952  2020-05-24T10:53:34Z  gs://chromeos-toolchain-artifacts/llvm-pgo/arm/llvm-11.0_pre387436_p20200403-r5-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm.profdata.tar.xz
  # 56  2020-05-24T10:53:34Z  gs://chromeos-toolchain-artifacts/llvm-pgo/arm/llvm-11.0_pre387436_p20200403-r5-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm_metadata.json
  # An example for the lines of buildbucket location:
  # 5004260  2020-05-29T09:48:04Z  gs://chromeos-image-archive/arm-pgo-generate-llvm-next-toolchain/R85-13254.0.0-1-8879010326583123168/llvm-11.0_pre387436_p20200403-r7-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm.profdata.tar.xz
  # 56  2020-05-29T09:48:04Z  gs://chromeos-image-archive/arm-pgo-generate-llvm-next-toolchain/R85-13254.0.0-1-8879010326583123168/llvm-11.0_pre387436_p20200403-r7-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm_metadata.json
  # pylint: enable=line-too-long
  candidates = []
  for line in ls_lines:
    fields = line.split()
    # Blank lines and one-token lines carry no timestamp column; indexing
    # them is what used to raise IndexError, so skip them.
    if len(fields) < 2:
      continue
    url = fields[-1]
    if _PROFDATA_TARBALL_SUFFIX in url:
      candidates.append((fields[1], url))

  if not candidates:
    return ''
  # Timestamps are compared as plain strings; the sample listings above show
  # a fixed-width form for which string order is chronological order. max()
  # returns the first maximal element, so ties keep listing order.
  return max(candidates, key=lambda candidate: candidate[0])[1]


def _find_latest_artifacts(gs_url, arch):
  """Fetch the latest profdata and metadata from a given gs location.

  Args:
    gs_url: a gs location containing one or more artifacts to fetch.
    arch: the arch profdata collected from.

  Returns:
    A tuple of (metadata, local profdata location).

  Raises:
    RuntimeError: if the location cannot be listed or holds no profdata.
  """
  assert gs_url.startswith(_GS_PREFIX)
  try:
    # List all artifacts in the gs location.
    output = subprocess.check_output(['gsutil', 'ls', '-l', gs_url],
                                     encoding='utf-8')
  except subprocess.CalledProcessError as err:
    raise RuntimeError('Artifacts not found: %s' % gs_url) from err

  profdata_url = _select_latest_profdata_url(output.strip().split('\n'))
  if not profdata_url:
    raise RuntimeError('No profdata found from %s' % gs_url)
  profile_path = _get_gs_profdata(profdata_url, arch)

  # The metadata json sits next to the tarball under the same base name.
  metadata_url = profdata_url.replace(_PROFDATA_TARBALL_SUFFIX,
                                      _METADATA_SUFFIX)
  metadata = _get_gs_metadata(metadata_url)
  return metadata, profile_path


def _fetch_from_latest(arch):
  """Fetch artifacts from latest builders.

  Args:
    arch: the arch profdata collected from.

  Returns:
    A tuple of (metadata, local profdata location).
  """
  print('\nFETCHING LATEST PROFDATA ON %s...' % arch.upper())
  remote_latest = (
      '%schromeos-toolchain-artifacts/llvm-pgo/%s' % (_GS_PREFIX, arch))
  return _find_latest_artifacts(remote_latest, arch)


def _fetch_from_buildbucket(arch, bb):
  """Fetch artifacts from buildbucket task.

  Args:
    arch: the arch profdata collected from.
    bb: buildbucket id.

  Returns:
    A tuple of (metadata, local profdata location).

  Raises:
    RuntimeError: if no bucket listed for the arch contains the id.
  """
  print('\nFETCHING BUILDBUCKET PROFDATA ON %s...' % arch.upper())
  remote_arch = ('%schromeos-image-archive/%s-pgo-generate-llvm-next-toolchain'
                 % (_GS_PREFIX, arch))
  # List all buckets under {arch}-pgo-generate-llvm-next-toolchain and
  # grep with buildbucket id.
  remote_bb = subprocess.check_output(['gsutil', 'ls', remote_arch],
                                      encoding='utf-8').strip().split('\n')
  for line in remote_bb:
    if bb in line:
      return _find_latest_artifacts(line, arch)
  raise RuntimeError('No matched results found in %s with bb: %s' % (arch, bb))


def _merge_profdata(profdata_list, output_name):
  """Merge profdata.

  Args:
    profdata_list: list of profdata location of each arch.
    output_name: name of merged profdata.
  """
  merge_cmd = [_LLVM_PROFDATA, 'merge', '-output', output_name] + profdata_list
  print('\nMerging PGO profiles.\nCMD: %s' % merge_cmd)
  subprocess.check_call(merge_cmd)


def _tar_and_upload_profdata(profdata, name_suffix):
  """Create a tarball of merged profdata and upload to certain gs location.

  Args:
    profdata: location of merged profdata.
    name_suffix: usually the LLVM head_sha.

  Raises:
    ValueError: if gsutil reports that it skipped an already existing item.
  """
  tarball = 'llvm-profdata-%s.tar.xz' % name_suffix
  print('Making profdata tarball: %s' % tarball)
  subprocess.check_call(
      ['tar', '--sparse', '-I', 'xz', '-cf', tarball, profdata])

  upload_location = '%schromeos-localmirror/distfiles/%s' % (_GS_PREFIX,
                                                             tarball)

  # TODO: it's better to create a subdir: distfiles/llvm_pgo_profile, but
  # now llvm could only recognize distfiles.
  upload_cmd = [
      'gsutil',
      '-m',
      'cp',
      '-n',
      '-a',
      'public-read',
      tarball,
      upload_location,
  ]
  print('\nUploading tarball to gs.\nCMD: %s\n' % upload_cmd)

  # gsutil prints all status to stderr, oddly enough.
  gs_output = subprocess.check_output(
      upload_cmd, stderr=subprocess.STDOUT, encoding='utf-8')

  # gsutil exits successfully even if it uploaded nothing. It prints a summary
  # of what all it did, though. Successful uploads are just a progress bar,
  # unsuccessful ones note that items were skipped.
  if 'Skipping existing item' in gs_output:
    raise ValueError('Profile upload failed: would overwrite an existing '
                     'profile at %s' % upload_location)


def _split_arch_bb(arch_bb):
  """Split a `{arch}/{bb_id}` command line value into its two parts.

  Args:
    arch_bb: the raw value given to --buildbucket.

  Returns:
    A tuple of (arch, buildbucket id), both non-empty.

  Raises:
    ValueError: if the value is not exactly two non-empty parts joined by
      a single '/'.
  """
  arch, sep, bb = arch_bb.partition('/')
  # An empty id must be rejected: the bucket lookup is a substring match and
  # an empty string is a substring of every listed bucket.
  if not sep or not arch or not bb or '/' in bb:
    raise ValueError(
        'Invalid --buildbucket value %r. Format should be: {arch}/{bb_id}.' %
        arch_bb)
  return arch, bb


def main():
  """Parse arguments, fetch the requested profiles, merge and upload them.

  All work happens inside a fresh temporary directory, which is removed on
  success and kept (with its path printed) when anything fails.
  """
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
  parser.add_argument(
      '-a',
      '--all_latest_profiles',
      action='store_true',
      help='Merge and upload profiles from the latest builders.')
  parser.add_argument(
      '-l',
      '--latest',
      default=[],
      action='append',
      help='User can specify the profdata from which builder with specific '
      'architecture to download. By default, we merge profdata from arm, '
      'arm64, amd64.')
  parser.add_argument(
      '-b',
      '--buildbucket',
      default=[],
      action='append',
      help='Extra pgo-generate-llvm-next-toolchain buildbucket results to be '
      'used. Format should be: {arch}/{bb_id}.')
  parser.add_argument(
      '-o',
      '--output',
      default='llvm.profdata',
      help='Where to put merged PGO profile. The default is to not save it '
      'anywhere.')
  parser.add_argument(
      '--llvm_hash',
      help='The LLVM hash to select for the profiles. Generally autodetected.')
  args = parser.parse_args()

  if not args.all_latest_profiles and not (args.latest or args.buildbucket):
    parser.error('Please specify whether to use latest profiles or '
                 'profiles from buildbucket')

  if args.all_latest_profiles and (args.latest or args.buildbucket):
    parser.error('--all_latest_profiles cannot be specified together '
                 'with --latest or --buildbucket')

  if args.all_latest_profiles:
    latest = ['arm', 'arm64', 'amd64']
  else:
    latest = args.latest

  all_arch_list = list(latest)
  arch_bb_list = []
  for arch_bb in args.buildbucket:
    try:
      arch, bb = _split_arch_bb(arch_bb)
    except ValueError as err:
      # parser.error() prints the usage and exits; it does not return.
      parser.error(str(err))
    arch_bb_list.append((arch, bb))
    all_arch_list.append(arch)

  # Each arch gets its own sub-directory in the temp dir, so an arch may not
  # be requested twice.
  if len(set(all_arch_list)) != len(all_arch_list):
    parser.error('Each arch can be only passed once.')

  # shutil.which replaces distutils.spawn.find_executable: distutils is gone
  # from the standard library as of Python 3.12.
  if not shutil.which(_LLVM_PROFDATA):
    sys.exit(_LLVM_PROFDATA + ' not found; are you in the chroot?')

  initial_dir = os.getcwd()
  temp_dir = tempfile.mkdtemp(prefix='merge_pgo')
  # Only flipped once every step below has finished, so any exception,
  # including KeyboardInterrupt and SystemExit, leaves the temp dir behind.
  success = False
  try:
    os.chdir(temp_dir)
    profdata_list = []
    heads = set()

    def append_artifacts(fetched_tuple):
      llvm_metadata, profdata_loc = fetched_tuple
      if os.path.getsize(profdata_loc) < 512 * 1024:
        raise RuntimeError('The PGO profile in local path %s is suspiciously '
                           'small. Something might have gone '
                           'wrong.' % profdata_loc)
      heads.add(llvm_metadata.head_sha)
      profdata_list.append(profdata_loc)

    for arch in latest:
      append_artifacts(_fetch_from_latest(arch))

    for arch, bb in arch_bb_list:
      append_artifacts(_fetch_from_buildbucket(arch, bb))

    assert heads, "Didn't fetch anything?"

    def die_with_head_complaint(complaint):
      extra = ' (HEADs found: %s)' % sorted(heads)
      raise RuntimeError(complaint.rstrip() + extra)

    llvm_hash = args.llvm_hash
    if not llvm_hash:
      if len(heads) != 1:
        die_with_head_complaint(
            '%d LLVM HEADs were found, which is more than one. You probably '
            'want a consistent set of HEADs for a profile. If you know you '
            "don't, please specify --llvm_hash, and note that *all* profiles "
            'will be merged into this final profile, regardless of their '
            'reported HEAD.' % len(heads))
      llvm_hash, = heads

    if llvm_hash not in heads:
      assert llvm_hash == args.llvm_hash
      die_with_head_complaint(
          "HEAD %s wasn't found in any fetched artifacts." % llvm_hash)

    print('\nUsing LLVM hash: %s' % llvm_hash)

    _merge_profdata(profdata_list, args.output)
    print('Merged profdata locates at %s' % os.path.abspath(args.output))
    _tar_and_upload_profdata(args.output, name_suffix=llvm_hash)
    print('\nMerged profdata uploaded successfully.')
    success = True
  finally:
    os.chdir(initial_dir)
    if success:
      print('Clearing temp directory.')
      shutil.rmtree(temp_dir, ignore_errors=True)
    else:
      print('Script fails, temp directory is at: %s' % temp_dir)


if __name__ == '__main__':
  sys.exit(main())