• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3# Copyright 2019 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Download profdata from different arches, merge them and upload to gs.
8
9The script is used for updating the PGO profiles for LLVM. The workflow
10is that the script will download profdata from different PGO builds, merge
11them and then upload it to a gs location that LLVM can access.
12
The simplest way to use this script is to run:
14    ./merge_profdata_and_upload.py --all_latest_profiles
15which will automatically grab profdata from latest PGO generate builders
16for three different architectures and merge them. LLVM hash is also
17detected automatically from the artifacts.
18
19If you want to specify certain llvm hash, run it with:
20    ./merge_profdata_and_upload.py --all_latest_profiles --llvm_hash LLVM_HASH
21Note that hash checking will fail if the llvm hash you provided is not the
22same as those in artifacts, or llvm hash in different artifacts are not the
23same.
24
25To only use profiles from buildbucket tasks for PGO generate, run it with:
26    ./merge_profdata_and_upload.py -b amd64/bb_id1 -b arm/bb_id2 ...
The buildbucket id can be found using the `bb ls` command after a manually
launched builder finishes.
29
There is a chance that the builders only partially succeeded. In this case,
you can run this script to merge profdata from both the scheduled builders and
manually launched ones:
33    ./merge_profdata_and_upload.py -l arm -l amd64 -b arm64/bb_id
34In this example, the script will merge profdata from arm and amd64 builder, and
35profdata from an arm64 buildbucket task.
36"""
37
38from __future__ import print_function
39
40import argparse
41import collections
42import distutils.spawn
43import json
44import os
45import os.path
46import shutil
47import subprocess
48import sys
49import tempfile
50
# Absolute path of the llvm-profdata binary used to merge the profiles.
_LLVM_PROFDATA = '/usr/bin/llvm-profdata'
# Scheme prefix that every gs location handled by this script starts with.
_GS_PREFIX = 'gs://'

# Metadata read from an artifact's metadata json; only head_sha is kept.
_LLVMMetadata = collections.namedtuple('_LLVMMetadata', ['head_sha'])
55
56
def _fetch_gs_artifact(remote_name, local_name):
  """Copy one file out of a gs location with `gsutil cp`.

  Args:
    remote_name: full gs location of the source file; must start with the
      gs prefix.
    local_name: local path the file is copied to.

  Raises:
    AssertionError: if remote_name does not start with the gs prefix.
    subprocess.CalledProcessError: if gsutil exits with a non-zero status.
  """
  assert remote_name.startswith(_GS_PREFIX)
  copy_cmd = ['gsutil', 'cp', remote_name, local_name]
  subprocess.check_call(copy_cmd)
66
67
def _get_gs_profdata(remote_profdata, arch):
  """Download a profdata tarball and unpack it into a per-arch directory.

  Args:
    remote_profdata: remote gs location of the profdata tarball.
    arch: name of the directory to create for this profdata, so that the
      profiles of different arches stay separate.

  Returns:
    Local location of the extracted profdata: <arch>/llvm.profdata.

  Raises:
    RuntimeError: if the output of tar does not mention a .llvm.profdata file.
  """
  tarball_name = 'llvm_profdata.tar.xz'
  _fetch_gs_artifact(remote_profdata, tarball_name)

  # With -v, tar prints what it extracts; that output is expected to be a
  # single line holding the name of the extracted profdata (as bytes).
  extracted = subprocess.check_output(['tar', '-xvf', tarball_name]).strip()
  if b'.llvm.profdata' not in extracted:
    raise RuntimeError('No profdata in the tarball: %s' % remote_profdata)

  os.mkdir(arch)
  destination = os.path.join(arch, 'llvm.profdata')
  os.rename(extracted, destination)
  print('Profdata extracted to: %s' % destination)
  return destination
93
94
def _get_gs_metadata(remote_metadata):
  """Download the metadata json and pull the LLVM head_sha out of it.

  Args:
    remote_metadata: remote gs location of the metadata json file.

  Returns:
    An _LLVMMetadata whose head_sha is the 'head_sha' entry of the json.
  """
  local_json = 'llvm_metadata.json'
  _fetch_gs_artifact(remote_metadata, local_json)

  with open(local_json) as metadata_file:
    head_sha = json.load(metadata_file)['head_sha']
  return _LLVMMetadata(head_sha=head_sha)
111
112
def _find_latest_artifacts(gs_url, arch):
  """Fetch the latest profdata and its metadata from a given gs location.

  Args:
    gs_url: a gs location containing one or more artifacts to fetch.
    arch: the arch profdata collected from.

  Returns:
    A tuple of (metadata, local profdata location).

  Raises:
    RuntimeError: if the location cannot be listed, or if the listing does
      not contain any profdata tarball.
  """
  assert gs_url.startswith(_GS_PREFIX)
  try:
    listing = subprocess.check_output(['gsutil', 'ls', '-l', gs_url],
                                      encoding='utf-8')
  except subprocess.CalledProcessError as e:
    raise RuntimeError('Artifacts not found: %s' % gs_url) from e

  # Go through all listed artifacts to find the latest profdata.
  # An example of the output of latest builder bucket:
  # pylint: disable=line-too-long
  #   5006528  2020-05-31T10:08:48Z  gs://chromeos-toolchain-artifacts/llvm-pgo/arm/llvm-11.0_pre387436_p20200403-r7-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm.profdata.tar.xz
  #   56  2020-05-31T10:08:48Z  gs://chromeos-toolchain-artifacts/llvm-pgo/arm/llvm-11.0_pre387436_p20200403-r7-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm_metadata.json
  #   5005952  2020-05-24T10:53:34Z  gs://chromeos-toolchain-artifacts/llvm-pgo/arm/llvm-11.0_pre387436_p20200403-r5-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm.profdata.tar.xz
  #   56  2020-05-24T10:53:34Z  gs://chromeos-toolchain-artifacts/llvm-pgo/arm/llvm-11.0_pre387436_p20200403-r5-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm_metadata.json
  # An example for the lines of buildbucket location:
  #   5004260  2020-05-29T09:48:04Z  gs://chromeos-image-archive/arm-pgo-generate-llvm-next-toolchain/R85-13254.0.0-1-8879010326583123168/llvm-11.0_pre387436_p20200403-r7-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm.profdata.tar.xz
  #   56  2020-05-29T09:48:04Z  gs://chromeos-image-archive/arm-pgo-generate-llvm-next-toolchain/R85-13254.0.0-1-8879010326583123168/llvm-11.0_pre387436_p20200403-r7-a8e5dcb072b1f794883ae8125fb08c06db678d56.llvm_metadata.json
  # pylint: enable=line-too-long
  profdata_suffix = '.llvm.profdata.tar.xz'
  candidates = []
  for line in listing.splitlines():
    fields = line.split()
    # Object lines are "<size> <timestamp> <url>". Anything shorter (blank
    # lines, sub-directory entries) cannot be a profdata tarball, and must
    # not reach the timestamp lookup below.
    if len(fields) < 3:
      continue
    timestamp, url = fields[1], fields[-1]
    if profdata_suffix in url:
      candidates.append((timestamp, url))

  # Raise the intended error rather than tripping over an unset variable
  # when the listing holds no profdata at all.
  if not candidates:
    raise RuntimeError('No profdata found from %s' % gs_url)

  # The timestamps are ISO-8601 strings, so the largest string is the newest.
  # max() keeps the first of equal timestamps, like a stable reverse sort.
  _, profdata_url = max(candidates, key=lambda candidate: candidate[0])
  profile_path = _get_gs_profdata(profdata_url, arch)

  # The metadata json sits next to the tarball, under the same base name.
  metadata_url = profdata_url.replace(profdata_suffix, '.llvm_metadata.json')
  metadata = _get_gs_metadata(metadata_url)
  if not metadata:
    raise RuntimeError('No metadata found from %s' % gs_url)
  return metadata, profile_path
159
160
def _fetch_from_latest(arch):
  """Fetch the newest artifacts uploaded by the latest builder of an arch.

  Args:
    arch: the arch profdata collected from.

  Returns:
    The result of _find_latest_artifacts for the arch's llvm-pgo location:
    a tuple of (metadata, local profdata location).
  """
  banner = '\nFETCHING LATEST PROFDATA ON %s...' % arch.upper()
  print(banner)
  location_parts = (_GS_PREFIX, arch)
  remote_latest = '%schromeos-toolchain-artifacts/llvm-pgo/%s' % location_parts
  return _find_latest_artifacts(remote_latest, arch)
174
175
def _fetch_from_buildbucket(arch, bb):
  """Fetch the artifacts produced by one buildbucket task.

  Args:
    arch: the arch profdata collected from.
    bb: buildbucket id.

  Returns:
    The result of _find_latest_artifacts for the first listed location that
    contains the buildbucket id: a tuple of (metadata, local profdata
    location).

  Raises:
    RuntimeError: if no listed location contains the buildbucket id.
  """
  print('\nFETCHING BUILDBUCKET PROFDATA ON %s...' % arch.upper())
  location_parts = (_GS_PREFIX, arch)
  remote_arch = ('%schromeos-image-archive/%s-pgo-generate-llvm-next-toolchain'
                 % location_parts)

  # List everything under {arch}-pgo-generate-llvm-next-toolchain and keep
  # the first entry that mentions the buildbucket id.
  listing = subprocess.check_output(['gsutil', 'ls', remote_arch],
                                    encoding='utf-8')
  entries = listing.strip().split('\n')
  matched = next((entry for entry in entries if bb in entry), None)
  if matched is None:
    raise RuntimeError(
        'No matched results found in %s with bb: %s' % (arch, bb))
  return _find_latest_artifacts(matched, arch)
197
198
def _merge_profdata(profdata_list, output_name):
  """Run `llvm-profdata merge` over the given profiles.

  Args:
    profdata_list: list of profdata location of each arch.
    output_name: name of merged profdata.

  Raises:
    subprocess.CalledProcessError: if llvm-profdata exits with a non-zero
      status.
  """
  command_head = [_LLVM_PROFDATA, 'merge', '-output', output_name]
  merge_cmd = command_head + profdata_list
  print('\nMerging PGO profiles.\nCMD: %s' % merge_cmd)
  subprocess.check_call(merge_cmd)
209
210
def _tar_and_upload_profdata(profdata, name_suffix):
  """Pack the merged profdata into a tarball and upload it to gs.

  Args:
    profdata: location of merged profdata.
    name_suffix: suffix of the tarball name, usually the LLVM head_sha.

  Raises:
    subprocess.CalledProcessError: if tar or gsutil exits with a non-zero
      status.
    ValueError: if gsutil reports that it skipped an existing item, i.e. a
      profile is already present at the upload location.
  """
  tarball = 'llvm-profdata-%s.tar.xz' % name_suffix
  print('Making profdata tarball: %s' % tarball)
  tar_cmd = ['tar', '--sparse', '-I', 'xz', '-cf', tarball, profdata]
  subprocess.check_call(tar_cmd)

  # TODO: it's better to create a subdir: distfiles/llvm_pgo_profile, but
  # now llvm could only recognize distfiles.
  upload_location = (
      '%schromeos-localmirror/distfiles/%s' % (_GS_PREFIX, tarball))
  upload_cmd = ['gsutil', '-m', 'cp', '-n', '-a', 'public-read']
  upload_cmd += [tarball, upload_location]
  print('\nUploading tarball to gs.\nCMD: %s\n' % upload_cmd)

  # gsutil writes its status to stderr, so fold stderr into the captured
  # output.
  gs_output = subprocess.check_output(
      upload_cmd, stderr=subprocess.STDOUT, encoding='utf-8')

  # A successful exit does not prove that anything was uploaded: gsutil also
  # exits successfully when it skipped the item. Only its printed summary
  # tells the two cases apart.
  if 'Skipping existing item' not in gs_output:
    return
  raise ValueError('Profile upload failed: would overwrite an existing '
                   'profile at %s' % upload_location)
250
251
def main():
  """Parse the command line, then fetch, merge and upload PGO profiles.

  All work happens inside a fresh temporary directory. The directory is
  removed when the run succeeds, and is kept (with its path printed) when
  anything goes wrong, so the downloaded artifacts can be inspected.

  Raises:
    RuntimeError: if a fetched profile is suspiciously small, or if the LLVM
      HEADs of the fetched artifacts are inconsistent with each other or with
      --llvm_hash.
    SystemExit: on invalid command line arguments, or when llvm-profdata is
      not available.
  """
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
  parser.add_argument(
      '-a',
      '--all_latest_profiles',
      action='store_true',
      help='Merge and upload profiles from the latest builders.')
  parser.add_argument(
      '-l',
      '--latest',
      default=[],
      action='append',
      help='User can specify the profdata from which builder with specific '
      'architecture to download. By default, we merge profdata from arm, '
      'arm64, amd64.')
  parser.add_argument(
      '-b',
      '--buildbucket',
      default=[],
      action='append',
      help='Extra pgo-generate-llvm-next-toolchain buildbucket results to be '
      'used. Format should be: {arch}/{bb_id}.')
  parser.add_argument(
      '-o',
      '--output',
      default='llvm.profdata',
      help='Where to put merged PGO profile. The default is to not save it '
      'anywhere.')
  parser.add_argument(
      '--llvm_hash',
      help='The LLVM hash to select for the profiles. Generally autodetected.')
  args = parser.parse_args()

  if not args.all_latest_profiles and not (args.latest or args.buildbucket):
    parser.error('Please specify whether to use latest profiles or '
                 'profiles from buildbucket')

  if args.all_latest_profiles and (args.latest or args.buildbucket):
    parser.error('--all_latest_profiles cannot be specified together '
                 'with --latest or --buildbucket')

  latest = (['arm', 'arm64', 'amd64']
            if args.all_latest_profiles else args.latest)

  all_arch_list = latest.copy()
  arch_bb_list = []
  for arch_bb in args.buildbucket:
    arch, slash, bb = arch_bb.partition('/')
    # Report a malformed value as a usage error instead of a traceback. An
    # empty buildbucket id must be rejected too: as a substring it would
    # match every listed location.
    if not (arch and slash and bb) or '/' in bb:
      parser.error('Invalid --buildbucket value %r; the format should be: '
                   '{arch}/{bb_id}.' % arch_bb)
    arch_bb_list.append((arch, bb))
    all_arch_list.append(arch)

  if len(set(all_arch_list)) != len(all_arch_list):
    parser.error('Each arch can be only passed once.')

  # shutil.which replaces distutils.spawn.find_executable, which is no longer
  # available since distutils was removed from the standard library.
  if not shutil.which(_LLVM_PROFDATA):
    sys.exit(_LLVM_PROFDATA + ' not found; are you in the chroot?')

  initial_dir = os.getcwd()
  temp_dir = tempfile.mkdtemp(prefix='merge_pgo')
  # Only flipped at the very end of the try body, so any exception (including
  # KeyboardInterrupt) leaves it False and the temp directory in place.
  success = False
  try:
    os.chdir(temp_dir)
    profdata_list = []
    heads = set()

    def append_artifacts(fetched_tuple):
      """Record one fetched (metadata, profdata location) pair."""
      llvm_metadata, profdata_loc = fetched_tuple
      if os.path.getsize(profdata_loc) < 512 * 1024:
        raise RuntimeError('The PGO profile in local path %s is suspiciously '
                           'small. Something might have gone '
                           'wrong.' % profdata_loc)
      heads.add(llvm_metadata.head_sha)
      profdata_list.append(profdata_loc)

    for arch in latest:
      append_artifacts(_fetch_from_latest(arch))

    for arch, bb in arch_bb_list:
      append_artifacts(_fetch_from_buildbucket(arch, bb))

    assert heads, "Didn't fetch anything?"

    def die_with_head_complaint(complaint):
      """Raise a RuntimeError that also lists every HEAD that was found."""
      extra = ' (HEADs found: %s)' % sorted(heads)
      raise RuntimeError(complaint.rstrip() + extra)

    llvm_hash = args.llvm_hash
    if not llvm_hash:
      # Without an explicit hash, all artifacts have to agree on one HEAD.
      if len(heads) != 1:
        die_with_head_complaint(
            '%d LLVM HEADs were found, which is more than one. You probably '
            'want a consistent set of HEADs for a profile. If you know you '
            "don't, please specify --llvm_hash, and note that *all* profiles "
            'will be merged into this final profile, regardless of their '
            'reported HEAD.' % len(heads))
      llvm_hash, = heads

    if llvm_hash not in heads:
      assert llvm_hash == args.llvm_hash
      die_with_head_complaint(
          "HEAD %s wasn't found in any fetched artifacts." % llvm_hash)

    print('\nUsing LLVM hash: %s' % llvm_hash)

    _merge_profdata(profdata_list, args.output)
    print('Merged profdata locates at %s' % os.path.abspath(args.output))
    _tar_and_upload_profdata(args.output, name_suffix=llvm_hash)
    print('\nMerged profdata uploaded successfully.')
    success = True
  finally:
    os.chdir(initial_dir)
    if success:
      print('Clearing temp directory.')
      shutil.rmtree(temp_dir, ignore_errors=True)
    else:
      print('Script fails, temp directory is at: %s' % temp_dir)
372
373
# Only run when executed as a script, not when imported as a module.
if __name__ == '__main__':
  sys.exit(main())
376