#!/usr/bin/env python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Used for Google-internal artifact size tracking.

See go/tf-devinfra/sizetrack.

INVOCATION: The following flags are required:

  sizetrack_helper.py \
      --artifact=ARTIFACT, or --manual_bytes=MANUAL_BYTES \
      --artifact_id=ARTIFACT_ID \
      --team=TEAM \
      ... other optional args ...

On Windows you might need something like:

  C:\Python38\python.exe C:\path\to\sizetrack_helper.py ...

PREREQUISITES:

  1. Your current activated GCP user must have access scopes and IAM
     permissions to do the following:

      1. Query and load data into BigQuery
      2. Upload files to GCS

  2. Your environment must match the following criteria:

      1. Current directory is a git repository
      2. CL-based commits have a PiperOrigin-RevId trailer. This is the case
         for any use of Copybara Single-source-of-truth, e.g. TensorFlow.
         Only these commits are considered when running commands.
"""

import argparse
import csv
import datetime
import os
import os.path
import pathlib
import platform
import re
import subprocess

parser = argparse.ArgumentParser(
    usage=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "--project",
    type=str,
    default="tensorflow-testing",
    help="GCP project you can access.")
parser.add_argument(
    "--dataset",
    type=str,
    default="sizetracker",
    help="BigQuery dataset containing --table")
parser.add_argument(
    "--table", type=str, default="tensorflow_devinfra", help="BigQuery table.")
parser.add_argument(
    "--upload",
    action="store_true",
    help="Upload the artifact to --bucket for analysis.")
parser.add_argument(
    "--bucket",
    type=str,
    default="gs://tf-sizetracker-artifacts",
    help="GCS bucket for artifacts.")
parser.add_argument(
    "--team",
    type=str,
    help="For grouping in the dashboard and buckets; e.g. tf-lite-team.")
parser.add_argument(
    "--artifact_id",
    type=str,
    help="Unique ID for your artifact, used for sorting dashboards.")
parser.add_argument(
    "-n",
    "--dry_run",
    action="store_true",
    help="Dry run: do not load to BigQuery or upload to GCS.")
parser.add_argument(
    "--job",
    type=str,
    help="Name of job calling this script. Default: $KOKORO_JOB_NAME.")
parser.add_argument(
    "--build_id",
    type=str,
    help="UUID of build calling this script. Default: $KOKORO_BUILD_ID.")
parser.add_argument(
    "--print_schema",
    action="store_true",
    help="Print the table schema and don't do anything else.")
size = parser.add_mutually_exclusive_group()
size.add_argument(
    "--artifact",
    type=argparse.FileType("r"),
    help="Local file you are measuring.")
size.add_argument(
    "--manual_bytes",
    type=int,
    help="Manually set the recorded size instead of providing an artifact.")
FLAGS = parser.parse_args()

NOW = datetime.datetime.now(
    datetime.timezone.utc).replace(microsecond=0).isoformat()
TABLE_NAME = "{}.{}".format(FLAGS.dataset, FLAGS.table)
PROJECT_LEVEL_TABLE_NAME = "{}:{}".format(FLAGS.project, TABLE_NAME)
CL_TRAILER = "PiperOrigin-RevId"
PRETTY_COMMIT_DATE = "%cI"
# \001 is a byte with value "1", in octal. We use this in git_pretty()
PRETTY_CL = "\001%(trailers)\001"
PRETTY_HEAD_INFO = "%h\t{cl}\t%s\t%ae\t%aI\t%ce\t%cI".format(cl=PRETTY_CL)
PRETTY_EARLY = "%aI\t{cl}\t%cI".format(cl=PRETTY_CL)
PRETTY_COMMIT = "%h"
# This is a BigQuery table schema defined as CSV
# See https://cloud.google.com/bigquery/docs/schemas
SCHEMA = ",".join([
    "id:string",
    "filename:string",
    # These 6 lines are from git's format=pretty
    # %h $CL_PRETTY %s %ae %aI %ce %cI
    "commit:string",
    "cl:int64",
    "description:string",
    "author:string",
    "author_date:timestamp",
    "committer:string",
    "commit_date:timestamp",
    # Done with format=pretty
    "earliest_commit:string",
    "earliest_cl:int64",
    "earliest_author_date:timestamp",
    "earliest_commit_date:timestamp",
    "all_commits:string",
    "all_cls:string",
    "bytes:int64",
    "team:string",
    "logged_date:timestamp",
    "uploaded_to:string",
    "job:string",
    "build_id:string",
])
# Select the earliest recorded commit in the same table for the same artifact
# and team. Used to determine the full range of tested commits for each
# invocation. Returns an empty string if there are no earlier records.
BQ_GET_EARLIEST_INCLUDED_COMMIT = """
  SELECT
    commit
  FROM {table} WHERE
    commit_date < '{earlier_than_this_date}'
    AND id = '{artifact_id}'
    AND team = '{team}'
  ORDER BY commit_date DESC LIMIT 1
"""


def git_pretty(commit_range, pretty_format, n=None):
  r"""Run git log and return the cleaned results.

  Git is assumed to be available in the PATH.

  The PiperOrigin-RevId trailer always picks up an extra newline, so this
  splits entries on a null byte (\0, or %x00 for git log) and removes
  newlines.

  Args:
    commit_range: Standard range given to git log, e.g. HEAD~1..HEAD
    pretty_format: See https://git-scm.com/docs/pretty-formats
    n: Number of commits to get. By default, get all within commit_range.

  Returns:
    List of strings of whatever the format string was.
  """
  n = [] if n is None else ["-n", str(n)]
  try:
    ret = subprocess.run([
        "git", "log", *n, "--date", "iso", "--grep", CL_TRAILER, commit_range,
        "--pretty=format:" + pretty_format + "%x00"
    ],
                         check=True,
                         universal_newlines=True,
                         stderr=subprocess.PIPE,
                         stdout=subprocess.PIPE)
  except subprocess.CalledProcessError as e:
    print(e.stderr)
    print(e.stdout)
    raise e
  out = ret.stdout.replace("\n", "")
  # Unique case: old versions of git do not expand the special parts of the
  # trailers formatter. In that case, the entire formatter remains, and we
  # need to extract the information in another way. The %trailers general
  # formatter is available, so we'll use that and regex over it.
  cleaned = list(filter(None, map(str.strip, out.split("\0"))))
  trailers_removed = []
  for row in cleaned:
    # Find: a chunk of text surrounded by \001, and extract the number after
    # PiperOrigin-RevId.
    row = re.sub("\001.*PiperOrigin-RevId: (?P<cl>[0-9]+).*\001", r"\g<1>", row)
    trailers_removed.append(row)
  return trailers_removed


def gcloud(tool, args, stdin=None):
  r"""Run a Google cloud utility.

  On Linux and MacOS, utilities are assumed to be in the PATH.
  On Windows, utilities are assumed to be available as
  C:\Program Files (x86)\Google\Cloud SDK\google-cloud-sdk\bin\{tool}.cmd

  Args:
    tool: CLI tool, e.g. bq, gcloud, gsutil
    args: List of arguments, same format as subprocess.run
    stdin: String to send to stdin

  Returns:
    String, the stdout of the tool
  """

  if platform.system() == "Windows":
    tool = (r"C:\Program Files (x86)\Google\Cloud "
            r"SDK\google-cloud-sdk\bin\{}.cmd").format(tool)

  try:
    ret = subprocess.run([tool, *args],
                         check=True,
                         universal_newlines=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         input=stdin)
  except subprocess.CalledProcessError as e:
    print(e.stderr)
    print(e.stdout)
    raise e
  return ret.stdout.strip()


def bq(args, stdin=None):
  """Helper for running bq, the BigQuery tool."""
  # bq prints extra messages to stdout if ~/.bigqueryrc doesn't exist
  pathlib.Path(pathlib.Path.home() / ".bigqueryrc").touch()
  return gcloud(
      "bq", ["--project_id", FLAGS.project, "--headless", *args], stdin=stdin)


def get_all_tested_commits():
  """Get details about the full commit range tested by this invocation."""
  head_info = git_pretty("HEAD", PRETTY_HEAD_INFO, n=1)
  _, _, _, _, _, _, current_commit_date = head_info[0].split("\t")

  query_earliest_included_commit = BQ_GET_EARLIEST_INCLUDED_COMMIT.format(
      table=TABLE_NAME,
      earlier_than_this_date=current_commit_date,
      artifact_id=FLAGS.artifact_id,
      team=FLAGS.team)

  # --format=csv returns an empty string if no results, or else two lines:
  #   commit
  #   COMMIT_HASH
  earliest_commit = bq(["query", "--format", "csv", "--nouse_legacy_sql"],
                       stdin=query_earliest_included_commit)

  # Compute the commit/CL range since the last test
  if earliest_commit:
    earliest_commit = earliest_commit.splitlines()[-1]  # Ignore CSV header
    early_author_date, early_cl, early_commit_date = git_pretty(
        earliest_commit, PRETTY_EARLY, n=1)[0].split("\t")

    all_range = "{commit}..HEAD".format(commit=earliest_commit)
    # Reversed: convert to chronological
    all_commits = ",".join(reversed(git_pretty(all_range, PRETTY_COMMIT)))
    all_changelists = ",".join(reversed(git_pretty(all_range, PRETTY_CL)))

    return [
        earliest_commit, early_cl, early_author_date, early_commit_date,
        all_commits, all_changelists
    ]

  # If the artifact has never been tracked before this commit, return empty
  # strings; empty cells in CSV loads are loaded as NULL values.
  else:
    return [""] * 6


def get_upload_path():
  """Generate URL for 'gsutil cp'."""
  if FLAGS.upload and FLAGS.artifact:
    artifact_filename = os.path.basename(FLAGS.artifact.name)
    # Note: not os.path.join here, because gsutil paths are always Linux-style.
    # Using a timestamp prevents duplicate entries
    path = "{bucket}/{team}/{artifact_id}/{now}.{artifact_filename}".format(
        bucket=FLAGS.bucket,
        team=FLAGS.team,
        artifact_id=FLAGS.artifact_id,
        now=NOW,
        artifact_filename=artifact_filename)
    return path
  else:
    return ""


def build_row():
  """Assemble one row of data about this artifact."""
  (earliest_commit, early_cl, early_author_date, early_commit_date,
   all_commits, all_changelists) = get_all_tested_commits()

  # Use UTC to make sure machines in different timezones load consistent data
  current_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
  artifact_filename = ("NO_FILE" if not FLAGS.artifact else os.path.basename(
      FLAGS.artifact.name))
  size_bytes = FLAGS.manual_bytes or os.path.getsize(FLAGS.artifact.name)
  head_info = git_pretty("HEAD", PRETTY_HEAD_INFO, n=1)
  all_head_info_items = head_info[0].split("\t")
  return [
      FLAGS.artifact_id,
      artifact_filename,
      *all_head_info_items,
      earliest_commit,
      early_cl,
      early_author_date,
      early_commit_date,
      all_commits,
      all_changelists,
      size_bytes,
      FLAGS.team,
      current_time,
      get_upload_path(),
      FLAGS.job,
      FLAGS.build_id,
  ]


def main():

  # Validate flags
  if FLAGS.print_schema:
    print(SCHEMA)
    exit(0)
  elif not FLAGS.team or not FLAGS.artifact_id or not (FLAGS.artifact or
                                                       FLAGS.manual_bytes):
    print(
        "--team and --artifact_id are required if --print_schema is not "
        "specified.\nYou must also specify one of --artifact or --manual_bytes."
        "\nPass -h or --help for usage.")
    exit(1)

  if not FLAGS.job:
    FLAGS.job = os.environ.get("KOKORO_JOB_NAME", "NO_JOB")
  if not FLAGS.build_id:
    FLAGS.build_id = os.environ.get("KOKORO_BUILD_ID", "NO_BUILD")

  # Generate data about this artifact into a Tab Separated Value file
  next_tsv_row = build_row()

  # Upload artifact into GCS if it exists
  if FLAGS.upload and FLAGS.artifact:
    upload_path = get_upload_path()
    if FLAGS.dry_run:
      print("DRY RUN: Would gsutil cp to:\n{}".format(upload_path))
    else:
      gcloud("gsutil", ["cp", FLAGS.artifact.name, upload_path])

  # Load into BigQuery
  if FLAGS.dry_run:
    print("DRY RUN: Generated this TSV row:")
    print("\t".join(map(str, next_tsv_row)))
  else:
    with open("data.tsv", "w", newline="") as tsvfile:
      writer = csv.writer(
          tsvfile,
          delimiter="\t",
          quoting=csv.QUOTE_MINIMAL,
          lineterminator=os.linesep)
      writer.writerow(next_tsv_row)
    bq([
        "load", "--source_format", "CSV", "--field_delimiter", "tab",
        PROJECT_LEVEL_TABLE_NAME, "data.tsv", SCHEMA
    ])


if __name__ == "__main__":
  main()