#!/usr/bin/env python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
16r"""Used for Google-internal artifact size tracking.
17
18See go/tf-devinfra/sizetrack.
19
20INVOCATION: The following flags are required:
21
22  sizetrack_helper.py \
23      --artifact=ARTIFACT, or --manual_bytes=MANUAL_BYTES
24      --artifact_id=ARTIFACT_ID \
25      --team=TEAM \
26      ... other optional args ...
27
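For example, a run might look like this (the team, artifact_id, and path are
illustrative values, not real ones):

  sizetrack_helper.py \
      --team=tf-lite-team \
      --artifact_id=some_artifact \
      --artifact=/tmp/some_artifact.zip \
      --upload
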
On Windows you might need something like:

    C:\Python38\python.exe C:\path\to\sizetrack_helper.py ...

PREREQUISITES:

  1. Your currently activated GCP user must have the access scopes and IAM
     permissions to do the following:

      1. Query and load data into BigQuery
      2. Upload files to GCS

  2. Your environment must meet the following criteria:

      1. The current directory is a git repository.
      2. CL-based commits carry a PiperOrigin-RevId trailer, as is the case
         for any Copybara single-source-of-truth repository (e.g. TensorFlow).
         Only such commits are considered by this script.
"""

import argparse
import csv
import datetime
import os
import os.path
import pathlib
import platform
import re
import subprocess

parser = argparse.ArgumentParser(
    usage=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "--project",
    type=str,
    default="tensorflow-testing",
    help="GCP project you can access.")
parser.add_argument(
    "--dataset",
    type=str,
    default="sizetracker",
    help="BigQuery dataset containing --table")
parser.add_argument(
    "--table", type=str, default="tensorflow_devinfra", help="BigQuery table.")
parser.add_argument(
    "--upload",
    action="store_true",
    help="Upload the artifact to --bucket for analysis.")
parser.add_argument(
    "--bucket",
    type=str,
    default="gs://tf-sizetracker-artifacts",
    help="GCS bucket for artifacts.")
parser.add_argument(
    "--team",
    type=str,
    help="For grouping in the dashboard and buckets; e.g. tf-lite-team.")
parser.add_argument(
    "--artifact_id",
    type=str,
    help="Unique ID for your artifact, used for sorting dashboards.")
parser.add_argument(
    "-n",
    "--dry_run",
    action="store_true",
    help="Dry run: do not load to BigQuery or upload to GCS.")
parser.add_argument(
    "--job",
    type=str,
    help="Name of job calling this script. Default: $KOKORO_JOB_NAME.")
parser.add_argument(
    "--build_id",
    type=str,
    help="UUID of build calling this script. Default: $KOKORO_BUILD_ID.")
parser.add_argument(
    "--print_schema",
    action="store_true",
    help="Print the table schema and don't do anything else.")
size = parser.add_mutually_exclusive_group()
size.add_argument(
    "--artifact",
    type=argparse.FileType("r"),
    help="Local file you are measuring.")
size.add_argument(
    "--manual_bytes",
    type=int,
    help="Manually set the recorded size instead of providing an artifact.")
FLAGS = parser.parse_args()

NOW = datetime.datetime.now(
    datetime.timezone.utc).replace(microsecond=0).isoformat()
TABLE_NAME = "{}.{}".format(FLAGS.dataset, FLAGS.table)
PROJECT_LEVEL_TABLE_NAME = "{}:{}".format(FLAGS.project, TABLE_NAME)
CL_TRAILER = "PiperOrigin-RevId"
PRETTY_COMMIT_DATE = "%cI"
# \001 is the byte with value 1, written as an octal escape. We use it in
# git_pretty() as a delimiter around the trailers block.
PRETTY_CL = "\001%(trailers)\001"
PRETTY_HEAD_INFO = "%h\t{cl}\t%s\t%ae\t%aI\t%ce\t%cI".format(cl=PRETTY_CL)
PRETTY_EARLY = "%aI\t{cl}\t%cI".format(cl=PRETTY_CL)
PRETTY_COMMIT = "%h"
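# For illustration only (the hash, dates, and RevId below are made up), git log
# with PRETTY_HEAD_INFO renders one commit roughly as:
#   abc1234\t\001PiperOrigin-RevId: 123456789\n\001\tSubject\tauthor@example.com\t...
# git_pretty() below strips the newlines and reduces the trailer block to the
# bare CL number.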
# This is a BigQuery table schema defined as CSV
# See https://cloud.google.com/bigquery/docs/schemas
SCHEMA = ",".join([
    "id:string",
    "filename:string",
    # These 7 columns are from git's format=pretty:
    # %h {PRETTY_CL} %s %ae %aI %ce %cI
    "commit:string",
    "cl:int64",
    "description:string",
    "author:string",
    "author_date:timestamp",
    "committer:string",
    "commit_date:timestamp",
    # Done with format=pretty
    "earliest_commit:string",
    "earliest_cl:int64",
    "earliest_author_date:timestamp",
    "earliest_commit_date:timestamp",
    "all_commits:string",
    "all_cls:string",
    "bytes:int64",
    "team:string",
    "logged_date:timestamp",
    "uploaded_to:string",
    "job:string",
    "build_id:string",
])
# Select the earliest recorded commit in the same table for the same artifact
# and team. Used to determine the full range of tested commits for each
# invocation. Returns empty string if there are no earlier records.
BQ_GET_EARLIEST_INCLUDED_COMMIT = """
  SELECT
    commit
  FROM {table} WHERE
    commit_date < '{earlier_than_this_date}'
    AND id = '{artifact_id}'
    AND team = '{team}'
  ORDER BY commit_date DESC LIMIT 1
"""


# pylint: disable=unused-argument
def git_pretty(commit_range, pretty_format, n=None):
  r"""Run git log and return the cleaned results.

  Git is assumed to be available in the PATH.

  The PiperOrigin-RevId trailer always picks up an extra newline, so this
  removes newlines and splits entries on a null byte (\0, or %x00 for git log).

  Args:
    commit_range: Standard range given to git log, e.g. HEAD~1..HEAD
    pretty_format: See https://git-scm.com/docs/pretty-formats
    n: Number of commits to get. By default, get all within commit_range.

  Returns:
    List of strings, one per commit, in whatever format pretty_format gave.
  """
  n = [] if n is None else ["-n", str(n)]
  try:
    ret = subprocess.run([
        "git", "log", *n, "--date", "iso", "--grep", CL_TRAILER, commit_range,
        "--pretty=format:" + pretty_format + "%x00"
    ],
                         check=True,
                         universal_newlines=True,
                         stderr=subprocess.PIPE,
                         stdout=subprocess.PIPE)
  except subprocess.CalledProcessError as e:
    print(e.stderr)
    print(e.stdout)
    raise e
  out = ret.stdout.replace("\n", "")
  # Unique case: old versions of git do not expand the special parts of the
  # trailers formatter. In that case, the entire formatter remains, and we
  # need to extract the information another way. The general %trailers
  # formatter is available, so we use that and run a regex over it.
  cleaned = list(filter(None, map(str.strip, out.split("\0"))))
  trailers_removed = []
  for row in cleaned:
    # Find a chunk of text surrounded by \001 and extract the number after
    # PiperOrigin-RevId.
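    # For example (the RevId value is illustrative), a row like
    #   "abc1234\t\001PiperOrigin-RevId: 123456789\001\tSubject..."
    # becomes "abc1234\t123456789\tSubject..." after this substitution.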
    row = re.sub("\001.*PiperOrigin-RevId: (?P<cl>[0-9]+).*\001", r"\g<1>", row)
    trailers_removed.append(row)
  return trailers_removed


def gcloud(tool, args, stdin=None):
  r"""Run a Google Cloud utility.

  On Linux and MacOS, utilities are assumed to be in the PATH.
  On Windows, utilities are assumed to be available as
    C:\Program Files (x86)\Google\Cloud SDK\google-cloud-sdk\bin\{tool}.cmd

  Args:
    tool: CLI tool, e.g. bq, gcloud, gsutil
    args: List of arguments, in the same format as subprocess.run
    stdin: String to send to stdin

  Returns:
    String, the stdout of the tool
  """

  if platform.system() == "Windows":
    tool = (r"C:\Program Files (x86)\Google\Cloud "
            r"SDK\google-cloud-sdk\bin\{}.cmd").format(tool)

  try:
    ret = subprocess.run([tool, *args],
                         check=True,
                         universal_newlines=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         input=stdin)
  except subprocess.CalledProcessError as e:
    print(e.stderr)
    print(e.stdout)
    raise e
  return ret.stdout.strip()


def bq(args, stdin=None):
  """Helper for running bq, the BigQuery tool."""
  # bq prints extra messages to stdout if ~/.bigqueryrc doesn't exist
  pathlib.Path(pathlib.Path.home() / ".bigqueryrc").touch()
  return gcloud(
      "bq", ["--project_id", FLAGS.project, "--headless", *args], stdin=stdin)


def get_all_tested_commits():
  """Get details about the full commit range tested by this invocation."""
  head_info = git_pretty("HEAD", PRETTY_HEAD_INFO, n=1)
  _, _, _, _, _, _, current_commit_date = head_info[0].split("\t")

  query_earliest_included_commit = BQ_GET_EARLIEST_INCLUDED_COMMIT.format(
      table=TABLE_NAME,
      earlier_than_this_date=current_commit_date,
      artifact_id=FLAGS.artifact_id,
      team=FLAGS.team)

  # --format=csv returns an empty string if no results, or else two lines:
  # commit
  # COMMIT_HASH
  earliest_commit = bq(["query", "--format", "csv", "--nouse_legacy_sql"],
                       stdin=query_earliest_included_commit)

  # Compute the commit/CL range since the last test
  if earliest_commit:

    earliest_commit = earliest_commit.splitlines()[-1]  # Ignore CSV header
    early_author_date, early_cl, early_commit_date = git_pretty(
        earliest_commit, PRETTY_EARLY, n=1)[0].split("\t")

    all_range = "{commit}..HEAD".format(commit=earliest_commit)
    # Reversed: convert to chronological
    all_commits = ",".join(reversed(git_pretty(all_range, PRETTY_COMMIT)))
    all_changelists = ",".join(reversed(git_pretty(all_range, PRETTY_CL)))

    return [
        earliest_commit, early_cl, early_author_date, early_commit_date,
        all_commits, all_changelists
    ]

  # If the artifact has never been tracked before this commit
  # Empty cells in CSV loads are loaded as NULL values
  else:
    return [""] * 6


def get_upload_path():
  """Generate URL for 'gsutil cp'."""
  if FLAGS.upload and FLAGS.artifact:
    artifact_filename = os.path.basename(FLAGS.artifact.name)
    # note: not os.path.join here, because gsutil is always linux-style
    # Using a timestamp prevents duplicate entries
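    # For example, the resulting path might look like this (team, artifact_id,
    # and filename are illustrative):
    #   gs://tf-sizetracker-artifacts/tf-lite-team/some_artifact/2020-01-01T00:00:00+00:00.some_artifact.zip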
    path = "{bucket}/{team}/{artifact_id}/{now}.{artifact_filename}".format(
        bucket=FLAGS.bucket,
        team=FLAGS.team,
        artifact_id=FLAGS.artifact_id,
        now=NOW,
        artifact_filename=artifact_filename)
    return path
  else:
    return ""


def build_row():
  """Assemble one row of data about this artifact."""
  (earliest_commit, early_cl, early_author_date, early_commit_date, all_commits,
   all_changelists) = get_all_tested_commits()

  # Use UTC to make sure machines in different timezones load consistent data
  current_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
  artifact_filename = ("NO_FILE" if not FLAGS.artifact else os.path.basename(
      FLAGS.artifact.name))
  size_bytes = FLAGS.manual_bytes or os.path.getsize(FLAGS.artifact.name)
  head_info = git_pretty("HEAD", PRETTY_HEAD_INFO, n=1)
  all_head_info_items = head_info[0].split("\t")
  return [
      FLAGS.artifact_id,
      artifact_filename,
      *all_head_info_items,
      earliest_commit,
      early_cl,
      early_author_date,
      early_commit_date,
      all_commits,
      all_changelists,
      size_bytes,
      FLAGS.team,
      current_time,
      get_upload_path(),
      FLAGS.job,
      FLAGS.build_id,
  ]


def main():

  # Validate flags
  if FLAGS.print_schema:
    print(SCHEMA)
    exit(0)
  elif not FLAGS.team or not FLAGS.artifact_id or not (FLAGS.artifact or
                                                       FLAGS.manual_bytes):
    print(
        "--team and --artifact_id are required if --print_schema is not "
        "specified.\nYou must also specify one of --artifact or --manual_bytes."
        "\nPass -h or --help for usage.")
    exit(1)

  if not FLAGS.job:
    FLAGS.job = os.environ.get("KOKORO_JOB_NAME", "NO_JOB")
  if not FLAGS.build_id:
    FLAGS.build_id = os.environ.get("KOKORO_BUILD_ID", "NO_BUILD")

  # Generate the row of data for this artifact; it is written out below as a
  # tab-separated value (TSV) file.
  next_tsv_row = build_row()

  # Upload the artifact to GCS, if requested and present
  if FLAGS.upload and FLAGS.artifact:
    upload_path = get_upload_path()
    if FLAGS.dry_run:
      print("DRY RUN: Would gsutil cp to:\n{}".format(upload_path))
    else:
      gcloud("gsutil", ["cp", FLAGS.artifact.name, upload_path])

  # Load the row into BigQuery
  if FLAGS.dry_run:
    print("DRY RUN: Generated this TSV row:")
    print("\t".join(map(str, next_tsv_row)))
  else:
    with open("data.tsv", "w", newline="") as tsvfile:
      writer = csv.writer(
          tsvfile,
          delimiter="\t",
          quoting=csv.QUOTE_MINIMAL,
          lineterminator=os.linesep)
      writer.writerow(next_tsv_row)
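    # The call below is roughly equivalent to running, by hand,
    #   bq --project_id=<project> --headless load --source_format CSV \
    #       --field_delimiter tab <project>:<dataset>.<table> data.tsv <schema>
    # (the --project_id and --headless global flags are added by bq() and
    # gcloud() above).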
    bq([
        "load", "--source_format", "CSV", "--field_delimiter", "tab",
        PROJECT_LEVEL_TABLE_NAME, "data.tsv", SCHEMA
    ])


if __name__ == "__main__":
  main()