# Copyright 2013 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. import difflib import hashlib import itertools import json import os import sys import zipfile # When set and a difference is detected, a diff of what changed is printed. PRINT_EXPLANATIONS = int(os.environ.get('PRINT_BUILD_EXPLANATIONS', 0)) # An escape hatch that causes all targets to be rebuilt. _FORCE_REBUILD = int(os.environ.get('FORCE_REBUILD', 0)) def CallAndRecordIfStale( function, record_path=None, input_paths=None, input_strings=None, output_paths=None, force=False, pass_changes=False): """Calls function if outputs are stale. Outputs are considered stale if: - any output_paths are missing, or - the contents of any file within input_paths has changed, or - the contents of input_strings has changed. To debug which files are out-of-date, set the environment variable: PRINT_MD5_DIFFS=1 Args: function: The function to call. record_path: Path to record metadata. Defaults to output_paths[0] + '.md5.stamp' input_paths: List of paths to calcualte an md5 sum on. input_strings: List of strings to record verbatim. output_paths: List of output paths. force: Whether to treat outputs as missing regardless of whether they actually are. pass_changes: Whether to pass a Changes instance to |function|. """ assert record_path or output_paths input_paths = input_paths or [] input_strings = input_strings or [] output_paths = output_paths or [] record_path = record_path or output_paths[0] + '.md5.stamp' assert record_path.endswith('.stamp'), ( 'record paths must end in \'.stamp\' so that they are easy to find ' 'and delete') new_metadata = _Metadata() new_metadata.AddStrings(input_strings) for path in input_paths: if _IsZipFile(path): entries = _ExtractZipEntries(path) new_metadata.AddZipFile(path, entries) else: new_metadata.AddFile(path, _Md5ForPath(path)) old_metadata = None force = force or _FORCE_REBUILD missing_outputs = [x for x in output_paths if force or not os.path.exists(x)] # When outputs are missing, don't bother gathering change information. if not missing_outputs and os.path.exists(record_path): with open(record_path, 'r') as jsonfile: try: old_metadata = _Metadata.FromFile(jsonfile) except: # pylint: disable=bare-except pass # Not yet using new file format. changes = Changes(old_metadata, new_metadata, force, missing_outputs) if not changes.HasChanges(): return if PRINT_EXPLANATIONS: print('=' * 80) print('Target is stale: %s' % record_path) print(changes.DescribeDifference()) print('=' * 80) args = (changes,) if pass_changes else () function(*args) with open(record_path, 'w') as f: new_metadata.ToFile(f) class Changes(object): """Provides and API for querying what changed between runs.""" def __init__(self, old_metadata, new_metadata, force, missing_outputs): self.old_metadata = old_metadata self.new_metadata = new_metadata self.force = force self.missing_outputs = missing_outputs def _GetOldTag(self, path, subpath=None): return self.old_metadata and self.old_metadata.GetTag(path, subpath) def HasChanges(self): """Returns whether any changes exist.""" return (self.force or not self.old_metadata or self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5() or self.old_metadata.FilesMd5() != self.new_metadata.FilesMd5()) def AddedOrModifiedOnly(self): """Returns whether the only changes were from added or modified (sub)files. No missing outputs, no removed paths/subpaths. """ if (self.force or not self.old_metadata or self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5()): return False if any(self.IterRemovedPaths()): return False for path in self.IterModifiedPaths(): if any(self.IterRemovedSubpaths(path)): return False return True def IterAllPaths(self): """Generator for paths.""" return self.new_metadata.IterPaths(); def IterAllSubpaths(self, path): """Generator for subpaths.""" return self.new_metadata.IterSubpaths(path); def IterAddedPaths(self): """Generator for paths that were added.""" for path in self.new_metadata.IterPaths(): if self._GetOldTag(path) is None: yield path def IterAddedSubpaths(self, path): """Generator for paths that were added within the given zip file.""" for subpath in self.new_metadata.IterSubpaths(path): if self._GetOldTag(path, subpath) is None: yield subpath def IterRemovedPaths(self): """Generator for paths that were removed.""" if self.old_metadata: for path in self.old_metadata.IterPaths(): if self.new_metadata.GetTag(path) is None: yield path def IterRemovedSubpaths(self, path): """Generator for paths that were removed within the given zip file.""" if self.old_metadata: for subpath in self.old_metadata.IterSubpaths(path): if self.new_metadata.GetTag(path, subpath) is None: yield subpath def IterModifiedPaths(self): """Generator for paths whose contents have changed.""" for path in self.new_metadata.IterPaths(): old_tag = self._GetOldTag(path) new_tag = self.new_metadata.GetTag(path) if old_tag is not None and old_tag != new_tag: yield path def IterModifiedSubpaths(self, path): """Generator for paths within a zip file whose contents have changed.""" for subpath in self.new_metadata.IterSubpaths(path): old_tag = self._GetOldTag(path, subpath) new_tag = self.new_metadata.GetTag(path, subpath) if old_tag is not None and old_tag != new_tag: yield subpath def IterChangedPaths(self): """Generator for all changed paths (added/removed/modified).""" return itertools.chain(self.IterRemovedPaths(), self.IterModifiedPaths(), self.IterAddedPaths()) def IterChangedSubpaths(self, path): """Generator for paths within a zip that were added/removed/modified.""" return itertools.chain(self.IterRemovedSubpaths(path), self.IterModifiedSubpaths(path), self.IterAddedSubpaths(path)) def DescribeDifference(self): """Returns a human-readable description of what changed.""" if self.force: return 'force=True' elif self.missing_outputs: return 'Outputs do not exist:\n ' + '\n '.join(self.missing_outputs) elif self.old_metadata is None: return 'Previous stamp file not found.' if self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5(): ndiff = difflib.ndiff(self.old_metadata.GetStrings(), self.new_metadata.GetStrings()) changed = [s for s in ndiff if not s.startswith(' ')] return 'Input strings changed:\n ' + '\n '.join(changed) if self.old_metadata.FilesMd5() == self.new_metadata.FilesMd5(): return "There's no difference." lines = [] lines.extend('Added: ' + p for p in self.IterAddedPaths()) lines.extend('Removed: ' + p for p in self.IterRemovedPaths()) for path in self.IterModifiedPaths(): lines.append('Modified: ' + path) lines.extend(' -> Subpath added: ' + p for p in self.IterAddedSubpaths(path)) lines.extend(' -> Subpath removed: ' + p for p in self.IterRemovedSubpaths(path)) lines.extend(' -> Subpath modified: ' + p for p in self.IterModifiedSubpaths(path)) if lines: return 'Input files changed:\n ' + '\n '.join(lines) return 'I have no idea what changed (there is a bug).' class _Metadata(object): """Data model for tracking change metadata.""" # Schema: # { # "files-md5": "VALUE", # "strings-md5": "VALUE", # "input-files": [ # { # "path": "path.jar", # "tag": "{MD5 of entries}", # "entries": [ # { "path": "org/chromium/base/Foo.class", "tag": "{CRC32}" }, ... # ] # }, { # "path": "path.txt", # "tag": "{MD5}", # } # ], # "input-strings": ["a", "b", ...], # } def __init__(self): self._files_md5 = None self._strings_md5 = None self._files = [] self._strings = [] # Map of (path, subpath) -> entry. Created upon first call to _GetEntry(). self._file_map = None @classmethod def FromFile(cls, fileobj): """Returns a _Metadata initialized from a file object.""" ret = cls() obj = json.load(fileobj) ret._files_md5 = obj['files-md5'] ret._strings_md5 = obj['strings-md5'] ret._files = obj['input-files'] ret._strings = obj['input-strings'] return ret def ToFile(self, fileobj): """Serializes metadata to the given file object.""" obj = { "files-md5": self.FilesMd5(), "strings-md5": self.StringsMd5(), "input-files": self._files, "input-strings": self._strings, } json.dump(obj, fileobj, indent=2) def _AssertNotQueried(self): assert self._files_md5 is None assert self._strings_md5 is None assert self._file_map is None def AddStrings(self, values): self._AssertNotQueried() self._strings.extend(str(v) for v in values) def AddFile(self, path, tag): """Adds metadata for a non-zip file. Args: path: Path to the file. tag: A short string representative of the file contents. """ self._AssertNotQueried() self._files.append({ 'path': path, 'tag': tag, }) def AddZipFile(self, path, entries): """Adds metadata for a zip file. Args: path: Path to the file. entries: List of (subpath, tag) tuples for entries within the zip. """ self._AssertNotQueried() tag = _ComputeInlineMd5(itertools.chain((e[0] for e in entries), (e[1] for e in entries))) self._files.append({ 'path': path, 'tag': tag, 'entries': [{"path": e[0], "tag": e[1]} for e in entries], }) def GetStrings(self): """Returns the list of input strings.""" return self._strings def FilesMd5(self): """Lazily computes and returns the aggregate md5 of input files.""" if self._files_md5 is None: # Omit paths from md5 since temporary files have random names. self._files_md5 = _ComputeInlineMd5( self.GetTag(p) for p in sorted(self.IterPaths())) return self._files_md5 def StringsMd5(self): """Lazily computes and returns the aggregate md5 of input strings.""" if self._strings_md5 is None: self._strings_md5 = _ComputeInlineMd5(self._strings) return self._strings_md5 def _GetEntry(self, path, subpath=None): """Returns the JSON entry for the given path / subpath.""" if self._file_map is None: self._file_map = {} for entry in self._files: self._file_map[(entry['path'], None)] = entry for subentry in entry.get('entries', ()): self._file_map[(entry['path'], subentry['path'])] = subentry return self._file_map.get((path, subpath)) def GetTag(self, path, subpath=None): """Returns the tag for the given path / subpath.""" ret = self._GetEntry(path, subpath) return ret and ret['tag'] def IterPaths(self): """Returns a generator for all top-level paths.""" return (e['path'] for e in self._files) def IterSubpaths(self, path): """Returns a generator for all subpaths in the given zip. If the given path is not a zip file or doesn't exist, returns an empty iterable. """ outer_entry = self._GetEntry(path) if not outer_entry: return () subentries = outer_entry.get('entries', []) return (entry['path'] for entry in subentries) def _UpdateMd5ForFile(md5, path, block_size=2**16): with open(path, 'rb') as infile: while True: data = infile.read(block_size) if not data: break md5.update(data) def _UpdateMd5ForDirectory(md5, dir_path): for root, _, files in os.walk(dir_path): for f in files: _UpdateMd5ForFile(md5, os.path.join(root, f)) def _Md5ForPath(path): md5 = hashlib.md5() if os.path.isdir(path): _UpdateMd5ForDirectory(md5, path) else: _UpdateMd5ForFile(md5, path) return md5.hexdigest() def _ComputeInlineMd5(iterable): """Computes the md5 of the concatenated parameters.""" md5 = hashlib.md5() for item in iterable: md5.update(str(item)) return md5.hexdigest() def _IsZipFile(path): """Returns whether to treat the given file as a zip file.""" # ijar doesn't set the CRC32 field. if path.endswith('.interface.jar'): return False return path[-4:] in ('.zip', '.apk', '.jar') or path.endswith('.srcjar') def _ExtractZipEntries(path): """Returns a list of (path, CRC32) of all files within |path|.""" entries = [] with zipfile.ZipFile(path) as zip_file: for zip_info in zip_file.infolist(): # Skip directories and empty files. if zip_info.CRC: entries.append( (zip_info.filename, zip_info.CRC + zip_info.compress_type)) return entries