#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import difflib
import hashlib
import itertools
import json
import os
import zipfile

from .pycache import pycache_enabled
from .pycache import pycache

# When set and a difference is detected, a diff of what changed is printed.
PRINT_EXPLANATIONS = int(os.environ.get('PRINT_BUILD_EXPLANATIONS', 0))

# An escape hatch that causes all targets to be rebuilt.
_FORCE_REBUILD = int(os.environ.get('FORCE_REBUILD', 0))


def get_new_metadata(input_strings, input_paths):
  new_metadata = _Metadata()
  new_metadata.add_strings(input_strings)
  for path in input_paths:
    if _is_zip_file(path):
      entries = _extract_zip_entries(path)
      new_metadata.add_zip_file(path, entries)
    else:
      new_metadata.add_file(path, _md5_for_path(path))
  return new_metadata


def get_old_metadata(record_path):
  old_metadata = None
  if os.path.exists(record_path):
    with open(record_path, 'r') as jsonfile:
      try:
        old_metadata = _Metadata.from_file(jsonfile)
      except:  # noqa: E722 pylint: disable=bare-except
        pass
  return old_metadata


def print_explanations(record_path, changes):
  if PRINT_EXPLANATIONS:
    print('=' * 80)
    print('Target is stale: %s' % record_path)
    print(changes.describe_difference())
    print('=' * 80)


def call_and_record_if_stale(function,  # pylint: disable=invalid-name
                             record_path=None,
                             input_paths=None,
                             input_strings=None,
                             output_paths=None,
                             force=False,
                             pass_changes=False):
  """Calls function if outputs are stale.

  Outputs are considered stale if:
  - any output_paths are missing, or
  - the contents of any file within input_paths has changed, or
  - the contents of input_strings has changed.

  To debug which files are out-of-date, set the environment variable:
      PRINT_BUILD_EXPLANATIONS=1

  Args:
    function: The function to call.
    record_path: Path to record metadata.
      Defaults to output_paths[0] + '.md5.stamp'.
    input_paths: List of paths to calculate an md5 sum on.
    input_strings: List of strings to record verbatim.
    output_paths: List of output paths.
    force: Whether to treat outputs as missing regardless of whether they
      actually are.
    pass_changes: Whether to pass a Changes instance to |function|.
  """
  assert record_path or output_paths
  input_paths = input_paths or []
  input_strings = input_strings or []
  output_paths = output_paths or []

  new_metadata = get_new_metadata(input_strings, input_paths)
  force = force or _FORCE_REBUILD
  missing_outputs = [x for x in output_paths if force or not os.path.exists(x)]

  if pycache_enabled:
    # Input strings, input files and output names together compose the cache
    # manifest, which is the only identifier of a python action.
    manifest = '-'.join([new_metadata.strings_md5(), new_metadata.files_md5()]
                        + sorted(output_paths))
    record_path = pycache.get_manifest_path('{}.manifest'.format(manifest))
    old_metadata = get_old_metadata(record_path)
  else:
    record_path = record_path or output_paths[0] + '.md5.stamp'
    # When outputs are missing, don't bother gathering change information.
    if not missing_outputs:
      old_metadata = get_old_metadata(record_path)
    else:
      old_metadata = None

  changes = Changes(old_metadata, new_metadata, force, missing_outputs)
  if not changes.has_changes():
    if not pycache_enabled:
      return
    # Inputs are unchanged, but the outputs may still need to be fetched
    # from the python action cache.
    if pycache.retrieve(output_paths, prefix=manifest):
      return

  print_explanations(record_path, changes)

  args = (changes,) if pass_changes else ()
  function(*args)

  if pycache_enabled:
    try:
      pycache.report_cache_stat('cache_miss')
    except:  # noqa: E722 pylint: disable=bare-except
      pass
    pycache.save(output_paths, prefix=manifest)

  with open(record_path, 'w') as record:
    new_metadata.to_file(record)
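

# Example usage, a minimal sketch (the `_copy_file` callback and the paths
# here are hypothetical, shown only to illustrate the calling convention):
#
#   import shutil
#
#   def _copy_file(changes):
#     # |changes| is a Changes instance because pass_changes=True below.
#     shutil.copyfile('in.txt', 'out.txt')
#
#   call_and_record_if_stale(
#       _copy_file,
#       input_paths=['in.txt'],
#       input_strings=['v1'],  # Changing this value forces a rerun.
#       output_paths=['out.txt'],
#       pass_changes=True)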


class Changes(object):
  """Provides an API for querying what changed between runs."""

  def __init__(self, old_metadata, new_metadata, force, missing_outputs):
    self.old_metadata = old_metadata
    self.new_metadata = new_metadata
    self.force = force
    self.missing_outputs = missing_outputs

  def _get_old_tag(self, path, subpath=None):
    return self.old_metadata and self.old_metadata.get_tag(path, subpath)

  def has_changes(self):
    """Returns whether any changes exist."""
    return (
        self.force or not self.old_metadata
        or self.old_metadata.strings_md5() != self.new_metadata.strings_md5()
        or self.old_metadata.files_md5() != self.new_metadata.files_md5())

  def added_or_modified_only(self):
    """Returns whether the only changes were from added or modified (sub)files.

    No missing outputs, no removed paths/subpaths.
    """
    if (self.force or not self.old_metadata
        or self.old_metadata.strings_md5() !=
        self.new_metadata.strings_md5()):
      return False
    if any(self.iter_removed_paths()):
      return False
    for path in self.iter_modified_paths():
      if any(self.iter_removed_subpaths(path)):
        return False
    return True

  def iter_all_paths(self):
    """Generator for paths."""
    return self.new_metadata.iter_paths()

  def iter_all_subpaths(self, path):
    """Generator for subpaths."""
    return self.new_metadata.iter_subpaths(path)

  def iter_added_paths(self):
    """Generator for paths that were added."""
    for path in self.new_metadata.iter_paths():
      if self._get_old_tag(path) is None:
        yield path

  def iter_added_subpaths(self, path):
    """Generator for paths that were added within the given zip file."""
    for subpath in self.new_metadata.iter_subpaths(path):
      if self._get_old_tag(path, subpath) is None:
        yield subpath

  def iter_removed_paths(self):
    """Generator for paths that were removed."""
    if self.old_metadata:
      for path in self.old_metadata.iter_paths():
        if self.new_metadata.get_tag(path) is None:
          yield path

  def iter_removed_subpaths(self, path):
    """Generator for paths that were removed within the given zip file."""
    if self.old_metadata:
      for subpath in self.old_metadata.iter_subpaths(path):
        if self.new_metadata.get_tag(path, subpath) is None:
          yield subpath

  def iter_modified_paths(self):
    """Generator for paths whose contents have changed."""
    for path in self.new_metadata.iter_paths():
      old_tag = self._get_old_tag(path)
      new_tag = self.new_metadata.get_tag(path)
      if old_tag is not None and old_tag != new_tag:
        yield path

  def iter_modified_subpaths(self, path):
    """Generator for paths within a zip file whose contents have changed."""
    for subpath in self.new_metadata.iter_subpaths(path):
      old_tag = self._get_old_tag(path, subpath)
      new_tag = self.new_metadata.get_tag(path, subpath)
      if old_tag is not None and old_tag != new_tag:
        yield subpath

  def iter_changed_paths(self):
    """Generator for all changed paths (added/removed/modified)."""
    return itertools.chain(self.iter_removed_paths(),
                           self.iter_modified_paths(),
                           self.iter_added_paths())

  def iter_changed_subpaths(self, path):
    """Generator for paths within a zip that were added/removed/modified."""
    return itertools.chain(self.iter_removed_subpaths(path),
                           self.iter_modified_subpaths(path),
                           self.iter_added_subpaths(path))

  def describe_difference(self):
    """Returns a human-readable description of what changed."""
    if self.force:
      return 'force=True'
    elif self.old_metadata is None:
      return 'Previous stamp file not found.'

    if self.old_metadata.strings_md5() != self.new_metadata.strings_md5():
      ndiff = difflib.ndiff(self.old_metadata.get_strings(),
                            self.new_metadata.get_strings())
      changed = [s for s in ndiff if not s.startswith(' ')]
      return 'Input strings changed:\n  ' + '\n  '.join(changed)

    if self.old_metadata.files_md5() == self.new_metadata.files_md5():
      return "There's no difference."

    lines = []
    lines.extend('Added: ' + p for p in self.iter_added_paths())
    lines.extend('Removed: ' + p for p in self.iter_removed_paths())
    for path in self.iter_modified_paths():
      lines.append('Modified: ' + path)
      lines.extend('  -> Subpath added: ' + p
                   for p in self.iter_added_subpaths(path))
      lines.extend('  -> Subpath removed: ' + p
                   for p in self.iter_removed_subpaths(path))
      lines.extend('  -> Subpath modified: ' + p
                   for p in self.iter_modified_subpaths(path))
    if lines:
      return 'Input files changed:\n  {}'.format('\n  '.join(lines))

    if self.missing_outputs:
      return 'Outputs do not exist:\n  {}'.format(
          '\n  '.join(self.missing_outputs))

    return 'I have no idea what changed (there is a bug).'
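

# A callback passed to call_and_record_if_stale() can use the Changes API
# above for incremental work. A minimal sketch (the `_process` helper is
# hypothetical):
#
#   def _on_stale(changes):
#     if changes.added_or_modified_only():
#       paths = changes.iter_changed_paths()  # Only re-process what changed.
#     else:
#       paths = changes.iter_all_paths()  # Full rebuild.
#     for path in paths:
#       _process(path)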


class _Metadata(object):
  """Data model for tracking change metadata."""

  def __init__(self):
    self._files_md5 = None
    self._strings_md5 = None
    self._files = []
    self._strings = []
    # Map of (path, subpath) -> entry. Created upon first call to _get_entry().
    self._file_map = None

  @classmethod
  def from_file(cls, fileobj):
    """Returns a _Metadata initialized from a file object."""
    ret = cls()
    obj = json.load(fileobj)
    ret._files_md5 = obj['files-md5']
    ret._strings_md5 = obj['strings-md5']
    ret._files = obj['input-files']
    ret._strings = obj['input-strings']
    return ret

  def to_file(self, fileobj):
    """Serializes metadata to the given file object."""
    obj = {
        'files-md5': self.files_md5(),
        'strings-md5': self.strings_md5(),
        'input-files': self._files,
        'input-strings': self._strings,
    }
    json.dump(obj, fileobj, indent=2, sort_keys=True)
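
  # For reference, the JSON written by to_file() has this shape (the paths
  # and tag values are illustrative):
  #
  #   {
  #     "files-md5": "<aggregate md5 hex digest>",
  #     "strings-md5": "<aggregate md5 hex digest>",
  #     "input-files": [
  #       {"path": "a.txt", "tag": "<md5>"},
  #       {"path": "b.zip", "tag": "<md5>", "entries": [
  #         {"path": "sub/entry", "tag": "<crc-based tag>"}]}
  #     ],
  #     "input-strings": ["..."]
  #   }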

  def _assert_not_queried(self):
    assert self._files_md5 is None
    assert self._strings_md5 is None
    assert self._file_map is None

  def add_strings(self, values):
    self._assert_not_queried()
    self._strings.extend(str(v) for v in values)

  def add_file(self, path, tag):
    """Adds metadata for a non-zip file.

    Args:
      path: Path to the file.
      tag: A short string representative of the file contents.
    """
    self._assert_not_queried()
    self._files.append({
        'path': path,
        'tag': tag,
    })

  def add_zip_file(self, path, entries):
    """Adds metadata for a zip file.

    Args:
      path: Path to the file.
      entries: List of (subpath, tag) tuples for entries within the zip.
    """
    self._assert_not_queried()
    tag = _compute_inline_md5(
        itertools.chain((e[0] for e in entries), (e[1] for e in entries)))
    self._files.append({
        'path': path,
        'tag': tag,
        'entries': [{'path': e[0], 'tag': e[1]} for e in entries],
    })

  def get_strings(self):
    """Returns the list of input strings."""
    return self._strings

  def files_md5(self):
    """Lazily computes and returns the aggregate md5 of input files."""
    if self._files_md5 is None:
      # Omit paths from the md5 since temporary files have random names.
      self._files_md5 = _compute_inline_md5(
          self.get_tag(p) for p in sorted(self.iter_paths()))
    return self._files_md5

  def strings_md5(self):
    """Lazily computes and returns the aggregate md5 of input strings."""
    if self._strings_md5 is None:
      self._strings_md5 = _compute_inline_md5(self._strings)
    return self._strings_md5

  def _get_entry(self, path, subpath=None):
    """Returns the JSON entry for the given path / subpath."""
    if self._file_map is None:
      self._file_map = {}
      for entry in self._files:
        self._file_map[(entry['path'], None)] = entry
        for subentry in entry.get('entries', ()):
          self._file_map[(entry['path'], subentry['path'])] = subentry
    return self._file_map.get((path, subpath))

  def get_tag(self, path, subpath=None):
    """Returns the tag for the given path / subpath."""
    ret = self._get_entry(path, subpath)
    return ret and ret['tag']

  def iter_paths(self):
    """Returns a generator for all top-level paths."""
    return (e['path'] for e in self._files)

  def iter_subpaths(self, path):
    """Returns a generator for all subpaths in the given zip.

    If the given path is not a zip file or doesn't exist, returns an empty
    iterable.
    """
    outer_entry = self._get_entry(path)
    if not outer_entry:
      return ()
    subentries = outer_entry.get('entries', [])
    return (entry['path'] for entry in subentries)


def _update_md5_for_file(md5, path, block_size=2**16):
  # For a dead symlink, hash the link target itself, since the file it
  # points at cannot be read.
  if os.path.islink(path):
    linkto = os.readlink(path)
    if not os.path.exists(linkto):
      md5.update(linkto.encode())
      return

  with open(path, 'rb') as infile:
    while True:
      data = infile.read(block_size)
      if not data:
        break
      md5.update(data)


def _update_md5_for_directory(md5, dir_path):
  for root, _, files in os.walk(dir_path):
    for f in files:
      _update_md5_for_file(md5, os.path.join(root, f))


def _md5_for_path(path):
  md5 = hashlib.md5()
  if os.path.isdir(path):
    _update_md5_for_directory(md5, path)
  else:
    _update_md5_for_file(md5, path)
  return md5.hexdigest()


def _compute_inline_md5(iterable):
  """Computes the md5 of the concatenated parameters."""
  md5 = hashlib.md5()
  for item in iterable:
    md5.update(str(item).encode())
  return md5.hexdigest()


def _is_zip_file(path):
  """Returns whether to treat the given file as a zip file."""
  return path.endswith('.zip')


def _extract_zip_entries(path):
  """Returns a list of (path, CRC32) of all files within |path|."""
  entries = []
  with zipfile.ZipFile(path) as zip_file:
    for zip_info in zip_file.infolist():
      # Skip directories and empty files (both have a CRC of 0).
      if zip_info.CRC:
        entries.append(
            (zip_info.filename, zip_info.CRC + zip_info.compress_type))
  return entries
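

# End-to-end sketch of the stamp-file round trip used above (the paths and
# the input string are hypothetical):
#
#   new_metadata = get_new_metadata(['--flag'], ['in.txt'])
#   with open('out.txt.md5.stamp', 'w') as f:
#     new_metadata.to_file(f)
#   # On the next run, get_old_metadata('out.txt.md5.stamp') loads an equal
#   # _Metadata, so Changes(old, new, False, []).has_changes() is False and
#   # the target is considered fresh.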