• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import difflib
import hashlib
import itertools
import json
import os
import zipfile
from .pycache import pycache_enabled
from .pycache import pycache

# When set and a difference is detected, a diff of what changed is printed.
PRINT_EXPLANATIONS = int(os.environ.get('PRINT_BUILD_EXPLANATIONS', 0))

# An escape hatch that causes all targets to be rebuilt.
_FORCE_REBUILD = int(os.environ.get('FORCE_REBUILD', 0))
22
def get_new_metadata(input_strings, input_paths):
    """Builds a _Metadata snapshot of the given inputs.

    Zip files are recorded with one tag per archive entry; every other path
    is recorded with an md5 of its contents.
    """
    metadata = _Metadata()
    metadata.add_strings(input_strings)

    for input_path in input_paths:
        if not _is_zip_file(input_path):
            metadata.add_file(input_path, _md5_for_path(input_path))
        else:
            metadata.add_zip_file(input_path, _extract_zip_entries(input_path))
    return metadata
34
35
def get_old_metadata(record_path):
    """Returns the _Metadata recorded by a previous run, or None.

    Returns None when the record file does not exist or cannot be parsed
    (e.g. a truncated/corrupt stamp left by an interrupted build).
    """
    old_metadata = None
    if os.path.exists(record_path):
        with open(record_path, 'r') as jsonfile:
            try:
                old_metadata = _Metadata.from_file(jsonfile)
            except (ValueError, KeyError, TypeError):
                # json.load() raises ValueError (JSONDecodeError) on bad
                # JSON; from_file() raises KeyError/TypeError on a valid
                # JSON document with the wrong shape. A corrupt record is
                # treated the same as a missing one (target is stale).
                # Previously a bare `except:` hid all errors, including
                # programming mistakes like NameError.
                pass
    return old_metadata
45
46
def print_explanations(record_path, changes):
    """Prints a banner describing why |record_path| is stale.

    No-op unless PRINT_BUILD_EXPLANATIONS was set in the environment.
    """
    if not PRINT_EXPLANATIONS:
        return
    banner = '=' * 80
    print(banner)
    print('Target is stale: %s' % record_path)
    print(changes.describe_difference())
    print(banner)
53
54
def call_and_record_if_stale(
        function,  # pylint: disable=invalid-name
        record_path=None,
        input_paths=None,
        input_strings=None,
        output_paths=None,
        force=False,
        pass_changes=False):
    """Calls function if outputs are stale.

    Outputs are considered stale if:
    - any output_paths are missing, or
    - the contents of any file within input_paths has changed, or
    - the contents of input_strings has changed.

    To debug which files are out-of-date, set the environment variable:
        PRINT_BUILD_EXPLANATIONS=1

    Args:
      function: The function to call.
      record_path: Path to record metadata.
        Defaults to output_paths[0] + '.md5.stamp'
      input_paths: List of paths to calculate a md5 sum on.
      input_strings: List of strings to record verbatim.
      output_paths: List of output paths.
      force: Whether to treat outputs as missing regardless of whether they
        actually are.
      pass_changes: Whether to pass a Changes instance to |function|.
    """
    assert record_path or output_paths
    input_paths = input_paths or []
    input_strings = input_strings or []
    output_paths = output_paths or []

    new_metadata = get_new_metadata(input_strings, input_paths)
    # FORCE_REBUILD=1 in the environment overrides all staleness detection.
    force = force or _FORCE_REBUILD
    # With force set, every output is treated as missing.
    missing_outputs = [
        x for x in output_paths if force or not os.path.exists(x)
    ]

    if pycache_enabled:
        # Input strings, input files and outputs names together compose
        # cache manifest, which is the only identifier of a python action.
        manifest = '-'.join(
            [new_metadata.strings_md5(),
             new_metadata.files_md5()] + sorted(output_paths))
        # NOTE: when the cache is enabled, the stamp lives inside the cache
        # and is keyed solely by the manifest; any caller-supplied
        # record_path is ignored.
        record_path = pycache.get_manifest_path('{}.manifest'.format(manifest))
        old_metadata = get_old_metadata(record_path)
    else:
        record_path = record_path or output_paths[0] + '.md5.stamp'
        # When outputs are missing, don't bother gathering change information.
        if not missing_outputs:
            old_metadata = get_old_metadata(record_path)
        else:
            old_metadata = None

    changes = Changes(old_metadata, new_metadata, force, missing_outputs)
    if not changes.has_changes():
        if not pycache_enabled:
            return
        # Inputs unchanged: try restoring outputs from the cache. If
        # retrieval fails, fall through and rebuild.
        if pycache_enabled and pycache.retrieve(output_paths, prefix=manifest):
            return

    print_explanations(record_path, changes)

    args = (changes, ) if pass_changes else ()
    function(*args)
    if pycache_enabled:
        try:
            pycache.report_cache_stat('cache_miss')
        except:  # noqa: E722 pylint: disable=bare-except
            # Stat reporting is best-effort; never fail the build for it.
            pass
        pycache.save(output_paths, prefix=manifest)

    # Record the new metadata so the next run can detect changes against it.
    with open(record_path, 'w') as record:
        new_metadata.to_file(record)
131
132
class Changes(object):
    """Provides an API for querying what changed between runs."""
    def __init__(self, old_metadata, new_metadata, force, missing_outputs):
        self.old_metadata = old_metadata
        self.new_metadata = new_metadata
        self.force = force
        self.missing_outputs = missing_outputs

    def _get_old_tag(self, path, subpath=None):
        # Returns None (or old_metadata itself when falsy) if there is no
        # previous record for the path.
        return self.old_metadata and self.old_metadata.get_tag(path, subpath)

    def has_changes(self):
        """Returns whether any changes exist."""
        return (
            self.force or not self.old_metadata or
            self.old_metadata.strings_md5() != self.new_metadata.strings_md5()
            or self.old_metadata.files_md5() != self.new_metadata.files_md5())

    def added_or_modified_only(self):
        """Returns whether the only changes were from added or modified (sub)files.

        No missing outputs, no removed paths/subpaths.
        """
        if (self.force or not self.old_metadata
                or self.old_metadata.strings_md5() !=
                self.new_metadata.strings_md5()):
            return False
        if any(self.iter_removed_paths()):
            return False
        for path in self.iter_modified_paths():
            if any(self.iter_removed_subpaths(path)):
                return False
        return True

    def iter_all_paths(self):
        """Generator for paths."""
        return self.new_metadata.iter_paths()

    def iter_all_subpaths(self, path):
        """Generator for subpaths."""
        return self.new_metadata.iter_subpaths(path)

    def iter_added_paths(self):
        """Generator for paths that were added."""
        for path in self.new_metadata.iter_paths():
            if self._get_old_tag(path) is None:
                yield path

    def iter_added_subpaths(self, path):
        """Generator for paths that were added within the given zip file."""
        for subpath in self.new_metadata.iter_subpaths(path):
            if self._get_old_tag(path, subpath) is None:
                yield subpath

    def iter_removed_paths(self):
        """Generator for paths that were removed."""
        if self.old_metadata:
            for path in self.old_metadata.iter_paths():
                if self.new_metadata.get_tag(path) is None:
                    yield path

    def iter_removed_subpaths(self, path):
        """Generator for paths that were removed within the given zip file."""
        if self.old_metadata:
            for subpath in self.old_metadata.iter_subpaths(path):
                if self.new_metadata.get_tag(path, subpath) is None:
                    yield subpath

    def iter_modified_paths(self):
        """Generator for paths whose contents have changed."""
        for path in self.new_metadata.iter_paths():
            old_tag = self._get_old_tag(path)
            new_tag = self.new_metadata.get_tag(path)
            if old_tag is not None and old_tag != new_tag:
                yield path

    def iter_modified_subpaths(self, path):
        """Generator for paths within a zip file whose contents have changed."""
        for subpath in self.new_metadata.iter_subpaths(path):
            old_tag = self._get_old_tag(path, subpath)
            new_tag = self.new_metadata.get_tag(path, subpath)
            if old_tag is not None and old_tag != new_tag:
                yield subpath

    def iter_changed_paths(self):
        """Generator for all changed paths (added/removed/modified)."""
        return itertools.chain(self.iter_removed_paths(),
                               self.iter_modified_paths(),
                               self.iter_added_paths())

    def iter_changed_subpaths(self, path):
        """Generator for paths within a zip that were added/removed/modified."""
        return itertools.chain(self.iter_removed_subpaths(path),
                               self.iter_modified_subpaths(path),
                               self.iter_added_subpaths(path))

    def describe_difference(self):
        """Returns a human-readable description of what changed."""
        if self.force:
            return 'force=True'
        elif self.old_metadata is None:
            return 'Previous stamp file not found.'

        if self.old_metadata.strings_md5() != self.new_metadata.strings_md5():
            ndiff = difflib.ndiff(self.old_metadata.get_strings(),
                                  self.new_metadata.get_strings())
            changed = [s for s in ndiff if not s.startswith(' ')]
            return 'Input strings changed:\n  ' + '\n  '.join(changed)

        if self.old_metadata.files_md5() == self.new_metadata.files_md5():
            return "There's no difference."

        lines = []
        # Bug fix: extend with a generator of formatted strings. The
        # previous code did `'Added: {}'.format(p for p in ...)`, which
        # formatted the generator object's repr into one string and then
        # list.extend() iterated that string character-by-character.
        lines.extend('Added: {}'.format(p) for p in self.iter_added_paths())
        lines.extend('Removed: {}'.format(p)
                     for p in self.iter_removed_paths())
        for path in self.iter_modified_paths():
            lines.append('Modified: {}'.format(path))
            lines.extend('  -> Subpath added: {}'.format(p)
                         for p in self.iter_added_subpaths(path))
            lines.extend('  -> Subpath removed: {}'.format(p)
                         for p in self.iter_removed_subpaths(path))
            lines.extend('  -> Subpath modified: {}'.format(p)
                         for p in self.iter_modified_subpaths(path))
        if lines:
            return 'Input files changed:\n  {}'.format('\n  '.join(lines))

        if self.missing_outputs:
            return 'Outputs do not exist:\n  {}'.format('\n  '.join(
                self.missing_outputs))

        return 'I have no idea what changed (there is a bug).'
265
266
class _Metadata(object):
    """Data model for tracking change metadata."""
    def __init__(self):
        # Aggregate md5s, computed lazily on first query.
        self._files_md5 = None
        self._strings_md5 = None
        self._files = []
        self._strings = []
        # (path, subpath) -> entry lookup, built lazily by _get_entry().
        self._file_map = None

    @classmethod
    def from_file(cls, fileobj):
        """Returns a _Metadata initialized from a file object."""
        parsed = json.load(fileobj)
        ret = cls()
        ret._files_md5 = parsed['files-md5']
        ret._strings_md5 = parsed['strings-md5']
        ret._files = parsed['input-files']
        ret._strings = parsed['input-strings']
        return ret

    def to_file(self, fileobj):
        """Serializes metadata to the given file object."""
        json.dump(
            {
                'files-md5': self.files_md5(),
                'strings-md5': self.strings_md5(),
                'input-files': self._files,
                'input-strings': self._strings,
            },
            fileobj,
            indent=2,
            sort_keys=True)

    def _assert_not_queried(self):
        # Mutating after any md5/lookup has been computed would silently
        # invalidate the cached values, so forbid it.
        assert self._files_md5 is None
        assert self._strings_md5 is None
        assert self._file_map is None

    def add_strings(self, values):
        self._assert_not_queried()
        self._strings += [str(v) for v in values]

    def add_file(self, path, tag):
        """Adds metadata for a non-zip file.

        Args:
          path: Path to the file.
          tag: A short string representative of the file contents.
        """
        self._assert_not_queried()
        self._files.append({'path': path, 'tag': tag})

    def add_zip_file(self, path, entries):
        """Adds metadata for a zip file.

        Args:
          path: Path to the file.
          entries: List of (subpath, tag) tuples for entries within the zip.
        """
        self._assert_not_queried()
        subpaths = (e[0] for e in entries)
        subtags = (e[1] for e in entries)
        self._files.append({
            'path': path,
            'tag': _compute_inline_md5(itertools.chain(subpaths, subtags)),
            'entries': [{'path': e[0], 'tag': e[1]} for e in entries],
        })

    def get_strings(self):
        """Returns the list of input strings."""
        return self._strings

    def files_md5(self):
        """Lazily computes and returns the aggregate md5 of input files."""
        if self._files_md5 is None:
            # Omit paths from the md5 since temporary files have random names.
            tags = (self.get_tag(p) for p in sorted(self.iter_paths()))
            self._files_md5 = _compute_inline_md5(tags)
        return self._files_md5

    def strings_md5(self):
        """Lazily computes and returns the aggregate md5 of input strings."""
        if self._strings_md5 is None:
            self._strings_md5 = _compute_inline_md5(self._strings)
        return self._strings_md5

    def _get_entry(self, path, subpath=None):
        """Returns the JSON entry for the given path / subpath."""
        if self._file_map is None:
            file_map = {}
            for entry in self._files:
                file_map[(entry['path'], None)] = entry
                for subentry in entry.get('entries', ()):
                    file_map[(entry['path'], subentry['path'])] = subentry
            self._file_map = file_map
        return self._file_map.get((path, subpath))

    def get_tag(self, path, subpath=None):
        """Returns the tag for the given path / subpath."""
        entry = self._get_entry(path, subpath)
        return entry and entry['tag']

    def iter_paths(self):
        """Returns a generator for all top-level paths."""
        return (entry['path'] for entry in self._files)

    def iter_subpaths(self, path):
        """Returns a generator for all subpaths in the given zip.

        If the given path is not a zip file or doesn't exist, returns an empty
        iterable.
        """
        entry = self._get_entry(path)
        if not entry:
            return ()
        return (subentry['path'] for subentry in entry.get('entries', []))
390
391
def _update_md5_for_file(md5, path, block_size=2**16):
    """Feeds the contents of the file at |path| into |md5| in chunks.

    A symlink whose target does not exist contributes the link target string
    instead, so dangling links still hash to a stable value.
    """
    if os.path.islink(path):
        linkto = os.readlink(path)
        if not os.path.exists(linkto):
            md5.update(linkto.encode())
            return

    with open(path, 'rb') as infile:
        for chunk in iter(lambda: infile.read(block_size), b''):
            md5.update(chunk)
406
407
def _update_md5_for_directory(md5, dir_path):
    """Feeds every file under |dir_path| (recursively) into |md5|.

    Directories and files are visited in sorted order so the resulting digest
    is deterministic; os.walk() otherwise yields entries in an arbitrary,
    filesystem-dependent order, which previously made the digest unstable
    across machines/filesystems.
    """
    for root, dirs, files in os.walk(dir_path):
        # Sorting dirs in place makes os.walk() recurse in a fixed order.
        dirs.sort()
        for f in sorted(files):
            _update_md5_for_file(md5, os.path.join(root, f))
412
413
def _md5_for_path(path):
    """Returns the hex md5 digest of the file or directory at |path|."""
    digest = hashlib.md5()
    if os.path.isdir(path):
        _update_md5_for_directory(digest, path)
    else:
        _update_md5_for_file(digest, path)
    return digest.hexdigest()
421
422
def _compute_inline_md5(iterable):
    """Returns the hex md5 of all items' str() forms, concatenated."""
    digest = hashlib.md5()
    for encoded in (str(item).encode() for item in iterable):
        digest.update(encoded)
    return digest.hexdigest()
429
430
def _is_zip_file(path):
    """Returns whether to treat the given file as a zip file.

    Bug fix: the previous check was `path[-4:] in ('.zip')`, where ('.zip')
    is just the string '.zip' (parentheses, not a 1-tuple), so `in` performed
    a substring test and misclassified paths such as 'zi', '.' or '' as zips.
    """
    return path.endswith('.zip')
434
435
def _extract_zip_entries(path):
    """Returns a list of (filename, CRC + compress_type) for files in |path|.

    Entries with a zero CRC (directories and empty files) are skipped.
    """
    with zipfile.ZipFile(path) as zip_file:
        return [(info.filename, info.CRC + info.compress_type)
                for info in zip_file.infolist() if info.CRC]
446