#!/usr/bin/env python3
#
# Copyright (C) 2016 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse, collections, os, re, sys

dir_of_this_script = os.path.dirname(os.path.realpath(__file__))

parser = argparse.ArgumentParser(
    description="""USAGE:
    Simplifies a build.log from hundreds of megabytes to <100 lines. Prints output to the terminal.
    Pass this script a filepath to parse. You should be able to type "python3 build_log_simplifier.py"
    and then drag-and-drop a log file onto the terminal window to get its path.

    Sample usage: python3 development/build_log_simplifier.py Users/owengray/Desktop/build.log
    """)
parser.add_argument("--validate", action="store_true", help="Validate that no unrecognized messages exist in the given log")
parser.add_argument("--update", action="store_true", help="Update our list of recognized messages to include all messages from the given log")
parser.add_argument("--gc", action="store_true", help="When generating a new exemptions file, exclude any exemptions that were not found in the given log. Only relevant with --update or --validate")
parser.add_argument("log_path", help="Filepath of log(s) to process", nargs="+")

# a regexes_matcher can quickly identify which of a set of regexes matches a given text
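# Internally it builds a tree of matchers: each node compiles one composite regex from all of
# its patterns, so text that matches none of them can be rejected with a single match attempt,
# and matching text is narrowed down through the node's children.
# Illustrative example (hypothetical patterns):
#   regexes_matcher(["a.*", "b.*"]).get_matching_regexes("abc") returns ["a.*"]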
class regexes_matcher(object):
    def __init__(self, regexes):
        self.regex_texts = regexes
        self.children = None
        self.matcher = None

    # returns a list of regexes that match the given text
    def get_matching_regexes(self, text, expect_match=True):
        if expect_match and len(self.regex_texts) > 1:
            # If we already expect our matcher to match, we can directly jump to asking our children
            return self.query_children_for_matching_regexes(text)
        # It takes more time to match lots of regexes than to match one composite regex
        # So, we try to match one composite regex first
        if self.matches(text):
            if len(self.regex_texts) > 1:
                # At least one child regex matches, so we have to determine which ones
                return self.query_children_for_matching_regexes(text)
            else:
                return self.regex_texts
        # Our composite regex yielded no matches
        return []

    # queries our children for regexes that match <text>
    def query_children_for_matching_regexes(self, text):
        # Create children if they don't yet exist
        self.ensure_split()
        # query children and join their results
        results = []
        for child in self.children:
            results += child.get_matching_regexes(text, False)
        return results

    # Returns the index of the first regex matching this string, or None if not found
    def index_first_matching_regex(self, text):
        if len(self.regex_texts) <= 1:
            if len(self.regex_texts) == 0:
                return None
            if self.matches(text):
                return 0
            return None
        if not self.matches(text):
            return None
        self.ensure_split()
        count = 0
        for child in self.children:
            child_index = child.index_first_matching_regex(text)
            if child_index is not None:
                return count + child_index
            count += len(child.regex_texts)
        return None

    # Create children if they don't yet exist
    def ensure_split(self):
        if self.children is None:
            # It takes more time to compile a longer regex, but it also takes more time to
            # test lots of small regexes.
            # In practice, this number of children seems to result in fast execution
            num_children = min(len(self.regex_texts), 32)
            child_start = 0
            self.children = []
            for i in range(num_children):
                child_end = int(len(self.regex_texts) * (i + 1) / num_children)
                self.children.append(regexes_matcher(self.regex_texts[child_start:child_end]))
                child_start = child_end


    def matches(self, text):
        if self.matcher is None:
            full_regex_text = "(?:" + ")|(?:".join(self.regex_texts) + ")"
            self.matcher = re.compile(full_regex_text)
        return self.matcher.fullmatch(text)


def print_failing_task_names(lines):
    tasks_of_interest = []
    # first, find tasks of interest
    for line in lines:
        if line.startswith("Execution failed for task"):
            tasks_of_interest.append(line.split("task '")[1][:-3])

    print("Detected these failing tasks: " + str(tasks_of_interest))

def shorten_uninteresting_stack_frames(lines):
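    # Collapses each run of consecutive non-androidx ("boring") stack frames down to the first
    # frame of the run, with "..." appended to mark the omission; androidx frames are kept as-is.
    # Illustrative example: three consecutive "\tat java.base/..." lines become a single line
    # ending in "..."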
    result = []
    prev_line_is_boring = False
    for line in lines:
        if line.startswith("\tat ") and not line.startswith("\tat androidx"):
            # non-androidx stack frame
            if not prev_line_is_boring:
                result.append(line.replace("\n", "...\n"))
            prev_line_is_boring = True
        else:
            result.append(line)
            prev_line_is_boring = False
    return result

# Returns the path of the config file holding exemptions for deterministic/consistent output.
# These exemptions can be garbage collected via the `--gc` argument
def get_deterministic_exemptions_path():
    return os.path.join(dir_of_this_script, "messages.ignore")

# Returns the path of the config file holding exemptions for nondeterministic/flaky output.
# These exemptions will not be garbage collected via the `--gc` argument
def get_flake_exemptions_path():
    return os.path.join(dir_of_this_script, "message-flakes.ignore")

# Returns a regexes_matcher that matches what is described by our config file
# Ignores comments and ordering in our config file
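# Illustrative example of config content (hypothetical entries):
#   # > Task :sample:someTask
#   Warning: something happened [0-9]+ times
# The comment line is skipped; each remaining line is a regex that must fully match an output
# line for that output line to be exempted.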
def build_exemptions_matcher(config_lines):
    config_lines = [line.replace("\n", "") for line in config_lines]
    regexes = []
    for line in config_lines:
        line = line.strip()
        if line.startswith("#") or line == "":
            # skip comments
            continue
        regexes.append(line)
        if remove_control_characters(line) != line:
            raise Exception("Unexpected control characters found in configuration line:\n\n " +
                "'" + line + "'\n\nThis line is not expected to match anything. Is this a copy/paste mistake?")

    return regexes_matcher(sorted(regexes))

# Returns a regexes_matcher that matches the content of our config file
# Can match comments
# Respects ordering in the config
# This is used for editing the config file itself
def build_exemptions_code_matcher(config_lines):
    config_lines = [line.strip() for line in config_lines]
    regexes = []
    for line in config_lines:
        line = line.strip()
        if line == "":
            continue
        regexes.append(line)
    return regexes_matcher(regexes)

def remove_by_regexes(lines, config_lines, validate_no_duplicates):
    fast_matcher = build_exemptions_matcher(config_lines)
    result = []
    for line in lines:
        stripped = line.strip()
        matching_exemptions = fast_matcher.get_matching_regexes(stripped, expect_match=True)
        if validate_no_duplicates and len(matching_exemptions) > 1:
            print("")
            print("build_log_simplifier.py: Invalid configuration: multiple message exemptions match the same message. Are some exemptions too broad?")
            print("")
            print("Line: '" + stripped + "'")
            print("")
            print(str(len(matching_exemptions)) + " Matching exemptions:")
            for exemption_text in matching_exemptions:
                print("'" + exemption_text + "'")
            sys.exit(1)
        if len(matching_exemptions) < 1:
            result.append(line)
    return result

def collapse_consecutive_blank_lines(lines):
    result = []
    prev_blank = True
    for line in lines:
        if line.strip() == "":
            if not prev_blank:
                result.append(line)
            prev_blank = True
        else:
            result.append(line)
            prev_blank = False
    return result

def remove_trailing_blank_lines(lines):
    while len(lines) > 0 and lines[-1].strip() == "":
        del lines[-1]
    return lines

def extract_task_name(line):
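    # e.g. (illustrative) extract_task_name("> Task :core:compileKotlin") returns ":core:compileKotlin"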
    prefix = "> Task "
    if line.startswith(prefix):
        return line[len(prefix):].strip()
    return None

def is_task_line(line):
    return extract_task_name(line) is not None

def extract_task_names(lines):
    names = []
    for line in lines:
        name = extract_task_name(line)
        if name is not None and name not in names:
            names.append(name)
    return names

# If a task has no output (or only blank output), this function removes the task (and its output)
# For example, turns this:
#  > Task :a
#  > Task :b
#  some message
#
# into this:
#
#  > Task :b
#  some message
def collapse_tasks_having_no_output(lines):
    result = []
    # When we see a task name, we might not emit it if it doesn't have any output
    # This variable is that pending task name, or None if we have no pending task
    pending_task = None
    pending_blanks = []
    for line in lines:
        is_section = is_task_line(line) or line.startswith("> Configure project ") or line.startswith("FAILURE: Build failed with an exception.")
        if is_section:
            pending_task = line
            pending_blanks = []
        elif line.strip() == "":
            # If we have a pending task and we found a blank line, then hold the blank line,
            # and only output it if we later find some nonempty output
            if pending_task is not None:
                pending_blanks.append(line)
            else:
                result.append(line)
        else:
            # We found some nonempty output, now we emit any pending task names
            if pending_task is not None:
                result.append(pending_task)
                result += pending_blanks
                pending_task = None
                pending_blanks = []
            result.append(line)
    return result

# Removes color characters and other ANSI control characters from this input
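# For example (illustrative), the sequences "\x1b[32m" (set color) and "\x1b[0m" (reset) are both
# stripped, so a line like "\x1b[32mBUILD SUCCESSFUL\x1b[0m" becomes "BUILD SUCCESSFUL"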
control_character_regex = re.compile(r"""
        \x1B  # Escape
        (?:   # 7-bit C1 Fe (except CSI)
            [@-Z\\-_]
        |     # or [ for CSI, followed by a control sequence
            \[
            [0-?]*  # Parameters
            [ -/]*  # Intermediate bytes
            [@-~]   # End
        )
        """, re.VERBOSE)

def remove_control_characters(line):
    return control_character_regex.sub("", line)

# Removes strings from the input wherever they are found
# This list is less convenient than the .ignore files:
#   This list doesn't get autosuggested additions
#   This list isn't automatically garbage collected
#   Users interested in seeing the exemption history probably won't think to look here
# This list does allow removing part of the text from a line and still validating the remainder of the line
# If this list eventually gets long we might want to make it easier to update
inline_ignores_regex = re.compile(
    # b/300072778
    "Sharing is only supported for boot loader classes because bootstrap classpath has been appended"
)

def remove_inline_ignores(line):
    return re.sub(inline_ignores_regex, "", line)

# Normalizes some filepaths to more easily simplify/skip some messages
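# For example (illustrative): if the log contains "OUT_DIR=/path/to/out", then a later message
# mentioning "/path/to/out/somefile.txt" gets rewritten to "$OUT_DIR/somefile.txt"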
def normalize_paths(lines):
    # get OUT_DIR, DIST_DIR, and the path of the root of the checkout
    out_dir = None
    dist_dir = None
    checkout_dir = None
    gradle_user_home = None
    # we read checkout_root from the log file in case this build was run in a different location,
    # such as on a build server
    out_marker = "OUT_DIR="
    dist_marker = "DIST_DIR="
    checkout_marker = "CHECKOUT="
    gradle_user_home_marker = "GRADLE_USER_HOME="
    for line in lines:
        if line.startswith(out_marker):
            out_dir = line.split(out_marker)[1].strip()
            continue
        if line.startswith(dist_marker):
            dist_dir = line.split(dist_marker)[1].strip()
            continue
        if line.startswith(checkout_marker):
            checkout_dir = line.split(checkout_marker)[1].strip()
            continue
        if line.startswith(gradle_user_home_marker):
            gradle_user_home = line.split(gradle_user_home_marker)[1].strip()
            continue
        if out_dir is not None and dist_dir is not None and checkout_dir is not None and gradle_user_home is not None:
            break

    # Remove any mentions of these paths, and replace them with consistent values
    # Make sure to put these paths in the correct order so that more-specific paths will
    # be matched first
    remove_paths = collections.OrderedDict()
    if gradle_user_home is not None:
        remove_paths[gradle_user_home] = "$GRADLE_USER_HOME"
    if dist_dir is not None:
        remove_paths[dist_dir] = "$DIST_DIR"
    if out_dir is not None:
        remove_paths[out_dir] = "$OUT_DIR"
    if checkout_dir is not None:
        remove_paths[checkout_dir + "/frameworks/support"] = "$SUPPORT"
        remove_paths[checkout_dir] = "$CHECKOUT"
    result = []
    for line in lines:
        for path in remove_paths:
            if path in line:
                replacement = remove_paths[path]
                line = line.replace(path + "/", replacement + "/")
                line = line.replace(path, replacement)
        result.append(line)
    return result

# Given a regex with hashes in it like ".gradle/caches/transforms-2/files-2.1/73f631f487bd87cfd8cb2aabafbac6a8",
# tries to return a more generalized regex like ".gradle/caches/transforms-2/files-2.1/[0-9a-f]{32}"
def generalize_hashes(message):
    hash_matcher = "[0-9a-f]{32}"
    return re.sub(hash_matcher, hash_matcher, message)

# Given a regex with numbers in it like ".gradle/caches/transforms-2/files-2.1/73f631f487bd87cfd8cb2aabafbac6a8"
# tries to return a more generalized regex like ".gradle/caches/transforms-[0-9]+/files-[0-9]+.[0-9]+/73f631f487bd87cfd8cb2aabafbac6a8"
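# Illustrative example of the interplay with generalize_hashes(): "files-2.1/[0-9a-f]{32}"
# temporarily becomes "files-[0-9]+.[0-9]+/[[0-9]+-[0-9]+a-f]{[0-9]+}" and is then repaired to
# "files-[0-9]+.[0-9]+/[0-9a-f]{32}"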
def generalize_numbers(message):
    matcher = "[0-9]+"
    generalized = re.sub(matcher, matcher, message)
    # the above replacement corrupts strings of the form "[0-9a-f]{32}", so we fix them before returning
    return generalized.replace("[[0-9]+-[0-9]+a-f]{[0-9]+}", "[0-9a-f]{32}")

# Given a list of output messages and a list of existing exemption lines,
# generates a new list of exemption lines
def generate_suggested_exemptions(messages, config_lines, remove_unmatched_lines):
    new_config = suggest_missing_exemptions(messages, config_lines)
    if remove_unmatched_lines:
        new_config = remove_unmatched_exemptions(messages, new_config)
    return new_config

# Given a list of output messages and a list of existing exemption lines,
# generates an augmented list of exemptions containing any necessary new exemptions
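# For example (illustrative), new output from a previously quiet task might be recorded as:
#   # > Task :sample:newTask
#   Some new warning seen [0-9]+ times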
def suggest_missing_exemptions(messages, config_lines):
    # given a message, finds the index of the existing exemption for that message, if any
    existing_matcher = build_exemptions_code_matcher(config_lines)
    # the index of the previously matched exemption
    previous_found_index = -1
    # map from line index to list of lines to insert there
    insertions_by_position = collections.defaultdict(lambda: [])
    insertions_by_task_name = collections.OrderedDict()
    # current task generating any subsequent output
    pending_task_line = None
    # new, suggested exemptions
    new_suggestions = set()
    # generate new suggestions
    for line in messages:
        line = line.strip()
        if line == "":
            continue
        # save task name
        is_section = False
        if is_task_line(line) or line.startswith("> Configure project "):
            # If a task creates output, we record its name
            line = "# " + line
            pending_task_line = line
            is_section = True
        # determine where to put task name
        current_found_index = existing_matcher.index_first_matching_regex(line)
        if current_found_index is not None:
            # We already have a mention of this line
            # We don't need to exempt it again, but this informs where to insert our next exemption
            previous_found_index = current_found_index
            pending_task_line = None
            continue
        # skip outputting task names for tasks that don't output anything
        if is_section:
            continue

        # escape message
        escaped = re.escape(line)
        escaped = escaped.replace("\\ ", " ") # spaces don't need to be escaped
        escaped = generalize_hashes(escaped)
        escaped = generalize_numbers(escaped)
        # confirm that we haven't already inserted this message
        if escaped in new_suggestions:
            continue
        # insert this regex into an appropriate position
        if pending_task_line is not None:
            # We know which task this line came from, and it's a task that didn't previously make output
            if pending_task_line not in insertions_by_task_name:
                insertions_by_task_name[pending_task_line] = []
            insertions_by_task_name[pending_task_line].append(escaped)
        else:
            # This line of output didn't come from a new task
            # So we append it after the previous line that we found
            insertions_by_position[previous_found_index].append(escaped)
        new_suggestions.add(escaped)

    # for each regex for which we chose a position in the file, insert it there
    exemption_lines = []
    for i in range(len(existing_matcher.regex_texts)):
        exemption_lines.append(existing_matcher.regex_texts[i])
        if i in insertions_by_position:
            exemption_lines += insertions_by_position[i]
    # for regexes that could not be assigned to a task, insert them next
    if -1 in insertions_by_position:
        exemption_lines += insertions_by_position[-1]
    # for regexes that were simply assigned to certain task names, insert them there, grouped by task
    for task_name in insertions_by_task_name:
        exemption_lines.append(task_name)
        exemption_lines += insertions_by_task_name[task_name]
    return exemption_lines

# Searches for config lines in <config_lines> that match no line in <messages>
# Creates and returns a new list of config lines, which excludes unmatched lines and
# any corresponding comments
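# For example (illustrative): given config lines
#   # comment A
#   unused_regex
#   # comment B
#   used_regex
# where only used_regex still matches a message, the result keeps "# comment B" and "used_regex"
# and drops "# comment A" and "unused_regex"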
def remove_unmatched_exemptions(messages, config_lines):
    existing_matcher = build_exemptions_matcher(config_lines)
    matched_config_lines = set()
    # find all of the regexes that match at least one message
    for line in messages:
        line = line.strip()
        if line.startswith("#"):
            continue
        for regex in existing_matcher.get_matching_regexes(line):
            matched_config_lines.add(regex)
    # generate a new list of config lines
    # keep config lines that were matched in the list of messages
    # keep comments where there remains a matched config line before the next comment
    # skip comments that were previously followed by other config lines that were deleted
    result = []
    pending_comments = [] # comments that we haven't yet decided to keep or not
    found_unused_line_after_comment = False
    for line in config_lines:
        if line.startswith("#"):
            # We found a comment
            if found_unused_line_after_comment:
                # We found an unused config line more recently than the previous comment,
                # and now we've found a new comment.
                if len(pending_comments) > 0:
                    # We also haven't found any used config lines more recently than the previous comment
                    # Presumably these pending comments were intended to describe the lines that we're removing
                    # So, we skip emitting these pending comments too
                    pending_comments = []
            pending_comments.append(line)
            found_unused_line_after_comment = False
            continue
        matched = (line in matched_config_lines)
        if matched:
            # If this config line is being used, then we keep its comments too
            result += pending_comments
            pending_comments = []
            result.append(line)
        else:
            found_unused_line_after_comment = True
    # If there are comments at the bottom of the file that aren't followed by an unused config
    # line, then keep them too
    if not found_unused_line_after_comment:
        result += pending_comments
    return result

# opens a file and reads the lines in it
def readlines(path):
    with open(path) as infile:
        return infile.readlines()

def writelines(path, lines):
    with open(path, 'w') as destfile:
        destfile.write("\n".join(lines))

def main():
    arguments = parser.parse_args()
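    # Three modes, as implemented below:
    #   --update / --gc : rewrite the deterministic exemptions file to cover this log
    #   --validate      : fail (exit 1) if the log contains unrecognized messages
    #   default         : print a simplified version of the log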

    # read each file
    log_paths = arguments.log_path
    all_lines = []
    for log_path in log_paths:
        lines = readlines(log_path)
        lines = [remove_control_characters(line) for line in lines]
        lines = [remove_inline_ignores(line) for line in lines]
        lines = normalize_paths(lines)
        all_lines += lines
    # load configuration
    flake_exemption_regexes = readlines(get_flake_exemptions_path())
    deterministic_exemption_regexes = readlines(get_deterministic_exemptions_path())
    exemption_regexes = flake_exemption_regexes + deterministic_exemption_regexes
    # remove lines we're not interested in
    update = arguments.update or arguments.gc
    validate = update or arguments.validate
    interesting_lines = all_lines
    if not validate:
        print_failing_task_names(interesting_lines)
    interesting_lines = remove_by_regexes(interesting_lines, exemption_regexes, validate)
    interesting_lines = collapse_tasks_having_no_output(interesting_lines)
    interesting_lines = collapse_consecutive_blank_lines(interesting_lines)
    interesting_lines = remove_trailing_blank_lines(interesting_lines)

    # process results
    if update:
        if arguments.gc or len(interesting_lines) != 0:
            update_path = get_deterministic_exemptions_path()
            # filter out any inconsistently observed messages so we don't try to exempt them twice
            all_lines = remove_by_regexes(all_lines, flake_exemption_regexes, validate)
            # update the deterministic exemptions file based on the result
            suggested = generate_suggested_exemptions(all_lines, deterministic_exemption_regexes, arguments.gc)
            writelines(update_path, suggested)
            print("build_log_simplifier.py updated exemptions " + update_path)
    elif validate:
        if len(interesting_lines) != 0:
            print("")
            print("=" * 80)
            print("build_log_simplifier.py: Error: Found " + str(len(interesting_lines)) + " new lines of warning output!")
            print("")
            print("The new output:")
            print("  " + "  ".join(interesting_lines))
            print("")
            print("To reproduce this failure:")
            print("  Try $ ./gradlew -Pandroidx.validateNoUnrecognizedMessages --rerun-tasks " + " ".join(extract_task_names(interesting_lines)))
            print("")
            print("Instructions:")
            print("  If you can fix these messages, do so.")
            print("  If you cannot fix these messages, you may suppress them.")
            print("    To automatically suppress new output from build server builds, run development/build_log_simplifier/update.sh")
            print("  See also https://android.googlesource.com/platform/frameworks/support/+/androidx-main/development/build_log_simplifier/VALIDATION_FAILURE.md")
            print("")
            new_exemptions_path = log_paths[0] + ".ignore"
            # filter out any inconsistently observed messages so we don't try to exempt them twice
            all_lines = remove_by_regexes(all_lines, flake_exemption_regexes, validate)
            # update deterministic exemptions file based on the result
            suggested = generate_suggested_exemptions(all_lines, deterministic_exemption_regexes, arguments.gc)
            writelines(new_exemptions_path, suggested)
            print("Files:")
            print("  Full Log                   : " + ",".join(log_paths))
            print("  Baseline                   : " + get_deterministic_exemptions_path())
            print("  Autogenerated new baseline : " + new_exemptions_path)
            sys.exit(1)
    else:
        interesting_lines = shorten_uninteresting_stack_frames(interesting_lines)
        print("".join(interesting_lines))

if __name__ == "__main__":
    main()