1#
2#  Copyright (C) 2019 The Android Open Source Project
3#
4#  Licensed under the Apache License, Version 2.0 (the "License");
5#  you may not use this file except in compliance with the License.
6#  You may obtain a copy of the License at
7#
8#       http://www.apache.org/licenses/LICENSE-2.0
9#
10#  Unless required by applicable law or agreed to in writing, software
11#  distributed under the License is distributed on an "AS IS" BASIS,
12#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13#  See the License for the specific language governing permissions and
14#  limitations under the License.
15#
16"""A helper script for validateRefactor.sh. Should generally not be used directly.
17
18Can be used directly if validateRefactor.sh has already created the out-old & out-new dirs.
19In such a case, it can be run to compare those directories without regenerating them.
20This is generally only useful when updating baselines or iterating on this script itself.
21Takes baseline names as CLI arguments, which may be passed through from validateRefactor.sh.
22
23Typical usage example:
24
25  python validateRefactorHelper.py agpKmp
26"""
27import itertools
28import logging
29import queue
30import re
31import shutil
32import subprocess
33import sys
34import threading
35from typing import Dict
36
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Every list below is a flat run of `diff` CLI arguments: "-x", <pattern>, "-x", <pattern>, ...

# noto-emoji-compat `bundleinside`s an externally-built with-timestamps jar.
# classes.jar is compared using `diffuse` instead of unzipping and diffing class files.
bannedJars = [arg for jarName in ("noto-emoji-compat-java.jar", "classes.jar") for arg in ("-x", jarName)]

# These extensions aren't for unzipping, but the coarse exclude-everything-but-<archive type>
# patterns used by the unzip passes don't exclude them, so they are excluded explicitly.
_nonZipExtensions = (
    "java", "json", "kt", "knm", "xml",
    "sha1", "sha256", "sha512", "md5",
    "module", "pom", "html",
)
areNotZips = [arg for extension in _nonZipExtensions for arg in ("-x", r"**\." + extension)]

# Extension-less file names, which the extension-based patterns cannot catch.
hasNoExtension = [arg for fileName in ("manifest", "module") for arg in ("-x", fileName)]

# Everything that must never be handed to `unzip`.
doNotUnzip = [*bannedJars, *areNotZips, *hasNoExtension]
51
def diff(excludes):
    """Recursively `diff` the out-old and out-new dist directories.

    Args:
        excludes: list of extra `diff` command-line arguments, appended after the two directories.

    Returns:
        The command's stdout as a list of lines, as produced by popenAndReturn.
    """
    baseCommand = ["diff", "-r", "../../out-old/dist/", "../../out-new/dist/"]
    return popenAndReturn(baseCommand + excludes)
54
def popenAndReturn(args):
    """Run a command to completion and return its stdout split into lines.

    Args:
        args: the command and its arguments, as a list of strings.

    Returns:
        The UTF-8 decoded stdout split on "\n" (so output ending in a newline yields a
        trailing empty string).

    Raises:
        FileNotFoundError: if the executable does not exist.
    """
    logger.debug(" ".join(args))
    # subprocess.run waits for the child and closes the pipe, so no process or file handle is
    # left behind. The exit status is deliberately not checked: callers run `diff`, which
    # exits non-zero whenever it finds differences.
    completed = subprocess.run(args, stdout=subprocess.PIPE, check=False)
    return completed.stdout.decode("utf-8").split("\n")
58
# Finds and unzips all files with old/new diff that _do not_ match the argument patterns.
# Because the `diff` command doesn't have an --include, only --exclude.
def findFilesNotMatchingWithDiffAndUnzip(*regexesToExclude):
    """Unzip, on both the old and new side, every changed file not excluded by the patterns.

    Args:
        *regexesToExclude: `diff -x` patterns for the files to skip. Any number may be passed;
            the entries of doNotUnzip are always excluded as well.
    """
    # Interleave one "-x" per pattern, however many patterns there are.
    excludeArgs = [arg for pattern in regexesToExclude for arg in ("-x", pattern)]
    # Exclude all things that are *not* the desired zip type
    diffLines = diff(["-q"] + excludeArgs + doNotUnzip)
    workQueueOfZips = queue.LifoQueue()
    for diffLine in diffLines:
        # Take only changed files ("Files <old> and <new> differ"), not new/deleted ones
        # (the diff there is obvious).
        if not diffLine.startswith("Files"):
            continue
        # Words 1 and 3 of the line are the old and new paths; both get unzipped.
        for zipPath in diffLine.split()[1:4:2]:
            workQueueOfZips.put(zipPath)
    # And unzip them
    # If we spam unzip commands without a break, the unzips start failing.
    # If we wait after every unzip, the script runs very slowly.
    # So create a pool of up to 10 unzip workers to consume from the queue.
    numWorkers = 10
    workers = [
        threading.Thread(target=unzipWorker, args=(workQueueOfZips,))
        for _ in range(min(numWorkers, workQueueOfZips.qsize()))
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
82
def unzipWorker(workQueueOfZips):
    """Drain the queue, unzipping each path into a fresh "<path>.unzipped/" directory.

    Args:
        workQueueOfZips: a queue.Queue (or subclass) of archive paths, shared between workers.
    """
    while True:
        # Take-or-stop in one step: a separate empty() check followed by get() can lose a
        # race with another worker that grabs the last item in between.
        try:
            zipFilePath = workQueueOfZips.get_nowait()
        except queue.Empty:
            return
        unzippedDir = zipFilePath + ".unzipped/"
        # Remove the result of any previous run so stale files cannot show up in the diff.
        try:
            shutil.rmtree(unzippedDir)
        except FileNotFoundError:
            pass
        logger.debug("unzipping " + zipFilePath)
        # Wait for each unzip to finish before taking the next item; the exit status is
        # ignored, as before.
        subprocess.run(["unzip", "-qq", "-o", zipFilePath, "-d", unzippedDir], check=False)
90
diffusePath = "../../prebuilts/build-tools/diffuse/diffuse-0.3.0/bin/diffuser"

# Set to False the first time the diffuse binary turns out to be missing, so that later calls
# return immediately instead of failing and warning again.
diffuseIsPresent = True
def compareWithDiffuse(listOfJars):
    """Log a `diffuse` comparison of each given out-old jar against its out-new counterpart.

    Args:
        listOfJars: paths of jars under out-old; empty/None entries are skipped. The new jar
            path is derived by replacing "out-old" with "out-new" in each path.
    """
    global diffuseIsPresent
    if not diffuseIsPresent: return
    for jarPath in filter(None, listOfJars):
        logger.info("jarpath: " + jarPath)
        newJarPath = jarPath.replace("out-old", "out-new")
        try:
            diffuseOutput = popenAndReturn([diffusePath, "diff", "--jar", jarPath, newJarPath])
        except FileNotFoundError:
            # Python f-strings interpolate with {name}; a leading "$" would be printed literally.
            logger.warning("https://github.com/JakeWharton/diffuse is not present on disk in expected location"
                           f" {diffusePath}. You can install it.")
            diffuseIsPresent = False
            return
        logger.info("\n".join(diffuseOutput))
106
# We might care to know whether .sha1 or .md5 files have changed, but changes in those files will
# always be accompanied by more meaningful changes in other files, so we don't need to show changes
# in .sha1 or .md5 files, or in .module files showing the hashes of other files, or config names.
# "-x" skips whole files by name pattern; "-I" ignores changed lines matching the given regex.
excludedHashes = [
    "-x", "*.md5*",
    "-x", "*.sha**",
    "-I", '        "md5".*',
    "-I", '        "sha.*',
    "-I", '        "size".*',
    "-I", '      "name".*',
]
# Don't care about maven-metadata files because they have timestamps in them.
# temporarily ignore knm files
# If changes to the dackka args json are meaningful, they will affect the generated docs and show diff there
_ignoredFilePatterns = ("*maven-metadata.xml**", r"**\.knm", "dackkaArgs-docs-tip-of-tree.json")
excludedFiles = [arg for pattern in _ignoredFilePatterns for arg in ("-x", pattern)]
# Also, ignore files that we already unzipped
excludedZips = [arg for extension in ("zip", "jar", "aar", "apk", "klib") for arg in ("-x", "*." + extension)]
118
# These are baselined changes that we understand and know are no-ops in refactors
# "Unskippable" changes are multi-line and can't be skipped in `diff`, so post-process
#
# Each entry below is one line of `diff` output. When the agpKmp baseline is selected, the
# entries are used in two ways elsewhere in this file: with the leading ">"/"<" marker
# stripped they are passed to `diff -I`, and with the marker kept they are matched as literal
# line prefixes by removeLinesStartingWith. The whitespace inside each string is significant.
baselinedChangesForAgpKmp = [
    # these are new attributes being added
    """>         "org.gradle.libraryelements": "aar",""",
    """>         "org.gradle.jvm.environment": "android",""",
    """>         "org.gradle.jvm.environment": "non-jvm",""",
    """>         "org.gradle.jvm.environment": "standard-jvm",""",
    """>       <type>aar</type>""",
    # this attribute swap occurs alongside the above new attributes added.
    # https://chat.google.com/room/AAAAW8qmCIs/4phaNn_gsrc
    """<         "org.jetbrains.kotlin.platform.type": "androidJvm\"""",
    """>         "org.jetbrains.kotlin.platform.type": "jvm\"""",
    # name-only change; nothing resolves based on names
    """<      "name": "releaseApiElements-published",""",
    """>      "name": "androidApiElements-published",""",
    # NOTE(review): this entry has no ">"/"<" marker, so as a line prefix it can only match
    # lines that begin with these spaces - presumably it is meant for `diff -I` only; confirm.
    """             <pre>actual typealias""",  # open bug in dackka b/339221337
    # we are switching from our KMP sourcejars solution to the upstream one
    """<         "org.gradle.docstype": "fake-sources",""",
    """>         "org.gradle.docstype": "sources",""",
]
# Compiled regexes that each describe one whole multi-line piece of `diff` output. When the
# agpKmp baseline is selected, processMegaDiff deletes every match from each file's diff text
# before splitting it into segments. The patterns are whitespace-sensitive, and several of them
# begin (and one also ends) with a newline because of where the triple quotes are placed.
unskippableBaselinedChangesForAgpKmp = [
# This was an AGP workaround for a dependency resolution issue for kotlin stdlib
# https://chat.google.com/room/AAAAW8qmCIs/4phaNn_gsrc
re.compile(r"""
[0-9]+,[0-9]+c[0-9]+
<           \},
<           "excludes": \[
<             \{
<               "group": "org.jetbrains.kotlin",
<               "module": "kotlin-stdlib-common"
<             \},
<             \{
<               "group": "org.jetbrains.kotlin",
<               "module": "kotlin-test-common"
<             \},
<             \{
<               "group": "org.jetbrains.kotlin",
<               "module": "kotlin-test-annotations-common"
<             \}
<           \]
---
>           \}"""),
# The pom counterpart of the excludes above; it has no location header, so it matches the
# removed lines wherever they appear inside a segment.
re.compile(r"""
<       <exclusions>
<         <exclusion>
<           <groupId>org.jetbrains.kotlin</groupId>
<           <artifactId>kotlin-stdlib-common</artifactId>
<         </exclusion>
<         <exclusion>
<           <groupId>org.jetbrains.kotlin</groupId>
<           <artifactId>kotlin-test-common</artifactId>
<         </exclusion>
<         <exclusion>
<           <groupId>org.jetbrains.kotlin</groupId>
<           <artifactId>kotlin-test-annotations-common</artifactId>
<         </exclusion>
<       </exclusions>"""),
# .module files[] blocks aren't ordered; baseline reordering of samples-sources b/374956513
# NOTE(review): the "." between the version digits is unescaped (matches any character) and
# each version component is a single digit - confirm that is intended.
re.compile(r"""
[0-9]+,[0-9]+d[0-9]+
<           "name": "[a-z3\-]+-[0-9].[0-9].[0-9](-[a-z0-9]+)?-samples-sources.jar",
<           "url": "[a-z3\-]+-[0-9].[0-9].[0-9](-[a-z0-9]+)?-samples-sources.jar",
<           "size": [0-9]+,
<           "sha512": "[0-9a-z]+",
<           "sha256": "[0-9a-z]+",
<           "sha1": "[0-9a-z]+",
<           "md5": "[0-9a-z]+"
<         \},
<         \{
[0-9]+a[0-9]+,[0-9]+
>         \},
>         \{
>           "name": "[a-z3\-]+-[0-9].[0-9].[0-9](-[a-z0-9]+)?-samples-sources.jar",
>           "url": "[a-z3\-]+-[0-9].[0-9].[0-9](-[a-z0-9]+)?-samples-sources.jar",
>           "size": [0-9]+,
>           "sha512": "[0-9a-z]+",
>           "sha256": "[0-9a-z]+",
>           "sha1": "[0-9a-z]+",
>           "md5": "[0-9a-z]+"
"""),
# This one is okay because the common pom expresses a dependency on the jvm pom
# https://repo1.maven.org/maven2/org/jetbrains/kotlinx/kotlinx-coroutines-core/1.7.3/kotlinx-coroutines-core-1.7.3.pom
re.compile(r"""[0-9]+c[0-9]+
<       <artifactId>kotlinx-coroutines-core-jvm</artifactId>
---
>       <artifactId>kotlinx-coroutines-core</artifactId>"""),
# AGP-KMP adds a new default sourceSet, which in itself doesn't do anything
# Only the two exact location headers listed in the pattern are baselined.
re.compile(r"""(11,17d10|12,18d11)
<       "name": "androidRelease",
<       "dependencies": \[
<         "commonMain"
<       \],
<       "analysisPlatform": "jvm"
<     \},
<     \{
"""),
]
217
# Baseline selection: each recognized CLI argument switches on one named set of known-benign
# changes, which the rest of the script then filters out of the reported diff.
baselines = []  # names of the baselines that are switched on
baselinedChanges = []  # single diff lines to ignore
unskippableBaselinedChanges = []  # compiled multi-line regexes to strip from the diff output
arguments = sys.argv[1:]
if "agpKmp" in arguments:
    arguments.remove("agpKmp")
    baselines += ["agpKmp"]
    logger.info("IGNORING DIFF FOR agpKmp")
    baselinedChanges += baselinedChangesForAgpKmp
    unskippableBaselinedChanges += unskippableBaselinedChangesForAgpKmp
    excludedFiles += ["-x", r"**\.aar.unzipped/res"]  # agp-kmp may add this empty
# Anything still left over was not recognized above.
if arguments:
    logger.error("invalid argument(s) for validateRefactorHelper: " + ", ".join(arguments))
    logger.error("currently recognized arguments: agpKmp")
    # Exit with a failure status so the calling script can tell the run did not happen; a
    # bare exit() reports success and is only available when the `site` module is loaded.
    sys.exit(1)
232
# interleave "-I" to tell diffutils to 'I'gnore the baselined lines. The leading ">"/"<" diff
# marker is stripped because `diff -I` is matched against file content, not against diff output.
# One "-I" is emitted per baselined change, however many there are.
baselinedChangesArgs = [
    arg
    for baselinedChange in baselinedChanges
    for arg in ("-I", baselinedChange.removeprefix(">").removeprefix("<"))
]
235
def removeLinesStartingWith(listOfStrings, listOfStringsToMatchAgainst):
    """Return, in order, the strings that start with none of the given prefixes.

    Args:
        listOfStrings: the candidate lines.
        listOfStringsToMatchAgainst: prefixes; a line starting with any of them is dropped.

    Returns:
        A new list holding the surviving lines.
    """
    bannedPrefixes = tuple(listOfStringsToMatchAgainst)
    survivors = []
    for candidate in listOfStrings:
        if candidate.startswith(bannedPrefixes):
            continue
        survivors.append(candidate)
    return survivors
238
# removeLinesWithChangedSuffixes(["foo-bar"], ["foo"], "-bar") returns [], []
def removeLinesWithChangedSuffixes(newStrings, oldStrings, newSuffix, oldSuffix=""):
    """Drop new/old line pairs that differ only by a changed trailing suffix.

    A new string that ends with newSuffix is paired with an old string when swapping that
    trailing newSuffix for oldSuffix reproduces the old string exactly. Only the trailing
    occurrence is swapped; the same text earlier in the string is left alone.

    Args:
        newStrings: the added lines.
        oldStrings: the removed lines.
        newSuffix: the suffix carried by the new lines.
        oldSuffix: the suffix the old lines carry in its place (default: none).

    Returns:
        A (resultNew, resultOld) pair: the inputs, in order, without the paired-up lines.
    """
    # What each candidate new line would have looked like with the old suffix.
    convertedByIndex = {
        i: string[:len(string) - len(newSuffix)] + oldSuffix
        for i, string in enumerate(newStrings)
        if string.endswith(newSuffix)
    }
    # First position of each old string, so lookups don't rescan the list every time.
    firstOldIndex = {}
    for i, string in enumerate(oldStrings):
        firstOldIndex.setdefault(string, i)
    confirmedIndicesNew = {i for i, converted in convertedByIndex.items() if converted in firstOldIndex}
    confirmedIndicesOld = {firstOldIndex[convertedByIndex[i]] for i in confirmedIndicesNew}
    resultNew = [string for i, string in enumerate(newStrings) if i not in confirmedIndicesNew]
    resultOld = [string for i, string in enumerate(oldStrings) if i not in confirmedIndicesOld]
    return resultNew, resultOld
248
# remove baselined elements from a single diff segment, starting with a location-in-file element like 223c220
def processDiffSegment(segment, fileExtension):
    """Filter one diff segment down to the changes that are not baselined.

    Args:
        segment: a location header (e.g. "223c220") plus the "<", ">" and "---" lines under it.
        fileExtension: extension of the diffed file; "pom" enables the artifactId suffix rule
            when the agpKmp baseline is active.

    Returns:
        The segment with baselined lines removed, or "" when no "<"/">" line is left.
    """
    if not segment:
        return ""
    survivingLines = removeLinesStartingWith(segment.split("\n"), baselinedChanges)
    # Content of the changed lines, without the leading diff marker.
    additions = [entry[1:] for entry in survivingLines if entry.startswith(">")]
    deletions = [entry[1:] for entry in survivingLines if entry.startswith("<")]
    if fileExtension == "pom" and "agpKmp" in baselines:
        # Ignore artifactIds' new -jvm and -android suffixes in poms b/356612738
        for platformSuffix in ("-jvm</artifactId>", "-android</artifactId>"):
            additions, deletions = removeLinesWithChangedSuffixes(
                additions, deletions, platformSuffix, "</artifactId>")
    meaningfulLines = {">" + text for text in additions} | {"<" + text for text in deletions}
    # Do not keep any formatting lines or the header if there is no content
    if not meaningfulLines:
        return ""
    # Walk the original lines so the original ordering is retained. Keep the meaningful
    # content lines, plus formatting lines like "---" and the header (which don't start with <>).
    outputLines = []
    for entry in survivingLines:
        if entry == "":
            continue
        if entry[0] in "<>" and entry not in meaningfulLines:
            continue
        outputLines.append(entry)
    return "\n".join(outputLines)
266
# Replacement callback for a baselined multi-line hunk: delete it, but keep one line break when
# the match swallowed the newline on both of its sides, so the lines around it stay separate.
def _replacementForBaselinedHunk(match):
    matchedText = match.group(0)
    if matchedText.startswith("\n") and matchedText.endswith("\n"):
        return "\n"
    return ""

# The output of diff entails multiple files, and multiple segments per file
# This function removes baselined changes from the entire diff output
def processMegaDiff(inputString):
    """Strip every baselined change from the complete output of a recursive diff.

    Args:
        inputString: the `diff -r` output, one "diff -r <args> <old> <new>" line per changed
            file followed by that file's segments.

    Returns:
        The files that still have non-baselined segments, each as its new-side path followed
        by its remaining segments. Entries are joined with "\ndiff ", so every entry after
        the first is prefixed with "diff ". Returns "" when nothing remains.
    """
    perFileDiffs = inputString.split("diff -r")
    processedPerFileDiffs = []
    # The 0th piece is whatever precedes the first "diff -r" line; it is not a file diff.
    for perFileDiff in perFileDiffs[1:]:
        diffStatement, _, diffContent = perFileDiff.partition("\n")
        newFilePath = diffStatement.rpartition(" ")[2]
        fileExtension = newFilePath.rpartition(".")[2]
        # Several baseline regexes begin with a newline. Give the content a leading one so
        # that they can also match when the baselined hunk is the first one of the file.
        diffContent = "\n" + diffContent
        for multilineBaselinedElement in unskippableBaselinedChanges:
            diffContent = multilineBaselinedElement.sub(_replacementForBaselinedHunk, diffContent)
        diffSegments = re.split(r'(^[0-9]+[0-9acd,]*\n)', diffContent, flags=re.MULTILINE)
        result = []
        # every other piece is a segment header like 99,112d87; the 0th precedes the first header.
        # a complete segment is a location-in-file header and everything until the next header. E.g.
        # 83c70
        # <       <artifactId>kotlinx-coroutines-core-jvm</artifactId>
        # ---
        # >       <artifactId>kotlinx-coroutines-core</artifactId>
        for segmentHeader, segmentBody in zip(diffSegments[1::2], diffSegments[2::2]):
            processedSegment = processDiffSegment(segmentHeader + segmentBody, fileExtension)
            if processedSegment:
                result.append(processedSegment)
        if result:
            processedPerFileDiffs.append(newFilePath + "\n" + "\n".join(result))
    return "\ndiff ".join(processedPerFileDiffs)
292
# We unzip multiple times in this order because e.g. zips can contain apks.
# Each pass lists the patterns to *exclude*, because `diff` can only exclude, not include.
_unzipPasses = (
    # Find all zip files with a diff, e.g. the tip-of-tree-repository file, and maybe the docs zip
    ("UNZIPPING ZIP FILES", (r"**\.[^z][a-z]*",)),
    # Find all aar and apk files with a diff. The proper regex would be `.*\..*[^akpr]+.*`, but it
    # doesn't work in difftools exclude's very limited regex syntax.
    ("UNZIPPING AAR/APK FILES", (r"**\.zip", r"**\.jar", r"**\.klib")),
    # Find all jars and klibs and unzip them (comes after because they could be inside aars/apks).
    ("UNZIPPING JAR/KLIB FILES", (r"**\.zip", r"**\.aar", r"**\.apk")),
)
for _passLabel, _passExcludes in _unzipPasses:
    logger.info(_passLabel)
    findFilesNotMatchingWithDiffAndUnzip(*_passExcludes)

# now find all diffs in classes.jars
# TODO(375636734) Disabled because this tracks internal methods' diffs
# classesJarsWithDiffs = popenAndReturn(["find", "../../out-old/dist/", "-name", "classes.jar"])
# logger.info("classes.jar s: " + str(classesJarsWithDiffs))
# compareWithDiffuse(classesJarsWithDiffs)

# Now find all diffs in non-zipped files, drop the baselined changes, and print what is left
finalExcludes = excludedHashes + excludedFiles + excludedZips + baselinedChangesArgs
finalDiff = processMegaDiff("\n".join(diff(finalExcludes)))
print(finalDiff)
316