""" This linter ensures that users don't set a SHA hash checksum in Bazel for the http_archive. Although the security practice of setting the checksum is good, it doesn't work when the archive is downloaded from some sites like GitHub because it can change. Specifically, GitHub gives no guarantee to keep the same value forever. Check for more details at https://github.com/community/community/discussions/46034. """ from __future__ import annotations import argparse import json import re import shlex import subprocess import sys import xml.etree.ElementTree as ET from enum import Enum from typing import NamedTuple from urllib.parse import urlparse LINTER_CODE = "BAZEL_LINTER" SHA256_REGEX = re.compile(r"\s*sha256\s*=\s*['\"](?P[a-zA-Z0-9]{64})['\"]\s*,") DOMAINS_WITH_UNSTABLE_CHECKSUM = {"github.com"} class LintSeverity(str, Enum): ERROR = "error" WARNING = "warning" ADVICE = "advice" DISABLED = "disabled" class LintMessage(NamedTuple): path: str | None line: int | None char: int | None code: str severity: LintSeverity name: str original: str | None replacement: str | None description: str | None def is_required_checksum(urls: list[str | None]) -> bool: if not urls: return False for url in urls: if not url: continue parsed_url = urlparse(url) if parsed_url.hostname in DOMAINS_WITH_UNSTABLE_CHECKSUM: return False return True def get_disallowed_checksums( binary: str, ) -> set[str]: """ Return the set of disallowed checksums from all http_archive rules """ # Use bazel to get the list of external dependencies in XML format proc = subprocess.run( [binary, "query", "kind(http_archive, //external:*)", "--output=xml"], capture_output=True, check=True, text=True, ) root = ET.fromstring(proc.stdout) disallowed_checksums = set() # Parse all the http_archive rules in the XML output for rule in root.findall('.//rule[@class="http_archive"]'): urls_node = rule.find('.//list[@name="urls"]') if urls_node is None: continue urls = [n.get("value") for n in urls_node.findall(".//string")] checksum_node = rule.find('.//string[@name="sha256"]') if checksum_node is None: continue checksum = checksum_node.get("value") if not checksum: continue if not is_required_checksum(urls): disallowed_checksums.add(checksum) return disallowed_checksums def check_bazel( filename: str, disallowed_checksums: set[str], ) -> list[LintMessage]: original = "" replacement = "" with open(filename) as f: for line in f: original += f"{line}" m = SHA256_REGEX.match(line) if m: sha256 = m.group("sha256") if sha256 in disallowed_checksums: continue replacement += f"{line}" if original == replacement: return [] return [ LintMessage( path=filename, line=None, char=None, code=LINTER_CODE, severity=LintSeverity.ADVICE, name="format", original=original, replacement=replacement, description="Found redundant SHA checksums. Run `lintrunner -a` to apply this patch.", ) ] def main() -> None: parser = argparse.ArgumentParser( description="A custom linter to detect redundant SHA checksums in Bazel", fromfile_prefix_chars="@", ) parser.add_argument( "--binary", required=True, help="bazel binary path", ) parser.add_argument( "filenames", nargs="+", help="paths to lint", ) args = parser.parse_args() try: disallowed_checksums = get_disallowed_checksums(args.binary) except subprocess.CalledProcessError as err: err_msg = LintMessage( path=None, line=None, char=None, code=__file__, severity=LintSeverity.ADVICE, name="command-failed", original=None, replacement=None, description=( f"COMMAND (exit code {err.returncode})\n" f"{shlex.join(err.cmd)}\n\n" f"STDERR\n{err.stderr or '(empty)'}\n\n" f"STDOUT\n{err.stdout or '(empty)'}" ), ) print(json.dumps(err_msg._asdict())) return except Exception as e: err_msg = LintMessage( path=None, line=None, char=None, code=LINTER_CODE, severity=LintSeverity.ERROR, name="command-failed", original=None, replacement=None, description=(f"Failed due to {e.__class__.__name__}:\n{e}"), ) print(json.dumps(err_msg._asdict()), flush=True) sys.exit(0) for filename in args.filenames: for lint_message in check_bazel(filename, disallowed_checksums): print(json.dumps(lint_message._asdict()), flush=True) if __name__ == "__main__": main()