• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""
2import os
3import re
4import hashlib
5import json
6import glob
7from pathlib import Path, PurePosixPath, PureWindowsPath
8import subprocess
9import sys
10import urllib.request
11import typing
12
# Directory three levels above this file.
CPYTHON_ROOT_DIR = Path(__file__).parent.parent.parent

# Every entry in this set must be a valid SPDX license expression.
# Check a new value against https://spdx.org/licenses before adding it.
ALLOWED_LICENSE_EXPRESSIONS = {
    "Apache-2.0",
    "Apache-2.0 OR BSD-2-Clause",
    "BSD-2-Clause",
    "BSD-3-Clause",
    "CC0-1.0",
    "ISC",
    "LGPL-2.1-only",
    "MIT",
    "MPL-2.0",
    "Python-2.0.1",
}

# Keys that every package entry must provide for our purposes.
REQUIRED_PROPERTIES_PACKAGE = frozenset({
    "SPDXID",
    "name",
    "versionInfo",
    "downloadLocation",
    "checksums",
    "licenseConcluded",
    "externalRefs",
    "primaryPackagePurpose",
})
42
43
44class PackageFiles(typing.NamedTuple):
45    """Structure for describing the files of a package"""
46    include: list[str] | None
47    exclude: list[str] | None = None
48
49
# An SBOM has no way to record where a file was sourced from, so that
# mapping is kept here, outside of the SBOM itself. Each package lists
# glob patterns for its files in 'include'. Whenever we create a file
# of our own inside one of those tracked directories (i.e. one that
# isn't sourced from the third-party package), add it to 'exclude'.
# An 'exclude' entry only has an effect when it lies under one of the
# package's own 'include' patterns.
PACKAGE_TO_FILES = {
    "mpdecimal": PackageFiles(
        include=["Modules/_decimal/libmpdec/**"]
    ),
    "expat": PackageFiles(
        include=["Modules/expat/**"],
        exclude=[
            "Modules/expat/expat_config.h",
            "Modules/expat/pyexpatns.h",
            "Modules/expat/refresh.sh",
        ]
    ),
    "macholib": PackageFiles(
        include=["Lib/ctypes/macholib/**"],
        exclude=[
            "Lib/ctypes/macholib/README.ctypes",
            "Lib/ctypes/macholib/fetch_macholib",
            "Lib/ctypes/macholib/fetch_macholib.bat",
        ],
    ),
    "libb2": PackageFiles(
        include=["Modules/_blake2/impl/**"]
    ),
    "hacl-star": PackageFiles(
        include=["Modules/_hacl/**"],
        exclude=[
            "Modules/_hacl/refresh.sh",
            "Modules/_hacl/README.md",
            "Modules/_hacl/python_hacl_namespace.h",
        ]
    ),
}
86
87
def spdx_id(value: str) -> str:
    """Replace each run of characters other than ASCII letters, digits, '.' and '-' with a single '-'"""
    disallowed_run = re.compile(r"[^a-zA-Z0-9.\-]+")
    return disallowed_run.sub("-", value)
91
92
def error_if(value: bool, error_message: str) -> None:
    """
    Exit with status 1 when 'value' is true, after printing
    'error_message' followed by a link to the devguide.
    Does nothing when 'value' is false.
    """
    if not value:
        return

    devguide_hint = "See 'https://devguide.python.org/developer-workflow/sbom' for more information."
    for line in (error_message, devguide_hint):
        print(line)
    sys.exit(1)
99
100
def is_root_directory_git_index() -> bool:
    """Returns True when 'git rev-parse' succeeds in the root directory"""
    # All output is discarded, only the exit status matters.
    rev_parse_proc = subprocess.run(
        ["git", "-C", str(CPYTHON_ROOT_DIR), "rev-parse"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=False,
    )
    return rev_parse_proc.returncode == 0
112
113
def filter_gitignored_paths(paths: list[str]) -> list[str]:
    """
    Return the sorted subset of 'paths' which git does not ignore.

    'git check-ignore --non-matching --verbose' prints one line per
    path. A path that is not ignored (and is therefore kept) is
    reported as:

        '::<whitespace><path>'

    while an ignored path is reported together with the pattern
    that matched it:

        '.gitignore:9:*.a    Tools/lib.a'
    """
    # Without any paths there is nothing to ask git about.
    if not paths:
        return []

    check_ignore_proc = subprocess.run(
        ["git", "check-ignore", "--verbose", "--non-matching", *paths],
        cwd=CPYTHON_ROOT_DIR,
        check=False,
        stdout=subprocess.PIPE,
    )
    # Exit status 0 means at least one path is ignored and 1 means
    # that none of them are. Anything else is unexpected.
    assert check_ignore_proc.returncode in (0, 1)

    # Only the lines for non-ignored paths begin with '::'.
    # Paths may or may not be quoted, Windows quotes paths.
    not_ignored_re = re.compile(r"^::\s+(\"([^\"]+)\"|(.+))\Z")

    output_lines = check_ignore_proc.stdout.decode().splitlines()
    line_matches = map(not_ignored_re.fullmatch, output_lines)
    # Group 2 is the quoted form of the path and group 3 the bare form.
    return sorted(m.group(2) or m.group(3) for m in line_matches if m)
151
152
def get_externals() -> list[str]:
    """
    Parses 'PCbuild/get_externals.bat' for external libraries.
    Returns the list of values that 'set libraries=%libraries% <value>'
    lines add, in the order they appear in the file.
    """
    libraries_re = re.compile(
        r"set\s+libraries\s*=\s*%libraries%\s+([a-zA-Z0-9.-]+)\s"
    )
    bat_contents = (CPYTHON_ROOT_DIR / "PCbuild/get_externals.bat").read_text()
    return libraries_re.findall(bat_contents)
164
165
def check_sbom_packages(sbom_data: dict[str, typing.Any]) -> None:
    """
    Make a bunch of assertions about the SBOM package data to ensure it's consistent.

    Every entry of 'sbom_data["packages"]' is checked with 'error_if()',
    so the first failed check prints a message and exits the process.

    Note that this function can modify 'sbom_data': a package without a
    'checksums' field (or every package when the 'CI' environment
    variable is set) has its 'downloadLocation' fetched over the network
    and its 'checksums' field set to the SHA256 of the downloaded bytes.
    """

    for package in sbom_data["packages"]:
        # Properties and ID must be properly formed.
        error_if(
            "name" not in package,
            "Package is missing the 'name' field"
        )

        # Verify that the checksum matches the expected value
        # and that the download URL is valid.
        if "checksums" not in package or "CI" in os.environ:
            # The required-fields check below can only run after the
            # checksum was filled in, so guard this lookup separately
            # instead of failing with a bare KeyError.
            error_if(
                "downloadLocation" not in package,
                f"Package '{package['name']}' is missing the 'downloadLocation' field",
            )
            download_location = package["downloadLocation"]
            # The context manager closes the connection even when
            # 'error_if()' exits.
            with urllib.request.urlopen(download_location) as resp:
                error_if(resp.status != 200, f"Couldn't access URL: {download_location}'")
                download_sha256 = hashlib.sha256(resp.read()).hexdigest()

            package["checksums"] = [{
                "algorithm": "SHA256",
                "checksumValue": download_sha256
            }]

        missing_required_keys = REQUIRED_PROPERTIES_PACKAGE - set(package.keys())
        error_if(
            bool(missing_required_keys),
            f"Package '{package['name']}' is missing required fields: {missing_required_keys}",
        )
        error_if(
            package["SPDXID"] != spdx_id(f"SPDXRef-PACKAGE-{package['name']}"),
            f"Package '{package['name']}' has a malformed SPDXID",
        )

        # Version must be in the download and external references.
        version = package["versionInfo"]
        error_if(
            version not in package["downloadLocation"],
            f"Version '{version}' for package '{package['name']} not in 'downloadLocation' field",
        )
        error_if(
            any(version not in ref["referenceLocator"] for ref in package["externalRefs"]),
            (
                f"Version '{version}' for package '{package['name']} not in "
                f"all 'externalRefs[].referenceLocator' fields"
            ),
        )

        # HACL* specifies its expected rev in a refresh script.
        if package["name"] == "hacl-star":
            hacl_refresh_sh = (CPYTHON_ROOT_DIR / "Modules/_hacl/refresh.sh").read_text()
            hacl_expected_rev_match = re.search(
                r"expected_hacl_star_rev=([0-9a-f]{40})",
                hacl_refresh_sh
            )
            # Stays None when the script has no such line, which then
            # fails the comparison below.
            hacl_expected_rev = hacl_expected_rev_match and hacl_expected_rev_match.group(1)

            error_if(
                hacl_expected_rev != version,
                "HACL* SBOM version doesn't match value in 'Modules/_hacl/refresh.sh'"
            )

        # libexpat specifies its expected rev in a refresh script.
        # NOTE(review): PACKAGE_TO_FILES in this file registers the package
        # as 'expat', not 'libexpat' -- confirm this branch is reachable.
        if package["name"] == "libexpat":
            libexpat_refresh_sh = (CPYTHON_ROOT_DIR / "Modules/expat/refresh.sh").read_text()
            libexpat_expected_version_match = re.search(
                r"expected_libexpat_version=\"([0-9]+\.[0-9]+\.[0-9]+)\"",
                libexpat_refresh_sh
            )
            # A SHA256 hex digest is 64 characters long, and the capture
            # group is what 'group(1)' below retrieves.
            libexpat_expected_sha256_match = re.search(
                r"expected_libexpat_sha256=\"([a-f0-9]{64})\"",
                libexpat_refresh_sh
            )
            libexpat_expected_version = libexpat_expected_version_match and libexpat_expected_version_match.group(1)
            libexpat_expected_sha256 = libexpat_expected_sha256_match and libexpat_expected_sha256_match.group(1)

            error_if(
                libexpat_expected_version != version,
                "libexpat SBOM version doesn't match value in 'Modules/expat/refresh.sh'"
            )
            error_if(
                package["checksums"] != [{
                    "algorithm": "SHA256",
                    "checksumValue": libexpat_expected_sha256
                }],
                "libexpat SBOM checksum doesn't match value in 'Modules/expat/refresh.sh'"
            )

        # The concluded license of every package must be 'NOASSERTION'.
        license_concluded = package["licenseConcluded"]
        error_if(
            license_concluded != "NOASSERTION",
            "License identifier must be 'NOASSERTION'"
        )
258
259
def create_source_sbom() -> None:
    """
    Regenerate the 'files' and 'relationships' sections of
    'Misc/sbom.spdx.json' from PACKAGE_TO_FILES and write the
    result back to disk. Package entries are validated but are
    otherwise kept as found in the file.
    """
    sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
    sbom_data = json.loads(sbom_path.read_bytes())

    # Both of these lists are rebuilt from scratch below. The package
    # information is edited by humans and therefore preserved.
    sbom_files = sbom_data["files"] = []
    sbom_relationships = sbom_data["relationships"] = []

    # The packages known to this tool and to the SBOM file must agree.
    actual_names = {package["name"] for package in sbom_data["packages"]}
    expected_names = set(PACKAGE_TO_FILES)
    error_if(
        actual_names != expected_names,
        f"Packages defined in SBOM tool don't match those defined in SBOM file: {actual_names}, {expected_names}",
    )

    check_sbom_packages(sbom_data)

    # Everything is iterated in sorted order so that the output doesn't
    # depend on the order in which the filesystem is scanned.
    for name in sorted(PACKAGE_TO_FILES):
        files = PACKAGE_TO_FILES[name]
        package_spdx_id = spdx_id(f"SPDXRef-PACKAGE-{name}")
        excluded_paths = files.exclude or ()

        for include in sorted(files.include or ()):
            # Expand the pattern, then drop whatever .gitignore excludes.
            globbed_paths = glob.glob(include, root_dir=CPYTHON_ROOT_DIR, recursive=True)
            kept_paths = filter_gitignored_paths(globbed_paths)
            error_if(
                not kept_paths,
                f"No valid paths found at path '{include}' for package '{name}",
            )

            for kept_path in kept_paths:
                # Whatever mix of slashes came back, use forward slashes.
                path = str(PurePosixPath(PureWindowsPath(kept_path)))
                path_on_disk = CPYTHON_ROOT_DIR / path

                # Only files which aren't explicitly excluded are recorded.
                if path in excluded_paths or not path_on_disk.is_file():
                    continue

                data = path_on_disk.read_bytes()
                # Line endings are normalized so that checksums are
                # consistent. A NUL byte is used as a rudimentary sign
                # of a binary file, which is left untouched.
                if b"\x00" not in data:
                    data = data.replace(b"\r\n", b"\n")

                # SPDX requires SHA1 to be used for files, but we provide SHA256 too.
                checksums = [
                    {"algorithm": "SHA1", "checksumValue": hashlib.sha1(data).hexdigest()},
                    {"algorithm": "SHA256", "checksumValue": hashlib.sha256(data).hexdigest()},
                ]

                file_spdx_id = spdx_id(f"SPDXRef-FILE-{path}")
                sbom_files.append({
                    "SPDXID": file_spdx_id,
                    "fileName": path,
                    "checksums": checksums,
                })

                # Record that the package contains this file.
                sbom_relationships.append({
                    "spdxElementId": package_spdx_id,
                    "relatedSpdxElement": file_spdx_id,
                    "relationshipType": "CONTAINS",
                })

    # Update the SBOM on disk
    serialized_sbom = json.dumps(sbom_data, indent=2, sort_keys=True)
    sbom_path.write_text(serialized_sbom)
329
330
def create_externals_sbom() -> None:
    """
    Update 'Misc/externals.spdx.json' so that the version, CPE
    references and download location of every package follow the
    git tags listed in 'PCbuild/get_externals.bat', then validate
    the packages and write the file back to disk.
    """
    sbom_path = CPYTHON_ROOT_DIR / "Misc/externals.spdx.json"
    sbom_data = json.loads(sbom_path.read_bytes())

    # A git tag has the form '<name>-<version>', split at the last '-'.
    git_tag_by_name = {}
    for git_tag in get_externals():
        git_tag_by_name[git_tag.rpartition("-")[0]] = git_tag

    # The packages known to this tool and to the SBOM file must agree.
    actual_names = {package["name"] for package in sbom_data["packages"]}
    expected_names = set(git_tag_by_name)
    error_if(
        actual_names != expected_names,
        f"Packages defined in SBOM tool don't match those defined in SBOM file: {actual_names}, {expected_names}",
    )

    # Set the versionInfo and downloadLocation fields for all packages.
    for package in sbom_data["packages"]:
        package_git_tag = git_tag_by_name[package["name"]]
        package_version = package_git_tag.rpartition("-")[2]

        # The version is recorded directly and inside of each CPE reference.
        package["versionInfo"] = package_version
        for external_ref in package["externalRefs"]:
            if external_ref["referenceType"] == "cpe23Type":
                # Four fields follow the 'cpe' prefix, then comes the version.
                external_ref["referenceLocator"] = re.sub(
                    r"\A(cpe(?::[^:]+){4}):[^:]+:",
                    fr"\1:{package_version}:",
                    external_ref["referenceLocator"],
                )

        download_location = (
            f"https://github.com/python/cpython-source-deps/archive/refs/tags/{package_git_tag}.tar.gz"
        )
        # A changed download URL invalidates the recorded checksum.
        # Removing it here causes a new one to get recalculated.
        if package["downloadLocation"] != download_location:
            package.pop("checksums", None)
        package["downloadLocation"] = download_location

    check_sbom_packages(sbom_data)

    # Update the SBOM on disk
    serialized_sbom = json.dumps(sbom_data, indent=2, sort_keys=True)
    sbom_path.write_text(serialized_sbom)
382
383
def main() -> None:
    """Regenerate both SBOM files, unless the root directory isn't a git repository."""
    if is_root_directory_git_index():
        create_source_sbom()
        create_externals_sbom()
    else:
        # The SBOMs are only regenerated from within a git repository.
        print("Skipping SBOM generation due to not being a git repository")
392
393
# Run the tool only when this file is executed as a script.
if __name__ == "__main__":
    main()
396