1"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies""" 2import os 3import re 4import hashlib 5import json 6import glob 7from pathlib import Path, PurePosixPath, PureWindowsPath 8import subprocess 9import sys 10import urllib.request 11import typing 12 13CPYTHON_ROOT_DIR = Path(__file__).parent.parent.parent 14 15# Before adding a new entry to this list, double check that 16# the license expression is a valid SPDX license expression: 17# See: https://spdx.org/licenses 18ALLOWED_LICENSE_EXPRESSIONS = { 19 "Apache-2.0", 20 "Apache-2.0 OR BSD-2-Clause", 21 "BSD-2-Clause", 22 "BSD-3-Clause", 23 "CC0-1.0", 24 "ISC", 25 "LGPL-2.1-only", 26 "MIT", 27 "MPL-2.0", 28 "Python-2.0.1", 29} 30 31# Properties which are required for our purposes. 32REQUIRED_PROPERTIES_PACKAGE = frozenset([ 33 "SPDXID", 34 "name", 35 "versionInfo", 36 "downloadLocation", 37 "checksums", 38 "licenseConcluded", 39 "externalRefs", 40 "primaryPackagePurpose", 41]) 42 43 44class PackageFiles(typing.NamedTuple): 45 """Structure for describing the files of a package""" 46 include: list[str] | None 47 exclude: list[str] | None = None 48 49 50# SBOMS don't have a method to specify the sources of files 51# so we need to do that external to the SBOM itself. Add new 52# values to 'exclude' if we create new files within tracked 53# directories that aren't sourced from third-party packages. 
PACKAGE_TO_FILES = {
    "mpdecimal": PackageFiles(
        include=["Modules/_decimal/libmpdec/**"]
    ),
    "expat": PackageFiles(
        include=["Modules/expat/**"],
        exclude=[
            "Modules/expat/expat_config.h",
            "Modules/expat/pyexpatns.h",
            # The refresh script lives next to the vendored sources but is
            # not one of them (same treatment as the hacl-star entry below).
            "Modules/expat/refresh.sh",
        ]
    ),
    "macholib": PackageFiles(
        include=["Lib/ctypes/macholib/**"],
        exclude=[
            "Lib/ctypes/macholib/README.ctypes",
            "Lib/ctypes/macholib/fetch_macholib",
            "Lib/ctypes/macholib/fetch_macholib.bat",
        ],
    ),
    "libb2": PackageFiles(
        include=["Modules/_blake2/impl/**"]
    ),
    "hacl-star": PackageFiles(
        include=["Modules/_hacl/**"],
        exclude=[
            "Modules/_hacl/refresh.sh",
            "Modules/_hacl/README.md",
            "Modules/_hacl/python_hacl_namespace.h",
        ]
    ),
}


def spdx_id(value: str) -> str:
    """Encode a value into characters that are valid in an SPDX ID

    Every run of characters other than ASCII letters, digits, '.' and '-'
    is collapsed into a single '-'.
    """
    return re.sub(r"[^a-zA-Z0-9.\-]+", "-", value)


def error_if(value: bool, error_message: str) -> None:
    """Prints an error if a comparison fails along with a link to the devguide

    When 'value' is true the message is printed and the process exits
    with status 1; otherwise nothing happens.
    """
    if value:
        print(error_message)
        print("See 'https://devguide.python.org/developer-workflow/sbom' for more information.")
        sys.exit(1)


def is_root_directory_git_index() -> bool:
    """Checks if the root directory is a git index

    Returns False both when 'git rev-parse' fails and when the 'git'
    executable cannot be started at all (e.g. it isn't installed).
    """
    try:
        subprocess.check_call(
            ["git", "-C", str(CPYTHON_ROOT_DIR), "rev-parse"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing or non-executable 'git' binary.
        return False
    return True


def filter_gitignored_paths(paths: list[str]) -> list[str]:
    """
    Filter out paths excluded by the gitignore file and return the
    remaining paths sorted.

    The output of 'git check-ignore --non-matching --verbose' looks
    like this for non-matching (included) files:

        '::<whitespace><path>'

    And looks like this for matching (excluded) files:

        '.gitignore:9:*.a    Tools/lib.a'
    """
    # No paths means no filtering to be done.
    if not paths:
        return []

    # Non-matching files show up as '::<whitespace><path>'
    git_check_ignore_proc = subprocess.run(
        ["git", "check-ignore", "--verbose", "--non-matching", *paths],
        cwd=CPYTHON_ROOT_DIR,
        check=False,
        stdout=subprocess.PIPE,
    )
    # 1 means matches, 0 means no matches. Anything else is a git failure.
    # Checked explicitly (not via 'assert') so it also runs under 'python -O'.
    error_if(
        git_check_ignore_proc.returncode not in (0, 1),
        f"'git check-ignore' failed with exit code {git_check_ignore_proc.returncode}",
    )

    # Paths may or may not be quoted, Windows quotes paths.
    git_check_ignore_re = re.compile(r"^::\s+(\"([^\"]+)\"|(.+))\Z")

    # Group 2 is the quoted form of the path, group 3 the unquoted one.
    git_check_ignore_lines = git_check_ignore_proc.stdout.decode().splitlines()
    return sorted(
        match.group(2) or match.group(3)
        for line in git_check_ignore_lines
        if (match := git_check_ignore_re.fullmatch(line))
    )


def get_externals() -> list[str]:
    """
    Parses 'PCbuild/get_externals.bat' for external libraries.
    Returns the list of git tags (e.g. '<name>-<version>') that the
    script appends to its 'libraries' variable.
    """
    get_externals_bat_path = CPYTHON_ROOT_DIR / "PCbuild/get_externals.bat"
    externals = re.findall(
        r"set\s+libraries\s*=\s*%libraries%\s+([a-zA-Z0-9.-]+)\s",
        get_externals_bat_path.read_text()
    )
    return externals


def check_sbom_packages(sbom_data: dict[str, typing.Any]) -> None:
    """Make a bunch of assertions about the SBOM package data to ensure it's consistent.

    Any failed check prints a message and exits the process (see 'error_if').
    As a side effect, a package's 'checksums' entry is (re)computed from its
    'downloadLocation' when it is missing or when running in CI.
    """

    for package in sbom_data["packages"]:
        # Properties and ID must be properly formed.
        error_if(
            "name" not in package,
            "Package is missing the 'name' field"
        )
        name = package["name"]

        # 'checksums' is allowed to be absent at this point because it is
        # filled in below; every other required field must already exist so
        # the lookups that follow can't fail with a bare KeyError.
        missing_required_keys = (
            REQUIRED_PROPERTIES_PACKAGE - set(package.keys()) - {"checksums"}
        )
        error_if(
            bool(missing_required_keys),
            f"Package '{name}' is missing required fields: {missing_required_keys}",
        )

        # Verify that the checksum matches the expected value
        # and that the download URL is valid.
        if "checksums" not in package or "CI" in os.environ:
            download_location = package["downloadLocation"]
            # The context manager closes the connection even if we exit early.
            with urllib.request.urlopen(download_location) as resp:
                error_if(resp.status != 200, f"Couldn't access URL: '{download_location}'")
                downloaded = resp.read()

            package["checksums"] = [{
                "algorithm": "SHA256",
                "checksumValue": hashlib.sha256(downloaded).hexdigest()
            }]

        error_if(
            package["SPDXID"] != spdx_id(f"SPDXRef-PACKAGE-{name}"),
            f"Package '{name}' has a malformed SPDXID",
        )

        # Version must be in the download and external references.
        version = package["versionInfo"]
        error_if(
            version not in package["downloadLocation"],
            f"Version '{version}' for package '{name}' not in 'downloadLocation' field",
        )
        error_if(
            any(version not in ref["referenceLocator"] for ref in package["externalRefs"]),
            (
                f"Version '{version}' for package '{name}' not in "
                f"all 'externalRefs[].referenceLocator' fields"
            ),
        )

        # HACL* specifies its expected rev in a refresh script.
        if name == "hacl-star":
            hacl_refresh_sh = (CPYTHON_ROOT_DIR / "Modules/_hacl/refresh.sh").read_text()
            hacl_expected_rev_match = re.search(
                r"expected_hacl_star_rev=([0-9a-f]{40})",
                hacl_refresh_sh
            )
            hacl_expected_rev = (
                hacl_expected_rev_match.group(1) if hacl_expected_rev_match else None
            )

            error_if(
                hacl_expected_rev != version,
                "HACL* SBOM version doesn't match value in 'Modules/_hacl/refresh.sh'"
            )

        # libexpat specifies its expected version and checksum in a refresh
        # script. The package is tracked as 'expat' in PACKAGE_TO_FILES;
        # 'libexpat' is accepted too for backwards compatibility.
        if name in ("expat", "libexpat"):
            libexpat_refresh_sh = (CPYTHON_ROOT_DIR / "Modules/expat/refresh.sh").read_text()
            libexpat_expected_version_match = re.search(
                r"expected_libexpat_version=\"([0-9]+\.[0-9]+\.[0-9]+)\"",
                libexpat_refresh_sh
            )
            # A SHA-256 hex digest is 64 characters long; the capture group
            # is what '.group(1)' below reads.
            libexpat_expected_sha256_match = re.search(
                r"expected_libexpat_sha256=\"([a-f0-9]{64})\"",
                libexpat_refresh_sh
            )
            libexpat_expected_version = (
                libexpat_expected_version_match.group(1)
                if libexpat_expected_version_match else None
            )
            libexpat_expected_sha256 = (
                libexpat_expected_sha256_match.group(1)
                if libexpat_expected_sha256_match else None
            )

            error_if(
                libexpat_expected_version != version,
                "libexpat SBOM version doesn't match value in 'Modules/expat/refresh.sh'"
            )
            error_if(
                package["checksums"] != [{
                    "algorithm": "SHA256",
                    "checksumValue": libexpat_expected_sha256
                }],
                "libexpat SBOM checksum doesn't match value in 'Modules/expat/refresh.sh'"
            )

        # The concluded license is required to be left as 'NOASSERTION'.
        license_concluded = package["licenseConcluded"]
        error_if(
            license_concluded != "NOASSERTION",
            "License identifier must be 'NOASSERTION'"
        )


def create_source_sbom() -> None:
    """Regenerate the 'files' and 'relationships' of 'Misc/sbom.spdx.json'.

    Package entries are validated but otherwise preserved; one file entry
    (with SHA1 and SHA256 checksums) and one CONTAINS relationship is emitted
    for every tracked, non-excluded file listed in PACKAGE_TO_FILES.
    """
    sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
    sbom_data = json.loads(sbom_path.read_bytes())

    # We regenerate all of this information. Package information
    # should be preserved though since that is edited by humans.
    sbom_data["files"] = []
    sbom_data["relationships"] = []

    # Ensure all packages in this tool are represented also in the SBOM file.
    actual_names = {package["name"] for package in sbom_data["packages"]}
    expected_names = set(PACKAGE_TO_FILES)
    error_if(
        actual_names != expected_names,
        f"Packages defined in SBOM tool don't match those defined in SBOM file: {actual_names}, {expected_names}",
    )

    check_sbom_packages(sbom_data)

    # We call 'sorted()' here a lot to avoid filesystem scan order issues.
    for name, files in sorted(PACKAGE_TO_FILES.items()):
        package_spdx_id = spdx_id(f"SPDXRef-PACKAGE-{name}")
        exclude = frozenset(files.exclude or ())
        for include in sorted(files.include or ()):
            # Find all the paths and then filter them through .gitignore.
            paths = glob.glob(include, root_dir=CPYTHON_ROOT_DIR, recursive=True)
            paths = filter_gitignored_paths(paths)
            error_if(
                not paths,
                f"No valid paths found at path '{include}' for package '{name}'",
            )

            for path in paths:

                # Normalize the filename from any combination of slashes.
                path = str(PurePosixPath(PureWindowsPath(path)))

                # Skip directories and excluded files
                if not (CPYTHON_ROOT_DIR / path).is_file() or path in exclude:
                    continue

                # SPDX requires SHA1 to be used for files, but we provide SHA256 too.
                data = (CPYTHON_ROOT_DIR / path).read_bytes()
                # We normalize line-endings for consistent checksums.
                # The NUL-byte test is a rudimentary check for binary files,
                # which are hashed untouched.
                if b"\x00" not in data:
                    data = data.replace(b"\r\n", b"\n")
                checksum_sha1 = hashlib.sha1(data).hexdigest()
                checksum_sha256 = hashlib.sha256(data).hexdigest()

                file_spdx_id = spdx_id(f"SPDXRef-FILE-{path}")
                sbom_data["files"].append({
                    "SPDXID": file_spdx_id,
                    "fileName": path,
                    "checksums": [
                        {"algorithm": "SHA1", "checksumValue": checksum_sha1},
                        {"algorithm": "SHA256", "checksumValue": checksum_sha256},
                    ],
                })

                # Tie each file back to its respective package.
                sbom_data["relationships"].append({
                    "spdxElementId": package_spdx_id,
                    "relatedSpdxElement": file_spdx_id,
                    "relationshipType": "CONTAINS",
                })

    # Update the SBOM on disk
    sbom_path.write_text(json.dumps(sbom_data, indent=2, sort_keys=True))


def create_externals_sbom() -> None:
    """Refresh versions and download locations in 'Misc/externals.spdx.json'.

    Versions come from the git tags listed in 'PCbuild/get_externals.bat'.
    A package whose download URL changed has its checksum dropped so that
    'check_sbom_packages' recomputes it.
    """
    sbom_path = CPYTHON_ROOT_DIR / "Misc/externals.spdx.json"
    sbom_data = json.loads(sbom_path.read_bytes())

    externals = get_externals()
    externals_name_to_version = {}
    externals_name_to_git_tag = {}
    for git_tag in externals:
        # Tags look like '<name>-<version>'; split on the last dash.
        name, _, version = git_tag.rpartition("-")
        externals_name_to_version[name] = version
        externals_name_to_git_tag[name] = git_tag

    # Ensure all packages in this tool are represented also in the SBOM file.
    actual_names = {package["name"] for package in sbom_data["packages"]}
    expected_names = set(externals_name_to_version)
    error_if(
        actual_names != expected_names,
        f"Packages defined in SBOM tool don't match those defined in SBOM file: {actual_names}, {expected_names}",
    )

    # Set the versionInfo and downloadLocation fields for all packages.
    for package in sbom_data["packages"]:
        package_name = package["name"]
        package_version = externals_name_to_version[package_name]

        # Update the version information in all the locations.
        package["versionInfo"] = package_version
        for external_ref in package.get("externalRefs", ()):
            if external_ref["referenceType"] != "cpe23Type":
                continue
            # Version is the fifth field of a CPE.
            cpe23ref = external_ref["referenceLocator"]
            external_ref["referenceLocator"] = re.sub(
                r"\A(cpe(?::[^:]+){4}):[^:]+:",
                fr"\1:{package_version}:",
                cpe23ref
            )

        download_location = (
            f"https://github.com/python/cpython-source-deps/archive/refs/tags/{externals_name_to_git_tag[package_name]}.tar.gz"
        )
        # A package without a previous location counts as changed.
        download_location_changed = download_location != package.get("downloadLocation")
        package["downloadLocation"] = download_location

        # If the download URL has changed we want the checksum to get recalculated.
        if download_location_changed:
            package.pop("checksums", None)

    check_sbom_packages(sbom_data)

    # Update the SBOM on disk
    sbom_path.write_text(json.dumps(sbom_data, indent=2, sort_keys=True))


def main() -> None:
    """Regenerate both SBOM files, unless we're outside of a git repository."""
    # Don't regenerate the SBOM if we're not a git repository.
    if not is_root_directory_git_index():
        print("Skipping SBOM generation due to not being a git repository")
        return

    create_source_sbom()
    create_externals_sbom()


if __name__ == "__main__":
    main()