#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright (c) 2025 Northeastern University
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from argparse import ArgumentParser
from pathlib import Path
from typing import Dict, List, Union, Set, Optional

from ohos.sbom.analysis.depend_graph import DependGraphAnalyzer
from ohos.sbom.analysis.file_dependency import FileDependencyAnalyzer
from ohos.sbom.analysis.install_module import InstallModuleAnalyzer
from ohos.sbom.analysis.project_dependency import ProjectDependencyAnalyzer
from ohos.sbom.common.utils import generate_purl, get_purl_type_from_url, commit_url_of
from ohos.sbom.data.file_dependence import File
from ohos.sbom.data.manifest import Project
from ohos.sbom.data.opensource import OpenSource
from ohos.sbom.extraction.copyright_and_license_scanner import LicenseFileScanner, FileScanner
from ohos.sbom.extraction.local_resource_loader import LocalResourceLoader
from ohos.sbom.sbom.builder.file_builder import FileBuilder
from ohos.sbom.sbom.builder.package_builder import PackageBuilder
from ohos.sbom.sbom.builder.relationship_builder import RelationshipBuilder
from ohos.sbom.sbom.builder.sbom_meta_data_builder import SBOMMetaDataBuilder
from ohos.sbom.sbom.metadata.sbom_meta_data import RelationshipType, SBOMMetaData, NOASSERTION


class SBOMGenerator:
    """
    Assemble SBOM metadata (files, packages, relationships, document) for a build.

    Construction is eager: ``__init__`` loads the manifest and the gn-generated
    JSON, analyses which targets are installed, builds the file dependency data
    and prepares the filtered file set. Call :meth:`build_sbom` afterwards to
    obtain the final ``SBOMMetaData``.
    """

    def __init__(self, args: ArgumentParser):
        """
        Create the generator and run all analysis needed before SBOM building.

        Args:
            args: Parsed command-line options. This class reads ``args.product``
                and ``args.platform``.
        """
        # NOTE(review): the annotation says ArgumentParser, but only attribute
        # access (`.product`, `.platform`) is performed, which looks like a
        # parsed argparse.Namespace -- confirm against the caller.
        self.args = args
        self.source_ninja_json = None
        self.manifest = None
        self.file_dependence_analyzer = None
        # target name -> destinations on the image
        self._install_target_name_dest_map: Dict[str, List[str]] = {}
        # file path -> file id used as reference in the SBOM
        self._file_ref_map: Dict[str, str] = {}
        # file path -> File kept for the SBOM
        self._file_dep_filter: Dict[str, File] = {}
        # bom-refs (purls) of packages already handed to the SBOM builder;
        # used to avoid adding the same open source package more than once.
        self._registered_package_refs: Set[str] = set()
        self.sbom_builder: SBOMMetaDataBuilder = SBOMMetaDataBuilder()
        self.license_scanner = LicenseFileScanner()
        self.file_scanner = FileScanner()
        self.init()
        self.build_filtered_files()

    def init(self):
        """Load input resources and build the install / dependency analyzers."""
        print("Initializing SBOM generator...")
        print("Initializing [1/4]: Loading Manifest and gn-generated JSON ...")
        self.source_ninja_json = LocalResourceLoader.load_ninja_json()
        self.manifest = LocalResourceLoader.load_manifest()
        print("Initializing [2/4]: Determining whether Targets are installed to the image ...")
        install_module_analyzer = InstallModuleAnalyzer(self.source_ninja_json)
        self._install_target_name_dest_map = install_module_analyzer.get_install_with_dest()
        print("Initializing [3/4]: Building Target dependency network ...")
        depend_graph_analyzer = DependGraphAnalyzer(self.source_ninja_json)
        print("Initializing [4/4]: Building dependencies for files installed on the image ...")
        self.file_dependence_analyzer = FileDependencyAnalyzer(depend_graph_analyzer)
        self.file_dependence_analyzer.build_all_install_deps_optimized(self._install_target_name_dest_map.keys())

    @staticmethod
    def _is_intermediate_path(path: str) -> bool:
        """
        Return True for paths that are treated as intermediate files.

        A path is intermediate when it does not start with "//" and still
        contains a "/" after its first two characters. Such files are not
        emitted themselves; their dependencies are substituted for them.
        """
        return not path.startswith("//") and "/" in path[2:]

    def build_filtered_files(self):
        """
        Populate ``self._file_dep_filter`` and ``self._file_ref_map``.

        Every file reported by the file dependency analyzer gets an entry in
        ``self._file_ref_map`` (path -> file id, where the id is the path).
        Only non-intermediate files (paths starting with "//", or paths with no
        "/" after the first two characters) are kept in
        ``self._file_dep_filter``. In the kept files, each dependency that is an
        intermediate file is replaced by that intermediate file's own
        dependencies (one level of substitution). Finally the files installed
        on the image are added via ``_add_install_dest_file``.
        """
        raw_files = self.file_dependence_analyzer.get_file_dependencies()

        self._file_dep_filter.clear()
        self._file_ref_map.clear()

        # intermediate file path -> union of its dependencies over all relationship types
        map_source_file: Dict[str, Set[File]] = {}
        for path, file in raw_files.items():
            self._file_ref_map[path] = path
            if self._is_intermediate_path(path):
                map_source_file[path] = {
                    item
                    for dep_set in file.get_dependencies().values()
                    for item in dep_set
                }

        for path, file in raw_files.items():
            if self._is_intermediate_path(path):
                continue
            for relationship_type in RelationshipType:
                new_deps = set()
                # Substitute intermediate files with original source references
                for dep in file.get_dependencies(relationship_type):
                    new_deps.update(map_source_file.get(dep.relative_path, {dep}))
                file.set_dependencies(relationship_type, new_deps)
            self._file_dep_filter[path] = file

        self._add_install_dest_file()

        print(f"Filtered {len(self._file_dep_filter)} files starting with '//'")
        print(f"Built file_ref_map with {len(self._file_ref_map)} entries")

    def build_file_information(self):
        """
        Add one file entry per kept file, then one relationship per
        (file, relationship type) pair that has dependencies.

        Raises:
            KeyError: If a file or one of its dependencies has no entry in
                ``self._file_ref_map``.
        """
        all_files = self._file_dep_filter

        for file_path in all_files:
            file_id = self._file_ref_map[file_path]
            file_scanner_ret = self._extract_scanner_info(file_path)
            file = (FileBuilder()
                    .with_file_name(os.path.basename(file_path))
                    .with_file_id(file_id)
                    .with_file_author(file_scanner_ret["fileAuthor"])
                    .with_copyright_text(file_scanner_ret["copyright_text"])
                    )

            self.sbom_builder.add_file(file)

        for file_path, file_obj in all_files.items():
            # Same for every relationship type, so resolved once per file.
            file_id = self._file_ref_map[file_path]
            for relationship_type in RelationshipType:
                dependencies = file_obj.get_dependencies(relationship_type)
                if not dependencies:
                    continue
                dep_file_id_list = [self._file_ref_map[dep.relative_path] for dep in dependencies]
                relationship_builder = (RelationshipBuilder().with_relationship_type(relationship_type)
                                        .with_bom_ref(file_id)
                                        .with_depends_on(dep_file_id_list)
                                        )
                self.sbom_builder.add_relationship(relationship_builder)

    def build_package_information(self):
        """Build package information and dependencies for SBOM generation."""
        pdb = ProjectDependencyAnalyzer()
        pdb.build(list(self._file_dep_filter.values()))
        all_project_dependence = pdb.get_project_dependence()

        project_bom_refs = self._build_main_packages(all_project_dependence)
        self._build_dependencies(all_project_dependence, project_bom_refs)

    def build_document_information(self):
        """Name the SBOM document "<product>-<manifest default revision>"."""
        doc_builder = self.sbom_builder.start_document()
        doc_builder.with_name(f"{self.args.product}-{self.manifest.default['revision']}").end()

    def build_sbom(self) -> SBOMMetaData:
        """
        Run the file, package and document build steps and return the result.

        Returns:
            The ``SBOMMetaData`` produced by the SBOM builder (built with
            ``validate=False``).
        """
        print("Building file information...")
        self.build_file_information()
        print("Building package information...")
        self.build_package_information()
        print("Building document information...")
        self.build_document_information()
        print("Generating final SBOM metadata ...")
        sbom_meta_data = self.sbom_builder.build(validate=False)
        print("Generation completed:")
        print("• Packages:", len(sbom_meta_data.packages))
        print("• Files:", len(sbom_meta_data.files))
        print("• Relationships:", len(sbom_meta_data.relationships))
        return sbom_meta_data

    def _get_file_reference(self, dep: File) -> Optional[str]:
        """Return the file id registered for ``dep``, or None if it is unknown."""
        return self._file_ref_map.get(dep.relative_path)

    def _get_project_license(self, source_project) -> str:
        """
        Return the license type of the first license scan result for a project.

        Falls back to NOASSERTION when the scanner returns nothing for
        ``source_project.path``.
        """
        license_scanner_ret = self.license_scanner.scan(source_project.path)
        # Truthiness also covers a None result, which len() would reject.
        if license_scanner_ret:
            return license_scanner_ret[0]["license_type"]
        return NOASSERTION

    def _add_relationship(self, source_bom_ref: str, depends_on_refs: List[str], rel_type: RelationshipType) -> None:
        """Add a relationship from ``source_bom_ref`` to ``depends_on_refs`` to the SBOM builder."""
        rb = (RelationshipBuilder()
              .with_bom_ref(source_bom_ref)
              .with_depends_on(depends_on_refs)
              .with_relationship_type(rel_type))
        self.sbom_builder.add_relationship(rb)

    def _add_install_dest_file(self):
        """
        Register every install destination as a file that is a COPY_OF its source.

        For each installed target with known files, only files flagged
        ``is_stripped`` are considered as sources. A single candidate is linked
        directly; otherwise candidates are matched by base name.
        """
        target_name_map_file = self.file_dependence_analyzer.get_target_name_map_file()

        for target_name, dest_list in self._install_target_name_dest_map.items():
            file_list = target_name_map_file.get(target_name, [])
            if not file_list:
                continue

            stripped_file_list = [f for f in file_list if f.is_stripped]
            for dest in dest_list:
                dest_file = File(dest, None)
                if len(stripped_file_list) == 1:
                    dest_file.add_dependency(RelationshipType.COPY_OF, stripped_file_list[0])
                else:
                    self._handle_multiple_sources(dest, stripped_file_list, dest_file)
                self._file_dep_filter[dest] = dest_file
                self._file_ref_map[dest] = dest

    def _handle_multiple_sources(self, dest: str, stripped_file_list, dest_file):
        """
        Link ``dest_file`` to every candidate whose base name equals that of ``dest``.

        Prints a warning when no candidate matches (this includes the case of
        an empty candidate list).
        """
        dest_filename = os.path.basename(dest)
        matched_files = [
            f for f in stripped_file_list
            if os.path.basename(f.relative_path) == dest_filename
        ]

        if not matched_files:
            print(f"Warning: Files on the image failed to match with Targets: {dest}")
            return

        for src_file in matched_files:
            dest_file.add_dependency(RelationshipType.COPY_OF, src_file)

    def _extract_scanner_info(self, file_path: Union[str, Path]) -> Dict:
        """
        Extract license and copyright information from the scan result of a file.

        Args:
            file_path (str or Path): Path to the file being scanned.

        Returns:
            Dict: A dictionary containing:
                - concluded_license: The primary license (first in list) or NOASSERTION
                - license_info_in_files: List of detected licenses or [NOASSERTION]
                - copyright_text: First non-empty copyright statement, or NOASSERTION
                - fileAuthor: Comma-separated, sorted list of filtered holders, or NOASSERTION
        """
        # Scan the file and extract results; a key that is present but None is
        # treated like a missing key.
        ret = self.file_scanner.scan(file_path)
        licenses = ret.get("licenses") or []
        copyrights = ret.get("copyrights") or []

        authors = set()
        copyright_text = NOASSERTION

        # Define filtering rules for invalid holder patterns
        invalid_prefixes = ('by ', 'copyright', 'all rights', 'distributed', 'licensed')
        min_len, max_len = 2, 50

        # Process each copyright entry
        for cp in copyrights:
            if not isinstance(cp, dict):
                continue

            # Extract copyright statement (use first non-empty one)
            if copyright_text == NOASSERTION:
                statement = cp.get("statement")
                if statement:
                    copyright_text = statement

            # Extract and normalize holder; `or ""` guards against an explicit None value
            holder = (cp.get("holder") or "").strip()
            if not holder:
                continue

            # Split comma-separated holders and clean them
            holders = [h.strip() for h in holder.split(",") if h.strip()]
            authors.update(self._filter_holders(holders, invalid_prefixes, min_len, max_len))

        # Format final author string
        file_author = ", ".join(sorted(authors)) if authors else NOASSERTION

        return {
            "concluded_license": licenses[0] if licenses else NOASSERTION,
            "license_info_in_files": licenses or [NOASSERTION],
            "copyright_text": copyright_text,
            "fileAuthor": file_author
        }

    def _filter_holders(self, holders: List[str], invalid_prefixes: tuple, min_len: int, max_len: int) -> List[str]:
        """
        Filter out invalid holders based on prefix, length, and format.

        A holder is dropped when its lower-cased form starts with one of
        ``invalid_prefixes``, when its length is outside
        ``[min_len, max_len]``, or when it contains more than two dots.
        Order of the remaining holders is preserved.
        """
        return [
            h for h in holders
            if not h.lower().startswith(invalid_prefixes)
            and min_len <= len(h) <= max_len
            and h.count('.') <= 2
        ]

    def _build_dependencies(self, all_project_dependence: Dict, project_bom_refs: Dict) -> None:
        """
        Build dependency relationships for all projects.

        Projects without a bom-ref in ``project_bom_refs`` are skipped, as are
        relationship types that resolve to no references.
        """
        for project_dependence in all_project_dependence.values():
            source_bom_ref = project_bom_refs.get(project_dependence.source_project.name)
            if not source_bom_ref:
                continue

            for rel_type in RelationshipType:
                dependencies = project_dependence.get_dependencies(rel_type)
                if not dependencies:
                    continue

                depends_on_refs = self._process_dependencies(dependencies, project_bom_refs)
                if depends_on_refs:
                    self._add_relationship(source_bom_ref, depends_on_refs, rel_type)

    def _process_dependencies(self, dependencies: List, project_bom_refs: Dict) -> List[str]:
        """
        Process dependencies and return list of bom_refs.

        ``Project`` dependencies resolve through ``project_bom_refs`` (unknown
        projects are skipped), ``OpenSource`` dependencies are registered as
        packages on demand, other objects with a known ``name`` resolve through
        ``project_bom_refs``, and ``File`` dependencies resolve through the
        file reference map.
        """
        depends_on_refs = []

        for dep in dependencies:
            if isinstance(dep, Project):
                if dep.name in project_bom_refs:
                    depends_on_refs.append(project_bom_refs[dep.name])
            elif isinstance(dep, OpenSource):
                depends_on_refs.append(self._process_opensource_dependency(dep, project_bom_refs))
            elif hasattr(dep, "name") and dep.name in project_bom_refs:
                depends_on_refs.append(project_bom_refs[dep.name])
            elif isinstance(dep, File):
                file_ref = self._get_file_reference(dep)
                if file_ref:
                    depends_on_refs.append(file_ref)

        return depends_on_refs

    def _build_main_packages(self, all_project_dependence: Dict) -> Dict:
        """
        Build main package information and return bom_refs mapping.

        Returns:
            Dict mapping each source project's name to its purl, which is also
            used as the package's bom-ref.
        """
        project_bom_refs = {}
        package_version = self.manifest.default["revision"]

        for name, project_dependence in all_project_dependence.items():
            source_project = project_dependence.source_project
            purl = self.manifest.purl_of(source_project)
            project_bom_refs[source_project.name] = purl
            # Remember the bom-ref so an open source dependency with the same
            # purl is not added as a second package later.
            self._registered_package_refs.add(purl)

            pb = self._create_main_package_builder(
                name=name,
                source_project=source_project,
                purl=purl,
                package_version=package_version
            )
            self.sbom_builder.add_package(pb)

        return project_bom_refs

    def _process_opensource_dependency(self, dep: OpenSource, project_bom_refs: Dict) -> str:
        """
        Process an open source dependency and return its bom_ref.

        The package is added to the SBOM builder only the first time its purl
        is seen. ``project_bom_refs`` is keyed by project name, so a purl
        membership test against it never matches; already-registered purls are
        tracked in ``self._registered_package_refs`` instead.

        Args:
            dep: The open source dependency.
            project_bom_refs: Project name -> purl mapping. Kept for interface
                compatibility; not consulted here.

        Returns:
            The purl generated for ``dep``.
        """
        purl = generate_purl(
            pkg_type=get_purl_type_from_url(dep.upstream_url),
            namespace="upstream",
            name=dep.name,
            version=dep.version_number,
        )

        if purl not in self._registered_package_refs:
            self._registered_package_refs.add(purl)
            pb = (PackageBuilder()
                  .with_name(dep.name)
                  .with_purl(purl)
                  .with_bom_ref(purl)
                  .with_license_concluded(dep.license)
                  .with_version(dep.version_number)
                  .with_download_location(dep.upstream_url)
                  .with_type("library"))
            self.sbom_builder.add_package(pb)

        return purl

    def _create_main_package_builder(self, name: str, source_project, purl: str, package_version: str):
        """
        Create a PackageBuilder for main project packages.

        The declared license comes from ``_get_project_license`` and the
        download location from ``commit_url_of`` applied to the project's
        remote URL and revision.
        """
        url = self.manifest.remote_url_of(source_project)
        parsed_license = self._get_project_license(source_project)

        return (PackageBuilder()
                .with_name(name)
                .with_purl(purl)
                .with_bom_ref(purl)
                .with_type(source_project.type)
                .with_supplier("Organization: OpenHarmony")
                .with_group("OpenHarmony")
                .with_license_declared(parsed_license)
                .with_version(package_version)
                .with_download_location(commit_url_of(url, source_project.revision))
                .with_comp_platform(self.args.platform))