• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4#
5# Copyright (c) 2025 Northeastern University
6# Licensed under the Apache License, Version 2.0 (the "License");
7# you may not use this file except in compliance with the License.
8# You may obtain a copy of the License at
9#
10#     http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17#
18
19import os
20from argparse import ArgumentParser
21from pathlib import Path
22from typing import Dict, List, Union, Set, Optional
23
24from ohos.sbom.analysis.depend_graph import DependGraphAnalyzer
25from ohos.sbom.analysis.file_dependency import FileDependencyAnalyzer
26from ohos.sbom.analysis.install_module import InstallModuleAnalyzer
27from ohos.sbom.analysis.project_dependency import ProjectDependencyAnalyzer
28from ohos.sbom.common.utils import generate_purl, get_purl_type_from_url, commit_url_of
29from ohos.sbom.data.file_dependence import File
30from ohos.sbom.data.manifest import Project
31from ohos.sbom.data.opensource import OpenSource
32from ohos.sbom.extraction.copyright_and_license_scanner import LicenseFileScanner, FileScanner
33from ohos.sbom.extraction.local_resource_loader import LocalResourceLoader
34from ohos.sbom.sbom.builder.file_builder import FileBuilder
35from ohos.sbom.sbom.builder.package_builder import PackageBuilder
36from ohos.sbom.sbom.builder.relationship_builder import RelationshipBuilder
37from ohos.sbom.sbom.builder.sbom_meta_data_builder import SBOMMetaDataBuilder
38from ohos.sbom.sbom.metadata.sbom_meta_data import RelationshipType, SBOMMetaData, NOASSERTION
39
40
41class SBOMGenerator:
    def __init__(self, args: ArgumentParser):
        """
        Create the generator and eagerly prepare all file-level data.

        Construction is not lazy: after the attributes below are set up it
        calls init() and then build_filtered_files().

        Args:
            args: Command-line options object. This class reads its
                ``product`` and ``platform`` attributes.
                NOTE(review): annotated as ArgumentParser, but it is used
                like a parsed namespace -- confirm the intended type.
        """
        self.args = args
        # The next three attributes are filled in by init().
        self.source_ninja_json = None
        self.manifest = None
        self.file_dependence_analyzer = None
        # Install target name -> install destinations; assigned in init().
        self._install_target_name_dest_map: Dict[str, List[str]] = {}
        # File path -> file id used in the SBOM; the id is the path itself.
        self._file_ref_map: Dict[str, str] = {}
        # Files that end up in the SBOM, keyed by path.
        self._file_dep_filter: Dict[str, File] = {}
        self.sbom_builder: SBOMMetaDataBuilder = SBOMMetaDataBuilder()
        self.license_scanner = LicenseFileScanner()
        self.file_scanner = FileScanner()
        self.init()
        self.build_filtered_files()
55
56    def init(self):
57        print("Initializing SBOM generator...")
58        print("Initializing [1/4]: Loading Manifest and gn-generated JSON ...")
59        self.source_ninja_json = LocalResourceLoader.load_ninja_json()
60        self.manifest = LocalResourceLoader.load_manifest()
61        print("Initializing [2/4]: Determining whether Targets are installed to the image ...")
62        install_module_analyzer = InstallModuleAnalyzer(self.source_ninja_json)
63        self._install_target_name_dest_map = install_module_analyzer.get_install_with_dest()
64        print("Initializing [3/4]: Building Target dependency network ...")
65        depend_graph_analyzer = DependGraphAnalyzer(self.source_ninja_json)
66        print("Initializing [4/4]: Building dependencies for files installed on the image ...")
67        self.file_dependence_analyzer = FileDependencyAnalyzer(depend_graph_analyzer)
68        self.file_dependence_analyzer.build_all_install_deps_optimized(self._install_target_name_dest_map.keys())
69
70    def build_filtered_files(self):
71        """
72        Get _file_dependencies from file_dependence_analyzer,
73        filter items whose keys start with "//",
74        assign the result to self._file_dep_filter,
75        and build self._file_ref_map (path -> file_id).
76        """
77        raw_files = self.file_dependence_analyzer.get_file_dependencies()
78
79        self._file_dep_filter.clear()
80        self._file_ref_map.clear()
81        map_source_file: Dict[str, Set[File]] = {}
82        for path, file in raw_files.items():
83            file_id = path
84            self._file_ref_map[path] = file_id
85            if path.startswith("//") or "/" not in path[2:]:
86                continue
87            map_source_file[path] = {item for dep_set in file.get_dependencies().values() for item in dep_set}
88
89        for path, file in raw_files.items():
90            if not path.startswith("//") and "/" in path[2:]:
91                continue
92            for relationship_type in RelationshipType:
93                original_deps = file.get_dependencies(relationship_type)
94                new_deps = set()
95                # Substitute intermediate files with original source references
96                for dep in original_deps:
97                    new_deps.update(map_source_file.get(dep.relative_path, {dep}))
98                file.set_dependencies(relationship_type, new_deps)
99            self._file_dep_filter[path] = file
100
101        self._add_install_dest_file()
102
103        print(f"Filtered {len(self._file_dep_filter)} files starting with '//'")
104        print(f"Built file_ref_map with {len(self._file_ref_map)} entries")
105
106    def build_file_information(self):
107        all_files = self._file_dep_filter
108
109        for file_path, file_obj in all_files.items():
110            file_id = self._file_ref_map[file_path]
111            file_scanner_ret = self._extract_scanner_info(file_path)
112            file = (FileBuilder()
113                    .with_file_name(os.path.basename(file_path))
114                    .with_file_id(file_id)
115                    .with_file_author(file_scanner_ret["fileAuthor"])
116                    .with_copyright_text(file_scanner_ret["copyright_text"])
117                    )
118
119            self.sbom_builder.add_file(file)
120
121        for file_path, file_obj in all_files.items():
122            for relationship_type in RelationshipType:
123                dep_file_id_list = []
124                dependencies = file_obj.get_dependencies(relationship_type)
125                if len(dependencies) == 0:
126                    continue
127                file_id = self._file_ref_map[file_path]
128                for dep in dependencies:
129                    dep_id = self._file_ref_map[dep.relative_path]
130                    dep_file_id_list.append(dep_id)
131                relationship_builder = (RelationshipBuilder().with_relationship_type(relationship_type)
132                                        .with_bom_ref(file_id)
133                                        .with_depends_on(dep_file_id_list)
134                                        )
135                self.sbom_builder.add_relationship(relationship_builder)
136
137    def build_package_information(self):
138        """Build package information and dependencies for SBOM generation."""
139        pdb = ProjectDependencyAnalyzer()
140        pdb.build([file for file in self._file_dep_filter.values()])
141        all_project_dependence = pdb.get_project_dependence()
142
143        project_bom_refs = self._build_main_packages(all_project_dependence)
144        self._build_dependencies(all_project_dependence, project_bom_refs)
145
146    def build_document_information(self):
147        doc_builder = self.sbom_builder.start_document()
148        doc_builder.with_name(f"{self.args.product}-{self.manifest.default['revision']}").end()
149
150    def build_sbom(self) -> SBOMMetaData:
151        print("Building file information...")
152        self.build_file_information()
153        print("Building package information...")
154        self.build_package_information()
155        print("Building document information...")
156        self.build_document_information()
157        print("Generating final SBOM metadata ...")
158        sbom_meta_data = self.sbom_builder.build(validate=False)
159        print("Generation completed:")
160        print("• Packages:", len(sbom_meta_data.packages))
161        print("• Files:", len(sbom_meta_data.files))
162        print("• Relationships:", len(sbom_meta_data.relationships))
163        return sbom_meta_data
164
165    def _get_file_reference(self, dep: File) -> Optional[str]:
166        """Get the file reference for a file dependency."""
167        return self._file_ref_map.get(dep.relative_path)
168
169    def _get_project_license(self, source_project) -> str:
170        """Get the license for a project, defaulting to NOASSERTION if not found."""
171        license_scanner_ret = self.license_scanner.scan(source_project.path)
172        return license_scanner_ret[0]["license_type"] if len(license_scanner_ret) >= 1 else NOASSERTION
173
174    def _add_relationship(self, source_bom_ref: str, depends_on_refs: List[str], rel_type: RelationshipType) -> None:
175        """Add a relationship to the SBOM builder."""
176        rb = (RelationshipBuilder()
177              .with_bom_ref(source_bom_ref)
178              .with_depends_on(depends_on_refs)
179              .with_relationship_type(rel_type))
180        self.sbom_builder.add_relationship(rb)
181
182    def _add_install_dest_file(self):
183        target_name_map_file = self.file_dependence_analyzer.get_target_name_map_file()
184
185        for target_name, dest_list in self._install_target_name_dest_map.items():
186            file_list = target_name_map_file.get(target_name, [])
187            if not file_list:
188                continue
189
190            stripped_file_list = [f for f in file_list if f.is_stripped]
191            for dest in dest_list:
192                dest_file = File(dest, None)
193                if len(stripped_file_list) == 1:
194                    dest_file.add_dependency(RelationshipType.COPY_OF, stripped_file_list[0])
195                else:
196                    self._handle_multiple_sources(dest, stripped_file_list, dest_file)
197                self._file_dep_filter[dest] = dest_file
198                self._file_ref_map[dest] = dest
199
200    def _handle_multiple_sources(self, dest: str, stripped_file_list, dest_file):
201        """Handle case where multiple source files may match the destination."""
202        dest_filename = os.path.basename(dest)
203        matched_files = [
204            f for f in stripped_file_list
205            if os.path.basename(f.relative_path) == dest_filename
206        ]
207
208        if not matched_files:
209            print(f"Warning: Files on the image failed to match with Targets: {dest}")
210        else:
211            for src_file in matched_files:
212                dest_file.add_dependency(RelationshipType.COPY_OF, src_file)
213
214    def _extract_scanner_info(self, file_path: Union[str, Path]) -> Dict:
215        """
216        Extract license and copyright information from the scan result of a file.
217
218        Args:
219            file_path (str or Path): Path to the file being scanned.
220
221        Returns:
222            Dict: A dictionary containing:
223                - concluded_license: The primary license (first in list) or NOASSERTION
224                - license_info_in_files: List of detected licenses or [NOASSERTION]
225                - copyright_text: Copyright statement if found
226                - fileAuthor: Comma-separated list of filtered authors/holders
227        """
228        # Scan the file and extract results
229        ret = self.file_scanner.scan(file_path)
230        licenses = ret.get("licenses", [])
231        copyrights = ret.get("copyrights", [])
232
233        authors = set()
234        copyright_text = NOASSERTION
235
236        # Define filtering rules for invalid holder patterns
237        invalid_prefixes = ('by ', 'copyright', 'all rights', 'distributed', 'licensed')
238        min_len, max_len = 2, 50
239
240        # Process each copyright entry
241        for cp in copyrights:
242            if not isinstance(cp, dict):
243                continue
244
245            # Extract copyright statement (use first non-empty one)
246            if copyright_text == NOASSERTION:
247                statement = cp.get("statement")
248                if statement:
249                    copyright_text = statement
250
251            # Extract and normalize holder
252            holder = cp.get("holder", "").strip()
253            if not holder:
254                continue
255
256            # Split comma-separated holders and clean them
257            holders = [h.strip() for h in holder.split(",") if h.strip()]
258            filtered_holders = self._filter_holders(holders, invalid_prefixes, min_len, max_len)
259            authors.update(filtered_holders)
260
261        # Format final author string
262        file_author = ", ".join(sorted(authors)) if authors else NOASSERTION
263
264        return {
265            "concluded_license": licenses[0] if licenses else NOASSERTION,
266            "license_info_in_files": licenses or [NOASSERTION],
267            "copyright_text": copyright_text,
268            "fileAuthor": file_author
269        }
270
271    def _filter_holders(self, holders: List[str], invalid_prefixes: tuple, min_len: int, max_len: int) -> List[str]:
272        """Filter out invalid holders based on prefix, length, and format."""
273        filtered = []
274        for h in holders:
275            h_lower = h.lower()
276            if (h_lower.startswith(invalid_prefixes) or
277                    len(h) < min_len or
278                    len(h) > max_len or
279                    ('.' in h and h.count('.') > 2)):
280                continue
281            filtered.append(h)
282        return filtered
283
284    def _build_dependencies(self, all_project_dependence: Dict, project_bom_refs: Dict) -> None:
285        """Build dependency relationships for all projects."""
286        for name, project_dependence in all_project_dependence.items():
287            source_bom_ref = project_bom_refs.get(project_dependence.source_project.name)
288            if not source_bom_ref:
289                continue
290
291            for rel_type in RelationshipType:
292                dependencies = project_dependence.get_dependencies(rel_type)
293                if not dependencies:
294                    continue
295
296                depends_on_refs = self._process_dependencies(dependencies, project_bom_refs)
297                if depends_on_refs:
298                    self._add_relationship(source_bom_ref, depends_on_refs, rel_type)
299
300    def _process_dependencies(self, dependencies: List, project_bom_refs: Dict) -> List[str]:
301        """Process dependencies and return list of bom_refs."""
302        depends_on_refs = []
303
304        for dep in dependencies:
305            if isinstance(dep, Project):
306                if dep.name in project_bom_refs:
307                    depends_on_refs.append(project_bom_refs[dep.name])
308            elif isinstance(dep, OpenSource):
309                depends_on_refs.append(self._process_opensource_dependency(dep, project_bom_refs))
310            elif hasattr(dep, "name") and dep.name in project_bom_refs:
311                depends_on_refs.append(project_bom_refs[dep.name])
312            elif isinstance(dep, File):
313                file_ref = self._get_file_reference(dep)
314                if file_ref:
315                    depends_on_refs.append(file_ref)
316
317        return depends_on_refs
318
319    def _build_main_packages(self, all_project_dependence: Dict) -> Dict:
320        """Build main package information and return bom_refs mapping."""
321        project_bom_refs = {}
322        package_version = self.manifest.default["revision"]
323
324        for name, project_dependence in all_project_dependence.items():
325            source_project = project_dependence.source_project
326            purl = self.manifest.purl_of(source_project)
327            project_bom_refs[source_project.name] = purl
328
329            pb = self._create_main_package_builder(
330                name=name,
331                source_project=source_project,
332                purl=purl,
333                package_version=package_version
334            )
335            self.sbom_builder.add_package(pb)
336
337        return project_bom_refs
338
339    def _process_opensource_dependency(self, dep: OpenSource, project_bom_refs: Dict) -> str:
340        """Process an open source dependency and return its bom_ref."""
341        purl = generate_purl(
342            pkg_type=get_purl_type_from_url(dep.upstream_url),
343            namespace="upstream",
344            name=dep.name,
345            version=dep.version_number,
346        )
347
348        if purl not in project_bom_refs:
349            pb = (PackageBuilder()
350                  .with_name(dep.name)
351                  .with_purl(purl)
352                  .with_bom_ref(purl)
353                  .with_license_concluded(dep.license)
354                  .with_version(dep.version_number)
355                  .with_download_location(dep.upstream_url)
356                  .with_type("library"))
357            self.sbom_builder.add_package(pb)
358
359        return purl
360
361    def _create_main_package_builder(self, name: str, source_project, purl: str, package_version: str):
362        """Create a PackageBuilder for main project packages."""
363        url = self.manifest.remote_url_of(source_project)
364        parsed_license = self._get_project_license(source_project)
365
366        return (PackageBuilder()
367                .with_name(name)
368                .with_purl(purl)
369                .with_bom_ref(purl)
370                .with_type(source_project.type)
371                .with_supplier("Organization: OpenHarmony")
372                .with_group("OpenHarmony")
373                .with_license_declared(parsed_license)
374                .with_version(package_version)
375                .with_download_location(commit_url_of(url, source_project.revision))
376                .with_type(source_project.type)
377                .with_comp_platform(self.args.platform))
378