• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4#
5# Copyright (c) 2025 Northeastern University
6# Licensed under the Apache License, Version 2.0 (the "License");
7# you may not use this file except in compliance with the License.
8# You may obtain a copy of the License at
9#
10#     http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17#
18
19import logging
20import re
21from pathlib import Path
22from typing import Dict, List, Union, Tuple
23
24from license_expression import get_spdx_licensing, LicenseSymbol
25
26from ohos.sbom.extraction.local_resource_loader import LocalResourceLoader
27from ohos.sbom.sbom.metadata.sbom_meta_data import NOASSERTION
28
29logger = logging.getLogger(__name__)
30
31
class CopyrightDetector:
    """Regex-based extractor of copyright statements from free text.

    The class is stateless: every method is a classmethod and the only public
    entry point is :meth:`find_copyrights`.
    """

    @classmethod
    def find_copyrights(cls, texts: List[str]) -> List[Dict]:
        """
        Extract complete copyright information, preserving the original output format.

        Args:
            texts: Text fragments to scan. Entries that are empty or not
                strings are skipped.

        Returns:
            A list of dicts with keys ``statement`` (the matched line, stripped),
            ``year`` (comma-separated years / year ranges) and ``holder``
            (comma-separated holder names). Identical statements are reported
            only once, in order of first appearance.
        """
        results = []
        seen = set()

        for text in texts:
            if not text or not isinstance(text, str):
                continue

            for block in cls._find_copyright_blocks(text):
                year_str_output = cls._extract_years(block)
                holder_str_output = cls._extract_holder(block)

                # A block that yields neither a year nor a holder carries no information.
                if not year_str_output and not holder_str_output:
                    continue

                full_statement = block.strip()
                if full_statement in seen:
                    continue
                seen.add(full_statement)

                results.append({
                    "statement": full_statement,
                    "year": year_str_output,
                    "holder": holder_str_output
                })

        return results

    @classmethod
    def _find_copyright_blocks(cls, text: str) -> List[str]:
        """Return every line fragment that starts at a copyright marker and contains a 4-digit year."""
        # The word boundary is attached to the word "Copyright" only. Placing it
        # in front of the whole alternation would require "(C)" and "©" to be
        # directly preceded by a word character, so lines that simply start
        # with those markers would never be matched.
        potential_pattern = r'(?i)(?:\bCopyright\s*(?:\(C\))?|\(C\)|©)[^\r\n]*(?:\r\n?|\n|$)'
        matches = re.findall(potential_pattern, text)
        return [block.strip() for block in matches if block.strip() and re.search(r'\b\d{4}\b', block)]

    @classmethod
    def _extract_years(cls, block: str) -> str:
        """Return the years of *block* as a comma-separated string.

        Year ranges are kept as written in the text. A single year is listed
        only when it does not fall inside any of the ranges found. Entries
        containing an ASCII hyphen are sorted before the remaining entries.
        """
        year_range_pattern = re.compile(r'\b(\d{4})\s*[-–]\s*(\d{4})\b')
        year_single_pattern = re.compile(r'\b\d{4}\b')

        range_matches = list(year_range_pattern.finditer(block))
        years = {match.group(0).strip() for match in range_matches}
        bounds = [(int(match.group(1)), int(match.group(2))) for match in range_matches]

        # Add single years not already covered by a range (this also drops the
        # range endpoints themselves, which the single-year pattern matches too).
        for year_str in year_single_pattern.findall(block):
            year_val = int(year_str)
            if not any(start <= year_val <= end for start, end in bounds):
                years.add(year_str)

        return ", ".join(sorted(years, key=lambda y: ('-' not in y, y)))

    @classmethod
    def _extract_holder(cls, block: str) -> str:
        """Return the copyright holder(s) of *block* as a comma-separated, alphabetically sorted string."""
        holder_text = cls._clean_copyright_markers(block)
        holder_text = cls._clean_years(holder_text)
        holder_text = cls._clean_urls_and_references(holder_text)
        holder_text = cls._normalize_text(holder_text)

        holders = cls._split_and_filter_holders(holder_text)
        unique_holders = sorted(set(h for h in holders if h))

        return cls._format_final_holder(", ".join(unique_holders) if unique_holders else "")

    @classmethod
    def _clean_copyright_markers(cls, text: str) -> str:
        """Remove the word "Copyright", "(C)" and the copyright sign from text."""
        text = re.sub(r'(?i)\bCopyright\s*(?:\(C\))?\b', '', text)
        # Only the upper-case "(C)" is removed here; a lower-case "(c)" is
        # removed later by the short-parenthesised-word rule in
        # _clean_urls_and_references.
        text = re.sub(r'\(C\)', '', text)
        return re.sub(r'©', '', text)

    @classmethod
    def _clean_years(cls, text: str) -> str:
        """Remove year ranges and single 4-digit years from text."""
        text = re.sub(r'\b\d{4}\s*[-–]\s*\d{4}\b', '', text)
        return re.sub(r'\b\d{4}\b', '', text)

    @classmethod
    def _clean_urls_and_references(cls, text: str) -> str:
        """Remove URLs, e-mail addresses and short parenthesised references from text."""
        text = re.sub(r'\(\s*https?://[^\s)]+\s*\)', '', text)  # ( http://... )
        text = re.sub(r'\(\s*[a-z][a-z0-9\-]*\s*\)', '', text)  # (minizip), (project)
        text = re.sub(r'\(\s*(?:Inc|Ltd|Co|Corp|LLC|GmbH|Foundation)\.?\s*\)', '', text, flags=re.I)
        text = re.sub(r'https?://[^\s]+', '', text)
        return re.sub(r'\b[\w.-]+@[\w.-]+\b', '', text)

    @classmethod
    def _normalize_text(cls, text: str) -> str:
        """Replace special characters with spaces, drop empty parentheses and collapse whitespace."""
        text = re.sub(r'[^\w\s\-.,&()]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'\(\s*\)', '', text)
        return re.sub(r'\s+', ' ', text).strip()

    @classmethod
    def _split_and_filter_holders(cls, text: str) -> List[str]:
        """Split holder text on ',', ';', '&' and the word "and", dropping invalid parts."""
        holders = []
        parts = re.split(r'[,;]|\s+and\s+|&', text, flags=re.I)

        for part in parts:
            # Strip a dangling connector at either end. "and" must be a whole
            # word here: without the boundary, names such as "Anderson" or
            # "Disneyland" would lose their first / last three letters.
            part = re.sub(r'^\s*(?:and\b|&|,|\.)\s*|\s*(?:\band|&|,|\.)\s*$', '', part, flags=re.I)
            part = re.sub(r'\(\s*\)', '', part)
            part = part.strip()

            if part and not cls._is_invalid_holder_part(part):
                holders.append(part)

        return holders

    @classmethod
    def _is_invalid_holder_part(cls, part: str) -> bool:
        """Return True when *part* contains a word that marks it as boilerplate rather than a holder name."""
        return bool(re.search(
            r'\b(?:modification|project|info|read|support|unzip|zip|license|version|'
            r'part of|conditions|distribution|use|see|notice|rights reserved|developer|'
            r'maintainer|author|team)\b',
            part, re.I
        ))

    @classmethod
    def _format_final_holder(cls, holder: str) -> str:
        """Strip trailing dashes, URL remnants, commas and a dangling "and" from the holder string."""
        holder = re.sub(r'\s*[-–—]\s*$', '', holder)
        holder = re.sub(r'\s*[-–—]\s*http\S*$', '', holder)
        holder = re.sub(r'\s+http\S+', '', holder)
        holder = re.sub(r'\s*,\s*$', '', holder).strip()
        return re.sub(r'\s+and\s+$', '', holder, flags=re.I).strip()
180
181
class LicenseDetector:
    """Detect licenses in text via SPDX tags and keyword patterns.

    An ``SPDX-License-Identifier:`` tag is parsed with the SPDX licensing
    object created in ``__init__``; in addition, the regular expressions in
    ``LICENSE_PATTERNS`` are matched case-insensitively against the text.
    """

    # SPDX license id -> regular expressions whose presence suggests that license.
    LICENSE_PATTERNS = {
        'Apache-2.0': [
            r'Apache License[\s,]+Version 2\.0',
            r'http://www\.apache\.org/licenses/LICENSE-2\.0',
            r'ASF 2\.0'
        ],
        'MIT': [
            r'\bMIT (?:License|Permission)\b',
            r'Permission is hereby granted,? free of charge',
            r'THE SOFTWARE IS PROVIDED "AS IS"'
        ],
        'GPL-3.0': [
            r'GNU GENERAL PUBLIC LICENSE[\s,]+Version 3',
            r'\bGPLv3\b',
            r'https?://www\.gnu\.org/licenses/gpl-3\.0'
        ],
        'BSD-3-Clause': [
            r'Redistribution and use in source and binary forms',
            r'BSD 3-Clause(?: License)?'
        ],
        'ISC': [
            r'\bISC License\b',
            r'Permission to use, copy, modify, and distribute this software'
        ],
        'MPL-2.0': [
            r'http://mozilla\.org/MPL/2\.0/',
            r'Mozilla Public License[\s,]+Version\s+2\.0',
            r'This Source Code Form is subject to the terms of the Mozilla Public License.*?Version.*?2\.0'
        ]
    }

    # Lower-cased file names that are treated as license files.
    LICENSE_FILE_NAMES = {
        "license", "copying", "notice",
        "license.txt", "copying.txt", "notice.txt",
        "license.md", "copying.md", "notice.md"
    }

    # Captures everything after an SPDX tag up to the end of the line.
    _SPDX_TAG_PATTERN = re.compile(r'SPDX-License-Identifier:\s*([^\n]+)', re.IGNORECASE)

    def __init__(self):
        self.licensing = get_spdx_licensing()

    def detect_licenses(self, texts: List[str]) -> List[str]:
        """Return the sorted license identifiers found in *texts*.

        Only the first SPDX tag of each text is parsed; every license symbol of
        its expression is reported. A license from ``LICENSE_PATTERNS`` is
        reported when at least one of its patterns matches any text.

        Args:
            texts: Text fragments to scan. Entries that are empty or not
                strings are skipped, mirroring ``CopyrightDetector.find_copyrights``.

        Returns:
            Sorted list of unique license identifiers (possibly empty).
        """
        usable_texts = [text for text in texts if text and isinstance(text, str)]
        found_licenses = set()

        for text in usable_texts:
            match = self._SPDX_TAG_PATTERN.search(text)
            if not match:
                continue
            expression = match.group(1)
            try:
                parsed_license = self.licensing.parse(expression)
                found_licenses.update(str(s) for s in parsed_license.objects if isinstance(s, LicenseSymbol))
            except Exception:
                # Best effort: an unparsable tag must not abort the scan; the
                # keyword patterns below still get a chance to match.
                logger.debug(
                    "Failed to parse SPDX license identifier: '%s' in text snippet: '%s...'",
                    expression, text[:50],
                    exc_info=True
                )

        for license_type, patterns in self.LICENSE_PATTERNS.items():
            if any(re.search(pattern, text, re.IGNORECASE) for text in usable_texts for pattern in patterns):
                found_licenses.add(license_type)

        return sorted(found_licenses)

    def identify_license(self, text: str) -> Tuple[str, float]:
        """Identify the single most likely license of *text*.

        Args:
            text: Full text of a license file. Empty or non-string input
                yields ``(NOASSERTION, 0.0)``.

        Returns:
            ``(license, confidence)``. A parsable SPDX tag gives the parsed
            expression with confidence 1.0. Otherwise the confidence is the
            fraction of a license's patterns that matched, and the license
            with the highest fraction wins (the first one on ties).
            ``(NOASSERTION, 0.0)`` is returned when nothing matched.
        """
        if not text or not isinstance(text, str):
            return NOASSERTION, 0.0

        spdx_match = self._SPDX_TAG_PATTERN.search(text)
        if spdx_match:
            expression = spdx_match.group(1)
            try:
                parsed_license = self.licensing.parse(expression)
                if parsed_license:
                    return str(parsed_license), 1.0
            except Exception:
                # Best effort: fall back to keyword matching, but leave a trace
                # in the log instead of printing or swallowing silently.
                logger.debug(
                    "Failed to parse SPDX license identifier: '%s'",
                    expression,
                    exc_info=True
                )

        best_match = (NOASSERTION, 0.0)
        for license_id, patterns in self.LICENSE_PATTERNS.items():
            matched = sum(1 for pattern in patterns if re.search(pattern, text, re.IGNORECASE))
            if matched > 0:
                confidence = matched / len(patterns)
                # Strictly greater: the earliest license keeps the lead on ties.
                if confidence > best_match[1]:
                    best_match = (license_id, confidence)

        return best_match
268
269
class FileScanner:
    """Scan a single file for license identifiers and copyright statements."""

    def __init__(self):
        self.license_detector = LicenseDetector()

    @staticmethod
    def _empty_result(real_path) -> Dict:
        """Build the record returned when the file is missing or cannot be read."""
        return {
            "path": str(real_path),
            "licenses": [],
            "copyrights": [],
            "content_type": "NOASSERTION",
            "content": ""
        }

    def scan(self, file_path: Union[str, Path]) -> Dict:
        """Scan *file_path* and return its license and copyright findings.

        Args:
            file_path: Path of the file to scan. It is passed through
                ``LocalResourceLoader.to_local_path`` before being accessed.

        Returns:
            A dict with keys ``path``, ``licenses``, ``copyrights``,
            ``content_type`` and ``content``. When the file does not exist or
            cannot be read, the lists and the content are empty,
            ``content_type`` is ``"NOASSERTION"`` and ``path`` is the path
            returned by ``to_local_path``; on success ``content_type`` is
            ``"Text"`` and ``path`` is the path given by the caller.
        """
        path = Path(file_path)
        real_path = LocalResourceLoader.to_local_path(file_path)
        if not Path(real_path).is_file():
            return self._empty_result(real_path)

        try:
            # NOTE(review): max_bytes=8192 presumably limits the read to the
            # first 8 KiB of the file - confirm against LocalResourceLoader.
            content = LocalResourceLoader.load_text_file(real_path, max_bytes=8192)
        except OSError as e:
            # Same policy as LicenseFileScanner.scan_license_file: an unreadable
            # file yields an empty record instead of aborting the caller.
            logger.debug("Skipping file '%s': read failed (%s)", real_path, e)
            return self._empty_result(real_path)

        licenses = self.license_detector.detect_licenses([content])
        copyrights = CopyrightDetector.find_copyrights([content])

        return {
            "path": str(path),
            "licenses": licenses,
            "copyrights": copyrights,
            "content_type": "Text",
            "content": content
        }
297
298
class LicenseFileScanner:
    """Find well-known license files in a directory and classify each of them."""

    def __init__(self):
        self.license_detector = LicenseDetector()

    @staticmethod
    def _unidentified(shown_path: str) -> Dict:
        """Build the record used when a license file is missing or unreadable."""
        return {
            "path": shown_path,
            "license_type": "NOASSERTION",
            "license_text": "",
            "copyrights": [],
            "confidence": 0.0
        }

    def scan(self, directory: Union[str, Path]) -> List[Dict]:
        """Scan the direct children of *directory* for license files.

        The directory is passed through ``LocalResourceLoader.to_local_path``
        first. Only regular files whose lower-cased name is listed in
        ``LicenseDetector.LICENSE_FILE_NAMES`` are scanned; sub-directories are
        not visited.

        Returns:
            One record per license file (see :meth:`scan_license_file`), or an
            empty list when *directory* is not a directory.
        """
        root = Path(LocalResourceLoader.to_local_path(directory))
        if not root.is_dir():
            return []

        return [
            self.scan_license_file(entry)
            for entry in root.iterdir()
            if entry.is_file() and entry.name.lower() in LicenseDetector.LICENSE_FILE_NAMES
        ]

    def scan_license_file(self, file_path: Union[str, Path]) -> Dict:
        """Classify one license file.

        Returns:
            A dict with keys ``path``, ``license_type``, ``license_text``,
            ``copyrights`` and ``confidence``. When *file_path* is not a file
            or reading it raises ``OSError``, the license type is
            ``"NOASSERTION"``, the text and copyrights are empty and the
            confidence is 0.0.
        """
        target = Path(file_path)
        shown_path = str(target)

        if not target.is_file():
            return self._unidentified(shown_path)

        try:
            text = LocalResourceLoader.load_text_file(file_path)
        except OSError as read_error:
            logger.debug("Skipping file '%s': read failed (%s)", file_path, read_error)
            return self._unidentified(shown_path)

        detected_type, score = self.license_detector.identify_license(text)

        return {
            "path": shown_path,
            "license_type": detected_type,
            "license_text": text,
            "copyrights": CopyrightDetector.find_copyrights([text]),
            "confidence": score
        }
350