#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright (c) 2025 Northeastern University
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import re
from pathlib import Path
from typing import Dict, List, Union, Tuple

from license_expression import get_spdx_licensing, LicenseSymbol

from ohos.sbom.extraction.local_resource_loader import LocalResourceLoader
from ohos.sbom.sbom.metadata.sbom_meta_data import NOASSERTION

logger = logging.getLogger(__name__)


class CopyrightDetector:
    """Extract copyright statements (statement, years, holder) from raw text.

    All methods are classmethods; the class holds no instance state.
    """

    # One copyright "block" is the rest of the line starting at a marker.
    # The word boundary is applied to "Copyright" only: "(" and "©" are not
    # word characters, so a leading \b in front of them would only match when
    # the marker is glued to a preceding word and would miss the ordinary
    # "© 2020 Foo" / "(c) 2020 Foo" forms. "(C)" must not directly follow a
    # word character so call-like text such as "f(c)" is not a marker.
    _BLOCK_PATTERN = re.compile(
        r'(?:\bCopyright\s*(?:\(C\))?|(?<!\w)\(C\)|©)[^\r\n]*(?:\r\n?|\n|$)',
        re.IGNORECASE,
    )
    _YEAR_RANGE_PATTERN = re.compile(r'\b(\d{4})\s*[-–]\s*(\d{4})\b')
    _YEAR_SINGLE_PATTERN = re.compile(r'\b\d{4}\b')

    @classmethod
    def find_copyrights(cls, texts: List[str]) -> List[Dict]:
        """
        Extract complete copyright information, preserving the original output format.

        Entries of ``texts`` that are empty or not strings are skipped.
        Statements are de-duplicated on their stripped text, keeping the
        first occurrence.

        Returns list of dicts with keys: statement, year, holder.
        """
        results = []
        seen = set()

        for text in texts:
            if not text or not isinstance(text, str):
                continue

            for block in cls._find_copyright_blocks(text):
                year_str_output = cls._extract_years(block)
                holder_str_output = cls._extract_holder(block)

                if not year_str_output and not holder_str_output:
                    continue

                full_statement = block.strip()
                if full_statement in seen:
                    continue
                seen.add(full_statement)

                results.append({
                    "statement": full_statement,
                    "year": year_str_output,
                    "holder": holder_str_output
                })

        return results

    @classmethod
    def _find_copyright_blocks(cls, text: str) -> List[str]:
        """Return the stripped marker-to-end-of-line spans that contain a 4-digit number."""
        blocks = []
        for raw in cls._BLOCK_PATTERN.findall(text):
            candidate = raw.strip()
            # A marker without any year-like number is not treated as a statement.
            if candidate and cls._YEAR_SINGLE_PATTERN.search(candidate):
                blocks.append(candidate)
        return blocks

    @classmethod
    def _extract_years(cls, block: str) -> str:
        """Extract and format years from copyright block.

        Year ranges are reported as written in the block; single years are
        reported only when no range covers them. The result lists ranges
        first, then single years, each group sorted, joined with ", ".
        """
        range_texts = set()
        bounds = []
        for match in cls._YEAR_RANGE_PATTERN.finditer(block):
            range_texts.add(match.group(0))
            first, second = int(match.group(1)), int(match.group(2))
            # Order the bounds so a reversed range ("2015-2010") still
            # absorbs its own endpoints.
            bounds.append((min(first, second), max(first, second)))

        single_years = set()
        for year_str in cls._YEAR_SINGLE_PATTERN.findall(block):
            year_val = int(year_str)
            if not any(low <= year_val <= high for low, high in bounds):
                single_years.add(year_str)

        # Ranges are collected separately rather than recognised by looking
        # for "-" in the text, so en-dash ranges are ordered like hyphen ones.
        return ", ".join(sorted(range_texts) + sorted(single_years))

    @classmethod
    def _extract_holder(cls, block: str) -> str:
        """Extract and format copyright holder from block.

        Markers, years, URLs/e-mail addresses and punctuation noise are
        removed, the remainder is split into individual holders, and the
        unique holders are returned sorted and joined with ", ".
        """
        holder_text = cls._clean_copyright_markers(block)
        holder_text = cls._clean_years(holder_text)
        holder_text = cls._clean_urls_and_references(holder_text)
        holder_text = cls._normalize_text(holder_text)

        holders = cls._split_and_filter_holders(holder_text)
        unique_holders = sorted({h for h in holders if h})

        return cls._format_final_holder(", ".join(unique_holders) if unique_holders else "")

    @classmethod
    def _clean_copyright_markers(cls, text: str) -> str:
        """Remove copyright markers ("Copyright", "(C)", "©") from text, case-insensitively."""
        text = re.sub(r'\bCopyright\b', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\(C\)', '', text, flags=re.IGNORECASE)
        return text.replace('©', '')

    @classmethod
    def _clean_years(cls, text: str) -> str:
        """Remove year information from text."""
        # Ranges first, so the dash between two years is removed with them.
        text = re.sub(r'\b\d{4}\s*[-–]\s*\d{4}\b', '', text)
        return re.sub(r'\b\d{4}\b', '', text)

    @classmethod
    def _clean_urls_and_references(cls, text: str) -> str:
        """Remove URLs and reference statements from text."""
        text = re.sub(r'\(\s*https?://[^\s)]+\s*\)', '', text)  # ( http://... )
        text = re.sub(r'\(\s*[a-z][a-z0-9\-]*\s*\)', '', text)  # (minizip), (project)
        text = re.sub(r'\(\s*(?:Inc|Ltd|Co|Corp|LLC|GmbH|Foundation)\.?\s*\)', '', text, flags=re.I)
        text = re.sub(r'https?://[^\s]+', '', text)
        # E-mail addresses.
        return re.sub(r'\b[\w.-]+@[\w.-]+\b', '', text)

    @classmethod
    def _normalize_text(cls, text: str) -> str:
        """Normalize text by removing special characters and extra spaces."""
        # Keep word characters, whitespace and the punctuation - . , & ( ).
        text = re.sub(r'[^\w\s\-.,&()]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        # Drop parentheses left empty by earlier cleaning, then re-collapse.
        text = re.sub(r'\(\s*\)', '', text)
        return re.sub(r'\s+', ' ', text).strip()

    @classmethod
    def _split_and_filter_holders(cls, text: str) -> List[str]:
        """Split holder text and filter out invalid parts.

        The text is split on ",", ";", "&" and the word "and"; each part is
        trimmed of a leading/trailing connector and kept unless
        ``_is_invalid_holder_part`` rejects it.
        """
        holders = []
        parts = re.split(r'[,;]|\s+and\s+|&', text, flags=re.I)

        for part in parts:
            # "and" is only stripped as a whole word: without the boundaries
            # a case-insensitive match would eat the start of "Anderson" or
            # the end of "Holland".
            part = re.sub(
                r'^\s*(?:and\b|&|,|\.)\s*|\s*(?:\band|&|,|\.)\s*$',
                '',
                part,
                flags=re.I
            )
            part = re.sub(r'\(\s*\)', '', part)
            part = part.strip()

            if part and not cls._is_invalid_holder_part(part):
                holders.append(part)

        return holders

    @classmethod
    def _is_invalid_holder_part(cls, part: str) -> bool:
        """Check if a holder part should be filtered out.

        A part is rejected when it contains, as a whole word and ignoring
        case, any of the listed words or phrases.
        """
        return bool(re.search(
            r'\b(?:modification|project|info|read|support|unzip|zip|license|version|'
            r'part of|conditions|distribution|use|see|notice|rights reserved|developer|'
            r'maintainer|author|team)\b',
            part, re.I
        ))

    @classmethod
    def _format_final_holder(cls, holder: str) -> str:
        """Apply final formatting to holder string.

        Removes a trailing dash, leftover "http..." tokens, a trailing comma
        and a trailing "and".
        """
        holder = re.sub(r'\s*[-–—]\s*$', '', holder)
        holder = re.sub(r'\s*[-–—]\s*http\S*$', '', holder)
        holder = re.sub(r'\s+http\S+', '', holder)
        holder = re.sub(r'\s*,\s*$', '', holder).strip()
        return re.sub(r'\s+and\s+$', '', holder, flags=re.I).strip()


class LicenseDetector:
    """Detect licenses from SPDX tags and from characteristic license phrases."""

    # License id -> regexes (searched case-insensitively) whose presence
    # suggests that license.
    LICENSE_PATTERNS = {
        'Apache-2.0': [
            r'Apache License[\s,]+Version 2\.0',
            r'http://www\.apache\.org/licenses/LICENSE-2\.0',
            r'ASF 2\.0'
        ],
        'MIT': [
            r'\bMIT (?:License|Permission)\b',
            r'Permission is hereby granted,? free of charge',
            r'THE SOFTWARE IS PROVIDED "AS IS"'
        ],
        'GPL-3.0': [
            r'GNU GENERAL PUBLIC LICENSE[\s,]+Version 3',
            r'\bGPLv3\b',
            r'https?://www\.gnu\.org/licenses/gpl-3\.0'
        ],
        'BSD-3-Clause': [
            r'Redistribution and use in source and binary forms',
            r'BSD 3-Clause(?: License)?'
        ],
        'ISC': [
            r'\bISC License\b',
            r'Permission to use, copy, modify, and distribute this software'
        ],
        'MPL-2.0': [
            r'http://mozilla\.org/MPL/2\.0/',
            r'Mozilla Public License[\s,]+Version\s+2\.0',
            r'This Source Code Form is subject to the terms of the Mozilla Public License.*?Version.*?2\.0'
        ]
    }

    # Lower-cased file names that LicenseFileScanner treats as license files.
    LICENSE_FILE_NAMES = {
        "license", "copying", "notice",
        "license.txt", "copying.txt", "notice.txt",
        "license.md", "copying.md", "notice.md"
    }

    _SPDX_TAG_PATTERN = re.compile(r'SPDX-License-Identifier:\s*([^\n]+)', re.IGNORECASE)
    # Closers of one-line block comments ("/* ... */", "<!-- ... -->") that
    # would otherwise be captured as part of the expression.
    _COMMENT_CLOSER_PATTERN = re.compile(r'\s*(?:\*/|-->)\s*$')

    def __init__(self):
        self.licensing = get_spdx_licensing()

    @classmethod
    def _spdx_expression(cls, text: str) -> str:
        """Return the expression of the first SPDX tag in ``text``, or "" if there is none."""
        match = cls._SPDX_TAG_PATTERN.search(text)
        if not match:
            return ""
        return cls._COMMENT_CLOSER_PATTERN.sub('', match.group(1)).strip()

    def detect_licenses(self, texts: List[str]) -> List[str]:
        """Return the sorted license ids found in ``texts``.

        For each text the first SPDX-License-Identifier tag is parsed and its
        license symbols collected; in addition every id in
        ``LICENSE_PATTERNS`` is added when any of its patterns matches any
        text. Entries that are empty or not strings are ignored.
        """
        usable_texts = [text for text in texts if text and isinstance(text, str)]
        found_licenses = set()

        for text in usable_texts:
            expression = self._spdx_expression(text)
            if not expression:
                continue
            try:
                parsed_license = self.licensing.parse(expression)
                found_licenses.update(
                    str(s) for s in parsed_license.objects if isinstance(s, LicenseSymbol)
                )
            except Exception:
                # Best effort: a malformed tag must not abort detection, the
                # phrase patterns below still get their chance.
                logger.debug(
                    "Failed to parse SPDX license identifier: '%s' in text snippet: '%s...'",
                    expression,
                    text[:50],
                    exc_info=True
                )

        for license_type, patterns in self.LICENSE_PATTERNS.items():
            if any(
                re.search(pattern, text, re.IGNORECASE)
                for text in usable_texts
                for pattern in patterns
            ):
                found_licenses.add(license_type)

        return sorted(found_licenses)

    def identify_license(self, text: str) -> Tuple[str, float]:
        """Return ``(license_id, confidence)`` for a single license text.

        A parsable SPDX-License-Identifier tag wins with confidence 1.0.
        Otherwise the id from ``LICENSE_PATTERNS`` with the highest fraction
        of matching patterns is returned (first one wins ties), or
        ``(NOASSERTION, 0.0)`` when nothing matches or ``text`` is not a
        string.
        """
        if not isinstance(text, str):
            return NOASSERTION, 0.0

        expression = self._spdx_expression(text)
        if expression:
            try:
                parsed_license = self.licensing.parse(expression)
                # Compare against None instead of truth-testing: expression
                # objects from the boolean-expression library may refuse
                # bool() with a TypeError, which would silently disable this
                # branch.
                if parsed_license is not None:
                    return str(parsed_license), 1.0
            except Exception:
                # Best effort: fall back to phrase matching below.
                logger.debug(
                    "Failed to parse SPDX license identifier: '%s'",
                    expression,
                    exc_info=True
                )

        best_match = (NOASSERTION, 0.0)
        for license_id, patterns in self.LICENSE_PATTERNS.items():
            matched = sum(1 for pattern in patterns if re.search(pattern, text, re.IGNORECASE))
            if matched > 0:
                confidence = matched / len(patterns)
                if confidence > best_match[1]:
                    best_match = (license_id, confidence)

        return best_match


class FileScanner:
    """Scan a single file for license ids and copyright statements."""

    def __init__(self):
        self.license_detector = LicenseDetector()

    def scan(self, file_path: Union[str, Path]) -> Dict:
        """Scan ``file_path`` and return its licenses, copyrights and content.

        The path is resolved through ``LocalResourceLoader.to_local_path``.
        When the resolved path is not a file, an empty result with
        content_type "NOASSERTION" is returned. Otherwise the text is loaded
        with ``max_bytes=8192`` and run through the license and copyright
        detectors.

        Returns a dict with keys: path, licenses, copyrights, content_type,
        content.
        """
        path = Path(file_path)
        real_path = LocalResourceLoader.to_local_path(file_path)
        if not Path(real_path).is_file():
            # NOTE(review): this branch reports the resolved path while the
            # success branch reports the path as given - confirm with callers
            # which one is intended before unifying.
            return {
                "path": str(real_path),
                "licenses": [],
                "copyrights": [],
                "content_type": "NOASSERTION",
                "content": ""
            }
        content = LocalResourceLoader.load_text_file(real_path, max_bytes=8192)
        licenses = self.license_detector.detect_licenses([content])
        copyrights = CopyrightDetector.find_copyrights([content])

        return {
            "path": str(path),
            "licenses": licenses,
            "copyrights": copyrights,
            "content_type": "Text",
            "content": content
        }


class LicenseFileScanner:
    """Find and analyse license files located directly inside a directory."""

    def __init__(self):
        self.license_detector = LicenseDetector()

    @staticmethod
    def _empty_result(path: Path) -> Dict:
        """Return the result reported for a license file that cannot be read."""
        return {
            "path": str(path),
            "license_type": "NOASSERTION",
            "license_text": "",
            "copyrights": [],
            "confidence": 0.0
        }

    def scan(self, directory: Union[str, Path]) -> List[Dict]:
        """Scan the files directly inside ``directory`` whose names are license file names.

        The directory is resolved through
        ``LocalResourceLoader.to_local_path``; a path that is not a directory
        yields an empty list. Sub-directories are not descended into. Entries
        are visited in sorted order so the result order is deterministic.
        """
        directory = Path(LocalResourceLoader.to_local_path(directory))
        if not directory.is_dir():
            return []

        return [
            self.scan_license_file(item)
            for item in sorted(directory.iterdir())
            if item.is_file() and item.name.lower() in LicenseDetector.LICENSE_FILE_NAMES
        ]

    def scan_license_file(self, file_path: Union[str, Path]) -> Dict:
        """Identify the license and copyrights of one license file.

        Returns a dict with keys: path, license_type, license_text,
        copyrights, confidence. A path that is not a file, or a file whose
        read raises ``OSError``, yields the empty result (license_type
        "NOASSERTION", confidence 0.0).
        """
        path = Path(file_path)
        if not path.is_file():
            return self._empty_result(path)

        try:
            content = LocalResourceLoader.load_text_file(file_path)
        except OSError as e:
            logger.debug("Skipping file '%s': read failed (%s)", file_path, e)
            return self._empty_result(path)

        license_type, confidence = self.license_detector.identify_license(content)
        copyrights = CopyrightDetector.find_copyrights([content])

        return {
            "path": str(path),
            "license_type": license_type,
            "license_text": content,
            "copyrights": copyrights,
            "confidence": confidence
        }