#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright (c) 2025 Northeastern University
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json
import mimetypes
import os
import re
from collections import OrderedDict
from pathlib import Path
from typing import Any, List, Optional, Tuple, Union
from urllib.parse import urlparse

from packageurl import PackageURL

# Ordered matching rules for get_purl_type_from_url: (pattern, purl_type, is_regex).
# Rules are evaluated top to bottom and the first match wins, so hosting
# platforms take precedence over file-extension based package types.
_PURL_TYPE_RULES: List[Tuple[str, str, bool]] = [
    # Hosting platforms (exact domain matches first)
    ("github.com", "github", False),
    ("gitlab.com", "gitlab", False),
    ("gitlab", "gitlab", True),
    ("gitee.com", "gitee", False),
    ("gitcode.net", "gitcode", False),
    ("bitbucket.org", "bitbucket", False),

    # Package types (based on extensions/paths)
    (r"\.src\.rpm$", "rpm", True),
    (r"\.rpm$", "rpm", True),
    (r"\.deb$", "deb", True),
    (r"\.whl$", "pypi", True),
    (r"/pypi/", "pypi", True),
    (r"\.jar$", "maven", True),
    (r"/maven2/", "maven", True),
    (r"\.gem$", "gem", True),
    (r"\.git$", "git", True),

    # Generic source packages
    (r"\.tar\.gz$", "generic", True),
    (r"\.tgz$", "generic", True),
    (r"\.zip$", "generic", True),
    (r"\.tar\.bz2$", "generic", True),
]

# Host fragments for which commit_url_of knows how to build a browse URL.
_COMMIT_VIEW_HOSTS: Tuple[str, ...] = ("gitee.com", "github.com", "gitcode.com")


def read_json(path: Union[str, Path]) -> Any:
    """
    Read and parse a JSON file from the specified path.

    The file is opened as UTF-8 text. Errors raised by ``open`` or
    ``json.load`` (missing file, malformed JSON) propagate to the caller.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def write_json(data: Any, file_path: Union[str, Path], *, indent: bool = True) -> None:
    """
    Serialize Python object to JSON and write to file.

    The file is written as UTF-8 and non-ASCII characters are emitted as-is
    (``ensure_ascii=False``). With ``indent=True`` (default) the output uses a
    4-space indent and sorted keys; otherwise it is written compactly and the
    key order of ``data`` is preserved.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        if indent:
            json.dump(data, f, indent=4, sort_keys=True, ensure_ascii=False)
        else:
            json.dump(data, f, ensure_ascii=False)


def remove_empty(obj: Any) -> Any:
    """
    Recursively remove empty values from nested dicts/lists.
    Removes: None, "", [], {}, ()

    Dicts (including OrderedDict) are returned as a new ``OrderedDict`` and
    lists as a new ``list``; any other value is returned unchanged. Values
    such as ``0`` and ``False`` are kept because they do not compare equal to
    any of the empty markers. Tuples are not descended into.
    """
    # OrderedDict is a dict subclass, so a single isinstance check covers both.
    if isinstance(obj, dict):
        cleaned = OrderedDict()
        for key, value in obj.items():
            cleaned_value = remove_empty(value)
            # Only keep values that are still non-empty after cleaning
            if cleaned_value not in (None, "", [], {}, ()):
                cleaned[key] = cleaned_value
        return cleaned

    if isinstance(obj, list):
        cleaned_items = (remove_empty(item) for item in obj)
        # Drop items that became (or already were) empty
        return [item for item in cleaned_items if item not in (None, "", [], {}, ())]

    return obj


def generate_purl(pkg_type: str, namespace: str, name: str,
                  version: Optional[str] = None, qualifiers: Optional[dict] = None,
                  subpath: Optional[str] = None) -> str:
    """
    Generate standard Package URL (purl) string.

    The arguments are passed straight to ``PackageURL`` and the result of
    ``to_string()`` is returned.

    Raises:
        ValueError: if constructing or serializing the PackageURL fails for
            any reason; the original exception is chained as the cause.
    """
    try:
        purl = PackageURL(
            type=pkg_type,
            namespace=namespace,
            name=name,
            version=version,
            qualifiers=qualifiers,
            subpath=subpath,
        )
        return purl.to_string()
    except Exception as e:
        raise ValueError(f"Invalid PURL fields: {e}") from e


def get_purl_type_from_url(url: str) -> str:
    """
    Infer purl type from URL based on predefined rules.

    The URL is lower-cased and stripped, then checked against
    ``_PURL_TYPE_RULES`` in order; the first matching rule decides the type.
    Returns "generic" for empty or non-string input and when no rule matches.
    """
    if not url or not isinstance(url, str):
        return "generic"

    url_lower = url.lower().strip()

    for pattern, purl_type, is_regex in _PURL_TYPE_RULES:
        if is_regex:
            matched = re.search(pattern, url_lower) is not None
        else:
            matched = pattern in url_lower
        if matched:
            return purl_type

    return "generic"


def is_text_file(path: str) -> bool:
    """
    Determine if file is text-based using mimetype detection.

    The MIME type is guessed from the file name via ``mimetypes.guess_type``;
    the file contents are not read. Returns False when ``path`` is not an
    existing regular file or when the type cannot be guessed.
    """
    if not os.path.isfile(path):
        return False

    # Guess MIME type from extension
    mime_type, _ = mimetypes.guess_type(path)

    # Unknown types are treated as non-text
    return mime_type is not None and mime_type.startswith("text/")


def commit_url_of(url: str, commit_id: str) -> str:
    """
    Generate web URL for viewing a specific commit in code hosting platforms.

    For hosts containing "gitee.com", "github.com" or "gitcode.com" the result
    is ``https://<host>/<owner>/<repo>/tree/<commit>``, where owner and repo
    are the first two path segments of ``url`` and ``<commit>`` is
    ``commit_id`` with every non-hexadecimal character removed.

    The stripped input URL is returned unchanged when the URL or commit id is
    empty, the commit id contains no hex characters, the URL cannot be parsed,
    the path has fewer than two segments, or the host is not recognized.
    """
    # Preserve original input for fallback
    original_url = url.strip() if url else ""
    if not original_url or not commit_id:
        return original_url

    # Clean commit ID (remove non-hex chars such as whitespace or newlines)
    cleaned_commit = re.sub(r"[^a-fA-F0-9]", "", commit_id)
    if not cleaned_commit:
        return original_url

    try:
        parsed = urlparse(original_url)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. a broken IPv6 literal)
        return original_url

    host = parsed.netloc.lower()

    # Require at least owner/repo in path
    parts = parsed.path.strip("/").split("/")
    if len(parts) < 2:
        return original_url
    owner, repo = parts[0], parts[1]

    if any(marker in host for marker in _COMMIT_VIEW_HOSTS):
        # Use the sanitized id so stray characters never end up in the URL.
        return f"https://{host}/{owner}/{repo}/tree/{cleaned_commit}"

    return original_url