• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4#
5# Copyright (c) 2025 Northeastern University
6# Licensed under the Apache License, Version 2.0 (the "License");
7# you may not use this file except in compliance with the License.
8# You may obtain a copy of the License at
9#
10#     http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17#
18
19import json
20import mimetypes
21import os
22import re
23from collections import OrderedDict
24from pathlib import Path
25from typing import Any, Tuple, List
26from typing import Union
27from urllib.parse import urlparse
28
29from packageurl import PackageURL
30
31
def read_json(path: Union[str, Path]) -> Any:
    """
    Load the JSON document stored at ``path`` and return the decoded object.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
38
39
def write_json(data: Any, file_path: str, *, indent: bool = True) -> None:
    """
    Dump ``data`` as JSON into ``file_path`` (UTF-8, non-ASCII kept as-is).

    With ``indent`` enabled the output is pretty-printed using four spaces
    and keys are sorted; otherwise a compact single-line document is written
    with keys in their original order.
    """
    dump_options = {"ensure_ascii": False}
    if indent:
        dump_options.update(indent=4, sort_keys=True)

    with open(file_path, "w", encoding="utf-8") as handle:
        json.dump(data, handle, **dump_options)
49
50
def remove_empty(obj: Any) -> Any:
    """
    Return a copy of ``obj`` with empty entries pruned from dicts and lists.

    A value counts as empty when it equals one of: None, "", [], {}, ().
    Dicts (including OrderedDict) come back as OrderedDict, lists come back
    as lists, and any other object is returned untouched. Containers are
    pruned depth-first, so a container that becomes empty after pruning is
    itself dropped from its parent.
    """
    empties = (None, "", [], {}, ())

    if isinstance(obj, list):
        kept_items = []
        for element in obj:
            pruned = remove_empty(element)
            if pruned not in empties:
                kept_items.append(pruned)
        return kept_items

    if isinstance(obj, dict):
        kept_pairs = OrderedDict()
        for key, value in obj.items():
            pruned = remove_empty(value)
            if pruned in empties:
                continue
            kept_pairs[key] = pruned
        return kept_pairs

    # Scalars and other types (tuples included) are passed through as-is
    return obj
70
71
def generate_purl(pkg_type: str, namespace: str, name: str,
                  version: str = None, qualifiers: dict = None,
                  subpath: str = None) -> str:
    """
    Assemble a Package URL (purl) string from its individual components.

    The components are handed to ``PackageURL`` and the result of its
    ``to_string()`` method is returned. Any exception raised while building
    or serializing the purl is re-raised as ``ValueError`` with the original
    exception attached as its cause.
    """
    purl_fields = {
        "type": pkg_type,
        "namespace": namespace,
        "name": name,
        "version": version,
        "qualifiers": qualifiers,
        "subpath": subpath,
    }
    try:
        return PackageURL(**purl_fields).to_string()
    except Exception as exc:
        raise ValueError(f"Invalid PURL fields: {exc}") from exc
90
91
# Ordered matching rules: (pattern, purl_type, is_regex). The first rule that
# matches wins, so hosting platforms take precedence over file-extension and
# path heuristics. Plain patterns are substring tests; regex patterns are
# applied with re.search against the lower-cased, stripped URL.
_PURL_TYPE_RULES: Tuple[Tuple[str, str, bool], ...] = (
    # Hosting platforms
    ("github.com", "github", False),
    ("gitlab.com", "gitlab", False),
    ("gitlab", "gitlab", True),
    ("gitee.com", "gitee", False),
    ("gitcode.net", "gitcode", False),
    # gitcode.com is the host commit_url_of() recognizes; keep both in sync
    ("gitcode.com", "gitcode", False),
    ("bitbucket.org", "bitbucket", False),

    # Package types (based on extensions/paths)
    (r"\.src\.rpm$", "rpm", True),
    (r"\.rpm$", "rpm", True),
    (r"\.deb$", "deb", True),
    (r"\.whl$", "pypi", True),
    (r"/pypi/", "pypi", True),
    (r"\.jar$", "maven", True),
    (r"/maven2/", "maven", True),
    (r"\.gem$", "gem", True),
    (r"\.git$", "git", True),

    # Generic source packages
    (r"\.tar\.gz$", "generic", True),
    (r"\.tgz$", "generic", True),
    (r"\.zip$", "generic", True),
    (r"\.tar\.bz2$", "generic", True),
)


def get_purl_type_from_url(url: str) -> str:
    """
    Infer purl type from URL based on predefined rules.

    The URL is lower-cased and stripped of surrounding whitespace, then
    checked against ``_PURL_TYPE_RULES`` in order; the purl type of the first
    matching rule is returned.

    Returns "generic" when ``url`` is empty, is not a string, or matches no
    rule.
    """
    if not url or not isinstance(url, str):
        return "generic"

    url_lower = url.lower().strip()

    for pattern, purl_type, is_regex in _PURL_TYPE_RULES:
        if is_regex:
            matched = re.search(pattern, url_lower) is not None
        else:
            matched = pattern in url_lower
        if matched:
            return purl_type

    return "generic"
138
139
def is_text_file(path: str) -> bool:
    """
    Report whether ``path`` is an existing regular file with a text MIME type.

    The MIME type is guessed by ``mimetypes.guess_type`` from the path; file
    contents are not read. Paths that are not regular files, and files
    whose type cannot be guessed, yield False.
    """
    if os.path.isfile(path):
        guessed_type = mimetypes.guess_type(path)[0]
        if guessed_type is not None:
            # Only the "text/*" family counts as text
            return guessed_type.startswith("text/")
    return False
156
157
def commit_url_of(url: str, commit_id: str) -> str:
    """
    Generate web URL for browsing a repository at a specific commit.

    For hosts whose name contains ``gitee.com``, ``github.com`` or
    ``gitcode.com`` the result is
    ``https://<host>/<owner>/<repo>/tree/<commit_id>``, where owner and repo
    are the first two path segments of ``url``, a trailing ``.git`` is
    removed from the repo segment, and ``commit_id`` is stripped of
    surrounding whitespace.

    The whitespace-stripped input URL (or "" when ``url`` is empty/None) is
    returned unchanged when: ``commit_id`` is empty or contains no
    hexadecimal character, the URL cannot be parsed, its path has no usable
    owner/repo pair, or the host is not one of the recognized platforms.
    """
    # Preserve original input for fallback
    original_url = url.strip() if url else ""
    if not original_url or not commit_id:
        return original_url

    # Validity gate: the id must contain at least one hex character.
    # NOTE(review): the id is inserted into the URL as given rather than
    # reduced to its hex characters, since that would mangle refs such as
    # "v1.2.3" -- confirm this is the intended contract.
    if not re.search(r'[a-fA-F0-9]', commit_id):
        return original_url
    ref = commit_id.strip()

    try:
        parsed = urlparse(original_url)
        host = parsed.netloc.lower()
        path = parsed.path.strip("/")
    except (ValueError, TypeError, AttributeError):
        # Fallback to original URL when the input cannot be parsed
        return original_url

    # Require at least owner/repo in path
    parts = path.split("/")
    if len(parts) < 2:
        return original_url

    owner, repo = parts[0], parts[1]
    # Clone URLs end in ".git"; the web UI path does not carry that suffix
    if repo.lower().endswith(".git"):
        repo = repo[:-len(".git")]
    if not repo:
        return original_url

    # All supported platforms share the same /tree/<ref> layout
    supported_hosts = ("gitee.com", "github.com", "gitcode.com")
    if not any(known in host for known in supported_hosts):
        return original_url

    return f"https://{host}/{owner}/{repo}/tree/{ref}"
197