sbom/common/utils.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright (c) 2025 Northeastern University
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json
import mimetypes
import os
import re
from collections import OrderedDict
from pathlib import Path
from typing import Any, Tuple, List
from typing import Union
from urllib.parse import urlparse

from packageurl import PackageURL


def read_json(path: Union[str, Path]) -> Any:
    """
    Read and parse a JSON file from the specified path.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def write_json(data: Any, file_path: str, *, indent: bool = True) -> None:
    """
    Serialize Python object to JSON and write to file.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        if indent:
            json.dump(data, f, indent=4, sort_keys=True, ensure_ascii=False)
        else:
            json.dump(data, f, ensure_ascii=False)


def remove_empty(obj: Any) -> Any:
    """
    Recursively remove empty values from nested dicts/lists.
    Removes: None, "", [], {}, ()
    """
    if isinstance(obj, (dict, OrderedDict)):
        cleaned = OrderedDict()
        for k, v in obj.items():
            cleaned_value = remove_empty(v)
            # Only keep non-empty values
            if cleaned_value not in (None, "", [], {}, ()):
                cleaned[k] = cleaned_value
        return cleaned
    elif isinstance(obj, list):
        cleaned_list = [remove_empty(item) for item in obj]
        # Filter out empty items
        return [item for item in cleaned_list if item not in (None, "", [], {}, ())]
    else:
        return obj


def generate_purl(pkg_type: str, namespace: str, name: str,
                  version: str = None, qualifiers: dict = None,
                  subpath: str = None) -> str:
    """
    Generate standard Package URL (purl) string.
    """
    try:
        purl = PackageURL(
            type=pkg_type,
            namespace=namespace,
            name=name,
            version=version,
            qualifiers=qualifiers,
            subpath=subpath
        )
        return purl.to_string()
    except Exception as e:
        raise ValueError(f"Invalid PURL fields: {e}") from e


def get_purl_type_from_url(url: str) -> str:
    """
    Infer purl type from URL based on predefined rules.
    """
    if not url or not isinstance(url, str):
        return "generic"

    url_lower = url.lower().strip()

    # Define matching rules: (pattern, purl_type, is_regex)
    rules: List[Tuple[str, str, bool]] = [
        # Hosting platforms (exact domain matches first)
        ("github.com", "github", False),
        ("gitlab.com", "gitlab", False),
        ("gitlab", "gitlab", True),
        ("gitee.com", "gitee", False),
        ("gitcode.net", "gitcode", False),
        ("bitbucket.org", "bitbucket", False),

        # Package types (based on extensions/paths)
        (r"\.src\.rpm$", "rpm", True),
        (r"\.rpm$", "rpm", True),
        (r"\.deb$", "deb", True),
        (r"\.whl$", "pypi", True),
        (r"/pypi/", "pypi", True),
        (r"\.jar$", "maven", True),
        (r"/maven2/", "maven", True),
        (r"\.gem$", "gem", True),
        (r"\.git$", "git", True),

        # Generic source packages
        (r"\.tar\.gz$", "generic", True),
        (r"\.tgz$", "generic", True),
        (r"\.zip$", "generic", True),
        (r"\.tar\.bz2$", "generic", True),
    ]

    for pattern, purl_type, is_regex in rules:
        if is_regex:
            if re.search(pattern, url_lower):
                return purl_type
        else:
            if pattern in url_lower:
                return purl_type

    return "generic"


def is_text_file(path: str) -> bool:
    """
    Determine if file is text-based using mimetype detection.
    """
    if not os.path.isfile(path):
        return False

    # Guess MIME type from extension
    mime_type, _ = mimetypes.guess_type(path)

    # Default to False for unknown types
    if mime_type is None:
        return False

    # Check if MIME type indicates text
    return mime_type.startswith("text/")


def commit_url_of(url: str, commit_id: str) -> str:
    """
    Generate web URL for viewing a specific commit in code hosting platforms.
    """
    # Preserve original input for fallback
    original_url = url.strip() if url else ""
    if not original_url or not commit_id:
        return original_url

    # Clean commit ID (remove non-hex chars)
    cleaned_commit = re.sub(r'[^a-fA-F0-9]', '', commit_id)
    if not cleaned_commit:
        return original_url
    try:
        parsed = urlparse(original_url)
        host = parsed.netloc.lower()
        path = parsed.path.strip("/")

        # Require at least owner/repo in path
        parts = path.split("/")
        if len(parts) < 2:
            return original_url

        owner, repo = parts[0], parts[1]
        base_url = f"https://{host}/{owner}/{repo}"

        # Generate platform-specific commit URLs
        if "gitee.com" in host:
            return f"{base_url}/tree/{commit_id}"
        elif "github.com" in host:
            return f"{base_url}/tree/{commit_id}"
        elif "gitcode.com" in host:
            return f"{base_url}/tree/{commit_id}"
        else:
            return original_url

    except Exception:
        # Fallback to original URL on any parsing error
        return original_url