• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# Copyright (c) 2024 Huawei Device Co., Ltd.
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16import re
17import sys
18import json
19import pandas as pd
20
class SPDXLicenseMatcher:
    """Match license names from an Excel sheet against an SPDX name mapping.

    The Excel file is loaded into ``self.df``; the JSON file is loaded into
    ``self.spdx_mapping``, a dict whose keys are normalized license names
    (see ``_normalize_license_name``) and whose values are whatever the JSON
    maps those names to (presumably SPDX identifiers -- verify against the
    JSON file; they must be strings because they are joined with ';').
    """

    # Value written to the result column when nothing could be matched.
    NO_MATCH = "No Match"

    def __init__(self, input_excel_path, input_json_path):
        """Load the input spreadsheet and the SPDX mapping.

        Args:
            input_excel_path: Path of the Excel file read with ``pd.read_excel``.
            input_json_path: Path of the JSON file holding a name -> value object.
        """
        self.df = pd.read_excel(input_excel_path)
        self.spdx_mapping = self._load_spdx_data(input_json_path)

    @staticmethod
    def _load_spdx_data(json_path):
        """Return the JSON mapping re-keyed by normalized license name.

        Keys go through the same normalization as the names looked up later,
        so both sides of the comparison are always in the same form. If two
        keys normalize to the same string, the later one wins.
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            spdx_data = json.load(f)
        normalize = SPDXLicenseMatcher._normalize_license_name
        return {normalize(key): value for key, value in spdx_data.items()}

    @staticmethod
    def _normalize_license_name(name):
        """Normalize a license name for comparison.

        Removes every character that is not an ASCII letter, digit or space,
        lower-cases the result, trims surrounding spaces and collapses runs
        of spaces, so ``" MIT  License "`` and ``"mit license"`` compare equal.
        """
        cleaned = re.sub(r'[^a-zA-Z0-9 ]', '', name).lower()
        return ' '.join(cleaned.split())

    def copy_url_column(self):
        """Copy the ``cc_url`` column into a new ``match_url`` column."""
        self.df['match_url'] = self.df['cc_url']

    def match_license_column(self):
        """Fill ``match_license`` by mapping each ``spdx_fixed_license_name`` cell."""
        self.df['match_license'] = self.df['spdx_fixed_license_name'].apply(self._map_license)

    def _map_license(self, license_names):
        """Map a ';'-separated list of license names to their matched values.

        Args:
            license_names: Cell content. Non-string values (for example the
                NaN pandas produces for an empty cell) are treated as having
                no license names.

        Returns:
            The matched values joined with ';' in input order, or
            ``"No Match"`` when nothing matched.
        """
        if not isinstance(license_names, str):
            return self.NO_MATCH

        matched = []
        for raw_name in license_names.split(';'):
            key = self._normalize_license_name(raw_name)
            if not key:
                # Empty segment (trailing or doubled ';'): nothing to look up.
                continue
            found = self._find_license_match(key)
            if found:
                matched.append(found)
        return ';'.join(matched) if matched else self.NO_MATCH

    def _find_license_match(self, key):
        """Return the mapping value for a normalized key, or ``None``.

        An exact key match is preferred. Otherwise the first mapping entry
        (in mapping order) whose key contains every word of ``key`` as a
        substring is returned.
        """
        if key in self.spdx_mapping:
            return self.spdx_mapping[key]

        words = key.split()
        if not words:
            # all() over no words is True, which would wrongly match the
            # first mapping entry; an empty key never matches anything.
            return None

        for spdx_key, spdx_value in self.spdx_mapping.items():
            if all(word in spdx_key for word in words):
                return spdx_value
        return None

    def save_to_excel(self, output_excel_path):
        """Write the DataFrame (without the index) to an Excel file and report it."""
        self.df.to_excel(output_excel_path, index=False)
        print(f"[INFO] Final processed results saved to {output_excel_path}")
73
74
75
def main(input_excel_path, input_json_path, output_excel_path):
    """Run the matching pipeline: load inputs, match licenses, save the result.

    Args:
        input_excel_path: Excel file to read.
        input_json_path: JSON file with the SPDX name mapping.
        output_excel_path: Excel file the processed sheet is written to.
    """
    pipeline = SPDXLicenseMatcher(input_excel_path, input_json_path)
    # The URL copy step (copy_url_column) is intentionally not run here.
    pipeline.match_license_column()
    pipeline.save_to_excel(output_excel_path)
81
if __name__ == "__main__":
    # Three positional arguments are required; fail with a usage message
    # instead of an IndexError traceback when any are missing. Extra
    # arguments are ignored.
    if len(sys.argv) < 4:
        sys.exit(
            f"Usage: {sys.argv[0]} <input_excel_path> <input_json_path> <output_excel_path>"
        )
    input_excel_path, input_json_path, output_excel_path = sys.argv[1:4]
    main(input_excel_path, input_json_path, output_excel_path)
87
88