#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2024 Huawei Device Co., Ltd.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import sys
import json
import pandas as pd

# Everything except ASCII letters, digits and spaces is dropped during
# normalization. Compiled once because it is applied to every key and cell.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9 ]')


class SPDXLicenseMatcher:
    """Match license names from an Excel sheet against an SPDX JSON mapping.

    The Excel file is loaded into ``self.df`` and the JSON file into
    ``self.spdx_mapping`` (normalized license name -> mapped value).
    """

    def __init__(self, input_excel_path, input_json_path):
        """Load the Excel sheet and the SPDX JSON mapping.

        Args:
            input_excel_path: Path of the Excel file read with pandas.
            input_json_path: Path of the JSON file holding the SPDX mapping.
        """
        self.df = pd.read_excel(input_excel_path)
        self.spdx_mapping = self._load_spdx_data(input_json_path)

    @staticmethod
    def _load_spdx_data(json_path):
        """Load the SPDX JSON file and normalize its keys.

        Keys go through ``_normalize_license_name`` so that lookups use
        exactly the same normalization as the names being matched. If two
        keys normalize to the same text, the later one wins.

        Args:
            json_path: Path of a UTF-8 JSON file whose top level is an object.

        Returns:
            dict mapping normalized key -> original JSON value.
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            spdx_data = json.load(f)
        normalize = SPDXLicenseMatcher._normalize_license_name
        return {normalize(key): value for key, value in spdx_data.items()}

    @staticmethod
    def _normalize_license_name(name):
        """Return a case-insensitive, punctuation-free form of *name*.

        Non-alphanumeric characters (other than spaces) are removed, the
        text is lower-cased, and runs of whitespace are collapsed to a
        single space with leading/trailing whitespace stripped.
        """
        cleaned = _NON_ALNUM_RE.sub('', name).lower()
        # split()/join collapses internal runs of spaces and trims the ends,
        # so "MIT; Apache" style inputs still hit the exact-match path.
        return ' '.join(cleaned.split())

    def copy_url_column(self):
        """Copy the ``cc_url`` column into a new ``match_url`` column."""
        self.df['match_url'] = self.df['cc_url']

    def match_license_column(self):
        """Fill ``match_license`` by mapping ``spdx_fixed_license_name``."""
        self.df['match_license'] = self.df['spdx_fixed_license_name'].apply(self._map_license)

    def _map_license(self, license_names):
        """Map a ';'-separated list of license names to their SPDX values.

        Args:
            license_names: Cell value; normally a string such as
                ``"MIT License;Apache License 2.0"``. Non-string values
                (e.g. the NaN pandas uses for empty cells) are treated as
                having no license.

        Returns:
            The matched values joined by ';', or ``"No Match"`` when
            nothing matched.
        """
        # Empty Excel cells arrive as float NaN, which has no .split().
        if not isinstance(license_names, str):
            return "No Match"

        license_keys = [self._normalize_license_name(name) for name in license_names.split(';')]
        matched_licenses = [self._find_license_match(key) for key in license_keys]

        # Drop unmatched entries and join by ';' to mimic the input format.
        matched_licenses = [matched for matched in matched_licenses if matched]
        return ';'.join(matched_licenses) if matched_licenses else "No Match"

    def _find_license_match(self, key):
        """Look up a normalized license name in the SPDX mapping.

        An exact key match is tried first. Otherwise the first mapping key
        (in insertion order) that contains every whitespace-separated word
        of *key* is used. Note that the containment test is a substring
        test, so short words can match inside longer ones.

        Args:
            key: Normalized license name.

        Returns:
            The mapped value, or ``None`` when nothing matches.
        """
        if key in self.spdx_mapping:
            return self.spdx_mapping[key]

        words = key.split()
        # all() over zero words is vacuously True and would wrongly return
        # the first mapping entry for an empty name.
        if not words:
            return None

        for spdx_key, spdx_value in self.spdx_mapping.items():
            if all(word in spdx_key for word in words):
                return spdx_value
        return None

    def save_to_excel(self, output_excel_path):
        """Write the DataFrame to *output_excel_path* without the index."""
        self.df.to_excel(output_excel_path, index=False)
        print(f"[INFO] Final processed results saved to {output_excel_path}")


def main(input_excel_path, input_json_path, output_excel_path):
    """Run the matching pipeline: load inputs, match licenses, save output.

    Args:
        input_excel_path: Excel file containing ``spdx_fixed_license_name``.
        input_json_path: JSON file with the SPDX name mapping.
        output_excel_path: Destination Excel file.
    """
    matcher = SPDXLicenseMatcher(input_excel_path, input_json_path)
    # NOTE: matcher.copy_url_column() is currently disabled.
    matcher.match_license_column()
    matcher.save_to_excel(output_excel_path)


if __name__ == "__main__":
    # Extra trailing arguments are ignored, as before; missing ones now get
    # a usage message instead of a bare IndexError.
    if len(sys.argv) < 4:
        sys.exit(f"Usage: {sys.argv[0]} <input_excel> <spdx_json> <output_excel>")
    main(sys.argv[1], sys.argv[2], sys.argv[3])