1 // Copyright (C) 2025 The Android Open Source Project 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 use std::{ 16 collections::{BTreeMap, BTreeSet}, 17 ffi::OsString, 18 path::Path, 19 sync::LazyLock, 20 }; 21 22 use itertools::Itertools; 23 use spdx::Licensee; 24 use textdistance::str::ratcliff_obershelp; 25 26 use crate::{ 27 license_data::{CRATE_LICENSE_SPECIAL_CASES, LICENSES, LICENSE_PREFERENCE}, 28 util::{normalize_filename, strip_punctuation}, 29 CrateLicenseSpecialCase, CrateLicenseSpecialCases, Error, License, LicenseTerms, ParsedLicense, 30 }; 31 32 #[derive(Debug)] 33 pub(crate) struct Licenses { 34 licenses: BTreeMap<Licensee, ParsedLicense>, 35 license_preference: Vec<Licensee>, 36 crate_license_special_cases: CrateLicenseSpecialCases, 37 license_file_names: BTreeMap<OsString, Licensee>, 38 } 39 40 impl Licenses { new( raw_licenses: &'static [License], license_preference: &[&str], crate_license_special_cases: &'static [CrateLicenseSpecialCase], ) -> Result<Licenses, Error>41 fn new( 42 raw_licenses: &'static [License], 43 license_preference: &[&str], 44 crate_license_special_cases: &'static [CrateLicenseSpecialCase], 45 ) -> Result<Licenses, Error> { 46 if raw_licenses.is_empty() { 47 return Err(Error::NoLicenses); 48 } 49 50 let mut licenses = BTreeMap::new(); 51 let mut license_file_names = BTreeMap::new(); 52 for license in raw_licenses { 53 let parsed = ParsedLicense::try_from(license)?; 54 let licensee = parsed.licensee().clone(); 55 for file_name in parsed.file_names() { 56 if let Some(other) = license_file_names.insert(file_name.clone(), licensee.clone()) 57 { 58 return Err(Error::DuplicateLicenseFileName { 59 file_name: file_name.to_string_lossy().into_owned(), 60 license: parsed.licensee().to_string(), 61 other_license: other.to_string(), 62 }); 63 } 64 } 65 if licenses.insert(licensee.clone(), parsed).is_some() { 66 return Err(Error::DuplicateLicense(licensee.to_string())); 67 } 68 } 69 70 let mut ranked_licenses = Vec::new(); 71 for pref in license_preference { 72 let licensee = Licensee::parse(pref)?; 73 if !licenses.contains_key(&licensee) { 74 return Err(Error::LicensePreferenceForUnknownLicense(pref.to_string())); 75 } 76 ranked_licenses.push(licensee); 77 } 78 let unranked_licenses = licenses 79 .keys() 80 .filter_map(|l| if !ranked_licenses.contains(l) { Some(l.clone()) } else { None }) 81 .collect::<Vec<_>>(); 82 let license_preference = ranked_licenses.into_iter().chain(unranked_licenses).collect(); 83 84 let licenses = Licenses { 85 licenses, 86 license_preference, 87 crate_license_special_cases: crate_license_special_cases.try_into()?, 88 license_file_names, 89 }; 90 licenses.validate()?; 91 Ok(licenses) 92 } 93 validate(&self) -> Result<(), Error>94 fn validate(&self) -> Result<(), Error> { 95 for (licensee, license) in &self.licenses { 96 // The license text can't be a substring of any other license text. 97 for (other_licensee, other_license) in &self.licenses { 98 if licensee != other_licensee 99 && license 100 .processed_text() 101 .is_some_and(|text| other_license.is_substring_of(text)) 102 { 103 return Err(Error::AmbiguousLicenseText( 104 other_license.licensee().to_string(), 105 license.licensee().to_string(), 106 )); 107 } 108 } 109 } 110 111 Ok(()) 112 } 113 114 /// Evaluate the SPDX license expression from Cargo.toml for a given crate. 115 /// Slashes such as "MIT/Apache-2.0" are interpreted as OR. 116 /// A limited set of exceptions are applied for crates where the license terms are 117 /// known to be missing or incorrect. evaluate_crate_license( &self, crate_name: &str, cargo_toml_license: Option<&str>, ) -> Result<LicenseTerms, Error>118 pub fn evaluate_crate_license( 119 &self, 120 crate_name: &str, 121 cargo_toml_license: Option<&str>, 122 ) -> Result<LicenseTerms, Error> { 123 LicenseTerms::try_from( 124 self.crate_license_special_cases 125 .get_corrected_license(crate_name, cargo_toml_license)?, 126 &self.license_preference, 127 ) 128 } 129 classify_file_name(&self, file: impl AsRef<Path>) -> Option<&Licensee>130 pub fn classify_file_name(&self, file: impl AsRef<Path>) -> Option<&Licensee> { 131 self.license_file_names.get(&normalize_filename(file)) 132 } 133 134 /// Classify file contents by exact substring match on the license text. classify_file_contents(&self, contents: &str) -> BTreeSet<Licensee>135 pub fn classify_file_contents(&self, contents: &str) -> BTreeSet<Licensee> { 136 let contents = strip_punctuation(contents); 137 138 let mut matches = BTreeSet::new(); 139 for license in self.licenses.values() { 140 if license.is_substring_of(contents.as_str()) { 141 matches.insert(license.licensee().clone()); 142 } 143 } 144 matches 145 } 146 classify_file_contents_fuzzy(&self, contents: &str) -> Option<Licensee>147 pub fn classify_file_contents_fuzzy(&self, contents: &str) -> Option<Licensee> { 148 let contents = strip_punctuation(contents); 149 150 // Fuzzy match. This is expensive, so start with licenses that are closest in length to the file, 151 // and only return a single match at most. 152 for license in 153 self.licenses.values().filter(|l| l.processed_text().is_some()).sorted_by(|a, b| { 154 let mut ra = a.processed_text().unwrap().len() as f32 / contents.len() as f32; 155 let mut rb = b.processed_text().unwrap().len() as f32 / contents.len() as f32; 156 if ra > 1.0 { 157 ra = 1.0 / ra; 158 } 159 if rb > 1.0 { 160 rb = 1.0 / rb; 161 } 162 rb.partial_cmp(&ra).unwrap() 163 }) 164 { 165 if let Some(processed_text) = license.processed_text() { 166 let similarity = ratcliff_obershelp(contents.as_str(), processed_text); 167 if similarity > 0.95 { 168 return Some(license.licensee().clone()); 169 } 170 } 171 } 172 173 None 174 } 175 } 176 177 pub(crate) static LICENSE_DATA: LazyLock<Licenses> = LazyLock::new(|| { 178 Licenses::new(LICENSES, LICENSE_PREFERENCE, CRATE_LICENSE_SPECIAL_CASES).unwrap() 179 }); 180 181 #[cfg(test)] 182 mod tests { 183 use std::collections::BTreeSet; 184 185 use super::*; 186 187 #[test] static_data_sanity_test()188 fn static_data_sanity_test() { 189 assert_eq!(LICENSES.len(), LICENSE_DATA.licenses.len()); 190 assert_eq!(LICENSE_DATA.license_preference.len(), LICENSE_DATA.licenses.len()); 191 } 192 193 #[test] basic()194 fn basic() { 195 assert!(Licenses::new( 196 &[ 197 License { name: "Apache-2.0", text: None, file_names: &["LICENSE-APACHE"] }, 198 License { name: "MIT", text: None, file_names: &["LICENSE-MIT"] }, 199 License { name: "BSD-3-Clause", text: None, file_names: &["LICENSE-BSD-3-Clause"] }, 200 ], 201 &["Apache-2.0", "MIT"], 202 &[], 203 ) 204 .is_ok()); 205 } 206 207 #[test] no_licenses()208 fn no_licenses() { 209 assert!(matches!(Licenses::new(&[], &[], &[]), Err(Error::NoLicenses))); 210 } 211 212 #[test] duplicate_license()213 fn duplicate_license() { 214 assert!(matches!( 215 Licenses::new( 216 &[ 217 License { name: "MIT", text: None, file_names: &["LICENSE-foo"] }, 218 License { name: "MIT", text: None, file_names: &["LICENSE-bar"] } 219 ], 220 &[], 221 &[], 222 ), 223 Err(Error::DuplicateLicense(_)) 224 )); 225 } 226 227 #[test] license_text_substrings()228 fn license_text_substrings() { 229 assert!(matches!( 230 Licenses::new( 231 &[ 232 License { name: "Apache-2.0", text: Some("foo"), file_names: &[] }, 233 License { name: "MIT", text: Some("foobar"), file_names: &[] } 234 ], 235 &[], 236 &[], 237 ), 238 Err(Error::AmbiguousLicenseText(_, _,)) 239 )); 240 } 241 242 #[test] duplicate_license_file_names()243 fn duplicate_license_file_names() { 244 assert!(matches!( 245 Licenses::new( 246 &[ 247 License { name: "Apache-2.0", text: None, file_names: &["LICENSE"] }, 248 License { name: "MIT", text: None, file_names: &["LICENSE"] } 249 ], 250 &[], 251 &[], 252 ), 253 Err(Error::DuplicateLicenseFileName { file_name: _, license: _, other_license: _ }) 254 )); 255 } 256 257 #[test] unfindable_license_file()258 fn unfindable_license_file() { 259 assert!(matches!( 260 Licenses::new(&[License { name: "MIT", text: None, file_names: &["foo"] },], &[], &[],), 261 Err(Error::LicenseFileNotFindable(_, _)) 262 )); 263 } 264 265 #[test] preference_for_unknown_license()266 fn preference_for_unknown_license() { 267 assert!(matches!( 268 Licenses::new( 269 &[License { name: "MIT", text: None, file_names: &["LICENSE-MIT"] }], 270 &["foo"], 271 &[], 272 ), 273 Err(Error::LicenseParseError(_)) 274 )); 275 assert!(matches!( 276 Licenses::new( 277 &[License { name: "MIT", text: None, file_names: &["LICENSE-MIT"] }], 278 &["Apache-2.0"], 279 &[], 280 ), 281 Err(Error::LicensePreferenceForUnknownLicense(_)) 282 )); 283 } 284 285 #[test] evaluate_crate_license()286 fn evaluate_crate_license() { 287 let licenses = Licenses::new( 288 &[ 289 License { name: "Apache-2.0", text: None, file_names: &["LICENSE-APACHE"] }, 290 License { name: "MIT", text: None, file_names: &["LICENSE-MIT"] }, 291 ], 292 &["Apache-2.0", "MIT"], 293 &[], 294 ) 295 .unwrap(); 296 assert_eq!( 297 licenses.evaluate_crate_license("foo", Some("Apache-2.0 OR MIT")).unwrap(), 298 LicenseTerms { 299 required: BTreeSet::from([Licensee::parse("Apache-2.0").unwrap().into_req()]), 300 not_required: BTreeSet::from([Licensee::parse("MIT").unwrap().into_req()]) 301 } 302 ); 303 assert!( 304 matches!( 305 licenses.evaluate_crate_license("foo", Some("BSD-3-Clause")), 306 Err(Error::MinimizeError(_)) 307 ), 308 "Unknown license" 309 ); 310 assert!( 311 matches!( 312 licenses.evaluate_crate_license("foo", None), 313 Err(Error::MissingLicenseField(_)) 314 ), 315 "No license and no special case" 316 ); 317 } 318 } 319