1 use crate::{ 2 error::{ParseError, Reason}, 3 ExceptionId, LicenseId, 4 }; 5 6 /// Parsing configuration for SPDX expression 7 #[derive(Default, Copy, Clone)] 8 pub struct ParseMode { 9 /// The `AND`, `OR`, and `WITH` operators are required to be uppercase in 10 /// the SPDX spec, but enabling this option allows them to be lowercased 11 pub allow_lower_case_operators: bool, 12 /// Allows the use of `/` as a synonym for the `OR` operator. 13 /// 14 /// This also allows for not having whitespace between the `/` and the terms 15 /// on either side 16 pub allow_slash_as_or_operator: bool, 17 /// Allows some invalid/imprecise identifiers as synonyms for an actual 18 /// license identifier. 19 /// 20 /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for a list 21 /// of the current synonyms. Note that this list is not comprehensive but 22 /// can be expanded upon when invalid identifiers are found in the wild. 23 pub allow_imprecise_license_names: bool, 24 /// The various GPL licenses diverge from every other license in the SPDX 25 /// license list by having an `-or-later` variant that is used as a suffix 26 /// on a base license (eg. `GPL-3.0-or-later`) rather than the canonical 27 /// `GPL-3.0+`. 28 /// 29 /// This option just allows GPL licenses to be treated similarly to all of 30 /// the other SPDX licenses. 31 pub allow_postfix_plus_on_gpl: bool, 32 } 33 34 impl ParseMode { 35 /// Strict, specification compliant SPDX parsing. 36 /// 37 /// 1. Only license identifiers in the SPDX license list, or 38 /// Document/LicenseRef, are allowed. The license identifiers are also 39 /// case-sensitive. 40 /// 1. `WITH`, `AND`, and `OR` are the only valid operators 41 pub const STRICT: Self = Self { 42 allow_lower_case_operators: false, 43 allow_slash_as_or_operator: false, 44 allow_imprecise_license_names: false, 45 allow_postfix_plus_on_gpl: false, 46 }; 47 48 /// Allow non-conforming syntax for crates-io compatibility 49 /// 50 /// 1. Additional, invalid, identifiers are accepted and mapped to a correct 51 /// SPDX license identifier. 52 /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for the 53 /// list of additionally accepted identifiers and the license they 54 /// correspond to. 55 /// 1. `/` can by used as a synonym for `OR`, and doesn't need to be 56 /// separated by whitespace from the terms it combines 57 pub const LAX: Self = Self { 58 allow_lower_case_operators: true, 59 allow_slash_as_or_operator: true, 60 allow_imprecise_license_names: true, 61 allow_postfix_plus_on_gpl: true, 62 }; 63 } 64 65 /// A single token in an SPDX license expression 66 #[derive(Clone, Debug, PartialEq, Eq)] 67 pub enum Token<'a> { 68 /// A recognized SPDX license id 69 Spdx(LicenseId), 70 /// A `LicenseRef-` prefixed id, with an optional `DocumentRef-` 71 LicenseRef { 72 doc_ref: Option<&'a str>, 73 lic_ref: &'a str, 74 }, 75 /// A recognized SPDX exception id 76 Exception(ExceptionId), 77 /// A postfix `+` indicating "or later" for a particular SPDX license id 78 Plus, 79 /// A `(` for starting a group 80 OpenParen, 81 /// A `)` for ending a group 82 CloseParen, 83 /// A `WITH` operator 84 With, 85 /// An `AND` operator 86 And, 87 /// An `OR` operator 88 Or, 89 } 90 91 impl std::fmt::Display for Token<'_> { fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result92 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 93 std::fmt::Debug::fmt(self, f) 94 } 95 } 96 97 impl Token<'_> { len(&self) -> usize98 fn len(&self) -> usize { 99 match self { 100 Token::Spdx(id) => id.name.len(), 101 Token::Exception(e) => e.name.len(), 102 Token::With => 4, 103 Token::And => 3, 104 Token::Or => 2, 105 Token::Plus | Token::OpenParen | Token::CloseParen => 1, 106 Token::LicenseRef { doc_ref, lic_ref } => { 107 doc_ref.map_or(0, |d| { 108 // +1 is for the `:` 109 "DocumentRef-".len() + d.len() + 1 110 }) + "LicenseRef-".len() 111 + lic_ref.len() 112 } 113 } 114 } 115 } 116 117 /// Allows iteration through an SPDX license expression, yielding 118 /// a token or a `ParseError`. 119 /// 120 /// Prefer to use `Expression::parse` or `Licensee::parse` rather 121 /// than directly using the lexer 122 pub struct Lexer<'a> { 123 inner: &'a str, 124 original: &'a str, 125 offset: usize, 126 mode: ParseMode, 127 } 128 129 impl<'a> Lexer<'a> { 130 /// Creates a Lexer over a license expression 131 #[must_use] new(text: &'a str) -> Self132 pub fn new(text: &'a str) -> Self { 133 Self { 134 inner: text, 135 original: text, 136 offset: 0, 137 mode: ParseMode::STRICT, 138 } 139 } 140 141 /// Creates a Lexer over a license expression 142 /// 143 /// With `ParseMode::Lax` it allows non-conforming syntax 144 /// used in crates-io crates. 145 #[must_use] new_mode(text: &'a str, mode: ParseMode) -> Self146 pub fn new_mode(text: &'a str, mode: ParseMode) -> Self { 147 Self { 148 inner: text, 149 original: text, 150 offset: 0, 151 mode, 152 } 153 } 154 155 #[inline] is_ref_char(c: &char) -> bool156 fn is_ref_char(c: &char) -> bool { 157 c.is_ascii_alphanumeric() || *c == '-' || *c == '.' 158 } 159 160 /// Return a matching text token if found - equivalent to the regex `^[-a-zA-Z0-9.:]+` find_text_token(text: &'a str) -> Option<&'a str>161 fn find_text_token(text: &'a str) -> Option<&'a str> { 162 let is_token_char = |c: &char| Self::is_ref_char(c) || *c == ':'; 163 match text.chars().take_while(is_token_char).count() { 164 index if index > 0 => Some(&text[..index]), 165 _ => None, 166 } 167 } 168 169 /// Extract the text after `prefix` if made up of valid ref characters find_ref(prefix: &str, text: &'a str) -> Option<&'a str>170 fn find_ref(prefix: &str, text: &'a str) -> Option<&'a str> { 171 text.strip_prefix(prefix).map(|value| { 172 let end = value.chars().take_while(Self::is_ref_char).count(); 173 &text[prefix.len()..prefix.len() + end] 174 }) 175 } 176 177 /// Return a license ref if found - equivalent to the regex `^LicenseRef-([-a-zA-Z0-9.]+)` 178 #[inline] find_license_ref(text: &'a str) -> Option<&'a str>179 fn find_license_ref(text: &'a str) -> Option<&'a str> { 180 Self::find_ref("LicenseRef-", text) 181 } 182 183 /// Return a document ref and license ref if found, 184 /// equivalent to the regex `^DocumentRef-([-a-zA-Z0-9.]+):LicenseRef-([-a-zA-Z0-9.]+)` find_document_and_license_ref(text: &'a str) -> Option<(&'a str, &'a str)>185 fn find_document_and_license_ref(text: &'a str) -> Option<(&'a str, &'a str)> { 186 let split = text.split_once(':'); 187 let doc_ref = split.and_then(|(doc, _)| Self::find_ref("DocumentRef-", doc)); 188 let lic_ref = split.and_then(|(_, lic)| Self::find_license_ref(lic)); 189 Option::zip(doc_ref, lic_ref) 190 } 191 } 192 193 /// A wrapper around a particular token that includes the span of the characters 194 /// in the original string, for diagnostic purposes 195 #[derive(Debug)] 196 pub struct LexerToken<'a> { 197 /// The token that was lexed 198 pub token: Token<'a>, 199 /// The range of the token characters in the original license expression 200 pub span: std::ops::Range<usize>, 201 } 202 203 impl<'a> Iterator for Lexer<'a> { 204 type Item = Result<LexerToken<'a>, ParseError>; 205 next(&mut self) -> Option<Self::Item>206 fn next(&mut self) -> Option<Self::Item> { 207 #[allow(clippy::unnecessary_wraps)] 208 fn ok_token(token: Token<'_>) -> Option<Result<(Token<'_>, usize), ParseError>> { 209 let len = token.len(); 210 Some(Ok((token, len))) 211 } 212 213 // Jump over any whitespace, updating `self.inner` and `self.offset` appropriately 214 let non_whitespace_index = match self.inner.find(|c: char| !c.is_whitespace()) { 215 Some(idx) => idx, 216 None => self.inner.len(), 217 }; 218 self.inner = &self.inner[non_whitespace_index..]; 219 self.offset += non_whitespace_index; 220 221 match self.inner.chars().next() { 222 None => None, 223 // From SPDX 2.1 spec 224 // There MUST NOT be whitespace between a license-id and any following "+". 225 Some('+') => { 226 if non_whitespace_index == 0 { 227 ok_token(Token::Plus) 228 } else { 229 Some(Err(ParseError { 230 original: self.original.to_owned(), 231 span: self.offset - non_whitespace_index..self.offset, 232 reason: Reason::SeparatedPlus, 233 })) 234 } 235 } 236 Some('(') => ok_token(Token::OpenParen), 237 Some(')') => ok_token(Token::CloseParen), 238 Some('/') if self.mode.allow_slash_as_or_operator => Some(Ok((Token::Or, 1))), 239 Some(_) => match Lexer::find_text_token(self.inner) { 240 None => Some(Err(ParseError { 241 original: self.original.to_owned(), 242 span: self.offset..self.offset + self.inner.len(), 243 reason: Reason::InvalidCharacters, 244 })), 245 Some(m) => { 246 if m == "WITH" { 247 ok_token(Token::With) 248 } else if m == "AND" { 249 ok_token(Token::And) 250 } else if m == "OR" { 251 ok_token(Token::Or) 252 } else if self.mode.allow_lower_case_operators && m == "and" { 253 ok_token(Token::And) 254 } else if self.mode.allow_lower_case_operators && m == "or" { 255 ok_token(Token::Or) 256 } else if self.mode.allow_lower_case_operators && m == "with" { 257 ok_token(Token::With) 258 } else if let Some(lic_id) = crate::license_id(m) { 259 ok_token(Token::Spdx(lic_id)) 260 } else if let Some(exc_id) = crate::exception_id(m) { 261 ok_token(Token::Exception(exc_id)) 262 } else if let Some((doc_ref, lic_ref)) = Lexer::find_document_and_license_ref(m) 263 { 264 ok_token(Token::LicenseRef { 265 doc_ref: Some(doc_ref), 266 lic_ref, 267 }) 268 } else if let Some(lic_ref) = Lexer::find_license_ref(m) { 269 ok_token(Token::LicenseRef { 270 doc_ref: None, 271 lic_ref, 272 }) 273 } else if let Some((lic_id, token_len)) = 274 if self.mode.allow_imprecise_license_names { 275 crate::imprecise_license_id(self.inner) 276 } else { 277 None 278 } 279 { 280 Some(Ok((Token::Spdx(lic_id), token_len))) 281 } else { 282 Some(Err(ParseError { 283 original: self.original.to_owned(), 284 span: self.offset..self.offset + m.len(), 285 reason: Reason::UnknownTerm, 286 })) 287 } 288 } 289 }, 290 } 291 .map(|res| { 292 res.map(|(tok, len)| { 293 let start = self.offset; 294 self.inner = &self.inner[len..]; 295 self.offset += len; 296 297 LexerToken { 298 token: tok, 299 span: start..self.offset, 300 } 301 }) 302 }) 303 } 304 } 305