• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use crate::{
2     error::{ParseError, Reason},
3     ExceptionId, LicenseId,
4 };
5 
/// Parsing configuration for SPDX expressions
///
/// Each field switches on one relaxation of the SPDX syntax. All of them are
/// `false` in the derived `Default`.
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub struct ParseMode {
    /// The `AND`, `OR`, and `WITH` operators are required to be uppercase in
    /// the SPDX spec, but enabling this option allows them to be lowercased
    pub allow_lower_case_operators: bool,
    /// Allows the use of `/` as a synonym for the `OR` operator.
    ///
    /// This also allows for not having whitespace between the `/` and the terms
    /// on either side
    pub allow_slash_as_or_operator: bool,
    /// Allows some invalid/imprecise identifiers as synonyms for an actual
    /// license identifier.
    ///
    /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for a list
    /// of the current synonyms. Note that this list is not comprehensive but
    /// can be expanded upon when invalid identifiers are found in the wild.
    pub allow_imprecise_license_names: bool,
    /// The various GPL licenses diverge from every other license in the SPDX
    /// license list by having an `-or-later` variant that is used as a suffix
    /// on a base license (eg. `GPL-3.0-or-later`) rather than the canonical
    /// `GPL-3.0+`.
    ///
    /// This option just allows GPL licenses to be treated similarly to all of
    /// the other SPDX licenses.
    pub allow_postfix_plus_on_gpl: bool,
}
33 
34 impl ParseMode {
35     /// Strict, specification compliant SPDX parsing.
36     ///
37     /// 1. Only license identifiers in the SPDX license list, or
38     ///     Document/LicenseRef, are allowed. The license identifiers are also
39     ///     case-sensitive.
40     /// 1. `WITH`, `AND`, and `OR` are the only valid operators
41     pub const STRICT: Self = Self {
42         allow_lower_case_operators: false,
43         allow_slash_as_or_operator: false,
44         allow_imprecise_license_names: false,
45         allow_postfix_plus_on_gpl: false,
46     };
47 
48     /// Allow non-conforming syntax for crates-io compatibility
49     ///
50     /// 1. Additional, invalid, identifiers are accepted and mapped to a correct
51     ///     SPDX license identifier.
52     ///     See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for the
53     ///     list of additionally accepted identifiers and the license they
54     ///     correspond to.
55     /// 1. `/` can by used as a synonym for `OR`, and doesn't need to be
56     ///     separated by whitespace from the terms it combines
57     pub const LAX: Self = Self {
58         allow_lower_case_operators: true,
59         allow_slash_as_or_operator: true,
60         allow_imprecise_license_names: true,
61         allow_postfix_plus_on_gpl: true,
62     };
63 }
64 
65 /// A single token in an SPDX license expression
66 #[derive(Clone, Debug, PartialEq, Eq)]
67 pub enum Token<'a> {
68     /// A recognized SPDX license id
69     Spdx(LicenseId),
70     /// A `LicenseRef-` prefixed id, with an optional `DocumentRef-`
71     LicenseRef {
72         doc_ref: Option<&'a str>,
73         lic_ref: &'a str,
74     },
75     /// A recognized SPDX exception id
76     Exception(ExceptionId),
77     /// A postfix `+` indicating "or later" for a particular SPDX license id
78     Plus,
79     /// A `(` for starting a group
80     OpenParen,
81     /// A `)` for ending a group
82     CloseParen,
83     /// A `WITH` operator
84     With,
85     /// An `AND` operator
86     And,
87     /// An `OR` operator
88     Or,
89 }
90 
91 impl std::fmt::Display for Token<'_> {
fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result92     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
93         std::fmt::Debug::fmt(self, f)
94     }
95 }
96 
97 impl Token<'_> {
len(&self) -> usize98     fn len(&self) -> usize {
99         match self {
100             Token::Spdx(id) => id.name.len(),
101             Token::Exception(e) => e.name.len(),
102             Token::With => 4,
103             Token::And => 3,
104             Token::Or => 2,
105             Token::Plus | Token::OpenParen | Token::CloseParen => 1,
106             Token::LicenseRef { doc_ref, lic_ref } => {
107                 doc_ref.map_or(0, |d| {
108                     // +1 is for the `:`
109                     "DocumentRef-".len() + d.len() + 1
110                 }) + "LicenseRef-".len()
111                     + lic_ref.len()
112             }
113         }
114     }
115 }
116 
117 /// Allows iteration through an SPDX license expression, yielding
118 /// a token or a `ParseError`.
119 ///
120 /// Prefer to use `Expression::parse` or `Licensee::parse` rather
121 /// than directly using the lexer
122 pub struct Lexer<'a> {
123     inner: &'a str,
124     original: &'a str,
125     offset: usize,
126     mode: ParseMode,
127 }
128 
129 impl<'a> Lexer<'a> {
130     /// Creates a Lexer over a license expression
131     #[must_use]
new(text: &'a str) -> Self132     pub fn new(text: &'a str) -> Self {
133         Self {
134             inner: text,
135             original: text,
136             offset: 0,
137             mode: ParseMode::STRICT,
138         }
139     }
140 
141     /// Creates a Lexer over a license expression
142     ///
143     /// With `ParseMode::Lax` it allows non-conforming syntax
144     /// used in crates-io crates.
145     #[must_use]
new_mode(text: &'a str, mode: ParseMode) -> Self146     pub fn new_mode(text: &'a str, mode: ParseMode) -> Self {
147         Self {
148             inner: text,
149             original: text,
150             offset: 0,
151             mode,
152         }
153     }
154 
155     #[inline]
is_ref_char(c: &char) -> bool156     fn is_ref_char(c: &char) -> bool {
157         c.is_ascii_alphanumeric() || *c == '-' || *c == '.'
158     }
159 
160     /// Return a matching text token if found - equivalent to the regex `^[-a-zA-Z0-9.:]+`
find_text_token(text: &'a str) -> Option<&'a str>161     fn find_text_token(text: &'a str) -> Option<&'a str> {
162         let is_token_char = |c: &char| Self::is_ref_char(c) || *c == ':';
163         match text.chars().take_while(is_token_char).count() {
164             index if index > 0 => Some(&text[..index]),
165             _ => None,
166         }
167     }
168 
169     /// Extract the text after `prefix` if made up of valid ref characters
find_ref(prefix: &str, text: &'a str) -> Option<&'a str>170     fn find_ref(prefix: &str, text: &'a str) -> Option<&'a str> {
171         text.strip_prefix(prefix).map(|value| {
172             let end = value.chars().take_while(Self::is_ref_char).count();
173             &text[prefix.len()..prefix.len() + end]
174         })
175     }
176 
177     /// Return a license ref if found - equivalent to the regex `^LicenseRef-([-a-zA-Z0-9.]+)`
178     #[inline]
find_license_ref(text: &'a str) -> Option<&'a str>179     fn find_license_ref(text: &'a str) -> Option<&'a str> {
180         Self::find_ref("LicenseRef-", text)
181     }
182 
183     /// Return a document ref and license ref if found,
184     /// equivalent to the regex `^DocumentRef-([-a-zA-Z0-9.]+):LicenseRef-([-a-zA-Z0-9.]+)`
find_document_and_license_ref(text: &'a str) -> Option<(&'a str, &'a str)>185     fn find_document_and_license_ref(text: &'a str) -> Option<(&'a str, &'a str)> {
186         let split = text.split_once(':');
187         let doc_ref = split.and_then(|(doc, _)| Self::find_ref("DocumentRef-", doc));
188         let lic_ref = split.and_then(|(_, lic)| Self::find_license_ref(lic));
189         Option::zip(doc_ref, lic_ref)
190     }
191 }
192 
193 /// A wrapper around a particular token that includes the span of the characters
194 /// in the original string, for diagnostic purposes
195 #[derive(Debug)]
196 pub struct LexerToken<'a> {
197     /// The token that was lexed
198     pub token: Token<'a>,
199     /// The range of the token characters in the original license expression
200     pub span: std::ops::Range<usize>,
201 }
202 
203 impl<'a> Iterator for Lexer<'a> {
204     type Item = Result<LexerToken<'a>, ParseError>;
205 
next(&mut self) -> Option<Self::Item>206     fn next(&mut self) -> Option<Self::Item> {
207         #[allow(clippy::unnecessary_wraps)]
208         fn ok_token(token: Token<'_>) -> Option<Result<(Token<'_>, usize), ParseError>> {
209             let len = token.len();
210             Some(Ok((token, len)))
211         }
212 
213         // Jump over any whitespace, updating `self.inner` and `self.offset` appropriately
214         let non_whitespace_index = match self.inner.find(|c: char| !c.is_whitespace()) {
215             Some(idx) => idx,
216             None => self.inner.len(),
217         };
218         self.inner = &self.inner[non_whitespace_index..];
219         self.offset += non_whitespace_index;
220 
221         match self.inner.chars().next() {
222             None => None,
223             // From SPDX 2.1 spec
224             // There MUST NOT be whitespace between a license-id and any following "+".
225             Some('+') => {
226                 if non_whitespace_index == 0 {
227                     ok_token(Token::Plus)
228                 } else {
229                     Some(Err(ParseError {
230                         original: self.original.to_owned(),
231                         span: self.offset - non_whitespace_index..self.offset,
232                         reason: Reason::SeparatedPlus,
233                     }))
234                 }
235             }
236             Some('(') => ok_token(Token::OpenParen),
237             Some(')') => ok_token(Token::CloseParen),
238             Some('/') if self.mode.allow_slash_as_or_operator => Some(Ok((Token::Or, 1))),
239             Some(_) => match Lexer::find_text_token(self.inner) {
240                 None => Some(Err(ParseError {
241                     original: self.original.to_owned(),
242                     span: self.offset..self.offset + self.inner.len(),
243                     reason: Reason::InvalidCharacters,
244                 })),
245                 Some(m) => {
246                     if m == "WITH" {
247                         ok_token(Token::With)
248                     } else if m == "AND" {
249                         ok_token(Token::And)
250                     } else if m == "OR" {
251                         ok_token(Token::Or)
252                     } else if self.mode.allow_lower_case_operators && m == "and" {
253                         ok_token(Token::And)
254                     } else if self.mode.allow_lower_case_operators && m == "or" {
255                         ok_token(Token::Or)
256                     } else if self.mode.allow_lower_case_operators && m == "with" {
257                         ok_token(Token::With)
258                     } else if let Some(lic_id) = crate::license_id(m) {
259                         ok_token(Token::Spdx(lic_id))
260                     } else if let Some(exc_id) = crate::exception_id(m) {
261                         ok_token(Token::Exception(exc_id))
262                     } else if let Some((doc_ref, lic_ref)) = Lexer::find_document_and_license_ref(m)
263                     {
264                         ok_token(Token::LicenseRef {
265                             doc_ref: Some(doc_ref),
266                             lic_ref,
267                         })
268                     } else if let Some(lic_ref) = Lexer::find_license_ref(m) {
269                         ok_token(Token::LicenseRef {
270                             doc_ref: None,
271                             lic_ref,
272                         })
273                     } else if let Some((lic_id, token_len)) =
274                         if self.mode.allow_imprecise_license_names {
275                             crate::imprecise_license_id(self.inner)
276                         } else {
277                             None
278                         }
279                     {
280                         Some(Ok((Token::Spdx(lic_id), token_len)))
281                     } else {
282                         Some(Err(ParseError {
283                             original: self.original.to_owned(),
284                             span: self.offset..self.offset + m.len(),
285                             reason: Reason::UnknownTerm,
286                         }))
287                     }
288                 }
289             },
290         }
291         .map(|res| {
292             res.map(|(tok, len)| {
293                 let start = self.offset;
294                 self.inner = &self.inner[len..];
295                 self.offset += len;
296 
297                 LexerToken {
298                     token: tok,
299                     span: start..self.offset,
300                 }
301             })
302         })
303     }
304 }
305