1 use std::borrow::Cow;
2 use std::char;
3 use std::ops::RangeInclusive;
4 
5 use winnow::combinator::alt;
6 use winnow::combinator::cut_err;
7 use winnow::combinator::delimited;
8 use winnow::combinator::empty;
9 use winnow::combinator::fail;
10 use winnow::combinator::opt;
11 use winnow::combinator::peek;
12 use winnow::combinator::preceded;
13 use winnow::combinator::repeat;
14 use winnow::combinator::terminated;
15 use winnow::combinator::trace;
16 use winnow::prelude::*;
17 use winnow::stream::Stream;
18 use winnow::token::any;
19 use winnow::token::none_of;
20 use winnow::token::one_of;
21 use winnow::token::take_while;
22 
23 use crate::parser::error::CustomError;
24 use crate::parser::numbers::HEXDIG;
25 use crate::parser::prelude::*;
26 use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
27 
28 // ;; String
29 
30 // string = ml-basic-string / basic-string / ml-literal-string / literal-string
string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>31 pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
32     trace(
33         "string",
34         alt((
35             ml_basic_string,
36             basic_string,
37             ml_literal_string,
38             literal_string.map(Cow::Borrowed),
39         )),
40     )
41     .parse_next(input)
42 }
43 
44 // ;; Basic String
45 
46 // basic-string = quotation-mark *basic-char quotation-mark
basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>47 pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
48     trace("basic-string", |input: &mut Input<'i>| {
49         let _ = one_of(QUOTATION_MARK).parse_next(input)?;
50 
51         let mut c = Cow::Borrowed("");
52         if let Some(ci) = opt(basic_chars).parse_next(input)? {
53             c = ci;
54         }
55         while let Some(ci) = opt(basic_chars).parse_next(input)? {
56             c.to_mut().push_str(&ci);
57         }
58 
59         let _ = cut_err(one_of(QUOTATION_MARK))
60             .context(StrContext::Label("basic string"))
61             .parse_next(input)?;
62 
63         Ok(c)
64     })
65     .parse_next(input)
66 }
67 
68 // quotation-mark = %x22            ; "
/// The `"` byte that `basic_string` expects on both sides of a basic string.
pub(crate) const QUOTATION_MARK: u8 = b'"';
70 
71 // basic-char = basic-unescaped / escaped
basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>72 fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
73     alt((
74         // Deviate from the official grammar by batching the unescaped chars so we build a string a
75         // chunk at a time, rather than a `char` at a time.
76         take_while(1.., BASIC_UNESCAPED)
77             .try_map(std::str::from_utf8)
78             .map(Cow::Borrowed),
79         escaped.map(|c| Cow::Owned(String::from(c))),
80     ))
81     .parse_next(input)
82 }
83 
84 // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
/// Byte set that `basic_chars` consumes verbatim inside a basic string.
///
/// The explicit members are `0x21`, `0x23..=0x5B` and `0x5D..=0x7E`, which skips `0x22` (`"`)
/// and `0x5C` (`\`). `WSCHAR` and `NON_ASCII` are imported from `parser::trivia`.
pub(crate) const BASIC_UNESCAPED: (
    (u8, u8),
    u8,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
92 
93 // escaped = escape escape-seq-char
escaped(input: &mut Input<'_>) -> PResult<char>94 fn escaped(input: &mut Input<'_>) -> PResult<char> {
95     preceded(ESCAPE, escape_seq_char).parse_next(input)
96 }
97 
98 // escape = %x5C                    ; \
/// The `\` byte that introduces an escape; consumed by `escaped` and `mlb_escaped_nl`.
pub(crate) const ESCAPE: u8 = b'\\';
100 
101 // escape-seq-char =  %x22         ; "    quotation mark  U+0022
102 // escape-seq-char =/ %x5C         ; \    reverse solidus U+005C
103 // escape-seq-char =/ %x62         ; b    backspace       U+0008
104 // escape-seq-char =/ %x66         ; f    form feed       U+000C
105 // escape-seq-char =/ %x6E         ; n    line feed       U+000A
106 // escape-seq-char =/ %x72         ; r    carriage return U+000D
107 // escape-seq-char =/ %x74         ; t    tab             U+0009
108 // escape-seq-char =/ %x75 4HEXDIG ; uXXXX                U+XXXX
109 // escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX            U+XXXXXXXX
escape_seq_char(input: &mut Input<'_>) -> PResult<char>110 fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> {
111     dispatch! {any;
112         b'b' => empty.value('\u{8}'),
113         b'f' => empty.value('\u{c}'),
114         b'n' => empty.value('\n'),
115         b'r' => empty.value('\r'),
116         b't' => empty.value('\t'),
117         b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code")),
118         b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code")),
119         b'\\' => empty.value('\\'),
120         b'"' => empty.value('"'),
121         _ => {
122             cut_err(fail::<_, char, _>)
123             .context(StrContext::Label("escape sequence"))
124             .context(StrContext::Expected(StrContextValue::CharLiteral('b')))
125             .context(StrContext::Expected(StrContextValue::CharLiteral('f')))
126             .context(StrContext::Expected(StrContextValue::CharLiteral('n')))
127             .context(StrContext::Expected(StrContextValue::CharLiteral('r')))
128             .context(StrContext::Expected(StrContextValue::CharLiteral('t')))
129             .context(StrContext::Expected(StrContextValue::CharLiteral('u')))
130             .context(StrContext::Expected(StrContextValue::CharLiteral('U')))
131             .context(StrContext::Expected(StrContextValue::CharLiteral('\\')))
132             .context(StrContext::Expected(StrContextValue::CharLiteral('"')))
133         }
134     }
135     .parse_next(input)
136 }
137 
hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char>138 pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> {
139     take_while(0..=N, HEXDIG)
140         .verify(|b: &[u8]| b.len() == N)
141         .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
142         .verify_map(|s| u32::from_str_radix(s, 16).ok())
143         .try_map(|h| char::from_u32(h).ok_or(CustomError::OutOfRange))
144         .parse_next(input)
145 }
146 
147 // ;; Multiline Basic String
148 
149 // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
150 //                   ml-basic-string-delim
ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>151 fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
152     trace(
153         "ml-basic-string",
154         delimited(
155             ML_BASIC_STRING_DELIM,
156             preceded(opt(newline), cut_err(ml_basic_body))
157                 .context(StrContext::Label("multiline basic string")),
158             cut_err(ML_BASIC_STRING_DELIM).context(StrContext::Label("multiline basic string")),
159         ),
160     )
161     .parse_next(input)
162 }
163 
164 // ml-basic-string-delim = 3quotation-mark
/// Three quotation marks: the opening and closing delimiter matched by `ml_basic_string`, also
/// used by `ml_basic_body` as the lookahead that must follow trailing quotes.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
166 
167 // ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>168 fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
169     let mut c = Cow::Borrowed("");
170     if let Some(ci) = opt(mlb_content).parse_next(input)? {
171         c = ci;
172     }
173     while let Some(ci) = opt(mlb_content).parse_next(input)? {
174         c.to_mut().push_str(&ci);
175     }
176 
177     while let Some(qi) = opt(mlb_quotes(none_of(b'\"').value(()))).parse_next(input)? {
178         if let Some(ci) = opt(mlb_content).parse_next(input)? {
179             c.to_mut().push_str(qi);
180             c.to_mut().push_str(&ci);
181             while let Some(ci) = opt(mlb_content).parse_next(input)? {
182                 c.to_mut().push_str(&ci);
183             }
184         } else {
185             break;
186         }
187     }
188 
189     if let Some(qi) = opt(mlb_quotes(ML_BASIC_STRING_DELIM.void())).parse_next(input)? {
190         c.to_mut().push_str(qi);
191     }
192 
193     Ok(c)
194 }
195 
196 // mlb-content = mlb-char / newline / mlb-escaped-nl
197 // mlb-char = mlb-unescaped / escaped
mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>198 fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
199     alt((
200         // Deviate from the official grammar by batching the unescaped chars so we build a string a
201         // chunk at a time, rather than a `char` at a time.
202         take_while(1.., MLB_UNESCAPED)
203             .try_map(std::str::from_utf8)
204             .map(Cow::Borrowed),
205         // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
206         mlb_escaped_nl.map(|_| Cow::Borrowed("")),
207         escaped.map(|c| Cow::Owned(String::from(c))),
208         newline.map(|_| Cow::Borrowed("\n")),
209     ))
210     .parse_next(input)
211 }
212 
213 // mlb-quotes = 1*2quotation-mark
mlb_quotes<'i>( mut term: impl Parser<Input<'i>, (), ContextError>, ) -> impl Parser<Input<'i>, &'i str, ContextError>214 fn mlb_quotes<'i>(
215     mut term: impl Parser<Input<'i>, (), ContextError>,
216 ) -> impl Parser<Input<'i>, &'i str, ContextError> {
217     move |input: &mut Input<'i>| {
218         let start = input.checkpoint();
219         let res = terminated(b"\"\"", peek(term.by_ref()))
220             .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
221             .parse_next(input);
222 
223         match res {
224             Err(winnow::error::ErrMode::Backtrack(_)) => {
225                 input.reset(&start);
226                 terminated(b"\"", peek(term.by_ref()))
227                     .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
228                     .parse_next(input)
229             }
230             res => res,
231         }
232     }
233 }
234 
235 // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
/// Byte set that `mlb_content` consumes verbatim inside a multi-line basic string.
///
/// The explicit members are `0x21`, `0x23..=0x5B` and `0x5D..=0x7E`, which skips `0x22` (`"`)
/// and `0x5C` (`\`). `WSCHAR` and `NON_ASCII` are imported from `parser::trivia`.
pub(crate) const MLB_UNESCAPED: (
    (u8, u8),
    u8,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
243 
// mlb-escaped-nl = escape ws newline *( wschar / newline )
245 // When the last non-whitespace character on a line is a \,
246 // it will be trimmed along with all whitespace
247 // (including newlines) up to the next non-whitespace
248 // character or closing delimiter.
mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()>249 fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> {
250     repeat(1.., (ESCAPE, ws, ws_newlines))
251         .map(|()| ())
252         .value(())
253         .parse_next(input)
254 }
255 
256 // ;; Literal String
257 
258 // literal-string = apostrophe *literal-char apostrophe
literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str>259 pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
260     trace(
261         "literal-string",
262         delimited(
263             APOSTROPHE,
264             cut_err(take_while(0.., LITERAL_CHAR)),
265             cut_err(APOSTROPHE),
266         )
267         .try_map(std::str::from_utf8)
268         .context(StrContext::Label("literal string")),
269     )
270     .parse_next(input)
271 }
272 
273 // apostrophe = %x27 ; ' apostrophe
/// The `'` byte that `literal_string` expects on both sides of a literal string.
pub(crate) const APOSTROPHE: u8 = b'\'';
275 
276 // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
/// Byte set accepted inside a single-line literal string (see `literal_string`).
///
/// The explicit members are tab (`0x09`), `0x20..=0x26` and `0x28..=0x7E`, which skips `0x27`
/// (`'`). `NON_ASCII` is imported from `parser::trivia`.
pub(crate) const LITERAL_CHAR: (
    u8,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
283 
284 // ;; Multiline Literal String
285 
286 // ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
287 //                     ml-literal-string-delim
ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>288 fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
289     trace(
290         "ml-literal-string",
291         delimited(
292             (ML_LITERAL_STRING_DELIM, opt(newline)),
293             cut_err(ml_literal_body.map(|t| {
294                 if t.contains("\r\n") {
295                     Cow::Owned(t.replace("\r\n", "\n"))
296                 } else {
297                     Cow::Borrowed(t)
298                 }
299             }))
300             .context(StrContext::Label("multiline literal string")),
301             cut_err(ML_LITERAL_STRING_DELIM).context(StrContext::Label("multiline literal string")),
302         ),
303     )
304     .parse_next(input)
305 }
306 
307 // ml-literal-string-delim = 3apostrophe
/// Three apostrophes: the opening and closing delimiter matched by `ml_literal_string`, also
/// used by `ml_literal_body` as the lookahead that must follow trailing apostrophes.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = b"'''";
309 
310 // ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str>311 fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
312     (
313         repeat(0.., mll_content).map(|()| ()),
314         repeat(
315             0..,
316             (
317                 mll_quotes(none_of(APOSTROPHE).value(())),
318                 repeat(1.., mll_content).map(|()| ()),
319             ),
320         )
321         .map(|()| ()),
322         opt(mll_quotes(ML_LITERAL_STRING_DELIM.void())),
323     )
324         .take()
325         .try_map(std::str::from_utf8)
326         .parse_next(input)
327 }
328 
329 // mll-content = mll-char / newline
mll_content(input: &mut Input<'_>) -> PResult<u8>330 fn mll_content(input: &mut Input<'_>) -> PResult<u8> {
331     alt((one_of(MLL_CHAR), newline.value(b'\n'))).parse_next(input)
332 }
333 
334 // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
/// Byte set accepted as a single character of multi-line literal content (see `mll_content`).
///
/// The explicit members are tab (`0x09`), `0x20..=0x26` and `0x28..=0x7E`, which skips `0x27`
/// (`'`). `NON_ASCII` is imported from `parser::trivia`.
const MLL_CHAR: (
    u8,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
341 
342 // mll-quotes = 1*2apostrophe
mll_quotes<'i>( mut term: impl Parser<Input<'i>, (), ContextError>, ) -> impl Parser<Input<'i>, &'i str, ContextError>343 fn mll_quotes<'i>(
344     mut term: impl Parser<Input<'i>, (), ContextError>,
345 ) -> impl Parser<Input<'i>, &'i str, ContextError> {
346     move |input: &mut Input<'i>| {
347         let start = input.checkpoint();
348         let res = terminated(b"''", peek(term.by_ref()))
349             .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
350             .parse_next(input);
351 
352         match res {
353             Err(winnow::error::ErrMode::Backtrack(_)) => {
354                 input.reset(&start);
355                 terminated(b"'", peek(term.by_ref()))
356                     .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
357                     .parse_next(input)
358             }
359             res => res,
360         }
361     }
362 }
363 
// Unit tests for the string parsers. Every case goes through the top-level `string` parser, so
// the dispatch between the four string flavours is exercised as well.
#[cfg(test)]
#[cfg(feature = "parse")]
#[cfg(feature = "display")]
mod test {
    use super::*;

    // A basic string with escaped quotes, `\t`, `\n` and both the 4- and 8-digit unicode escapes.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // Multi-line basic strings: leading-newline trimming, quotes inside the body, and a
    // backslash escape directly in front of the closing delimiter.
    #[test]
    fn ml_basic_string() {
        let cases = [
            // The newline right after the opening delimiter is not part of the value.
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            // An escaped quote followed by two plain quotes inside the body.
            (r#"""" \""" """"#, " \"\"\" "),
            // An escaped backslash immediately before the closing delimiter.
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        // Both inputs end with only two unescaped quotes, so the closing delimiter is missing.
        let invalid_cases = [r#""""  """#, r#""""  \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    // A backslash at the end of a line removes the line break and the whitespace after it.
    #[test]
    fn ml_basic_string_escape_ws() {
        let inputs = [
            r#""""
The quick brown \


  fox jumps over \
    the lazy dog.""""#,
            r#""""\
       The quick brown \
       fox jumps over \
       the lazy dog.\
       """"#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        // Bodies made up solely of line-ending backslashes and whitespace decode to "".
        let empties = [
            r#""""\
       """"#,
            r#""""
\
  \
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Literal strings are returned verbatim: backslashes and double quotes have no special
    // meaning.
    #[test]
    fn literal_string() {
        let inputs = [
            r"'C:\Users\nodejs\templates'",
            r"'\\ServerX\admin$\system32\'",
            r#"'Tom "Dubs" Preston-Werner'"#,
            r"'<\i\c*\s*>'",
        ];

        for input in &inputs {
            // Expected value is the input without the surrounding apostrophes.
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Multi-line literal strings: apostrophes next to the delimiters and leading-newline
    // trimming.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r"'''I [dw]on't need \d{2} apples'''",
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            // Expected value is the input without the three-apostrophe delimiters.
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
   All other whitespace
   is preserved.
'''"#;
        // Skip the opening delimiter plus the trimmed newline (4 bytes) and the closing delimiter.
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
480