1 use std::borrow::Cow;
2 use std::char;
3 use std::ops::RangeInclusive;
4
5 use winnow::combinator::alt;
6 use winnow::combinator::cut_err;
7 use winnow::combinator::delimited;
8 use winnow::combinator::empty;
9 use winnow::combinator::fail;
10 use winnow::combinator::opt;
11 use winnow::combinator::peek;
12 use winnow::combinator::preceded;
13 use winnow::combinator::repeat;
14 use winnow::combinator::terminated;
15 use winnow::combinator::trace;
16 use winnow::prelude::*;
17 use winnow::stream::Stream;
18 use winnow::token::any;
19 use winnow::token::none_of;
20 use winnow::token::one_of;
21 use winnow::token::take_while;
22
23 use crate::parser::error::CustomError;
24 use crate::parser::numbers::HEXDIG;
25 use crate::parser::prelude::*;
26 use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
27
28 // ;; String
29
30 // string = ml-basic-string / basic-string / ml-literal-string / literal-string
string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>31 pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
32 trace(
33 "string",
34 alt((
35 ml_basic_string,
36 basic_string,
37 ml_literal_string,
38 literal_string.map(Cow::Borrowed),
39 )),
40 )
41 .parse_next(input)
42 }
43
44 // ;; Basic String
45
46 // basic-string = quotation-mark *basic-char quotation-mark
basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>47 pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
48 trace("basic-string", |input: &mut Input<'i>| {
49 let _ = one_of(QUOTATION_MARK).parse_next(input)?;
50
51 let mut c = Cow::Borrowed("");
52 if let Some(ci) = opt(basic_chars).parse_next(input)? {
53 c = ci;
54 }
55 while let Some(ci) = opt(basic_chars).parse_next(input)? {
56 c.to_mut().push_str(&ci);
57 }
58
59 let _ = cut_err(one_of(QUOTATION_MARK))
60 .context(StrContext::Label("basic string"))
61 .parse_next(input)?;
62
63 Ok(c)
64 })
65 .parse_next(input)
66 }
67
68 // quotation-mark = %x22 ; "
/// The `"` byte (`%x22`) that delimits a basic string.
pub(crate) const QUOTATION_MARK: u8 = 0x22;
70
71 // basic-char = basic-unescaped / escaped
basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>72 fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
73 alt((
74 // Deviate from the official grammar by batching the unescaped chars so we build a string a
75 // chunk at a time, rather than a `char` at a time.
76 take_while(1.., BASIC_UNESCAPED)
77 .try_map(std::str::from_utf8)
78 .map(Cow::Borrowed),
79 escaped.map(|c| Cow::Owned(String::from(c))),
80 ))
81 .parse_next(input)
82 }
83
84 // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
85 pub(crate) const BASIC_UNESCAPED: (
86 (u8, u8),
87 u8,
88 RangeInclusive<u8>,
89 RangeInclusive<u8>,
90 RangeInclusive<u8>,
91 ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
92
93 // escaped = escape escape-seq-char
escaped(input: &mut Input<'_>) -> PResult<char>94 fn escaped(input: &mut Input<'_>) -> PResult<char> {
95 preceded(ESCAPE, escape_seq_char).parse_next(input)
96 }
97
98 // escape = %x5C ; \
/// The `\` byte (`%x5C`) that introduces an escape sequence.
pub(crate) const ESCAPE: u8 = 0x5C;
100
101 // escape-seq-char = %x22 ; " quotation mark U+0022
102 // escape-seq-char =/ %x5C ; \ reverse solidus U+005C
103 // escape-seq-char =/ %x62 ; b backspace U+0008
104 // escape-seq-char =/ %x66 ; f form feed U+000C
105 // escape-seq-char =/ %x6E ; n line feed U+000A
106 // escape-seq-char =/ %x72 ; r carriage return U+000D
107 // escape-seq-char =/ %x74 ; t tab U+0009
108 // escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX
109 // escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX
escape_seq_char(input: &mut Input<'_>) -> PResult<char>110 fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> {
111 dispatch! {any;
112 b'b' => empty.value('\u{8}'),
113 b'f' => empty.value('\u{c}'),
114 b'n' => empty.value('\n'),
115 b'r' => empty.value('\r'),
116 b't' => empty.value('\t'),
117 b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code")),
118 b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code")),
119 b'\\' => empty.value('\\'),
120 b'"' => empty.value('"'),
121 _ => {
122 cut_err(fail::<_, char, _>)
123 .context(StrContext::Label("escape sequence"))
124 .context(StrContext::Expected(StrContextValue::CharLiteral('b')))
125 .context(StrContext::Expected(StrContextValue::CharLiteral('f')))
126 .context(StrContext::Expected(StrContextValue::CharLiteral('n')))
127 .context(StrContext::Expected(StrContextValue::CharLiteral('r')))
128 .context(StrContext::Expected(StrContextValue::CharLiteral('t')))
129 .context(StrContext::Expected(StrContextValue::CharLiteral('u')))
130 .context(StrContext::Expected(StrContextValue::CharLiteral('U')))
131 .context(StrContext::Expected(StrContextValue::CharLiteral('\\')))
132 .context(StrContext::Expected(StrContextValue::CharLiteral('"')))
133 }
134 }
135 .parse_next(input)
136 }
137
hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char>138 pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> {
139 take_while(0..=N, HEXDIG)
140 .verify(|b: &[u8]| b.len() == N)
141 .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
142 .verify_map(|s| u32::from_str_radix(s, 16).ok())
143 .try_map(|h| char::from_u32(h).ok_or(CustomError::OutOfRange))
144 .parse_next(input)
145 }
146
147 // ;; Multiline Basic String
148
149 // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
150 // ml-basic-string-delim
ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>151 fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
152 trace(
153 "ml-basic-string",
154 delimited(
155 ML_BASIC_STRING_DELIM,
156 preceded(opt(newline), cut_err(ml_basic_body))
157 .context(StrContext::Label("multiline basic string")),
158 cut_err(ML_BASIC_STRING_DELIM).context(StrContext::Label("multiline basic string")),
159 ),
160 )
161 .parse_next(input)
162 }
163
164 // ml-basic-string-delim = 3quotation-mark
/// Three quotation marks: the opening and closing delimiter of a multi-line
/// basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = &[b'"'; 3];
166
167 // ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>168 fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
169 let mut c = Cow::Borrowed("");
170 if let Some(ci) = opt(mlb_content).parse_next(input)? {
171 c = ci;
172 }
173 while let Some(ci) = opt(mlb_content).parse_next(input)? {
174 c.to_mut().push_str(&ci);
175 }
176
177 while let Some(qi) = opt(mlb_quotes(none_of(b'\"').value(()))).parse_next(input)? {
178 if let Some(ci) = opt(mlb_content).parse_next(input)? {
179 c.to_mut().push_str(qi);
180 c.to_mut().push_str(&ci);
181 while let Some(ci) = opt(mlb_content).parse_next(input)? {
182 c.to_mut().push_str(&ci);
183 }
184 } else {
185 break;
186 }
187 }
188
189 if let Some(qi) = opt(mlb_quotes(ML_BASIC_STRING_DELIM.void())).parse_next(input)? {
190 c.to_mut().push_str(qi);
191 }
192
193 Ok(c)
194 }
195
196 // mlb-content = mlb-char / newline / mlb-escaped-nl
197 // mlb-char = mlb-unescaped / escaped
mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>198 fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
199 alt((
200 // Deviate from the official grammar by batching the unescaped chars so we build a string a
201 // chunk at a time, rather than a `char` at a time.
202 take_while(1.., MLB_UNESCAPED)
203 .try_map(std::str::from_utf8)
204 .map(Cow::Borrowed),
205 // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
206 mlb_escaped_nl.map(|_| Cow::Borrowed("")),
207 escaped.map(|c| Cow::Owned(String::from(c))),
208 newline.map(|_| Cow::Borrowed("\n")),
209 ))
210 .parse_next(input)
211 }
212
213 // mlb-quotes = 1*2quotation-mark
mlb_quotes<'i>( mut term: impl Parser<Input<'i>, (), ContextError>, ) -> impl Parser<Input<'i>, &'i str, ContextError>214 fn mlb_quotes<'i>(
215 mut term: impl Parser<Input<'i>, (), ContextError>,
216 ) -> impl Parser<Input<'i>, &'i str, ContextError> {
217 move |input: &mut Input<'i>| {
218 let start = input.checkpoint();
219 let res = terminated(b"\"\"", peek(term.by_ref()))
220 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
221 .parse_next(input);
222
223 match res {
224 Err(winnow::error::ErrMode::Backtrack(_)) => {
225 input.reset(&start);
226 terminated(b"\"", peek(term.by_ref()))
227 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
228 .parse_next(input)
229 }
230 res => res,
231 }
232 }
233 }
234
235 // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
236 pub(crate) const MLB_UNESCAPED: (
237 (u8, u8),
238 u8,
239 RangeInclusive<u8>,
240 RangeInclusive<u8>,
241 RangeInclusive<u8>,
242 ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
243
// mlb-escaped-nl = escape ws newline *( wschar / newline )
245 // When the last non-whitespace character on a line is a \,
246 // it will be trimmed along with all whitespace
247 // (including newlines) up to the next non-whitespace
248 // character or closing delimiter.
mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()>249 fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> {
250 repeat(1.., (ESCAPE, ws, ws_newlines))
251 .map(|()| ())
252 .value(())
253 .parse_next(input)
254 }
255
256 // ;; Literal String
257
258 // literal-string = apostrophe *literal-char apostrophe
literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str>259 pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
260 trace(
261 "literal-string",
262 delimited(
263 APOSTROPHE,
264 cut_err(take_while(0.., LITERAL_CHAR)),
265 cut_err(APOSTROPHE),
266 )
267 .try_map(std::str::from_utf8)
268 .context(StrContext::Label("literal string")),
269 )
270 .parse_next(input)
271 }
272
273 // apostrophe = %x27 ; ' apostrophe
/// The `'` byte (`%x27`) that delimits a literal string.
pub(crate) const APOSTROPHE: u8 = 0x27;
275
276 // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
277 pub(crate) const LITERAL_CHAR: (
278 u8,
279 RangeInclusive<u8>,
280 RangeInclusive<u8>,
281 RangeInclusive<u8>,
282 ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
283
284 // ;; Multiline Literal String
285
286 // ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
287 // ml-literal-string-delim
ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>288 fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
289 trace(
290 "ml-literal-string",
291 delimited(
292 (ML_LITERAL_STRING_DELIM, opt(newline)),
293 cut_err(ml_literal_body.map(|t| {
294 if t.contains("\r\n") {
295 Cow::Owned(t.replace("\r\n", "\n"))
296 } else {
297 Cow::Borrowed(t)
298 }
299 }))
300 .context(StrContext::Label("multiline literal string")),
301 cut_err(ML_LITERAL_STRING_DELIM).context(StrContext::Label("multiline literal string")),
302 ),
303 )
304 .parse_next(input)
305 }
306
307 // ml-literal-string-delim = 3apostrophe
/// Three apostrophes: the opening and closing delimiter of a multi-line
/// literal string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = &[b'\''; 3];
309
310 // ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str>311 fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
312 (
313 repeat(0.., mll_content).map(|()| ()),
314 repeat(
315 0..,
316 (
317 mll_quotes(none_of(APOSTROPHE).value(())),
318 repeat(1.., mll_content).map(|()| ()),
319 ),
320 )
321 .map(|()| ()),
322 opt(mll_quotes(ML_LITERAL_STRING_DELIM.void())),
323 )
324 .take()
325 .try_map(std::str::from_utf8)
326 .parse_next(input)
327 }
328
329 // mll-content = mll-char / newline
mll_content(input: &mut Input<'_>) -> PResult<u8>330 fn mll_content(input: &mut Input<'_>) -> PResult<u8> {
331 alt((one_of(MLL_CHAR), newline.value(b'\n'))).parse_next(input)
332 }
333
334 // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
335 const MLL_CHAR: (
336 u8,
337 RangeInclusive<u8>,
338 RangeInclusive<u8>,
339 RangeInclusive<u8>,
340 ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
341
342 // mll-quotes = 1*2apostrophe
mll_quotes<'i>( mut term: impl Parser<Input<'i>, (), ContextError>, ) -> impl Parser<Input<'i>, &'i str, ContextError>343 fn mll_quotes<'i>(
344 mut term: impl Parser<Input<'i>, (), ContextError>,
345 ) -> impl Parser<Input<'i>, &'i str, ContextError> {
346 move |input: &mut Input<'i>| {
347 let start = input.checkpoint();
348 let res = terminated(b"''", peek(term.by_ref()))
349 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
350 .parse_next(input);
351
352 match res {
353 Err(winnow::error::ErrMode::Backtrack(_)) => {
354 input.reset(&start);
355 terminated(b"'", peek(term.by_ref()))
356 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
357 .parse_next(input)
358 }
359 res => res,
360 }
361 }
362 }
363
#[cfg(test)]
#[cfg(feature = "parse")]
#[cfg(feature = "display")]
mod test {
    use super::*;

    /// Runs the top-level `string` parser over all of `input` and checks the
    /// decoded value against `expected`.
    #[track_caller]
    fn assert_parses(input: &str, expected: &str) {
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // Escapes of every kind (`\"`, `\t`, `\n`, `\uXXXX`, `\UXXXXXXXX`) are decoded.
    #[test]
    fn basic_string() {
        assert_parses(
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#,
            "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}",
        );
    }

    // Leading-newline trimming, quotes inside the body, and unterminated inputs.
    #[test]
    fn ml_basic_string() {
        let valid = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];
        for (input, expected) in valid.iter().copied() {
            assert_parses(input, expected);
        }

        let invalid = [r#"""" """#, r#"""" \""""#];
        for input in invalid.iter() {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    // A line-ending backslash removes the newline and all following whitespace.
    #[test]
    fn ml_basic_string_escape_ws() {
        let folded = [
            r#""""
The quick brown \


fox jumps over \
the lazy dog.""""#,
            r#""""\
The quick brown \
fox jumps over \
the lazy dog.\
""""#,
        ];
        for input in folded.iter() {
            assert_parses(input, "The quick brown fox jumps over the lazy dog.");
        }

        let empties = [
            r#""""\
""""#,
            r#""""
\
\
""""#,
        ];
        for input in empties.iter() {
            assert_parses(input, "");
        }
    }

    // Literal strings are returned verbatim, minus the surrounding apostrophes.
    #[test]
    fn literal_string() {
        let inputs = [
            r"'C:\Users\nodejs\templates'",
            r"'\\ServerX\admin$\system32\'",
            r#"'Tom "Dubs" Preston-Werner'"#,
            r"'<\i\c*\s*>'",
        ];

        for input in inputs.iter() {
            assert_parses(input, &input[1..input.len() - 1]);
        }
    }

    // Multi-line literal strings are returned verbatim, minus the delimiters
    // and the first newline.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r"'''I [dw]on't need \d{2} apples'''",
            r#"''''one_quote''''"#,
        ];
        for input in inputs.iter() {
            assert_parses(input, &input[3..input.len() - 3]);
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
All other whitespace
is preserved.
'''"#;
        // Skip the opening delimiter plus the trimmed newline (4 bytes) and the
        // closing delimiter (3 bytes).
        assert_parses(input, &input[4..input.len() - 3]);
    }
}
480