/*!
This module provides a regular expression printer for `Hir`.
*/
4 
5 use std::fmt;
6 
7 use crate::hir::visitor::{self, Visitor};
8 use crate::hir::{self, Hir, HirKind};
9 use crate::is_meta_character;
10 
/// Builder used to construct a [`Printer`].
///
/// The printer currently exposes no configuration options, so this builder
/// is kept private to the module rather than being exported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Zero-sized placeholder field; the builder carries no settings.
    _priv: (),
}
19 
20 impl Default for PrinterBuilder {
default() -> PrinterBuilder21     fn default() -> PrinterBuilder {
22         PrinterBuilder::new()
23     }
24 }
25 
26 impl PrinterBuilder {
new() -> PrinterBuilder27     fn new() -> PrinterBuilder {
28         PrinterBuilder { _priv: () }
29     }
30 
build(&self) -> Printer31     fn build(&self) -> Printer {
32         Printer { _priv: () }
33     }
34 }
35 
/// Prints a regular expression's high-level intermediate representation
/// (HIR) as a pattern string.
///
/// Given an `Hir`, the printer writes out a regular expression pattern for
/// it. This particular printer uses constant stack space and heap space
/// proportional to the size of the HIR.
///
/// Because only the HIR is consulted, the printed pattern will likely look
/// nothing like the pattern the HIR was originally built from. For example,
/// a pattern like `\pL` will have its entire class written out.
///
/// The purpose of this printer is to make it possible to mutate an HIR and
/// then build a regular expression from the mutated result. (A regex library
/// could instead offer a constructor that accepts this HIR directly, but
/// that would create an unnecessary public coupling between the regex
/// library and this specific HIR representation.)
#[derive(Debug)]
pub struct Printer {
    // Zero-sized placeholder field; the printer holds no state.
    _priv: (),
}
56 
57 impl Printer {
58     /// Create a new printer.
new() -> Printer59     pub fn new() -> Printer {
60         PrinterBuilder::new().build()
61     }
62 
63     /// Print the given `Ast` to the given writer. The writer must implement
64     /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
65     /// here are a `fmt::Formatter` (which is available in `fmt::Display`
66     /// implementations) or a `&mut String`.
print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result67     pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
68         visitor::visit(hir, Writer { wtr })
69     }
70 }
71 
/// The visitor state used while printing: a thin wrapper around the
/// destination that receives the pattern text.
#[derive(Debug)]
struct Writer<W> {
    /// Destination for the printed pattern.
    wtr: W,
}
76 
77 impl<W: fmt::Write> Visitor for Writer<W> {
78     type Output = ();
79     type Err = fmt::Error;
80 
finish(self) -> fmt::Result81     fn finish(self) -> fmt::Result {
82         Ok(())
83     }
84 
visit_pre(&mut self, hir: &Hir) -> fmt::Result85     fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
86         match *hir.kind() {
87             HirKind::Empty
88             | HirKind::Repetition(_)
89             | HirKind::Concat(_)
90             | HirKind::Alternation(_) => {}
91             HirKind::Literal(hir::Literal::Unicode(c)) => {
92                 self.write_literal_char(c)?;
93             }
94             HirKind::Literal(hir::Literal::Byte(b)) => {
95                 self.write_literal_byte(b)?;
96             }
97             HirKind::Class(hir::Class::Unicode(ref cls)) => {
98                 self.wtr.write_str("[")?;
99                 for range in cls.iter() {
100                     if range.start() == range.end() {
101                         self.write_literal_char(range.start())?;
102                     } else {
103                         self.write_literal_char(range.start())?;
104                         self.wtr.write_str("-")?;
105                         self.write_literal_char(range.end())?;
106                     }
107                 }
108                 self.wtr.write_str("]")?;
109             }
110             HirKind::Class(hir::Class::Bytes(ref cls)) => {
111                 self.wtr.write_str("(?-u:[")?;
112                 for range in cls.iter() {
113                     if range.start() == range.end() {
114                         self.write_literal_class_byte(range.start())?;
115                     } else {
116                         self.write_literal_class_byte(range.start())?;
117                         self.wtr.write_str("-")?;
118                         self.write_literal_class_byte(range.end())?;
119                     }
120                 }
121                 self.wtr.write_str("])")?;
122             }
123             HirKind::Anchor(hir::Anchor::StartLine) => {
124                 self.wtr.write_str("(?m:^)")?;
125             }
126             HirKind::Anchor(hir::Anchor::EndLine) => {
127                 self.wtr.write_str("(?m:$)")?;
128             }
129             HirKind::Anchor(hir::Anchor::StartText) => {
130                 self.wtr.write_str(r"\A")?;
131             }
132             HirKind::Anchor(hir::Anchor::EndText) => {
133                 self.wtr.write_str(r"\z")?;
134             }
135             HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
136                 self.wtr.write_str(r"\b")?;
137             }
138             HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
139                 self.wtr.write_str(r"\B")?;
140             }
141             HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
142                 self.wtr.write_str(r"(?-u:\b)")?;
143             }
144             HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
145                 self.wtr.write_str(r"(?-u:\B)")?;
146             }
147             HirKind::Group(ref x) => match x.kind {
148                 hir::GroupKind::CaptureIndex(_) => {
149                     self.wtr.write_str("(")?;
150                 }
151                 hir::GroupKind::CaptureName { ref name, .. } => {
152                     write!(self.wtr, "(?P<{}>", name)?;
153                 }
154                 hir::GroupKind::NonCapturing => {
155                     self.wtr.write_str("(?:")?;
156                 }
157             },
158         }
159         Ok(())
160     }
161 
visit_post(&mut self, hir: &Hir) -> fmt::Result162     fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
163         match *hir.kind() {
164             // Handled during visit_pre
165             HirKind::Empty
166             | HirKind::Literal(_)
167             | HirKind::Class(_)
168             | HirKind::Anchor(_)
169             | HirKind::WordBoundary(_)
170             | HirKind::Concat(_)
171             | HirKind::Alternation(_) => {}
172             HirKind::Repetition(ref x) => {
173                 match x.kind {
174                     hir::RepetitionKind::ZeroOrOne => {
175                         self.wtr.write_str("?")?;
176                     }
177                     hir::RepetitionKind::ZeroOrMore => {
178                         self.wtr.write_str("*")?;
179                     }
180                     hir::RepetitionKind::OneOrMore => {
181                         self.wtr.write_str("+")?;
182                     }
183                     hir::RepetitionKind::Range(ref x) => match *x {
184                         hir::RepetitionRange::Exactly(m) => {
185                             write!(self.wtr, "{{{}}}", m)?;
186                         }
187                         hir::RepetitionRange::AtLeast(m) => {
188                             write!(self.wtr, "{{{},}}", m)?;
189                         }
190                         hir::RepetitionRange::Bounded(m, n) => {
191                             write!(self.wtr, "{{{},{}}}", m, n)?;
192                         }
193                     },
194                 }
195                 if !x.greedy {
196                     self.wtr.write_str("?")?;
197                 }
198             }
199             HirKind::Group(_) => {
200                 self.wtr.write_str(")")?;
201             }
202         }
203         Ok(())
204     }
205 
visit_alternation_in(&mut self) -> fmt::Result206     fn visit_alternation_in(&mut self) -> fmt::Result {
207         self.wtr.write_str("|")
208     }
209 }
210 
211 impl<W: fmt::Write> Writer<W> {
write_literal_char(&mut self, c: char) -> fmt::Result212     fn write_literal_char(&mut self, c: char) -> fmt::Result {
213         if is_meta_character(c) {
214             self.wtr.write_str("\\")?;
215         }
216         self.wtr.write_char(c)
217     }
218 
219     fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
220         let c = b as char;
221         if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
222             self.write_literal_char(c)
223         } else {
224             write!(self.wtr, "(?-u:\\x{:02X})", b)
225         }
226     }
227 
228     fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
229         let c = b as char;
230         if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
231             self.write_literal_char(c)
232         } else {
233             write!(self.wtr, "\\x{:02X}", b)
234         }
235     }
236 }
237 
#[cfg(test)]
mod tests {
    use super::Printer;
    use crate::ParserBuilder;

    /// Round-trips `given` through a parser with default settings.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|builder| builder, given, expected);
    }

    /// Round-trips `given` through a parser that allows invalid UTF-8.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(
            |builder| builder.allow_invalid_utf8(true),
            given,
            expected,
        );
    }

    /// Parses `given` with a parser configured by `configure`, prints the
    /// resulting HIR, checks that the printed pattern parses again under the
    /// same configuration, and finally compares it with `expected`.
    fn roundtrip_with<F>(mut configure: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        configure(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printed = String::new();
        Printer::new().print(&hir, &mut printed).unwrap();

        // The printed pattern must itself be a valid pattern.
        builder.build().parse(&printed).unwrap();

        assert_eq!(expected, printed);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"[a]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]");
        roundtrip(r"[-]", r"[\-]");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");

        roundtrip(r"(?-u)[a]", r"(?-u:[a])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // Meta characters inside Unicode character classes must come out
        // escaped.
        roundtrip(r"[\[]", r"[\[]");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");

        // Meta characters inside byte oriented character classes must come
        // out escaped as well.
        roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");
    }

    #[test]
    fn print_anchor() {
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        // Zero or one.
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");

        // Zero or more.
        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");

        // One or more.
        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");

        // Counted repetitions.
        roundtrip("a{1}", "a{1}");
        roundtrip("a{1,}", "a{1,}");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a{1}?");
        roundtrip("a{1,}?", "a{1,}?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a{1}?");
        roundtrip("(?U)a{1,}", "a{1,}?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");
    }

    #[test]
    fn print_group() {
        // Empty groups.
        roundtrip("()", "()");
        roundtrip("(?P<foo>)", "(?P<foo>)");
        roundtrip("(?:)", "(?:)");

        // Groups with a single literal inside.
        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "(?:a)");

        // Nested groups.
        roundtrip("((((a))))", "((((a))))");
    }

    #[test]
    fn print_alternation() {
        // Alternations of empty branches.
        roundtrip("|", "|");
        roundtrip("||", "||");

        roundtrip("a|b", "a|b");
        roundtrip("a|b|c", "a|b|c");
        roundtrip("foo|bar|quux", "foo|bar|quux");
    }
}
368