• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::error;
2 use std::io::{self, Write};
3 use std::process;
4 use std::result;
5 
6 use docopt::Docopt;
7 use regex::internal::{Compiler, LiteralSearcher};
8 use regex_syntax::hir::literal::Literals;
9 use regex_syntax::hir::Hir;
10 
/// Docopt usage text for the `regex-debug` tool.
///
/// This string is handed to `Docopt::new` in `main`, so its exact wording
/// (commands, option names and `[default: ...]` annotations) is part of the
/// program's behaviour and must stay in sync with the fields of `Args`.
const USAGE: &str = "
Usage:
    regex-debug [options] ast <pattern>
    regex-debug [options] hir <pattern>
    regex-debug [options] prefixes <patterns> ...
    regex-debug [options] suffixes <patterns> ...
    regex-debug [options] anchors <pattern>
    regex-debug [options] captures <pattern>
    regex-debug [options] compile <patterns> ...
    regex-debug [options] utf8-ranges <class>
    regex-debug [options] utf8-ranges-rev <class>
    regex-debug --help

Options:
    --help               Show this usage message.
    --size-limit ARG     An approximate size limit on the total size (in bytes)
                         of a compiled regular expression program.
                         [default: 10485760]
    --bytes              Show the instruction codes for byte oriented programs.
                         (As opposed to Unicode oriented programs.)
    --dfa                Show the instruction codes for a DFA.
    --dfa-reverse        Show the instruction codes for a reverse DFA.
                         This implies --dfa.
    -a, --all-literals   Shows all literals extracted.
                         By default, only unambiguous literals are shown.
    --literal-limit ARG  An approximate limit on the total size (in bytes)
                         of all literals extracted. [default: 250]
    --class-limit ARG    A limit on the size of character classes used to
                         extract literals. [default: 10]
    --literal-bytes      Show raw literal bytes instead of Unicode chars.
    --lcp                Show the longest common prefix of all the literals
                         extracted.
    --lcs                Show the longest common suffix of all the literals
                         extracted.
    --searcher           Show the debug output for the literal searcher
                         constructed by the literals found.
    --quiet              Show less output.
";
49 
/// Command-line arguments, deserialized by docopt from `USAGE`.
///
/// Field names follow docopt's naming convention: `cmd_*` for sub-commands,
/// `arg_*` for positional arguments and `flag_*` for options.
#[derive(serde::Deserialize)]
struct Args {
    /// The `ast` sub-command was selected.
    cmd_ast: bool,
    /// The `hir` sub-command was selected.
    cmd_hir: bool,
    /// The `prefixes` sub-command was selected.
    cmd_prefixes: bool,
    /// The `suffixes` sub-command was selected.
    cmd_suffixes: bool,
    /// The `anchors` sub-command was selected.
    cmd_anchors: bool,
    /// The `captures` sub-command was selected.
    cmd_captures: bool,
    /// The `compile` sub-command was selected.
    cmd_compile: bool,
    /// The `utf8-ranges` sub-command was selected.
    cmd_utf8_ranges: bool,
    /// The `utf8-ranges-rev` sub-command was selected.
    cmd_utf8_ranges_rev: bool,

    /// The single `<pattern>` positional argument.
    arg_pattern: String,
    /// The repeated `<patterns>` positional arguments.
    arg_patterns: Vec<String>,
    /// The `<class>` positional argument (wrapped in `[...]` before parsing).
    arg_class: String,

    /// `--size-limit`: approximate size limit (in bytes) of a compiled program.
    flag_size_limit: usize,
    /// `--bytes`: show instruction codes for byte oriented programs.
    flag_bytes: bool,
    /// `--dfa`: show instruction codes for a DFA.
    flag_dfa: bool,
    /// `--dfa-reverse`: show instruction codes for a reverse DFA (implies `--dfa`).
    flag_dfa_reverse: bool,
    /// `-a, --all-literals`: show all literals, not only the unambiguous ones.
    flag_all_literals: bool,
    /// `--literal-limit`: approximate limit (in bytes) on all extracted literals.
    flag_literal_limit: usize,
    /// `--class-limit`: limit on the size of character classes used for literals.
    flag_class_limit: usize,
    /// `--literal-bytes`: show raw literal bytes instead of Unicode chars.
    flag_literal_bytes: bool,
    /// `--lcp`: show the longest common prefix of the extracted literals.
    flag_lcp: bool,
    /// `--lcs`: show the longest common suffix of the extracted literals.
    flag_lcs: bool,
    /// `--searcher`: show the debug output of the literal searcher.
    flag_searcher: bool,
    /// `--quiet`: show less output.
    flag_quiet: bool,
}
79 
/// Result alias used by every sub-command: any error is carried as a boxed,
/// thread-safe trait object so that `?` can convert from whatever error type
/// the underlying call produces.
type Result<T> = result::Result<T, Box<dyn error::Error + Send + Sync>>;
81 
main()82 fn main() {
83     let mut args: Args = Docopt::new(USAGE)
84         .and_then(|d| d.deserialize())
85         .unwrap_or_else(|e| e.exit());
86     if args.flag_dfa_reverse {
87         args.flag_dfa = true;
88     }
89     match run(&args) {
90         Ok(_) => process::exit(0),
91         Err(err) => {
92             let _ = writeln!(&mut io::stderr(), "{}", err);
93             process::exit(1)
94         }
95     }
96 }
97 
run(args: &Args) -> Result<()>98 fn run(args: &Args) -> Result<()> {
99     if args.cmd_ast {
100         cmd_ast(args)
101     } else if args.cmd_hir {
102         cmd_hir(args)
103     } else if args.cmd_prefixes {
104         cmd_literals(args)
105     } else if args.cmd_suffixes {
106         cmd_literals(args)
107     } else if args.cmd_anchors {
108         cmd_anchors(args)
109     } else if args.cmd_captures {
110         cmd_captures(args)
111     } else if args.cmd_compile {
112         cmd_compile(args)
113     } else if args.cmd_utf8_ranges {
114         cmd_utf8_ranges(args)
115     } else if args.cmd_utf8_ranges_rev {
116         cmd_utf8_ranges_rev(args)
117     } else {
118         unreachable!()
119     }
120 }
121 
cmd_ast(args: &Args) -> Result<()>122 fn cmd_ast(args: &Args) -> Result<()> {
123     use regex_syntax::ast::parse::Parser;
124 
125     let mut parser = Parser::new();
126     let ast = parser.parse(&args.arg_pattern)?;
127     println!("{:#?}", ast);
128     Ok(())
129 }
130 
cmd_hir(args: &Args) -> Result<()>131 fn cmd_hir(args: &Args) -> Result<()> {
132     use regex_syntax::ParserBuilder;
133 
134     let mut parser = ParserBuilder::new().allow_invalid_utf8(false).build();
135     let hir = parser.parse(&args.arg_pattern)?;
136     println!("{:#?}", hir);
137     Ok(())
138 }
139 
cmd_literals(args: &Args) -> Result<()>140 fn cmd_literals(args: &Args) -> Result<()> {
141     let exprs = args.parse_many()?;
142     let mut lits = if args.cmd_prefixes {
143         args.literals(&exprs, |lits, e| lits.union_prefixes(e))
144     } else {
145         args.literals(&exprs, |lits, e| lits.union_suffixes(e))
146     };
147     if !args.flag_all_literals {
148         if args.cmd_prefixes {
149             lits = lits.unambiguous_prefixes();
150         } else {
151             lits = lits.unambiguous_suffixes();
152         }
153     }
154     if args.flag_searcher {
155         if args.cmd_prefixes {
156             println!("{:?}", LiteralSearcher::prefixes(lits))
157         } else {
158             println!("{:?}", LiteralSearcher::suffixes(lits))
159         }
160     } else if args.flag_lcp {
161         println!("{}", escape_unicode(lits.longest_common_prefix()));
162     } else if args.flag_lcs {
163         println!("{}", escape_unicode(lits.longest_common_suffix()));
164     } else {
165         for lit in lits.literals() {
166             if args.flag_literal_bytes {
167                 if lit.is_cut() {
168                     println!("Cut({})", escape_bytes(lit));
169                 } else {
170                     println!("Complete({})", escape_bytes(lit));
171                 }
172             } else {
173                 println!("{:?}", lit);
174             }
175         }
176     }
177     Ok(())
178 }
179 
cmd_anchors(args: &Args) -> Result<()>180 fn cmd_anchors(args: &Args) -> Result<()> {
181     let expr = args.parse_one()?;
182     if expr.is_anchored_start() {
183         println!("start");
184     }
185     if expr.is_anchored_end() {
186         println!("end");
187     }
188     Ok(())
189 }
190 
cmd_captures(args: &Args) -> Result<()>191 fn cmd_captures(args: &Args) -> Result<()> {
192     let expr = args.parse_one()?;
193     let prog = args.compiler().only_utf8(false).compile(&[expr])?;
194     for (i, name) in prog.captures.iter().enumerate() {
195         match *name {
196             None => println!("{}", i),
197             Some(ref name) => println!("{}:{}", i, name),
198         }
199     }
200     Ok(())
201 }
202 
cmd_compile(args: &Args) -> Result<()>203 fn cmd_compile(args: &Args) -> Result<()> {
204     let exprs = args.parse_many()?;
205     let compiler = args
206         .compiler()
207         .bytes(args.flag_bytes)
208         .only_utf8(!args.flag_bytes)
209         .dfa(args.flag_dfa)
210         .reverse(args.flag_dfa_reverse);
211     let prog = compiler.compile(&exprs)?;
212     if !args.flag_quiet {
213         print!("{:?}", prog);
214     } else {
215         println!("instruction count: {}", prog.insts.len());
216     }
217     Ok(())
218 }
219 
cmd_utf8_ranges(args: &Args) -> Result<()>220 fn cmd_utf8_ranges(args: &Args) -> Result<()> {
221     use regex_syntax::hir::{self, HirKind};
222     use regex_syntax::utf8::Utf8Sequences;
223     use regex_syntax::ParserBuilder;
224 
225     let hir = ParserBuilder::new()
226         .build()
227         .parse(&format!("[{}]", args.arg_class))?;
228     let cls = match hir.into_kind() {
229         HirKind::Class(hir::Class::Unicode(cls)) => cls,
230         _ => {
231             return Err(
232                 format!("unexpected HIR, expected Unicode class").into()
233             )
234         }
235     };
236     let mut char_count = 0;
237     for (i, range) in cls.iter().enumerate() {
238         if i > 0 {
239             println!("----------------------------");
240         }
241         char_count += (range.end() as u32) - (range.start() as u32) + 1;
242         for seq in Utf8Sequences::new(range.start(), range.end()) {
243             for utf8_range in seq.into_iter() {
244                 print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end);
245             }
246             println!();
247         }
248     }
249     println!("codepoint count: {}", char_count);
250     Ok(())
251 }
252 
cmd_utf8_ranges_rev(args: &Args) -> Result<()>253 fn cmd_utf8_ranges_rev(args: &Args) -> Result<()> {
254     use regex_syntax::hir::{self, HirKind};
255     use regex_syntax::utf8::Utf8Sequences;
256     use regex_syntax::ParserBuilder;
257 
258     let hir = ParserBuilder::new()
259         .build()
260         .parse(&format!("[{}]", args.arg_class))?;
261     let cls = match hir.into_kind() {
262         HirKind::Class(hir::Class::Unicode(cls)) => cls,
263         _ => {
264             return Err(
265                 format!("unexpected HIR, expected Unicode class").into()
266             )
267         }
268     };
269     let mut char_count = 0;
270     let mut seqs = vec![];
271     for (_, range) in cls.iter().enumerate() {
272         char_count += (range.end() as u32) - (range.start() as u32) + 1;
273         for seq in Utf8Sequences::new(range.start(), range.end()) {
274             let mut seq = seq.as_slice().to_vec();
275             seq.reverse();
276             seqs.push(seq);
277         }
278     }
279     seqs.sort();
280     for seq in seqs {
281         for utf8_range in seq.into_iter() {
282             print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end);
283         }
284         println!();
285     }
286     println!("codepoint count: {}", char_count);
287     Ok(())
288 }
289 
290 impl Args {
parse_one(&self) -> Result<Hir>291     fn parse_one(&self) -> Result<Hir> {
292         parse(&self.arg_pattern)
293     }
294 
parse_many(&self) -> Result<Vec<Hir>>295     fn parse_many(&self) -> Result<Vec<Hir>> {
296         self.arg_patterns.iter().map(|s| parse(s)).collect()
297     }
298 
literals<F: Fn(&mut Literals, &Hir) -> bool>( &self, exprs: &[Hir], get_literals: F, ) -> Literals299     fn literals<F: Fn(&mut Literals, &Hir) -> bool>(
300         &self,
301         exprs: &[Hir],
302         get_literals: F,
303     ) -> Literals {
304         let mut lits = Some(self.empty_literals());
305         for e in exprs {
306             lits = lits.and_then(|mut lits| {
307                 if !get_literals(&mut lits, e) {
308                     None
309                 } else {
310                     Some(lits)
311                 }
312             });
313         }
314         lits.unwrap_or(self.empty_literals())
315     }
316 
empty_literals(&self) -> Literals317     fn empty_literals(&self) -> Literals {
318         let mut lits = Literals::empty();
319         lits.set_limit_size(self.flag_literal_limit);
320         lits.set_limit_class(self.flag_class_limit);
321         lits
322     }
323 
compiler(&self) -> Compiler324     fn compiler(&self) -> Compiler {
325         Compiler::new().size_limit(self.flag_size_limit)
326     }
327 }
328 
parse(re: &str) -> Result<Hir>329 fn parse(re: &str) -> Result<Hir> {
330     use regex_syntax::ParserBuilder;
331     ParserBuilder::new()
332         .allow_invalid_utf8(true)
333         .build()
334         .parse(re)
335         .map_err(From::from)
336 }
337 
escape_unicode(bytes: &[u8]) -> String338 fn escape_unicode(bytes: &[u8]) -> String {
339     let show = match ::std::str::from_utf8(bytes) {
340         Ok(v) => v.to_string(),
341         Err(_) => escape_bytes(bytes),
342     };
343     let mut space_escaped = String::new();
344     for c in show.chars() {
345         if c.is_whitespace() {
346             let escaped = if c as u32 <= 0x7F {
347                 escape_byte(c as u8)
348             } else {
349                 if c as u32 <= 0xFFFF {
350                     format!(r"\u{{{:04x}}}", c as u32)
351                 } else {
352                     format!(r"\U{{{:08x}}}", c as u32)
353                 }
354             };
355             space_escaped.push_str(&escaped);
356         } else {
357             space_escaped.push(c);
358         }
359     }
360     space_escaped
361 }
362 
escape_bytes(bytes: &[u8]) -> String363 fn escape_bytes(bytes: &[u8]) -> String {
364     let mut s = String::new();
365     for &b in bytes {
366         s.push_str(&escape_byte(b));
367     }
368     s
369 }
370 
/// Returns the `std::ascii::escape_default` rendering of `byte` as a string.
fn escape_byte(byte: u8) -> String {
    // The escape sequence consists solely of ASCII bytes, so each one maps
    // directly onto a `char`.
    std::ascii::escape_default(byte).map(char::from).collect()
}
377