1 use std::error;
2 use std::io::{self, Write};
3 use std::process;
4 use std::result;
5
6 use docopt::Docopt;
7 use regex::internal::{Compiler, LiteralSearcher};
8 use regex_syntax::hir::literal::Literals;
9 use regex_syntax::hir::Hir;
10
/// Docopt usage text for the `regex-debug` command line tool.
///
/// `main` passes this string to `Docopt::new` and deserializes the parsed
/// command line into `Args`, so the sub-commands, positional arguments and
/// flags listed here are the ones surfaced as fields of that struct.
const USAGE: &'static str = "
Usage:
regex-debug [options] ast <pattern>
regex-debug [options] hir <pattern>
regex-debug [options] prefixes <patterns> ...
regex-debug [options] suffixes <patterns> ...
regex-debug [options] anchors <pattern>
regex-debug [options] captures <pattern>
regex-debug [options] compile <patterns> ...
regex-debug [options] utf8-ranges <class>
regex-debug [options] utf8-ranges-rev <class>
regex-debug --help

Options:
--help Show this usage message.
--size-limit ARG An approximate size limit on the total size (in bytes)
of a compiled regular expression program.
[default: 10485760]
--bytes Show the instruction codes for byte oriented programs.
(As opposed to Unicode oriented programs.)
--dfa Show the instruction codes for a DFA.
--dfa-reverse Show the instruction codes for a reverse DFA.
This implies --dfa.
-a, --all-literals Shows all literals extracted.
By default, only unambiguous literals are shown.
--literal-limit ARG An approximate limit on the total size (in bytes)
of all literals extracted. [default: 250]
--class-limit ARG A limit on the size of character classes used to
extract literals. [default: 10]
--literal-bytes Show raw literal bytes instead of Unicode chars.
--lcp Show the longest common prefix of all the literals
extracted.
--lcs Show the longest common suffix of all the literals
extracted.
--searcher Show the debug output for the literal searcher
constructed by the literals found.
--quiet Show less output.
";
49
/// Parsed command line, deserialized by docopt from `USAGE` in `main`.
///
/// The `cmd_*` fields select the sub-command (`run` dispatches on them), the
/// `arg_*` fields hold positional arguments and the `flag_*` fields hold the
/// options.
#[derive(serde::Deserialize)]
struct Args {
    /// `ast` sub-command: handled by `cmd_ast`.
    cmd_ast: bool,
    /// `hir` sub-command: handled by `cmd_hir`.
    cmd_hir: bool,
    /// `prefixes` sub-command: handled by `cmd_literals` (prefix mode).
    cmd_prefixes: bool,
    /// `suffixes` sub-command: handled by `cmd_literals` (suffix mode).
    cmd_suffixes: bool,
    /// `anchors` sub-command: handled by `cmd_anchors`.
    cmd_anchors: bool,
    /// `captures` sub-command: handled by `cmd_captures`.
    cmd_captures: bool,
    /// `compile` sub-command: handled by `cmd_compile`.
    cmd_compile: bool,
    /// `utf8-ranges` sub-command: handled by `cmd_utf8_ranges`.
    cmd_utf8_ranges: bool,
    /// `utf8-ranges-rev` sub-command: handled by `cmd_utf8_ranges_rev`.
    cmd_utf8_ranges_rev: bool,

    /// The single `<pattern>` argument; read by `parse_one`, `cmd_ast` and
    /// `cmd_hir`.
    arg_pattern: String,
    /// The `<patterns> ...` arguments; read by `parse_many`.
    arg_patterns: Vec<String>,
    /// The `<class>` argument; the UTF-8 range commands wrap it in `[...]`
    /// before parsing it.
    arg_class: String,

    /// `--size-limit`: passed to `Compiler::size_limit` in `compiler`.
    flag_size_limit: usize,
    /// `--bytes`: passed to the compiler's `bytes` setting in `cmd_compile`;
    /// its negation is passed to `only_utf8`.
    flag_bytes: bool,
    /// `--dfa`: passed to the compiler's `dfa` setting in `cmd_compile`.
    /// `main` forces this to `true` when `flag_dfa_reverse` is set.
    flag_dfa: bool,
    /// `--dfa-reverse`: passed to the compiler's `reverse` setting in
    /// `cmd_compile`.
    flag_dfa_reverse: bool,
    /// `--all-literals`: when unset, `cmd_literals` reduces the extracted
    /// literals to the unambiguous prefixes/suffixes.
    flag_all_literals: bool,
    /// `--literal-limit`: passed to `Literals::set_limit_size`.
    flag_literal_limit: usize,
    /// `--class-limit`: passed to `Literals::set_limit_class`.
    flag_class_limit: usize,
    /// `--literal-bytes`: `cmd_literals` prints each literal through
    /// `escape_bytes` instead of its `Debug` form.
    flag_literal_bytes: bool,
    /// `--lcp`: `cmd_literals` prints only the longest common prefix.
    flag_lcp: bool,
    /// `--lcs`: `cmd_literals` prints only the longest common suffix.
    flag_lcs: bool,
    /// `--searcher`: `cmd_literals` prints the `Debug` form of the
    /// `LiteralSearcher` built from the literals.
    flag_searcher: bool,
    /// `--quiet`: `cmd_compile` prints only the instruction count.
    flag_quiet: bool,
}
79
/// Result type returned by every command in this file. The error side is a
/// boxed `std::error::Error` trait object (`Send + Sync`), so the different
/// error types produced by the commands can be propagated with `?`.
type Result<T> = result::Result<T, Box<dyn error::Error + Send + Sync>>;
81
main()82 fn main() {
83 let mut args: Args = Docopt::new(USAGE)
84 .and_then(|d| d.deserialize())
85 .unwrap_or_else(|e| e.exit());
86 if args.flag_dfa_reverse {
87 args.flag_dfa = true;
88 }
89 match run(&args) {
90 Ok(_) => process::exit(0),
91 Err(err) => {
92 let _ = writeln!(&mut io::stderr(), "{}", err);
93 process::exit(1)
94 }
95 }
96 }
97
run(args: &Args) -> Result<()>98 fn run(args: &Args) -> Result<()> {
99 if args.cmd_ast {
100 cmd_ast(args)
101 } else if args.cmd_hir {
102 cmd_hir(args)
103 } else if args.cmd_prefixes {
104 cmd_literals(args)
105 } else if args.cmd_suffixes {
106 cmd_literals(args)
107 } else if args.cmd_anchors {
108 cmd_anchors(args)
109 } else if args.cmd_captures {
110 cmd_captures(args)
111 } else if args.cmd_compile {
112 cmd_compile(args)
113 } else if args.cmd_utf8_ranges {
114 cmd_utf8_ranges(args)
115 } else if args.cmd_utf8_ranges_rev {
116 cmd_utf8_ranges_rev(args)
117 } else {
118 unreachable!()
119 }
120 }
121
cmd_ast(args: &Args) -> Result<()>122 fn cmd_ast(args: &Args) -> Result<()> {
123 use regex_syntax::ast::parse::Parser;
124
125 let mut parser = Parser::new();
126 let ast = parser.parse(&args.arg_pattern)?;
127 println!("{:#?}", ast);
128 Ok(())
129 }
130
cmd_hir(args: &Args) -> Result<()>131 fn cmd_hir(args: &Args) -> Result<()> {
132 use regex_syntax::ParserBuilder;
133
134 let mut parser = ParserBuilder::new().allow_invalid_utf8(false).build();
135 let hir = parser.parse(&args.arg_pattern)?;
136 println!("{:#?}", hir);
137 Ok(())
138 }
139
cmd_literals(args: &Args) -> Result<()>140 fn cmd_literals(args: &Args) -> Result<()> {
141 let exprs = args.parse_many()?;
142 let mut lits = if args.cmd_prefixes {
143 args.literals(&exprs, |lits, e| lits.union_prefixes(e))
144 } else {
145 args.literals(&exprs, |lits, e| lits.union_suffixes(e))
146 };
147 if !args.flag_all_literals {
148 if args.cmd_prefixes {
149 lits = lits.unambiguous_prefixes();
150 } else {
151 lits = lits.unambiguous_suffixes();
152 }
153 }
154 if args.flag_searcher {
155 if args.cmd_prefixes {
156 println!("{:?}", LiteralSearcher::prefixes(lits))
157 } else {
158 println!("{:?}", LiteralSearcher::suffixes(lits))
159 }
160 } else if args.flag_lcp {
161 println!("{}", escape_unicode(lits.longest_common_prefix()));
162 } else if args.flag_lcs {
163 println!("{}", escape_unicode(lits.longest_common_suffix()));
164 } else {
165 for lit in lits.literals() {
166 if args.flag_literal_bytes {
167 if lit.is_cut() {
168 println!("Cut({})", escape_bytes(lit));
169 } else {
170 println!("Complete({})", escape_bytes(lit));
171 }
172 } else {
173 println!("{:?}", lit);
174 }
175 }
176 }
177 Ok(())
178 }
179
cmd_anchors(args: &Args) -> Result<()>180 fn cmd_anchors(args: &Args) -> Result<()> {
181 let expr = args.parse_one()?;
182 if expr.is_anchored_start() {
183 println!("start");
184 }
185 if expr.is_anchored_end() {
186 println!("end");
187 }
188 Ok(())
189 }
190
cmd_captures(args: &Args) -> Result<()>191 fn cmd_captures(args: &Args) -> Result<()> {
192 let expr = args.parse_one()?;
193 let prog = args.compiler().only_utf8(false).compile(&[expr])?;
194 for (i, name) in prog.captures.iter().enumerate() {
195 match *name {
196 None => println!("{}", i),
197 Some(ref name) => println!("{}:{}", i, name),
198 }
199 }
200 Ok(())
201 }
202
cmd_compile(args: &Args) -> Result<()>203 fn cmd_compile(args: &Args) -> Result<()> {
204 let exprs = args.parse_many()?;
205 let compiler = args
206 .compiler()
207 .bytes(args.flag_bytes)
208 .only_utf8(!args.flag_bytes)
209 .dfa(args.flag_dfa)
210 .reverse(args.flag_dfa_reverse);
211 let prog = compiler.compile(&exprs)?;
212 if !args.flag_quiet {
213 print!("{:?}", prog);
214 } else {
215 println!("instruction count: {}", prog.insts.len());
216 }
217 Ok(())
218 }
219
cmd_utf8_ranges(args: &Args) -> Result<()>220 fn cmd_utf8_ranges(args: &Args) -> Result<()> {
221 use regex_syntax::hir::{self, HirKind};
222 use regex_syntax::utf8::Utf8Sequences;
223 use regex_syntax::ParserBuilder;
224
225 let hir = ParserBuilder::new()
226 .build()
227 .parse(&format!("[{}]", args.arg_class))?;
228 let cls = match hir.into_kind() {
229 HirKind::Class(hir::Class::Unicode(cls)) => cls,
230 _ => {
231 return Err(
232 format!("unexpected HIR, expected Unicode class").into()
233 )
234 }
235 };
236 let mut char_count = 0;
237 for (i, range) in cls.iter().enumerate() {
238 if i > 0 {
239 println!("----------------------------");
240 }
241 char_count += (range.end() as u32) - (range.start() as u32) + 1;
242 for seq in Utf8Sequences::new(range.start(), range.end()) {
243 for utf8_range in seq.into_iter() {
244 print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end);
245 }
246 println!();
247 }
248 }
249 println!("codepoint count: {}", char_count);
250 Ok(())
251 }
252
cmd_utf8_ranges_rev(args: &Args) -> Result<()>253 fn cmd_utf8_ranges_rev(args: &Args) -> Result<()> {
254 use regex_syntax::hir::{self, HirKind};
255 use regex_syntax::utf8::Utf8Sequences;
256 use regex_syntax::ParserBuilder;
257
258 let hir = ParserBuilder::new()
259 .build()
260 .parse(&format!("[{}]", args.arg_class))?;
261 let cls = match hir.into_kind() {
262 HirKind::Class(hir::Class::Unicode(cls)) => cls,
263 _ => {
264 return Err(
265 format!("unexpected HIR, expected Unicode class").into()
266 )
267 }
268 };
269 let mut char_count = 0;
270 let mut seqs = vec![];
271 for (_, range) in cls.iter().enumerate() {
272 char_count += (range.end() as u32) - (range.start() as u32) + 1;
273 for seq in Utf8Sequences::new(range.start(), range.end()) {
274 let mut seq = seq.as_slice().to_vec();
275 seq.reverse();
276 seqs.push(seq);
277 }
278 }
279 seqs.sort();
280 for seq in seqs {
281 for utf8_range in seq.into_iter() {
282 print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end);
283 }
284 println!();
285 }
286 println!("codepoint count: {}", char_count);
287 Ok(())
288 }
289
290 impl Args {
parse_one(&self) -> Result<Hir>291 fn parse_one(&self) -> Result<Hir> {
292 parse(&self.arg_pattern)
293 }
294
parse_many(&self) -> Result<Vec<Hir>>295 fn parse_many(&self) -> Result<Vec<Hir>> {
296 self.arg_patterns.iter().map(|s| parse(s)).collect()
297 }
298
literals<F: Fn(&mut Literals, &Hir) -> bool>( &self, exprs: &[Hir], get_literals: F, ) -> Literals299 fn literals<F: Fn(&mut Literals, &Hir) -> bool>(
300 &self,
301 exprs: &[Hir],
302 get_literals: F,
303 ) -> Literals {
304 let mut lits = Some(self.empty_literals());
305 for e in exprs {
306 lits = lits.and_then(|mut lits| {
307 if !get_literals(&mut lits, e) {
308 None
309 } else {
310 Some(lits)
311 }
312 });
313 }
314 lits.unwrap_or(self.empty_literals())
315 }
316
empty_literals(&self) -> Literals317 fn empty_literals(&self) -> Literals {
318 let mut lits = Literals::empty();
319 lits.set_limit_size(self.flag_literal_limit);
320 lits.set_limit_class(self.flag_class_limit);
321 lits
322 }
323
compiler(&self) -> Compiler324 fn compiler(&self) -> Compiler {
325 Compiler::new().size_limit(self.flag_size_limit)
326 }
327 }
328
parse(re: &str) -> Result<Hir>329 fn parse(re: &str) -> Result<Hir> {
330 use regex_syntax::ParserBuilder;
331 ParserBuilder::new()
332 .allow_invalid_utf8(true)
333 .build()
334 .parse(re)
335 .map_err(From::from)
336 }
337
escape_unicode(bytes: &[u8]) -> String338 fn escape_unicode(bytes: &[u8]) -> String {
339 let show = match ::std::str::from_utf8(bytes) {
340 Ok(v) => v.to_string(),
341 Err(_) => escape_bytes(bytes),
342 };
343 let mut space_escaped = String::new();
344 for c in show.chars() {
345 if c.is_whitespace() {
346 let escaped = if c as u32 <= 0x7F {
347 escape_byte(c as u8)
348 } else {
349 if c as u32 <= 0xFFFF {
350 format!(r"\u{{{:04x}}}", c as u32)
351 } else {
352 format!(r"\U{{{:08x}}}", c as u32)
353 }
354 };
355 space_escaped.push_str(&escaped);
356 } else {
357 space_escaped.push(c);
358 }
359 }
360 space_escaped
361 }
362
escape_bytes(bytes: &[u8]) -> String363 fn escape_bytes(bytes: &[u8]) -> String {
364 let mut s = String::new();
365 for &b in bytes {
366 s.push_str(&escape_byte(b));
367 }
368 s
369 }
370
/// Returns the `std::ascii::escape_default` rendering of `byte` as a
/// `String`.
fn escape_byte(byte: u8) -> String {
    use std::ascii::escape_default;

    // `escape_default` only yields ASCII bytes, so each maps directly to a
    // `char`.
    escape_default(byte).map(char::from).collect()
}
377