1 use ast; 2 use hir; 3 4 use Result; 5 6 /// A builder for a regular expression parser. 7 /// 8 /// This builder permits modifying configuration options for the parser. 9 /// 10 /// This type combines the builder options for both the 11 /// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html) 12 /// and the 13 /// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html). 14 #[derive(Clone, Debug, Default)] 15 pub struct ParserBuilder { 16 ast: ast::parse::ParserBuilder, 17 hir: hir::translate::TranslatorBuilder, 18 } 19 20 impl ParserBuilder { 21 /// Create a new parser builder with a default configuration. new() -> ParserBuilder22 pub fn new() -> ParserBuilder { 23 ParserBuilder::default() 24 } 25 26 /// Build a parser from this configuration with the given pattern. build(&self) -> Parser27 pub fn build(&self) -> Parser { 28 Parser { ast: self.ast.build(), hir: self.hir.build() } 29 } 30 31 /// Set the nesting limit for this parser. 32 /// 33 /// The nesting limit controls how deep the abstract syntax tree is allowed 34 /// to be. If the AST exceeds the given limit (e.g., with too many nested 35 /// groups), then an error is returned by the parser. 36 /// 37 /// The purpose of this limit is to act as a heuristic to prevent stack 38 /// overflow for consumers that do structural induction on an `Ast` using 39 /// explicit recursion. While this crate never does this (instead using 40 /// constant stack space and moving the call stack to the heap), other 41 /// crates may. 42 /// 43 /// This limit is not checked until the entire Ast is parsed. Therefore, 44 /// if callers want to put a limit on the amount of heap space used, then 45 /// they should impose a limit on the length, in bytes, of the concrete 46 /// pattern string. In particular, this is viable since this parser 47 /// implementation will limit itself to heap space proportional to the 48 /// lenth of the pattern string. 49 /// 50 /// Note that a nest limit of `0` will return a nest limit error for most 51 /// patterns but not all. For example, a nest limit of `0` permits `a` but 52 /// not `ab`, since `ab` requires a concatenation, which results in a nest 53 /// depth of `1`. In general, a nest limit is not something that manifests 54 /// in an obvious way in the concrete syntax, therefore, it should not be 55 /// used in a granular way. nest_limit(&mut self, limit: u32) -> &mut ParserBuilder56 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { 57 self.ast.nest_limit(limit); 58 self 59 } 60 61 /// Whether to support octal syntax or not. 62 /// 63 /// Octal syntax is a little-known way of uttering Unicode codepoints in 64 /// a regular expression. For example, `a`, `\x61`, `\u0061` and 65 /// `\141` are all equivalent regular expressions, where the last example 66 /// shows octal syntax. 67 /// 68 /// While supporting octal syntax isn't in and of itself a problem, it does 69 /// make good error messages harder. That is, in PCRE based regex engines, 70 /// syntax like `\0` invokes a backreference, which is explicitly 71 /// unsupported in Rust's regex engine. However, many users expect it to 72 /// be supported. Therefore, when octal support is disabled, the error 73 /// message will explicitly mention that backreferences aren't supported. 74 /// 75 /// Octal syntax is disabled by default. octal(&mut self, yes: bool) -> &mut ParserBuilder76 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { 77 self.ast.octal(yes); 78 self 79 } 80 81 /// When enabled, the parser will permit the construction of a regular 82 /// expression that may match invalid UTF-8. 83 /// 84 /// When disabled (the default), the parser is guaranteed to produce 85 /// an expression that will only ever match valid UTF-8 (otherwise, the 86 /// parser will return an error). 87 /// 88 /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII 89 /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause 90 /// the parser to return an error. Namely, a negated ASCII word boundary 91 /// can result in matching positions that aren't valid UTF-8 boundaries. allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder92 pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder { 93 self.hir.allow_invalid_utf8(yes); 94 self 95 } 96 97 /// Enable verbose mode in the regular expression. 98 /// 99 /// When enabled, verbose mode permits insigificant whitespace in many 100 /// places in the regular expression, as well as comments. Comments are 101 /// started using `#` and continue until the end of the line. 102 /// 103 /// By default, this is disabled. It may be selectively enabled in the 104 /// regular expression by using the `x` flag regardless of this setting. ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder105 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { 106 self.ast.ignore_whitespace(yes); 107 self 108 } 109 110 /// Enable or disable the case insensitive flag by default. 111 /// 112 /// By default this is disabled. It may alternatively be selectively 113 /// enabled in the regular expression itself via the `i` flag. case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder114 pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { 115 self.hir.case_insensitive(yes); 116 self 117 } 118 119 /// Enable or disable the multi-line matching flag by default. 120 /// 121 /// By default this is disabled. It may alternatively be selectively 122 /// enabled in the regular expression itself via the `m` flag. multi_line(&mut self, yes: bool) -> &mut ParserBuilder123 pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { 124 self.hir.multi_line(yes); 125 self 126 } 127 128 /// Enable or disable the "dot matches any character" flag by default. 129 /// 130 /// By default this is disabled. It may alternatively be selectively 131 /// enabled in the regular expression itself via the `s` flag. dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder132 pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder { 133 self.hir.dot_matches_new_line(yes); 134 self 135 } 136 137 /// Enable or disable the "swap greed" flag by default. 138 /// 139 /// By default this is disabled. It may alternatively be selectively 140 /// enabled in the regular expression itself via the `U` flag. swap_greed(&mut self, yes: bool) -> &mut ParserBuilder141 pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { 142 self.hir.swap_greed(yes); 143 self 144 } 145 146 /// Enable or disable the Unicode flag (`u`) by default. 147 /// 148 /// By default this is **enabled**. It may alternatively be selectively 149 /// disabled in the regular expression itself via the `u` flag. 150 /// 151 /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by 152 /// default), a regular expression will fail to parse if Unicode mode is 153 /// disabled and a sub-expression could possibly match invalid UTF-8. unicode(&mut self, yes: bool) -> &mut ParserBuilder154 pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { 155 self.hir.unicode(yes); 156 self 157 } 158 } 159 160 /// A convenience parser for regular expressions. 161 /// 162 /// This parser takes as input a regular expression pattern string (the 163 /// "concrete syntax") and returns a high-level intermediate representation 164 /// (the HIR) suitable for most types of analysis. In particular, this parser 165 /// hides the intermediate state of producing an AST (the "abstract syntax"). 166 /// The AST is itself far more complex than the HIR, so this parser serves as a 167 /// convenience for never having to deal with it at all. 168 /// 169 /// If callers have more fine grained use cases that need an AST, then please 170 /// see the [`ast::parse`](ast/parse/index.html) module. 171 /// 172 /// A `Parser` can be configured in more detail via a 173 /// [`ParserBuilder`](struct.ParserBuilder.html). 174 #[derive(Clone, Debug)] 175 pub struct Parser { 176 ast: ast::parse::Parser, 177 hir: hir::translate::Translator, 178 } 179 180 impl Parser { 181 /// Create a new parser with a default configuration. 182 /// 183 /// The parser can be run with `parse` method. The parse method returns 184 /// a high level intermediate representation of the given regular 185 /// expression. 186 /// 187 /// To set configuration options on the parser, use 188 /// [`ParserBuilder`](struct.ParserBuilder.html). new() -> Parser189 pub fn new() -> Parser { 190 ParserBuilder::new().build() 191 } 192 193 /// Parse the regular expression into a high level intermediate 194 /// representation. parse(&mut self, pattern: &str) -> Result<hir::Hir>195 pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> { 196 let ast = self.ast.parse(pattern)?; 197 let hir = self.hir.translate(pattern, &ast)?; 198 Ok(hir) 199 } 200 } 201