1 /// The set of user configurable options for compiling zero or more regexes. 2 #[derive(Clone, Debug)] 3 #[allow(missing_docs)] 4 pub struct RegexOptions { 5 pub pats: Vec<String>, 6 pub size_limit: usize, 7 pub dfa_size_limit: usize, 8 pub nest_limit: u32, 9 pub case_insensitive: bool, 10 pub multi_line: bool, 11 pub dot_matches_new_line: bool, 12 pub swap_greed: bool, 13 pub ignore_whitespace: bool, 14 pub unicode: bool, 15 pub octal: bool, 16 } 17 18 impl Default for RegexOptions { default() -> Self19 fn default() -> Self { 20 RegexOptions { 21 pats: vec![], 22 size_limit: 10 * (1 << 20), 23 dfa_size_limit: 2 * (1 << 20), 24 nest_limit: 250, 25 case_insensitive: false, 26 multi_line: false, 27 dot_matches_new_line: false, 28 swap_greed: false, 29 ignore_whitespace: false, 30 unicode: true, 31 octal: false, 32 } 33 } 34 } 35 36 macro_rules! define_builder { 37 ($name:ident, $regex_mod:ident, $only_utf8:expr) => { 38 pub mod $name { 39 use super::RegexOptions; 40 use crate::error::Error; 41 use crate::exec::ExecBuilder; 42 43 use crate::$regex_mod::Regex; 44 45 /// A configurable builder for a regular expression. 46 /// 47 /// A builder can be used to configure how the regex is built, for example, by 48 /// setting the default flags (which can be overridden in the expression 49 /// itself) or setting various limits. 50 #[derive(Debug)] 51 pub struct RegexBuilder(RegexOptions); 52 53 impl RegexBuilder { 54 /// Create a new regular expression builder with the given pattern. 55 /// 56 /// If the pattern is invalid, then an error will be returned when 57 /// `build` is called. 58 pub fn new(pattern: &str) -> RegexBuilder { 59 let mut builder = RegexBuilder(RegexOptions::default()); 60 builder.0.pats.push(pattern.to_owned()); 61 builder 62 } 63 64 /// Consume the builder and compile the regular expression. 65 /// 66 /// Note that calling `as_str` on the resulting `Regex` will produce the 67 /// pattern given to `new` verbatim. Notably, it will not incorporate any 68 /// of the flags set on this builder. 69 pub fn build(&self) -> Result<Regex, Error> { 70 ExecBuilder::new_options(self.0.clone()) 71 .only_utf8($only_utf8) 72 .build() 73 .map(Regex::from) 74 } 75 76 /// Set the value for the case insensitive (`i`) flag. 77 /// 78 /// When enabled, letters in the pattern will match both upper case and 79 /// lower case variants. 80 pub fn case_insensitive( 81 &mut self, 82 yes: bool, 83 ) -> &mut RegexBuilder { 84 self.0.case_insensitive = yes; 85 self 86 } 87 88 /// Set the value for the multi-line matching (`m`) flag. 89 /// 90 /// When enabled, `^` matches the beginning of lines and `$` matches the 91 /// end of lines. 92 /// 93 /// By default, they match beginning/end of the input. 94 pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { 95 self.0.multi_line = yes; 96 self 97 } 98 99 /// Set the value for the any character (`s`) flag, where in `.` matches 100 /// anything when `s` is set and matches anything except for new line when 101 /// it is not set (the default). 102 /// 103 /// N.B. "matches anything" means "any byte" when Unicode is disabled and 104 /// means "any valid UTF-8 encoding of any Unicode scalar value" when 105 /// Unicode is enabled. 106 pub fn dot_matches_new_line( 107 &mut self, 108 yes: bool, 109 ) -> &mut RegexBuilder { 110 self.0.dot_matches_new_line = yes; 111 self 112 } 113 114 /// Set the value for the greedy swap (`U`) flag. 115 /// 116 /// When enabled, a pattern like `a*` is lazy (tries to find shortest 117 /// match) and `a*?` is greedy (tries to find longest match). 118 /// 119 /// By default, `a*` is greedy and `a*?` is lazy. 120 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { 121 self.0.swap_greed = yes; 122 self 123 } 124 125 /// Set the value for the ignore whitespace (`x`) flag. 126 /// 127 /// When enabled, whitespace such as new lines and spaces will be ignored 128 /// between expressions of the pattern, and `#` can be used to start a 129 /// comment until the next new line. 130 pub fn ignore_whitespace( 131 &mut self, 132 yes: bool, 133 ) -> &mut RegexBuilder { 134 self.0.ignore_whitespace = yes; 135 self 136 } 137 138 /// Set the value for the Unicode (`u`) flag. 139 /// 140 /// Enabled by default. When disabled, character classes such as `\w` only 141 /// match ASCII word characters instead of all Unicode word characters. 142 pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { 143 self.0.unicode = yes; 144 self 145 } 146 147 /// Whether to support octal syntax or not. 148 /// 149 /// Octal syntax is a little-known way of uttering Unicode codepoints in 150 /// a regular expression. For example, `a`, `\x61`, `\u0061` and 151 /// `\141` are all equivalent regular expressions, where the last example 152 /// shows octal syntax. 153 /// 154 /// While supporting octal syntax isn't in and of itself a problem, it does 155 /// make good error messages harder. That is, in PCRE based regex engines, 156 /// syntax like `\0` invokes a backreference, which is explicitly 157 /// unsupported in Rust's regex engine. However, many users expect it to 158 /// be supported. Therefore, when octal support is disabled, the error 159 /// message will explicitly mention that backreferences aren't supported. 160 /// 161 /// Octal syntax is disabled by default. 162 pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { 163 self.0.octal = yes; 164 self 165 } 166 167 /// Set the approximate size limit of the compiled regular expression. 168 /// 169 /// This roughly corresponds to the number of bytes occupied by a single 170 /// compiled program. If the program exceeds this number, then a 171 /// compilation error is returned. 172 pub fn size_limit( 173 &mut self, 174 limit: usize, 175 ) -> &mut RegexBuilder { 176 self.0.size_limit = limit; 177 self 178 } 179 180 /// Set the approximate size of the cache used by the DFA. 181 /// 182 /// This roughly corresponds to the number of bytes that the DFA will 183 /// use while searching. 184 /// 185 /// Note that this is a *per thread* limit. There is no way to set a global 186 /// limit. In particular, if a regex is used from multiple threads 187 /// simultaneously, then each thread may use up to the number of bytes 188 /// specified here. 189 pub fn dfa_size_limit( 190 &mut self, 191 limit: usize, 192 ) -> &mut RegexBuilder { 193 self.0.dfa_size_limit = limit; 194 self 195 } 196 197 /// Set the nesting limit for this parser. 198 /// 199 /// The nesting limit controls how deep the abstract syntax tree is allowed 200 /// to be. If the AST exceeds the given limit (e.g., with too many nested 201 /// groups), then an error is returned by the parser. 202 /// 203 /// The purpose of this limit is to act as a heuristic to prevent stack 204 /// overflow for consumers that do structural induction on an `Ast` using 205 /// explicit recursion. While this crate never does this (instead using 206 /// constant stack space and moving the call stack to the heap), other 207 /// crates may. 208 /// 209 /// This limit is not checked until the entire Ast is parsed. Therefore, 210 /// if callers want to put a limit on the amount of heap space used, then 211 /// they should impose a limit on the length, in bytes, of the concrete 212 /// pattern string. In particular, this is viable since this parser 213 /// implementation will limit itself to heap space proportional to the 214 /// length of the pattern string. 215 /// 216 /// Note that a nest limit of `0` will return a nest limit error for most 217 /// patterns but not all. For example, a nest limit of `0` permits `a` but 218 /// not `ab`, since `ab` requires a concatenation, which results in a nest 219 /// depth of `1`. In general, a nest limit is not something that manifests 220 /// in an obvious way in the concrete syntax, therefore, it should not be 221 /// used in a granular way. 222 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { 223 self.0.nest_limit = limit; 224 self 225 } 226 } 227 } 228 }; 229 } 230 231 define_builder!(bytes, re_bytes, false); 232 define_builder!(unicode, re_unicode, true); 233 234 macro_rules! define_set_builder { 235 ($name:ident, $regex_mod:ident, $only_utf8:expr) => { 236 pub mod $name { 237 use super::RegexOptions; 238 use crate::error::Error; 239 use crate::exec::ExecBuilder; 240 241 use crate::re_set::$regex_mod::RegexSet; 242 243 /// A configurable builder for a set of regular expressions. 244 /// 245 /// A builder can be used to configure how the regexes are built, for example, 246 /// by setting the default flags (which can be overridden in the expression 247 /// itself) or setting various limits. 248 #[derive(Debug)] 249 pub struct RegexSetBuilder(RegexOptions); 250 251 impl RegexSetBuilder { 252 /// Create a new regular expression builder with the given pattern. 253 /// 254 /// If the pattern is invalid, then an error will be returned when 255 /// `build` is called. 256 pub fn new<I, S>(patterns: I) -> RegexSetBuilder 257 where 258 S: AsRef<str>, 259 I: IntoIterator<Item = S>, 260 { 261 let mut builder = RegexSetBuilder(RegexOptions::default()); 262 for pat in patterns { 263 builder.0.pats.push(pat.as_ref().to_owned()); 264 } 265 builder 266 } 267 268 /// Consume the builder and compile the regular expressions into a set. 269 pub fn build(&self) -> Result<RegexSet, Error> { 270 ExecBuilder::new_options(self.0.clone()) 271 .only_utf8($only_utf8) 272 .build() 273 .map(RegexSet::from) 274 } 275 276 /// Set the value for the case insensitive (`i`) flag. 277 pub fn case_insensitive( 278 &mut self, 279 yes: bool, 280 ) -> &mut RegexSetBuilder { 281 self.0.case_insensitive = yes; 282 self 283 } 284 285 /// Set the value for the multi-line matching (`m`) flag. 286 pub fn multi_line( 287 &mut self, 288 yes: bool, 289 ) -> &mut RegexSetBuilder { 290 self.0.multi_line = yes; 291 self 292 } 293 294 /// Set the value for the any character (`s`) flag, where in `.` matches 295 /// anything when `s` is set and matches anything except for new line when 296 /// it is not set (the default). 297 /// 298 /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet` 299 /// expressions and means "any Unicode scalar value" for `regex::RegexSet` 300 /// expressions. 301 pub fn dot_matches_new_line( 302 &mut self, 303 yes: bool, 304 ) -> &mut RegexSetBuilder { 305 self.0.dot_matches_new_line = yes; 306 self 307 } 308 309 /// Set the value for the greedy swap (`U`) flag. 310 pub fn swap_greed( 311 &mut self, 312 yes: bool, 313 ) -> &mut RegexSetBuilder { 314 self.0.swap_greed = yes; 315 self 316 } 317 318 /// Set the value for the ignore whitespace (`x`) flag. 319 pub fn ignore_whitespace( 320 &mut self, 321 yes: bool, 322 ) -> &mut RegexSetBuilder { 323 self.0.ignore_whitespace = yes; 324 self 325 } 326 327 /// Set the value for the Unicode (`u`) flag. 328 pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { 329 self.0.unicode = yes; 330 self 331 } 332 333 /// Whether to support octal syntax or not. 334 /// 335 /// Octal syntax is a little-known way of uttering Unicode codepoints in 336 /// a regular expression. For example, `a`, `\x61`, `\u0061` and 337 /// `\141` are all equivalent regular expressions, where the last example 338 /// shows octal syntax. 339 /// 340 /// While supporting octal syntax isn't in and of itself a problem, it does 341 /// make good error messages harder. That is, in PCRE based regex engines, 342 /// syntax like `\0` invokes a backreference, which is explicitly 343 /// unsupported in Rust's regex engine. However, many users expect it to 344 /// be supported. Therefore, when octal support is disabled, the error 345 /// message will explicitly mention that backreferences aren't supported. 346 /// 347 /// Octal syntax is disabled by default. 348 pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { 349 self.0.octal = yes; 350 self 351 } 352 353 /// Set the approximate size limit of the compiled regular expression. 354 /// 355 /// This roughly corresponds to the number of bytes occupied by a single 356 /// compiled program. If the program exceeds this number, then a 357 /// compilation error is returned. 358 pub fn size_limit( 359 &mut self, 360 limit: usize, 361 ) -> &mut RegexSetBuilder { 362 self.0.size_limit = limit; 363 self 364 } 365 366 /// Set the approximate size of the cache used by the DFA. 367 /// 368 /// This roughly corresponds to the number of bytes that the DFA will 369 /// use while searching. 370 /// 371 /// Note that this is a *per thread* limit. There is no way to set a global 372 /// limit. In particular, if a regex is used from multiple threads 373 /// simultaneously, then each thread may use up to the number of bytes 374 /// specified here. 375 pub fn dfa_size_limit( 376 &mut self, 377 limit: usize, 378 ) -> &mut RegexSetBuilder { 379 self.0.dfa_size_limit = limit; 380 self 381 } 382 383 /// Set the nesting limit for this parser. 384 /// 385 /// The nesting limit controls how deep the abstract syntax tree is allowed 386 /// to be. If the AST exceeds the given limit (e.g., with too many nested 387 /// groups), then an error is returned by the parser. 388 /// 389 /// The purpose of this limit is to act as a heuristic to prevent stack 390 /// overflow for consumers that do structural induction on an `Ast` using 391 /// explicit recursion. While this crate never does this (instead using 392 /// constant stack space and moving the call stack to the heap), other 393 /// crates may. 394 /// 395 /// This limit is not checked until the entire Ast is parsed. Therefore, 396 /// if callers want to put a limit on the amount of heap space used, then 397 /// they should impose a limit on the length, in bytes, of the concrete 398 /// pattern string. In particular, this is viable since this parser 399 /// implementation will limit itself to heap space proportional to the 400 /// length of the pattern string. 401 /// 402 /// Note that a nest limit of `0` will return a nest limit error for most 403 /// patterns but not all. For example, a nest limit of `0` permits `a` but 404 /// not `ab`, since `ab` requires a concatenation, which results in a nest 405 /// depth of `1`. In general, a nest limit is not something that manifests 406 /// in an obvious way in the concrete syntax, therefore, it should not be 407 /// used in a granular way. 408 pub fn nest_limit( 409 &mut self, 410 limit: u32, 411 ) -> &mut RegexSetBuilder { 412 self.0.nest_limit = limit; 413 self 414 } 415 } 416 } 417 }; 418 } 419 420 define_set_builder!(set_bytes, bytes, false); 421 define_set_builder!(set_unicode, unicode, true); 422