// Source-browser navigation residue (not part of the original Rust source):
// Home | Line# | Scopes# | Navigate# | Raw | Download
/// The set of user configurable options for compiling zero or more regexes.
///
/// The builders in this module wrap a value of this type and write to its
/// fields directly; each field corresponds to the builder method of the same
/// name (`pats` is filled by the builders' `new` constructors).
#[derive(Clone, Debug)]
pub struct RegexOptions {
    /// The pattern strings to compile, in the order they were supplied.
    pub pats: Vec<String>,
    /// Approximate size limit, in bytes, of a single compiled program.
    pub size_limit: usize,
    /// Approximate size, in bytes, of the cache used by the DFA.
    pub dfa_size_limit: usize,
    /// Maximum nesting depth allowed for the parsed abstract syntax tree.
    pub nest_limit: u32,
    /// Value of the case insensitive (`i`) flag.
    pub case_insensitive: bool,
    /// Value of the multi-line matching (`m`) flag.
    pub multi_line: bool,
    /// Value of the any character (`s`) flag.
    pub dot_matches_new_line: bool,
    /// Value of the greedy swap (`U`) flag.
    pub swap_greed: bool,
    /// Value of the ignore whitespace (`x`) flag.
    pub ignore_whitespace: bool,
    /// Value of the Unicode (`u`) flag.
    pub unicode: bool,
    /// Whether octal syntax is supported.
    pub octal: bool,
}
17 
18 impl Default for RegexOptions {
default() -> Self19     fn default() -> Self {
20         RegexOptions {
21             pats: vec![],
22             size_limit: 10 * (1 << 20),
23             dfa_size_limit: 2 * (1 << 20),
24             nest_limit: 250,
25             case_insensitive: false,
26             multi_line: false,
27             dot_matches_new_line: false,
28             swap_greed: false,
29             ignore_whitespace: false,
30             unicode: true,
31             octal: false,
32         }
33     }
34 }
35 
// Generates a public module `$name` containing a `RegexBuilder` type.
//
// * `$name`      - name of the generated module.
// * `$regex_mod` - crate-level module whose `Regex` type the builder produces.
// * `$only_utf8` - value forwarded to `ExecBuilder::only_utf8` in `build`.
macro_rules! define_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use crate::error::Error;
            use crate::exec::ExecBuilder;

            use crate::$regex_mod::Regex;

            /// A configurable builder for a regular expression.
            ///
            /// A builder can be used to configure how the regex is built, for example, by
            /// setting the default flags (which can be overridden in the expression
            /// itself) or setting various limits.
            #[derive(Debug)]
            pub struct RegexBuilder(RegexOptions);

            impl RegexBuilder {
                /// Create a new regular expression builder with the given pattern.
                ///
                /// All other options start at their `RegexOptions::default()` values.
                ///
                /// If the pattern is invalid, then an error will be returned when
                /// `build` is called.
                pub fn new(pattern: &str) -> RegexBuilder {
                    let mut options = RegexOptions::default();
                    options.pats.push(pattern.to_owned());
                    RegexBuilder(options)
                }

                /// Compile the regular expression from the current configuration.
                ///
                /// The builder is only borrowed (its options are cloned), so it can be
                /// adjusted and used to build again afterwards.
                ///
                /// Note that calling `as_str` on the resulting `Regex` will produce the
                /// pattern given to `new` verbatim. Notably, it will not incorporate any
                /// of the flags set on this builder.
                ///
                /// # Errors
                ///
                /// Returns whatever error the underlying `ExecBuilder::build` reports.
                pub fn build(&self) -> Result<Regex, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(Regex::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the pattern will match both upper case and
                /// lower case variants.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$` matches the
                /// end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut Self {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag.
                ///
                /// When `s` is set, `.` matches anything; when it is not set (the
                /// default), `.` matches anything except for new line.
                ///
                /// N.B. "matches anything" means "any byte" when Unicode is disabled and
                /// means "any valid UTF-8 encoding of any Unicode scalar value" when
                /// Unicode is enabled.
                pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find shortest
                /// match) and `a*?` is greedy (tries to find longest match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut Self {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will be ignored
                /// between expressions of the pattern, and `#` can be used to start a
                /// comment until the next new line.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such as `\w` only
                /// match ASCII word characters instead of all Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut Self {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode codepoints in
                /// a regular expression. For example, `a`, `\x61`, `\u0061` and
                /// `\141` are all equivalent regular expressions, where the last example
                /// shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a problem, it does
                /// make good error messages harder. That is, in PCRE based regex engines,
                /// syntax like `\0` invokes a backreference, which is explicitly
                /// unsupported in Rust's regex engine. However, many users expect it to
                /// be supported. Therefore, when octal support is disabled, the error
                /// message will explicitly mention that backreferences aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut Self {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by a single
                /// compiled program. If the program exceeds this number, then a
                /// compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the DFA will
                /// use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to set a global
                /// limit. In particular, if a regex is used from multiple threads
                /// simultaneously, then each thread may use up to the number of bytes
                /// specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax tree is allowed
                /// to be. If the AST exceeds the given limit (e.g., with too many nested
                /// groups), then an error is returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to prevent stack
                /// overflow for consumers that do structural induction on an `Ast` using
                /// explicit recursion. While this crate never does this (instead using
                /// constant stack space and moving the call stack to the heap), other
                /// crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed. Therefore,
                /// if callers want to put a limit on the amount of heap space used, then
                /// they should impose a limit on the length, in bytes, of the concrete
                /// pattern string. In particular, this is viable since this parser
                /// implementation will limit itself to heap space proportional to the
                /// length of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit error for most
                /// patterns but not all. For example, a nest limit of `0` permits `a` but
                /// not `ab`, since `ab` requires a concatenation, which results in a nest
                /// depth of `1`. In general, a nest limit is not something that manifests
                /// in an obvious way in the concrete syntax, therefore, it should not be
                /// used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut Self {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
230 
231 define_builder!(bytes, re_bytes, false);
232 define_builder!(unicode, re_unicode, true);
233 
// Generates a public module `$name` containing a `RegexSetBuilder` type.
//
// * `$name`      - name of the generated module.
// * `$regex_mod` - submodule of `crate::re_set` whose `RegexSet` type the
//                  builder produces.
// * `$only_utf8` - value forwarded to `ExecBuilder::only_utf8` in `build`.
macro_rules! define_set_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use crate::error::Error;
            use crate::exec::ExecBuilder;

            use crate::re_set::$regex_mod::RegexSet;

            /// A configurable builder for a set of regular expressions.
            ///
            /// A builder can be used to configure how the regexes are built, for example,
            /// by setting the default flags (which can be overridden in the expression
            /// itself) or setting various limits.
            #[derive(Debug)]
            pub struct RegexSetBuilder(RegexOptions);

            impl RegexSetBuilder {
                /// Create a new regular expression set builder with the given patterns.
                ///
                /// The patterns are stored in iteration order; all other options start
                /// at their `RegexOptions::default()` values.
                ///
                /// If any pattern is invalid, then an error will be returned when
                /// `build` is called.
                pub fn new<I, S>(patterns: I) -> RegexSetBuilder
                where
                    S: AsRef<str>,
                    I: IntoIterator<Item = S>,
                {
                    let mut options = RegexOptions::default();
                    options
                        .pats
                        .extend(patterns.into_iter().map(|pat| pat.as_ref().to_owned()));
                    RegexSetBuilder(options)
                }

                /// Compile the regular expressions into a set using the current
                /// configuration.
                ///
                /// The builder is only borrowed (its options are cloned), so it can be
                /// adjusted and used to build again afterwards.
                ///
                /// # Errors
                ///
                /// Returns whatever error the underlying `ExecBuilder::build` reports.
                pub fn build(&self) -> Result<RegexSet, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(RegexSet::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the patterns will match both upper case and
                /// lower case variants.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$` matches the
                /// end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut Self {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag.
                ///
                /// When `s` is set, `.` matches anything; when it is not set (the
                /// default), `.` matches anything except for new line.
                ///
                /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
                /// expressions and means "any Unicode scalar value" for `regex::RegexSet`
                /// expressions.
                pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find shortest
                /// match) and `a*?` is greedy (tries to find longest match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut Self {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will be ignored
                /// between expressions of the pattern, and `#` can be used to start a
                /// comment until the next new line.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such as `\w` only
                /// match ASCII word characters instead of all Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut Self {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode codepoints in
                /// a regular expression. For example, `a`, `\x61`, `\u0061` and
                /// `\141` are all equivalent regular expressions, where the last example
                /// shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a problem, it does
                /// make good error messages harder. That is, in PCRE based regex engines,
                /// syntax like `\0` invokes a backreference, which is explicitly
                /// unsupported in Rust's regex engine. However, many users expect it to
                /// be supported. Therefore, when octal support is disabled, the error
                /// message will explicitly mention that backreferences aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut Self {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by a single
                /// compiled program. If the program exceeds this number, then a
                /// compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the DFA will
                /// use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to set a global
                /// limit. In particular, if a regex is used from multiple threads
                /// simultaneously, then each thread may use up to the number of bytes
                /// specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax tree is allowed
                /// to be. If the AST exceeds the given limit (e.g., with too many nested
                /// groups), then an error is returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to prevent stack
                /// overflow for consumers that do structural induction on an `Ast` using
                /// explicit recursion. While this crate never does this (instead using
                /// constant stack space and moving the call stack to the heap), other
                /// crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed. Therefore,
                /// if callers want to put a limit on the amount of heap space used, then
                /// they should impose a limit on the length, in bytes, of the concrete
                /// pattern string. In particular, this is viable since this parser
                /// implementation will limit itself to heap space proportional to the
                /// length of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit error for most
                /// patterns but not all. For example, a nest limit of `0` permits `a` but
                /// not `ab`, since `ab` requires a concatenation, which results in a nest
                /// depth of `1`. In general, a nest limit is not something that manifests
                /// in an obvious way in the concrete syntax, therefore, it should not be
                /// used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut Self {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
419 
420 define_set_builder!(set_bytes, bytes, false);
421 define_set_builder!(set_unicode, unicode, true);
422