• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #ifndef RE2_FILTERED_RE2_H_
6 #define RE2_FILTERED_RE2_H_
7 
8 // The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
9 // It provides a prefilter mechanism that helps in cutting down the
10 // number of regexps that need to be actually searched.
11 //
12 // By design, it does not include a string matching engine. This is to
13 // allow the user of the class to use their favorite string match
14 // engine. The overall flow is: Add all the regexps using Add, then
15 // Compile the FilteredRE2. The compile returns strings that need to
16 // be matched. Note that all returned strings are lowercase. For
17 // applying regexps to a search text, the caller does the string
18 // matching using the strings returned. When doing the string match,
19 // note that the caller has to do that on a lowercased version of the
20 // search text. Then call FirstMatch or AllMatches with a vector of
21 // indices of strings that were found in the text to get the actual
22 // regexp matches.
23 
24 #include <string>
25 #include <vector>
26 
27 #include "re2/re2.h"
28 
29 namespace re2 {
30 
31 class PrefilterTree;
32 
33 class FilteredRE2 {
34  public:
35   FilteredRE2();
36   explicit FilteredRE2(int min_atom_len);
37   ~FilteredRE2();
38 
39   // Uses RE2 constructor to create a RE2 object (re). Returns
40   // re->error_code(). If error_code is other than NoError, then re is
41   // deleted and not added to re2_vec_.
42   RE2::ErrorCode Add(const StringPiece& pattern,
43                      const RE2::Options& options,
44                      int *id);
45 
46   // Prepares the regexps added by Add for filtering.  Returns a set
47   // of strings that the caller should check for in candidate texts.
48   // The returned strings are lowercased. When doing string matching,
49   // the search text should be lowercased first to find matching
50   // strings from the set of strings returned by Compile.  Call after
51   // all Add calls are done.
52   void Compile(std::vector<string>* strings_to_match);
53 
54   // Returns the index of the first matching regexp.
55   // Returns -1 on no match. Can be called prior to Compile.
56   // Does not do any filtering: simply tries to Match the
57   // regexps in a loop.
58   int SlowFirstMatch(const StringPiece& text) const;
59 
60   // Returns the index of the first matching regexp.
61   // Returns -1 on no match. Compile has to be called before
62   // calling this.
63   int FirstMatch(const StringPiece& text,
64                  const std::vector<int>& atoms) const;
65 
66   // Returns the indices of all matching regexps, after first clearing
67   // matched_regexps.
68   bool AllMatches(const StringPiece& text,
69                   const std::vector<int>& atoms,
70                   std::vector<int>* matching_regexps) const;
71 
72   // Returns the indices of all potentially matching regexps after first
73   // clearing potential_regexps.
74   // A regexp is potentially matching if it passes the filter.
75   // If a regexp passes the filter it may still not match.
76   // A regexp that does not pass the filter is guaranteed to not match.
77   void AllPotentials(const std::vector<int>& atoms,
78                      std::vector<int>* potential_regexps) const;
79 
80   // The number of regexps added.
NumRegexps()81   int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
82 
83  private:
84 
85   // Get the individual RE2 objects. Useful for testing.
GetRE2(int regexpid)86   RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
87 
88   // Print prefilter.
89   void PrintPrefilter(int regexpid);
90 
91   // Useful for testing and debugging.
92   void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
93                            std::vector<int>* passed_regexps);
94 
95   // All the regexps in the FilteredRE2.
96   std::vector<RE2*> re2_vec_;
97 
98   // Has the FilteredRE2 been compiled using Compile()
99   bool compiled_;
100 
101   // An AND-OR tree of string atoms used for filtering regexps.
102   PrefilterTree* prefilter_tree_;
103 
104   FilteredRE2(const FilteredRE2&) = delete;
105   FilteredRE2& operator=(const FilteredRE2&) = delete;
106 };
107 
108 }  // namespace re2
109 
110 #endif  // RE2_FILTERED_RE2_H_
111