1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #ifndef RE2_FILTERED_RE2_H_ 6 #define RE2_FILTERED_RE2_H_ 7 8 // The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. 9 // It provides a prefilter mechanism that helps in cutting down the 10 // number of regexps that need to be actually searched. 11 // 12 // By design, it does not include a string matching engine. This is to 13 // allow the user of the class to use their favorite string matching 14 // engine. The overall flow is: Add all the regexps using Add, then 15 // Compile the FilteredRE2. Compile returns strings that need to be 16 // matched. Note that the returned strings are lowercased and distinct. 17 // For applying regexps to a search text, the caller does the string 18 // matching using the returned strings. When doing the string match, 19 // note that the caller has to do that in a case-insensitive way or 20 // on a lowercased version of the search text. Then call FirstMatch 21 // or AllMatches with a vector of indices of strings that were found 22 // in the text to get the actual regexp matches. 23 24 #include <memory> 25 #include <string> 26 #include <vector> 27 28 #include "absl/strings/string_view.h" 29 #include "re2/re2.h" 30 31 namespace re2 { 32 33 class PrefilterTree; 34 35 class FilteredRE2 { 36 public: 37 FilteredRE2(); 38 explicit FilteredRE2(int min_atom_len); 39 ~FilteredRE2(); 40 41 // Not copyable. 42 FilteredRE2(const FilteredRE2&) = delete; 43 FilteredRE2& operator=(const FilteredRE2&) = delete; 44 // Movable. 45 FilteredRE2(FilteredRE2&& other); 46 FilteredRE2& operator=(FilteredRE2&& other); 47 48 // Uses RE2 constructor to create a RE2 object (re). Returns 49 // re->error_code(). If error_code is other than NoError, then re is 50 // deleted and not added to re2_vec_. 51 RE2::ErrorCode Add(absl::string_view pattern, 52 const RE2::Options& options, 53 int* id); 54 55 // Prepares the regexps added by Add for filtering. Returns a set 56 // of strings that the caller should check for in candidate texts. 57 // The returned strings are lowercased and distinct. When doing 58 // string matching, it should be performed in a case-insensitive 59 // way or the search text should be lowercased first. Call after 60 // all Add calls are done. 61 void Compile(std::vector<std::string>* strings_to_match); 62 63 // Returns the index of the first matching regexp. 64 // Returns -1 on no match. Can be called prior to Compile. 65 // Does not do any filtering: simply tries to Match the 66 // regexps in a loop. 67 int SlowFirstMatch(absl::string_view text) const; 68 69 // Returns the index of the first matching regexp. 70 // Returns -1 on no match. Compile has to be called before 71 // calling this. 72 int FirstMatch(absl::string_view text, 73 const std::vector<int>& atoms) const; 74 75 // Returns the indices of all matching regexps, after first clearing 76 // matched_regexps. 77 bool AllMatches(absl::string_view text, 78 const std::vector<int>& atoms, 79 std::vector<int>* matching_regexps) const; 80 81 // Returns the indices of all potentially matching regexps after first 82 // clearing potential_regexps. 83 // A regexp is potentially matching if it passes the filter. 84 // If a regexp passes the filter it may still not match. 85 // A regexp that does not pass the filter is guaranteed to not match. 86 void AllPotentials(const std::vector<int>& atoms, 87 std::vector<int>* potential_regexps) const; 88 89 // The number of regexps added. NumRegexps()90 int NumRegexps() const { return static_cast<int>(re2_vec_.size()); } 91 92 // Get the individual RE2 objects. GetRE2(int regexpid)93 const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; } 94 95 private: 96 // Print prefilter. 97 void PrintPrefilter(int regexpid); 98 99 // Useful for testing and debugging. 100 void RegexpsGivenStrings(const std::vector<int>& matched_atoms, 101 std::vector<int>* passed_regexps); 102 103 // All the regexps in the FilteredRE2. 104 std::vector<RE2*> re2_vec_; 105 106 // Has the FilteredRE2 been compiled using Compile() 107 bool compiled_; 108 109 // An AND-OR tree of string atoms used for filtering regexps. 110 std::unique_ptr<PrefilterTree> prefilter_tree_; 111 }; 112 113 } // namespace re2 114 115 #endif // RE2_FILTERED_RE2_H_ 116