1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #ifndef RE2_FILTERED_RE2_H_ 6 #define RE2_FILTERED_RE2_H_ 7 8 // The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. 9 // It provides a prefilter mechanism that helps in cutting down the 10 // number of regexps that need to be actually searched. 11 // 12 // By design, it does not include a string matching engine. This is to 13 // allow the user of the class to use their favorite string match 14 // engine. The overall flow is: Add all the regexps using Add, then 15 // Compile the FilteredRE2. The compile returns strings that need to 16 // be matched. Note that all returned strings are lowercase. For 17 // applying regexps to a search text, the caller does the string 18 // matching using the strings returned. When doing the string match, 19 // note that the caller has to do that on lower cased version of the 20 // search text. Then call FirstMatch or AllMatches with a vector of 21 // indices of strings that were found in the text to get the actual 22 // regexp matches. 23 24 #include <string> 25 #include <vector> 26 27 #include "re2/re2.h" 28 29 namespace re2 { 30 31 class PrefilterTree; 32 33 class FilteredRE2 { 34 public: 35 FilteredRE2(); 36 explicit FilteredRE2(int min_atom_len); 37 ~FilteredRE2(); 38 39 // Uses RE2 constructor to create a RE2 object (re). Returns 40 // re->error_code(). If error_code is other than NoError, then re is 41 // deleted and not added to re2_vec_. 42 RE2::ErrorCode Add(const StringPiece& pattern, 43 const RE2::Options& options, 44 int *id); 45 46 // Prepares the regexps added by Add for filtering. Returns a set 47 // of strings that the caller should check for in candidate texts. 48 // The returned strings are lowercased. When doing string matching, 49 // the search text should be lowercased first to find matching 50 // strings from the set of strings returned by Compile. Call after 51 // all Add calls are done. 52 void Compile(std::vector<std::string>* strings_to_match); 53 54 // Returns the index of the first matching regexp. 55 // Returns -1 on no match. Can be called prior to Compile. 56 // Does not do any filtering: simply tries to Match the 57 // regexps in a loop. 58 int SlowFirstMatch(const StringPiece& text) const; 59 60 // Returns the index of the first matching regexp. 61 // Returns -1 on no match. Compile has to be called before 62 // calling this. 63 int FirstMatch(const StringPiece& text, 64 const std::vector<int>& atoms) const; 65 66 // Returns the indices of all matching regexps, after first clearing 67 // matched_regexps. 68 bool AllMatches(const StringPiece& text, 69 const std::vector<int>& atoms, 70 std::vector<int>* matching_regexps) const; 71 72 // Returns the indices of all potentially matching regexps after first 73 // clearing potential_regexps. 74 // A regexp is potentially matching if it passes the filter. 75 // If a regexp passes the filter it may still not match. 76 // A regexp that does not pass the filter is guaranteed to not match. 77 void AllPotentials(const std::vector<int>& atoms, 78 std::vector<int>* potential_regexps) const; 79 80 // The number of regexps added. NumRegexps()81 int NumRegexps() const { return static_cast<int>(re2_vec_.size()); } 82 83 // Get the individual RE2 objects. GetRE2(int regexpid)84 const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; } 85 86 private: 87 // Print prefilter. 88 void PrintPrefilter(int regexpid); 89 90 // Useful for testing and debugging. 91 void RegexpsGivenStrings(const std::vector<int>& matched_atoms, 92 std::vector<int>* passed_regexps); 93 94 // All the regexps in the FilteredRE2. 95 std::vector<RE2*> re2_vec_; 96 97 // Has the FilteredRE2 been compiled using Compile() 98 bool compiled_; 99 100 // An AND-OR tree of string atoms used for filtering regexps. 101 PrefilterTree* prefilter_tree_; 102 103 FilteredRE2(const FilteredRE2&) = delete; 104 FilteredRE2& operator=(const FilteredRE2&) = delete; 105 }; 106 107 } // namespace re2 108 109 #endif // RE2_FILTERED_RE2_H_ 110