• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/url_matcher/regex_set_matcher.h"
6 
7 #include "base/logging.h"
8 #include "base/stl_util.h"
9 #include "base/strings/string_util.h"
10 #include "components/url_matcher/substring_set_matcher.h"
11 #include "third_party/re2/re2/filtered_re2.h"
12 #include "third_party/re2/re2/re2.h"
13 
14 namespace url_matcher {
15 
RegexSetMatcher()16 RegexSetMatcher::RegexSetMatcher() {}
17 
~RegexSetMatcher()18 RegexSetMatcher::~RegexSetMatcher() {
19   DeleteSubstringPatterns();
20 }
21 
AddPatterns(const std::vector<const StringPattern * > & regex_list)22 void RegexSetMatcher::AddPatterns(
23     const std::vector<const StringPattern*>& regex_list) {
24   if (regex_list.empty())
25     return;
26   for (size_t i = 0; i < regex_list.size(); ++i) {
27     regexes_[regex_list[i]->id()] = regex_list[i];
28   }
29 
30   RebuildMatcher();
31 }
32 
ClearPatterns()33 void RegexSetMatcher::ClearPatterns() {
34   regexes_.clear();
35   RebuildMatcher();
36 }
37 
Match(const std::string & text,std::set<StringPattern::ID> * matches) const38 bool RegexSetMatcher::Match(const std::string& text,
39                             std::set<StringPattern::ID>* matches) const {
40   size_t old_number_of_matches = matches->size();
41   if (regexes_.empty())
42     return false;
43   if (!filtered_re2_.get()) {
44     LOG(ERROR) << "RegexSetMatcher was not initialized";
45     return false;
46   }
47 
48   // FilteredRE2 expects lowercase for prefiltering, but we still
49   // match case-sensitively.
50   std::vector<RE2ID> atoms(FindSubstringMatches(
51       base::StringToLowerASCII(text)));
52 
53   std::vector<RE2ID> re2_ids;
54   filtered_re2_->AllMatches(text, atoms, &re2_ids);
55 
56   for (size_t i = 0; i < re2_ids.size(); ++i) {
57     StringPattern::ID id = re2_id_map_[re2_ids[i]];
58     matches->insert(id);
59   }
60   return old_number_of_matches != matches->size();
61 }
62 
IsEmpty() const63 bool RegexSetMatcher::IsEmpty() const {
64   return regexes_.empty();
65 }
66 
FindSubstringMatches(const std::string & text) const67 std::vector<RegexSetMatcher::RE2ID> RegexSetMatcher::FindSubstringMatches(
68     const std::string& text) const {
69   std::set<int> atoms_set;
70   substring_matcher_->Match(text, &atoms_set);
71   return std::vector<RE2ID>(atoms_set.begin(), atoms_set.end());
72 }
73 
RebuildMatcher()74 void RegexSetMatcher::RebuildMatcher() {
75   re2_id_map_.clear();
76   filtered_re2_.reset(new re2::FilteredRE2());
77   if (regexes_.empty())
78     return;
79 
80   for (RegexMap::iterator it = regexes_.begin(); it != regexes_.end(); ++it) {
81     RE2ID re2_id;
82     RE2::ErrorCode error = filtered_re2_->Add(
83         it->second->pattern(), RE2::DefaultOptions, &re2_id);
84     if (error == RE2::NoError) {
85       DCHECK_EQ(static_cast<RE2ID>(re2_id_map_.size()), re2_id);
86       re2_id_map_.push_back(it->first);
87     } else {
88       // Unparseable regexes should have been rejected already in
89       // URLMatcherFactory::CreateURLMatchesCondition.
90       LOG(ERROR) << "Could not parse regex (id=" << it->first << ", "
91                  << it->second->pattern() << ")";
92     }
93   }
94 
95   std::vector<std::string> strings_to_match;
96   filtered_re2_->Compile(&strings_to_match);
97 
98   substring_matcher_.reset(new SubstringSetMatcher);
99   DeleteSubstringPatterns();
100   // Build SubstringSetMatcher from |strings_to_match|.
101   // SubstringSetMatcher doesn't own its strings.
102   for (size_t i = 0; i < strings_to_match.size(); ++i) {
103     substring_patterns_.push_back(
104         new StringPattern(strings_to_match[i], i));
105   }
106   substring_matcher_->RegisterPatterns(substring_patterns_);
107 }
108 
DeleteSubstringPatterns()109 void RegexSetMatcher::DeleteSubstringPatterns() {
110   STLDeleteElements(&substring_patterns_);
111 }
112 
113 }  // namespace url_matcher
114