• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2011 The Libphonenumber Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Author: George Yakovlev
16 //         Philippe Liard
17 
// Note that we try to avoid features of ICU that depend on std::string (e.g.
// icu::StringPiece(const string&)) to support clients that build ICU without
// -DU_HAVE_STD_STRING. The one exception is UnicodeStringToUtf8String() below,
// which calls UnicodeString::toUTF8String().
21 
22 #include "phonenumbers/regexp_adapter_icu.h"
23 
24 #include <stddef.h>
25 #include <string>
26 
27 #include <unicode/regex.h>
28 #include <unicode/stringpiece.h>
29 #include <unicode/unistr.h>
30 
31 #include "phonenumbers/base/basictypes.h"
32 #include "phonenumbers/base/logging.h"
33 #include "phonenumbers/base/memory/scoped_ptr.h"
34 #include "phonenumbers/default_logger.h"
35 #include "phonenumbers/string_byte_sink.h"
36 
37 namespace i18n {
38 namespace phonenumbers {
39 
40 using icu::RegexMatcher;
41 using icu::RegexPattern;
42 using icu::UnicodeString;
43 
44 namespace {
45 
46 // Converts UnicodeString 'source' to a UTF8-formatted std::string.
UnicodeStringToUtf8String(const UnicodeString & source)47 string UnicodeStringToUtf8String(const UnicodeString& source) {
48   string data;
49   source.toUTF8String(data);
50   return data;
51 }
52 
53 // Converts UTF8-formatted std::string 'source' to a UnicodeString.
Utf8StringToUnicodeString(const string & source)54 UnicodeString Utf8StringToUnicodeString(const string& source) {
55   // Note that we don't use icu::StringPiece(const string&).
56   return UnicodeString::fromUTF8(
57       icu::StringPiece(source.c_str(), static_cast<int>(source.size())));
58 }
59 
60 }  // namespace
61 
62 // Implementation of the abstract classes RegExpInput and RegExp using ICU
63 // regular expression capabilities.
64 
65 // ICU implementation of the RegExpInput abstract class.
66 class IcuRegExpInput : public RegExpInput {
67  public:
IcuRegExpInput(const string & utf8_input)68   explicit IcuRegExpInput(const string& utf8_input)
69       : utf8_input_(Utf8StringToUnicodeString(utf8_input)),
70         position_(0) {}
71 
72 
73   // This type is neither copyable nor movable.
74   IcuRegExpInput(const IcuRegExpInput&) = delete;
75   IcuRegExpInput& operator=(const IcuRegExpInput&) = delete;
76 
~IcuRegExpInput()77   virtual ~IcuRegExpInput() {}
78 
ToString() const79   virtual string ToString() const {
80     return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_));
81   }
82 
Data()83   UnicodeString* Data() {
84     return &utf8_input_;
85   }
86 
87   // The current start position. For a newly created input, position is 0. Each
88   // call to ConsumeRegExp() or RegExp::Consume() advances the position in the
89   // case of the successful match to be after the match.
position() const90   int position() const {
91     return position_;
92   }
93 
set_position(int position)94   void set_position(int position) {
95     DCHECK(position >= 0 && position <= utf8_input_.length());
96     position_ = position;
97   }
98 
99  private:
100   UnicodeString utf8_input_;
101   int position_;
102 
103 };
104 
105 // ICU implementation of the RegExp abstract class.
106 class IcuRegExp : public RegExp {
107  public:
IcuRegExp(const string & utf8_regexp)108   explicit IcuRegExp(const string& utf8_regexp) {
109     UParseError parse_error;
110     UErrorCode status = U_ZERO_ERROR;
111     utf8_regexp_.reset(RegexPattern::compile(
112         Utf8StringToUnicodeString(utf8_regexp), 0, parse_error, status));
113     if (U_FAILURE(status)) {
114       // The provided regular expressions should compile correctly.
115       LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp;
116       utf8_regexp_.reset(NULL);
117     }
118   }
119 
120   // This type is neither copyable nor movable.
121   IcuRegExp(const IcuRegExp&) = delete;
122   IcuRegExp& operator=(const IcuRegExp&) = delete;
123 
~IcuRegExp()124   virtual ~IcuRegExp() {}
125 
Consume(RegExpInput * input_string,bool anchor_at_start,string * matched_string1,string * matched_string2,string * matched_string3,string * matched_string4,string * matched_string5,string * matched_string6) const126   virtual bool Consume(RegExpInput* input_string,
127                        bool anchor_at_start,
128                        string* matched_string1,
129                        string* matched_string2,
130                        string* matched_string3,
131                        string* matched_string4,
132                        string* matched_string5,
133                        string* matched_string6) const {
134     DCHECK(input_string);
135     if (!utf8_regexp_.get()) {
136       return false;
137     }
138     IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string);
139     UErrorCode status = U_ZERO_ERROR;
140     const scoped_ptr<RegexMatcher> matcher(
141         utf8_regexp_->matcher(*input->Data(), status));
142     bool match_succeeded = anchor_at_start
143         ? matcher->lookingAt(input->position(), status)
144         : matcher->find(input->position(), status);
145     if (!match_succeeded || U_FAILURE(status)) {
146       return false;
147     }
148     string* const matched_strings[] = {matched_string1, matched_string2,
149                                        matched_string3, matched_string4,
150                                        matched_string5, matched_string6};
151     // If less matches than expected - fail.
152     for (size_t i = 0; i < arraysize(matched_strings); ++i) {
153       if (matched_strings[i]) {
154         // Groups are counted from 1 rather than 0.
155         const int group_index = static_cast<int>(i + 1);
156         if (group_index > matcher->groupCount()) {
157           return false;
158         }
159         *matched_strings[i] =
160             UnicodeStringToUtf8String(matcher->group(group_index, status));
161       }
162     }
163     input->set_position(matcher->end(status));
164     return !U_FAILURE(status);
165   }
166 
Match(const string & input_string,bool full_match,string * matched_string) const167   bool Match(const string& input_string,
168              bool full_match,
169              string* matched_string) const {
170     if (!utf8_regexp_.get()) {
171       return false;
172     }
173     IcuRegExpInput input(input_string);
174     UErrorCode status = U_ZERO_ERROR;
175     const scoped_ptr<RegexMatcher> matcher(
176         utf8_regexp_->matcher(*input.Data(), status));
177     bool match_succeeded = full_match
178         ? matcher->matches(input.position(), status)
179         : matcher->find(input.position(), status);
180     if (!match_succeeded || U_FAILURE(status)) {
181       return false;
182     }
183     if (matcher->groupCount() > 0 && matched_string) {
184       *matched_string = UnicodeStringToUtf8String(matcher->group(1, status));
185     }
186     return !U_FAILURE(status);
187   }
188 
Replace(string * string_to_process,bool global,const string & replacement_string) const189   bool Replace(string* string_to_process,
190                bool global,
191                const string& replacement_string) const {
192     DCHECK(string_to_process);
193     if (!utf8_regexp_.get()) {
194       return false;
195     }
196     IcuRegExpInput input(*string_to_process);
197     UErrorCode status = U_ZERO_ERROR;
198     const scoped_ptr<RegexMatcher> matcher(
199         utf8_regexp_->matcher(*input.Data(), status));
200     if (U_FAILURE(status)) {
201       return false;
202     }
203 
204     UnicodeString output;
205     // We reimplement ReplaceFirst and ReplaceAll such that their behaviour is
206     // consistent with the RE2 reg-ex matcher.
207     if (!matcher->find()) {
208       return false;
209     }
210     matcher->appendReplacement(output,
211                                Utf8StringToUnicodeString(replacement_string),
212                                status);
213     if (global) {
214       // Continue and look for more matches.
215       while (matcher->find()) {
216         matcher->appendReplacement(
217             output,
218             Utf8StringToUnicodeString(replacement_string),
219             status);
220       }
221     }
222 
223     matcher->appendTail(output);
224     if (U_FAILURE(status)) {
225       return false;
226     }
227     const string replaced_string = UnicodeStringToUtf8String(output);
228     *string_to_process = replaced_string;
229     return true;
230   }
231 
232  private:
233   scoped_ptr<RegexPattern> utf8_regexp_;
234 };
235 
CreateInput(const string & utf8_input) const236 RegExpInput* ICURegExpFactory::CreateInput(const string& utf8_input) const {
237   return new IcuRegExpInput(utf8_input);
238 }
239 
CreateRegExp(const string & utf8_regexp) const240 RegExp* ICURegExpFactory::CreateRegExp(const string& utf8_regexp) const {
241   return new IcuRegExp(utf8_regexp);
242 }
243 
244 }  // namespace phonenumbers
245 }  // namespace i18n
246