1 // Copyright (C) 2011 The Libphonenumber Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // Author: George Yakovlev
16 // Philippe Liard
17
18 // Note that we don't use features of ICU that depend on std::string (e.g.
19 // UnicodeString::toUTF8String()) to support clients that build ICU without
20 // -DU_HAVE_STD_STRING.
21
22 #include "phonenumbers/regexp_adapter_icu.h"
23
24 #include <stddef.h>
25 #include <string>
26
27 #include <unicode/regex.h>
28 #include <unicode/stringpiece.h>
29 #include <unicode/unistr.h>
30
31 #include "phonenumbers/base/basictypes.h"
32 #include "phonenumbers/base/logging.h"
33 #include "phonenumbers/base/memory/scoped_ptr.h"
34 #include "phonenumbers/default_logger.h"
35 #include "phonenumbers/string_byte_sink.h"
36
37 namespace i18n {
38 namespace phonenumbers {
39
40 using icu::RegexMatcher;
41 using icu::RegexPattern;
42 using icu::UnicodeString;
43
44 namespace {
45
46 // Converts UnicodeString 'source' to a UTF8-formatted std::string.
UnicodeStringToUtf8String(const UnicodeString & source)47 string UnicodeStringToUtf8String(const UnicodeString& source) {
48 string data;
49 source.toUTF8String(data);
50 return data;
51 }
52
53 // Converts UTF8-formatted std::string 'source' to a UnicodeString.
Utf8StringToUnicodeString(const string & source)54 UnicodeString Utf8StringToUnicodeString(const string& source) {
55 // Note that we don't use icu::StringPiece(const string&).
56 return UnicodeString::fromUTF8(
57 icu::StringPiece(source.c_str(), static_cast<int>(source.size())));
58 }
59
60 } // namespace
61
62 // Implementation of the abstract classes RegExpInput and RegExp using ICU
63 // regular expression capabilities.
64
65 // ICU implementation of the RegExpInput abstract class.
66 class IcuRegExpInput : public RegExpInput {
67 public:
IcuRegExpInput(const string & utf8_input)68 explicit IcuRegExpInput(const string& utf8_input)
69 : utf8_input_(Utf8StringToUnicodeString(utf8_input)),
70 position_(0) {}
71
~IcuRegExpInput()72 virtual ~IcuRegExpInput() {}
73
ToString() const74 virtual string ToString() const {
75 return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_));
76 }
77
Data()78 UnicodeString* Data() {
79 return &utf8_input_;
80 }
81
82 // The current start position. For a newly created input, position is 0. Each
83 // call to ConsumeRegExp() or RegExp::Consume() advances the position in the
84 // case of the successful match to be after the match.
position() const85 int position() const {
86 return position_;
87 }
88
set_position(int position)89 void set_position(int position) {
90 DCHECK(position >= 0 && position <= utf8_input_.length());
91 position_ = position;
92 }
93
94 private:
95 UnicodeString utf8_input_;
96 int position_;
97
98 DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput);
99 };
100
101 // ICU implementation of the RegExp abstract class.
102 class IcuRegExp : public RegExp {
103 public:
IcuRegExp(const string & utf8_regexp)104 explicit IcuRegExp(const string& utf8_regexp) {
105 UParseError parse_error;
106 UErrorCode status = U_ZERO_ERROR;
107 utf8_regexp_.reset(RegexPattern::compile(
108 Utf8StringToUnicodeString(utf8_regexp), 0, parse_error, status));
109 if (U_FAILURE(status)) {
110 // The provided regular expressions should compile correctly.
111 LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp;
112 utf8_regexp_.reset(NULL);
113 }
114 }
115
~IcuRegExp()116 virtual ~IcuRegExp() {}
117
Consume(RegExpInput * input_string,bool anchor_at_start,string * matched_string1,string * matched_string2,string * matched_string3,string * matched_string4,string * matched_string5,string * matched_string6) const118 virtual bool Consume(RegExpInput* input_string,
119 bool anchor_at_start,
120 string* matched_string1,
121 string* matched_string2,
122 string* matched_string3,
123 string* matched_string4,
124 string* matched_string5,
125 string* matched_string6) const {
126 DCHECK(input_string);
127 if (!utf8_regexp_.get()) {
128 return false;
129 }
130 IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string);
131 UErrorCode status = U_ZERO_ERROR;
132 const scoped_ptr<RegexMatcher> matcher(
133 utf8_regexp_->matcher(*input->Data(), status));
134 bool match_succeeded = anchor_at_start
135 ? matcher->lookingAt(input->position(), status)
136 : matcher->find(input->position(), status);
137 if (!match_succeeded || U_FAILURE(status)) {
138 return false;
139 }
140 string* const matched_strings[] = {matched_string1, matched_string2,
141 matched_string3, matched_string4,
142 matched_string5, matched_string6};
143 // If less matches than expected - fail.
144 for (size_t i = 0; i < arraysize(matched_strings); ++i) {
145 if (matched_strings[i]) {
146 // Groups are counted from 1 rather than 0.
147 const int group_index = static_cast<int>(i + 1);
148 if (group_index > matcher->groupCount()) {
149 return false;
150 }
151 *matched_strings[i] =
152 UnicodeStringToUtf8String(matcher->group(group_index, status));
153 }
154 }
155 input->set_position(matcher->end(status));
156 return !U_FAILURE(status);
157 }
158
Match(const string & input_string,bool full_match,string * matched_string) const159 bool Match(const string& input_string,
160 bool full_match,
161 string* matched_string) const {
162 if (!utf8_regexp_.get()) {
163 return false;
164 }
165 IcuRegExpInput input(input_string);
166 UErrorCode status = U_ZERO_ERROR;
167 const scoped_ptr<RegexMatcher> matcher(
168 utf8_regexp_->matcher(*input.Data(), status));
169 bool match_succeeded = full_match
170 ? matcher->matches(input.position(), status)
171 : matcher->find(input.position(), status);
172 if (!match_succeeded || U_FAILURE(status)) {
173 return false;
174 }
175 if (matcher->groupCount() > 0 && matched_string) {
176 *matched_string = UnicodeStringToUtf8String(matcher->group(1, status));
177 }
178 return !U_FAILURE(status);
179 }
180
Replace(string * string_to_process,bool global,const string & replacement_string) const181 bool Replace(string* string_to_process,
182 bool global,
183 const string& replacement_string) const {
184 DCHECK(string_to_process);
185 if (!utf8_regexp_.get()) {
186 return false;
187 }
188 IcuRegExpInput input(*string_to_process);
189 UErrorCode status = U_ZERO_ERROR;
190 const scoped_ptr<RegexMatcher> matcher(
191 utf8_regexp_->matcher(*input.Data(), status));
192 if (U_FAILURE(status)) {
193 return false;
194 }
195
196 UnicodeString output;
197 // We reimplement ReplaceFirst and ReplaceAll such that their behaviour is
198 // consistent with the RE2 reg-ex matcher.
199 if (!matcher->find()) {
200 return false;
201 }
202 matcher->appendReplacement(output,
203 Utf8StringToUnicodeString(replacement_string),
204 status);
205 if (global) {
206 // Continue and look for more matches.
207 while (matcher->find()) {
208 matcher->appendReplacement(
209 output,
210 Utf8StringToUnicodeString(replacement_string),
211 status);
212 }
213 }
214
215 matcher->appendTail(output);
216 if (U_FAILURE(status)) {
217 return false;
218 }
219 const string replaced_string = UnicodeStringToUtf8String(output);
220 *string_to_process = replaced_string;
221 return true;
222 }
223
224 private:
225 scoped_ptr<RegexPattern> utf8_regexp_;
226
227 DISALLOW_COPY_AND_ASSIGN(IcuRegExp);
228 };
229
CreateInput(const string & utf8_input) const230 RegExpInput* ICURegExpFactory::CreateInput(const string& utf8_input) const {
231 return new IcuRegExpInput(utf8_input);
232 }
233
CreateRegExp(const string & utf8_regexp) const234 RegExp* ICURegExpFactory::CreateRegExp(const string& utf8_regexp) const {
235 return new IcuRegExp(utf8_regexp);
236 }
237
238 } // namespace phonenumbers
239 } // namespace i18n
240