1 // Copyright (C) 2011 The Libphonenumber Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: Lara Rennie
16 // Author: Tao Huang
17 //
18 // Implementation of a stateful class that finds and extracts telephone numbers
19 // from text.
20
21 #include "phonenumbers/phonenumbermatcher.h"
22
23 #ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP
24 #error phonenumbermatcher depends on ICU \
25 (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set)
26 #endif // I18N_PHONENUMBERS_USE_ICU_REGEXP
27
28 #include <ctype.h>
29 #include <stddef.h>
30 #include <limits>
31 #include <map>
32 #include <memory>
33 #include <string>
34 #include <utility>
35 #include <vector>
36 #include <unicode/uchar.h>
37
38 #include "phonenumbers/alternate_format.h"
39 #include "phonenumbers/base/logging.h"
40 #include "phonenumbers/base/memory/scoped_ptr.h"
41 #include "phonenumbers/base/memory/singleton.h"
42 #include "phonenumbers/callback.h"
43 #include "phonenumbers/default_logger.h"
44 #include "phonenumbers/encoding_utils.h"
45 #include "phonenumbers/normalize_utf8.h"
46 #include "phonenumbers/phonemetadata.pb.h"
47 #include "phonenumbers/phonenumber.pb.h"
48 #include "phonenumbers/phonenumbermatch.h"
49 #include "phonenumbers/phonenumberutil.h"
50 #include "phonenumbers/regexp_adapter.h"
51 #include "phonenumbers/regexp_adapter_icu.h"
52 #include "phonenumbers/regexp_cache.h"
53 #include "phonenumbers/stringutil.h"
54 #include "phonenumbers/utf/unicodetext.h"
55
56 #ifdef I18N_PHONENUMBERS_USE_RE2
57 #include "phonenumbers/regexp_adapter_re2.h"
58 #endif // I18N_PHONENUMBERS_USE_RE2
59
60 using std::map;
61 using std::numeric_limits;
62 using std::string;
63
64 namespace i18n {
65 namespace phonenumbers {
66
67 namespace {
68 // Returns a regular expression quantifier with an upper and lower limit.
Limit(int lower,int upper)69 string Limit(int lower, int upper) {
70 DCHECK_GE(lower, 0);
71 DCHECK_GT(upper, 0);
72 DCHECK_LT(lower, upper);
73 return StrCat("{", lower, ",", upper, "}");
74 }
75
IsInvalidPunctuationSymbol(char32 character)76 bool IsInvalidPunctuationSymbol(char32 character) {
77 return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL;
78 }
79
ContainsOnlyValidXChars(const PhoneNumber & number,const string & candidate,const PhoneNumberUtil & util)80 bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
81 const PhoneNumberUtil& util) {
82 // The characters 'x' and 'X' can be (1) a carrier code, in which case they
83 // always precede the national significant number or (2) an extension sign,
84 // in which case they always precede the extension number. We assume a
85 // carrier code is more than 1 digit, so the first case has to have more than
86 // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1
87 // 'x' or 'X'.
88 size_t found;
89 found = candidate.find_first_of("xX");
90 // We ignore the character if 'x' or 'X' appears as the last character of
91 // the string.
92 while (found != string::npos && found < candidate.length() - 1) {
93 // We only look for 'x' or 'X' in ASCII form.
94 char next_char = candidate[found + 1];
95 if (next_char == 'x' || next_char == 'X') {
96 // This is the carrier code case, in which the 'X's always precede the
97 // national significant number.
98 ++found;
99 if (util.IsNumberMatchWithOneString(
100 number, candidate.substr(found, candidate.length() - found))
101 != PhoneNumberUtil::NSN_MATCH) {
102 return false;
103 }
104 } else {
105 string normalized_extension(candidate.substr(found,
106 candidate.length() - found));
107 util.NormalizeDigitsOnly(&normalized_extension);
108 if (normalized_extension != number.extension()) {
109 return false;
110 }
111 }
112 found = candidate.find_first_of("xX", found + 1);
113 }
114 return true;
115 }
116
AllNumberGroupsRemainGrouped(const PhoneNumberUtil & util,const PhoneNumber & number,const string & normalized_candidate,const std::vector<string> & formatted_number_groups)117 bool AllNumberGroupsRemainGrouped(
118 const PhoneNumberUtil& util,
119 const PhoneNumber& number,
120 const string& normalized_candidate,
121 const std::vector<string>& formatted_number_groups) {
122 size_t from_index = 0;
123 if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
124 // First skip the country code if the normalized candidate contained it.
125 string country_code = SimpleItoa(number.country_code());
126 from_index = normalized_candidate.find(country_code) + country_code.size();
127 }
128 // Check each group of consecutive digits are not broken into separate
129 // groupings in the normalized_candidate string.
130 for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
131 // Fails if the substring of normalized_candidate starting from from_index
132 // doesn't contain the consecutive digits in formatted_number_groups.at(i).
133 from_index = normalized_candidate.find(formatted_number_groups.at(i),
134 from_index);
135 if (from_index == string::npos) {
136 return false;
137 }
138 // Moves from_index forward.
139 from_index += formatted_number_groups.at(i).length();
140 if (i == 0 && from_index < normalized_candidate.length()) {
141 // We are at the position right after the NDC. We get the region used for
142 // formatting information based on the country code in the phone number,
143 // rather than the number itself, as we do not need to distinguish between
144 // different countries with the same country calling code and this is
145 // faster.
146 string region;
147 util.GetRegionCodeForCountryCode(number.country_code(), ®ion);
148 string ndd_prefix;
149 util.GetNddPrefixForRegion(region, true, &ndd_prefix);
150 // Note although normalized_candidate might contain non-ASCII formatting
151 // characters, they won't be treated as ASCII digits when converted to a
152 // char.
153 if (!ndd_prefix.empty() && isdigit(normalized_candidate.at(from_index))) {
154 // This means there is no formatting symbol after the NDC. In this case,
155 // we only accept the number if there is no formatting symbol at all in
156 // the number, except for extensions. This is only important for
157 // countries with national prefixes.
158 string national_significant_number;
159 util.GetNationalSignificantNumber(number, &national_significant_number);
160 return HasPrefixString(normalized_candidate.substr(
161 from_index - formatted_number_groups.at(i).length()),
162 national_significant_number);
163 }
164 }
165 }
166 // The check here makes sure that we haven't mistakenly already used the
167 // extension to match the last group of the subscriber number. Note the
168 // extension cannot have formatting in-between digits.
169 return normalized_candidate.substr(from_index)
170 .find(number.extension()) != string::npos;
171 }
172
LoadAlternateFormats(PhoneMetadataCollection * alternate_formats)173 bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) {
174 #if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS)
175 if (!alternate_formats->ParseFromArray(alternate_format_get(),
176 alternate_format_size())) {
177 LOG(ERROR) << "Could not parse binary data.";
178 return false;
179 }
180 return true;
181 #else
182 return false;
183 #endif
184 }
185
186 } // namespace
187
// Singleton that owns the regular expressions used by PhoneNumberMatcher,
// together with the string fragments they are assembled from.
//
// C++ initializes members in declaration order, and several members are built
// from earlier ones (for example non_parens_ from the two bracket strings,
// regexp_cache_ from regexp_factory_, and pattern_ from most of the helper
// strings). The declaration order below must therefore be kept in sync with
// the constructor's initializer list.
class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
 private:
  friend class Singleton<PhoneNumberMatcherRegExps>;

  // Characters accepted as opening / closing brackets, written so that they
  // can be placed inside a regular-expression character class.
  string opening_parens_;
  string closing_parens_;
  // Character class matching anything that is not one of the brackets above.
  string non_parens_;
  // Limit on the number of pairs of brackets in a phone number.
  string bracket_pair_limit_;
  // Helper strings for the matching_brackets_ pattern.
  // An opening bracket at the beginning may not be closed, but subsequent ones
  // should be. It's also possible that the leading bracket was dropped, so we
  // shouldn't be surprised if we see a closing bracket first.
  string leading_maybe_matched_bracket_;
  string bracket_pairs_;
  // Limit on the number of leading (plus) characters.
  string lead_limit_;
  // Limit on the number of consecutive punctuation characters.
  string punctuation_limit_;
  // The maximum number of digits allowed in a digit-separated block. As we
  // allow all digits in a single block, this should be set high enough to
  // accommodate the entire national number and the international country code.
  int digit_block_limit_;
  // Limit on the number of blocks separated by punctuation. Uses
  // digit_block_limit_ since some formats use spaces to separate each digit.
  string block_limit_;
  // A punctuation sequence allowing white space.
  string punctuation_;
  // A digits block without punctuation.
  string digit_sequence_;
  // Punctuation that may be at the start of a phone number - brackets and plus
  // signs.
  string lead_class_chars_;
  // Same as lead_class_chars_, but enclosed as a character class.
  string lead_class_;

 public:
  // We use two different reg-ex factories here for performance reasons. RE2 is
  // much faster for smaller reg-ex patterns, but the main pattern cannot be
  // handled by RE2 in an efficient way.
  // regexp_factory_for_pattern_ is always the ICU factory and is only used for
  // pattern_; regexp_factory_ is RE2 when I18N_PHONENUMBERS_USE_RE2 is defined
  // and ICU otherwise, and is used for every other expression.
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_;

  // A cache for popular reg-exps of leading digits used to match formatting
  // patterns and the factory used to create it.
  mutable RegExpCache regexp_cache_;

  // Matches strings that look like publication pages. Example:
  // Computing Complete Answers to Queries in the Presence of Limited Access
  // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
  //
  // The string "211-227 (2003)" is not a telephone number.
  scoped_ptr<const RegExp> pub_pages_;
  // Matches strings that look like dates using "/" as a separator. Examples:
  // 3/10/2011, 31/10/96 or 08/31/95.
  scoped_ptr<const RegExp> slash_separated_dates_;
  // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
  // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
  scoped_ptr<const RegExp> time_stamps_;
  scoped_ptr<const RegExp> time_stamps_suffix_;
  // Pattern to check that brackets match. Opening brackets should be closed
  // within a phone number. This also checks that there is something inside the
  // brackets. Having no brackets at all is also fine.
  scoped_ptr<const RegExp> matching_brackets_;
  // Patterns used to extract phone numbers from a larger phone-number-like
  // pattern. These are ordered according to specificity. For example,
  // white-space is last since that is frequently used in numbers, not just to
  // separate two numbers. We have separate patterns since we don't want to
  // break up the phone-number-like text on more than one different kind of
  // symbol at one time, although symbols of the same type (e.g. space) can be
  // safely grouped together.
  //
  // Note that if there is a match, we will always check any text found up to
  // the first match as well.
  //
  // The vector stores raw RegExp pointers created in the constructor body;
  // nothing in this class deletes them.
  scoped_ptr<std::vector<const RegExp*> > inner_matches_;
  // Compiled from PhoneNumberUtil::kCaptureUpToSecondNumberStart; used by
  // PhoneNumberMatcher::Find() to cut a candidate before a second number.
  scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
  // Captures a run of one or more digits ("(\d+)").
  scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
  // Compiled reg-ex representing lead_class_;
  scoped_ptr<const RegExp> lead_class_pattern_;
  // Phone number pattern allowing optional punctuation.
  scoped_ptr<const RegExp> pattern_;

  // Builds all helper strings and compiles all patterns. The escaped bytes in
  // the bracket strings are the UTF-8 encodings of the fullwidth brackets
  // U+FF08 / U+FF3B (opening) and U+FF09 / U+FF3D (closing).
  PhoneNumberMatcherRegExps()
      : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[([" */),
        closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\])]" */),
        non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
        bracket_pair_limit_(Limit(0, 3)),
        leading_maybe_matched_bracket_(StrCat(
            "(?:[", opening_parens_, "])?",
            "(?:", non_parens_, "+[", closing_parens_, "])?")),
        bracket_pairs_(StrCat(
            "(?:[", opening_parens_, "]", non_parens_, "+",
            "[", closing_parens_, "])", bracket_pair_limit_)),
        lead_limit_(Limit(0, 2)),
        punctuation_limit_(Limit(0, 4)),
        digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
                           PhoneNumberUtil::kMaxLengthCountryCode),
        block_limit_(Limit(0, digit_block_limit_)),
        punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
                            punctuation_limit_)),
        digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
        lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
        lead_class_(StrCat("[", lead_class_chars_, "]")),
        regexp_factory_for_pattern_(new ICURegExpFactory()),
#ifdef I18N_PHONENUMBERS_USE_RE2
        regexp_factory_(new RE2RegExpFactory()),
#else
        regexp_factory_(new ICURegExpFactory()),
#endif  // I18N_PHONENUMBERS_USE_RE2
        // A cache for frequently used country-specific regular expressions. Set
        // to 32 to cover ~2-3 countries being used for the same doc with ~10
        // patterns for each country. Some pages will have a lot more countries
        // in use, but typically fewer numbers for each so expanding the cache
        // for that use-case won't have a lot of benefit.
        regexp_cache_(*regexp_factory_, 32),
        pub_pages_(regexp_factory_->CreateRegExp(
            "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
        slash_separated_dates_(regexp_factory_->CreateRegExp(
            "(?:(?:[0-3]?\\d/[01]?\\d)|"
            "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
        time_stamps_(regexp_factory_->CreateRegExp(
            "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$")),
        time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
        matching_brackets_(regexp_factory_->CreateRegExp(
            StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
                   bracket_pairs_, non_parens_, "*"))),
        inner_matches_(new std::vector<const RegExp*>()),
        capture_up_to_second_number_start_pattern_(
            regexp_factory_->CreateRegExp(
                PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
        capturing_ascii_digits_pattern_(
            regexp_factory_->CreateRegExp("(\\d+)")),
        lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
        // The whole pattern is wrapped in one capturing group, so the group
        // handed back by FindAndConsume() is the complete matched text.
        pattern_(regexp_factory_for_pattern_->CreateRegExp(StrCat(
            "((?:", lead_class_, punctuation_, ")", lead_limit_,
            digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
            block_limit_, "(?i)(?:",
            PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
            ")?)"))) {
    // The inner-match patterns are appended in order of specificity; see the
    // comment on inner_matches_.
    inner_matches_->push_back(
        // Breaks on the slash - e.g. "651-234-2345/332-445-1234"
        regexp_factory_->CreateRegExp("/+(.*)"));
    inner_matches_->push_back(
        // Note that the bracket here is inside the capturing group, since we
        // consider it part of the phone number. Will match a pattern like
        // "(650) 223 3345 (754) 223 3321".
        regexp_factory_->CreateRegExp("(\\([^(]*)"));
    inner_matches_->push_back(
        // Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We
        // require a space on either side of the hyphen for it to be considered
        // a separator.
        regexp_factory_->CreateRegExp("(?:\\p{Z}-|-\\p{Z})\\p{Z}*(.+)"));
    inner_matches_->push_back(
        // Various types of wide hyphens. Note we have decided not to enforce a
        // space here, since it's possible that it's supposed to be used to
        // break two numbers without spaces, and we haven't seen many instances
        // of it used within a number.
        // The escaped bytes encode the range U+2012..U+2015 plus U+FF0D.
        regexp_factory_->CreateRegExp(
            "[\xE2\x80\x92-\xE2\x80\x95\xEF\xBC\x8D]" /* "‒-―-" */
            "\\p{Z}*(.+)"));
    inner_matches_->push_back(
        // Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
        regexp_factory_->CreateRegExp("\\.+\\p{Z}*([^.]+)"));
    inner_matches_->push_back(
        // Breaks on space - e.g. "3324451234 8002341234"
        regexp_factory_->CreateRegExp("\\p{Z}+(\\P{Z}+)"));
  }

 private:
  DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
};
359
360 class AlternateFormats : public Singleton<AlternateFormats> {
361 public:
362 PhoneMetadataCollection format_data_;
363
364 map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_;
365
AlternateFormats()366 AlternateFormats()
367 : format_data_(),
368 calling_code_to_alternate_formats_map_() {
369 if (!LoadAlternateFormats(&format_data_)) {
370 LOG(DFATAL) << "Could not parse compiled-in metadata.";
371 return;
372 }
373 for (RepeatedPtrField<PhoneMetadata>::const_iterator it =
374 format_data_.metadata().begin();
375 it != format_data_.metadata().end();
376 ++it) {
377 calling_code_to_alternate_formats_map_.insert(
378 std::make_pair(it->country_code(), &*it));
379 }
380 }
381
GetAlternateFormatsForCountry(int country_calling_code) const382 const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code)
383 const {
384 map<int, const PhoneMetadata*>::const_iterator it =
385 calling_code_to_alternate_formats_map_.find(country_calling_code);
386 if (it != calling_code_to_alternate_formats_map_.end()) {
387 return it->second;
388 }
389 return NULL;
390 }
391
392 private:
393 DISALLOW_COPY_AND_ASSIGN(AlternateFormats);
394 };
395
// Creates a matcher over `text` with full control over its configuration.
//
// util: phone number utility used for parsing and validation.
// text: the text to search; it is copied into text_.
// region_code: stored as preferred_region_ and passed to the parser for each
//     candidate.
// leniency: how strictly candidates are verified.
// max_tries: budget of candidate attempts; it is decremented every time a
//     candidate is rejected.
//
// The matcher starts in the NOT_READY state with no pending match and the
// search positioned at index 0.
PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
                                       const string& text,
                                       const string& region_code,
                                       PhoneNumberMatcher::Leniency leniency,
                                       int max_tries)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      alternate_formats_(AlternateFormats::GetInstance()),
      phone_util_(util),
      text_(text),
      preferred_region_(region_code),
      leniency_(leniency),
      max_tries_(max_tries),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0),
      is_input_valid_utf8_(true) {
  // Validate the input once, after text_ has been initialized; HasNext()
  // reports no matches when the text is not valid UTF-8.
  is_input_valid_utf8_ = IsInputUtf8();
}
414
// Creates a matcher over `text` with default settings: the shared
// PhoneNumberUtil instance, VALID leniency and the largest possible number of
// tries.
//
// text: the text to search; it is copied into text_.
// region_code: stored as preferred_region_ and passed to the parser for each
//     candidate.
//
// alternate_formats_ is left NULL here. Within this file it is only
// dereferenced by CheckNumberGroupingIsValid(), which is called for the
// STRICT_GROUPING and EXACT_GROUPING leniencies, not for VALID.
PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
                                       const string& region_code)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      alternate_formats_(NULL),  // Not used.
      phone_util_(*PhoneNumberUtil::GetInstance()),
      text_(text),
      preferred_region_(region_code),
      leniency_(VALID),
      max_tries_(numeric_limits<int>::max()),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0),
      is_input_valid_utf8_(true) {
  // Validate the input once, after text_ has been initialized; HasNext()
  // reports no matches when the text is not valid UTF-8.
  is_input_valid_utf8_ = IsInputUtf8();
}
430
~PhoneNumberMatcher()431 PhoneNumberMatcher::~PhoneNumberMatcher() {
432 }
433
IsInputUtf8()434 bool PhoneNumberMatcher::IsInputUtf8() {
435 UnicodeText number_as_unicode;
436 number_as_unicode.PointToUTF8(text_.c_str(), text_.size());
437 return number_as_unicode.UTF8WasValid();
438 }
439
440 // static
IsLatinLetter(char32 letter)441 bool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
442 // Combining marks are a subset of non-spacing-mark.
443 if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) {
444 return false;
445 }
446 UBlockCode block = ublock_getCode(letter);
447 return ((block == UBLOCK_BASIC_LATIN) ||
448 (block == UBLOCK_LATIN_1_SUPPLEMENT) ||
449 (block == UBLOCK_LATIN_EXTENDED_A) ||
450 (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
451 (block == UBLOCK_LATIN_EXTENDED_B) ||
452 (block == UBLOCK_COMBINING_DIACRITICAL_MARKS));
453 }
454
ParseAndVerify(const string & candidate,int offset,PhoneNumberMatch * match)455 bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
456 PhoneNumberMatch* match) {
457 DCHECK(match);
458 // Check the candidate doesn't contain any formatting which would indicate
459 // that it really isn't a phone number.
460 if (!reg_exps_->matching_brackets_->FullMatch(candidate) ||
461 reg_exps_->pub_pages_->PartialMatch(candidate)) {
462 return false;
463 }
464
465 // If leniency is set to VALID or stricter, we also want to skip numbers that
466 // are surrounded by Latin alphabetic characters, to skip cases like
467 // abc8005001234 or 8005001234def.
468 if (leniency_ >= VALID) {
469 // If the candidate is not at the start of the text, and does not start with
470 // phone-number punctuation, check the previous character.
471 scoped_ptr<RegExpInput> candidate_input(
472 reg_exps_->regexp_factory_->CreateInput(candidate));
473 if (offset > 0 &&
474 !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
475 char32 previous_char;
476 const char* previous_char_ptr =
477 EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
478 text_.c_str() + offset);
479 EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
480 // We return false if it is a latin letter or an invalid punctuation
481 // symbol.
482 if (IsInvalidPunctuationSymbol(previous_char) ||
483 IsLatinLetter(previous_char)) {
484 return false;
485 }
486 }
487 size_t lastCharIndex = offset + candidate.length();
488 if (lastCharIndex < text_.length()) {
489 char32 next_char;
490 const char* next_char_ptr =
491 EncodingUtils::AdvanceOneUTF8Character(
492 text_.c_str() + lastCharIndex - 1);
493 EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
494 if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) {
495 return false;
496 }
497 }
498 }
499
500 PhoneNumber number;
501 if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) !=
502 PhoneNumberUtil::NO_PARSING_ERROR) {
503 return false;
504 }
505
506 if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
507 match->set_start(offset);
508 match->set_raw_string(candidate);
509 // We used ParseAndKeepRawInput to create this number, but for now we don't
510 // return the extra values parsed. TODO: stop clearing all values here and
511 // switch all users over to using raw_input() rather than the raw_string()
512 // of PhoneNumberMatch.
513 number.clear_country_code_source();
514 number.clear_preferred_domestic_carrier_code();
515 number.clear_raw_input();
516 match->set_number(number);
517 return true;
518 }
519 return false;
520 }
521
522 // Helper method to replace the verification method for each enum in the Java
523 // version.
VerifyAccordingToLeniency(Leniency leniency,const PhoneNumber & number,const string & candidate) const524 bool PhoneNumberMatcher::VerifyAccordingToLeniency(
525 Leniency leniency, const PhoneNumber& number,
526 const string& candidate) const {
527 switch (leniency) {
528 case PhoneNumberMatcher::POSSIBLE:
529 return phone_util_.IsPossibleNumber(number);
530 case PhoneNumberMatcher::VALID:
531 if (!phone_util_.IsValidNumber(number) ||
532 !ContainsOnlyValidXChars(number, candidate, phone_util_)) {
533 return false;
534 }
535 return IsNationalPrefixPresentIfRequired(number);
536 case PhoneNumberMatcher::STRICT_GROUPING: {
537 if (!phone_util_.IsValidNumber(number) ||
538 !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
539 ContainsMoreThanOneSlashInNationalNumber(
540 number, candidate, phone_util_) ||
541 !IsNationalPrefixPresentIfRequired(number)) {
542 return false;
543 }
544 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
545 const string&, const std::vector<string>&>* callback =
546 NewPermanentCallback(&AllNumberGroupsRemainGrouped);
547 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
548 delete(callback);
549 return is_valid;
550 }
551 case PhoneNumberMatcher::EXACT_GROUPING: {
552 if (!phone_util_.IsValidNumber(number) ||
553 !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
554 ContainsMoreThanOneSlashInNationalNumber(
555 number, candidate, phone_util_) ||
556 !IsNationalPrefixPresentIfRequired(number)) {
557 return false;
558 }
559 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
560 const string&, const std::vector<string>&>* callback =
561 NewPermanentCallback(
562 this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
563 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
564 delete(callback);
565 return is_valid;
566 }
567 default:
568 LOG(ERROR) << "No implementation defined for verification for leniency "
569 << static_cast<int>(leniency);
570 return false;
571 }
572 }
573
ExtractInnerMatch(const string & candidate,int offset,PhoneNumberMatch * match)574 bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
575 PhoneNumberMatch* match) {
576 DCHECK(match);
577 for (std::vector<const RegExp*>::const_iterator regex =
578 reg_exps_->inner_matches_->begin();
579 regex != reg_exps_->inner_matches_->end(); regex++) {
580 scoped_ptr<RegExpInput> candidate_input(
581 reg_exps_->regexp_factory_->CreateInput(candidate));
582 bool is_first_match = true;
583 string group;
584 while ((*regex)->FindAndConsume(candidate_input.get(), &group) &&
585 max_tries_ > 0) {
586 int group_start_index = static_cast<int>(candidate.length() -
587 candidate_input->ToString().length() - group.length());
588 if (is_first_match) {
589 // We should handle any group before this one too.
590 string first_group_only = candidate.substr(0, group_start_index);
591 phone_util_.TrimUnwantedEndChars(&first_group_only);
592 bool success = ParseAndVerify(first_group_only, offset, match);
593 if (success) {
594 return true;
595 }
596 --max_tries_;
597 is_first_match = false;
598 }
599 phone_util_.TrimUnwantedEndChars(&group);
600 bool success = ParseAndVerify(group, offset + group_start_index, match);
601 if (success) {
602 return true;
603 }
604 --max_tries_;
605 }
606 }
607 return false;
608 }
609
ExtractMatch(const string & candidate,int offset,PhoneNumberMatch * match)610 bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
611 PhoneNumberMatch* match) {
612 DCHECK(match);
613 // Skip a match that is more likely to be a date.
614 if (reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
615 return false;
616 }
617
618 // Skip potential time-stamps.
619 if (reg_exps_->time_stamps_->PartialMatch(candidate)) {
620 scoped_ptr<RegExpInput> following_text(
621 reg_exps_->regexp_factory_->CreateInput(
622 text_.substr(offset + candidate.size())));
623 if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) {
624 return false;
625 }
626 }
627
628 // Try to come up with a valid match given the entire candidate.
629 if (ParseAndVerify(candidate, offset, match)) {
630 return true;
631 }
632
633 // If that failed, try to find an "inner match" - there might be a phone
634 // number within this candidate.
635 return ExtractInnerMatch(candidate, offset, match);
636 }
637
HasNext()638 bool PhoneNumberMatcher::HasNext() {
639 // Input should contain only UTF-8 characters.
640 if (!is_input_valid_utf8_) {
641 state_ = DONE;
642 return false;
643 }
644 if (state_ == NOT_READY) {
645 PhoneNumberMatch temp_match;
646 if (!Find(search_index_, &temp_match)) {
647 state_ = DONE;
648 } else {
649 last_match_.reset(new PhoneNumberMatch(temp_match.start(),
650 temp_match.raw_string(),
651 temp_match.number()));
652 search_index_ = last_match_->end();
653 state_ = READY;
654 }
655 }
656 return state_ == READY;
657 }
658
Next(PhoneNumberMatch * match)659 bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
660 DCHECK(match);
661 // Check the state and find the next match as a side-effect if necessary.
662 if (!HasNext()) {
663 return false;
664 }
665 match->CopyFrom(*last_match_);
666 state_ = NOT_READY;
667 last_match_.reset(NULL);
668 return true;
669 }
670
Find(int index,PhoneNumberMatch * match)671 bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
672 DCHECK(match);
673
674 scoped_ptr<RegExpInput> text(
675 reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
676 string candidate;
677 while ((max_tries_ > 0) &&
678 reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
679 int start = static_cast<int>(text_.length() - text->ToString().length() - candidate.length());
680 // Check for extra numbers at the end.
681 reg_exps_->capture_up_to_second_number_start_pattern_->
682 PartialMatch(candidate, &candidate);
683 if (ExtractMatch(candidate, start, match)) {
684 return true;
685 }
686
687 index = static_cast<int>(start + candidate.length());
688 --max_tries_;
689 }
690 return false;
691 }
692
CheckNumberGroupingIsValid(const PhoneNumber & phone_number,const string & candidate,ResultCallback4<bool,const PhoneNumberUtil &,const PhoneNumber &,const string &,const std::vector<string> &> * checker) const693 bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
694 const PhoneNumber& phone_number,
695 const string& candidate,
696 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
697 const string&, const std::vector<string>&>* checker) const {
698 DCHECK(checker);
699 string normalized_candidate =
700 NormalizeUTF8::NormalizeDecimalDigits(candidate);
701 std::vector<string> formatted_number_groups;
702 GetNationalNumberGroups(phone_number, &formatted_number_groups);
703 if (checker->Run(phone_util_, phone_number, normalized_candidate,
704 formatted_number_groups)) {
705 return true;
706 }
707 // If this didn't pass, see if there are any alternate formats that match, and
708 // try them instead.
709 const PhoneMetadata* alternate_formats =
710 alternate_formats_->GetAlternateFormatsForCountry(
711 phone_number.country_code());
712 if (alternate_formats) {
713 string national_significant_number;
714 phone_util_.GetNationalSignificantNumber(phone_number,
715 &national_significant_number);
716 for (RepeatedPtrField<NumberFormat>::const_iterator it =
717 alternate_formats->number_format().begin();
718 it != alternate_formats->number_format().end(); ++it) {
719 if (it->leading_digits_pattern_size() > 0) {
720 std::unique_ptr<RegExpInput> nsn_input(
721 reg_exps_->regexp_factory_->CreateInput(
722 national_significant_number));
723 // There is only one leading digits pattern for alternate formats.
724 if (!reg_exps_->regexp_cache_.GetRegExp(
725 it->leading_digits_pattern(0)).Consume(nsn_input.get())) {
726 // Leading digits don't match; try another one.
727 continue;
728 }
729 }
730 formatted_number_groups.clear();
731 GetNationalNumberGroupsForPattern(phone_number, &*it,
732 &formatted_number_groups);
733 if (checker->Run(phone_util_, phone_number, normalized_candidate,
734 formatted_number_groups)) {
735 return true;
736 }
737 }
738 }
739 return false;
740 }
741
GetNationalNumberGroups(const PhoneNumber & number,std::vector<string> * digit_blocks) const742 void PhoneNumberMatcher::GetNationalNumberGroups(
743 const PhoneNumber& number,
744 std::vector<string>* digit_blocks) const {
745 string rfc3966_format;
746 // This will be in the format +CC-DG1-DG2-DGX;ext=EXT where DG1..DGX
747 // represents groups of digits.
748 phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
749 // We remove the extension part from the formatted string before splitting
750 // it into different groups.
751 size_t end_index = rfc3966_format.find(';');
752 if (end_index == string::npos) {
753 end_index = rfc3966_format.length();
754 }
755 // The country-code will have a '-' following it.
756 size_t start_index = rfc3966_format.find('-') + 1;
757 SplitStringUsing(rfc3966_format.substr(start_index,
758 end_index - start_index),
759 '-', digit_blocks);
760 }
761
GetNationalNumberGroupsForPattern(const PhoneNumber & number,const NumberFormat * formatting_pattern,std::vector<string> * digit_blocks) const762 void PhoneNumberMatcher::GetNationalNumberGroupsForPattern(
763 const PhoneNumber& number,
764 const NumberFormat* formatting_pattern,
765 std::vector<string>* digit_blocks) const {
766 string rfc3966_format;
767 // We format the NSN only, and split that according to the separator.
768 string national_significant_number;
769 phone_util_.GetNationalSignificantNumber(number,
770 &national_significant_number);
771 phone_util_.FormatNsnUsingPattern(national_significant_number,
772 *formatting_pattern,
773 PhoneNumberUtil::RFC3966,
774 &rfc3966_format);
775 SplitStringUsing(rfc3966_format, '-', digit_blocks);
776 }
777
IsNationalPrefixPresentIfRequired(const PhoneNumber & number) const778 bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
779 const PhoneNumber& number) const {
780 // First, check how we deduced the country code. If it was written in
781 // international format, then the national prefix is not required.
782 if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
783 return true;
784 }
785 string phone_number_region;
786 phone_util_.GetRegionCodeForCountryCode(
787 number.country_code(), &phone_number_region);
788 const PhoneMetadata* metadata =
789 phone_util_.GetMetadataForRegion(phone_number_region);
790 if (!metadata) {
791 return true;
792 }
793 // Check if a national prefix should be present when formatting this number.
794 string national_number;
795 phone_util_.GetNationalSignificantNumber(number, &national_number);
796 const NumberFormat* format_rule =
797 phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(),
798 national_number);
799 // To do this, we check that a national prefix formatting rule was present and
800 // that it wasn't just the first-group symbol ($1) with punctuation.
801 if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) {
802 if (format_rule->national_prefix_optional_when_formatting()) {
803 // The national-prefix is optional in these cases, so we don't need to
804 // check if it was present.
805 return true;
806 }
807 if (phone_util_.FormattingRuleHasFirstGroupOnly(
808 format_rule->national_prefix_formatting_rule())) {
809 // National Prefix not needed for this number.
810 return true;
811 }
812 // Normalize the remainder.
813 string raw_input_copy(number.raw_input());
814 // Check if we found a national prefix and/or carrier code at the start of
815 // the raw input, and return the result.
816 phone_util_.NormalizeDigitsOnly(&raw_input_copy);
817 return phone_util_.MaybeStripNationalPrefixAndCarrierCode(
818 *metadata,
819 &raw_input_copy,
820 NULL); // Don't need to keep the stripped carrier code.
821 }
822 return true;
823 }
824
AllNumberGroupsAreExactlyPresent(const PhoneNumberUtil & util,const PhoneNumber & phone_number,const string & normalized_candidate,const std::vector<string> & formatted_number_groups) const825 bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
826 const PhoneNumberUtil& util,
827 const PhoneNumber& phone_number,
828 const string& normalized_candidate,
829 const std::vector<string>& formatted_number_groups) const {
830 const scoped_ptr<RegExpInput> candidate_number(
831 reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
832 std::vector<string> candidate_groups;
833 string digit_block;
834 while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
835 candidate_number.get(),
836 &digit_block)) {
837 candidate_groups.push_back(digit_block);
838 }
839
840 // Set this to the last group, skipping it if the number has an extension.
841 int candidate_number_group_index = static_cast<int>(
842 phone_number.has_extension() ? candidate_groups.size() - 2
843 : candidate_groups.size() - 1);
844 // First we check if the national significant number is formatted as a block.
845 // We use find and not equals, since the national significant number may be
846 // present with a prefix such as a national number prefix, or the country code
847 // itself.
848 string national_significant_number;
849 util.GetNationalSignificantNumber(phone_number,
850 &national_significant_number);
851 if (candidate_groups.size() == 1 ||
852 candidate_groups.at(candidate_number_group_index).find(
853 national_significant_number) != string::npos) {
854 return true;
855 }
856 // Starting from the end, go through in reverse, excluding the first group,
857 // and check the candidate and number groups are the same.
858 for (int formatted_number_group_index =
859 static_cast<int>(formatted_number_groups.size() - 1);
860 formatted_number_group_index > 0 &&
861 candidate_number_group_index >= 0;
862 --formatted_number_group_index, --candidate_number_group_index) {
863 if (candidate_groups.at(candidate_number_group_index) !=
864 formatted_number_groups.at(formatted_number_group_index)) {
865 return false;
866 }
867 }
868 // Now check the first group. There may be a national prefix at the start, so
869 // we only check that the candidate group ends with the formatted number
870 // group.
871 return (candidate_number_group_index >= 0 &&
872 HasSuffixString(candidate_groups.at(candidate_number_group_index),
873 formatted_number_groups.at(0)));
874 }
875
876 // static
ContainsMoreThanOneSlashInNationalNumber(const PhoneNumber & number,const string & candidate,const PhoneNumberUtil & util)877 bool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber(
878 const PhoneNumber& number,
879 const string& candidate,
880 const PhoneNumberUtil& util) {
881 size_t first_slash_in_body = candidate.find('/');
882 if (first_slash_in_body == string::npos) {
883 // No slashes, this is okay.
884 return false;
885 }
886 // Now look for a second one.
887 size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1);
888 if (second_slash_in_body == string::npos) {
889 // Only one slash, this is okay.
890 return false;
891 }
892
893 // If the first slash is after the country calling code, this is permitted.
894 if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN ||
895 number.country_code_source() ==
896 PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) {
897 string normalized_country_code =
898 candidate.substr(0, first_slash_in_body);
899 util.NormalizeDigitsOnly(&normalized_country_code);
900 if (normalized_country_code == SimpleItoa(number.country_code())) {
901 // Any more slashes and this is illegal.
902 return candidate.find('/', second_slash_in_body + 1) != string::npos;
903 }
904 }
905 return true;
906 }
907
908 } // namespace phonenumbers
909 } // namespace i18n
910