1 // Copyright (C) 2011 The Libphonenumber Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: Lara Rennie
16 // Author: Tao Huang
17 //
18 // Implementation of a stateful class that finds and extracts telephone numbers
19 // from text.
20
21 #include "phonenumbers/phonenumbermatcher.h"
22
23 #ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP
24 #error phonenumbermatcher depends on ICU \
25 (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set)
26 #endif // I18N_PHONENUMBERS_USE_ICU_REGEXP
27
28 #include <ctype.h>
29 #include <stddef.h>
30 #include <limits>
31 #include <map>
32 #include <string>
33 #include <utility>
34 #include <vector>
35
36 #include <unicode/uchar.h>
37
38 #include "phonenumbers/alternate_format.h"
39 #include "phonenumbers/base/logging.h"
40 #include "phonenumbers/base/memory/scoped_ptr.h"
41 #include "phonenumbers/base/memory/singleton.h"
42 #include "phonenumbers/callback.h"
43 #include "phonenumbers/default_logger.h"
44 #include "phonenumbers/encoding_utils.h"
45 #include "phonenumbers/normalize_utf8.h"
46 #include "phonenumbers/phonemetadata.pb.h"
47 #include "phonenumbers/phonenumber.pb.h"
48 #include "phonenumbers/phonenumbermatch.h"
49 #include "phonenumbers/phonenumberutil.h"
50 #include "phonenumbers/regexp_adapter.h"
51 #include "phonenumbers/regexp_adapter_icu.h"
52 #include "phonenumbers/stringutil.h"
53
54 #ifdef I18N_PHONENUMBERS_USE_RE2
55 #include "phonenumbers/regexp_adapter_re2.h"
#endif  // I18N_PHONENUMBERS_USE_RE2
57
58 using std::make_pair;
59 using std::map;
60 using std::numeric_limits;
61 using std::string;
62 using std::vector;
63
64 namespace i18n {
65 namespace phonenumbers {
66
67 namespace {
68 // Returns a regular expression quantifier with an upper and lower limit.
Limit(int lower,int upper)69 string Limit(int lower, int upper) {
70 DCHECK_GE(lower, 0);
71 DCHECK_GT(upper, 0);
72 DCHECK_LT(lower, upper);
73 return StrCat("{", lower, ",", upper, "}");
74 }
75
IsInvalidPunctuationSymbol(char32 character)76 bool IsInvalidPunctuationSymbol(char32 character) {
77 return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL;
78 }
79
ContainsOnlyValidXChars(const PhoneNumber & number,const string & candidate,const PhoneNumberUtil & util)80 bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
81 const PhoneNumberUtil& util) {
82 // The characters 'x' and 'X' can be (1) a carrier code, in which case they
83 // always precede the national significant number or (2) an extension sign,
84 // in which case they always precede the extension number. We assume a
85 // carrier code is more than 1 digit, so the first case has to have more than
86 // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1
87 // 'x' or 'X'.
88 size_t found;
89 found = candidate.find_first_of("xX");
90 // We ignore the character if 'x' or 'X' appears as the last character of
91 // the string.
92 while (found != string::npos && found < candidate.length() - 1) {
93 // We only look for 'x' or 'X' in ASCII form.
94 char next_char = candidate[found + 1];
95 if (next_char == 'x' || next_char == 'X') {
96 // This is the carrier code case, in which the 'X's always precede the
97 // national significant number.
98 ++found;
99 if (util.IsNumberMatchWithOneString(
100 number, candidate.substr(found, candidate.length() - found))
101 != PhoneNumberUtil::NSN_MATCH) {
102 return false;
103 }
104 } else {
105 string normalized_extension(candidate.substr(found,
106 candidate.length() - found));
107 util.NormalizeDigitsOnly(&normalized_extension);
108 if (normalized_extension != number.extension()) {
109 return false;
110 }
111 }
112 found = candidate.find_first_of("xX", found + 1);
113 }
114 return true;
115 }
116
AllNumberGroupsRemainGrouped(const PhoneNumberUtil & util,const PhoneNumber & phone_number,const string & normalized_candidate,const vector<string> & formatted_number_groups)117 bool AllNumberGroupsRemainGrouped(
118 const PhoneNumberUtil& util,
119 const PhoneNumber& phone_number,
120 const string& normalized_candidate,
121 const vector<string>& formatted_number_groups) {
122 size_t from_index = 0;
123 // Check each group of consecutive digits are not broken into separate
124 // groupings in the normalized_candidate string.
125 for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
126 // Fails if the substring of normalized_candidate starting from from_index
127 // doesn't contain the consecutive digits in formatted_number_groups.at(i).
128 from_index = normalized_candidate.find(formatted_number_groups.at(i),
129 from_index);
130 if (from_index == string::npos) {
131 return false;
132 }
133 // Moves from_index forward.
134 from_index += formatted_number_groups.at(i).length();
135 if (i == 0 && from_index < normalized_candidate.length()) {
136 // We are at the position right after the NDC. We get the region used for
137 // formatting information based on the country code in the phone number,
138 // rather than the number itself, as we do not need to distinguish between
139 // different countries with the same country calling code and this is
140 // faster.
141 string region;
142 util.GetRegionCodeForCountryCode(phone_number.country_code(), ®ion);
143 string ndd_prefix;
144 util.GetNddPrefixForRegion(region, true, &ndd_prefix);
145 // Note although normalized_candidate might contain non-ASCII formatting
146 // characters, they won't be treated as ASCII digits when converted to a
147 // char.
148 if (!ndd_prefix.empty() && isdigit(normalized_candidate.at(from_index))) {
149 // This means there is no formatting symbol after the NDC. In this case,
150 // we only accept the number if there is no formatting symbol at all in
151 // the number, except for extensions. This is only important for
152 // countries with national prefixes.
153 string national_significant_number;
154 util.GetNationalSignificantNumber(
155 phone_number, &national_significant_number);
156 return HasPrefixString(normalized_candidate.substr(
157 from_index - formatted_number_groups.at(i).length()),
158 national_significant_number);
159 }
160 }
161 }
162 // The check here makes sure that we haven't mistakenly already used the
163 // extension to match the last group of the subscriber number. Note the
164 // extension cannot have formatting in-between digits.
165 return normalized_candidate.substr(from_index)
166 .find(phone_number.extension()) != string::npos;
167 }
168
LoadAlternateFormats(PhoneMetadataCollection * alternate_formats)169 bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) {
170 #if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS)
171 if (!alternate_formats->ParseFromArray(alternate_format_get(),
172 alternate_format_size())) {
173 LOG(ERROR) << "Could not parse binary data.";
174 return false;
175 }
176 return true;
177 #else
178 return false;
179 #endif
180 }
181
182 } // namespace
183
// Singleton that owns every compiled regular expression used by
// PhoneNumberMatcher, together with the string fragments those expressions
// are assembled from.
//
// The fragments are plain data members that later members are built from in
// the constructor's initializer list. C++ initializes members in declaration
// order, so the declaration order below must keep each fragment ahead of
// everything that uses it.
class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
 private:
  friend class Singleton<PhoneNumberMatcherRegExps>;

  // Character-class contents (without the enclosing "[...]") listing the
  // opening brackets: ASCII '(' and '[' plus their fullwidth forms U+FF08 and
  // U+FF3B.
  string opening_parens_;
  // Character-class contents listing the matching closing brackets: ASCII ')'
  // and ']' plus fullwidth U+FF09 and U+FF3D.
  string closing_parens_;
  // Character class matching any single character that is not one of the
  // brackets above.
  string non_parens_;
  // Limit on the number of pairs of brackets in a phone number.
  string bracket_pair_limit_;
  // Helper strings for the matching_brackets_ pattern.
  // An opening bracket at the beginning may not be closed, but subsequent ones
  // should be. It's also possible that the leading bracket was dropped, so we
  // shouldn't be surprised if we see a closing bracket first.
  string leading_maybe_matched_bracket_;
  string bracket_pairs_;
  // Limit on the number of leading (plus) characters.
  string lead_limit_;
  // Limit on the number of consecutive punctuation characters.
  string punctuation_limit_;
  // The maximum number of digits allowed in a digit-separated block. As we
  // allow all digits in a single block, this should be set high enough to
  // accommodate the entire national number and the international country code.
  int digit_block_limit_;
  // Limit on the number of blocks separated by punctuation. Uses
  // digit_block_limit_ since some formats use spaces to separate each digit.
  string block_limit_;
  // A punctuation sequence allowing white space.
  string punctuation_;
  // A digits block without punctuation.
  string digit_sequence_;
  // Punctuation that may be at the start of a phone number - brackets and plus
  // signs.
  string lead_class_chars_;
  // Same as lead_class_chars_, but enclosed as a character class.
  string lead_class_;
  // Extra helper strings that form part of pattern_. These are stored
  // separately since StrCat has a limit of 12 args.
  string opening_punctuation_;
  string optional_extn_pattern_;

 public:
  // We use two different reg-ex factories here for performance reasons. RE2 is
  // much faster for smaller reg-ex patterns, but the main pattern cannot be
  // handled by RE2 in an efficient way.
  // regexp_factory_for_pattern_ is always the ICU factory and compiles
  // pattern_; regexp_factory_ is the RE2 factory when
  // I18N_PHONENUMBERS_USE_RE2 is defined (ICU otherwise) and compiles all the
  // other expressions.
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_;

  // Matches strings that look like publication pages. Example:
  // Computing Complete Answers to Queries in the Presence of Limited Access
  // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
  //
  // The string "211-227 (2003)" is not a telephone number.
  scoped_ptr<const RegExp> pub_pages_;
  // Matches strings that look like dates using "/" as a separator. Examples:
  // 3/10/2011, 31/10/96 or 08/31/95.
  scoped_ptr<const RegExp> slash_separated_dates_;
  // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
  // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
  scoped_ptr<const RegExp> time_stamps_;
  scoped_ptr<const RegExp> time_stamps_suffix_;
  // Pattern to check that brackets match. Opening brackets should be closed
  // within a phone number. This also checks that there is something inside the
  // brackets. Having no brackets at all is also fine.
  scoped_ptr<const RegExp> matching_brackets_;
  // Matches white-space, which may indicate the end of a phone number and the
  // start of something else (such as a neighbouring zip-code). If white-space
  // is found, continues to match all characters that are not typically used to
  // start a phone number.
  scoped_ptr<const RegExp> group_separator_;
  // Compiled from PhoneNumberUtil::kCaptureUpToSecondNumberStart.
  scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
  // Captures one run of digits, "(\d+)".
  scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
  // Compiled reg-ex representing lead_class_;
  scoped_ptr<const RegExp> lead_class_pattern_;
  // Phone number pattern allowing optional punctuation.
  scoped_ptr<const RegExp> pattern_;

  // Builds all string fragments first, then the two factories, then compiles
  // every expression. The initializer order mirrors the declaration order.
  PhoneNumberMatcherRegExps()
      : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[（［" */),
        closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\]）］" */),
        non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
        bracket_pair_limit_(Limit(0, 3)),
        leading_maybe_matched_bracket_(StrCat(
            "(?:[", opening_parens_, "])?",
            "(?:", non_parens_, "+[", closing_parens_, "])?")),
        bracket_pairs_(StrCat(
            "(?:[", opening_parens_, "]", non_parens_, "+",
            "[", closing_parens_, "])", bracket_pair_limit_)),
        lead_limit_(Limit(0, 2)),
        punctuation_limit_(Limit(0, 4)),
        digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
                           PhoneNumberUtil::kMaxLengthCountryCode),
        block_limit_(Limit(0, digit_block_limit_)),
        punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
                            punctuation_limit_)),
        digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
        lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
        lead_class_(StrCat("[", lead_class_chars_, "]")),
        opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")),
        // "(?i)" makes the optional extension part case-insensitive.
        optional_extn_pattern_(StrCat(
            "(?i)(?:",
            PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
            ")?")),
        regexp_factory_for_pattern_(new ICURegExpFactory()),
#ifdef I18N_PHONENUMBERS_USE_RE2
        regexp_factory_(new RE2RegExpFactory()),
#else
        regexp_factory_(new ICURegExpFactory()),
#endif  // I18N_PHONENUMBERS_USE_RE2
        pub_pages_(regexp_factory_->CreateRegExp(
            "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
        slash_separated_dates_(regexp_factory_->CreateRegExp(
            "(?:(?:[0-3]?\\d/[01]?\\d)|"
            "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
        time_stamps_(regexp_factory_->CreateRegExp(
            "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")),
        time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
        matching_brackets_(regexp_factory_->CreateRegExp(
            StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
                   bracket_pairs_, non_parens_, "*"))),
        group_separator_(regexp_factory_->CreateRegExp(
            StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))),
        capture_up_to_second_number_start_pattern_(
            regexp_factory_->CreateRegExp(
                PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
        capturing_ascii_digits_pattern_(
            regexp_factory_->CreateRegExp("(\\d+)")),
        lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
        // The whole match is wrapped in one capturing group: up to two
        // pieces of leading punctuation, a digit block, further
        // punctuation-separated digit blocks, then an optional extension.
        pattern_(regexp_factory_for_pattern_->CreateRegExp(
            StrCat("(", opening_punctuation_, lead_limit_,
                   digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
                   block_limit_, optional_extn_pattern_, ")"))) {
  }

 private:
  DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
};
320
321 class AlternateFormats : public Singleton<AlternateFormats> {
322 public:
323 PhoneMetadataCollection format_data_;
324
325 map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_;
326
AlternateFormats()327 AlternateFormats()
328 : format_data_(),
329 calling_code_to_alternate_formats_map_() {
330 if (!LoadAlternateFormats(&format_data_)) {
331 LOG(DFATAL) << "Could not parse compiled-in metadata.";
332 return;
333 }
334 for (RepeatedPtrField<PhoneMetadata>::const_iterator it =
335 format_data_.metadata().begin();
336 it != format_data_.metadata().end();
337 ++it) {
338 calling_code_to_alternate_formats_map_.insert(
339 make_pair(it->country_code(), &*it));
340 }
341 }
342
GetAlternateFormatsForCountry(int country_calling_code) const343 const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code)
344 const {
345 map<int, const PhoneMetadata*>::const_iterator it =
346 calling_code_to_alternate_formats_map_.find(country_calling_code);
347 if (it != calling_code_to_alternate_formats_map_.end()) {
348 return it->second;
349 }
350 return NULL;
351 }
352
353 private:
354 DISALLOW_COPY_AND_ASSIGN(AlternateFormats);
355 };
356
// Creates a matcher over `text` with caller-chosen settings.
//
// util        - utility object used for parsing and validation.
// text        - the text to search for phone numbers.
// region_code - region passed to the parser as the default for candidates.
// leniency    - verification level applied to each candidate.
// max_tries   - budget that is decremented for every rejected candidate;
//               Find() stops searching once it is no longer positive.
//
// The matcher starts in state NOT_READY with no cached match and a search
// position of 0. Both the regular-expression and the alternate-format
// singletons are fetched here.
PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
                                       const string& text,
                                       const string& region_code,
                                       PhoneNumberMatcher::Leniency leniency,
                                       int max_tries)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      alternate_formats_(AlternateFormats::GetInstance()),
      phone_util_(util),
      text_(text),
      preferred_region_(region_code),
      leniency_(leniency),
      max_tries_(max_tries),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0) {
}
373
// Creates a matcher over `text` with default settings: the PhoneNumberUtil
// singleton, VALID leniency and the largest possible try budget.
//
// text        - the text to search for phone numbers.
// region_code - region passed to the parser as the default for candidates.
//
// alternate_formats_ is left NULL. In this file it is only dereferenced by
// CheckNumberGroupingIsValid(), which VerifyAccordingToLeniency() calls for
// the STRICT_GROUPING and EXACT_GROUPING leniencies, not for VALID.
PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
                                       const string& region_code)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      alternate_formats_(NULL),  // Not used.
      phone_util_(*PhoneNumberUtil::GetInstance()),
      text_(text),
      preferred_region_(region_code),
      leniency_(VALID),
      max_tries_(numeric_limits<int>::max()),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0) {
}
387
// Destructor. The body is intentionally empty; nothing is released by hand
// here.
PhoneNumberMatcher::~PhoneNumberMatcher() {
}
390
391 // static
IsLatinLetter(char32 letter)392 bool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
393 // Combining marks are a subset of non-spacing-mark.
394 if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) {
395 return false;
396 }
397 UBlockCode block = ublock_getCode(letter);
398 return ((block == UBLOCK_BASIC_LATIN) ||
399 (block == UBLOCK_LATIN_1_SUPPLEMENT) ||
400 (block == UBLOCK_LATIN_EXTENDED_A) ||
401 (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
402 (block == UBLOCK_LATIN_EXTENDED_B) ||
403 (block == UBLOCK_COMBINING_DIACRITICAL_MARKS));
404 }
405
ParseAndVerify(const string & candidate,int offset,PhoneNumberMatch * match)406 bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
407 PhoneNumberMatch* match) {
408 DCHECK(match);
409 // Check the candidate doesn't contain any formatting which would indicate
410 // that it really isn't a phone number.
411 if (!reg_exps_->matching_brackets_->FullMatch(candidate)) {
412 return false;
413 }
414
415 // If leniency is set to VALID or stricter, we also want to skip numbers that
416 // are surrounded by Latin alphabetic characters, to skip cases like
417 // abc8005001234 or 8005001234def.
418 if (leniency_ >= VALID) {
419 // If the candidate is not at the start of the text, and does not start with
420 // phone-number punctuation, check the previous character.
421 scoped_ptr<RegExpInput> candidate_input(
422 reg_exps_->regexp_factory_->CreateInput(candidate));
423 if (offset > 0 &&
424 !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
425 char32 previous_char;
426 const char* previous_char_ptr =
427 EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
428 text_.c_str() + offset);
429 EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
430 // We return false if it is a latin letter or an invalid punctuation
431 // symbol.
432 if (IsInvalidPunctuationSymbol(previous_char) ||
433 IsLatinLetter(previous_char)) {
434 return false;
435 }
436 }
437 size_t lastCharIndex = offset + candidate.length();
438 if (lastCharIndex < text_.length()) {
439 char32 next_char;
440 const char* next_char_ptr =
441 EncodingUtils::AdvanceOneUTF8Character(
442 text_.c_str() + lastCharIndex - 1);
443 EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
444 if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) {
445 return false;
446 }
447 }
448 }
449
450 PhoneNumber number;
451 if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) !=
452 PhoneNumberUtil::NO_PARSING_ERROR) {
453 return false;
454 }
455 if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
456 match->set_start(offset);
457 match->set_raw_string(candidate);
458 // We used ParseAndKeepRawInput to create this number, but for now we don't
459 // return the extra values parsed. TODO: stop clearing all values here and
460 // switch all users over to using raw_input() rather than the raw_string()
461 // of PhoneNumberMatch.
462 number.clear_country_code_source();
463 number.clear_preferred_domestic_carrier_code();
464 number.clear_raw_input();
465 match->set_number(number);
466 return true;
467 }
468 return false;
469 }
470
471 // Helper method to replace the verification method for each enum in the Java
472 // version.
VerifyAccordingToLeniency(Leniency leniency,const PhoneNumber & number,const string & candidate) const473 bool PhoneNumberMatcher::VerifyAccordingToLeniency(
474 Leniency leniency, const PhoneNumber& number,
475 const string& candidate) const {
476 switch (leniency) {
477 case PhoneNumberMatcher::POSSIBLE:
478 return phone_util_.IsPossibleNumber(number);
479 case PhoneNumberMatcher::VALID:
480 if (!phone_util_.IsValidNumber(number) ||
481 !ContainsOnlyValidXChars(number, candidate, phone_util_)) {
482 return false;
483 }
484 return IsNationalPrefixPresentIfRequired(number);
485 case PhoneNumberMatcher::STRICT_GROUPING: {
486 if (!phone_util_.IsValidNumber(number) ||
487 !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
488 ContainsMoreThanOneSlashInNationalNumber(
489 number, candidate, phone_util_) ||
490 !IsNationalPrefixPresentIfRequired(number)) {
491 return false;
492 }
493 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
494 const string&, const vector<string>&>* callback =
495 NewPermanentCallback(&AllNumberGroupsRemainGrouped);
496 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
497 delete(callback);
498 return is_valid;
499 }
500 case PhoneNumberMatcher::EXACT_GROUPING: {
501 if (!phone_util_.IsValidNumber(number) ||
502 !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
503 ContainsMoreThanOneSlashInNationalNumber(
504 number, candidate, phone_util_) ||
505 !IsNationalPrefixPresentIfRequired(number)) {
506 return false;
507 }
508 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
509 const string&, const vector<string>&>* callback =
510 NewPermanentCallback(
511 this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
512 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
513 delete(callback);
514 return is_valid;
515 }
516 default:
517 LOG(ERROR) << "No implementation defined for verification for leniency "
518 << static_cast<int>(leniency);
519 return false;
520 }
521 }
522
ExtractInnerMatch(const string & candidate,int offset,PhoneNumberMatch * match)523 bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
524 PhoneNumberMatch* match) {
525 DCHECK(match);
526 // Try removing either the first or last "group" in the number and see if this
527 // gives a result. We consider white space to be a possible indication of
528 // the start or end of the phone number.
529 scoped_ptr<RegExpInput> candidate_input(
530 reg_exps_->regexp_factory_->CreateInput(candidate));
531 if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
532 NULL)) {
533 // Try the first group by itself.
534 int group_start_index =
535 candidate.length() - candidate_input->ToString().length();
536 string first_group_only = candidate.substr(0, group_start_index);
537 phone_util_.TrimUnwantedEndChars(&first_group_only);
538 bool success = ParseAndVerify(first_group_only, offset, match);
539 if (success) {
540 return true;
541 }
542 --max_tries_;
543
544 // Try the rest of the candidate without the first group.
545 string without_first_group(candidate_input->ToString());
546 phone_util_.TrimUnwantedEndChars(&without_first_group);
547 success =
548 ParseAndVerify(without_first_group, offset + group_start_index, match);
549 if (success) {
550 return true;
551 }
552 --max_tries_;
553
554 if (max_tries_ > 0) {
555 while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
556 NULL)) {
557 // Find the last group.
558 }
559 int last_group_start =
560 candidate.length() - candidate_input->ToString().length();
561 string without_last_group = candidate.substr(0, last_group_start);
562 phone_util_.TrimUnwantedEndChars(&without_last_group);
563 if (without_last_group == first_group_only) {
564 // If there are only two groups, then the group "without the last group"
565 // is the same as the first group. In these cases, we don't want to
566 // re-check the number group, so we exit already.
567 return false;
568 }
569 success = ParseAndVerify(without_last_group, offset, match);
570 if (success) {
571 return true;
572 }
573 --max_tries_;
574 }
575 }
576 return false;
577 }
578
ExtractMatch(const string & candidate,int offset,PhoneNumberMatch * match)579 bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
580 PhoneNumberMatch* match) {
581 DCHECK(match);
582 // Skip a match that is more likely a publication page reference or a date.
583 if (reg_exps_->pub_pages_->PartialMatch(candidate) ||
584 reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
585 return false;
586 }
587 // Skip potential time-stamps.
588 if (reg_exps_->time_stamps_->PartialMatch(candidate)) {
589 scoped_ptr<RegExpInput> following_text(
590 reg_exps_->regexp_factory_->CreateInput(
591 text_.substr(offset + candidate.size())));
592 if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) {
593 return false;
594 }
595 }
596
597 // Try to come up with a valid match given the entire candidate.
598 if (ParseAndVerify(candidate, offset, match)) {
599 return true;
600 }
601
602 // If that failed, try to find an "inner match" - there might be a phone
603 // number within this candidate.
604 return ExtractInnerMatch(candidate, offset, match);
605 }
606
HasNext()607 bool PhoneNumberMatcher::HasNext() {
608 if (state_ == NOT_READY) {
609 PhoneNumberMatch temp_match;
610 if (!Find(search_index_, &temp_match)) {
611 state_ = DONE;
612 } else {
613 last_match_.reset(new PhoneNumberMatch(temp_match.start(),
614 temp_match.raw_string(),
615 temp_match.number()));
616 search_index_ = last_match_->end();
617 state_ = READY;
618 }
619 }
620 return state_ == READY;
621 }
622
Next(PhoneNumberMatch * match)623 bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
624 DCHECK(match);
625 // Check the state and find the next match as a side-effect if necessary.
626 if (!HasNext()) {
627 return false;
628 }
629 match->CopyFrom(*last_match_);
630 state_ = NOT_READY;
631 last_match_.reset(NULL);
632 return true;
633 }
634
Find(int index,PhoneNumberMatch * match)635 bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
636 DCHECK(match);
637
638 scoped_ptr<RegExpInput> text(
639 reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
640 string candidate;
641 while ((max_tries_ > 0) &&
642 reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
643 int start = text_.length() - text->ToString().length() - candidate.length();
644 // Check for extra numbers at the end.
645 reg_exps_->capture_up_to_second_number_start_pattern_->
646 PartialMatch(candidate, &candidate);
647 if (ExtractMatch(candidate, start, match)) {
648 return true;
649 }
650
651 index = start + candidate.length();
652 --max_tries_;
653 }
654 return false;
655 }
656
CheckNumberGroupingIsValid(const PhoneNumber & phone_number,const string & candidate,ResultCallback4<bool,const PhoneNumberUtil &,const PhoneNumber &,const string &,const vector<string> &> * checker) const657 bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
658 const PhoneNumber& phone_number,
659 const string& candidate,
660 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
661 const string&, const vector<string>&>* checker) const {
662 DCHECK(checker);
663 // TODO: Evaluate how this works for other locales (testing has been limited
664 // to NANPA regions) and optimise if necessary.
665 string normalized_candidate =
666 NormalizeUTF8::NormalizeDecimalDigits(candidate);
667 vector<string> formatted_number_groups;
668 GetNationalNumberGroups(phone_number, NULL, // Use default formatting pattern
669 &formatted_number_groups);
670 if (checker->Run(phone_util_, phone_number, normalized_candidate,
671 formatted_number_groups)) {
672 return true;
673 }
674 // If this didn't pass, see if there are any alternate formats, and try them
675 // instead.
676 const PhoneMetadata* alternate_formats =
677 alternate_formats_->GetAlternateFormatsForCountry(
678 phone_number.country_code());
679 if (alternate_formats) {
680 for (RepeatedPtrField<NumberFormat>::const_iterator it =
681 alternate_formats->number_format().begin();
682 it != alternate_formats->number_format().end(); ++it) {
683 formatted_number_groups.clear();
684 GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups);
685 if (checker->Run(phone_util_, phone_number, normalized_candidate,
686 formatted_number_groups)) {
687 return true;
688 }
689 }
690 }
691 return false;
692 }
693
694 // Helper method to get the national-number part of a number, formatted without
695 // any national prefix, and return it as a set of digit blocks that would be
696 // formatted together.
GetNationalNumberGroups(const PhoneNumber & number,const NumberFormat * formatting_pattern,vector<string> * digit_blocks) const697 void PhoneNumberMatcher::GetNationalNumberGroups(
698 const PhoneNumber& number,
699 const NumberFormat* formatting_pattern,
700 vector<string>* digit_blocks) const {
701 string rfc3966_format;
702 if (!formatting_pattern) {
703 // This will be in the format +CC-DG;ext=EXT where DG represents groups of
704 // digits.
705 phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
706 // We remove the extension part from the formatted string before splitting
707 // it into different groups.
708 size_t end_index = rfc3966_format.find(';');
709 if (end_index == string::npos) {
710 end_index = rfc3966_format.length();
711 }
712 // The country-code will have a '-' following it.
713 size_t start_index = rfc3966_format.find('-') + 1;
714 SplitStringUsing(rfc3966_format.substr(start_index,
715 end_index - start_index),
716 "-", digit_blocks);
717 } else {
718 // We format the NSN only, and split that according to the separator.
719 string national_significant_number;
720 phone_util_.GetNationalSignificantNumber(number,
721 &national_significant_number);
722 phone_util_.FormatNsnUsingPattern(national_significant_number,
723 *formatting_pattern,
724 PhoneNumberUtil::RFC3966,
725 &rfc3966_format);
726 SplitStringUsing(rfc3966_format, "-", digit_blocks);
727 }
728 }
729
IsNationalPrefixPresentIfRequired(const PhoneNumber & number) const730 bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
731 const PhoneNumber& number) const {
732 // First, check how we deduced the country code. If it was written in
733 // international format, then the national prefix is not required.
734 if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
735 return true;
736 }
737 string phone_number_region;
738 phone_util_.GetRegionCodeForCountryCode(
739 number.country_code(), &phone_number_region);
740 const PhoneMetadata* metadata =
741 phone_util_.GetMetadataForRegion(phone_number_region);
742 if (!metadata) {
743 return true;
744 }
745 // Check if a national prefix should be present when formatting this number.
746 string national_number;
747 phone_util_.GetNationalSignificantNumber(number, &national_number);
748 const NumberFormat* format_rule =
749 phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(),
750 national_number);
751 // To do this, we check that a national prefix formatting rule was present and
752 // that it wasn't just the first-group symbol ($1) with punctuation.
753 if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) {
754 if (format_rule->national_prefix_optional_when_formatting()) {
755 // The national-prefix is optional in these cases, so we don't need to
756 // check if it was present.
757 return true;
758 }
759 if (phone_util_.FormattingRuleHasFirstGroupOnly(
760 format_rule->national_prefix_formatting_rule())) {
761 // National Prefix not needed for this number.
762 return true;
763 }
764 // Normalize the remainder.
765 string raw_input_copy(number.raw_input());
766 // Check if we found a national prefix and/or carrier code at the start of
767 // the raw input, and return the result.
768 phone_util_.NormalizeDigitsOnly(&raw_input_copy);
769 return phone_util_.MaybeStripNationalPrefixAndCarrierCode(
770 *metadata,
771 &raw_input_copy,
772 NULL); // Don't need to keep the stripped carrier code.
773 }
774 return true;
775 }
776
AllNumberGroupsAreExactlyPresent(const PhoneNumberUtil & util,const PhoneNumber & phone_number,const string & normalized_candidate,const vector<string> & formatted_number_groups) const777 bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
778 const PhoneNumberUtil& util,
779 const PhoneNumber& phone_number,
780 const string& normalized_candidate,
781 const vector<string>& formatted_number_groups) const {
782 const scoped_ptr<RegExpInput> candidate_number(
783 reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
784 vector<string> candidate_groups;
785 string digit_block;
786 while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
787 candidate_number.get(),
788 &digit_block)) {
789 candidate_groups.push_back(digit_block);
790 }
791
792 // Set this to the last group, skipping it if the number has an extension.
793 int candidate_number_group_index =
794 phone_number.has_extension() ? candidate_groups.size() - 2
795 : candidate_groups.size() - 1;
796 // First we check if the national significant number is formatted as a block.
797 // We use find and not equals, since the national significant number may be
798 // present with a prefix such as a national number prefix, or the country code
799 // itself.
800 string national_significant_number;
801 util.GetNationalSignificantNumber(phone_number,
802 &national_significant_number);
803 if (candidate_groups.size() == 1 ||
804 candidate_groups.at(candidate_number_group_index).find(
805 national_significant_number) != string::npos) {
806 return true;
807 }
808 // Starting from the end, go through in reverse, excluding the first group,
809 // and check the candidate and number groups are the same.
810 for (int formatted_number_group_index =
811 (formatted_number_groups.size() - 1);
812 formatted_number_group_index > 0 &&
813 candidate_number_group_index >= 0;
814 --formatted_number_group_index, --candidate_number_group_index) {
815 if (candidate_groups.at(candidate_number_group_index) !=
816 formatted_number_groups.at(formatted_number_group_index)) {
817 return false;
818 }
819 }
820 // Now check the first group. There may be a national prefix at the start, so
821 // we only check that the candidate group ends with the formatted number
822 // group.
823 return (candidate_number_group_index >= 0 &&
824 HasSuffixString(candidate_groups.at(candidate_number_group_index),
825 formatted_number_groups.at(0)));
826 }
827
828 // static
ContainsMoreThanOneSlashInNationalNumber(const PhoneNumber & number,const string & candidate,const PhoneNumberUtil & util)829 bool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber(
830 const PhoneNumber& number,
831 const string& candidate,
832 const PhoneNumberUtil& util) {
833 size_t first_slash_in_body = candidate.find('/');
834 if (first_slash_in_body == string::npos) {
835 // No slashes, this is okay.
836 return false;
837 }
838 // Now look for a second one.
839 size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1);
840 if (second_slash_in_body == string::npos) {
841 // Only one slash, this is okay.
842 return false;
843 }
844
845 // If the first slash is after the country calling code, this is permitted.
846 if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN ||
847 number.country_code_source() ==
848 PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) {
849 string normalized_country_code =
850 candidate.substr(0, first_slash_in_body);
851 util.NormalizeDigitsOnly(&normalized_country_code);
852 if (normalized_country_code == SimpleItoa(number.country_code())) {
853 // Any more slashes and this is illegal.
854 return candidate.find('/', second_slash_in_body + 1) != string::npos;
855 }
856 }
857 return true;
858 }
859
860 } // namespace phonenumbers
861 } // namespace i18n
862