• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2011 The Libphonenumber Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: Lara Rennie
16 // Author: Tao Huang
17 //
18 // Implementation of a stateful class that finds and extracts telephone numbers
19 // from text.
20 
21 #include "phonenumbers/phonenumbermatcher.h"
22 
23 #ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP
24 #error phonenumbermatcher depends on ICU \
25     (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set)
26 #endif  // I18N_PHONENUMBERS_USE_ICU_REGEXP
27 
28 #include <ctype.h>
29 #include <stddef.h>
30 #include <limits>
31 #include <map>
32 #include <memory>
33 #include <string>
34 #include <utility>
35 #include <vector>
36 #include <unicode/uchar.h>
37 
38 #include "phonenumbers/alternate_format.h"
39 #include "phonenumbers/base/logging.h"
40 #include "phonenumbers/base/memory/scoped_ptr.h"
41 #include "phonenumbers/base/memory/singleton.h"
42 #include "phonenumbers/callback.h"
43 #include "phonenumbers/default_logger.h"
44 #include "phonenumbers/encoding_utils.h"
45 #include "phonenumbers/normalize_utf8.h"
46 #ifdef LIBPHONENUMBER_UPGRADE
47 #include "phonenumbers/ohos/update_metadata.h"
48 #include "phonenumbers/ohos/update_libphonenumber.h"
49 #endif
50 #include "phonenumbers/phonemetadata.pb.h"
51 #include "phonenumbers/phonenumber.pb.h"
52 #include "phonenumbers/phonenumbermatch.h"
53 #include "phonenumbers/phonenumberutil.h"
54 #include "phonenumbers/regexp_adapter.h"
55 #include "phonenumbers/regexp_adapter_icu.h"
56 #include "phonenumbers/regexp_cache.h"
57 #include "phonenumbers/stringutil.h"
58 #include "phonenumbers/utf/unicodetext.h"
59 
60 #ifdef I18N_PHONENUMBERS_USE_RE2
61 #include "phonenumbers/regexp_adapter_re2.h"
62 #endif  // I18N_PHONENUMBERS_USE_RE2
63 
64 using std::map;
65 using std::numeric_limits;
66 using std::string;
67 
68 namespace i18n {
69 namespace phonenumbers {
70 
71 namespace {
72 // Returns a regular expression quantifier with an upper and lower limit.
Limit(int lower,int upper)73 string Limit(int lower, int upper) {
74   DCHECK_GE(lower, 0);
75   DCHECK_GT(upper, 0);
76   DCHECK_LT(lower, upper);
77   return StrCat("{", lower, ",", upper, "}");
78 }
79 
IsInvalidPunctuationSymbol(char32 character)80 bool IsInvalidPunctuationSymbol(char32 character) {
81   return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL;
82 }
83 
ContainsOnlyValidXChars(const PhoneNumber & number,const string & candidate,const PhoneNumberUtil & util)84 bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
85                              const PhoneNumberUtil& util) {
86   // The characters 'x' and 'X' can be (1) a carrier code, in which case they
87   // always precede the national significant number or (2) an extension sign,
88   // in which case they always precede the extension number. We assume a
89   // carrier code is more than 1 digit, so the first case has to have more than
90   // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1
91   // 'x' or 'X'.
92   size_t found;
93   found = candidate.find_first_of("xX");
94   // We ignore the character if 'x' or 'X' appears as the last character of
95   // the string.
96   while (found != string::npos && found < candidate.length() - 1) {
97     // We only look for 'x' or 'X' in ASCII form.
98     char next_char = candidate[found + 1];
99     if (next_char == 'x' || next_char == 'X') {
100       // This is the carrier code case, in which the 'X's always precede the
101       // national significant number.
102       ++found;
103       if (util.IsNumberMatchWithOneString(
104               number, candidate.substr(found, candidate.length() - found))
105           != PhoneNumberUtil::NSN_MATCH) {
106         return false;
107       }
108     } else {
109       string normalized_extension(candidate.substr(found,
110                                                    candidate.length() - found));
111       util.NormalizeDigitsOnly(&normalized_extension);
112       if (normalized_extension != number.extension()) {
113         return false;
114       }
115     }
116     found = candidate.find_first_of("xX", found + 1);
117   }
118   return true;
119 }
120 
AllNumberGroupsRemainGrouped(const PhoneNumberUtil & util,const PhoneNumber & number,const string & normalized_candidate,const std::vector<string> & formatted_number_groups)121 bool AllNumberGroupsRemainGrouped(
122     const PhoneNumberUtil& util,
123     const PhoneNumber& number,
124     const string& normalized_candidate,
125     const std::vector<string>& formatted_number_groups) {
126   size_t from_index = 0;
127   if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
128     // First skip the country code if the normalized candidate contained it.
129     string country_code = SimpleItoa(number.country_code());
130     from_index = normalized_candidate.find(country_code) + country_code.size();
131   }
132   // Check each group of consecutive digits are not broken into separate
133   // groupings in the normalized_candidate string.
134   for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
135     // Fails if the substring of normalized_candidate starting from from_index
136     // doesn't contain the consecutive digits in formatted_number_groups.at(i).
137     from_index = normalized_candidate.find(formatted_number_groups.at(i),
138                                            from_index);
139     if (from_index == string::npos) {
140       return false;
141     }
142     // Moves from_index forward.
143     from_index += formatted_number_groups.at(i).length();
144     if (i == 0 && from_index < normalized_candidate.length()) {
145       // We are at the position right after the NDC. We get the region used for
146       // formatting information based on the country code in the phone number,
147       // rather than the number itself, as we do not need to distinguish between
148       // different countries with the same country calling code and this is
149       // faster.
150       string region;
151       util.GetRegionCodeForCountryCode(number.country_code(), &region);
152       string ndd_prefix;
153       util.GetNddPrefixForRegion(region, true, &ndd_prefix);
154       // Note although normalized_candidate might contain non-ASCII formatting
155       // characters, they won't be treated as ASCII digits when converted to a
156       // char.
157       if (!ndd_prefix.empty() && isdigit(normalized_candidate.at(from_index))) {
158         // This means there is no formatting symbol after the NDC. In this case,
159         // we only accept the number if there is no formatting symbol at all in
160         // the number, except for extensions. This is only important for
161         // countries with national prefixes.
162         string national_significant_number;
163         util.GetNationalSignificantNumber(number, &national_significant_number);
164         return HasPrefixString(normalized_candidate.substr(
165             from_index - formatted_number_groups.at(i).length()),
166             national_significant_number);
167         }
168       }
169     }
170     // The check here makes sure that we haven't mistakenly already used the
171     // extension to match the last group of the subscriber number. Note the
172     // extension cannot have formatting in-between digits.
173     return normalized_candidate.substr(from_index)
174         .find(number.extension()) != string::npos;
175 }
176 
LoadAlternateFormats(PhoneMetadataCollection * alternate_formats)177 bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) {
178 #if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS)
179   if (!alternate_formats->ParseFromArray(alternate_format_get(),
180                                          alternate_format_size())) {
181     LOG(ERROR) << "Could not parse binary data.";
182     return false;
183   }
184   return true;
185 #else
186   return false;
187 #endif
188 }
189 
190 }  // namespace
191 
192 class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
193  private:
194   friend class Singleton<PhoneNumberMatcherRegExps>;
195 
196   string opening_parens_;
197   string closing_parens_;
198   string non_parens_;
199   // Limit on the number of pairs of brackets in a phone number.
200   string bracket_pair_limit_;
201   // Helper strings for the matching_brackets_ pattern.
202   // An opening bracket at the beginning may not be closed, but subsequent ones
203   // should be. It's also possible that the leading bracket was dropped, so we
204   // shouldn't be surprised if we see a closing bracket first.
205   string leading_maybe_matched_bracket_;
206   string bracket_pairs_;
207   // Limit on the number of leading (plus) characters.
208   string lead_limit_;
209   // Limit on the number of consecutive punctuation characters.
210   string punctuation_limit_;
211   // The maximum number of digits allowed in a digit-separated block. As we
212   // allow all digits in a single block, this should be set high enough to
213   // accommodate the entire national number and the international country code.
214   int digit_block_limit_;
215   // Limit on the number of blocks separated by punctuation. Uses
216   // kDigitBlockLimit since some formats use spaces to separate each digit.
217   string block_limit_;
218   // A punctuation sequence allowing white space.
219   string punctuation_;
220   // A digits block without punctuation.
221   string digit_sequence_;
222   // Punctuation that may be at the start of a phone number - brackets and plus
223   // signs.
224   string lead_class_chars_;
225   // Same as lead_class_chars_, but enclosed as a character class.
226   string lead_class_;
227 
228  public:
229   // We use two different reg-ex factories here for performance reasons. RE2 is
230   // much faster for smaller reg-ex patterns, but the main pattern cannot be
231   // handled by RE2 in an efficient way.
232   scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
233   scoped_ptr<const AbstractRegExpFactory> regexp_factory_;
234 
235   // A cache for popular reg-exps of leading digits used to match formatting
236   // patterns and the factory used to create it.
237   mutable RegExpCache regexp_cache_;
238 
239   // Matches strings that look like publication pages. Example:
240   // Computing Complete Answers to Queries in the Presence of Limited Access
241   // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
242   //
243   // The string "211-227 (2003)" is not a telephone number.
244   scoped_ptr<const RegExp> pub_pages_;
245   // Matches strings that look like dates using "/" as a separator. Examples:
246   // 3/10/2011, 31/10/96 or 08/31/95.
247   scoped_ptr<const RegExp> slash_separated_dates_;
248   // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
249   // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
250   scoped_ptr<const RegExp> time_stamps_;
251   scoped_ptr<const RegExp> time_stamps_suffix_;
252   // Pattern to check that brackets match. Opening brackets should be closed
253   // within a phone number. This also checks that there is something inside the
254   // brackets. Having no brackets at all is also fine.
255   scoped_ptr<const RegExp> matching_brackets_;
256   // Patterns used to extract phone numbers from a larger phone-number-like
257   // pattern. These are ordered according to specificity. For example,
258   // white-space is last since that is frequently used in numbers, not just to
259   // separate two numbers. We have separate patterns since we don't want to
260   // break up the phone-number-like text on more than one different kind of
261   // symbol at one time, although symbols of the same type (e.g. space) can be
262   // safely grouped together.
263   //
264   // Note that if there is a match, we will always check any text found up to
265   // the first match as well.
266   scoped_ptr<std::vector<const RegExp*> > inner_matches_;
267   scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
268   scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
269   // Compiled reg-ex representing lead_class_;
270   scoped_ptr<const RegExp> lead_class_pattern_;
271   // Phone number pattern allowing optional punctuation.
272   scoped_ptr<const RegExp> pattern_;
273 
PhoneNumberMatcherRegExps()274   PhoneNumberMatcherRegExps()
275       : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[([" */),
276         closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\])]" */),
277         non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
278         bracket_pair_limit_(Limit(0, 3)),
279         leading_maybe_matched_bracket_(StrCat(
280             "(?:[", opening_parens_, "])?",
281             "(?:", non_parens_, "+[", closing_parens_, "])?")),
282         bracket_pairs_(StrCat(
283             "(?:[", opening_parens_, "]", non_parens_, "+",
284             "[", closing_parens_, "])", bracket_pair_limit_)),
285         lead_limit_(Limit(0, 2)),
286         punctuation_limit_(Limit(0, 4)),
287         digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
288                            PhoneNumberUtil::kMaxLengthCountryCode),
289         block_limit_(Limit(0, digit_block_limit_)),
290         punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
291                             punctuation_limit_)),
292         digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
293         lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
294         lead_class_(StrCat("[", lead_class_chars_, "]")),
295         regexp_factory_for_pattern_(new ICURegExpFactory()),
296 #ifdef I18N_PHONENUMBERS_USE_RE2
297         regexp_factory_(new RE2RegExpFactory()),
298 #else
299         regexp_factory_(new ICURegExpFactory()),
300 #endif  // I18N_PHONENUMBERS_USE_RE2
301         // A cache for frequently used country-specific regular expressions. Set
302         // to 32 to cover ~2-3 countries being used for the same doc with ~10
303         // patterns for each country. Some pages will have a lot more countries
304         // in use, but typically fewer numbers for each so expanding the cache
305         // for that use-case won't have a lot of benefit.
306         regexp_cache_(*regexp_factory_, 32),
307         pub_pages_(regexp_factory_->CreateRegExp(
308             "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
309         slash_separated_dates_(regexp_factory_->CreateRegExp(
310             "(?:(?:[0-3]?\\d/[01]?\\d)|"
311             "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
312         time_stamps_(regexp_factory_->CreateRegExp(
313             "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$")),
314         time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
315         matching_brackets_(regexp_factory_->CreateRegExp(
316             StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
317                    bracket_pairs_, non_parens_, "*"))),
318         inner_matches_(new std::vector<const RegExp*>()),
319         capture_up_to_second_number_start_pattern_(
320             regexp_factory_->CreateRegExp(
321                 PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
322         capturing_ascii_digits_pattern_(
323             regexp_factory_->CreateRegExp("(\\d+)")),
324         lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
325         pattern_(regexp_factory_for_pattern_->CreateRegExp(StrCat(
326             "((?:", lead_class_, punctuation_, ")", lead_limit_,
327             digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
328             block_limit_, "(?i)(?:",
329             PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
330             ")?)"))) {
331     inner_matches_->push_back(
332         // Breaks on the slash - e.g. "651-234-2345/332-445-1234"
333         regexp_factory_->CreateRegExp("/+(.*)"));
334     inner_matches_->push_back(
335         // Note that the bracket here is inside the capturing group, since we
336         // consider it part of the phone number. Will match a pattern like
337         // "(650) 223 3345 (754) 223 3321".
338         regexp_factory_->CreateRegExp("(\\([^(]*)"));
339     inner_matches_->push_back(
340         // Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We
341         // require a space on either side of the hyphen for it to be considered
342         // a separator.
343         regexp_factory_->CreateRegExp("(?:\\p{Z}-|-\\p{Z})\\p{Z}*(.+)"));
344     inner_matches_->push_back(
345         // Various types of wide hyphens. Note we have decided not to enforce a
346         // space here, since it's possible that it's supposed to be used to
347         // break two numbers without spaces, and we haven't seen many instances
348         // of it used within a number.
349         regexp_factory_->CreateRegExp(
350             "[\xE2\x80\x92-\xE2\x80\x95\xEF\xBC\x8D]" /* "‒-―-" */
351             "\\p{Z}*(.+)"));
352     inner_matches_->push_back(
353         // Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
354         regexp_factory_->CreateRegExp("\\.+\\p{Z}*([^.]+)"));
355     inner_matches_->push_back(
356         // Breaks on space - e.g. "3324451234 8002341234"
357         regexp_factory_->CreateRegExp("\\p{Z}+(\\P{Z}+)"));
358   }
359 
360  private:
361   DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
362 };
363 
364 class AlternateFormats : public Singleton<AlternateFormats> {
365  public:
366   PhoneMetadataCollection format_data_;
367 
368   map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_;
369 
AlternateFormats()370   AlternateFormats()
371       : format_data_(),
372         calling_code_to_alternate_formats_map_() {
373     if (!LoadAlternateFormats(&format_data_)) {
374       LOG(DFATAL) << "Could not parse compiled-in metadata.";
375       return;
376     }
377     for (RepeatedPtrField<PhoneMetadata>::const_iterator it =
378              format_data_.metadata().begin();
379          it != format_data_.metadata().end();
380          ++it) {
381       calling_code_to_alternate_formats_map_.insert(
382           std::make_pair(it->country_code(), &*it));
383     }
384 
385 #ifdef LIBPHONENUMBER_UPGRADE
386     UpdateLibphonenumber::LoadUpdateData();
387     UpdateMetadata::UpdateAlternateFormat(calling_code_to_alternate_formats_map_);
388 #endif
389   }
390 
GetAlternateFormatsForCountry(int country_calling_code) const391   const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code)
392       const {
393     map<int, const PhoneMetadata*>::const_iterator it =
394         calling_code_to_alternate_formats_map_.find(country_calling_code);
395     if (it != calling_code_to_alternate_formats_map_.end()) {
396       return it->second;
397     }
398     return NULL;
399   }
400 
401  private:
402   DISALLOW_COPY_AND_ASSIGN(AlternateFormats);
403 };
404 
PhoneNumberMatcher(const PhoneNumberUtil & util,const string & text,const string & region_code,PhoneNumberMatcher::Leniency leniency,int max_tries)405 PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
406                                        const string& text,
407                                        const string& region_code,
408                                        PhoneNumberMatcher::Leniency leniency,
409                                        int max_tries)
410     : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
411       alternate_formats_(AlternateFormats::GetInstance()),
412       phone_util_(util),
413       text_(text),
414       preferred_region_(region_code),
415       leniency_(leniency),
416       max_tries_(max_tries),
417       state_(NOT_READY),
418       last_match_(NULL),
419       search_index_(0),
420       is_input_valid_utf8_(true) {
421   is_input_valid_utf8_ = IsInputUtf8();
422 }
423 
PhoneNumberMatcher(const string & text,const string & region_code)424 PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
425                                        const string& region_code)
426     : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
427       alternate_formats_(NULL),  // Not used.
428       phone_util_(*PhoneNumberUtil::GetInstance()),
429       text_(text),
430       preferred_region_(region_code),
431       leniency_(VALID),
432       max_tries_(numeric_limits<int>::max()),
433       state_(NOT_READY),
434       last_match_(NULL),
435       search_index_(0),
436       is_input_valid_utf8_(true) {
437   is_input_valid_utf8_ =  IsInputUtf8();
438 }
439 
~PhoneNumberMatcher()440 PhoneNumberMatcher::~PhoneNumberMatcher() {
441 }
442 
IsInputUtf8()443 bool PhoneNumberMatcher::IsInputUtf8() {
444   UnicodeText number_as_unicode;
445   number_as_unicode.PointToUTF8(text_.c_str(), text_.size());
446   return number_as_unicode.UTF8WasValid();
447 }
448 
449 // static
IsLatinLetter(char32 letter)450 bool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
451   // Combining marks are a subset of non-spacing-mark.
452   if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) {
453     return false;
454   }
455   UBlockCode block = ublock_getCode(letter);
456   return ((block == UBLOCK_BASIC_LATIN) ||
457       (block == UBLOCK_LATIN_1_SUPPLEMENT) ||
458       (block == UBLOCK_LATIN_EXTENDED_A) ||
459       (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
460       (block == UBLOCK_LATIN_EXTENDED_B) ||
461       (block == UBLOCK_COMBINING_DIACRITICAL_MARKS));
462 }
463 
ParseAndVerify(const string & candidate,int offset,PhoneNumberMatch * match)464 bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
465                                         PhoneNumberMatch* match) {
466   DCHECK(match);
467   // Check the candidate doesn't contain any formatting which would indicate
468   // that it really isn't a phone number.
469   if (!reg_exps_->matching_brackets_->FullMatch(candidate) ||
470       reg_exps_->pub_pages_->PartialMatch(candidate)) {
471     return false;
472   }
473 
474   // If leniency is set to VALID or stricter, we also want to skip numbers that
475   // are surrounded by Latin alphabetic characters, to skip cases like
476   // abc8005001234 or 8005001234def.
477   if (leniency_ >= VALID) {
478     // If the candidate is not at the start of the text, and does not start with
479     // phone-number punctuation, check the previous character.
480     scoped_ptr<RegExpInput> candidate_input(
481         reg_exps_->regexp_factory_->CreateInput(candidate));
482     if (offset > 0 &&
483         !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
484       char32 previous_char;
485       const char* previous_char_ptr =
486           EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
487                                                 text_.c_str() + offset);
488       EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
489       // We return false if it is a latin letter or an invalid punctuation
490       // symbol.
491       if (IsInvalidPunctuationSymbol(previous_char) ||
492           IsLatinLetter(previous_char)) {
493         return false;
494       }
495     }
496     size_t lastCharIndex = offset + candidate.length();
497     if (lastCharIndex < text_.length()) {
498       char32 next_char;
499       const char* next_char_ptr =
500           EncodingUtils::AdvanceOneUTF8Character(
501               text_.c_str() + lastCharIndex - 1);
502       EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
503       if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) {
504         return false;
505       }
506     }
507   }
508 
509   PhoneNumber number;
510   if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) !=
511       PhoneNumberUtil::NO_PARSING_ERROR) {
512     return false;
513   }
514 
515   if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
516     match->set_start(offset);
517     match->set_raw_string(candidate);
518     // We used ParseAndKeepRawInput to create this number, but for now we don't
519     // return the extra values parsed. TODO: stop clearing all values here and
520     // switch all users over to using raw_input() rather than the raw_string()
521     // of PhoneNumberMatch.
522     number.clear_country_code_source();
523     number.clear_preferred_domestic_carrier_code();
524     number.clear_raw_input();
525     match->set_number(number);
526     return true;
527   }
528   return false;
529 }
530 
531 // Helper method to replace the verification method for each enum in the Java
532 // version.
VerifyAccordingToLeniency(Leniency leniency,const PhoneNumber & number,const string & candidate) const533 bool PhoneNumberMatcher::VerifyAccordingToLeniency(
534     Leniency leniency, const PhoneNumber& number,
535     const string& candidate) const {
536   switch (leniency) {
537     case PhoneNumberMatcher::POSSIBLE:
538       return phone_util_.IsPossibleNumber(number);
539     case PhoneNumberMatcher::VALID:
540       if (!phone_util_.IsValidNumber(number) ||
541           !ContainsOnlyValidXChars(number, candidate, phone_util_)) {
542         return false;
543       }
544       return IsNationalPrefixPresentIfRequired(number);
545     case PhoneNumberMatcher::STRICT_GROUPING: {
546       if (!phone_util_.IsValidNumber(number) ||
547           !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
548           ContainsMoreThanOneSlashInNationalNumber(
549               number, candidate, phone_util_) ||
550           !IsNationalPrefixPresentIfRequired(number)) {
551         return false;
552       }
553       ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
554                       const string&, const std::vector<string>&>* callback =
555           NewPermanentCallback(&AllNumberGroupsRemainGrouped);
556       bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
557       delete(callback);
558       return is_valid;
559     }
560     case PhoneNumberMatcher::EXACT_GROUPING: {
561       if (!phone_util_.IsValidNumber(number) ||
562           !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
563           ContainsMoreThanOneSlashInNationalNumber(
564               number, candidate, phone_util_) ||
565           !IsNationalPrefixPresentIfRequired(number)) {
566         return false;
567       }
568       ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
569                       const string&, const std::vector<string>&>* callback =
570           NewPermanentCallback(
571               this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
572       bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
573       delete(callback);
574       return is_valid;
575     }
576     default:
577       LOG(ERROR) << "No implementation defined for verification for leniency "
578                  << static_cast<int>(leniency);
579       return false;
580   }
581 }
582 
ExtractInnerMatch(const string & candidate,int offset,PhoneNumberMatch * match)583 bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
584                                            PhoneNumberMatch* match) {
585   DCHECK(match);
586   for (std::vector<const RegExp*>::const_iterator regex =
587            reg_exps_->inner_matches_->begin();
588            regex != reg_exps_->inner_matches_->end(); regex++) {
589     scoped_ptr<RegExpInput> candidate_input(
590         reg_exps_->regexp_factory_->CreateInput(candidate));
591     bool is_first_match = true;
592     string group;
593     while ((*regex)->FindAndConsume(candidate_input.get(), &group) &&
594            max_tries_ > 0) {
595       int group_start_index = static_cast<int>(candidate.length() -
596           candidate_input->ToString().length() - group.length());
597       if (is_first_match) {
598         // We should handle any group before this one too.
599         string first_group_only = candidate.substr(0, group_start_index);
600         phone_util_.TrimUnwantedEndChars(&first_group_only);
601         bool success = ParseAndVerify(first_group_only, offset, match);
602         if (success) {
603           return true;
604         }
605         --max_tries_;
606         is_first_match = false;
607       }
608       phone_util_.TrimUnwantedEndChars(&group);
609       bool success = ParseAndVerify(group, offset + group_start_index, match);
610       if (success) {
611         return true;
612       }
613       --max_tries_;
614     }
615   }
616   return false;
617 }
618 
ExtractMatch(const string & candidate,int offset,PhoneNumberMatch * match)619 bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
620                                       PhoneNumberMatch* match) {
621   DCHECK(match);
622   // Skip a match that is more likely to be a date.
623   if (reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
624     return false;
625   }
626 
627   // Skip potential time-stamps.
628   if (reg_exps_->time_stamps_->PartialMatch(candidate)) {
629     scoped_ptr<RegExpInput> following_text(
630         reg_exps_->regexp_factory_->CreateInput(
631             text_.substr(offset + candidate.size())));
632     if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) {
633       return false;
634     }
635   }
636 
637   // Try to come up with a valid match given the entire candidate.
638   if (ParseAndVerify(candidate, offset, match)) {
639     return true;
640   }
641 
642   // If that failed, try to find an "inner match" - there might be a phone
643   // number within this candidate.
644   return ExtractInnerMatch(candidate, offset, match);
645 }
646 
HasNext()647 bool PhoneNumberMatcher::HasNext() {
648   // Input should contain only UTF-8 characters.
649   if (!is_input_valid_utf8_) {
650     state_ = DONE;
651     return false;
652   }
653   if (state_ == NOT_READY) {
654     PhoneNumberMatch temp_match;
655     if (!Find(search_index_, &temp_match)) {
656       state_ = DONE;
657     } else {
658       last_match_.reset(new PhoneNumberMatch(temp_match.start(),
659                                              temp_match.raw_string(),
660                                              temp_match.number()));
661       search_index_ = last_match_->end();
662       state_ = READY;
663     }
664   }
665   return state_ == READY;
666 }
667 
Next(PhoneNumberMatch * match)668 bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
669   DCHECK(match);
670   // Check the state and find the next match as a side-effect if necessary.
671   if (!HasNext()) {
672     return false;
673   }
674   match->CopyFrom(*last_match_);
675   state_ = NOT_READY;
676   last_match_.reset(NULL);
677   return true;
678 }
679 
Find(int index,PhoneNumberMatch * match)680 bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
681   DCHECK(match);
682 
683   scoped_ptr<RegExpInput> text(
684       reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
685   string candidate;
686   while ((max_tries_ > 0) &&
687          reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
688     int start = static_cast<int>(text_.length() - text->ToString().length() - candidate.length());
689     // Check for extra numbers at the end.
690     reg_exps_->capture_up_to_second_number_start_pattern_->
691         PartialMatch(candidate, &candidate);
692     if (ExtractMatch(candidate, start, match)) {
693       return true;
694     }
695 
696     index = static_cast<int>(start + candidate.length());
697     --max_tries_;
698   }
699   return false;
700 }
701 
CheckNumberGroupingIsValid(const PhoneNumber & phone_number,const string & candidate,ResultCallback4<bool,const PhoneNumberUtil &,const PhoneNumber &,const string &,const std::vector<string> &> * checker) const702 bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
703     const PhoneNumber& phone_number,
704     const string& candidate,
705     ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
706                     const string&, const std::vector<string>&>* checker) const {
707   DCHECK(checker);
708   string normalized_candidate =
709       NormalizeUTF8::NormalizeDecimalDigits(candidate);
710   std::vector<string> formatted_number_groups;
711   GetNationalNumberGroups(phone_number, &formatted_number_groups);
712   if (checker->Run(phone_util_, phone_number, normalized_candidate,
713                    formatted_number_groups)) {
714     return true;
715   }
716   // If this didn't pass, see if there are any alternate formats that match, and
717   // try them instead.
718   const PhoneMetadata* alternate_formats =
719     alternate_formats_->GetAlternateFormatsForCountry(
720         phone_number.country_code());
721   if (alternate_formats) {
722     string national_significant_number;
723     phone_util_.GetNationalSignificantNumber(phone_number,
724                                              &national_significant_number);
725     for (RepeatedPtrField<NumberFormat>::const_iterator it =
726              alternate_formats->number_format().begin();
727          it != alternate_formats->number_format().end(); ++it) {
728       if (it->leading_digits_pattern_size() > 0) {
729         std::unique_ptr<RegExpInput> nsn_input(
730             reg_exps_->regexp_factory_->CreateInput(
731                 national_significant_number));
732         // There is only one leading digits pattern for alternate formats.
733         if (!reg_exps_->regexp_cache_.GetRegExp(
734                 it->leading_digits_pattern(0)).Consume(nsn_input.get())) {
735           // Leading digits don't match; try another one.
736           continue;
737         }
738       }
739       formatted_number_groups.clear();
740       GetNationalNumberGroupsForPattern(phone_number, &*it,
741                                         &formatted_number_groups);
742       if (checker->Run(phone_util_, phone_number, normalized_candidate,
743                        formatted_number_groups)) {
744         return true;
745       }
746     }
747   }
748   return false;
749 }
750 
GetNationalNumberGroups(const PhoneNumber & number,std::vector<string> * digit_blocks) const751 void PhoneNumberMatcher::GetNationalNumberGroups(
752     const PhoneNumber& number,
753     std::vector<string>* digit_blocks) const {
754   string rfc3966_format;
755   // This will be in the format +CC-DG1-DG2-DGX;ext=EXT where DG1..DGX
756   // represents groups of digits.
757   phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
758   // We remove the extension part from the formatted string before splitting
759   // it into different groups.
760   size_t end_index = rfc3966_format.find(';');
761   if (end_index == string::npos) {
762     end_index = rfc3966_format.length();
763   }
764   // The country-code will have a '-' following it.
765   size_t start_index = rfc3966_format.find('-') + 1;
766   SplitStringUsing(rfc3966_format.substr(start_index,
767                                          end_index - start_index),
768                    '-', digit_blocks);
769 }
770 
GetNationalNumberGroupsForPattern(const PhoneNumber & number,const NumberFormat * formatting_pattern,std::vector<string> * digit_blocks) const771 void PhoneNumberMatcher::GetNationalNumberGroupsForPattern(
772     const PhoneNumber& number,
773     const NumberFormat* formatting_pattern,
774     std::vector<string>* digit_blocks) const {
775   string rfc3966_format;
776   // We format the NSN only, and split that according to the separator.
777   string national_significant_number;
778   phone_util_.GetNationalSignificantNumber(number,
779                                            &national_significant_number);
780   phone_util_.FormatNsnUsingPattern(national_significant_number,
781                                     *formatting_pattern,
782                                     PhoneNumberUtil::RFC3966,
783                                     &rfc3966_format);
784   SplitStringUsing(rfc3966_format, '-', digit_blocks);
785 }
786 
IsNationalPrefixPresentIfRequired(const PhoneNumber & number) const787 bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
788     const PhoneNumber& number) const {
789   // First, check how we deduced the country code. If it was written in
790   // international format, then the national prefix is not required.
791   if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
792     return true;
793   }
794   string phone_number_region;
795   phone_util_.GetRegionCodeForCountryCode(
796       number.country_code(), &phone_number_region);
797   const PhoneMetadata* metadata =
798       phone_util_.GetMetadataForRegion(phone_number_region);
799   if (!metadata) {
800     return true;
801   }
802   // Check if a national prefix should be present when formatting this number.
803   string national_number;
804   phone_util_.GetNationalSignificantNumber(number, &national_number);
805   const NumberFormat* format_rule =
806       phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(),
807                                                    national_number);
808   // To do this, we check that a national prefix formatting rule was present and
809   // that it wasn't just the first-group symbol ($1) with punctuation.
810   if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) {
811     if (format_rule->national_prefix_optional_when_formatting()) {
812       // The national-prefix is optional in these cases, so we don't need to
813       // check if it was present.
814       return true;
815     }
816     if (phone_util_.FormattingRuleHasFirstGroupOnly(
817         format_rule->national_prefix_formatting_rule())) {
818       // National Prefix not needed for this number.
819       return true;
820     }
821     // Normalize the remainder.
822     string raw_input_copy(number.raw_input());
823     // Check if we found a national prefix and/or carrier code at the start of
824     // the raw input, and return the result.
825     phone_util_.NormalizeDigitsOnly(&raw_input_copy);
826     return phone_util_.MaybeStripNationalPrefixAndCarrierCode(
827         *metadata,
828         &raw_input_copy,
829         NULL);  // Don't need to keep the stripped carrier code.
830   }
831   return true;
832 }
833 
AllNumberGroupsAreExactlyPresent(const PhoneNumberUtil & util,const PhoneNumber & phone_number,const string & normalized_candidate,const std::vector<string> & formatted_number_groups) const834 bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
835     const PhoneNumberUtil& util,
836     const PhoneNumber& phone_number,
837     const string& normalized_candidate,
838     const std::vector<string>& formatted_number_groups) const {
839   const scoped_ptr<RegExpInput> candidate_number(
840       reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
841   std::vector<string> candidate_groups;
842   string digit_block;
843   while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
844              candidate_number.get(),
845              &digit_block)) {
846     candidate_groups.push_back(digit_block);
847   }
848 
849   // Set this to the last group, skipping it if the number has an extension.
850   int candidate_number_group_index = static_cast<int>(
851       phone_number.has_extension() ? candidate_groups.size() - 2
852                                    : candidate_groups.size() - 1);
853   // First we check if the national significant number is formatted as a block.
854   // We use find and not equals, since the national significant number may be
855   // present with a prefix such as a national number prefix, or the country code
856   // itself.
857   string national_significant_number;
858   util.GetNationalSignificantNumber(phone_number,
859                                     &national_significant_number);
860   if (candidate_groups.size() == 1 ||
861       candidate_groups.at(candidate_number_group_index).find(
862           national_significant_number) != string::npos) {
863     return true;
864   }
865   // Starting from the end, go through in reverse, excluding the first group,
866   // and check the candidate and number groups are the same.
867   for (int formatted_number_group_index =
868            static_cast<int>(formatted_number_groups.size() - 1);
869        formatted_number_group_index > 0 &&
870        candidate_number_group_index >= 0;
871        --formatted_number_group_index, --candidate_number_group_index) {
872     if (candidate_groups.at(candidate_number_group_index) !=
873         formatted_number_groups.at(formatted_number_group_index)) {
874       return false;
875     }
876   }
877   // Now check the first group. There may be a national prefix at the start, so
878   // we only check that the candidate group ends with the formatted number
879   // group.
880   return (candidate_number_group_index >= 0 &&
881           HasSuffixString(candidate_groups.at(candidate_number_group_index),
882                           formatted_number_groups.at(0)));
883 }
884 
885 // static
ContainsMoreThanOneSlashInNationalNumber(const PhoneNumber & number,const string & candidate,const PhoneNumberUtil & util)886 bool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber(
887     const PhoneNumber& number,
888     const string& candidate,
889     const PhoneNumberUtil& util) {
890   size_t first_slash_in_body = candidate.find('/');
891   if (first_slash_in_body == string::npos) {
892     // No slashes, this is okay.
893     return false;
894   }
895   // Now look for a second one.
896   size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1);
897   if (second_slash_in_body == string::npos) {
898     // Only one slash, this is okay.
899     return false;
900   }
901 
902   // If the first slash is after the country calling code, this is permitted.
903   if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN ||
904       number.country_code_source() ==
905           PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) {
906     string normalized_country_code =
907         candidate.substr(0, first_slash_in_body);
908     util.NormalizeDigitsOnly(&normalized_country_code);
909     if (normalized_country_code == SimpleItoa(number.country_code())) {
910       // Any more slashes and this is illegal.
911       return candidate.find('/', second_slash_in_body + 1) != string::npos;
912     }
913   }
914   return true;
915 }
916 
917 }  // namespace phonenumbers
918 }  // namespace i18n
919