• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2011 The Libphonenumber Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: Lara Rennie
16 // Author: Tao Huang
17 //
18 // Implementation of a stateful class that finds and extracts telephone numbers
19 // from text.
20 
21 #include "phonenumbers/phonenumbermatcher.h"
22 
23 #ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP
24 #error phonenumbermatcher depends on ICU \
25     (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set)
26 #endif  // I18N_PHONENUMBERS_USE_ICU_REGEXP
27 
28 #include <ctype.h>
29 #include <stddef.h>
30 #include <limits>
31 #include <map>
32 #include <string>
33 #include <utility>
34 #include <vector>
35 
36 #include <unicode/uchar.h>
37 
38 #include "phonenumbers/alternate_format.h"
39 #include "phonenumbers/base/logging.h"
40 #include "phonenumbers/base/memory/scoped_ptr.h"
41 #include "phonenumbers/base/memory/singleton.h"
42 #include "phonenumbers/callback.h"
43 #include "phonenumbers/default_logger.h"
44 #include "phonenumbers/encoding_utils.h"
45 #include "phonenumbers/normalize_utf8.h"
46 #include "phonenumbers/phonemetadata.pb.h"
47 #include "phonenumbers/phonenumber.pb.h"
48 #include "phonenumbers/phonenumbermatch.h"
49 #include "phonenumbers/phonenumberutil.h"
50 #include "phonenumbers/regexp_adapter.h"
51 #include "phonenumbers/regexp_adapter_icu.h"
52 #include "phonenumbers/regexp_cache.h"
53 #include "phonenumbers/stringutil.h"
54 
55 #ifdef I18N_PHONENUMBERS_USE_RE2
56 #include "phonenumbers/regexp_adapter_re2.h"
57 #endif  // I18N_PHONENUMBERS_USE_RE2
58 
59 using std::map;
60 using std::numeric_limits;
61 using std::string;
62 
63 namespace i18n {
64 namespace phonenumbers {
65 
66 namespace {
67 // Returns a regular expression quantifier with an upper and lower limit.
Limit(int lower,int upper)68 string Limit(int lower, int upper) {
69   DCHECK_GE(lower, 0);
70   DCHECK_GT(upper, 0);
71   DCHECK_LT(lower, upper);
72   return StrCat("{", lower, ",", upper, "}");
73 }
74 
IsInvalidPunctuationSymbol(char32 character)75 bool IsInvalidPunctuationSymbol(char32 character) {
76   return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL;
77 }
78 
ContainsOnlyValidXChars(const PhoneNumber & number,const string & candidate,const PhoneNumberUtil & util)79 bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
80                              const PhoneNumberUtil& util) {
81   // The characters 'x' and 'X' can be (1) a carrier code, in which case they
82   // always precede the national significant number or (2) an extension sign,
83   // in which case they always precede the extension number. We assume a
84   // carrier code is more than 1 digit, so the first case has to have more than
85   // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1
86   // 'x' or 'X'.
87   size_t found;
88   found = candidate.find_first_of("xX");
89   // We ignore the character if 'x' or 'X' appears as the last character of
90   // the string.
91   while (found != string::npos && found < candidate.length() - 1) {
92     // We only look for 'x' or 'X' in ASCII form.
93     char next_char = candidate[found + 1];
94     if (next_char == 'x' || next_char == 'X') {
95       // This is the carrier code case, in which the 'X's always precede the
96       // national significant number.
97       ++found;
98       if (util.IsNumberMatchWithOneString(
99               number, candidate.substr(found, candidate.length() - found))
100           != PhoneNumberUtil::NSN_MATCH) {
101         return false;
102       }
103     } else {
104       string normalized_extension(candidate.substr(found,
105                                                    candidate.length() - found));
106       util.NormalizeDigitsOnly(&normalized_extension);
107       if (normalized_extension != number.extension()) {
108         return false;
109       }
110     }
111     found = candidate.find_first_of("xX", found + 1);
112   }
113   return true;
114 }
115 
AllNumberGroupsRemainGrouped(const PhoneNumberUtil & util,const PhoneNumber & number,const string & normalized_candidate,const std::vector<string> & formatted_number_groups)116 bool AllNumberGroupsRemainGrouped(
117     const PhoneNumberUtil& util,
118     const PhoneNumber& number,
119     const string& normalized_candidate,
120     const std::vector<string>& formatted_number_groups) {
121   size_t from_index = 0;
122   if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
123     // First skip the country code if the normalized candidate contained it.
124     string country_code = SimpleItoa(number.country_code());
125     from_index = normalized_candidate.find(country_code) + country_code.size();
126   }
127   // Check each group of consecutive digits are not broken into separate
128   // groupings in the normalized_candidate string.
129   for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
130     // Fails if the substring of normalized_candidate starting from from_index
131     // doesn't contain the consecutive digits in formatted_number_groups.at(i).
132     from_index = normalized_candidate.find(formatted_number_groups.at(i),
133                                            from_index);
134     if (from_index == string::npos) {
135       return false;
136     }
137     // Moves from_index forward.
138     from_index += formatted_number_groups.at(i).length();
139     if (i == 0 && from_index < normalized_candidate.length()) {
140       // We are at the position right after the NDC. We get the region used for
141       // formatting information based on the country code in the phone number,
142       // rather than the number itself, as we do not need to distinguish between
143       // different countries with the same country calling code and this is
144       // faster.
145       string region;
146       util.GetRegionCodeForCountryCode(number.country_code(), &region);
147       string ndd_prefix;
148       util.GetNddPrefixForRegion(region, true, &ndd_prefix);
149       // Note although normalized_candidate might contain non-ASCII formatting
150       // characters, they won't be treated as ASCII digits when converted to a
151       // char.
152       if (!ndd_prefix.empty() && isdigit(normalized_candidate.at(from_index))) {
153         // This means there is no formatting symbol after the NDC. In this case,
154         // we only accept the number if there is no formatting symbol at all in
155         // the number, except for extensions. This is only important for
156         // countries with national prefixes.
157         string national_significant_number;
158         util.GetNationalSignificantNumber(number, &national_significant_number);
159         return HasPrefixString(normalized_candidate.substr(
160             from_index - formatted_number_groups.at(i).length()),
161             national_significant_number);
162         }
163       }
164     }
165     // The check here makes sure that we haven't mistakenly already used the
166     // extension to match the last group of the subscriber number. Note the
167     // extension cannot have formatting in-between digits.
168     return normalized_candidate.substr(from_index)
169         .find(number.extension()) != string::npos;
170 }
171 
LoadAlternateFormats(PhoneMetadataCollection * alternate_formats)172 bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) {
173 #if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS)
174   if (!alternate_formats->ParseFromArray(alternate_format_get(),
175                                          alternate_format_size())) {
176     LOG(ERROR) << "Could not parse binary data.";
177     return false;
178   }
179   return true;
180 #else
181   return false;
182 #endif
183 }
184 
185 }  // namespace
186 
// Singleton holding the regular expressions used by PhoneNumberMatcher,
// together with the string fragments they are assembled from. Everything is
// built once in the constructor. C++ initializes members in declaration
// order and the initializer list below reads previously initialized members
// (e.g. non_parens_ is built from opening_parens_ and closing_parens_), so the
// declaration order of the members must not be changed.
class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
 private:
  friend class Singleton<PhoneNumberMatcherRegExps>;

  // Contents of a character class (without the enclosing "[" and "]") listing
  // the accepted opening brackets, ASCII and fullwidth.
  string opening_parens_;
  // Same as opening_parens_, for the closing brackets.
  string closing_parens_;
  // Character class matching any character that is not one of the brackets
  // above.
  string non_parens_;
  // Limit on the number of pairs of brackets in a phone number.
  string bracket_pair_limit_;
  // Helper strings for the matching_brackets_ pattern.
  // An opening bracket at the beginning may not be closed, but subsequent ones
  // should be. It's also possible that the leading bracket was dropped, so we
  // shouldn't be surprised if we see a closing bracket first.
  string leading_maybe_matched_bracket_;
  string bracket_pairs_;
  // Limit on the number of leading (plus) characters.
  string lead_limit_;
  // Limit on the number of consecutive punctuation characters.
  string punctuation_limit_;
  // The maximum number of digits allowed in a digit-separated block. As we
  // allow all digits in a single block, this should be set high enough to
  // accommodate the entire national number and the international country code.
  int digit_block_limit_;
  // Limit on the number of blocks separated by punctuation. Uses
  // digit_block_limit_ since some formats use spaces to separate each digit.
  string block_limit_;
  // A punctuation sequence allowing white space.
  string punctuation_;
  // A digits block without punctuation.
  string digit_sequence_;
  // Punctuation that may be at the start of a phone number - brackets and plus
  // signs.
  string lead_class_chars_;
  // Same as lead_class_chars_, but enclosed as a character class.
  string lead_class_;

 public:
  // We use two different reg-ex factories here for performance reasons. RE2 is
  // much faster for smaller reg-ex patterns, but the main pattern cannot be
  // handled by RE2 in an efficient way.
  // regexp_factory_for_pattern_ is always the ICU factory and compiles
  // pattern_; regexp_factory_ is the RE2 factory when I18N_PHONENUMBERS_USE_RE2
  // is defined (ICU otherwise) and compiles all the other expressions.
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_;

  // A cache for popular reg-exps of leading digits used to match formatting
  // patterns and the factory used to create it.
  mutable RegExpCache regexp_cache_;

  // Matches strings that look like publication pages. Example:
  // Computing Complete Answers to Queries in the Presence of Limited Access
  // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
  //
  // The string "211-227 (2003)" is not a telephone number.
  scoped_ptr<const RegExp> pub_pages_;
  // Matches strings that look like dates using "/" as a separator. Examples:
  // 3/10/2011, 31/10/96 or 08/31/95.
  scoped_ptr<const RegExp> slash_separated_dates_;
  // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
  // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
  scoped_ptr<const RegExp> time_stamps_;
  scoped_ptr<const RegExp> time_stamps_suffix_;
  // Pattern to check that brackets match. Opening brackets should be closed
  // within a phone number. This also checks that there is something inside the
  // brackets. Having no brackets at all is also fine.
  scoped_ptr<const RegExp> matching_brackets_;
  // Patterns used to extract phone numbers from a larger phone-number-like
  // pattern. These are ordered according to specificity. For example,
  // white-space is last since that is frequently used in numbers, not just to
  // separate two numbers. We have separate patterns since we don't want to
  // break up the phone-number-like text on more than one different kind of
  // symbol at one time, although symbols of the same type (e.g. space) can be
  // safely grouped together.
  //
  // Note that if there is a match, we will always check any text found up to
  // the first match as well.
  scoped_ptr<std::vector<const RegExp*> > inner_matches_;
  // Compiled from PhoneNumberUtil::kCaptureUpToSecondNumberStart.
  scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
  // Captures a run of one or more digits ("(\d+)").
  scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
  // Compiled reg-ex representing lead_class_;
  scoped_ptr<const RegExp> lead_class_pattern_;
  // Phone number pattern allowing optional punctuation.
  scoped_ptr<const RegExp> pattern_;

  // Builds every string fragment and compiles every regular expression. The
  // inner_matches_ patterns are appended in the constructor body, in the
  // order in which they are meant to be tried.
  PhoneNumberMatcherRegExps()
      : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB"
                        /* "(", "[", U+FF08 and U+FF3B (fullwidth forms) */),
        closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD"
                        /* ")", "]", U+FF09 and U+FF3D (fullwidth forms) */),
        non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
        bracket_pair_limit_(Limit(0, 3)),
        leading_maybe_matched_bracket_(StrCat(
            "(?:[", opening_parens_, "])?",
            "(?:", non_parens_, "+[", closing_parens_, "])?")),
        bracket_pairs_(StrCat(
            "(?:[", opening_parens_, "]", non_parens_, "+",
            "[", closing_parens_, "])", bracket_pair_limit_)),
        lead_limit_(Limit(0, 2)),
        punctuation_limit_(Limit(0, 4)),
        digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
                           PhoneNumberUtil::kMaxLengthCountryCode),
        block_limit_(Limit(0, digit_block_limit_)),
        punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
                            punctuation_limit_)),
        digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
        lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
        lead_class_(StrCat("[", lead_class_chars_, "]")),
        regexp_factory_for_pattern_(new ICURegExpFactory()),
#ifdef I18N_PHONENUMBERS_USE_RE2
        regexp_factory_(new RE2RegExpFactory()),
#else
        regexp_factory_(new ICURegExpFactory()),
#endif  // I18N_PHONENUMBERS_USE_RE2
        // A cache for frequently used country-specific regular expressions. Set
        // to 32 to cover ~2-3 countries being used for the same doc with ~10
        // patterns for each country. Some pages will have a lot more countries
        // in use, but typically fewer numbers for each so expanding the cache
        // for that use-case won't have a lot of benefit.
        regexp_cache_(*regexp_factory_, 32),
        pub_pages_(regexp_factory_->CreateRegExp(
            "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
        slash_separated_dates_(regexp_factory_->CreateRegExp(
            "(?:(?:[0-3]?\\d/[01]?\\d)|"
            "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
        time_stamps_(regexp_factory_->CreateRegExp(
            "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$")),
        time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
        matching_brackets_(regexp_factory_->CreateRegExp(
            StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
                   bracket_pairs_, non_parens_, "*"))),
        inner_matches_(new std::vector<const RegExp*>()),
        capture_up_to_second_number_start_pattern_(
            regexp_factory_->CreateRegExp(
                PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
        capturing_ascii_digits_pattern_(
            regexp_factory_->CreateRegExp("(\\d+)")),
        lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
        // Optional lead characters, a first digit block, further
        // punctuation-separated digit blocks, then an optional
        // case-insensitive extension.
        pattern_(regexp_factory_for_pattern_->CreateRegExp(StrCat(
            "((?:", lead_class_, punctuation_, ")", lead_limit_,
            digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
            block_limit_, "(?i)(?:",
            PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
            ")?)"))) {
    inner_matches_->push_back(
        // Breaks on the slash - e.g. "651-234-2345/332-445-1234"
        regexp_factory_->CreateRegExp("/+(.*)"));
    inner_matches_->push_back(
        // Note that the bracket here is inside the capturing group, since we
        // consider it part of the phone number. Will match a pattern like
        // "(650) 223 3345 (754) 223 3321".
        regexp_factory_->CreateRegExp("(\\([^(]*)"));
    inner_matches_->push_back(
        // Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We
        // require a space on either side of the hyphen for it to be considered
        // a separator.
        regexp_factory_->CreateRegExp("(?:\\p{Z}-|-\\p{Z})\\p{Z}*(.+)"));
    inner_matches_->push_back(
        // Various types of wide hyphens. Note we have decided not to enforce a
        // space here, since it's possible that it's supposed to be used to
        // break two numbers without spaces, and we haven't seen many instances
        // of it used within a number.
        regexp_factory_->CreateRegExp(
            "[\xE2\x80\x92-\xE2\x80\x95\xEF\xBC\x8D]"
            /* range U+2012 to U+2015, plus U+FF0D (fullwidth hyphen-minus) */
            "\\p{Z}*(.+)"));
    inner_matches_->push_back(
        // Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
        regexp_factory_->CreateRegExp("\\.+\\p{Z}*([^.]+)"));
    inner_matches_->push_back(
        // Breaks on space - e.g. "3324451234 8002341234"
        regexp_factory_->CreateRegExp("\\p{Z}+(\\P{Z}+)"));
  }

 private:
  DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
};
358 
359 class AlternateFormats : public Singleton<AlternateFormats> {
360  public:
361   PhoneMetadataCollection format_data_;
362 
363   map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_;
364 
AlternateFormats()365   AlternateFormats()
366       : format_data_(),
367         calling_code_to_alternate_formats_map_() {
368     if (!LoadAlternateFormats(&format_data_)) {
369       LOG(DFATAL) << "Could not parse compiled-in metadata.";
370       return;
371     }
372     for (RepeatedPtrField<PhoneMetadata>::const_iterator it =
373              format_data_.metadata().begin();
374          it != format_data_.metadata().end();
375          ++it) {
376       calling_code_to_alternate_formats_map_.insert(
377           std::make_pair(it->country_code(), &*it));
378     }
379   }
380 
GetAlternateFormatsForCountry(int country_calling_code) const381   const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code)
382       const {
383     map<int, const PhoneMetadata*>::const_iterator it =
384         calling_code_to_alternate_formats_map_.find(country_calling_code);
385     if (it != calling_code_to_alternate_formats_map_.end()) {
386       return it->second;
387     }
388     return NULL;
389   }
390 
391  private:
392   DISALLOW_COPY_AND_ASSIGN(AlternateFormats);
393 };
394 
// Creates a matcher over |text| that uses the supplied |util|, |leniency| and
// |max_tries| budget, with |region_code| as the preferred region for parsing.
// The shared regular-expression and alternate-format singletons are looked up
// here. The matcher starts in the NOT_READY state with no cached match and a
// search position of 0.
PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
                                       const string& text,
                                       const string& region_code,
                                       PhoneNumberMatcher::Leniency leniency,
                                       int max_tries)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      alternate_formats_(AlternateFormats::GetInstance()),
      phone_util_(util),
      text_(text),
      preferred_region_(region_code),
      leniency_(leniency),
      max_tries_(max_tries),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0) {
}
411 
// Convenience constructor: creates a matcher over |text| that uses the
// PhoneNumberUtil singleton, VALID leniency and the largest possible number of
// tries, with |region_code| as the preferred region for parsing.
// alternate_formats_ is left NULL here; it is only dereferenced by the number
// grouping check, which the VALID leniency does not perform.
PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
                                       const string& region_code)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      alternate_formats_(NULL),  // Not used.
      phone_util_(*PhoneNumberUtil::GetInstance()),
      text_(text),
      preferred_region_(region_code),
      leniency_(VALID),
      max_tries_(numeric_limits<int>::max()),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0) {
}
425 
// The destructor has no explicit work to do; members clean up after
// themselves.
PhoneNumberMatcher::~PhoneNumberMatcher() {
}
428 
429 // static
IsLatinLetter(char32 letter)430 bool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
431   // Combining marks are a subset of non-spacing-mark.
432   if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) {
433     return false;
434   }
435   UBlockCode block = ublock_getCode(letter);
436   return ((block == UBLOCK_BASIC_LATIN) ||
437       (block == UBLOCK_LATIN_1_SUPPLEMENT) ||
438       (block == UBLOCK_LATIN_EXTENDED_A) ||
439       (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
440       (block == UBLOCK_LATIN_EXTENDED_B) ||
441       (block == UBLOCK_COMBINING_DIACRITICAL_MARKS));
442 }
443 
ParseAndVerify(const string & candidate,int offset,PhoneNumberMatch * match)444 bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
445                                         PhoneNumberMatch* match) {
446   DCHECK(match);
447   // Check the candidate doesn't contain any formatting which would indicate
448   // that it really isn't a phone number.
449   if (!reg_exps_->matching_brackets_->FullMatch(candidate) ||
450       reg_exps_->pub_pages_->PartialMatch(candidate)) {
451     return false;
452   }
453 
454   // If leniency is set to VALID or stricter, we also want to skip numbers that
455   // are surrounded by Latin alphabetic characters, to skip cases like
456   // abc8005001234 or 8005001234def.
457   if (leniency_ >= VALID) {
458     // If the candidate is not at the start of the text, and does not start with
459     // phone-number punctuation, check the previous character.
460     scoped_ptr<RegExpInput> candidate_input(
461         reg_exps_->regexp_factory_->CreateInput(candidate));
462     if (offset > 0 &&
463         !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
464       char32 previous_char;
465       const char* previous_char_ptr =
466           EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
467                                                 text_.c_str() + offset);
468       EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
469       // We return false if it is a latin letter or an invalid punctuation
470       // symbol.
471       if (IsInvalidPunctuationSymbol(previous_char) ||
472           IsLatinLetter(previous_char)) {
473         return false;
474       }
475     }
476     size_t lastCharIndex = offset + candidate.length();
477     if (lastCharIndex < text_.length()) {
478       char32 next_char;
479       const char* next_char_ptr =
480           EncodingUtils::AdvanceOneUTF8Character(
481               text_.c_str() + lastCharIndex - 1);
482       EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
483       if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) {
484         return false;
485       }
486     }
487   }
488 
489   PhoneNumber number;
490   if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) !=
491       PhoneNumberUtil::NO_PARSING_ERROR) {
492     return false;
493   }
494 
495   if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
496     match->set_start(offset);
497     match->set_raw_string(candidate);
498     // We used ParseAndKeepRawInput to create this number, but for now we don't
499     // return the extra values parsed. TODO: stop clearing all values here and
500     // switch all users over to using raw_input() rather than the raw_string()
501     // of PhoneNumberMatch.
502     number.clear_country_code_source();
503     number.clear_preferred_domestic_carrier_code();
504     number.clear_raw_input();
505     match->set_number(number);
506     return true;
507   }
508   return false;
509 }
510 
511 // Helper method to replace the verification method for each enum in the Java
512 // version.
VerifyAccordingToLeniency(Leniency leniency,const PhoneNumber & number,const string & candidate) const513 bool PhoneNumberMatcher::VerifyAccordingToLeniency(
514     Leniency leniency, const PhoneNumber& number,
515     const string& candidate) const {
516   switch (leniency) {
517     case PhoneNumberMatcher::POSSIBLE:
518       return phone_util_.IsPossibleNumber(number);
519     case PhoneNumberMatcher::VALID:
520       if (!phone_util_.IsValidNumber(number) ||
521           !ContainsOnlyValidXChars(number, candidate, phone_util_)) {
522         return false;
523       }
524       return IsNationalPrefixPresentIfRequired(number);
525     case PhoneNumberMatcher::STRICT_GROUPING: {
526       if (!phone_util_.IsValidNumber(number) ||
527           !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
528           ContainsMoreThanOneSlashInNationalNumber(
529               number, candidate, phone_util_) ||
530           !IsNationalPrefixPresentIfRequired(number)) {
531         return false;
532       }
533       ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
534                       const string&, const std::vector<string>&>* callback =
535           NewPermanentCallback(&AllNumberGroupsRemainGrouped);
536       bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
537       delete(callback);
538       return is_valid;
539     }
540     case PhoneNumberMatcher::EXACT_GROUPING: {
541       if (!phone_util_.IsValidNumber(number) ||
542           !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
543           ContainsMoreThanOneSlashInNationalNumber(
544               number, candidate, phone_util_) ||
545           !IsNationalPrefixPresentIfRequired(number)) {
546         return false;
547       }
548       ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
549                       const string&, const std::vector<string>&>* callback =
550           NewPermanentCallback(
551               this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
552       bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
553       delete(callback);
554       return is_valid;
555     }
556     default:
557       LOG(ERROR) << "No implementation defined for verification for leniency "
558                  << static_cast<int>(leniency);
559       return false;
560   }
561 }
562 
ExtractInnerMatch(const string & candidate,int offset,PhoneNumberMatch * match)563 bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
564                                            PhoneNumberMatch* match) {
565   DCHECK(match);
566   for (std::vector<const RegExp*>::const_iterator regex =
567            reg_exps_->inner_matches_->begin();
568            regex != reg_exps_->inner_matches_->end(); regex++) {
569     scoped_ptr<RegExpInput> candidate_input(
570         reg_exps_->regexp_factory_->CreateInput(candidate));
571     bool is_first_match = true;
572     string group;
573     while ((*regex)->FindAndConsume(candidate_input.get(), &group) &&
574            max_tries_ > 0) {
575       int group_start_index = candidate.length() -
576           candidate_input->ToString().length() - group.length();
577       if (is_first_match) {
578         // We should handle any group before this one too.
579         string first_group_only = candidate.substr(0, group_start_index);
580         phone_util_.TrimUnwantedEndChars(&first_group_only);
581         bool success = ParseAndVerify(first_group_only, offset, match);
582         if (success) {
583           return true;
584         }
585         --max_tries_;
586         is_first_match = false;
587       }
588       phone_util_.TrimUnwantedEndChars(&group);
589       bool success = ParseAndVerify(group, offset + group_start_index, match);
590       if (success) {
591         return true;
592       }
593       --max_tries_;
594     }
595   }
596   return false;
597 }
598 
ExtractMatch(const string & candidate,int offset,PhoneNumberMatch * match)599 bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
600                                       PhoneNumberMatch* match) {
601   DCHECK(match);
602   // Skip a match that is more likely to be a date.
603   if (reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
604     return false;
605   }
606 
607   // Skip potential time-stamps.
608   if (reg_exps_->time_stamps_->PartialMatch(candidate)) {
609     scoped_ptr<RegExpInput> following_text(
610         reg_exps_->regexp_factory_->CreateInput(
611             text_.substr(offset + candidate.size())));
612     if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) {
613       return false;
614     }
615   }
616 
617   // Try to come up with a valid match given the entire candidate.
618   if (ParseAndVerify(candidate, offset, match)) {
619     return true;
620   }
621 
622   // If that failed, try to find an "inner match" - there might be a phone
623   // number within this candidate.
624   return ExtractInnerMatch(candidate, offset, match);
625 }
626 
HasNext()627 bool PhoneNumberMatcher::HasNext() {
628   if (state_ == NOT_READY) {
629     PhoneNumberMatch temp_match;
630     if (!Find(search_index_, &temp_match)) {
631       state_ = DONE;
632     } else {
633       last_match_.reset(new PhoneNumberMatch(temp_match.start(),
634                                              temp_match.raw_string(),
635                                              temp_match.number()));
636       search_index_ = last_match_->end();
637       state_ = READY;
638     }
639   }
640   return state_ == READY;
641 }
642 
Next(PhoneNumberMatch * match)643 bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
644   DCHECK(match);
645   // Check the state and find the next match as a side-effect if necessary.
646   if (!HasNext()) {
647     return false;
648   }
649   match->CopyFrom(*last_match_);
650   state_ = NOT_READY;
651   last_match_.reset(NULL);
652   return true;
653 }
654 
Find(int index,PhoneNumberMatch * match)655 bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
656   DCHECK(match);
657 
658   scoped_ptr<RegExpInput> text(
659       reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
660   string candidate;
661   while ((max_tries_ > 0) &&
662          reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
663     int start = text_.length() - text->ToString().length() - candidate.length();
664     // Check for extra numbers at the end.
665     reg_exps_->capture_up_to_second_number_start_pattern_->
666         PartialMatch(candidate, &candidate);
667     if (ExtractMatch(candidate, start, match)) {
668       return true;
669     }
670 
671     index = start + candidate.length();
672     --max_tries_;
673   }
674   return false;
675 }
676 
CheckNumberGroupingIsValid(const PhoneNumber & phone_number,const string & candidate,ResultCallback4<bool,const PhoneNumberUtil &,const PhoneNumber &,const string &,const std::vector<string> &> * checker) const677 bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
678     const PhoneNumber& phone_number,
679     const string& candidate,
680     ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
681                     const string&, const std::vector<string>&>* checker) const {
682   DCHECK(checker);
683   string normalized_candidate =
684       NormalizeUTF8::NormalizeDecimalDigits(candidate);
685   std::vector<string> formatted_number_groups;
686   GetNationalNumberGroups(phone_number, &formatted_number_groups);
687   if (checker->Run(phone_util_, phone_number, normalized_candidate,
688                    formatted_number_groups)) {
689     return true;
690   }
691   // If this didn't pass, see if there are any alternate formats that match, and
692   // try them instead.
693   const PhoneMetadata* alternate_formats =
694     alternate_formats_->GetAlternateFormatsForCountry(
695         phone_number.country_code());
696   if (alternate_formats) {
697     string national_significant_number;
698     phone_util_.GetNationalSignificantNumber(phone_number,
699                                              &national_significant_number);
700     for (RepeatedPtrField<NumberFormat>::const_iterator it =
701              alternate_formats->number_format().begin();
702          it != alternate_formats->number_format().end(); ++it) {
703       if (it->leading_digits_pattern_size() > 0) {
704         std::unique_ptr<RegExpInput> nsn_input(
705             reg_exps_->regexp_factory_->CreateInput(
706                 national_significant_number));
707         // There is only one leading digits pattern for alternate formats.
708         if (!reg_exps_->regexp_cache_.GetRegExp(
709                 it->leading_digits_pattern(0)).Consume(nsn_input.get())) {
710           // Leading digits don't match; try another one.
711           continue;
712         }
713       }
714       formatted_number_groups.clear();
715       GetNationalNumberGroupsForPattern(phone_number, &*it,
716                                         &formatted_number_groups);
717       if (checker->Run(phone_util_, phone_number, normalized_candidate,
718                        formatted_number_groups)) {
719         return true;
720       }
721     }
722   }
723   return false;
724 }
725 
GetNationalNumberGroups(const PhoneNumber & number,std::vector<string> * digit_blocks) const726 void PhoneNumberMatcher::GetNationalNumberGroups(
727     const PhoneNumber& number,
728     std::vector<string>* digit_blocks) const {
729   string rfc3966_format;
730   // This will be in the format +CC-DG1-DG2-DGX;ext=EXT where DG1..DGX
731   // represents groups of digits.
732   phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
733   // We remove the extension part from the formatted string before splitting
734   // it into different groups.
735   size_t end_index = rfc3966_format.find(';');
736   if (end_index == string::npos) {
737     end_index = rfc3966_format.length();
738   }
739   // The country-code will have a '-' following it.
740   size_t start_index = rfc3966_format.find('-') + 1;
741   SplitStringUsing(rfc3966_format.substr(start_index,
742                                          end_index - start_index),
743                    "-", digit_blocks);
744 }
745 
GetNationalNumberGroupsForPattern(const PhoneNumber & number,const NumberFormat * formatting_pattern,std::vector<string> * digit_blocks) const746 void PhoneNumberMatcher::GetNationalNumberGroupsForPattern(
747     const PhoneNumber& number,
748     const NumberFormat* formatting_pattern,
749     std::vector<string>* digit_blocks) const {
750   string rfc3966_format;
751   // We format the NSN only, and split that according to the separator.
752   string national_significant_number;
753   phone_util_.GetNationalSignificantNumber(number,
754                                            &national_significant_number);
755   phone_util_.FormatNsnUsingPattern(national_significant_number,
756                                     *formatting_pattern,
757                                     PhoneNumberUtil::RFC3966,
758                                     &rfc3966_format);
759   SplitStringUsing(rfc3966_format, "-", digit_blocks);
760 }
761 
IsNationalPrefixPresentIfRequired(const PhoneNumber & number) const762 bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
763     const PhoneNumber& number) const {
764   // First, check how we deduced the country code. If it was written in
765   // international format, then the national prefix is not required.
766   if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
767     return true;
768   }
769   string phone_number_region;
770   phone_util_.GetRegionCodeForCountryCode(
771       number.country_code(), &phone_number_region);
772   const PhoneMetadata* metadata =
773       phone_util_.GetMetadataForRegion(phone_number_region);
774   if (!metadata) {
775     return true;
776   }
777   // Check if a national prefix should be present when formatting this number.
778   string national_number;
779   phone_util_.GetNationalSignificantNumber(number, &national_number);
780   const NumberFormat* format_rule =
781       phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(),
782                                                    national_number);
783   // To do this, we check that a national prefix formatting rule was present and
784   // that it wasn't just the first-group symbol ($1) with punctuation.
785   if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) {
786     if (format_rule->national_prefix_optional_when_formatting()) {
787       // The national-prefix is optional in these cases, so we don't need to
788       // check if it was present.
789       return true;
790     }
791     if (phone_util_.FormattingRuleHasFirstGroupOnly(
792         format_rule->national_prefix_formatting_rule())) {
793       // National Prefix not needed for this number.
794       return true;
795     }
796     // Normalize the remainder.
797     string raw_input_copy(number.raw_input());
798     // Check if we found a national prefix and/or carrier code at the start of
799     // the raw input, and return the result.
800     phone_util_.NormalizeDigitsOnly(&raw_input_copy);
801     return phone_util_.MaybeStripNationalPrefixAndCarrierCode(
802         *metadata,
803         &raw_input_copy,
804         NULL);  // Don't need to keep the stripped carrier code.
805   }
806   return true;
807 }
808 
AllNumberGroupsAreExactlyPresent(const PhoneNumberUtil & util,const PhoneNumber & phone_number,const string & normalized_candidate,const std::vector<string> & formatted_number_groups) const809 bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
810     const PhoneNumberUtil& util,
811     const PhoneNumber& phone_number,
812     const string& normalized_candidate,
813     const std::vector<string>& formatted_number_groups) const {
814   const scoped_ptr<RegExpInput> candidate_number(
815       reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
816   std::vector<string> candidate_groups;
817   string digit_block;
818   while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
819              candidate_number.get(),
820              &digit_block)) {
821     candidate_groups.push_back(digit_block);
822   }
823 
824   // Set this to the last group, skipping it if the number has an extension.
825   int candidate_number_group_index =
826       phone_number.has_extension() ? candidate_groups.size() - 2
827                                    : candidate_groups.size() - 1;
828   // First we check if the national significant number is formatted as a block.
829   // We use find and not equals, since the national significant number may be
830   // present with a prefix such as a national number prefix, or the country code
831   // itself.
832   string national_significant_number;
833   util.GetNationalSignificantNumber(phone_number,
834                                     &national_significant_number);
835   if (candidate_groups.size() == 1 ||
836       candidate_groups.at(candidate_number_group_index).find(
837           national_significant_number) != string::npos) {
838     return true;
839   }
840   // Starting from the end, go through in reverse, excluding the first group,
841   // and check the candidate and number groups are the same.
842   for (int formatted_number_group_index =
843            (formatted_number_groups.size() - 1);
844        formatted_number_group_index > 0 &&
845        candidate_number_group_index >= 0;
846        --formatted_number_group_index, --candidate_number_group_index) {
847     if (candidate_groups.at(candidate_number_group_index) !=
848         formatted_number_groups.at(formatted_number_group_index)) {
849       return false;
850     }
851   }
852   // Now check the first group. There may be a national prefix at the start, so
853   // we only check that the candidate group ends with the formatted number
854   // group.
855   return (candidate_number_group_index >= 0 &&
856           HasSuffixString(candidate_groups.at(candidate_number_group_index),
857                           formatted_number_groups.at(0)));
858 }
859 
860 // static
ContainsMoreThanOneSlashInNationalNumber(const PhoneNumber & number,const string & candidate,const PhoneNumberUtil & util)861 bool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber(
862     const PhoneNumber& number,
863     const string& candidate,
864     const PhoneNumberUtil& util) {
865   size_t first_slash_in_body = candidate.find('/');
866   if (first_slash_in_body == string::npos) {
867     // No slashes, this is okay.
868     return false;
869   }
870   // Now look for a second one.
871   size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1);
872   if (second_slash_in_body == string::npos) {
873     // Only one slash, this is okay.
874     return false;
875   }
876 
877   // If the first slash is after the country calling code, this is permitted.
878   if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN ||
879       number.country_code_source() ==
880           PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) {
881     string normalized_country_code =
882         candidate.substr(0, first_slash_in_body);
883     util.NormalizeDigitsOnly(&normalized_country_code);
884     if (normalized_country_code == SimpleItoa(number.country_code())) {
885       // Any more slashes and this is illegal.
886       return candidate.find('/', second_slash_in_body + 1) != string::npos;
887     }
888   }
889   return true;
890 }
891 
892 }  // namespace phonenumbers
893 }  // namespace i18n
894