• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2012 Google Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are
6  * met:
7  *
8  *    * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *    * Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following disclaimer
12  * in the documentation and/or other materials provided with the
13  * distribution.
14  *    * Neither the name of Google Inc. nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include "config.h"
32 
33 // Magic pretend-to-be-a-chromium-build flags
34 #undef WEBKIT_IMPLEMENTATION
35 #undef LOG
36 
37 #include "content/address_detector.h"
38 
39 #include <bitset>
40 
41 #include "base/utf_string_conversions.h"
42 #include "net/base/escape.h"
43 #include "Settings.h"
44 #include "WebString.h"
45 
46 #include <wtf/HashSet.h>
47 #include <wtf/Noncopyable.h>
48 #include <wtf/text/StringHash.h>
49 #include <wtf/text/WTFString.h>
50 
51 namespace {
52 
53 // Prefix used for geographical address intent URIs.
54 static const char kAddressSchemaPrefix[] = "geo:0,0?q=";
55 
56 // Maximum text length to be searched for address detection.
57 static const size_t kMaxAddressLength = 500;
58 
59 // Minimum number of words in an address after the house number
60 // before a state is expected to be found.
61 // A value too high can miss short addresses.
62 const size_t kMinAddressWords = 3;
63 
64 // Maximum number of words allowed in an address between the house number
65 // and the state, both not included.
66 const size_t kMaxAddressWords = 12;
67 
68 // Maximum number of lines allowed in an address between the house number
69 // and the state, both not included.
70 const size_t kMaxAddressLines = 5;
71 
72 // Maximum length allowed for any address word between the house number
73 // and the state, both not included.
74 const size_t kMaxAddressNameWordLength = 25;
75 
76 // Maximum number of words after the house number in which the location name
77 // should be found.
78 const size_t kMaxLocationNameDistance = 4;
79 
80 // Number of digits for a valid zip code.
81 const size_t kZipDigits = 5;
82 
83 // Number of digits for a valid zip code in the Zip Plus 4 format.
84 const size_t kZipPlus4Digits = 9;
85 
86 // Maximum number of digits of a house number, including possible hyphens.
87 const size_t kMaxHouseDigits = 5;
88 
89 // Additional characters used as new line delimiters.
90 const char16 kNewlineDelimiters[] = {
91   ',',
92   '*',
93   0x2022,  // Unicode bullet
94   0,
95 };
96 
SafePreviousChar(const string16::const_iterator & it,const string16::const_iterator & begin)97 char16 SafePreviousChar(const string16::const_iterator& it,
98     const string16::const_iterator& begin) {
99   if (it == begin)
100     return ' ';
101   return *(it - 1);
102 }
103 
SafeNextChar(const string16::const_iterator & it,const string16::const_iterator & end)104 char16 SafeNextChar(const string16::const_iterator& it,
105     const string16::const_iterator& end) {
106   if (it == end)
107     return ' ';
108   return *(it + 1);
109 }
110 
WordLowerCaseEqualsASCII(string16::const_iterator word_begin,string16::const_iterator word_end,const char * ascii_to_match)111 bool WordLowerCaseEqualsASCII(string16::const_iterator word_begin,
112     string16::const_iterator word_end, const char* ascii_to_match) {
113   for (string16::const_iterator it = word_begin; it != word_end;
114       ++it, ++ascii_to_match) {
115     if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
116       return false;
117   }
118   return *ascii_to_match == 0 || *ascii_to_match == ' ';
119 }
120 
LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,string16::const_iterator word_end,const char * ascii_to_match,bool allow_plural)121 bool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,
122     string16::const_iterator word_end, const char* ascii_to_match,
123     bool allow_plural) {
124   for (string16::const_iterator it = word_begin; it != word_end;
125       ++it, ++ascii_to_match) {
126     if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end)
127       return true;
128 
129     if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
130       return false;
131   }
132   return *ascii_to_match == 0;
133 }
134 
135 }  // anonymous namespace
136 
137 
AddressDetector()138 AddressDetector::AddressDetector() {
139 }
140 
~AddressDetector()141 AddressDetector::~AddressDetector() {
142 }
143 
GetContentText(const WebKit::WebRange & range)144 std::string AddressDetector::GetContentText(const WebKit::WebRange& range) {
145   // Get the address and replace unicode bullets with commas.
146   string16 address_16 = CollapseWhitespace(range.toPlainText(), false);
147   std::replace(address_16.begin(), address_16.end(),
148       static_cast<char16>(0x2022), static_cast<char16>(','));
149   return UTF16ToUTF8(address_16);
150 }
151 
GetIntentURL(const std::string & content_text)152 GURL AddressDetector::GetIntentURL(const std::string& content_text) {
153   return GURL(kAddressSchemaPrefix +
154       EscapeQueryParamValue(content_text, true));
155 }
156 
GetMaximumContentLength()157 size_t AddressDetector::GetMaximumContentLength() {
158   return kMaxAddressLength;
159 }
160 
IsEnabled(const WebKit::WebHitTestInfo & hit_test)161 bool AddressDetector::IsEnabled(const WebKit::WebHitTestInfo& hit_test) {
162   WebCore::Settings* settings = GetSettings(hit_test);
163   return settings && settings->formatDetectionAddress();
164 }
165 
FindContent(const string16::const_iterator & begin,const string16::const_iterator & end,size_t * start_pos,size_t * end_pos)166 bool AddressDetector::FindContent(const string16::const_iterator& begin,
167     const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) {
168   HouseNumberParser house_number_parser;
169 
170   // Keep going through the input string until a potential house number is
171   // detected. Start tokenizing the following words to find a valid
172   // street name within a word range. Then, find a state name followed
173   // by a valid zip code for that state. Also keep a look for any other
174   // possible house numbers to continue from in case of no match and for
175   // state names not followed by a zip code (e.g. New York, NY 10000).
176   const string16 newline_delimiters = kNewlineDelimiters;
177   const string16 delimiters = kWhitespaceUTF16 + newline_delimiters;
178   for (string16::const_iterator it = begin; it != end; ) {
179     Word house_number;
180     if (!house_number_parser.Parse(it, end, &house_number))
181       return false;
182 
183     String16Tokenizer tokenizer(house_number.end, end, delimiters);
184     tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
185 
186     std::vector<Word> words;
187     words.push_back(house_number);
188 
189     bool found_location_name = false;
190     bool continue_on_house_number = true;
191     size_t next_house_number_word = 0;
192     size_t num_lines = 1;
193 
194     // Don't include the house number in the word count.
195     size_t next_word = 1;
196     for (; next_word <= kMaxAddressWords + 1; ++next_word) {
197 
198       // Extract a new word from the tokenizer.
199       if (next_word == words.size()) {
200         do {
201           if (!tokenizer.GetNext())
202             return false;
203 
204           // Check the number of address lines.
205           if (tokenizer.token_is_delim() && newline_delimiters.find(
206               *tokenizer.token_begin()) != string16::npos) {
207             ++num_lines;
208           }
209         } while (tokenizer.token_is_delim());
210 
211         if (num_lines > kMaxAddressLines)
212           break;
213 
214         words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
215       }
216 
217       // Check the word length. If too long, don't try to continue from
218       // the next house number as no address can hold this word.
219       const Word& current_word = words[next_word];
220       DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
221       size_t current_word_length = std::distance(
222           current_word.begin, current_word.end);
223       if (current_word_length > kMaxAddressNameWordLength) {
224         continue_on_house_number = false;
225         break;
226       }
227 
228       // Check if the new word is a valid house number.
229       // This is used to properly resume parsing in case the maximum number
230       // of words is exceeded.
231       if (next_house_number_word == 0 &&
232           house_number_parser.Parse(current_word.begin, current_word.end, NULL)) {
233         next_house_number_word = next_word;
234         continue;
235       }
236 
237       // Look for location names in the words after the house number.
238       // A range limitation is introduced to avoid matching
239       // anything that starts with a number before a legitimate address.
240       if (next_word <= kMaxLocationNameDistance &&
241           IsValidLocationName(current_word)) {
242         found_location_name = true;
243         continue;
244       }
245 
246       // Don't count the house number.
247       if (next_word > kMinAddressWords) {
248         // Looking for the state is likely to add new words to the list while
249         // checking for multi-word state names.
250         size_t state_first_word = next_word;
251         size_t state_last_word, state_index;
252         if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
253             &tokenizer, &state_index)) {
254 
255           // A location name should have been found at this point.
256           if (!found_location_name)
257             break;
258 
259           // Explicitly exclude "et al", as "al" is a valid state code.
260           if (current_word_length == 2 && words.size() > 2) {
261             const Word& previous_word = words[state_first_word - 1];
262             if (previous_word.end - previous_word.begin == 2 &&
263                 LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
264                     "et") &&
265                 LowerCaseEqualsASCII(current_word.begin, current_word.end,
266                     "al"))
267               break;
268           }
269 
270           // Extract one more word from the tokenizer if not already available.
271           size_t zip_word = state_last_word + 1;
272           if (zip_word == words.size()) {
273             do {
274               if (!tokenizer.GetNext()) {
275                 // Zip is optional
276                 *start_pos = words[0].begin - begin;
277                 *end_pos = words[state_last_word].end - begin;
278                 return true;
279               }
280             } while (tokenizer.token_is_delim());
281             words.push_back(Word(tokenizer.token_begin(),
282                 tokenizer.token_end()));
283           }
284 
285           // Check the parsing validity and state range of the zip code.
286           next_word = state_last_word;
287           if (!IsZipValid(words[zip_word], state_index))
288             continue;
289 
290           *start_pos = words[0].begin - begin;
291           *end_pos = words[zip_word].end - begin;
292           return true;
293         }
294       }
295     }
296 
297     // Avoid skipping too many words because of a non-address number
298     // at the beginning of the contents to parse.
299     if (continue_on_house_number && next_house_number_word > 0) {
300       it = words[next_house_number_word].begin;
301     } else {
302       DCHECK(!words.empty());
303       next_word = std::min(next_word, words.size() - 1);
304       it = words[next_word].end;
305     }
306   }
307 
308   return false;
309 }
310 
IsPreDelimiter(char16 character)311 bool AddressDetector::HouseNumberParser::IsPreDelimiter(
312     char16 character) {
313   return character == ':' || IsPostDelimiter(character);
314 }
315 
IsPostDelimiter(char16 character)316 bool AddressDetector::HouseNumberParser::IsPostDelimiter(
317     char16 character) {
318   return IsWhitespace(character) || strchr(",\"'", character);
319 }
320 
RestartOnNextDelimiter()321 void AddressDetector::HouseNumberParser::RestartOnNextDelimiter() {
322   ResetState();
323   for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {}
324 }
325 
AcceptChars(size_t num_chars)326 void AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) {
327   size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)),
328       num_chars);
329   it_ += offset;
330   result_chars_ += offset;
331 }
332 
SkipChars(size_t num_chars)333 void AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) {
334   it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars);
335 }
336 
ResetState()337 void AddressDetector::HouseNumberParser::ResetState() {
338   num_digits_ = 0;
339   result_chars_ = 0;
340 }
341 
CheckFinished(Word * word) const342 bool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const {
343   // There should always be a number after a hyphen.
344   if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-')
345     return false;
346 
347   if (word) {
348     word->begin = it_ - result_chars_;
349     word->end = it_;
350   }
351   return true;
352 }
353 
Parse(const string16::const_iterator & begin,const string16::const_iterator & end,Word * word)354 bool AddressDetector::HouseNumberParser::Parse(
355     const string16::const_iterator& begin,
356     const string16::const_iterator& end, Word* word) {
357   it_ = begin_ = begin;
358   end_ = end;
359   ResetState();
360 
361   // Iterations only used as a fail-safe against any buggy infinite loops.
362   size_t iterations = 0;
363   size_t max_iterations = end - begin + 1;
364   for (; it_ != end_ && iterations < max_iterations; ++iterations) {
365 
366     // Word finished case.
367     if (IsPostDelimiter(*it_)) {
368       if (CheckFinished(word))
369         return true;
370       else if (result_chars_)
371         ResetState();
372 
373       SkipChars(1);
374       continue;
375     }
376 
377     // More digits. There should be no more after a letter was found.
378     if (IsAsciiDigit(*it_)) {
379       if (num_digits_ >= kMaxHouseDigits) {
380         RestartOnNextDelimiter();
381       } else {
382         AcceptChars(1);
383         ++num_digits_;
384       }
385       continue;
386     }
387 
388     if (IsAsciiAlpha(*it_)) {
389       // Handle special case 'one'.
390       if (result_chars_ == 0) {
391         if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one"))
392           AcceptChars(3);
393         else
394           RestartOnNextDelimiter();
395         continue;
396       }
397 
398       // There should be more than 1 character because of result_chars.
399       DCHECK_GT(result_chars_, 0U);
400       DCHECK_NE(it_, begin_);
401       char16 previous = SafePreviousChar(it_, begin_);
402       if (IsAsciiDigit(previous)) {
403         // Check cases like '12A'.
404         char16 next = SafeNextChar(it_, end_);
405         if (IsPostDelimiter(next)) {
406           AcceptChars(1);
407           continue;
408         }
409 
410         // Handle cases like 12a, 1st, 2nd, 3rd, 7th.
411         if (IsAsciiAlpha(next)) {
412           char16 last_digit = previous;
413           char16 first_letter = base::ToLowerASCII(*it_);
414           char16 second_letter = base::ToLowerASCII(next);
415           bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' &&
416               num_digits_ == 2;
417 
418           switch (last_digit - '0') {
419           case 1:
420             if ((first_letter == 's' && second_letter == 't') ||
421                 (first_letter == 't' && second_letter == 'h' && is_teen)) {
422               AcceptChars(2);
423               continue;
424             }
425             break;
426 
427           case 2:
428             if ((first_letter == 'n' && second_letter == 'd') ||
429                 (first_letter == 't' && second_letter == 'h' && is_teen)) {
430               AcceptChars(2);
431               continue;
432             }
433             break;
434 
435           case 3:
436             if ((first_letter == 'r' && second_letter == 'd') ||
437                 (first_letter == 't' && second_letter == 'h' && is_teen)) {
438               AcceptChars(2);
439               continue;
440             }
441             break;
442 
443           case 0:
444             // Explicitly exclude '0th'.
445             if (num_digits_ == 1)
446               break;
447 
448           case 4:
449           case 5:
450           case 6:
451           case 7:
452           case 8:
453           case 9:
454             if (first_letter == 't' && second_letter == 'h') {
455               AcceptChars(2);
456               continue;
457             }
458             break;
459 
460           default:
461             NOTREACHED();
462           }
463         }
464       }
465 
466       RestartOnNextDelimiter();
467       continue;
468     }
469 
470     if (*it_ == '-' && num_digits_ > 0) {
471       AcceptChars(1);
472       ++num_digits_;
473       continue;
474     }
475 
476     RestartOnNextDelimiter();
477     SkipChars(1);
478   }
479 
480   if (iterations >= max_iterations)
481     return false;
482 
483   return CheckFinished(word);
484 }
485 
FindStateStartingInWord(WordList * words,size_t state_first_word,size_t * state_last_word,String16Tokenizer * tokenizer,size_t * state_index)486 bool AddressDetector::FindStateStartingInWord(WordList* words,
487     size_t state_first_word, size_t* state_last_word,
488     String16Tokenizer* tokenizer, size_t* state_index) {
489 
490   // Bitmasks containing the allowed suffixes for 2-letter state codes.
491   static const int state_two_letter_suffix[23] = {
492     0x02060c00,  // A followed by: [KLRSZ].
493     0x00000000,  // B.
494     0x00084001,  // C followed by: [AOT].
495     0x00000014,  // D followed by: [CE].
496     0x00000000,  // E.
497     0x00001800,  // F followed by: [LM].
498     0x00100001,  // G followed by: [AU].
499     0x00000100,  // H followed by: [I].
500     0x00002809,  // I followed by: [ADLN].
501     0x00000000,  // J.
502     0x01040000,  // K followed by: [SY].
503     0x00000001,  // L followed by: [A].
504     0x000ce199,  // M followed by: [ADEHINOPST].
505     0x0120129c,  // N followed by: [CDEHJMVY].
506     0x00020480,  // O followed by: [HKR].
507     0x00420001,  // P followed by: [ARW].
508     0x00000000,  // Q.
509     0x00000100,  // R followed by: [I].
510     0x0000000c,  // S followed by: [CD].
511     0x00802000,  // T followed by: [NX].
512     0x00080000,  // U followed by: [T].
513     0x00080101,  // V followed by: [AIT].
514     0x01200101   // W followed by: [AIVY].
515   };
516 
517   // Accumulative number of states for the 2-letter code indexed by the first.
518   static const int state_two_letter_accumulative[24] = {
519      0,  5,  5,  8, 10, 10, 12, 14,
520     15, 19, 19, 21, 22, 32, 40, 43,
521     46, 46, 47, 49, 51, 52, 55, 59
522   };
523 
524   // State names sorted alphabetically with their lengths.
525   // There can be more than one possible name for a same state if desired.
526   static const struct StateNameInfo {
527     const char* string;
528     char first_word_length;
529     char length;
530     char state_index; // Relative to two-character code alphabetical order.
531   } state_names[59] = {
532     { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 },
533     { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 },
534     { "arkansas", 8, 8, 2 },
535     { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 },
536     { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 },
537     { "district of columbia", 8, 20, 8 },
538     { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 },
539     { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 },
540     { "hawaii", 6, 6, 14 },
541     { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 },
542     { "iowa", 4, 4, 15 },
543     { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 },
544     { "louisiana", 9, 9, 21 },
545     { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 },
546     { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 },
547     { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 },
548     { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 },
549     { "montana", 7, 7, 31 },
550     { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 },
551     { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 },
552     { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 },
553     { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 },
554     { "northern mariana islands", 8, 24, 29 },
555     { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 },
556     { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 },
557     { "puerto rico", 6, 11, 44 },
558     { "rhode island", 5, 5, 46 },
559     { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 },
560     { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 },
561     { "utah", 4, 4, 51 },
562     { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 },
563     { "virginia", 8, 8, 52 },
564     { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 },
565     { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 }
566   };
567 
568   // Accumulative number of states for sorted names indexed by the first letter.
569   // Required a different one since there are codes that don't share their
570   // first letter with the name of their state (MP = Northern Mariana Islands).
571   static const int state_names_accumulative[24] = {
572      0,  5,  5,  8, 10, 10, 12, 14,
573     15, 19, 19, 21, 22, 31, 40, 43,
574     46, 46, 47, 49, 51, 52, 55, 59
575   };
576 
577   DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1],
578       static_cast<int>(ARRAYSIZE_UNSAFE(state_names)));
579 
580   const Word& first_word = words->at(state_first_word);
581   int length = first_word.end - first_word.begin;
582   if (length < 2 || !IsAsciiAlpha(*first_word.begin))
583     return false;
584 
585   // No state names start with x, y, z.
586   char16 first_letter = base::ToLowerASCII(*first_word.begin);
587   if (first_letter > 'w')
588     return false;
589 
590   DCHECK(first_letter >= 'a');
591   int first_index = first_letter - 'a';
592 
593   // Look for two-letter state names.
594   if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) {
595     char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1));
596     DCHECK(second_letter >= 'a');
597 
598     int second_index = second_letter - 'a';
599     if (!(state_two_letter_suffix[first_index] & (1 << second_index)))
600       return false;
601 
602     std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] &
603         ((1 << second_index) - 1);
604     *state_last_word = state_first_word;
605     *state_index = state_two_letter_accumulative[first_index] +
606         previous_suffixes.count();
607     return true;
608   }
609 
610   // Look for full state names by their first letter. Discard by length.
611   for (int state = state_names_accumulative[first_index];
612       state < state_names_accumulative[first_index + 1]; ++state) {
613     if (state_names[state].first_word_length != length)
614       continue;
615 
616     bool state_match = false;
617     size_t state_word = state_first_word;
618     for (int pos = 0; true; ) {
619       if (!WordLowerCaseEqualsASCII(words->at(state_word).begin,
620           words->at(state_word).end, &state_names[state].string[pos]))
621         break;
622 
623       pos += words->at(state_word).end - words->at(state_word).begin + 1;
624       if (pos >= state_names[state].length) {
625         state_match = true;
626         break;
627       }
628 
629       // Ran out of words, extract more from the tokenizer.
630       if (++state_word == words->size()) {
631         do {
632           if (!tokenizer->GetNext())
633             break;
634         } while (tokenizer->token_is_delim());
635         words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end()));
636       }
637     }
638 
639     if (state_match) {
640       *state_last_word = state_word;
641       *state_index = state_names[state].state_index;
642       return true;
643     }
644   }
645 
646   return false;
647 }
648 
IsZipValid(const Word & word,size_t state_index)649 bool AddressDetector::IsZipValid(const Word& word, size_t state_index) {
650   size_t length = word.end - word.begin;
651   if (length != kZipDigits && length != kZipPlus4Digits + 1)
652     return false;
653 
654   for (string16::const_iterator it = word.begin; it != word.end; ++it) {
655     size_t pos = it - word.begin;
656     if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits))
657       continue;
658     return false;
659   }
660   return IsZipValidForState(word, state_index);
661 }
662 
IsZipValidForState(const Word & word,size_t state_index)663 bool AddressDetector::IsZipValidForState(const Word& word, size_t state_index)
664 {
665     enum USState {
666         AP = -4, // AP (military base in the Pacific)
667         AA = -3, // AA (military base inside the US)
668         AE = -2, // AE (military base outside the US)
669         XX = -1, // (not in use)
670         AK =  0, // AK Alaska
671         AL =  1, // AL Alabama
672         AR =  2, // AR Arkansas
673         AS =  3, // AS American Samoa
674         AZ =  4, // AZ Arizona
675         CA =  5, // CA California
676         CO =  6, // CO Colorado
677         CT =  7, // CT Connecticut
678         DC =  8, // DC District of Columbia
679         DE =  9, // DE Delaware
680         FL = 10, // FL Florida
681         FM = 11, // FM Federated States of Micronesia
682         GA = 12, // GA Georgia
683         GU = 13, // GU Guam
684         HI = 14, // HI Hawaii
685         IA = 15, // IA Iowa
686         ID = 16, // ID Idaho
687         IL = 17, // IL Illinois
688         IN = 18, // IN Indiana
689         KS = 19, // KS Kansas
690         KY = 20, // KY Kentucky
691         LA = 21, // LA Louisiana
692         MA = 22, // MA Massachusetts
693         MD = 23, // MD Maryland
694         ME = 24, // ME Maine
695         MH = 25, // MH Marshall Islands
696         MI = 26, // MI Michigan
697         MN = 27, // MN Minnesota
698         MO = 28, // MO Missouri
699         MP = 29, // MP Northern Mariana Islands
700         MS = 30, // MS Mississippi
701         MT = 31, // MT Montana
702         NC = 32, // NC North Carolina
703         ND = 33, // ND North Dakota
704         NE = 34, // NE Nebraska
705         NH = 35, // NH New Hampshire
706         NJ = 36, // NJ New Jersey
707         NM = 37, // NM New Mexico
708         NV = 38, // NV Nevada
709         NY = 39, // NY New York
710         OH = 40, // OH Ohio
711         OK = 41, // OK Oklahoma
712         OR = 42, // OR Oregon
713         PA = 43, // PA Pennsylvania
714         PR = 44, // PR Puerto Rico
715         PW = 45, // PW Palau
716         RI = 46, // RI Rhode Island
717         SC = 47, // SC South Carolina
718         SD = 48, // SD South Dakota
719         TN = 49, // TN Tennessee
720         TX = 50, // TX Texas
721         UT = 51, // UT Utah
722         VA = 52, // VA Virginia
723         VI = 53, // VI Virgin Islands
724         VT = 54, // VT Vermont
725         WA = 55, // WA Washington
726         WI = 56, // WI Wisconsin
727         WV = 57, // WV West Virginia
728         WY = 58, // WY Wyoming
729     };
730 
731     static const USState stateForZipPrefix[] = {
732     //   0   1   2   3   4   5   6   7   8   9
733         XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009
734         MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019
735         MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029
736         NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039
737         ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049
738         VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059
739         CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069
740         NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079
741         NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089
742         AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099
743         NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109
744         NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119
745         NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129
746         NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139
747         NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149
748         PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159
749         PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169
750         PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179
751         PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189
752         PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199
753         DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209
754         MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219
755         VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229
756         VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239
757         VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249
758         WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259
759         WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269
760         NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279
761         NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289
762         SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299
763         GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309
764         GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319
765         FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329
766         FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339
767         AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349
768         AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359
769         AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369
770         TN, TN, TN, TN, TN, TN, TN, TN, TN, TN, // 370-379
771         TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389
772         MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399
773         KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409
774         KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419
775         KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429
776         OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439
777         OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449
778         OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459
779         IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469
780         IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479
781         MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489
782         MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499
783         IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509
784         IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519
785         IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529
786         WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539
787         WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549
788         MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559
789         MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, // 560-569
790         SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579
791         ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589
792         MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599
793         IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609
794         IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619
795         IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629
796         MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639
797         MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649
798         MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659
799         KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669
800         KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679
801         NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689
802         NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699
803         LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709
804         LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719
805         AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729
806         OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739
807         OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749
808         TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759
809         TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769
810         TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779
811         TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789
812         TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799
813         CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809
814         CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819
815         WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829
816         WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839
817         UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849
818         AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859
819         AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869
820         NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879
821         NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889
822         NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899
823         CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909
824         CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919
825         CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929
826         CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939
827         CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949
828         CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959
829         CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969
830         OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979
831         WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989
832         WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999
833     };
834 
835     if (!word.begin || !word.end || (word.end - word.begin) < 3)
836         return false;
837     const char16* zipPtr = word.begin;
838     if (zipPtr[0] < '0' || zipPtr[0] > '9' ||
839         zipPtr[1] < '0' || zipPtr[1] > '9' ||
840         zipPtr[2] < '0' || zipPtr[2] > '9')
841         return false;
842 
843     int zip = zipPtr[0] - '0';
844     zip *= 10;
845     zip += zipPtr[1] - '0';
846     zip *= 10;
847     zip += zipPtr[2] - '0';
848     return stateForZipPrefix[zip] == (int) state_index;
849 }
850 
851 static const char* s_rawStreetSuffixes[] = {
852     "allee", "alley", "ally", "aly",
853     "anex", "annex", "anx", "arc", "arcade", "av", "ave", "aven", "avenu",
854     "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend",
855     "bg", "bgs", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd",
856     "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch",
857     "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks",
858     "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu",
859     "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent",
860     "center", "centers", "centr", "centre", "cir", "circ", "circl",
861     "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff",
862     "cliffs", "club", "cmn", "cmp", "cnter", "cntr", "cnyn", "common",
863     "cor", "corner", "corners", "cors", "course", "court", "courts", "cove",
864     "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres",
865     "crescent", "cresent", "crest", "crk", "crossing", "crossroad",
866     "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt",
867     "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn",
868     "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive",
869     "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests",
870     "exp", "expr", "express", "expressway", "expw", "expy", "ext",
871     "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls",
872     "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls",
873     "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge",
874     "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy",
875     "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy",
876     "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy",
877     "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden",
878     "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove",
879     "groves", "grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors",
880     "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts",
881     "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw",
882     "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts",
883     "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands",
884     "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn",
885     "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key",
886     "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la",
887     "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks",
888     "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks",
889     "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop",
890     "loops", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows",
891     "medows", "mews", "mill", "mills", "mission", "missn", "ml", "mls",
892     "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount",
893     "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn",
894     "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd",
895     "oval", "overpass", "ovl", "park", "parks", "parkway", "parkways",
896     "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine",
897     "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain",
898     "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne",
899     "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie",
900     "prk", "prr", "prt", "prts", "psge", "pt", "pts", "rad", "radial",
901     "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd",
902     "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge", "ridges", "riv", "river",
903     "rivr", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds",
904     "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals",
905     "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway",
906     "smt", "spg", "spgs", "spng", "spngs", "spring", "springs", "sprng",
907     "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ",
908     "square", "squares", "st", "sta", "station", "statn", "stn", "str",
909     "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream",
910     "street", "streets", "streme", "strm", "strt", "strvn", "strvnue",
911     "sts", "sumit", "sumitt", "summit", "ter", "terr", "terrace",
912     "throughway", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks",
913     "trafficway", "trail", "trails", "trak", "trce", "trfy", "trk", "trks",
914     "trl", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls",
915     "tunnel", "tunnels", "tunnl", "turnpike", "turnpk", "un", "underpass",
916     "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct",
917     "via", "viadct", "viaduct", "view", "views", "vill", "villag",
918     "village", "villages", "ville", "villg", "villiage", "vis", "vist",
919     "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta",
920     "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells",
921     "wl", "wls", "wy", "xing", "xrd",
922     0,
923 };
924 
IsValidLocationName(const Word & word)925 bool AddressDetector::IsValidLocationName(const Word& word) {
926     using namespace WTF;
927     static HashSet<String> streetNames;
928     if (!streetNames.size()) {
929         const char** suffixes = s_rawStreetSuffixes;
930         while (const char* suffix = *suffixes) {
931             int index = suffix[0] - 'a';
932             streetNames.add(suffix);
933             suffixes++;
934         }
935     }
936     char16 first_letter = base::ToLowerASCII(*word.begin);
937     if (first_letter > 'z' || first_letter < 'a')
938         return false;
939     int index = first_letter - 'a';
940     int length = std::distance(word.begin, word.end);
941     if (*word.end == '.')
942         length--;
943     String value(word.begin, length);
944     return streetNames.contains(value.lower());
945 }
946