1 /*
2 * Copyright (C) 2012 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "config.h"
32
33 // Magic pretend-to-be-a-chromium-build flags
34 #undef WEBKIT_IMPLEMENTATION
35 #undef LOG
36
37 #include "content/address_detector.h"
38
39 #include <bitset>
40
41 #include "base/utf_string_conversions.h"
42 #include "net/base/escape.h"
43 #include "Settings.h"
44 #include "WebString.h"
45
46 #include <wtf/HashSet.h>
47 #include <wtf/Noncopyable.h>
48 #include <wtf/text/StringHash.h>
49 #include <wtf/text/WTFString.h>
50
51 namespace {
52
53 // Prefix used for geographical address intent URIs.
54 static const char kAddressSchemaPrefix[] = "geo:0,0?q=";
55
56 // Maximum text length to be searched for address detection.
57 static const size_t kMaxAddressLength = 500;
58
59 // Minimum number of words in an address after the house number
60 // before a state is expected to be found.
61 // A value too high can miss short addresses.
62 const size_t kMinAddressWords = 3;
63
64 // Maximum number of words allowed in an address between the house number
65 // and the state, both not included.
66 const size_t kMaxAddressWords = 12;
67
68 // Maximum number of lines allowed in an address between the house number
69 // and the state, both not included.
70 const size_t kMaxAddressLines = 5;
71
72 // Maximum length allowed for any address word between the house number
73 // and the state, both not included.
74 const size_t kMaxAddressNameWordLength = 25;
75
76 // Maximum number of words after the house number in which the location name
77 // should be found.
78 const size_t kMaxLocationNameDistance = 4;
79
80 // Number of digits for a valid zip code.
81 const size_t kZipDigits = 5;
82
83 // Number of digits for a valid zip code in the Zip Plus 4 format.
84 const size_t kZipPlus4Digits = 9;
85
86 // Maximum number of digits of a house number, including possible hyphens.
87 const size_t kMaxHouseDigits = 5;
88
89 // Additional characters used as new line delimiters.
90 const char16 kNewlineDelimiters[] = {
91 ',',
92 '*',
93 0x2022, // Unicode bullet
94 0,
95 };
96
SafePreviousChar(const string16::const_iterator & it,const string16::const_iterator & begin)97 char16 SafePreviousChar(const string16::const_iterator& it,
98 const string16::const_iterator& begin) {
99 if (it == begin)
100 return ' ';
101 return *(it - 1);
102 }
103
SafeNextChar(const string16::const_iterator & it,const string16::const_iterator & end)104 char16 SafeNextChar(const string16::const_iterator& it,
105 const string16::const_iterator& end) {
106 if (it == end)
107 return ' ';
108 return *(it + 1);
109 }
110
WordLowerCaseEqualsASCII(string16::const_iterator word_begin,string16::const_iterator word_end,const char * ascii_to_match)111 bool WordLowerCaseEqualsASCII(string16::const_iterator word_begin,
112 string16::const_iterator word_end, const char* ascii_to_match) {
113 for (string16::const_iterator it = word_begin; it != word_end;
114 ++it, ++ascii_to_match) {
115 if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
116 return false;
117 }
118 return *ascii_to_match == 0 || *ascii_to_match == ' ';
119 }
120
LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,string16::const_iterator word_end,const char * ascii_to_match,bool allow_plural)121 bool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,
122 string16::const_iterator word_end, const char* ascii_to_match,
123 bool allow_plural) {
124 for (string16::const_iterator it = word_begin; it != word_end;
125 ++it, ++ascii_to_match) {
126 if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end)
127 return true;
128
129 if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
130 return false;
131 }
132 return *ascii_to_match == 0;
133 }
134
135 } // anonymous namespace
136
137
AddressDetector()138 AddressDetector::AddressDetector() {
139 }
140
~AddressDetector()141 AddressDetector::~AddressDetector() {
142 }
143
GetContentText(const WebKit::WebRange & range)144 std::string AddressDetector::GetContentText(const WebKit::WebRange& range) {
145 // Get the address and replace unicode bullets with commas.
146 string16 address_16 = CollapseWhitespace(range.toPlainText(), false);
147 std::replace(address_16.begin(), address_16.end(),
148 static_cast<char16>(0x2022), static_cast<char16>(','));
149 return UTF16ToUTF8(address_16);
150 }
151
GetIntentURL(const std::string & content_text)152 GURL AddressDetector::GetIntentURL(const std::string& content_text) {
153 return GURL(kAddressSchemaPrefix +
154 EscapeQueryParamValue(content_text, true));
155 }
156
GetMaximumContentLength()157 size_t AddressDetector::GetMaximumContentLength() {
158 return kMaxAddressLength;
159 }
160
IsEnabled(const WebKit::WebHitTestInfo & hit_test)161 bool AddressDetector::IsEnabled(const WebKit::WebHitTestInfo& hit_test) {
162 WebCore::Settings* settings = GetSettings(hit_test);
163 return settings && settings->formatDetectionAddress();
164 }
165
FindContent(const string16::const_iterator & begin,const string16::const_iterator & end,size_t * start_pos,size_t * end_pos)166 bool AddressDetector::FindContent(const string16::const_iterator& begin,
167 const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) {
168 HouseNumberParser house_number_parser;
169
170 // Keep going through the input string until a potential house number is
171 // detected. Start tokenizing the following words to find a valid
172 // street name within a word range. Then, find a state name followed
173 // by a valid zip code for that state. Also keep a look for any other
174 // possible house numbers to continue from in case of no match and for
175 // state names not followed by a zip code (e.g. New York, NY 10000).
176 const string16 newline_delimiters = kNewlineDelimiters;
177 const string16 delimiters = kWhitespaceUTF16 + newline_delimiters;
178 for (string16::const_iterator it = begin; it != end; ) {
179 Word house_number;
180 if (!house_number_parser.Parse(it, end, &house_number))
181 return false;
182
183 String16Tokenizer tokenizer(house_number.end, end, delimiters);
184 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
185
186 std::vector<Word> words;
187 words.push_back(house_number);
188
189 bool found_location_name = false;
190 bool continue_on_house_number = true;
191 size_t next_house_number_word = 0;
192 size_t num_lines = 1;
193
194 // Don't include the house number in the word count.
195 size_t next_word = 1;
196 for (; next_word <= kMaxAddressWords + 1; ++next_word) {
197
198 // Extract a new word from the tokenizer.
199 if (next_word == words.size()) {
200 do {
201 if (!tokenizer.GetNext())
202 return false;
203
204 // Check the number of address lines.
205 if (tokenizer.token_is_delim() && newline_delimiters.find(
206 *tokenizer.token_begin()) != string16::npos) {
207 ++num_lines;
208 }
209 } while (tokenizer.token_is_delim());
210
211 if (num_lines > kMaxAddressLines)
212 break;
213
214 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
215 }
216
217 // Check the word length. If too long, don't try to continue from
218 // the next house number as no address can hold this word.
219 const Word& current_word = words[next_word];
220 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
221 size_t current_word_length = std::distance(
222 current_word.begin, current_word.end);
223 if (current_word_length > kMaxAddressNameWordLength) {
224 continue_on_house_number = false;
225 break;
226 }
227
228 // Check if the new word is a valid house number.
229 // This is used to properly resume parsing in case the maximum number
230 // of words is exceeded.
231 if (next_house_number_word == 0 &&
232 house_number_parser.Parse(current_word.begin, current_word.end, NULL)) {
233 next_house_number_word = next_word;
234 continue;
235 }
236
237 // Look for location names in the words after the house number.
238 // A range limitation is introduced to avoid matching
239 // anything that starts with a number before a legitimate address.
240 if (next_word <= kMaxLocationNameDistance &&
241 IsValidLocationName(current_word)) {
242 found_location_name = true;
243 continue;
244 }
245
246 // Don't count the house number.
247 if (next_word > kMinAddressWords) {
248 // Looking for the state is likely to add new words to the list while
249 // checking for multi-word state names.
250 size_t state_first_word = next_word;
251 size_t state_last_word, state_index;
252 if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
253 &tokenizer, &state_index)) {
254
255 // A location name should have been found at this point.
256 if (!found_location_name)
257 break;
258
259 // Explicitly exclude "et al", as "al" is a valid state code.
260 if (current_word_length == 2 && words.size() > 2) {
261 const Word& previous_word = words[state_first_word - 1];
262 if (previous_word.end - previous_word.begin == 2 &&
263 LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
264 "et") &&
265 LowerCaseEqualsASCII(current_word.begin, current_word.end,
266 "al"))
267 break;
268 }
269
270 // Extract one more word from the tokenizer if not already available.
271 size_t zip_word = state_last_word + 1;
272 if (zip_word == words.size()) {
273 do {
274 if (!tokenizer.GetNext()) {
275 // Zip is optional
276 *start_pos = words[0].begin - begin;
277 *end_pos = words[state_last_word].end - begin;
278 return true;
279 }
280 } while (tokenizer.token_is_delim());
281 words.push_back(Word(tokenizer.token_begin(),
282 tokenizer.token_end()));
283 }
284
285 // Check the parsing validity and state range of the zip code.
286 next_word = state_last_word;
287 if (!IsZipValid(words[zip_word], state_index))
288 continue;
289
290 *start_pos = words[0].begin - begin;
291 *end_pos = words[zip_word].end - begin;
292 return true;
293 }
294 }
295 }
296
297 // Avoid skipping too many words because of a non-address number
298 // at the beginning of the contents to parse.
299 if (continue_on_house_number && next_house_number_word > 0) {
300 it = words[next_house_number_word].begin;
301 } else {
302 DCHECK(!words.empty());
303 next_word = std::min(next_word, words.size() - 1);
304 it = words[next_word].end;
305 }
306 }
307
308 return false;
309 }
310
IsPreDelimiter(char16 character)311 bool AddressDetector::HouseNumberParser::IsPreDelimiter(
312 char16 character) {
313 return character == ':' || IsPostDelimiter(character);
314 }
315
IsPostDelimiter(char16 character)316 bool AddressDetector::HouseNumberParser::IsPostDelimiter(
317 char16 character) {
318 return IsWhitespace(character) || strchr(",\"'", character);
319 }
320
RestartOnNextDelimiter()321 void AddressDetector::HouseNumberParser::RestartOnNextDelimiter() {
322 ResetState();
323 for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {}
324 }
325
AcceptChars(size_t num_chars)326 void AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) {
327 size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)),
328 num_chars);
329 it_ += offset;
330 result_chars_ += offset;
331 }
332
SkipChars(size_t num_chars)333 void AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) {
334 it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars);
335 }
336
ResetState()337 void AddressDetector::HouseNumberParser::ResetState() {
338 num_digits_ = 0;
339 result_chars_ = 0;
340 }
341
CheckFinished(Word * word) const342 bool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const {
343 // There should always be a number after a hyphen.
344 if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-')
345 return false;
346
347 if (word) {
348 word->begin = it_ - result_chars_;
349 word->end = it_;
350 }
351 return true;
352 }
353
Parse(const string16::const_iterator & begin,const string16::const_iterator & end,Word * word)354 bool AddressDetector::HouseNumberParser::Parse(
355 const string16::const_iterator& begin,
356 const string16::const_iterator& end, Word* word) {
357 it_ = begin_ = begin;
358 end_ = end;
359 ResetState();
360
361 // Iterations only used as a fail-safe against any buggy infinite loops.
362 size_t iterations = 0;
363 size_t max_iterations = end - begin + 1;
364 for (; it_ != end_ && iterations < max_iterations; ++iterations) {
365
366 // Word finished case.
367 if (IsPostDelimiter(*it_)) {
368 if (CheckFinished(word))
369 return true;
370 else if (result_chars_)
371 ResetState();
372
373 SkipChars(1);
374 continue;
375 }
376
377 // More digits. There should be no more after a letter was found.
378 if (IsAsciiDigit(*it_)) {
379 if (num_digits_ >= kMaxHouseDigits) {
380 RestartOnNextDelimiter();
381 } else {
382 AcceptChars(1);
383 ++num_digits_;
384 }
385 continue;
386 }
387
388 if (IsAsciiAlpha(*it_)) {
389 // Handle special case 'one'.
390 if (result_chars_ == 0) {
391 if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one"))
392 AcceptChars(3);
393 else
394 RestartOnNextDelimiter();
395 continue;
396 }
397
398 // There should be more than 1 character because of result_chars.
399 DCHECK_GT(result_chars_, 0U);
400 DCHECK_NE(it_, begin_);
401 char16 previous = SafePreviousChar(it_, begin_);
402 if (IsAsciiDigit(previous)) {
403 // Check cases like '12A'.
404 char16 next = SafeNextChar(it_, end_);
405 if (IsPostDelimiter(next)) {
406 AcceptChars(1);
407 continue;
408 }
409
410 // Handle cases like 12a, 1st, 2nd, 3rd, 7th.
411 if (IsAsciiAlpha(next)) {
412 char16 last_digit = previous;
413 char16 first_letter = base::ToLowerASCII(*it_);
414 char16 second_letter = base::ToLowerASCII(next);
415 bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' &&
416 num_digits_ == 2;
417
418 switch (last_digit - '0') {
419 case 1:
420 if ((first_letter == 's' && second_letter == 't') ||
421 (first_letter == 't' && second_letter == 'h' && is_teen)) {
422 AcceptChars(2);
423 continue;
424 }
425 break;
426
427 case 2:
428 if ((first_letter == 'n' && second_letter == 'd') ||
429 (first_letter == 't' && second_letter == 'h' && is_teen)) {
430 AcceptChars(2);
431 continue;
432 }
433 break;
434
435 case 3:
436 if ((first_letter == 'r' && second_letter == 'd') ||
437 (first_letter == 't' && second_letter == 'h' && is_teen)) {
438 AcceptChars(2);
439 continue;
440 }
441 break;
442
443 case 0:
444 // Explicitly exclude '0th'.
445 if (num_digits_ == 1)
446 break;
447
448 case 4:
449 case 5:
450 case 6:
451 case 7:
452 case 8:
453 case 9:
454 if (first_letter == 't' && second_letter == 'h') {
455 AcceptChars(2);
456 continue;
457 }
458 break;
459
460 default:
461 NOTREACHED();
462 }
463 }
464 }
465
466 RestartOnNextDelimiter();
467 continue;
468 }
469
470 if (*it_ == '-' && num_digits_ > 0) {
471 AcceptChars(1);
472 ++num_digits_;
473 continue;
474 }
475
476 RestartOnNextDelimiter();
477 SkipChars(1);
478 }
479
480 if (iterations >= max_iterations)
481 return false;
482
483 return CheckFinished(word);
484 }
485
FindStateStartingInWord(WordList * words,size_t state_first_word,size_t * state_last_word,String16Tokenizer * tokenizer,size_t * state_index)486 bool AddressDetector::FindStateStartingInWord(WordList* words,
487 size_t state_first_word, size_t* state_last_word,
488 String16Tokenizer* tokenizer, size_t* state_index) {
489
490 // Bitmasks containing the allowed suffixes for 2-letter state codes.
491 static const int state_two_letter_suffix[23] = {
492 0x02060c00, // A followed by: [KLRSZ].
493 0x00000000, // B.
494 0x00084001, // C followed by: [AOT].
495 0x00000014, // D followed by: [CE].
496 0x00000000, // E.
497 0x00001800, // F followed by: [LM].
498 0x00100001, // G followed by: [AU].
499 0x00000100, // H followed by: [I].
500 0x00002809, // I followed by: [ADLN].
501 0x00000000, // J.
502 0x01040000, // K followed by: [SY].
503 0x00000001, // L followed by: [A].
504 0x000ce199, // M followed by: [ADEHINOPST].
505 0x0120129c, // N followed by: [CDEHJMVY].
506 0x00020480, // O followed by: [HKR].
507 0x00420001, // P followed by: [ARW].
508 0x00000000, // Q.
509 0x00000100, // R followed by: [I].
510 0x0000000c, // S followed by: [CD].
511 0x00802000, // T followed by: [NX].
512 0x00080000, // U followed by: [T].
513 0x00080101, // V followed by: [AIT].
514 0x01200101 // W followed by: [AIVY].
515 };
516
517 // Accumulative number of states for the 2-letter code indexed by the first.
518 static const int state_two_letter_accumulative[24] = {
519 0, 5, 5, 8, 10, 10, 12, 14,
520 15, 19, 19, 21, 22, 32, 40, 43,
521 46, 46, 47, 49, 51, 52, 55, 59
522 };
523
524 // State names sorted alphabetically with their lengths.
525 // There can be more than one possible name for a same state if desired.
526 static const struct StateNameInfo {
527 const char* string;
528 char first_word_length;
529 char length;
530 char state_index; // Relative to two-character code alphabetical order.
531 } state_names[59] = {
532 { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 },
533 { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 },
534 { "arkansas", 8, 8, 2 },
535 { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 },
536 { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 },
537 { "district of columbia", 8, 20, 8 },
538 { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 },
539 { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 },
540 { "hawaii", 6, 6, 14 },
541 { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 },
542 { "iowa", 4, 4, 15 },
543 { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 },
544 { "louisiana", 9, 9, 21 },
545 { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 },
546 { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 },
547 { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 },
548 { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 },
549 { "montana", 7, 7, 31 },
550 { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 },
551 { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 },
552 { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 },
553 { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 },
554 { "northern mariana islands", 8, 24, 29 },
555 { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 },
556 { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 },
557 { "puerto rico", 6, 11, 44 },
558 { "rhode island", 5, 5, 46 },
559 { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 },
560 { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 },
561 { "utah", 4, 4, 51 },
562 { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 },
563 { "virginia", 8, 8, 52 },
564 { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 },
565 { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 }
566 };
567
568 // Accumulative number of states for sorted names indexed by the first letter.
569 // Required a different one since there are codes that don't share their
570 // first letter with the name of their state (MP = Northern Mariana Islands).
571 static const int state_names_accumulative[24] = {
572 0, 5, 5, 8, 10, 10, 12, 14,
573 15, 19, 19, 21, 22, 31, 40, 43,
574 46, 46, 47, 49, 51, 52, 55, 59
575 };
576
577 DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1],
578 static_cast<int>(ARRAYSIZE_UNSAFE(state_names)));
579
580 const Word& first_word = words->at(state_first_word);
581 int length = first_word.end - first_word.begin;
582 if (length < 2 || !IsAsciiAlpha(*first_word.begin))
583 return false;
584
585 // No state names start with x, y, z.
586 char16 first_letter = base::ToLowerASCII(*first_word.begin);
587 if (first_letter > 'w')
588 return false;
589
590 DCHECK(first_letter >= 'a');
591 int first_index = first_letter - 'a';
592
593 // Look for two-letter state names.
594 if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) {
595 char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1));
596 DCHECK(second_letter >= 'a');
597
598 int second_index = second_letter - 'a';
599 if (!(state_two_letter_suffix[first_index] & (1 << second_index)))
600 return false;
601
602 std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] &
603 ((1 << second_index) - 1);
604 *state_last_word = state_first_word;
605 *state_index = state_two_letter_accumulative[first_index] +
606 previous_suffixes.count();
607 return true;
608 }
609
610 // Look for full state names by their first letter. Discard by length.
611 for (int state = state_names_accumulative[first_index];
612 state < state_names_accumulative[first_index + 1]; ++state) {
613 if (state_names[state].first_word_length != length)
614 continue;
615
616 bool state_match = false;
617 size_t state_word = state_first_word;
618 for (int pos = 0; true; ) {
619 if (!WordLowerCaseEqualsASCII(words->at(state_word).begin,
620 words->at(state_word).end, &state_names[state].string[pos]))
621 break;
622
623 pos += words->at(state_word).end - words->at(state_word).begin + 1;
624 if (pos >= state_names[state].length) {
625 state_match = true;
626 break;
627 }
628
629 // Ran out of words, extract more from the tokenizer.
630 if (++state_word == words->size()) {
631 do {
632 if (!tokenizer->GetNext())
633 break;
634 } while (tokenizer->token_is_delim());
635 words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end()));
636 }
637 }
638
639 if (state_match) {
640 *state_last_word = state_word;
641 *state_index = state_names[state].state_index;
642 return true;
643 }
644 }
645
646 return false;
647 }
648
IsZipValid(const Word & word,size_t state_index)649 bool AddressDetector::IsZipValid(const Word& word, size_t state_index) {
650 size_t length = word.end - word.begin;
651 if (length != kZipDigits && length != kZipPlus4Digits + 1)
652 return false;
653
654 for (string16::const_iterator it = word.begin; it != word.end; ++it) {
655 size_t pos = it - word.begin;
656 if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits))
657 continue;
658 return false;
659 }
660 return IsZipValidForState(word, state_index);
661 }
662
IsZipValidForState(const Word & word,size_t state_index)663 bool AddressDetector::IsZipValidForState(const Word& word, size_t state_index)
664 {
665 enum USState {
666 AP = -4, // AP (military base in the Pacific)
667 AA = -3, // AA (military base inside the US)
668 AE = -2, // AE (military base outside the US)
669 XX = -1, // (not in use)
670 AK = 0, // AK Alaska
671 AL = 1, // AL Alabama
672 AR = 2, // AR Arkansas
673 AS = 3, // AS American Samoa
674 AZ = 4, // AZ Arizona
675 CA = 5, // CA California
676 CO = 6, // CO Colorado
677 CT = 7, // CT Connecticut
678 DC = 8, // DC District of Columbia
679 DE = 9, // DE Delaware
680 FL = 10, // FL Florida
681 FM = 11, // FM Federated States of Micronesia
682 GA = 12, // GA Georgia
683 GU = 13, // GU Guam
684 HI = 14, // HI Hawaii
685 IA = 15, // IA Iowa
686 ID = 16, // ID Idaho
687 IL = 17, // IL Illinois
688 IN = 18, // IN Indiana
689 KS = 19, // KS Kansas
690 KY = 20, // KY Kentucky
691 LA = 21, // LA Louisiana
692 MA = 22, // MA Massachusetts
693 MD = 23, // MD Maryland
694 ME = 24, // ME Maine
695 MH = 25, // MH Marshall Islands
696 MI = 26, // MI Michigan
697 MN = 27, // MN Minnesota
698 MO = 28, // MO Missouri
699 MP = 29, // MP Northern Mariana Islands
700 MS = 30, // MS Mississippi
701 MT = 31, // MT Montana
702 NC = 32, // NC North Carolina
703 ND = 33, // ND North Dakota
704 NE = 34, // NE Nebraska
705 NH = 35, // NH New Hampshire
706 NJ = 36, // NJ New Jersey
707 NM = 37, // NM New Mexico
708 NV = 38, // NV Nevada
709 NY = 39, // NY New York
710 OH = 40, // OH Ohio
711 OK = 41, // OK Oklahoma
712 OR = 42, // OR Oregon
713 PA = 43, // PA Pennsylvania
714 PR = 44, // PR Puerto Rico
715 PW = 45, // PW Palau
716 RI = 46, // RI Rhode Island
717 SC = 47, // SC South Carolina
718 SD = 48, // SD South Dakota
719 TN = 49, // TN Tennessee
720 TX = 50, // TX Texas
721 UT = 51, // UT Utah
722 VA = 52, // VA Virginia
723 VI = 53, // VI Virgin Islands
724 VT = 54, // VT Vermont
725 WA = 55, // WA Washington
726 WI = 56, // WI Wisconsin
727 WV = 57, // WV West Virginia
728 WY = 58, // WY Wyoming
729 };
730
731 static const USState stateForZipPrefix[] = {
732 // 0 1 2 3 4 5 6 7 8 9
733 XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009
734 MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019
735 MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029
736 NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039
737 ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049
738 VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059
739 CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069
740 NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079
741 NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089
742 AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099
743 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109
744 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119
745 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129
746 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139
747 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149
748 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159
749 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169
750 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179
751 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189
752 PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199
753 DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209
754 MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219
755 VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229
756 VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239
757 VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249
758 WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259
759 WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269
760 NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279
761 NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289
762 SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299
763 GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309
764 GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319
765 FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329
766 FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339
767 AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349
768 AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359
769 AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369
770 TN, TN, TN, TN, TN, TN, TN, TN, TN, TN, // 370-379
771 TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389
772 MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399
773 KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409
774 KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419
775 KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429
776 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439
777 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449
778 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459
779 IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469
780 IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479
781 MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489
782 MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499
783 IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509
784 IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519
785 IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529
786 WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539
787 WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549
788 MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559
789 MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, // 560-569
790 SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579
791 ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589
792 MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599
793 IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609
794 IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619
795 IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629
796 MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639
797 MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649
798 MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659
799 KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669
800 KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679
801 NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689
802 NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699
803 LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709
804 LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719
805 AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729
806 OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739
807 OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749
808 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759
809 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769
810 TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779
811 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789
812 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799
813 CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809
814 CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819
815 WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829
816 WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839
817 UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849
818 AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859
819 AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869
820 NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879
821 NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889
822 NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899
823 CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909
824 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919
825 CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929
826 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939
827 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949
828 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959
829 CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969
830 OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979
831 WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989
832 WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999
833 };
834
835 if (!word.begin || !word.end || (word.end - word.begin) < 3)
836 return false;
837 const char16* zipPtr = word.begin;
838 if (zipPtr[0] < '0' || zipPtr[0] > '9' ||
839 zipPtr[1] < '0' || zipPtr[1] > '9' ||
840 zipPtr[2] < '0' || zipPtr[2] > '9')
841 return false;
842
843 int zip = zipPtr[0] - '0';
844 zip *= 10;
845 zip += zipPtr[1] - '0';
846 zip *= 10;
847 zip += zipPtr[2] - '0';
848 return stateForZipPrefix[zip] == (int) state_index;
849 }
850
851 static const char* s_rawStreetSuffixes[] = {
852 "allee", "alley", "ally", "aly",
853 "anex", "annex", "anx", "arc", "arcade", "av", "ave", "aven", "avenu",
854 "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend",
855 "bg", "bgs", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd",
856 "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch",
857 "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks",
858 "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu",
859 "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent",
860 "center", "centers", "centr", "centre", "cir", "circ", "circl",
861 "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff",
862 "cliffs", "club", "cmn", "cmp", "cnter", "cntr", "cnyn", "common",
863 "cor", "corner", "corners", "cors", "course", "court", "courts", "cove",
864 "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres",
865 "crescent", "cresent", "crest", "crk", "crossing", "crossroad",
866 "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt",
867 "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn",
868 "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive",
869 "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests",
870 "exp", "expr", "express", "expressway", "expw", "expy", "ext",
871 "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls",
872 "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls",
873 "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge",
874 "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy",
875 "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy",
876 "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy",
877 "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden",
878 "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove",
879 "groves", "grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors",
880 "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts",
881 "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw",
882 "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts",
883 "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands",
884 "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn",
885 "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key",
886 "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la",
887 "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks",
888 "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks",
889 "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop",
890 "loops", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows",
891 "medows", "mews", "mill", "mills", "mission", "missn", "ml", "mls",
892 "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount",
893 "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn",
894 "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd",
895 "oval", "overpass", "ovl", "park", "parks", "parkway", "parkways",
896 "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine",
897 "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain",
898 "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne",
899 "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie",
900 "prk", "prr", "prt", "prts", "psge", "pt", "pts", "rad", "radial",
901 "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd",
902 "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge", "ridges", "riv", "river",
903 "rivr", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds",
904 "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals",
905 "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway",
906 "smt", "spg", "spgs", "spng", "spngs", "spring", "springs", "sprng",
907 "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ",
908 "square", "squares", "st", "sta", "station", "statn", "stn", "str",
909 "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream",
910 "street", "streets", "streme", "strm", "strt", "strvn", "strvnue",
911 "sts", "sumit", "sumitt", "summit", "ter", "terr", "terrace",
912 "throughway", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks",
913 "trafficway", "trail", "trails", "trak", "trce", "trfy", "trk", "trks",
914 "trl", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls",
915 "tunnel", "tunnels", "tunnl", "turnpike", "turnpk", "un", "underpass",
916 "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct",
917 "via", "viadct", "viaduct", "view", "views", "vill", "villag",
918 "village", "villages", "ville", "villg", "villiage", "vis", "vist",
919 "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta",
920 "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells",
921 "wl", "wls", "wy", "xing", "xrd",
922 0,
923 };
924
IsValidLocationName(const Word & word)925 bool AddressDetector::IsValidLocationName(const Word& word) {
926 using namespace WTF;
927 static HashSet<String> streetNames;
928 if (!streetNames.size()) {
929 const char** suffixes = s_rawStreetSuffixes;
930 while (const char* suffix = *suffixes) {
931 int index = suffix[0] - 'a';
932 streetNames.add(suffix);
933 suffixes++;
934 }
935 }
936 char16 first_letter = base::ToLowerASCII(*word.begin);
937 if (first_letter > 'z' || first_letter < 'a')
938 return false;
939 int index = first_letter - 'a';
940 int length = std::distance(word.begin, word.end);
941 if (*word.end == '.')
942 length--;
943 String value(word.begin, length);
944 return streetNames.contains(value.lower());
945 }
946