1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package android.webkit; 18 19 import java.util.Locale; 20 import java.util.regex.MatchResult; 21 import java.util.regex.Matcher; 22 import java.util.regex.Pattern; 23 24 /** 25 * Java implementation of legacy WebView.findAddress algorithm. 26 * 27 * @hide 28 */ 29 class FindAddress { 30 static class ZipRange { 31 int mLow; 32 int mHigh; 33 int mException1; 34 int mException2; ZipRange(int low, int high, int exception1, int exception2)35 ZipRange(int low, int high, int exception1, int exception2) { 36 mLow = low; 37 mHigh = high; 38 mException1 = exception1; 39 mException2 = exception1; 40 } matches(String zipCode)41 boolean matches(String zipCode) { 42 int prefix = Integer.parseInt(zipCode.substring(0, 2)); 43 return (mLow <= prefix && prefix <= mHigh) || prefix == mException1 44 || prefix == mException2; 45 } 46 } 47 48 // Addresses consist of at least this many words, not including state and zip code. 49 private static final int MIN_ADDRESS_WORDS = 4; 50 51 // Adddresses consist of at most this many words, not including state and zip code. 52 private static final int MAX_ADDRESS_WORDS = 14; 53 54 // Addresses consist of at most this many lines. 55 private static final int MAX_ADDRESS_LINES = 5; 56 57 // No words in an address are longer than this many characters. 58 private static final int kMaxAddressNameWordLength = 25; 59 60 // Location name should be in the first MAX_LOCATION_NAME_DISTANCE words 61 private static final int MAX_LOCATION_NAME_DISTANCE = 5; 62 63 private static final ZipRange[] sStateZipCodeRanges = { 64 new ZipRange(99, 99, -1, -1), // AK Alaska. 65 new ZipRange(35, 36, -1, -1), // AL Alabama. 66 new ZipRange(71, 72, -1, -1), // AR Arkansas. 67 new ZipRange(96, 96, -1, -1), // AS American Samoa. 68 new ZipRange(85, 86, -1, -1), // AZ Arizona. 69 new ZipRange(90, 96, -1, -1), // CA California. 70 new ZipRange(80, 81, -1, -1), // CO Colorado. 71 new ZipRange(6, 6, -1, -1), // CT Connecticut. 72 new ZipRange(20, 20, -1, -1), // DC District of Columbia. 73 new ZipRange(19, 19, -1, -1), // DE Delaware. 74 new ZipRange(32, 34, -1, -1), // FL Florida. 75 new ZipRange(96, 96, -1, -1), // FM Federated States of Micronesia. 76 new ZipRange(30, 31, -1, -1), // GA Georgia. 77 new ZipRange(96, 96, -1, -1), // GU Guam. 78 new ZipRange(96, 96, -1, -1), // HI Hawaii. 79 new ZipRange(50, 52, -1, -1), // IA Iowa. 80 new ZipRange(83, 83, -1, -1), // ID Idaho. 81 new ZipRange(60, 62, -1, -1), // IL Illinois. 82 new ZipRange(46, 47, -1, -1), // IN Indiana. 83 new ZipRange(66, 67, 73, -1), // KS Kansas. 84 new ZipRange(40, 42, -1, -1), // KY Kentucky. 85 new ZipRange(70, 71, -1, -1), // LA Louisiana. 86 new ZipRange(1, 2, -1, -1), // MA Massachusetts. 87 new ZipRange(20, 21, -1, -1), // MD Maryland. 88 new ZipRange(3, 4, -1, -1), // ME Maine. 89 new ZipRange(96, 96, -1, -1), // MH Marshall Islands. 90 new ZipRange(48, 49, -1, -1), // MI Michigan. 91 new ZipRange(55, 56, -1, -1), // MN Minnesota. 92 new ZipRange(63, 65, -1, -1), // MO Missouri. 93 new ZipRange(96, 96, -1, -1), // MP Northern Mariana Islands. 94 new ZipRange(38, 39, -1, -1), // MS Mississippi. 95 new ZipRange(55, 56, -1, -1), // MT Montana. 96 new ZipRange(27, 28, -1, -1), // NC North Carolina. 97 new ZipRange(58, 58, -1, -1), // ND North Dakota. 98 new ZipRange(68, 69, -1, -1), // NE Nebraska. 99 new ZipRange(3, 4, -1, -1), // NH New Hampshire. 100 new ZipRange(7, 8, -1, -1), // NJ New Jersey. 101 new ZipRange(87, 88, 86, -1), // NM New Mexico. 102 new ZipRange(88, 89, 96, -1), // NV Nevada. 103 new ZipRange(10, 14, 0, 6), // NY New York. 104 new ZipRange(43, 45, -1, -1), // OH Ohio. 105 new ZipRange(73, 74, -1, -1), // OK Oklahoma. 106 new ZipRange(97, 97, -1, -1), // OR Oregon. 107 new ZipRange(15, 19, -1, -1), // PA Pennsylvania. 108 new ZipRange(6, 6, 0, 9), // PR Puerto Rico. 109 new ZipRange(96, 96, -1, -1), // PW Palau. 110 new ZipRange(2, 2, -1, -1), // RI Rhode Island. 111 new ZipRange(29, 29, -1, -1), // SC South Carolina. 112 new ZipRange(57, 57, -1, -1), // SD South Dakota. 113 new ZipRange(37, 38, -1, -1), // TN Tennessee. 114 new ZipRange(75, 79, 87, 88), // TX Texas. 115 new ZipRange(84, 84, -1, -1), // UT Utah. 116 new ZipRange(22, 24, 20, -1), // VA Virginia. 117 new ZipRange(6, 9, -1, -1), // VI Virgin Islands. 118 new ZipRange(5, 5, -1, -1), // VT Vermont. 119 new ZipRange(98, 99, -1, -1), // WA Washington. 120 new ZipRange(53, 54, -1, -1), // WI Wisconsin. 121 new ZipRange(24, 26, -1, -1), // WV West Virginia. 122 new ZipRange(82, 83, -1, -1) // WY Wyoming. 123 }; 124 125 // Newlines 126 private static final String NL = "\n\u000B\u000C\r\u0085\u2028\u2029"; 127 128 // Space characters 129 private static final String SP = "\u0009\u0020\u00A0\u1680\u2000\u2001" 130 + "\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u202F" 131 + "\u205F\u3000"; 132 133 // Whitespace 134 private static final String WS = SP + NL; 135 136 // Characters that are considered word delimiters. 137 private static final String WORD_DELIM = ",*\u2022" + WS; 138 139 // Lookahead for word end. 140 private static final String WORD_END = "(?=[" + WORD_DELIM + "]|$)"; 141 142 // Address words are a sequence of non-delimiter characters. 143 private static final Pattern sWordRe = 144 Pattern.compile("[^" + WORD_DELIM + "]+" + WORD_END, Pattern.CASE_INSENSITIVE); 145 146 // Characters that are considered suffix delimiters for house numbers. 147 private static final String HOUSE_POST_DELIM = ",\"'" + WS; 148 149 // Lookahead for house end. 150 private static final String HOUSE_END = "(?=[" + HOUSE_POST_DELIM + "]|$)"; 151 152 // Characters that are considered prefix delimiters for house numbers. 153 private static final String HOUSE_PRE_DELIM = ":" + HOUSE_POST_DELIM; 154 155 // A house number component is "one" or a number, optionally 156 // followed by a single alphabetic character, or 157 private static final String HOUSE_COMPONENT = "(?:one|\\d+([a-z](?=[^a-z]|$)|st|nd|rd|th)?)"; 158 159 // House numbers are a repetition of |HOUSE_COMPONENT|, separated by -, and followed by 160 // a delimiter character. 161 private static final Pattern sHouseNumberRe = 162 Pattern.compile(HOUSE_COMPONENT + "(?:-" + HOUSE_COMPONENT + ")*" + HOUSE_END, 163 Pattern.CASE_INSENSITIVE); 164 165 // XXX: do we want to accept whitespace other than 0x20 in state names? 166 private static final Pattern sStateRe = Pattern.compile("(?:" 167 + "(ak|alaska)|" 168 + "(al|alabama)|" 169 + "(ar|arkansas)|" 170 + "(as|american[" + SP + "]+samoa)|" 171 + "(az|arizona)|" 172 + "(ca|california)|" 173 + "(co|colorado)|" 174 + "(ct|connecticut)|" 175 + "(dc|district[" + SP + "]+of[" + SP + "]+columbia)|" 176 + "(de|delaware)|" 177 + "(fl|florida)|" 178 + "(fm|federated[" + SP + "]+states[" + SP + "]+of[" + SP + "]+micronesia)|" 179 + "(ga|georgia)|" 180 + "(gu|guam)|" 181 + "(hi|hawaii)|" 182 + "(ia|iowa)|" 183 + "(id|idaho)|" 184 + "(il|illinois)|" 185 + "(in|indiana)|" 186 + "(ks|kansas)|" 187 + "(ky|kentucky)|" 188 + "(la|louisiana)|" 189 + "(ma|massachusetts)|" 190 + "(md|maryland)|" 191 + "(me|maine)|" 192 + "(mh|marshall[" + SP + "]+islands)|" 193 + "(mi|michigan)|" 194 + "(mn|minnesota)|" 195 + "(mo|missouri)|" 196 + "(mp|northern[" + SP + "]+mariana[" + SP + "]+islands)|" 197 + "(ms|mississippi)|" 198 + "(mt|montana)|" 199 + "(nc|north[" + SP + "]+carolina)|" 200 + "(nd|north[" + SP + "]+dakota)|" 201 + "(ne|nebraska)|" 202 + "(nh|new[" + SP + "]+hampshire)|" 203 + "(nj|new[" + SP + "]+jersey)|" 204 + "(nm|new[" + SP + "]+mexico)|" 205 + "(nv|nevada)|" 206 + "(ny|new[" + SP + "]+york)|" 207 + "(oh|ohio)|" 208 + "(ok|oklahoma)|" 209 + "(or|oregon)|" 210 + "(pa|pennsylvania)|" 211 + "(pr|puerto[" + SP + "]+rico)|" 212 + "(pw|palau)|" 213 + "(ri|rhode[" + SP + "]+island)|" 214 + "(sc|south[" + SP + "]+carolina)|" 215 + "(sd|south[" + SP + "]+dakota)|" 216 + "(tn|tennessee)|" 217 + "(tx|texas)|" 218 + "(ut|utah)|" 219 + "(va|virginia)|" 220 + "(vi|virgin[" + SP + "]+islands)|" 221 + "(vt|vermont)|" 222 + "(wa|washington)|" 223 + "(wi|wisconsin)|" 224 + "(wv|west[" + SP + "]+virginia)|" 225 + "(wy|wyoming)" 226 + ")" + WORD_END, 227 Pattern.CASE_INSENSITIVE); 228 229 private static final Pattern sLocationNameRe = Pattern.compile("(?:" 230 + "alley|annex|arcade|ave[.]?|avenue|alameda|bayou|" 231 + "beach|bend|bluffs?|bottom|boulevard|branch|bridge|" 232 + "brooks?|burgs?|bypass|broadway|camino|camp|canyon|" 233 + "cape|causeway|centers?|circles?|cliffs?|club|common|" 234 + "corners?|course|courts?|coves?|creek|crescent|crest|" 235 + "crossing|crossroad|curve|circulo|dale|dam|divide|" 236 + "drives?|estates?|expressway|extensions?|falls?|ferry|" 237 + "fields?|flats?|fords?|forest|forges?|forks?|fort|" 238 + "freeway|gardens?|gateway|glens?|greens?|groves?|" 239 + "harbors?|haven|heights|highway|hills?|hollow|inlet|" 240 + "islands?|isle|junctions?|keys?|knolls?|lakes?|land|" 241 + "landing|lane|lights?|loaf|locks?|lodge|loop|mall|" 242 + "manors?|meadows?|mews|mills?|mission|motorway|mount|" 243 + "mountains?|neck|orchard|oval|overpass|parks?|" 244 + "parkways?|pass|passage|path|pike|pines?|plains?|" 245 + "plaza|points?|ports?|prairie|privada|radial|ramp|" 246 + "ranch|rapids?|rd[.]?|rest|ridges?|river|roads?|route|" 247 + "row|rue|run|shoals?|shores?|skyway|springs?|spurs?|" 248 + "squares?|station|stravenue|stream|st[.]?|streets?|" 249 + "summit|speedway|terrace|throughway|trace|track|" 250 + "trafficway|trail|tunnel|turnpike|underpass|unions?|" 251 + "valleys?|viaduct|views?|villages?|ville|vista|walks?|" 252 + "wall|ways?|wells?|xing|xrd)" + WORD_END, 253 Pattern.CASE_INSENSITIVE); 254 255 private static final Pattern sSuffixedNumberRe = 256 Pattern.compile("(\\d+)(st|nd|rd|th)", Pattern.CASE_INSENSITIVE); 257 258 private static final Pattern sZipCodeRe = 259 Pattern.compile("(?:\\d{5}(?:-\\d{4})?)" + WORD_END, Pattern.CASE_INSENSITIVE); 260 checkHouseNumber(String houseNumber)261 private static boolean checkHouseNumber(String houseNumber) { 262 // Make sure that there are at most 5 digits. 263 int digitCount = 0; 264 for (int i = 0; i < houseNumber.length(); ++i) { 265 if (Character.isDigit(houseNumber.charAt(i))) ++digitCount; 266 } 267 if (digitCount > 5) return false; 268 269 // Make sure that any ordinals are valid. 270 Matcher suffixMatcher = sSuffixedNumberRe.matcher(houseNumber); 271 while (suffixMatcher.find()) { 272 int num = Integer.parseInt(suffixMatcher.group(1)); 273 if (num == 0) { 274 return false; // 0th is invalid. 275 } 276 String suffix = suffixMatcher.group(2).toLowerCase(Locale.getDefault()); 277 switch (num % 10) { 278 case 1: 279 return suffix.equals(num % 100 == 11 ? "th" : "st"); 280 case 2: 281 return suffix.equals(num % 100 == 12 ? "th" : "nd"); 282 case 3: 283 return suffix.equals(num % 100 == 13 ? "th" : "rd"); 284 default: 285 return suffix.equals("th"); 286 } 287 } 288 return true; 289 } 290 291 /** 292 * Attempt to match a house number beginnning at position offset 293 * in content. The house number must be followed by a word 294 * delimiter or the end of the string, and if offset is non-zero, 295 * then it must also be preceded by a word delimiter. 296 * 297 * @return a MatchResult if a valid house number was found. 298 */ matchHouseNumber(String content, int offset)299 private static MatchResult matchHouseNumber(String content, int offset) { 300 if (offset > 0 && HOUSE_PRE_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null; 301 Matcher matcher = sHouseNumberRe.matcher(content).region(offset, content.length()); 302 if (matcher.lookingAt()) { 303 MatchResult matchResult = matcher.toMatchResult(); 304 if (checkHouseNumber(matchResult.group(0))) return matchResult; 305 } 306 return null; 307 } 308 309 /** 310 * Attempt to match a US state beginnning at position offset in 311 * content. The matching state must be followed by a word 312 * delimiter or the end of the string, and if offset is non-zero, 313 * then it must also be preceded by a word delimiter. 314 * 315 * @return a MatchResult if a valid US state (or two letter code) 316 * was found. 317 */ matchState(String content, int offset)318 private static MatchResult matchState(String content, int offset) { 319 if (offset > 0 && WORD_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null; 320 Matcher stateMatcher = sStateRe.matcher(content).region(offset, content.length()); 321 return stateMatcher.lookingAt() ? stateMatcher.toMatchResult() : null; 322 } 323 324 /** 325 * Test whether zipCode matches the U.S. zip code format (ddddd or 326 * ddddd-dddd) and is within the expected range, given that 327 * stateMatch is a match of sStateRe. 328 * 329 * @return true if zipCode is a valid zip code, is legal for the 330 * matched state, and is followed by a word delimiter or the end 331 * of the string. 332 */ isValidZipCode(String zipCode, MatchResult stateMatch)333 private static boolean isValidZipCode(String zipCode, MatchResult stateMatch) { 334 if (stateMatch == null) return false; 335 // Work out the index of the state, based on which group matched. 336 int stateIndex = stateMatch.groupCount(); 337 while (stateIndex > 0) { 338 if (stateMatch.group(stateIndex--) != null) break; 339 } 340 return sZipCodeRe.matcher(zipCode).matches() 341 && sStateZipCodeRanges[stateIndex].matches(zipCode); 342 } 343 344 /** 345 * Test whether location is one of the valid locations. 346 * 347 * @return true if location starts with a valid location name 348 * followed by a word delimiter or the end of the string. 349 */ isValidLocationName(String location)350 private static boolean isValidLocationName(String location) { 351 return sLocationNameRe.matcher(location).matches(); 352 } 353 354 /** 355 * Attempt to match a complete address in content, starting with 356 * houseNumberMatch. 357 * 358 * @param content The string to search. 359 * @param houseNumberMatch A matching house number to start extending. 360 * @return +ve: the end of the match 361 * +ve: the position to restart searching for house numbers, negated. 362 */ attemptMatch(String content, MatchResult houseNumberMatch)363 private static int attemptMatch(String content, MatchResult houseNumberMatch) { 364 int restartPos = -1; 365 int nonZipMatch = -1; 366 int it = houseNumberMatch.end(); 367 int numLines = 1; 368 boolean consecutiveHouseNumbers = true; 369 boolean foundLocationName = false; 370 int wordCount = 1; 371 String lastWord = ""; 372 373 Matcher matcher = sWordRe.matcher(content); 374 375 for (; it < content.length(); lastWord = matcher.group(0), it = matcher.end()) { 376 if (!matcher.find(it)) { 377 // No more words in the input sequence. 378 return -content.length(); 379 } 380 if (matcher.end() - matcher.start() > kMaxAddressNameWordLength) { 381 // Word is too long to be part of an address. Fail. 382 return -matcher.end(); 383 } 384 385 // Count the number of newlines we just consumed. 386 while (it < matcher.start()) { 387 if (NL.indexOf(content.charAt(it++)) != -1) ++numLines; 388 } 389 390 // Consumed too many lines. Fail. 391 if (numLines > MAX_ADDRESS_LINES) break; 392 393 // Consumed too many words. Fail. 394 if (++wordCount > MAX_ADDRESS_WORDS) break; 395 396 if (matchHouseNumber(content, it) != null) { 397 if (consecutiveHouseNumbers && numLines > 1) { 398 // Last line ended with a number, and this this line starts with one. 399 // Restart at this number. 400 return -it; 401 } 402 // Remember the position of this match as the restart position. 403 if (restartPos == -1) restartPos = it; 404 continue; 405 } 406 407 consecutiveHouseNumbers = false; 408 409 if (isValidLocationName(matcher.group(0))) { 410 foundLocationName = true; 411 continue; 412 } 413 414 if (wordCount == MAX_LOCATION_NAME_DISTANCE && !foundLocationName) { 415 // Didn't find a location name in time. Fail. 416 it = matcher.end(); 417 break; 418 } 419 420 if (foundLocationName && wordCount > MIN_ADDRESS_WORDS) { 421 // We can now attempt to match a state. 422 MatchResult stateMatch = matchState(content, it); 423 if (stateMatch != null) { 424 if (lastWord.equals("et") && stateMatch.group(0).equals("al")) { 425 // Reject "et al" as a false postitive. 426 it = stateMatch.end(); 427 break; 428 } 429 430 // At this point we've matched a state; try to match a zip code after it. 431 Matcher zipMatcher = sWordRe.matcher(content); 432 if (zipMatcher.find(stateMatch.end())) { 433 if (isValidZipCode(zipMatcher.group(0), stateMatch)) { 434 return zipMatcher.end(); 435 } 436 } else { 437 // The content ends with a state but no zip 438 // code. This is a legal match according to the 439 // documentation. N.B. This is equivalent to the 440 // original c++ implementation, which only allowed 441 // the zip code to be optional at the end of the 442 // string, which presumably is a bug. We tried 443 // relaxing this to work in other places but it 444 // caused too many false positives. 445 nonZipMatch = stateMatch.end(); 446 } 447 } 448 } 449 } 450 451 if (nonZipMatch > 0) return nonZipMatch; 452 453 return -(restartPos > 0 ? restartPos : it); 454 } 455 456 /** 457 * Return the first matching address in content. 458 * 459 * @param content The string to search. 460 * @return The first valid address, or null if no address was matched. 461 */ findAddress(String content)462 static String findAddress(String content) { 463 Matcher houseNumberMatcher = sHouseNumberRe.matcher(content); 464 int start = 0; 465 while (houseNumberMatcher.find(start)) { 466 if (checkHouseNumber(houseNumberMatcher.group(0))) { 467 start = houseNumberMatcher.start(); 468 int end = attemptMatch(content, houseNumberMatcher); 469 if (end > 0) { 470 return content.substring(start, end); 471 } 472 start = -end; 473 } else { 474 start = houseNumberMatcher.end(); 475 } 476 } 477 return null; 478 } 479 } 480