1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.tools.lint.checks; 18 19 import static com.android.SdkConstants.DOT_XML; 20 import static com.android.tools.lint.detector.api.LintUtils.assertionsEnabled; 21 22 import com.android.annotations.NonNull; 23 import com.android.annotations.Nullable; 24 import com.android.annotations.VisibleForTesting; 25 import com.android.tools.lint.client.api.LintClient; 26 import com.android.tools.lint.detector.api.LintUtils; 27 import com.google.common.base.Charsets; 28 import com.google.common.base.Splitter; 29 import com.google.common.io.Files; 30 31 import java.io.File; 32 import java.io.FileOutputStream; 33 import java.io.IOException; 34 import java.nio.ByteBuffer; 35 import java.nio.ByteOrder; 36 import java.nio.MappedByteBuffer; 37 import java.nio.channels.FileChannel.MapMode; 38 import java.util.ArrayList; 39 import java.util.Arrays; 40 import java.util.Comparator; 41 import java.util.List; 42 import java.util.WeakHashMap; 43 44 /** 45 * Database of common typos / misspellings. 46 */ 47 public class TypoLookup { 48 private static final TypoLookup NONE = new TypoLookup(); 49 50 /** String separating misspellings and suggested replacements in the text file */ 51 private static final String WORD_SEPARATOR = "->"; //$NON-NLS-1$ 52 53 /** Relative path to the typos database file within the Lint installation */ 54 private static final String XML_FILE_PATH = "tools/support/typos-%1$s.txt"; //$NON-NLS-1$ 55 private static final String FILE_HEADER = "Typo database used by Android lint\000"; 56 private static final int BINARY_FORMAT_VERSION = 2; 57 private static final boolean DEBUG_FORCE_REGENERATE_BINARY = false; 58 private static final boolean DEBUG_SEARCH = false; 59 private static final boolean WRITE_STATS = false; 60 /** Default size to reserve for each API entry when creating byte buffer to build up data */ 61 private static final int BYTES_PER_ENTRY = 28; 62 63 private final LintClient mClient; 64 private final File mXmlFile; 65 private final File mBinaryFile; 66 private byte[] mData; 67 private int[] mIndices; 68 private int mWordCount; 69 70 private static WeakHashMap<String, TypoLookup> sInstanceMap = 71 new WeakHashMap<String, TypoLookup>(); 72 73 /** 74 * Returns an instance of the Typo database for the given locale 75 * 76 * @param client the client to associate with this database - used only for 77 * logging. The database object may be shared among repeated 78 * invocations, and in that case client used will be the one 79 * originally passed in. In other words, this parameter may be 80 * ignored if the client created is not new. 81 * @param locale the locale to look up a typo database for (should be a 82 * language code (ISO 639-1, two lowercase character names) 83 * @param region the region to look up a typo database for (should be a two 84 * letter ISO 3166-1 alpha-2 country code in upper case) language 85 * code 86 * @return a (possibly shared) instance of the typo database, or null if its 87 * data can't be found 88 */ 89 @Nullable get(@onNull LintClient client, @NonNull String locale, @Nullable String region)90 public static TypoLookup get(@NonNull LintClient client, @NonNull String locale, 91 @Nullable String region) { 92 synchronized (TypoLookup.class) { 93 String key = locale; 94 95 if (region != null) { 96 // Allow for region-specific dictionaries. See for example 97 // http://en.wikipedia.org/wiki/American_and_British_English_spelling_differences 98 assert region.length() == 2 99 && Character.isUpperCase(region.charAt(0)) 100 && Character.isUpperCase(region.charAt(1)) : region; 101 // Look for typos-en-rUS.txt etc 102 key = locale + 'r' + region; 103 } 104 105 TypoLookup db = sInstanceMap.get(key); 106 if (db == null) { 107 String path = String.format(XML_FILE_PATH, key); 108 File file = client.findResource(path); 109 if (file == null) { 110 // AOSP build environment? 111 String build = System.getenv("ANDROID_BUILD_TOP"); //$NON-NLS-1$ 112 if (build != null) { 113 file = new File(build, ("sdk/files/" //$NON-NLS-1$ 114 + path.substring(path.lastIndexOf('/') + 1)) 115 .replace('/', File.separatorChar)); 116 } 117 } 118 119 if (file == null || !file.exists()) { 120 if (region != null) { 121 // Fall back to the generic locale (non-region-specific) database 122 return get(client, locale, null); 123 } 124 db = NONE; 125 } else { 126 db = get(client, file); 127 assert db != null : file; 128 } 129 sInstanceMap.put(key, db); 130 } 131 132 if (db == NONE) { 133 return null; 134 } else { 135 return db; 136 } 137 } 138 } 139 140 /** 141 * Returns an instance of the typo database 142 * 143 * @param client the client to associate with this database - used only for 144 * logging 145 * @param xmlFile the XML file containing configuration data to use for this 146 * database 147 * @return a (possibly shared) instance of the typo database, or null 148 * if its data can't be found 149 */ 150 @Nullable get(LintClient client, File xmlFile)151 private static TypoLookup get(LintClient client, File xmlFile) { 152 if (!xmlFile.exists()) { 153 client.log(null, "The typo database file %1$s does not exist", xmlFile); 154 return null; 155 } 156 157 String name = xmlFile.getName(); 158 if (LintUtils.endsWith(name, DOT_XML)) { 159 name = name.substring(0, name.length() - DOT_XML.length()); 160 } 161 File cacheDir = client.getCacheDir(true/*create*/); 162 if (cacheDir == null) { 163 cacheDir = xmlFile.getParentFile(); 164 } 165 166 File binaryData = new File(cacheDir, name 167 // Incorporate version number in the filename to avoid upgrade filename 168 // conflicts on Windows (such as issue #26663) 169 + "-" + BINARY_FORMAT_VERSION + ".bin"); //$NON-NLS-1$ //$NON-NLS-2$ 170 171 if (DEBUG_FORCE_REGENERATE_BINARY) { 172 System.err.println("\nTemporarily regenerating binary data unconditionally \nfrom " 173 + xmlFile + "\nto " + binaryData); 174 if (!createCache(client, xmlFile, binaryData)) { 175 return null; 176 } 177 } else if (!binaryData.exists() || binaryData.lastModified() < xmlFile.lastModified()) { 178 if (!createCache(client, xmlFile, binaryData)) { 179 return null; 180 } 181 } 182 183 if (!binaryData.exists()) { 184 client.log(null, "The typo database file %1$s does not exist", binaryData); 185 return null; 186 } 187 188 return new TypoLookup(client, xmlFile, binaryData); 189 } 190 createCache(LintClient client, File xmlFile, File binaryData)191 private static boolean createCache(LintClient client, File xmlFile, File binaryData) { 192 long begin = 0; 193 if (WRITE_STATS) { 194 begin = System.currentTimeMillis(); 195 } 196 197 // Read in data 198 List<String> lines; 199 try { 200 lines = Files.readLines(xmlFile, Charsets.UTF_8); 201 } catch (IOException e) { 202 client.log(e, "Can't read typo database file"); 203 return false; 204 } 205 206 if (WRITE_STATS) { 207 long end = System.currentTimeMillis(); 208 System.out.println("Reading data structures took " + (end - begin) + " ms)"); 209 } 210 211 try { 212 writeDatabase(binaryData, lines); 213 return true; 214 } catch (IOException ioe) { 215 client.log(ioe, "Can't write typo cache file"); 216 } 217 218 return false; 219 } 220 221 /** Use one of the {@link #get} factory methods instead */ TypoLookup( @onNull LintClient client, @NonNull File xmlFile, @Nullable File binaryFile)222 private TypoLookup( 223 @NonNull LintClient client, 224 @NonNull File xmlFile, 225 @Nullable File binaryFile) { 226 mClient = client; 227 mXmlFile = xmlFile; 228 mBinaryFile = binaryFile; 229 230 if (binaryFile != null) { 231 readData(); 232 } 233 } 234 TypoLookup()235 private TypoLookup() { 236 mClient = null; 237 mXmlFile = null; 238 mBinaryFile = null; 239 } 240 readData()241 private void readData() { 242 if (!mBinaryFile.exists()) { 243 mClient.log(null, "%1$s does not exist", mBinaryFile); 244 return; 245 } 246 long start = System.currentTimeMillis(); 247 try { 248 MappedByteBuffer buffer = Files.map(mBinaryFile, MapMode.READ_ONLY); 249 assert buffer.order() == ByteOrder.BIG_ENDIAN; 250 251 // First skip the header 252 byte[] expectedHeader = FILE_HEADER.getBytes(Charsets.US_ASCII); 253 buffer.rewind(); 254 for (int offset = 0; offset < expectedHeader.length; offset++) { 255 if (expectedHeader[offset] != buffer.get()) { 256 mClient.log(null, "Incorrect file header: not an typo database cache " + 257 "file, or a corrupt cache file"); 258 return; 259 } 260 } 261 262 // Read in the format number 263 if (buffer.get() != BINARY_FORMAT_VERSION) { 264 // Force regeneration of new binary data with up to date format 265 if (createCache(mClient, mXmlFile, mBinaryFile)) { 266 readData(); // Recurse 267 } 268 269 return; 270 } 271 272 mWordCount = buffer.getInt(); 273 274 // Read in the word table indices; 275 int count = mWordCount; 276 int[] offsets = new int[count]; 277 278 // Another idea: I can just store the DELTAS in the file (and add them up 279 // when reading back in) such that it takes just ONE byte instead of four! 280 281 for (int i = 0; i < count; i++) { 282 offsets[i] = buffer.getInt(); 283 } 284 285 // No need to read in the rest -- we'll just keep the whole byte array in memory 286 // TODO: Make this code smarter/more efficient. 287 int size = buffer.limit(); 288 byte[] b = new byte[size]; 289 buffer.rewind(); 290 buffer.get(b); 291 mData = b; 292 mIndices = offsets; 293 294 // TODO: We only need to keep the data portion here since we've initialized 295 // the offset array separately. 296 // TODO: Investigate (profile) accessing the byte buffer directly instead of 297 // accessing a byte array. 298 } catch (IOException e) { 299 mClient.log(e, null); 300 } 301 if (WRITE_STATS) { 302 long end = System.currentTimeMillis(); 303 System.out.println("\nRead typo database in " + (end - start) 304 + " milliseconds."); 305 System.out.println("Size of data table: " + mData.length + " bytes (" 306 + Integer.toString(mData.length/1024) + "k)\n"); 307 } 308 } 309 310 /** See the {@link #readData()} for documentation on the data format. */ writeDatabase(File file, List<String> lines)311 private static void writeDatabase(File file, List<String> lines) throws IOException { 312 /* 313 * 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded 314 * as ASCII characters. The purpose of the header is to identify what the file 315 * is for, for anyone attempting to open the file. 316 * 2. A file version number. If the binary file does not match the reader's expected 317 * version, it can ignore it (and regenerate the cache from XML). 318 */ 319 320 // Drop comments etc 321 List<String> words = new ArrayList<String>(lines.size()); 322 for (String line : lines) { 323 if (!line.isEmpty() && Character.isLetter(line.charAt(0))) { 324 int end = line.indexOf(WORD_SEPARATOR); 325 if (end == -1) { 326 end = line.trim().length(); 327 } 328 String typo = line.substring(0, end).trim(); 329 String replacements = line.substring(end + WORD_SEPARATOR.length()).trim(); 330 if (replacements.isEmpty()) { 331 // We don't support empty replacements 332 continue; 333 } 334 String combined = typo + (char) 0 + replacements; 335 336 words.add(combined); 337 } 338 } 339 340 byte[][] wordArrays = new byte[words.size()][]; 341 for (int i = 0, n = words.size(); i < n; i++) { 342 String word = words.get(i); 343 wordArrays[i] = word.getBytes(Charsets.UTF_8); 344 } 345 // Sort words, using our own comparator to ensure that it matches the 346 // binary search in getTypos() 347 Comparator<byte[]> comparator = new Comparator<byte[]>() { 348 @Override 349 public int compare(byte[] o1, byte[] o2) { 350 return TypoLookup.compare(o1, 0, (byte) 0, o2, 0, o2.length); 351 } 352 }; 353 Arrays.sort(wordArrays, comparator); 354 355 int entryCount = wordArrays.length; 356 int capacity = entryCount * BYTES_PER_ENTRY; 357 ByteBuffer buffer = ByteBuffer.allocate(capacity); 358 buffer.order(ByteOrder.BIG_ENDIAN); 359 // 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded 360 // as ASCII characters. The purpose of the header is to identify what the file 361 // is for, for anyone attempting to open the file. 362 buffer.put(FILE_HEADER.getBytes(Charsets.US_ASCII)); 363 364 // 2. A file version number. If the binary file does not match the reader's expected 365 // version, it can ignore it (and regenerate the cache from XML). 366 buffer.put((byte) BINARY_FORMAT_VERSION); 367 368 // 3. The number of words [1 int] 369 buffer.putInt(entryCount); 370 371 // 4. Word offset table (one integer per word, pointing to the byte offset in the 372 // file (relative to the beginning of the file) where each word begins. 373 // The words are always sorted alphabetically. 374 int wordOffsetTable = buffer.position(); 375 376 // Reserve enough room for the offset table here: we will backfill it with pointers 377 // as we're writing out the data structures below 378 for (int i = 0, n = entryCount; i < n; i++) { 379 buffer.putInt(0); 380 } 381 382 int nextEntry = buffer.position(); 383 int nextOffset = wordOffsetTable; 384 385 // 7. Word entry table. Each word entry consists of the word, followed by the byte 0 386 // as a terminator, followed by a comma separated list of suggestions (which 387 // may be empty), or a final 0. 388 for (int i = 0; i < entryCount; i++) { 389 byte[] word = wordArrays[i]; 390 buffer.position(nextOffset); 391 buffer.putInt(nextEntry); 392 nextOffset = buffer.position(); 393 buffer.position(nextEntry); 394 395 buffer.put(word); // already embeds 0 to separate typo from words 396 buffer.put((byte) 0); 397 398 nextEntry = buffer.position(); 399 } 400 401 int size = buffer.position(); 402 assert size <= buffer.limit(); 403 buffer.mark(); 404 405 if (WRITE_STATS) { 406 System.out.println("Wrote " + words.size() + " word entries"); 407 System.out.print("Actual binary size: " + size + " bytes"); 408 System.out.println(String.format(" (%.1fM)", size/(1024*1024.f))); 409 410 System.out.println("Allocated size: " + (entryCount * BYTES_PER_ENTRY) + " bytes"); 411 System.out.println("Required bytes per entry: " + (size/ entryCount) + " bytes"); 412 } 413 414 // Now dump this out as a file 415 // There's probably an API to do this more efficiently; TODO: Look into this. 416 byte[] b = new byte[size]; 417 buffer.rewind(); 418 buffer.get(b); 419 FileOutputStream output = Files.newOutputStreamSupplier(file).getOutput(); 420 output.write(b); 421 output.close(); 422 } 423 424 // For debugging only dumpEntry(int offset)425 private String dumpEntry(int offset) { 426 if (DEBUG_SEARCH) { 427 int end = offset; 428 while (mData[end] != 0) { 429 end++; 430 } 431 return new String(mData, offset, end - offset, Charsets.UTF_8); 432 } else { 433 return "<disabled>"; //$NON-NLS-1$ 434 } 435 } 436 437 /** Comparison function: *only* used for ASCII strings */ 438 @VisibleForTesting compare(byte[] data, int offset, byte terminator, CharSequence s, int begin, int end)439 static int compare(byte[] data, int offset, byte terminator, CharSequence s, 440 int begin, int end) { 441 int i = offset; 442 int j = begin; 443 for (; ; i++, j++) { 444 byte b = data[i]; 445 if (b == ' ') { 446 // We've matched up to the space in a split-word typo, such as 447 // in German all zu=>allzu; here we've matched just past "all". 448 // Rather than terminating, attempt to continue in the buffer. 449 if (j == end) { 450 int max = s.length(); 451 if (end < max && s.charAt(end) == ' ') { 452 // Find next word 453 for (; end < max; end++) { 454 char c = s.charAt(end); 455 if (!Character.isLetter(c)) { 456 if (c == ' ' && end == j) { 457 continue; 458 } 459 break; 460 } 461 } 462 } 463 } 464 } 465 466 if (j == end) { 467 break; 468 } 469 470 if (b == '*') { 471 // Glob match (only supported at the end) 472 return 0; 473 } 474 char c = s.charAt(j); 475 byte cb = (byte) c; 476 int delta = b - cb; 477 if (delta != 0) { 478 cb = (byte) Character.toLowerCase(c); 479 if (b != cb) { 480 // Ensure that it has the right sign 481 b = (byte) Character.toLowerCase(b); 482 delta = b - cb; 483 if (delta != 0) { 484 return delta; 485 } 486 } 487 } 488 } 489 490 return data[i] - terminator; 491 } 492 493 /** Comparison function used for general UTF-8 encoded strings */ 494 @VisibleForTesting compare(byte[] data, int offset, byte terminator, byte[] s, int begin, int end)495 static int compare(byte[] data, int offset, byte terminator, byte[] s, 496 int begin, int end) { 497 int i = offset; 498 int j = begin; 499 for (; ; i++, j++) { 500 byte b = data[i]; 501 if (b == ' ') { 502 // We've matched up to the space in a split-word typo, such as 503 // in German all zu=>allzu; here we've matched just past "all". 504 // Rather than terminating, attempt to continue in the buffer. 505 // We've matched up to the space in a split-word typo, such as 506 // in German all zu=>allzu; here we've matched just past "all". 507 // Rather than terminating, attempt to continue in the buffer. 508 if (j == end) { 509 int max = s.length; 510 if (end < max && s[end] == ' ') { 511 // Find next word 512 for (; end < max; end++) { 513 byte cb = s[end]; 514 if (!isLetter(cb)) { 515 if (cb == ' ' && end == j) { 516 continue; 517 } 518 break; 519 } 520 } 521 } 522 } 523 } 524 525 if (j == end) { 526 break; 527 } 528 if (b == '*') { 529 // Glob match (only supported at the end) 530 return 0; 531 } 532 byte cb = s[j]; 533 int delta = b - cb; 534 if (delta != 0) { 535 cb = toLowerCase(cb); 536 b = toLowerCase(b); 537 delta = b - cb; 538 if (delta != 0) { 539 return delta; 540 } 541 } 542 543 if (b == terminator || cb == terminator) { 544 return delta; 545 } 546 } 547 548 return data[i] - terminator; 549 } 550 551 /** 552 * Look up whether this word is a typo, and if so, return the typo itself 553 * and one or more likely meanings 554 * 555 * @param text the string containing the word 556 * @param begin the index of the first character in the word 557 * @param end the index of the first character after the word. Note that the 558 * search may extend <b>beyond</b> this index, if for example the 559 * word matches a multi-word typo in the dictionary 560 * @return a list of the typo itself followed by the replacement strings if 561 * the word represents a typo, and null otherwise 562 */ 563 @Nullable getTypos(@onNull CharSequence text, int begin, int end)564 public List<String> getTypos(@NonNull CharSequence text, int begin, int end) { 565 assert end <= text.length(); 566 567 if (assertionsEnabled()) { 568 for (int i = begin; i < end; i++) { 569 char c = text.charAt(i); 570 if (c >= 128) { 571 assert false : "Call the UTF-8 version of this method instead"; 572 return null; 573 } 574 } 575 } 576 577 int low = 0; 578 int high = mWordCount - 1; 579 while (low <= high) { 580 int middle = (low + high) >>> 1; 581 int offset = mIndices[middle]; 582 583 if (DEBUG_SEARCH) { 584 System.out.println("Comparing string " + text +" with entry at " + offset 585 + ": " + dumpEntry(offset)); 586 } 587 588 // Compare the word at the given index. 589 int compare = compare(mData, offset, (byte) 0, text, begin, end); 590 591 if (compare == 0) { 592 offset = mIndices[middle]; 593 594 // Don't allow matching uncapitalized words, such as "enlish", when 595 // the dictionary word is capitalized, "Enlish". 596 if (mData[offset] != text.charAt(begin) 597 && Character.isLowerCase(text.charAt(begin))) { 598 return null; 599 } 600 601 // Make sure there is a case match; we only want to allow 602 // matching capitalized words to capitalized typos or uncapitalized typos 603 // (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized 604 // typos (e.g. "enlish" to "Enlish"). 605 String glob = null; 606 for (int i = begin; ; i++) { 607 byte b = mData[offset++]; 608 if (b == 0) { 609 offset--; 610 break; 611 } else if (b == '*') { 612 int globEnd = i; 613 while (globEnd < text.length() 614 && Character.isLetter(text.charAt(globEnd))) { 615 globEnd++; 616 } 617 glob = text.subSequence(i, globEnd).toString(); 618 break; 619 } 620 char c = text.charAt(i); 621 byte cb = (byte) c; 622 if (b != cb && i > begin) { 623 return null; 624 } 625 } 626 627 return computeSuggestions(mIndices[middle], offset, glob); 628 } 629 630 if (compare < 0) { 631 low = middle + 1; 632 } else if (compare > 0) { 633 high = middle - 1; 634 } else { 635 assert false; // compare == 0 already handled above 636 return null; 637 } 638 } 639 640 return null; 641 } 642 643 /** 644 * Look up whether this word is a typo, and if so, return the typo itself 645 * and one or more likely meanings 646 * 647 * @param utf8Text the string containing the word, encoded as UTF-8 648 * @param begin the index of the first character in the word 649 * @param end the index of the first character after the word. Note that the 650 * search may extend <b>beyond</b> this index, if for example the 651 * word matches a multi-word typo in the dictionary 652 * @return a list of the typo itself followed by the replacement strings if 653 * the word represents a typo, and null otherwise 654 */ 655 @Nullable getTypos(@onNull byte[] utf8Text, int begin, int end)656 public List<String> getTypos(@NonNull byte[] utf8Text, int begin, int end) { 657 assert end <= utf8Text.length; 658 659 int low = 0; 660 int high = mWordCount - 1; 661 while (low <= high) { 662 int middle = (low + high) >>> 1; 663 int offset = mIndices[middle]; 664 665 if (DEBUG_SEARCH) { 666 String s = new String(Arrays.copyOfRange(utf8Text, begin, end), Charsets.UTF_8); 667 System.out.println("Comparing string " + s +" with entry at " + offset 668 + ": " + dumpEntry(offset)); 669 System.out.println(" middle=" + middle + ", low=" + low + ", high=" + high); 670 } 671 672 // Compare the word at the given index. 673 int compare = compare(mData, offset, (byte) 0, utf8Text, begin, end); 674 675 if (DEBUG_SEARCH) { 676 System.out.println(" signum=" + (int)Math.signum(compare) + ", delta=" + compare); 677 } 678 679 if (compare == 0) { 680 offset = mIndices[middle]; 681 682 // Don't allow matching uncapitalized words, such as "enlish", when 683 // the dictionary word is capitalized, "Enlish". 684 if (mData[offset] != utf8Text[begin] && isUpperCase(mData[offset])) { 685 return null; 686 } 687 688 // Make sure there is a case match; we only want to allow 689 // matching capitalized words to capitalized typos or uncapitalized typos 690 // (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized 691 // typos (e.g. "enlish" to "Enlish"). 692 String glob = null; 693 for (int i = begin; ; i++) { 694 byte b = mData[offset++]; 695 if (b == 0) { 696 offset--; 697 break; 698 } else if (b == '*') { 699 int globEnd = i; 700 while (globEnd < utf8Text.length && isLetter(utf8Text[globEnd])) { 701 globEnd++; 702 } 703 glob = new String(utf8Text, i, globEnd - i, Charsets.UTF_8); 704 break; 705 } 706 byte cb = utf8Text[i]; 707 if (b != cb && i > begin) { 708 return null; 709 } 710 } 711 712 return computeSuggestions(mIndices[middle], offset, glob); 713 } 714 715 if (compare < 0) { 716 low = middle + 1; 717 } else if (compare > 0) { 718 high = middle - 1; 719 } else { 720 assert false; // compare == 0 already handled above 721 return null; 722 } 723 } 724 725 return null; 726 } 727 computeSuggestions(int begin, int offset, String glob)728 private List<String> computeSuggestions(int begin, int offset, String glob) { 729 String typo = new String(mData, begin, offset - begin, Charsets.UTF_8); 730 731 if (glob != null) { 732 typo = typo.replaceAll("\\*", glob); //$NON-NLS-1$ 733 } 734 735 assert mData[offset] == 0; 736 offset++; 737 int replacementEnd = offset; 738 while (mData[replacementEnd] != 0) { 739 replacementEnd++; 740 } 741 String replacements = new String(mData, offset, replacementEnd - offset, Charsets.UTF_8); 742 List<String> words = new ArrayList<String>(); 743 words.add(typo); 744 745 // The first entry should be the typo itself. We need to pass this back since due 746 // to multi-match words and globbing it could extend beyond the initial word range 747 748 for (String s : Splitter.on(',').omitEmptyStrings().trimResults().split(replacements)) { 749 if (glob != null) { 750 // Need to append the glob string to each result 751 words.add(s.replaceAll("\\*", glob)); //$NON-NLS-1$ 752 } else { 753 words.add(s); 754 } 755 } 756 757 return words; 758 } 759 760 // "Character" handling for bytes. This assumes that the bytes correspond to Unicode 761 // characters in the ISO 8859-1 range, which is are encoded the same way in UTF-8. 762 // This obviously won't work to for example uppercase to lowercase conversions for 763 // multi byte characters, which means we simply won't catch typos if the dictionaries 764 // contain these. None of the currently included dictionaries do. However, it does 765 // help us properly deal with punctuation and spacing characters. 766 isUpperCase(byte b)767 static final boolean isUpperCase(byte b) { 768 return Character.isUpperCase((char) b); 769 } 770 toLowerCase(byte b)771 static final byte toLowerCase(byte b) { 772 return (byte) Character.toLowerCase((char) b); 773 } 774 isSpace(byte b)775 static final boolean isSpace(byte b) { 776 return Character.isWhitespace((char) b); 777 } 778 isLetter(byte b)779 static final boolean isLetter(byte b) { 780 // Assume that multi byte characters represent letters in other languages. 781 // Obviously, it could be unusual punctuation etc but letters are more likely 782 // in this context. 783 return Character.isLetter((char) b) || (b & 0x80) != 0; 784 } 785 } 786