• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.tools.lint.checks;
18 
19 import static com.android.SdkConstants.DOT_XML;
20 import static com.android.tools.lint.detector.api.LintUtils.assertionsEnabled;
21 
22 import com.android.annotations.NonNull;
23 import com.android.annotations.Nullable;
24 import com.android.annotations.VisibleForTesting;
25 import com.android.tools.lint.client.api.LintClient;
26 import com.android.tools.lint.detector.api.LintUtils;
27 import com.google.common.base.Charsets;
28 import com.google.common.base.Splitter;
29 import com.google.common.io.Files;
30 
31 import java.io.File;
32 import java.io.FileOutputStream;
33 import java.io.IOException;
34 import java.nio.ByteBuffer;
35 import java.nio.ByteOrder;
36 import java.nio.MappedByteBuffer;
37 import java.nio.channels.FileChannel.MapMode;
38 import java.util.ArrayList;
39 import java.util.Arrays;
40 import java.util.Comparator;
41 import java.util.List;
42 import java.util.WeakHashMap;
43 
44 /**
45  * Database of common typos / misspellings.
46  */
47 public class TypoLookup {
48     private static final TypoLookup NONE = new TypoLookup();
49 
50     /** String separating misspellings and suggested replacements in the text file */
51     private static final String WORD_SEPARATOR = "->";  //$NON-NLS-1$
52 
53     /** Relative path to the typos database file within the Lint installation */
54     private static final String XML_FILE_PATH = "tools/support/typos-%1$s.txt"; //$NON-NLS-1$
55     private static final String FILE_HEADER = "Typo database used by Android lint\000";
56     private static final int BINARY_FORMAT_VERSION = 2;
57     private static final boolean DEBUG_FORCE_REGENERATE_BINARY = false;
58     private static final boolean DEBUG_SEARCH = false;
59     private static final boolean WRITE_STATS = false;
60     /** Default size to reserve for each API entry when creating byte buffer to build up data */
61     private static final int BYTES_PER_ENTRY = 28;
62 
63     private final LintClient mClient;
64     private final File mXmlFile;
65     private final File mBinaryFile;
66     private byte[] mData;
67     private int[] mIndices;
68     private int mWordCount;
69 
70     private static WeakHashMap<String, TypoLookup> sInstanceMap =
71             new WeakHashMap<String, TypoLookup>();
72 
73     /**
74      * Returns an instance of the Typo database for the given locale
75      *
76      * @param client the client to associate with this database - used only for
77      *            logging. The database object may be shared among repeated
78      *            invocations, and in that case client used will be the one
79      *            originally passed in. In other words, this parameter may be
80      *            ignored if the client created is not new.
81      * @param locale the locale to look up a typo database for (should be a
82      *            language code (ISO 639-1, two lowercase character names)
83      * @param region the region to look up a typo database for (should be a two
84      *            letter ISO 3166-1 alpha-2 country code in upper case) language
85      *            code
86      * @return a (possibly shared) instance of the typo database, or null if its
87      *         data can't be found
88      */
89     @Nullable
get(@onNull LintClient client, @NonNull String locale, @Nullable String region)90     public static TypoLookup get(@NonNull LintClient client, @NonNull String locale,
91             @Nullable String region) {
92         synchronized (TypoLookup.class) {
93             String key = locale;
94 
95             if (region != null) {
96                 // Allow for region-specific dictionaries. See for example
97                 // http://en.wikipedia.org/wiki/American_and_British_English_spelling_differences
98                 assert region.length() == 2
99                         && Character.isUpperCase(region.charAt(0))
100                         && Character.isUpperCase(region.charAt(1)) : region;
101                 // Look for typos-en-rUS.txt etc
102                 key = locale + 'r' + region;
103             }
104 
105             TypoLookup db = sInstanceMap.get(key);
106             if (db == null) {
107                 String path = String.format(XML_FILE_PATH, key);
108                 File file = client.findResource(path);
109                 if (file == null) {
110                     // AOSP build environment?
111                     String build = System.getenv("ANDROID_BUILD_TOP");   //$NON-NLS-1$
112                     if (build != null) {
113                         file = new File(build, ("sdk/files/" //$NON-NLS-1$
114                                     + path.substring(path.lastIndexOf('/') + 1))
115                                       .replace('/', File.separatorChar));
116                     }
117                 }
118 
119                 if (file == null || !file.exists()) {
120                     if (region != null) {
121                         // Fall back to the generic locale (non-region-specific) database
122                         return get(client, locale, null);
123                     }
124                     db = NONE;
125                 } else {
126                     db = get(client, file);
127                     assert db != null : file;
128                 }
129                 sInstanceMap.put(key, db);
130             }
131 
132             if (db == NONE) {
133                 return null;
134             } else {
135                 return db;
136             }
137         }
138     }
139 
140     /**
141      * Returns an instance of the typo database
142      *
143      * @param client the client to associate with this database - used only for
144      *            logging
145      * @param xmlFile the XML file containing configuration data to use for this
146      *            database
147      * @return a (possibly shared) instance of the typo database, or null
148      *         if its data can't be found
149      */
150     @Nullable
get(LintClient client, File xmlFile)151     private static TypoLookup get(LintClient client, File xmlFile) {
152         if (!xmlFile.exists()) {
153             client.log(null, "The typo database file %1$s does not exist", xmlFile);
154             return null;
155         }
156 
157         String name = xmlFile.getName();
158         if (LintUtils.endsWith(name, DOT_XML)) {
159             name = name.substring(0, name.length() - DOT_XML.length());
160         }
161         File cacheDir = client.getCacheDir(true/*create*/);
162         if (cacheDir == null) {
163             cacheDir = xmlFile.getParentFile();
164         }
165 
166         File binaryData = new File(cacheDir, name
167                 // Incorporate version number in the filename to avoid upgrade filename
168                 // conflicts on Windows (such as issue #26663)
169                 + "-" + BINARY_FORMAT_VERSION + ".bin"); //$NON-NLS-1$ //$NON-NLS-2$
170 
171         if (DEBUG_FORCE_REGENERATE_BINARY) {
172             System.err.println("\nTemporarily regenerating binary data unconditionally \nfrom "
173                     + xmlFile + "\nto " + binaryData);
174             if (!createCache(client, xmlFile, binaryData)) {
175                 return null;
176             }
177         } else if (!binaryData.exists() || binaryData.lastModified() < xmlFile.lastModified()) {
178             if (!createCache(client, xmlFile, binaryData)) {
179                 return null;
180             }
181         }
182 
183         if (!binaryData.exists()) {
184             client.log(null, "The typo database file %1$s does not exist", binaryData);
185             return null;
186         }
187 
188         return new TypoLookup(client, xmlFile, binaryData);
189     }
190 
createCache(LintClient client, File xmlFile, File binaryData)191     private static boolean createCache(LintClient client, File xmlFile, File binaryData) {
192         long begin = 0;
193         if (WRITE_STATS) {
194             begin = System.currentTimeMillis();
195         }
196 
197         // Read in data
198         List<String> lines;
199         try {
200             lines = Files.readLines(xmlFile, Charsets.UTF_8);
201         } catch (IOException e) {
202             client.log(e, "Can't read typo database file");
203             return false;
204         }
205 
206         if (WRITE_STATS) {
207             long end = System.currentTimeMillis();
208             System.out.println("Reading data structures took " + (end - begin) + " ms)");
209         }
210 
211         try {
212             writeDatabase(binaryData, lines);
213             return true;
214         } catch (IOException ioe) {
215             client.log(ioe, "Can't write typo cache file");
216         }
217 
218         return false;
219     }
220 
221     /** Use one of the {@link #get} factory methods instead */
TypoLookup( @onNull LintClient client, @NonNull File xmlFile, @Nullable File binaryFile)222     private TypoLookup(
223             @NonNull LintClient client,
224             @NonNull File xmlFile,
225             @Nullable File binaryFile) {
226         mClient = client;
227         mXmlFile = xmlFile;
228         mBinaryFile = binaryFile;
229 
230         if (binaryFile != null) {
231             readData();
232         }
233     }
234 
TypoLookup()235     private TypoLookup() {
236         mClient = null;
237         mXmlFile = null;
238         mBinaryFile = null;
239     }
240 
readData()241     private void readData() {
242         if (!mBinaryFile.exists()) {
243             mClient.log(null, "%1$s does not exist", mBinaryFile);
244             return;
245         }
246         long start = System.currentTimeMillis();
247         try {
248             MappedByteBuffer buffer = Files.map(mBinaryFile, MapMode.READ_ONLY);
249             assert buffer.order() == ByteOrder.BIG_ENDIAN;
250 
251             // First skip the header
252             byte[] expectedHeader = FILE_HEADER.getBytes(Charsets.US_ASCII);
253             buffer.rewind();
254             for (int offset = 0; offset < expectedHeader.length; offset++) {
255                 if (expectedHeader[offset] != buffer.get()) {
256                     mClient.log(null, "Incorrect file header: not an typo database cache " +
257                             "file, or a corrupt cache file");
258                     return;
259                 }
260             }
261 
262             // Read in the format number
263             if (buffer.get() != BINARY_FORMAT_VERSION) {
264                 // Force regeneration of new binary data with up to date format
265                 if (createCache(mClient, mXmlFile, mBinaryFile)) {
266                     readData(); // Recurse
267                 }
268 
269                 return;
270             }
271 
272             mWordCount = buffer.getInt();
273 
274             // Read in the word table indices;
275             int count = mWordCount;
276             int[] offsets = new int[count];
277 
278             // Another idea: I can just store the DELTAS in the file (and add them up
279             // when reading back in) such that it takes just ONE byte instead of four!
280 
281             for (int i = 0; i < count; i++) {
282                 offsets[i] = buffer.getInt();
283             }
284 
285             // No need to read in the rest -- we'll just keep the whole byte array in memory
286             // TODO: Make this code smarter/more efficient.
287             int size = buffer.limit();
288             byte[] b = new byte[size];
289             buffer.rewind();
290             buffer.get(b);
291             mData = b;
292             mIndices = offsets;
293 
294             // TODO: We only need to keep the data portion here since we've initialized
295             // the offset array separately.
296             // TODO: Investigate (profile) accessing the byte buffer directly instead of
297             // accessing a byte array.
298         } catch (IOException e) {
299             mClient.log(e, null);
300         }
301         if (WRITE_STATS) {
302             long end = System.currentTimeMillis();
303             System.out.println("\nRead typo database in " + (end - start)
304                     + " milliseconds.");
305             System.out.println("Size of data table: " + mData.length + " bytes ("
306                     + Integer.toString(mData.length/1024) + "k)\n");
307         }
308     }
309 
310     /** See the {@link #readData()} for documentation on the data format. */
writeDatabase(File file, List<String> lines)311     private static void writeDatabase(File file, List<String> lines) throws IOException {
312         /*
313          * 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded
314          *     as ASCII characters. The purpose of the header is to identify what the file
315          *     is for, for anyone attempting to open the file.
316          * 2. A file version number. If the binary file does not match the reader's expected
317          *     version, it can ignore it (and regenerate the cache from XML).
318          */
319 
320         // Drop comments etc
321         List<String> words = new ArrayList<String>(lines.size());
322         for (String line : lines) {
323             if (!line.isEmpty() && Character.isLetter(line.charAt(0))) {
324                 int end = line.indexOf(WORD_SEPARATOR);
325                 if (end == -1) {
326                     end = line.trim().length();
327                 }
328                 String typo = line.substring(0, end).trim();
329                 String replacements = line.substring(end + WORD_SEPARATOR.length()).trim();
330                 if (replacements.isEmpty()) {
331                     // We don't support empty replacements
332                     continue;
333                 }
334                 String combined = typo + (char) 0 + replacements;
335 
336                 words.add(combined);
337             }
338         }
339 
340         byte[][] wordArrays = new byte[words.size()][];
341         for (int i = 0, n = words.size(); i < n; i++) {
342             String word = words.get(i);
343             wordArrays[i] = word.getBytes(Charsets.UTF_8);
344         }
345         // Sort words, using our own comparator to ensure that it matches the
346         // binary search in getTypos()
347         Comparator<byte[]> comparator = new Comparator<byte[]>() {
348             @Override
349             public int compare(byte[] o1, byte[] o2) {
350                 return TypoLookup.compare(o1, 0, (byte) 0, o2, 0, o2.length);
351             }
352         };
353         Arrays.sort(wordArrays, comparator);
354 
355         int entryCount = wordArrays.length;
356         int capacity = entryCount * BYTES_PER_ENTRY;
357         ByteBuffer buffer = ByteBuffer.allocate(capacity);
358         buffer.order(ByteOrder.BIG_ENDIAN);
359         //  1. A file header, which is the exact contents of {@link FILE_HEADER} encoded
360         //      as ASCII characters. The purpose of the header is to identify what the file
361         //      is for, for anyone attempting to open the file.
362         buffer.put(FILE_HEADER.getBytes(Charsets.US_ASCII));
363 
364         //  2. A file version number. If the binary file does not match the reader's expected
365         //      version, it can ignore it (and regenerate the cache from XML).
366         buffer.put((byte) BINARY_FORMAT_VERSION);
367 
368         //  3. The number of words [1 int]
369         buffer.putInt(entryCount);
370 
371         //  4. Word offset table (one integer per word, pointing to the byte offset in the
372         //       file (relative to the beginning of the file) where each word begins.
373         //       The words are always sorted alphabetically.
374         int wordOffsetTable = buffer.position();
375 
376         // Reserve enough room for the offset table here: we will backfill it with pointers
377         // as we're writing out the data structures below
378         for (int i = 0, n = entryCount; i < n; i++) {
379             buffer.putInt(0);
380         }
381 
382         int nextEntry = buffer.position();
383         int nextOffset = wordOffsetTable;
384 
385         // 7. Word entry table. Each word entry consists of the word, followed by the byte 0
386         //      as a terminator, followed by a comma separated list of suggestions (which
387         //      may be empty), or a final 0.
388         for (int i = 0; i < entryCount; i++) {
389             byte[] word = wordArrays[i];
390             buffer.position(nextOffset);
391             buffer.putInt(nextEntry);
392             nextOffset = buffer.position();
393             buffer.position(nextEntry);
394 
395             buffer.put(word); // already embeds 0 to separate typo from words
396             buffer.put((byte) 0);
397 
398             nextEntry = buffer.position();
399         }
400 
401         int size = buffer.position();
402         assert size <= buffer.limit();
403         buffer.mark();
404 
405         if (WRITE_STATS) {
406             System.out.println("Wrote " + words.size() + " word entries");
407             System.out.print("Actual binary size: " + size + " bytes");
408             System.out.println(String.format(" (%.1fM)", size/(1024*1024.f)));
409 
410             System.out.println("Allocated size: " + (entryCount * BYTES_PER_ENTRY) + " bytes");
411             System.out.println("Required bytes per entry: " + (size/ entryCount) + " bytes");
412         }
413 
414         // Now dump this out as a file
415         // There's probably an API to do this more efficiently; TODO: Look into this.
416         byte[] b = new byte[size];
417         buffer.rewind();
418         buffer.get(b);
419         FileOutputStream output = Files.newOutputStreamSupplier(file).getOutput();
420         output.write(b);
421         output.close();
422     }
423 
424     // For debugging only
dumpEntry(int offset)425     private String dumpEntry(int offset) {
426         if (DEBUG_SEARCH) {
427             int end = offset;
428             while (mData[end] != 0) {
429                 end++;
430             }
431             return new String(mData, offset, end - offset, Charsets.UTF_8);
432         } else {
433             return "<disabled>"; //$NON-NLS-1$
434         }
435     }
436 
437     /** Comparison function: *only* used for ASCII strings */
438     @VisibleForTesting
compare(byte[] data, int offset, byte terminator, CharSequence s, int begin, int end)439     static int compare(byte[] data, int offset, byte terminator, CharSequence s,
440             int begin, int end) {
441         int i = offset;
442         int j = begin;
443         for (; ; i++, j++) {
444             byte b = data[i];
445             if (b == ' ') {
446                 // We've matched up to the space in a split-word typo, such as
447                 // in German all zu=>allzu; here we've matched just past "all".
448                 // Rather than terminating, attempt to continue in the buffer.
449                 if (j == end) {
450                     int max = s.length();
451                     if (end < max && s.charAt(end) == ' ') {
452                         // Find next word
453                         for (; end < max; end++) {
454                             char c = s.charAt(end);
455                             if (!Character.isLetter(c)) {
456                                 if (c == ' ' && end == j) {
457                                     continue;
458                                 }
459                                 break;
460                             }
461                         }
462                     }
463                 }
464             }
465 
466             if (j == end) {
467                 break;
468             }
469 
470             if (b == '*') {
471                 // Glob match (only supported at the end)
472                 return 0;
473             }
474             char c = s.charAt(j);
475             byte cb = (byte) c;
476             int delta = b - cb;
477             if (delta != 0) {
478                 cb = (byte) Character.toLowerCase(c);
479                 if (b != cb) {
480                     // Ensure that it has the right sign
481                     b = (byte) Character.toLowerCase(b);
482                     delta = b - cb;
483                     if (delta != 0) {
484                         return delta;
485                     }
486                 }
487             }
488         }
489 
490         return data[i] - terminator;
491     }
492 
493     /** Comparison function used for general UTF-8 encoded strings */
494     @VisibleForTesting
compare(byte[] data, int offset, byte terminator, byte[] s, int begin, int end)495     static int compare(byte[] data, int offset, byte terminator, byte[] s,
496             int begin, int end) {
497         int i = offset;
498         int j = begin;
499         for (; ; i++, j++) {
500             byte b = data[i];
501             if (b == ' ') {
502                 // We've matched up to the space in a split-word typo, such as
503                 // in German all zu=>allzu; here we've matched just past "all".
504                 // Rather than terminating, attempt to continue in the buffer.
505                 // We've matched up to the space in a split-word typo, such as
506                 // in German all zu=>allzu; here we've matched just past "all".
507                 // Rather than terminating, attempt to continue in the buffer.
508                 if (j == end) {
509                     int max = s.length;
510                     if (end < max && s[end] == ' ') {
511                         // Find next word
512                         for (; end < max; end++) {
513                             byte cb = s[end];
514                             if (!isLetter(cb)) {
515                                 if (cb == ' ' && end == j) {
516                                     continue;
517                                 }
518                                 break;
519                             }
520                         }
521                     }
522                 }
523             }
524 
525             if (j == end) {
526                 break;
527             }
528             if (b == '*') {
529                 // Glob match (only supported at the end)
530                 return 0;
531             }
532             byte cb = s[j];
533             int delta = b - cb;
534             if (delta != 0) {
535                 cb = toLowerCase(cb);
536                 b = toLowerCase(b);
537                 delta = b - cb;
538                 if (delta != 0) {
539                     return delta;
540                 }
541             }
542 
543             if (b == terminator || cb == terminator) {
544                 return delta;
545             }
546         }
547 
548         return data[i] - terminator;
549     }
550 
551     /**
552      * Look up whether this word is a typo, and if so, return the typo itself
553      * and one or more likely meanings
554      *
555      * @param text the string containing the word
556      * @param begin the index of the first character in the word
557      * @param end the index of the first character after the word. Note that the
558      *            search may extend <b>beyond</b> this index, if for example the
559      *            word matches a multi-word typo in the dictionary
560      * @return a list of the typo itself followed by the replacement strings if
561      *         the word represents a typo, and null otherwise
562      */
563     @Nullable
getTypos(@onNull CharSequence text, int begin, int end)564     public List<String> getTypos(@NonNull CharSequence text, int begin, int end) {
565         assert end <= text.length();
566 
567         if (assertionsEnabled()) {
568             for (int i = begin; i < end; i++) {
569                 char c = text.charAt(i);
570                 if (c >= 128) {
571                     assert false : "Call the UTF-8 version of this method instead";
572                     return null;
573                 }
574             }
575         }
576 
577         int low = 0;
578         int high = mWordCount - 1;
579         while (low <= high) {
580             int middle = (low + high) >>> 1;
581             int offset = mIndices[middle];
582 
583             if (DEBUG_SEARCH) {
584                 System.out.println("Comparing string " + text +" with entry at " + offset
585                         + ": " + dumpEntry(offset));
586             }
587 
588             // Compare the word at the given index.
589             int compare = compare(mData, offset, (byte) 0, text, begin, end);
590 
591             if (compare == 0) {
592                 offset = mIndices[middle];
593 
594                 // Don't allow matching uncapitalized words, such as "enlish", when
595                 // the dictionary word is capitalized, "Enlish".
596                 if (mData[offset] != text.charAt(begin)
597                         && Character.isLowerCase(text.charAt(begin))) {
598                     return null;
599                 }
600 
601                 // Make sure there is a case match; we only want to allow
602                 // matching capitalized words to capitalized typos or uncapitalized typos
603                 //  (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized
604                 // typos (e.g. "enlish" to "Enlish").
605                 String glob = null;
606                 for (int i = begin; ; i++) {
607                     byte b = mData[offset++];
608                     if (b == 0) {
609                         offset--;
610                         break;
611                     } else if (b == '*') {
612                         int globEnd = i;
613                         while (globEnd < text.length()
614                                 && Character.isLetter(text.charAt(globEnd))) {
615                             globEnd++;
616                         }
617                         glob = text.subSequence(i, globEnd).toString();
618                         break;
619                     }
620                     char c = text.charAt(i);
621                     byte cb = (byte) c;
622                     if (b != cb && i > begin) {
623                         return null;
624                     }
625                 }
626 
627                 return computeSuggestions(mIndices[middle], offset, glob);
628             }
629 
630             if (compare < 0) {
631                 low = middle + 1;
632             } else if (compare > 0) {
633                 high = middle - 1;
634             } else {
635                 assert false; // compare == 0 already handled above
636                 return null;
637             }
638         }
639 
640         return null;
641     }
642 
643     /**
644      * Look up whether this word is a typo, and if so, return the typo itself
645      * and one or more likely meanings
646      *
647      * @param utf8Text the string containing the word, encoded as UTF-8
648      * @param begin the index of the first character in the word
649      * @param end the index of the first character after the word. Note that the
650      *            search may extend <b>beyond</b> this index, if for example the
651      *            word matches a multi-word typo in the dictionary
652      * @return a list of the typo itself followed by the replacement strings if
653      *         the word represents a typo, and null otherwise
654      */
655     @Nullable
getTypos(@onNull byte[] utf8Text, int begin, int end)656     public List<String> getTypos(@NonNull byte[] utf8Text, int begin, int end) {
657         assert end <= utf8Text.length;
658 
659         int low = 0;
660         int high = mWordCount - 1;
661         while (low <= high) {
662             int middle = (low + high) >>> 1;
663             int offset = mIndices[middle];
664 
665             if (DEBUG_SEARCH) {
666                 String s = new String(Arrays.copyOfRange(utf8Text, begin, end), Charsets.UTF_8);
667                 System.out.println("Comparing string " + s +" with entry at " + offset
668                         + ": " + dumpEntry(offset));
669                 System.out.println("   middle=" + middle + ", low=" + low + ", high=" + high);
670             }
671 
672             // Compare the word at the given index.
673             int compare = compare(mData, offset, (byte) 0, utf8Text, begin, end);
674 
675             if (DEBUG_SEARCH) {
676                 System.out.println(" signum=" + (int)Math.signum(compare) + ", delta=" + compare);
677             }
678 
679             if (compare == 0) {
680                 offset = mIndices[middle];
681 
682                 // Don't allow matching uncapitalized words, such as "enlish", when
683                 // the dictionary word is capitalized, "Enlish".
684                 if (mData[offset] != utf8Text[begin] && isUpperCase(mData[offset])) {
685                     return null;
686                 }
687 
688                 // Make sure there is a case match; we only want to allow
689                 // matching capitalized words to capitalized typos or uncapitalized typos
690                 //  (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized
691                 // typos (e.g. "enlish" to "Enlish").
692                 String glob = null;
693                 for (int i = begin; ; i++) {
694                     byte b = mData[offset++];
695                     if (b == 0) {
696                         offset--;
697                         break;
698                     } else if (b == '*') {
699                         int globEnd = i;
700                         while (globEnd < utf8Text.length && isLetter(utf8Text[globEnd])) {
701                             globEnd++;
702                         }
703                         glob = new String(utf8Text, i, globEnd - i, Charsets.UTF_8);
704                         break;
705                     }
706                     byte cb = utf8Text[i];
707                     if (b != cb && i > begin) {
708                         return null;
709                     }
710                 }
711 
712                 return computeSuggestions(mIndices[middle], offset, glob);
713             }
714 
715             if (compare < 0) {
716                 low = middle + 1;
717             } else if (compare > 0) {
718                 high = middle - 1;
719             } else {
720                 assert false; // compare == 0 already handled above
721                 return null;
722             }
723         }
724 
725         return null;
726     }
727 
computeSuggestions(int begin, int offset, String glob)728     private List<String> computeSuggestions(int begin, int offset, String glob) {
729         String typo = new String(mData, begin, offset - begin, Charsets.UTF_8);
730 
731         if (glob != null) {
732             typo = typo.replaceAll("\\*", glob); //$NON-NLS-1$
733         }
734 
735         assert mData[offset] == 0;
736         offset++;
737         int replacementEnd = offset;
738         while (mData[replacementEnd] != 0) {
739             replacementEnd++;
740         }
741         String replacements = new String(mData, offset, replacementEnd - offset, Charsets.UTF_8);
742         List<String> words = new ArrayList<String>();
743         words.add(typo);
744 
745         // The first entry should be the typo itself. We need to pass this back since due
746         // to multi-match words and globbing it could extend beyond the initial word range
747 
748         for (String s : Splitter.on(',').omitEmptyStrings().trimResults().split(replacements)) {
749             if (glob != null) {
750                 // Need to append the glob string to each result
751                 words.add(s.replaceAll("\\*", glob)); //$NON-NLS-1$
752             } else {
753                 words.add(s);
754             }
755         }
756 
757         return words;
758     }
759 
760     // "Character" handling for bytes. This assumes that the bytes correspond to Unicode
761     // characters in the ISO 8859-1 range, which is are encoded the same way in UTF-8.
762     // This obviously won't work to for example uppercase to lowercase conversions for
763     // multi byte characters, which means we simply won't catch typos if the dictionaries
764     // contain these. None of the currently included dictionaries do. However, it does
765     // help us properly deal with punctuation and spacing characters.
766 
isUpperCase(byte b)767     static final boolean isUpperCase(byte b) {
768         return Character.isUpperCase((char) b);
769     }
770 
toLowerCase(byte b)771     static final byte toLowerCase(byte b) {
772         return (byte) Character.toLowerCase((char) b);
773     }
774 
isSpace(byte b)775     static final boolean isSpace(byte b) {
776         return Character.isWhitespace((char) b);
777     }
778 
isLetter(byte b)779     static final boolean isLetter(byte b) {
780         // Assume that multi byte characters represent letters in other languages.
781         // Obviously, it could be unusual punctuation etc but letters are more likely
782         // in this context.
783         return Character.isLetter((char) b) || (b & 0x80) != 0;
784     }
785 }
786