1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.utils; 18 19 import android.content.ContentValues; 20 import android.content.Context; 21 import android.content.res.AssetManager; 22 import android.content.res.Resources; 23 import android.text.TextUtils; 24 import android.util.Log; 25 26 import com.android.inputmethod.latin.AssetFileAddress; 27 import com.android.inputmethod.latin.BinaryDictionaryGetter; 28 import com.android.inputmethod.latin.Constants; 29 import com.android.inputmethod.latin.R; 30 import com.android.inputmethod.latin.makedict.DictionaryHeader; 31 import com.android.inputmethod.latin.makedict.UnsupportedFormatException; 32 import com.android.inputmethod.latin.settings.SpacingAndPunctuations; 33 34 import java.io.File; 35 import java.io.IOException; 36 import java.util.ArrayList; 37 import java.util.Iterator; 38 import java.util.Locale; 39 import java.util.concurrent.TimeUnit; 40 41 /** 42 * This class encapsulates the logic for the Latin-IME side of dictionary information management. 43 */ 44 public class DictionaryInfoUtils { 45 private static final String TAG = DictionaryInfoUtils.class.getSimpleName(); 46 private static final String RESOURCE_PACKAGE_NAME = R.class.getPackage().getName(); 47 private static final String DEFAULT_MAIN_DICT = "main"; 48 private static final String MAIN_DICT_PREFIX = "main_"; 49 // 6 digits - unicode is limited to 21 bits 50 private static final int MAX_HEX_DIGITS_FOR_CODEPOINT = 6; 51 52 public static class DictionaryInfo { 53 private static final String LOCALE_COLUMN = "locale"; 54 private static final String WORDLISTID_COLUMN = "id"; 55 private static final String LOCAL_FILENAME_COLUMN = "filename"; 56 private static final String DESCRIPTION_COLUMN = "description"; 57 private static final String DATE_COLUMN = "date"; 58 private static final String FILESIZE_COLUMN = "filesize"; 59 private static final String VERSION_COLUMN = "version"; 60 public final String mId; 61 public final Locale mLocale; 62 public final String mDescription; 63 public final AssetFileAddress mFileAddress; 64 public final int mVersion; DictionaryInfo(final String id, final Locale locale, final String description, final AssetFileAddress fileAddress, final int version)65 public DictionaryInfo(final String id, final Locale locale, final String description, 66 final AssetFileAddress fileAddress, final int version) { 67 mId = id; 68 mLocale = locale; 69 mDescription = description; 70 mFileAddress = fileAddress; 71 mVersion = version; 72 } toContentValues()73 public ContentValues toContentValues() { 74 final ContentValues values = new ContentValues(); 75 values.put(WORDLISTID_COLUMN, mId); 76 values.put(LOCALE_COLUMN, mLocale.toString()); 77 values.put(DESCRIPTION_COLUMN, mDescription); 78 values.put(LOCAL_FILENAME_COLUMN, mFileAddress.mFilename); 79 values.put(DATE_COLUMN, TimeUnit.MILLISECONDS.toSeconds( 80 new File(mFileAddress.mFilename).lastModified())); 81 values.put(FILESIZE_COLUMN, mFileAddress.mLength); 82 values.put(VERSION_COLUMN, mVersion); 83 return values; 84 } 85 } 86 DictionaryInfoUtils()87 private DictionaryInfoUtils() { 88 // Private constructor to forbid instantation of this helper class. 89 } 90 91 /** 92 * Returns whether we may want to use this character as part of a file name. 93 * 94 * This basically only accepts ascii letters and numbers, and rejects everything else. 95 */ isFileNameCharacter(int codePoint)96 private static boolean isFileNameCharacter(int codePoint) { 97 if (codePoint >= 0x30 && codePoint <= 0x39) return true; // Digit 98 if (codePoint >= 0x41 && codePoint <= 0x5A) return true; // Uppercase 99 if (codePoint >= 0x61 && codePoint <= 0x7A) return true; // Lowercase 100 return codePoint == '_'; // Underscore 101 } 102 103 /** 104 * Escapes a string for any characters that may be suspicious for a file or directory name. 105 * 106 * Concretely this does a sort of URL-encoding except it will encode everything that's not 107 * alphanumeric or underscore. (true URL-encoding leaves alone characters like '*', which 108 * we cannot allow here) 109 */ 110 // TODO: create a unit test for this method replaceFileNameDangerousCharacters(final String name)111 public static String replaceFileNameDangerousCharacters(final String name) { 112 // This assumes '%' is fully available as a non-separator, normal 113 // character in a file name. This is probably true for all file systems. 114 final StringBuilder sb = new StringBuilder(); 115 final int nameLength = name.length(); 116 for (int i = 0; i < nameLength; i = name.offsetByCodePoints(i, 1)) { 117 final int codePoint = name.codePointAt(i); 118 if (DictionaryInfoUtils.isFileNameCharacter(codePoint)) { 119 sb.appendCodePoint(codePoint); 120 } else { 121 sb.append(String.format((Locale)null, "%%%1$0" + MAX_HEX_DIGITS_FOR_CODEPOINT + "x", 122 codePoint)); 123 } 124 } 125 return sb.toString(); 126 } 127 128 /** 129 * Helper method to get the top level cache directory. 130 */ getWordListCacheDirectory(final Context context)131 private static String getWordListCacheDirectory(final Context context) { 132 return context.getFilesDir() + File.separator + "dicts"; 133 } 134 135 /** 136 * Helper method to get the top level temp directory. 137 */ getWordListTempDirectory(final Context context)138 public static String getWordListTempDirectory(final Context context) { 139 return context.getFilesDir() + File.separator + "tmp"; 140 } 141 142 /** 143 * Reverse escaping done by replaceFileNameDangerousCharacters. 144 */ getWordListIdFromFileName(final String fname)145 public static String getWordListIdFromFileName(final String fname) { 146 final StringBuilder sb = new StringBuilder(); 147 final int fnameLength = fname.length(); 148 for (int i = 0; i < fnameLength; i = fname.offsetByCodePoints(i, 1)) { 149 final int codePoint = fname.codePointAt(i); 150 if ('%' != codePoint) { 151 sb.appendCodePoint(codePoint); 152 } else { 153 // + 1 to pass the % sign 154 final int encodedCodePoint = Integer.parseInt( 155 fname.substring(i + 1, i + 1 + MAX_HEX_DIGITS_FOR_CODEPOINT), 16); 156 i += MAX_HEX_DIGITS_FOR_CODEPOINT; 157 sb.appendCodePoint(encodedCodePoint); 158 } 159 } 160 return sb.toString(); 161 } 162 163 /** 164 * Helper method to the list of cache directories, one for each distinct locale. 165 */ getCachedDirectoryList(final Context context)166 public static File[] getCachedDirectoryList(final Context context) { 167 return new File(DictionaryInfoUtils.getWordListCacheDirectory(context)).listFiles(); 168 } 169 170 /** 171 * Returns the category for a given file name. 172 * 173 * This parses the file name, extracts the category, and returns it. See 174 * {@link #getMainDictId(Locale)} and {@link #isMainWordListId(String)}. 175 * @return The category as a string or null if it can't be found in the file name. 176 */ getCategoryFromFileName(final String fileName)177 public static String getCategoryFromFileName(final String fileName) { 178 final String id = getWordListIdFromFileName(fileName); 179 final String[] idArray = id.split(BinaryDictionaryGetter.ID_CATEGORY_SEPARATOR); 180 // An id is supposed to be in format category:locale, so splitting on the separator 181 // should yield a 2-elements array 182 if (2 != idArray.length) return null; 183 return idArray[0]; 184 } 185 186 /** 187 * Find out the cache directory associated with a specific locale. 188 */ getCacheDirectoryForLocale(final String locale, final Context context)189 private static String getCacheDirectoryForLocale(final String locale, final Context context) { 190 final String relativeDirectoryName = replaceFileNameDangerousCharacters(locale); 191 final String absoluteDirectoryName = getWordListCacheDirectory(context) + File.separator 192 + relativeDirectoryName; 193 final File directory = new File(absoluteDirectoryName); 194 if (!directory.exists()) { 195 if (!directory.mkdirs()) { 196 Log.e(TAG, "Could not create the directory for locale" + locale); 197 } 198 } 199 return absoluteDirectoryName; 200 } 201 202 /** 203 * Generates a file name for the id and locale passed as an argument. 204 * 205 * In the current implementation the file name returned will always be unique for 206 * any id/locale pair, but please do not expect that the id can be the same for 207 * different dictionaries with different locales. An id should be unique for any 208 * dictionary. 209 * The file name is pretty much an URL-encoded version of the id inside a directory 210 * named like the locale, except it will also escape characters that look dangerous 211 * to some file systems. 212 * @param id the id of the dictionary for which to get a file name 213 * @param locale the locale for which to get the file name as a string 214 * @param context the context to use for getting the directory 215 * @return the name of the file to be created 216 */ getCacheFileName(String id, String locale, Context context)217 public static String getCacheFileName(String id, String locale, Context context) { 218 final String fileName = replaceFileNameDangerousCharacters(id); 219 return getCacheDirectoryForLocale(locale, context) + File.separator + fileName; 220 } 221 isMainWordListId(final String id)222 public static boolean isMainWordListId(final String id) { 223 final String[] idArray = id.split(BinaryDictionaryGetter.ID_CATEGORY_SEPARATOR); 224 // An id is supposed to be in format category:locale, so splitting on the separator 225 // should yield a 2-elements array 226 if (2 != idArray.length) return false; 227 return BinaryDictionaryGetter.MAIN_DICTIONARY_CATEGORY.equals(idArray[0]); 228 } 229 230 /** 231 * Helper method to return a dictionary res id for a locale, or 0 if none. 232 * @param locale dictionary locale 233 * @return main dictionary resource id 234 */ getMainDictionaryResourceIdIfAvailableForLocale(final Resources res, final Locale locale)235 public static int getMainDictionaryResourceIdIfAvailableForLocale(final Resources res, 236 final Locale locale) { 237 int resId; 238 // Try to find main_language_country dictionary. 239 if (!locale.getCountry().isEmpty()) { 240 final String dictLanguageCountry = 241 MAIN_DICT_PREFIX + locale.toString().toLowerCase(Locale.ROOT); 242 if ((resId = res.getIdentifier( 243 dictLanguageCountry, "raw", RESOURCE_PACKAGE_NAME)) != 0) { 244 return resId; 245 } 246 } 247 248 // Try to find main_language dictionary. 249 final String dictLanguage = MAIN_DICT_PREFIX + locale.getLanguage(); 250 if ((resId = res.getIdentifier(dictLanguage, "raw", RESOURCE_PACKAGE_NAME)) != 0) { 251 return resId; 252 } 253 254 // Not found, return 0 255 return 0; 256 } 257 258 /** 259 * Returns a main dictionary resource id 260 * @param locale dictionary locale 261 * @return main dictionary resource id 262 */ getMainDictionaryResourceId(final Resources res, final Locale locale)263 public static int getMainDictionaryResourceId(final Resources res, final Locale locale) { 264 int resourceId = getMainDictionaryResourceIdIfAvailableForLocale(res, locale); 265 if (0 != resourceId) return resourceId; 266 return res.getIdentifier(DEFAULT_MAIN_DICT, "raw", RESOURCE_PACKAGE_NAME); 267 } 268 269 /** 270 * Returns the id associated with the main word list for a specified locale. 271 * 272 * Word lists stored in Android Keyboard's resources are referred to as the "main" 273 * word lists. Since they can be updated like any other list, we need to assign a 274 * unique ID to them. This ID is just the name of the language (locale-wise) they 275 * are for, and this method returns this ID. 276 */ getMainDictId(final Locale locale)277 public static String getMainDictId(final Locale locale) { 278 // This works because we don't include by default different dictionaries for 279 // different countries. This actually needs to return the id that we would 280 // like to use for word lists included in resources, and the following is okay. 281 return BinaryDictionaryGetter.MAIN_DICTIONARY_CATEGORY + 282 BinaryDictionaryGetter.ID_CATEGORY_SEPARATOR + locale.getLanguage().toString(); 283 } 284 getDictionaryFileHeaderOrNull(final File file)285 public static DictionaryHeader getDictionaryFileHeaderOrNull(final File file) { 286 return getDictionaryFileHeaderOrNull(file, 0, file.length()); 287 } 288 getDictionaryFileHeaderOrNull(final File file, final long offset, final long length)289 private static DictionaryHeader getDictionaryFileHeaderOrNull(final File file, 290 final long offset, final long length) { 291 try { 292 final DictionaryHeader header = 293 BinaryDictionaryUtils.getHeaderWithOffsetAndLength(file, offset, length); 294 return header; 295 } catch (UnsupportedFormatException e) { 296 return null; 297 } catch (IOException e) { 298 return null; 299 } 300 } 301 302 /** 303 * Returns information of the dictionary. 304 * 305 * @param fileAddress the asset dictionary file address. 306 * @return information of the specified dictionary. 307 */ createDictionaryInfoFromFileAddress( final AssetFileAddress fileAddress)308 private static DictionaryInfo createDictionaryInfoFromFileAddress( 309 final AssetFileAddress fileAddress) { 310 final DictionaryHeader header = getDictionaryFileHeaderOrNull( 311 new File(fileAddress.mFilename), fileAddress.mOffset, fileAddress.mLength); 312 if (header == null) { 313 return null; 314 } 315 final String id = header.getId(); 316 final Locale locale = LocaleUtils.constructLocaleFromString(header.getLocaleString()); 317 final String description = header.getDescription(); 318 final String version = header.getVersion(); 319 return new DictionaryInfo(id, locale, description, fileAddress, Integer.parseInt(version)); 320 } 321 addOrUpdateDictInfo(final ArrayList<DictionaryInfo> dictList, final DictionaryInfo newElement)322 private static void addOrUpdateDictInfo(final ArrayList<DictionaryInfo> dictList, 323 final DictionaryInfo newElement) { 324 final Iterator<DictionaryInfo> iter = dictList.iterator(); 325 while (iter.hasNext()) { 326 final DictionaryInfo thisDictInfo = iter.next(); 327 if (thisDictInfo.mLocale.equals(newElement.mLocale)) { 328 if (newElement.mVersion <= thisDictInfo.mVersion) { 329 return; 330 } 331 iter.remove(); 332 } 333 } 334 dictList.add(newElement); 335 } 336 getCurrentDictionaryFileNameAndVersionInfo( final Context context)337 public static ArrayList<DictionaryInfo> getCurrentDictionaryFileNameAndVersionInfo( 338 final Context context) { 339 final ArrayList<DictionaryInfo> dictList = new ArrayList<>(); 340 341 // Retrieve downloaded dictionaries 342 final File[] directoryList = getCachedDirectoryList(context); 343 if (null != directoryList) { 344 for (final File directory : directoryList) { 345 final String localeString = getWordListIdFromFileName(directory.getName()); 346 File[] dicts = BinaryDictionaryGetter.getCachedWordLists(localeString, context); 347 for (final File dict : dicts) { 348 final String wordListId = getWordListIdFromFileName(dict.getName()); 349 if (!DictionaryInfoUtils.isMainWordListId(wordListId)) continue; 350 final Locale locale = LocaleUtils.constructLocaleFromString(localeString); 351 final AssetFileAddress fileAddress = AssetFileAddress.makeFromFile(dict); 352 final DictionaryInfo dictionaryInfo = 353 createDictionaryInfoFromFileAddress(fileAddress); 354 // Protect against cases of a less-specific dictionary being found, like an 355 // en dictionary being used for an en_US locale. In this case, the en dictionary 356 // should be used for en_US but discounted for listing purposes. 357 if (dictionaryInfo == null || !dictionaryInfo.mLocale.equals(locale)) continue; 358 addOrUpdateDictInfo(dictList, dictionaryInfo); 359 } 360 } 361 } 362 363 // Retrieve files from assets 364 final Resources resources = context.getResources(); 365 final AssetManager assets = resources.getAssets(); 366 for (final String localeString : assets.getLocales()) { 367 final Locale locale = LocaleUtils.constructLocaleFromString(localeString); 368 final int resourceId = 369 DictionaryInfoUtils.getMainDictionaryResourceIdIfAvailableForLocale( 370 context.getResources(), locale); 371 if (0 == resourceId) continue; 372 final AssetFileAddress fileAddress = 373 BinaryDictionaryGetter.loadFallbackResource(context, resourceId); 374 final DictionaryInfo dictionaryInfo = createDictionaryInfoFromFileAddress(fileAddress); 375 // Protect against cases of a less-specific dictionary being found, like an 376 // en dictionary being used for an en_US locale. In this case, the en dictionary 377 // should be used for en_US but discounted for listing purposes. 378 if (!dictionaryInfo.mLocale.equals(locale)) continue; 379 addOrUpdateDictInfo(dictList, dictionaryInfo); 380 } 381 382 return dictList; 383 } 384 looksValidForDictionaryInsertion(final CharSequence text, final SpacingAndPunctuations spacingAndPunctuations)385 public static boolean looksValidForDictionaryInsertion(final CharSequence text, 386 final SpacingAndPunctuations spacingAndPunctuations) { 387 if (TextUtils.isEmpty(text)) return false; 388 final int length = text.length(); 389 if (length > Constants.DICTIONARY_MAX_WORD_LENGTH) { 390 return false; 391 } 392 int i = 0; 393 int digitCount = 0; 394 while (i < length) { 395 final int codePoint = Character.codePointAt(text, i); 396 final int charCount = Character.charCount(codePoint); 397 i += charCount; 398 if (Character.isDigit(codePoint)) { 399 // Count digits: see below 400 digitCount += charCount; 401 continue; 402 } 403 if (!spacingAndPunctuations.isWordCodePoint(codePoint)) return false; 404 } 405 // We reject strings entirely comprised of digits to avoid using PIN codes or credit 406 // card numbers. It would come in handy for word prediction though; a good example is 407 // when writing one's address where the street number is usually quite discriminative, 408 // as well as the postal code. 409 return digitCount < length; 410 } 411 } 412