/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.LinkedList;

import javax.annotation.Nonnull;

/**
 * Decodes binary files for a FusionDictionary.
 *
 * All the methods in this class are static.
 *
 * TODO: Move this file to makedict/internal.
 * TODO: Rename this class to DictDecoderUtils.
 */
public final class BinaryDictDecoderUtils {
    private BinaryDictDecoderUtils() {
        // This utility class is not publicly instantiable.
    }

    /**
     * Minimal cursor-style view over a dictionary buffer: sequential unsigned reads plus
     * position / limit / capacity accessors.
     */
    @UsedForTesting
    public interface DictBuffer {
        public int readUnsignedByte();
        public int readUnsignedShort();
        public int readUnsignedInt24();
        public int readInt();
        public int position();
        public void position(int newPosition);
        @UsedForTesting
        public void put(final byte b);
        public int limit();
        @UsedForTesting
        public int capacity();
    }

    /**
     * A {@link DictBuffer} that delegates every operation to a wrapped {@link ByteBuffer}.
     */
    public static final class ByteBufferDictBuffer implements DictBuffer {
        private final ByteBuffer mBuffer;

        public ByteBufferDictBuffer(final ByteBuffer buffer) {
            mBuffer = buffer;
        }

        @Override
        public int readUnsignedByte() {
            return mBuffer.get() & 0xFF;
        }

        @Override
        public int readUnsignedShort() {
            return mBuffer.getShort() & 0xFFFF;
        }

        @Override
        public int readUnsignedInt24() {
            // Most significant byte first, followed by the remaining 16 bits.
            final int retval = readUnsignedByte();
            return (retval << 16) + readUnsignedShort();
        }

        @Override
        public int readInt() {
            return mBuffer.getInt();
        }

        @Override
        public int position() {
            return mBuffer.position();
        }

        @Override
        public void position(int newPos) {
            mBuffer.position(newPos);
        }

        @Override
        public void put(final byte b) {
            mBuffer.put(b);
        }

        @Override
        public int limit() {
            return mBuffer.limit();
        }

        @Override
        public int capacity() {
            return mBuffer.capacity();
        }
    }

    /**
     * A class grouping utility function for our specific character encoding.
     */
    static final class CharEncoding {

        /**
         * Returns the code that {@code codePoint} is mapped to, or {@code codePoint} itself
         * when the map is null or has no entry for it.
         */
        private static int mapCodePoint(final int codePoint,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            if (codePointToOneByteCodeMap == null) {
                return codePoint;
            }
            final Integer mapped = codePointToOneByteCodeMap.get(codePoint);
            return mapped != null ? mapped : codePoint;
        }

        /**
         * Helper method to find out whether this code fits on one byte
         */
        private static boolean fitsOnOneByte(final int character,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            final int codePoint = mapCodePoint(character, codePointToOneByteCodeMap);
            return codePoint >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE
                    && codePoint <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
        }

        /**
         * Compute the size of a character given its character code.
         *
         * Char format is:
         * 1 byte = bbbbbbbb match
         * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
         * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
         *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
         *       00011111 would be outside unicode.
         * else: iso-latin-1 code
         * This allows for the whole unicode range to be encoded, including chars outside of
         * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
         * characters which should never happen anyway (and still work, but take 3 bytes).
         *
         * @param character the character code.
         * @param codePointToOneByteCodeMap the map to convert the code point, or null.
         * @return the size in binary encoded-form, either 1 or 3 bytes.
         */
        static int getCharSize(final int character,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            // See char encoding in FusionDictionary.java
            if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1;
            if (FormatSpec.INVALID_CHARACTER == character) return 1;
            return 3;
        }

        /**
         * Compute the byte size of a character array.
         */
        static int getCharArraySize(final int[] chars,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            int size = 0;
            for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap);
            return size;
        }

        /**
         * Encodes one code point into {@code buffer} starting at {@code index}.
         *
         * The encoded size is computed from the original code point, so it always agrees with
         * {@link #getCharSize}; the bytes written are those of the mapped code.
         *
         * @return the index just after the bytes written.
         */
        private static int writeCodePoint(final byte[] buffer, final int index,
                final int codePoint, final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            final int code = mapCodePoint(codePoint, codePointToOneByteCodeMap);
            int next = index;
            if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
                buffer[next++] = (byte) code;
            } else {
                buffer[next++] = (byte) (0xFF & (code >> 16));
                buffer[next++] = (byte) (0xFF & (code >> 8));
                buffer[next++] = (byte) (0xFF & code);
            }
            return next;
        }

        /**
         * Writes a char array to a byte buffer.
         *
         * @param codePoints the code point array to write.
         * @param buffer the byte buffer to write to.
         * @param fromIndex the index in buffer to write the character array to.
         * @param codePointToOneByteCodeMap the map to convert the code point.
         * @return the index after the last character.
         */
        static int writeCharArray(final int[] codePoints, final byte[] buffer, final int fromIndex,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            int index = fromIndex;
            for (final int codePoint : codePoints) {
                index = writeCodePoint(buffer, index, codePoint, codePointToOneByteCodeMap);
            }
            return index;
        }

        /**
         * Writes a string with our character format to a byte buffer.
         *
         * This will also write the terminator byte.
         *
         * @param buffer the byte buffer to write to.
         * @param origin the offset to write from.
         * @param word the string to write.
         * @param codePointToOneByteCodeMap the map to convert the code point, or null.
         * @return the size written, in bytes.
         */
        static int writeString(final byte[] buffer, final int origin, final String word,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            final int length = word.length();
            int index = origin;
            // Iterate by code point so that surrogate pairs are encoded as one character.
            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
                index = writeCodePoint(buffer, index, word.codePointAt(i),
                        codePointToOneByteCodeMap);
            }
            buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
            return index - origin;
        }

        /**
         * Writes a string with our character format to an OutputStream.
         *
         * This will also write the terminator byte.
         *
         * @param stream the OutputStream to write to.
         * @param word the string to write.
         * @param codePointToOneByteCodeMap the map to convert the code point, or null.
         * @return the size written, in bytes.
         * @throws IOException if writing to the stream fails.
         */
        static int writeString(final OutputStream stream, final String word,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) throws IOException {
            final int length = word.length();
            int written = 0;
            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
                final int codePoint = word.codePointAt(i);
                final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap);
                // Write the mapped code, like the byte[] variant does; with a null map this is
                // the code point itself.
                final int code = mapCodePoint(codePoint, codePointToOneByteCodeMap);
                if (1 == charSize) {
                    stream.write((byte) code);
                } else {
                    stream.write((byte) (0xFF & (code >> 16)));
                    stream.write((byte) (0xFF & (code >> 8)));
                    stream.write((byte) (0xFF & code));
                }
                written += charSize;
            }
            stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
            written += FormatSpec.PTNODE_TERMINATOR_SIZE;
            return written;
        }

        /**
         * Reads a string from a DictBuffer. This is the converse of the above method.
         */
        static String readString(final DictBuffer dictBuffer) {
            final StringBuilder s = new StringBuilder();
            int character = readChar(dictBuffer);
            while (character != FormatSpec.INVALID_CHARACTER) {
                s.appendCodePoint(character);
                character = readChar(dictBuffer);
            }
            return s.toString();
        }

        /**
         * Reads a character from the buffer.
         *
         * This follows the character format documented earlier in this source file.
         *
         * @param dictBuffer the buffer, positioned over an encoded character.
         * @return the character code, or FormatSpec.INVALID_CHARACTER when the terminator
         *         byte is read.
         */
        static int readChar(final DictBuffer dictBuffer) {
            int character = dictBuffer.readUnsignedByte();
            if (!fitsOnOneByte(character, null)) {
                if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
                    return FormatSpec.INVALID_CHARACTER;
                }
                // 3-byte form: the byte just read is the most significant one.
                character <<= 16;
                character += dictBuffer.readUnsignedShort();
            }
            return character;
        }
    }

    /**
     * Reads and returns the PtNode count out of a buffer and forwards the pointer.
     */
    /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) {
        final int msb = dictBuffer.readUnsignedByte();
        if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) {
            return msb;
        }
        // Two-byte form: the masked first byte holds the high bits, the next byte the low ones.
        return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8)
                + dictBuffer.readUnsignedByte();
    }

    /**
     * Finds, as a string, the word at the position passed as an argument.
     *
     * The decoder's position is restored before returning, including when decoding throws.
     *
     * @param dictDecoder the dict decoder.
     * @param headerSize the size of the header.
     * @param pos the position to seek.
     * @return the word with its frequency, as a weighted string, or null if no PtNode was
     *         found at {@code pos}.
     */
    @UsedForTesting
    /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder,
            final int headerSize, final int pos) {
        final int originalPos = dictDecoder.getPosition();
        try {
            dictDecoder.setPosition(pos);
            return getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos);
        } finally {
            dictDecoder.setPosition(originalPos);
        }
    }

    /**
     * Walks PtNode groups starting right after the header, descending into children, and
     * accumulates the characters of the nodes it descends through until it reaches the PtNode
     * whose original address is {@code pos}.
     *
     * @return the word ending at {@code pos}, or null if the walk ends without finding it.
     */
    private static WeightedString getWordAtPositionWithoutParentAddress(
            final DictDecoder dictDecoder, final int headerSize, final int pos) {
        dictDecoder.setPosition(headerSize);
        final int count = dictDecoder.readPtNodeCount();
        int groupPos = dictDecoder.getPosition();
        final StringBuilder builder = new StringBuilder();
        WeightedString result = null;

        // The most recent node of the current group that has children and whose children
        // address is not past pos; this is the node to descend into.
        PtNodeInfo last = null;
        for (int i = count - 1; i >= 0; --i) {
            PtNodeInfo info = dictDecoder.readPtNode(groupPos);
            groupPos = info.mEndAddress;
            if (info.mOriginalAddress == pos) {
                builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
                result = new WeightedString(builder.toString(), info.mProbabilityInfo);
                break; // and return
            }
            if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
                if (info.mChildrenAddress > pos) {
                    if (null == last) continue;
                    builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
                    dictDecoder.setPosition(last.mChildrenAddress);
                    // The loop's --i then makes i the index of the first node of the new group.
                    i = dictDecoder.readPtNodeCount();
                    groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
                    last = null;
                    continue;
                }
                last = info;
            }
            // End of the group: descend into the last candidate, if there is one. Without the
            // null check a group with no eligible node would throw a NullPointerException.
            if (0 == i && null != last
                    && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
                builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
                dictDecoder.setPosition(last.mChildrenAddress);
                i = dictDecoder.readPtNodeCount();
                groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
                last = null;
                continue;
            }
        }
        return result;
    }

    /**
     * Helper method that brutally decodes a header from a byte array.
     *
     * The buffer is read as a sequence of terminator-delimited strings, taken alternately as
     * keys and values. Bytes after the last terminator are ignored.
     *
     * @param headerBuffer a buffer containing the bytes of the header.
     * @return a hashmap of the attributes stored in the header
     * @throws UnsupportedFormatException if the number of strings is odd, or if the buffer
     *         ends in the middle of a 3-byte character.
     */
    @Nonnull
    public static HashMap<String, String> decodeHeaderAttributes(@Nonnull final byte[] headerBuffer)
            throws UnsupportedFormatException {
        final StringBuilder sb = new StringBuilder();
        final LinkedList<String> keyValues = new LinkedList<>();
        int index = 0;
        while (index < headerBuffer.length) {
            final int firstByte = headerBuffer[index] & 0xFF;
            if (headerBuffer[index] == FormatSpec.PTNODE_CHARACTERS_TERMINATOR) {
                keyValues.add(sb.toString());
                sb.setLength(0);
            } else if (CharEncoding.fitsOnOneByte(firstByte, null /* codePointTable */)) {
                sb.appendCodePoint(firstByte);
            } else {
                if (index + 2 >= headerBuffer.length) {
                    // Report a malformed header through the declared exception rather than
                    // letting the array access fail.
                    throw new UnsupportedFormatException(
                            "Truncated 3-byte character at offset " + index);
                }
                sb.appendCodePoint((firstByte << 16)
                        + ((headerBuffer[index + 1] & 0xFF) << 8)
                        + (headerBuffer[index + 2] & 0xFF));
                index += 2;
            }
            index += 1;
        }
        if ((keyValues.size() & 1) != 0) {
            throw new UnsupportedFormatException("Odd number of attributes");
        }
        final HashMap<String, String> attributes = new HashMap<>();
        // Consume the list from the head: indexed get() on a LinkedList is linear per call.
        while (!keyValues.isEmpty()) {
            final String key = keyValues.removeFirst();
            final String value = keyValues.removeFirst();
            attributes.put(key, value);
        }
        return attributes;
    }

    /**
     * Helper method to pass a file name instead of a File object to isBinaryDictionary.
     */
    public static boolean isBinaryDictionary(final String filename) {
        final File file = new File(filename);
        return isBinaryDictionary(file);
    }

    /**
     * Basic test to find out whether the file is a binary dictionary or not.
     *
     * @param file The file to test.
     * @return true if it's a binary dictionary, false otherwise
     */
    public static boolean isBinaryDictionary(final File file) {
        final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
        if (dictDecoder == null) {
            return false;
        }
        return dictDecoder.hasValidRawBinaryDictionary();
    }
}