1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; 20 import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; 21 import com.android.inputmethod.latin.makedict.FusionDictionary.Node; 22 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; 23 24 import java.io.ByteArrayOutputStream; 25 import java.io.FileNotFoundException; 26 import java.io.IOException; 27 import java.io.OutputStream; 28 import java.io.RandomAccessFile; 29 import java.util.ArrayList; 30 import java.util.Arrays; 31 import java.util.HashMap; 32 import java.util.Iterator; 33 import java.util.Map; 34 import java.util.TreeMap; 35 36 /** 37 * Reads and writes XML files for a FusionDictionary. 38 * 39 * All the methods in this class are static. 40 */ 41 public class BinaryDictInputOutput { 42 43 final static boolean DBG = MakedictLog.DBG; 44 45 /* Node layout is as follows: 46 * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE 47 * 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 48 * f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE 49 * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES 50 * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES 51 * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS 52 * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL 53 * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS 54 * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS 55 * 56 * c | IF FLAG_HAS_MULTIPLE_CHARS 57 * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers 58 * a | end 1 byte, = 0 59 * r | ELSE 60 * s | char 1 or 3 bytes 61 * | END 62 * 63 * f | 64 * r | IF FLAG_IS_TERMINAL 65 * e | frequency 1 byte 66 * q | 67 * 68 * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType 69 * h | // nothing 70 * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType 71 * l | children address, 1 byte 72 * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType 73 * r | children address, 2 bytes 74 * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType 75 * n | children address, 3 bytes 76 * A | END 77 * d 78 * dress 79 * 80 * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS 81 * | shortcut string list 82 * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS 83 * | bigrams address list 84 * 85 * Char format is: 86 * 1 byte = bbbbbbbb match 87 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 88 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 89 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 90 * 00011111 would be outside unicode. 91 * else: iso-latin-1 code 92 * This allows for the whole unicode range to be encoded, including chars outside of 93 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 94 * characters which should never happen anyway (and still work, but take 3 bytes). 95 * 96 * bigram address list is: 97 * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT 98 * | addressSign = 1 bit, : FLAG_ATTRIBUTE_OFFSET_NEGATIVE 99 * | 1 = must take -address, 0 = must take +address 100 * | xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE 101 * | addressFormat = 2 bits, 00 = unused : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE 102 * | 01 = 1 byte : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE 103 * | 10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES 104 * | 11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES 105 * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY 106 * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat) 107 * | read 1 byte, add top 4 bits 108 * | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat) 109 * | read 2 bytes, add top 4 bits 110 * | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat 111 * | read 3 bytes, add top 4 bits 112 * | END 113 * | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address 114 * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is 115 * 116 * shortcut string list is: 117 * <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes. 118 * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT 119 * | reserved = 3 bits, must be 0 120 * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY 121 * <shortcut> = | string of characters at the char format described above, with the terminator 122 * | used to signal the end of the string. 123 * if (FLAG_ATTRIBUTE_HAS_NEXT goto flags 124 */ 125 126 private static final int VERSION_1_MAGIC_NUMBER = 0x78B1; 127 private static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; 128 private static final int MINIMUM_SUPPORTED_VERSION = 1; 129 private static final int MAXIMUM_SUPPORTED_VERSION = 2; 130 private static final int NOT_A_VERSION_NUMBER = -1; 131 private static final int FIRST_VERSION_WITH_HEADER_SIZE = 2; 132 133 // These options need to be the same numeric values as the one in the native reading code. 134 private static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; 135 private static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; 136 private static final int CONTAINS_BIGRAMS_FLAG = 0x8; 137 138 // TODO: Make this value adaptative to content data, store it in the header, and 139 // use it in the reading code. 140 private static final int MAX_WORD_LENGTH = 48; 141 142 private static final int MASK_GROUP_ADDRESS_TYPE = 0xC0; 143 private static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; 144 private static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; 145 private static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; 146 private static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; 147 148 private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20; 149 150 private static final int FLAG_IS_TERMINAL = 0x10; 151 private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08; 152 private static final int FLAG_HAS_BIGRAMS = 0x04; 153 154 private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; 155 private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; 156 private static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; 157 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; 158 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; 159 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; 160 private static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F; 161 162 private static final int GROUP_CHARACTERS_TERMINATOR = 0x1F; 163 164 private static final int GROUP_TERMINATOR_SIZE = 1; 165 private static final int GROUP_FLAGS_SIZE = 1; 166 private static final int GROUP_FREQUENCY_SIZE = 1; 167 private static final int GROUP_MAX_ADDRESS_SIZE = 3; 168 private static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1; 169 private static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; 170 private static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2; 171 172 private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; 173 private static final int INVALID_CHARACTER = -1; 174 175 private static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127 176 private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767 177 178 private static final int MAX_TERMINAL_FREQUENCY = 255; 179 private static final int MAX_BIGRAM_FREQUENCY = 15; 180 181 // Arbitrary limit to how much passes we consider address size compression should 182 // terminate in. At the time of this writing, our largest dictionary completes 183 // compression in five passes. 184 // If the number of passes exceeds this number, makedict bails with an exception on 185 // suspicion that a bug might be causing an infinite loop. 186 private static final int MAX_PASSES = 24; 187 188 /** 189 * A class grouping utility function for our specific character encoding. 190 */ 191 private static class CharEncoding { 192 193 private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; 194 private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; 195 196 /** 197 * Helper method to find out whether this code fits on one byte 198 */ fitsOnOneByte(int character)199 private static boolean fitsOnOneByte(int character) { 200 return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE 201 && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; 202 } 203 204 /** 205 * Compute the size of a character given its character code. 206 * 207 * Char format is: 208 * 1 byte = bbbbbbbb match 209 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 210 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 211 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 212 * 00011111 would be outside unicode. 213 * else: iso-latin-1 code 214 * This allows for the whole unicode range to be encoded, including chars outside of 215 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 216 * characters which should never happen anyway (and still work, but take 3 bytes). 217 * 218 * @param character the character code. 219 * @return the size in binary encoded-form, either 1 or 3 bytes. 220 */ getCharSize(int character)221 private static int getCharSize(int character) { 222 // See char encoding in FusionDictionary.java 223 if (fitsOnOneByte(character)) return 1; 224 if (INVALID_CHARACTER == character) return 1; 225 return 3; 226 } 227 228 /** 229 * Compute the byte size of a character array. 230 */ getCharArraySize(final int[] chars)231 private static int getCharArraySize(final int[] chars) { 232 int size = 0; 233 for (int character : chars) size += getCharSize(character); 234 return size; 235 } 236 237 /** 238 * Writes a char array to a byte buffer. 239 * 240 * @param codePoints the code point array to write. 241 * @param buffer the byte buffer to write to. 242 * @param index the index in buffer to write the character array to. 243 * @return the index after the last character. 244 */ writeCharArray(final int[] codePoints, final byte[] buffer, int index)245 private static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { 246 for (int codePoint : codePoints) { 247 if (1 == getCharSize(codePoint)) { 248 buffer[index++] = (byte)codePoint; 249 } else { 250 buffer[index++] = (byte)(0xFF & (codePoint >> 16)); 251 buffer[index++] = (byte)(0xFF & (codePoint >> 8)); 252 buffer[index++] = (byte)(0xFF & codePoint); 253 } 254 } 255 return index; 256 } 257 258 /** 259 * Writes a string with our character format to a byte buffer. 260 * 261 * This will also write the terminator byte. 262 * 263 * @param buffer the byte buffer to write to. 264 * @param origin the offset to write from. 265 * @param word the string to write. 266 * @return the size written, in bytes. 267 */ writeString(final byte[] buffer, final int origin, final String word)268 private static int writeString(final byte[] buffer, final int origin, 269 final String word) { 270 final int length = word.length(); 271 int index = origin; 272 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 273 final int codePoint = word.codePointAt(i); 274 if (1 == getCharSize(codePoint)) { 275 buffer[index++] = (byte)codePoint; 276 } else { 277 buffer[index++] = (byte)(0xFF & (codePoint >> 16)); 278 buffer[index++] = (byte)(0xFF & (codePoint >> 8)); 279 buffer[index++] = (byte)(0xFF & codePoint); 280 } 281 } 282 buffer[index++] = GROUP_CHARACTERS_TERMINATOR; 283 return index - origin; 284 } 285 286 /** 287 * Writes a string with our character format to a ByteArrayOutputStream. 288 * 289 * This will also write the terminator byte. 290 * 291 * @param buffer the ByteArrayOutputStream to write to. 292 * @param word the string to write. 293 */ writeString(ByteArrayOutputStream buffer, final String word)294 private static void writeString(ByteArrayOutputStream buffer, final String word) { 295 final int length = word.length(); 296 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 297 final int codePoint = word.codePointAt(i); 298 if (1 == getCharSize(codePoint)) { 299 buffer.write((byte) codePoint); 300 } else { 301 buffer.write((byte) (0xFF & (codePoint >> 16))); 302 buffer.write((byte) (0xFF & (codePoint >> 8))); 303 buffer.write((byte) (0xFF & codePoint)); 304 } 305 } 306 buffer.write(GROUP_CHARACTERS_TERMINATOR); 307 } 308 309 /** 310 * Reads a string from a RandomAccessFile. This is the converse of the above method. 311 */ readString(final RandomAccessFile source)312 private static String readString(final RandomAccessFile source) throws IOException { 313 final StringBuilder s = new StringBuilder(); 314 int character = readChar(source); 315 while (character != INVALID_CHARACTER) { 316 s.appendCodePoint(character); 317 character = readChar(source); 318 } 319 return s.toString(); 320 } 321 322 /** 323 * Reads a character from the file. 324 * 325 * This follows the character format documented earlier in this source file. 326 * 327 * @param source the file, positioned over an encoded character. 328 * @return the character code. 329 */ readChar(RandomAccessFile source)330 private static int readChar(RandomAccessFile source) throws IOException { 331 int character = source.readUnsignedByte(); 332 if (!fitsOnOneByte(character)) { 333 if (GROUP_CHARACTERS_TERMINATOR == character) 334 return INVALID_CHARACTER; 335 character <<= 16; 336 character += source.readUnsignedShort(); 337 } 338 return character; 339 } 340 } 341 342 /** 343 * Compute the binary size of the character array in a group 344 * 345 * If only one character, this is the size of this character. If many, it's the sum of their 346 * sizes + 1 byte for the terminator. 347 * 348 * @param group the group 349 * @return the size of the char array, including the terminator if any 350 */ getGroupCharactersSize(CharGroup group)351 private static int getGroupCharactersSize(CharGroup group) { 352 int size = CharEncoding.getCharArraySize(group.mChars); 353 if (group.hasSeveralChars()) size += GROUP_TERMINATOR_SIZE; 354 return size; 355 } 356 357 /** 358 * Compute the binary size of the group count 359 * @param count the group count 360 * @return the size of the group count, either 1 or 2 bytes. 361 */ getGroupCountSize(final int count)362 private static int getGroupCountSize(final int count) { 363 if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) { 364 return 1; 365 } else if (MAX_CHARGROUPS_IN_A_NODE >= count) { 366 return 2; 367 } else { 368 throw new RuntimeException("Can't have more than " + MAX_CHARGROUPS_IN_A_NODE 369 + " groups in a node (found " + count +")"); 370 } 371 } 372 373 /** 374 * Compute the binary size of the group count for a node 375 * @param node the node 376 * @return the size of the group count, either 1 or 2 bytes. 377 */ getGroupCountSize(final Node node)378 private static int getGroupCountSize(final Node node) { 379 return getGroupCountSize(node.mData.size()); 380 } 381 382 /** 383 * Compute the size of a shortcut in bytes. 384 */ getShortcutSize(final WeightedString shortcut)385 private static int getShortcutSize(final WeightedString shortcut) { 386 int size = GROUP_ATTRIBUTE_FLAGS_SIZE; 387 final String word = shortcut.mWord; 388 final int length = word.length(); 389 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 390 final int codePoint = word.codePointAt(i); 391 size += CharEncoding.getCharSize(codePoint); 392 } 393 size += GROUP_TERMINATOR_SIZE; 394 return size; 395 } 396 397 /** 398 * Compute the size of a shortcut list in bytes. 399 * 400 * This is known in advance and does not change according to position in the file 401 * like address lists do. 402 */ getShortcutListSize(final ArrayList<WeightedString> shortcutList)403 private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { 404 if (null == shortcutList) return 0; 405 int size = GROUP_SHORTCUT_LIST_SIZE_SIZE; 406 for (final WeightedString shortcut : shortcutList) { 407 size += getShortcutSize(shortcut); 408 } 409 return size; 410 } 411 412 /** 413 * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything. 414 * 415 * @param group the CharGroup to compute the size of. 416 * @return the maximum size of the group. 417 */ getCharGroupMaximumSize(CharGroup group)418 private static int getCharGroupMaximumSize(CharGroup group) { 419 int size = getGroupCharactersSize(group) + GROUP_FLAGS_SIZE; 420 // If terminal, one byte for the frequency 421 if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE; 422 size += GROUP_MAX_ADDRESS_SIZE; // For children address 423 size += getShortcutListSize(group.mShortcutTargets); 424 if (null != group.mBigrams) { 425 size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) 426 * group.mBigrams.size(); 427 } 428 return size; 429 } 430 431 /** 432 * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches 433 * it in the 'actualSize' member of the node. 434 * 435 * @param node the node to compute the maximum size of. 436 */ setNodeMaximumSize(Node node)437 private static void setNodeMaximumSize(Node node) { 438 int size = getGroupCountSize(node); 439 for (CharGroup g : node.mData) { 440 final int groupSize = getCharGroupMaximumSize(g); 441 g.mCachedSize = groupSize; 442 size += groupSize; 443 } 444 node.mCachedSize = size; 445 } 446 447 /** 448 * Helper method to hide the actual value of the no children address. 449 */ hasChildrenAddress(int address)450 private static boolean hasChildrenAddress(int address) { 451 return NO_CHILDREN_ADDRESS != address; 452 } 453 454 /** 455 * Compute the size, in bytes, that an address will occupy. 456 * 457 * This can be used either for children addresses (which are always positive) or for 458 * attribute, which may be positive or negative but 459 * store their sign bit separately. 460 * 461 * @param address the address 462 * @return the byte size. 463 */ getByteSize(int address)464 private static int getByteSize(int address) { 465 assert(address < 0x1000000); 466 if (!hasChildrenAddress(address)) { 467 return 0; 468 } else if (Math.abs(address) < 0x100) { 469 return 1; 470 } else if (Math.abs(address) < 0x10000) { 471 return 2; 472 } else { 473 return 3; 474 } 475 } 476 // End utility methods. 477 478 // This method is responsible for finding a nice ordering of the nodes that favors run-time 479 // cache performance and dictionary size. 480 /* package for tests */ static ArrayList<Node> flattenTree(Node root) { 481 final int treeSize = FusionDictionary.countCharGroups(root); 482 MakedictLog.i("Counted nodes : " + treeSize); 483 final ArrayList<Node> flatTree = new ArrayList<Node>(treeSize); 484 return flattenTreeInner(flatTree, root); 485 } 486 487 private static ArrayList<Node> flattenTreeInner(ArrayList<Node> list, Node node) { 488 // Removing the node is necessary if the tails are merged, because we would then 489 // add the same node several times when we only want it once. A number of places in 490 // the code also depends on any node being only once in the list. 491 // Merging tails can only be done if there are no attributes. Searching for attributes 492 // in LatinIME code depends on a total breadth-first ordering, which merging tails 493 // breaks. If there are no attributes, it should be fine (and reduce the file size) 494 // to merge tails, and removing the node from the list would be necessary. However, 495 // we don't merge tails because breaking the breadth-first ordering would result in 496 // extreme overhead at bigram lookup time (it would make the search function O(n) instead 497 // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty 498 // high). 499 // If no nodes are ever merged, we can't have the same node twice in the list, hence 500 // searching for duplicates in unnecessary. It is also very performance consuming, 501 // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making 502 // this simple list.remove operation O(n*n) overall. On Android this overhead is very 503 // high. 504 // For future reference, the code to remove duplicate is a simple : list.remove(node); 505 list.add(node); 506 final ArrayList<CharGroup> branches = node.mData; 507 final int nodeSize = branches.size(); 508 for (CharGroup group : branches) { 509 if (null != group.mChildren) flattenTreeInner(list, group.mChildren); 510 } 511 return list; 512 } 513 514 /** 515 * Finds the absolute address of a word in the dictionary. 516 * 517 * @param dict the dictionary in which to search. 518 * @param word the word we are searching for. 519 * @return the word address. If it is not found, an exception is thrown. 520 */ 521 private static int findAddressOfWord(final FusionDictionary dict, final String word) { 522 return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress; 523 } 524 525 /** 526 * Computes the actual node size, based on the cached addresses of the children nodes. 527 * 528 * Each node stores its tentative address. During dictionary address computing, these 529 * are not final, but they can be used to compute the node size (the node size depends 530 * on the address of the children because the number of bytes necessary to store an 531 * address depends on its numeric value. The return value indicates whether the node 532 * contents (as in, any of the addresses stored in the cache fields) have changed with 533 * respect to their previous value. 534 * 535 * @param node the node to compute the size of. 536 * @param dict the dictionary in which the word/attributes are to be found. 537 * @return false if none of the cached addresses inside the node changed, true otherwise. 538 */ 539 private static boolean computeActualNodeSize(Node node, FusionDictionary dict) { 540 boolean changed = false; 541 int size = getGroupCountSize(node); 542 for (CharGroup group : node.mData) { 543 if (group.mCachedAddress != node.mCachedAddress + size) { 544 changed = true; 545 group.mCachedAddress = node.mCachedAddress + size; 546 } 547 int groupSize = GROUP_FLAGS_SIZE + getGroupCharactersSize(group); 548 if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE; 549 if (null != group.mChildren) { 550 final int offsetBasePoint= groupSize + node.mCachedAddress + size; 551 final int offset = group.mChildren.mCachedAddress - offsetBasePoint; 552 groupSize += getByteSize(offset); 553 } 554 groupSize += getShortcutListSize(group.mShortcutTargets); 555 if (null != group.mBigrams) { 556 for (WeightedString bigram : group.mBigrams) { 557 final int offsetBasePoint = groupSize + node.mCachedAddress + size 558 + GROUP_FLAGS_SIZE; 559 final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); 560 final int offset = addressOfBigram - offsetBasePoint; 561 groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE; 562 } 563 } 564 group.mCachedSize = groupSize; 565 size += groupSize; 566 } 567 if (node.mCachedSize != size) { 568 node.mCachedSize = size; 569 changed = true; 570 } 571 return changed; 572 } 573 574 /** 575 * Computes the byte size of a list of nodes and updates each node cached position. 576 * 577 * @param flatNodes the array of nodes. 578 * @return the byte size of the entire stack. 579 */ 580 private static int stackNodes(ArrayList<Node> flatNodes) { 581 int nodeOffset = 0; 582 for (Node n : flatNodes) { 583 n.mCachedAddress = nodeOffset; 584 int groupCountSize = getGroupCountSize(n); 585 int groupOffset = 0; 586 for (CharGroup g : n.mData) { 587 g.mCachedAddress = groupCountSize + nodeOffset + groupOffset; 588 groupOffset += g.mCachedSize; 589 } 590 if (groupOffset + groupCountSize != n.mCachedSize) { 591 throw new RuntimeException("Bug : Stored and computed node size differ"); 592 } 593 nodeOffset += n.mCachedSize; 594 } 595 return nodeOffset; 596 } 597 598 /** 599 * Compute the addresses and sizes of an ordered node array. 600 * 601 * This method takes a node array and will update its cached address and size values 602 * so that they can be written into a file. It determines the smallest size each of the 603 * nodes can be given the addresses of its children and attributes, and store that into 604 * each node. 605 * The order of the node is given by the order of the array. This method makes no effort 606 * to find a good order; it only mechanically computes the size this order results in. 607 * 608 * @param dict the dictionary 609 * @param flatNodes the ordered array of nodes 610 * @return the same array it was passed. The nodes have been updated for address and size. 611 */ 612 private static ArrayList<Node> computeAddresses(FusionDictionary dict, 613 ArrayList<Node> flatNodes) { 614 // First get the worst sizes and offsets 615 for (Node n : flatNodes) setNodeMaximumSize(n); 616 final int offset = stackNodes(flatNodes); 617 618 MakedictLog.i("Compressing the array addresses. Original size : " + offset); 619 MakedictLog.i("(Recursively seen size : " + offset + ")"); 620 621 int passes = 0; 622 boolean changesDone = false; 623 do { 624 changesDone = false; 625 for (Node n : flatNodes) { 626 final int oldNodeSize = n.mCachedSize; 627 final boolean changed = computeActualNodeSize(n, dict); 628 final int newNodeSize = n.mCachedSize; 629 if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!"); 630 changesDone |= changed; 631 } 632 stackNodes(flatNodes); 633 ++passes; 634 if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug"); 635 } while (changesDone); 636 637 final Node lastNode = flatNodes.get(flatNodes.size() - 1); 638 MakedictLog.i("Compression complete in " + passes + " passes."); 639 MakedictLog.i("After address compression : " 640 + (lastNode.mCachedAddress + lastNode.mCachedSize)); 641 642 return flatNodes; 643 } 644 645 /** 646 * Sanity-checking method. 647 * 648 * This method checks an array of node for juxtaposition, that is, it will do 649 * nothing if each node's cached address is actually the previous node's address 650 * plus the previous node's size. 651 * If this is not the case, it will throw an exception. 652 * 653 * @param array the array node to check 654 */ 655 private static void checkFlatNodeArray(ArrayList<Node> array) { 656 int offset = 0; 657 int index = 0; 658 for (Node n : array) { 659 if (n.mCachedAddress != offset) { 660 throw new RuntimeException("Wrong address for node " + index 661 + " : expected " + offset + ", got " + n.mCachedAddress); 662 } 663 ++index; 664 offset += n.mCachedSize; 665 } 666 } 667 668 /** 669 * Helper method to write a variable-size address to a file. 670 * 671 * @param buffer the buffer to write to. 672 * @param index the index in the buffer to write the address to. 673 * @param address the address to write. 674 * @return the size in bytes the address actually took. 675 */ 676 private static int writeVariableAddress(final byte[] buffer, int index, final int address) { 677 switch (getByteSize(address)) { 678 case 1: 679 buffer[index++] = (byte)address; 680 return 1; 681 case 2: 682 buffer[index++] = (byte)(0xFF & (address >> 8)); 683 buffer[index++] = (byte)(0xFF & address); 684 return 2; 685 case 3: 686 buffer[index++] = (byte)(0xFF & (address >> 16)); 687 buffer[index++] = (byte)(0xFF & (address >> 8)); 688 buffer[index++] = (byte)(0xFF & address); 689 return 3; 690 case 0: 691 return 0; 692 default: 693 throw new RuntimeException("Address " + address + " has a strange size"); 694 } 695 } 696 makeCharGroupFlags(final CharGroup group, final int groupAddress, final int childrenOffset)697 private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, 698 final int childrenOffset) { 699 byte flags = 0; 700 if (group.mChars.length > 1) flags |= FLAG_HAS_MULTIPLE_CHARS; 701 if (group.mFrequency >= 0) { 702 flags |= FLAG_IS_TERMINAL; 703 } 704 if (null != group.mChildren) { 705 switch (getByteSize(childrenOffset)) { 706 case 1: 707 flags |= FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; 708 break; 709 case 2: 710 flags |= FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; 711 break; 712 case 3: 713 flags |= FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; 714 break; 715 default: 716 throw new RuntimeException("Node with a strange address"); 717 } 718 } 719 if (null != group.mShortcutTargets) { 720 if (DBG && 0 == group.mShortcutTargets.size()) { 721 throw new RuntimeException("0-sized shortcut list must be null"); 722 } 723 flags |= FLAG_HAS_SHORTCUT_TARGETS; 724 } 725 if (null != group.mBigrams) { 726 if (DBG && 0 == group.mBigrams.size()) { 727 throw new RuntimeException("0-sized bigram list must be null"); 728 } 729 flags |= FLAG_HAS_BIGRAMS; 730 } 731 return flags; 732 } 733 734 /** 735 * Makes the flag value for a bigram. 736 * 737 * @param more whether there are more bigrams after this one. 738 * @param offset the offset of the bigram. 739 * @param bigramFrequency the frequency of the bigram, 0..255. 740 * @param unigramFrequency the unigram frequency of the same word, 0..255. 741 * @param word the second bigram, for debugging purposes 742 * @return the flags 743 */ makeBigramFlags(final boolean more, final int offset, int bigramFrequency, final int unigramFrequency, final String word)744 private static final int makeBigramFlags(final boolean more, final int offset, 745 int bigramFrequency, final int unigramFrequency, final String word) { 746 int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) 747 + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0); 748 switch (getByteSize(offset)) { 749 case 1: 750 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; 751 break; 752 case 2: 753 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; 754 break; 755 case 3: 756 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; 757 break; 758 default: 759 throw new RuntimeException("Strange offset size"); 760 } 761 if (unigramFrequency > bigramFrequency) { 762 MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word 763 + "\". Bigram freq is " + bigramFrequency + ", unigram freq for " 764 + word + " is " + unigramFrequency); 765 bigramFrequency = unigramFrequency; 766 } 767 // We compute the difference between 255 (which means probability = 1) and the 768 // unigram score. We split this into a number of discrete steps. 769 // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15 770 // represents an increase of 16 steps: a value of 15 will be interpreted as the median 771 // value of the 16th step. In all justice, if the bigram frequency is low enough to be 772 // rounded below the first step (which means it is less than half a step higher than the 773 // unigram frequency) then the unigram frequency itself is the best approximation of the 774 // bigram freq that we could possibly supply, hence we should *not* include this bigram 775 // in the file at all. 776 // until this is done, we'll write 0 and slightly overestimate this case. 777 // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step 778 // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to 779 // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the 780 // step size. Then we compute the start of the first step (the one where value 0 starts) 781 // by adding half-a-step to the unigramFrequency. From there, we compute the integer 782 // number of steps to the bigramFrequency. One last thing: we want our steps to include 783 // their lower bound and exclude their higher bound so we need to have the first step 784 // start at exactly 1 unit higher than floor(unigramFreq + half a step). 785 // Note : to reconstruct the score, the dictionary reader will need to divide 786 // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise, and add 787 // (discretizedFrequency + 0.5) times this value to get the median value of the step, 788 // which is the best approximation. This is how we get the most precise result with 789 // only four bits. 790 final double stepSize = 791 (double)(MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5 + MAX_BIGRAM_FREQUENCY); 792 final double firstStepStart = 1 + unigramFrequency + (stepSize / 2.0); 793 final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); 794 // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 795 // here. The best approximation would be the unigram freq itself, so we should not 796 // include this bigram in the dictionary. For now, register as 0, and live with the 797 // small over-estimation that we get in this case. TODO: actually remove this bigram 798 // if discretizedFrequency < 0. 799 final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0; 800 bigramFlags += finalBigramFrequency & FLAG_ATTRIBUTE_FREQUENCY; 801 return bigramFlags; 802 } 803 804 /** 805 * Makes the 2-byte value for options flags. 806 */ makeOptionsValue(final FusionDictionary dictionary)807 private static final int makeOptionsValue(final FusionDictionary dictionary) { 808 final DictionaryOptions options = dictionary.mOptions; 809 final boolean hasBigrams = dictionary.hasBigrams(); 810 return (options.mFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0) 811 + (options.mGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0) 812 + (hasBigrams ? CONTAINS_BIGRAMS_FLAG : 0); 813 } 814 815 /** 816 * Makes the flag value for a shortcut. 817 * 818 * @param more whether there are more attributes after this one. 819 * @param frequency the frequency of the attribute, 0..15 820 * @return the flags 821 */ makeShortcutFlags(final boolean more, final int frequency)822 private static final int makeShortcutFlags(final boolean more, final int frequency) { 823 return (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (frequency & FLAG_ATTRIBUTE_FREQUENCY); 824 } 825 826 /** 827 * Write a node to memory. The node is expected to have its final position cached. 828 * 829 * This can be an empty map, but the more is inside the faster the lookups will be. It can 830 * be carried on as long as nodes do not move. 831 * 832 * @param dict the dictionary the node is a part of (for relative offsets). 833 * @param buffer the memory buffer to write to. 834 * @param node the node to write. 835 * @return the address of the END of the node. 836 */ writePlacedNode(FusionDictionary dict, byte[] buffer, Node node)837 private static int writePlacedNode(FusionDictionary dict, byte[] buffer, Node node) { 838 int index = node.mCachedAddress; 839 840 final int groupCount = node.mData.size(); 841 final int countSize = getGroupCountSize(node); 842 if (1 == countSize) { 843 buffer[index++] = (byte)groupCount; 844 } else if (2 == countSize) { 845 // We need to signal 2-byte size by setting the top bit of the MSB to 1, so 846 // we | 0x80 to do this. 847 buffer[index++] = (byte)((groupCount >> 8) | 0x80); 848 buffer[index++] = (byte)(groupCount & 0xFF); 849 } else { 850 throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); 851 } 852 int groupAddress = index; 853 for (int i = 0; i < groupCount; ++i) { 854 CharGroup group = node.mData.get(i); 855 if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not " 856 + "the same as the cached address of the group : " 857 + index + " <> " + group.mCachedAddress); 858 groupAddress += GROUP_FLAGS_SIZE + getGroupCharactersSize(group); 859 // Sanity checks. 860 if (DBG && group.mFrequency > MAX_TERMINAL_FREQUENCY) { 861 throw new RuntimeException("A node has a frequency > " + MAX_TERMINAL_FREQUENCY 862 + " : " + group.mFrequency); 863 } 864 if (group.mFrequency >= 0) groupAddress += GROUP_FREQUENCY_SIZE; 865 final int childrenOffset = null == group.mChildren 866 ? NO_CHILDREN_ADDRESS : group.mChildren.mCachedAddress - groupAddress; 867 byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset); 868 buffer[index++] = flags; 869 index = CharEncoding.writeCharArray(group.mChars, buffer, index); 870 if (group.hasSeveralChars()) { 871 buffer[index++] = GROUP_CHARACTERS_TERMINATOR; 872 } 873 if (group.mFrequency >= 0) { 874 buffer[index++] = (byte) group.mFrequency; 875 } 876 final int shift = writeVariableAddress(buffer, index, childrenOffset); 877 index += shift; 878 groupAddress += shift; 879 880 // Write shortcuts 881 if (null != group.mShortcutTargets) { 882 final int indexOfShortcutByteSize = index; 883 index += GROUP_SHORTCUT_LIST_SIZE_SIZE; 884 groupAddress += GROUP_SHORTCUT_LIST_SIZE_SIZE; 885 final Iterator<WeightedString> shortcutIterator = group.mShortcutTargets.iterator(); 886 while (shortcutIterator.hasNext()) { 887 final WeightedString target = shortcutIterator.next(); 888 ++groupAddress; 889 int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(), 890 target.mFrequency); 891 buffer[index++] = (byte)shortcutFlags; 892 final int shortcutShift = CharEncoding.writeString(buffer, index, target.mWord); 893 index += shortcutShift; 894 groupAddress += shortcutShift; 895 } 896 final int shortcutByteSize = index - indexOfShortcutByteSize; 897 if (shortcutByteSize > 0xFFFF) { 898 throw new RuntimeException("Shortcut list too large"); 899 } 900 buffer[indexOfShortcutByteSize] = (byte)(shortcutByteSize >> 8); 901 buffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF); 902 } 903 // Write bigrams 904 if (null != group.mBigrams) { 905 final Iterator<WeightedString> bigramIterator = group.mBigrams.iterator(); 906 while (bigramIterator.hasNext()) { 907 final WeightedString bigram = bigramIterator.next(); 908 final CharGroup target = 909 FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord); 910 final int addressOfBigram = target.mCachedAddress; 911 final int unigramFrequencyForThisWord = target.mFrequency; 912 ++groupAddress; 913 final int offset = addressOfBigram - groupAddress; 914 int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset, 915 bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); 916 buffer[index++] = (byte)bigramFlags; 917 final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); 918 index += bigramShift; 919 groupAddress += bigramShift; 920 } 921 } 922 923 } 924 if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException( 925 "Not the same size : written " 926 + (index - node.mCachedAddress) + " bytes out of a node that should have " 927 + node.mCachedSize + " bytes"); 928 return index; 929 } 930 931 /** 932 * Dumps a collection of useful statistics about a node array. 933 * 934 * This prints purely informative stuff, like the total estimated file size, the 935 * number of nodes, of character groups, the repartition of each address size, etc 936 * 937 * @param nodes the node array. 938 */ showStatistics(ArrayList<Node> nodes)939 private static void showStatistics(ArrayList<Node> nodes) { 940 int firstTerminalAddress = Integer.MAX_VALUE; 941 int lastTerminalAddress = Integer.MIN_VALUE; 942 int size = 0; 943 int charGroups = 0; 944 int maxGroups = 0; 945 int maxRuns = 0; 946 for (Node n : nodes) { 947 if (maxGroups < n.mData.size()) maxGroups = n.mData.size(); 948 for (CharGroup cg : n.mData) { 949 ++charGroups; 950 if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length; 951 if (cg.mFrequency >= 0) { 952 if (n.mCachedAddress < firstTerminalAddress) 953 firstTerminalAddress = n.mCachedAddress; 954 if (n.mCachedAddress > lastTerminalAddress) 955 lastTerminalAddress = n.mCachedAddress; 956 } 957 } 958 if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize; 959 } 960 final int[] groupCounts = new int[maxGroups + 1]; 961 final int[] runCounts = new int[maxRuns + 1]; 962 for (Node n : nodes) { 963 ++groupCounts[n.mData.size()]; 964 for (CharGroup cg : n.mData) { 965 ++runCounts[cg.mChars.length]; 966 } 967 } 968 969 MakedictLog.i("Statistics:\n" 970 + " total file size " + size + "\n" 971 + " " + nodes.size() + " nodes\n" 972 + " " + charGroups + " groups (" + ((float)charGroups / nodes.size()) 973 + " groups per node)\n" 974 + " first terminal at " + firstTerminalAddress + "\n" 975 + " last terminal at " + lastTerminalAddress + "\n" 976 + " Group stats : max = " + maxGroups); 977 for (int i = 0; i < groupCounts.length; ++i) { 978 MakedictLog.i(" " + i + " : " + groupCounts[i]); 979 } 980 MakedictLog.i(" Character run stats : max = " + maxRuns); 981 for (int i = 0; i < runCounts.length; ++i) { 982 MakedictLog.i(" " + i + " : " + runCounts[i]); 983 } 984 } 985 986 /** 987 * Dumps a FusionDictionary to a file. 988 * 989 * This is the public entry point to write a dictionary to a file. 990 * 991 * @param destination the stream to write the binary data to. 992 * @param dict the dictionary to write. 993 * @param version the version of the format to write, currently either 1 or 2. 994 */ writeDictionaryBinary(final OutputStream destination, final FusionDictionary dict, final int version)995 public static void writeDictionaryBinary(final OutputStream destination, 996 final FusionDictionary dict, final int version) 997 throws IOException, UnsupportedFormatException { 998 999 // Addresses are limited to 3 bytes, but since addresses can be relative to each node, the 1000 // structure itself is not limited to 16MB. However, if it is over 16MB deciding the order 1001 // of the nodes becomes a quite complicated problem, because though the dictionary itself 1002 // does not have a size limit, each node must still be within 16MB of all its children and 1003 // parents. As long as this is ensured, the dictionary file may grow to any size. 1004 1005 if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) { 1006 throw new UnsupportedFormatException("Requested file format version " + version 1007 + ", but this implementation only supports versions " 1008 + MINIMUM_SUPPORTED_VERSION + " through " + MAXIMUM_SUPPORTED_VERSION); 1009 } 1010 1011 ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256); 1012 1013 // The magic number in big-endian order. 1014 if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { 1015 // Magic number for version 2+. 1016 headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 24))); 1017 headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 16))); 1018 headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 8))); 1019 headerBuffer.write((byte) (0xFF & VERSION_2_MAGIC_NUMBER)); 1020 // Dictionary version. 1021 headerBuffer.write((byte) (0xFF & (version >> 8))); 1022 headerBuffer.write((byte) (0xFF & version)); 1023 } else { 1024 // Magic number for version 1. 1025 headerBuffer.write((byte) (0xFF & (VERSION_1_MAGIC_NUMBER >> 8))); 1026 headerBuffer.write((byte) (0xFF & VERSION_1_MAGIC_NUMBER)); 1027 // Dictionary version. 1028 headerBuffer.write((byte) (0xFF & version)); 1029 } 1030 // Options flags 1031 final int options = makeOptionsValue(dict); 1032 headerBuffer.write((byte) (0xFF & (options >> 8))); 1033 headerBuffer.write((byte) (0xFF & options)); 1034 if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { 1035 final int headerSizeOffset = headerBuffer.size(); 1036 // Placeholder to be written later with header size. 1037 for (int i = 0; i < 4; ++i) { 1038 headerBuffer.write(0); 1039 } 1040 // Write out the options. 1041 for (final String key : dict.mOptions.mAttributes.keySet()) { 1042 final String value = dict.mOptions.mAttributes.get(key); 1043 CharEncoding.writeString(headerBuffer, key); 1044 CharEncoding.writeString(headerBuffer, value); 1045 } 1046 final int size = headerBuffer.size(); 1047 final byte[] bytes = headerBuffer.toByteArray(); 1048 // Write out the header size. 1049 bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24)); 1050 bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16)); 1051 bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8)); 1052 bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0)); 1053 destination.write(bytes); 1054 } else { 1055 headerBuffer.writeTo(destination); 1056 } 1057 1058 headerBuffer.close(); 1059 1060 // Leave the choice of the optimal node order to the flattenTree function. 1061 MakedictLog.i("Flattening the tree..."); 1062 ArrayList<Node> flatNodes = flattenTree(dict.mRoot); 1063 1064 MakedictLog.i("Computing addresses..."); 1065 computeAddresses(dict, flatNodes); 1066 MakedictLog.i("Checking array..."); 1067 if (DBG) checkFlatNodeArray(flatNodes); 1068 1069 // Create a buffer that matches the final dictionary size. 1070 final Node lastNode = flatNodes.get(flatNodes.size() - 1); 1071 final int bufferSize =(lastNode.mCachedAddress + lastNode.mCachedSize); 1072 final byte[] buffer = new byte[bufferSize]; 1073 int index = 0; 1074 1075 MakedictLog.i("Writing file..."); 1076 int dataEndOffset = 0; 1077 for (Node n : flatNodes) { 1078 dataEndOffset = writePlacedNode(dict, buffer, n); 1079 } 1080 1081 if (DBG) showStatistics(flatNodes); 1082 1083 destination.write(buffer, 0, dataEndOffset); 1084 1085 destination.close(); 1086 MakedictLog.i("Done"); 1087 } 1088 1089 1090 // Input methods: Read a binary dictionary to memory. 1091 // readDictionaryBinary is the public entry point for them. 1092 1093 static final int[] characterBuffer = new int[MAX_WORD_LENGTH]; readCharGroup(RandomAccessFile source, final int originalGroupAddress)1094 private static CharGroupInfo readCharGroup(RandomAccessFile source, 1095 final int originalGroupAddress) throws IOException { 1096 int addressPointer = originalGroupAddress; 1097 final int flags = source.readUnsignedByte(); 1098 ++addressPointer; 1099 final int characters[]; 1100 if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) { 1101 int index = 0; 1102 int character = CharEncoding.readChar(source); 1103 addressPointer += CharEncoding.getCharSize(character); 1104 while (-1 != character) { 1105 characterBuffer[index++] = character; 1106 character = CharEncoding.readChar(source); 1107 addressPointer += CharEncoding.getCharSize(character); 1108 } 1109 characters = Arrays.copyOfRange(characterBuffer, 0, index); 1110 } else { 1111 final int character = CharEncoding.readChar(source); 1112 addressPointer += CharEncoding.getCharSize(character); 1113 characters = new int[] { character }; 1114 } 1115 final int frequency; 1116 if (0 != (FLAG_IS_TERMINAL & flags)) { 1117 ++addressPointer; 1118 frequency = source.readUnsignedByte(); 1119 } else { 1120 frequency = CharGroup.NOT_A_TERMINAL; 1121 } 1122 int childrenAddress = addressPointer; 1123 switch (flags & MASK_GROUP_ADDRESS_TYPE) { 1124 case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: 1125 childrenAddress += source.readUnsignedByte(); 1126 addressPointer += 1; 1127 break; 1128 case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: 1129 childrenAddress += source.readUnsignedShort(); 1130 addressPointer += 2; 1131 break; 1132 case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: 1133 childrenAddress += (source.readUnsignedByte() << 16) + source.readUnsignedShort(); 1134 addressPointer += 3; 1135 break; 1136 case FLAG_GROUP_ADDRESS_TYPE_NOADDRESS: 1137 default: 1138 childrenAddress = NO_CHILDREN_ADDRESS; 1139 break; 1140 } 1141 ArrayList<WeightedString> shortcutTargets = null; 1142 if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) { 1143 final long pointerBefore = source.getFilePointer(); 1144 shortcutTargets = new ArrayList<WeightedString>(); 1145 source.readUnsignedShort(); // Skip the size 1146 while (true) { 1147 final int targetFlags = source.readUnsignedByte(); 1148 final String word = CharEncoding.readString(source); 1149 shortcutTargets.add(new WeightedString(word, 1150 targetFlags & FLAG_ATTRIBUTE_FREQUENCY)); 1151 if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; 1152 } 1153 addressPointer += (source.getFilePointer() - pointerBefore); 1154 } 1155 ArrayList<PendingAttribute> bigrams = null; 1156 if (0 != (flags & FLAG_HAS_BIGRAMS)) { 1157 bigrams = new ArrayList<PendingAttribute>(); 1158 while (true) { 1159 final int bigramFlags = source.readUnsignedByte(); 1160 ++addressPointer; 1161 final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1; 1162 int bigramAddress = addressPointer; 1163 switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { 1164 case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: 1165 bigramAddress += sign * source.readUnsignedByte(); 1166 addressPointer += 1; 1167 break; 1168 case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: 1169 bigramAddress += sign * source.readUnsignedShort(); 1170 addressPointer += 2; 1171 break; 1172 case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: 1173 final int offset = ((source.readUnsignedByte() << 16) 1174 + source.readUnsignedShort()); 1175 bigramAddress += sign * offset; 1176 addressPointer += 3; 1177 break; 1178 default: 1179 throw new RuntimeException("Has bigrams with no address"); 1180 } 1181 bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY, 1182 bigramAddress)); 1183 if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; 1184 } 1185 } 1186 return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, 1187 childrenAddress, shortcutTargets, bigrams); 1188 } 1189 1190 /** 1191 * Reads and returns the char group count out of a file and forwards the pointer. 1192 */ readCharGroupCount(RandomAccessFile source)1193 private static int readCharGroupCount(RandomAccessFile source) throws IOException { 1194 final int msb = source.readUnsignedByte(); 1195 if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) { 1196 return msb; 1197 } else { 1198 return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8) 1199 + source.readUnsignedByte(); 1200 } 1201 } 1202 1203 // The word cache here is a stopgap bandaid to help the catastrophic performance 1204 // of this method. Since it performs direct, unbuffered random access to the file and 1205 // may be called hundreds of thousands of times, the resulting performance is not 1206 // reasonable without some kind of cache. Thus: 1207 // TODO: perform buffered I/O here and in other places in the code. 1208 private static TreeMap<Integer, String> wordCache = new TreeMap<Integer, String>(); 1209 /** 1210 * Finds, as a string, the word at the address passed as an argument. 1211 * 1212 * @param source the file to read from. 1213 * @param headerSize the size of the header. 1214 * @param address the address to seek. 1215 * @return the word, as a string. 1216 * @throws IOException if the file can't be read. 1217 */ getWordAtAddress(final RandomAccessFile source, final long headerSize, int address)1218 private static String getWordAtAddress(final RandomAccessFile source, final long headerSize, 1219 int address) throws IOException { 1220 final String cachedString = wordCache.get(address); 1221 if (null != cachedString) return cachedString; 1222 final long originalPointer = source.getFilePointer(); 1223 source.seek(headerSize); 1224 final int count = readCharGroupCount(source); 1225 int groupOffset = getGroupCountSize(count); 1226 final StringBuilder builder = new StringBuilder(); 1227 String result = null; 1228 1229 CharGroupInfo last = null; 1230 for (int i = count - 1; i >= 0; --i) { 1231 CharGroupInfo info = readCharGroup(source, groupOffset); 1232 groupOffset = info.mEndAddress; 1233 if (info.mOriginalAddress == address) { 1234 builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); 1235 result = builder.toString(); 1236 break; // and return 1237 } 1238 if (hasChildrenAddress(info.mChildrenAddress)) { 1239 if (info.mChildrenAddress > address) { 1240 if (null == last) continue; 1241 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 1242 source.seek(last.mChildrenAddress + headerSize); 1243 groupOffset = last.mChildrenAddress + 1; 1244 i = source.readUnsignedByte(); 1245 last = null; 1246 continue; 1247 } 1248 last = info; 1249 } 1250 if (0 == i && hasChildrenAddress(last.mChildrenAddress)) { 1251 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 1252 source.seek(last.mChildrenAddress + headerSize); 1253 groupOffset = last.mChildrenAddress + 1; 1254 i = source.readUnsignedByte(); 1255 last = null; 1256 continue; 1257 } 1258 } 1259 source.seek(originalPointer); 1260 wordCache.put(address, result); 1261 return result; 1262 } 1263 1264 /** 1265 * Reads a single node from a binary file. 1266 * 1267 * This methods reads the file at the current position of its file pointer. A node is 1268 * fully expected to start at the current position. 1269 * This will recursively read other nodes into the structure, populating the reverse 1270 * maps on the fly and using them to keep track of already read nodes. 1271 * 1272 * @param source the data file, correctly positioned at the start of a node. 1273 * @param headerSize the size, in bytes, of the file header. 1274 * @param reverseNodeMap a mapping from addresses to already read nodes. 1275 * @param reverseGroupMap a mapping from addresses to already read character groups. 1276 * @return the read node with all his children already read. 1277 */ readNode(RandomAccessFile source, long headerSize, Map<Integer, Node> reverseNodeMap, Map<Integer, CharGroup> reverseGroupMap)1278 private static Node readNode(RandomAccessFile source, long headerSize, 1279 Map<Integer, Node> reverseNodeMap, Map<Integer, CharGroup> reverseGroupMap) 1280 throws IOException { 1281 final int nodeOrigin = (int)(source.getFilePointer() - headerSize); 1282 final int count = readCharGroupCount(source); 1283 final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>(); 1284 int groupOffset = nodeOrigin + getGroupCountSize(count); 1285 for (int i = count; i > 0; --i) { 1286 CharGroupInfo info = readCharGroup(source, groupOffset); 1287 ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets; 1288 ArrayList<WeightedString> bigrams = null; 1289 if (null != info.mBigrams) { 1290 bigrams = new ArrayList<WeightedString>(); 1291 for (PendingAttribute bigram : info.mBigrams) { 1292 final String word = getWordAtAddress(source, headerSize, bigram.mAddress); 1293 bigrams.add(new WeightedString(word, bigram.mFrequency)); 1294 } 1295 } 1296 if (hasChildrenAddress(info.mChildrenAddress)) { 1297 Node children = reverseNodeMap.get(info.mChildrenAddress); 1298 if (null == children) { 1299 final long currentPosition = source.getFilePointer(); 1300 source.seek(info.mChildrenAddress + headerSize); 1301 children = readNode(source, headerSize, reverseNodeMap, reverseGroupMap); 1302 source.seek(currentPosition); 1303 } 1304 nodeContents.add( 1305 new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, 1306 children)); 1307 } else { 1308 nodeContents.add( 1309 new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency)); 1310 } 1311 groupOffset = info.mEndAddress; 1312 } 1313 final Node node = new Node(nodeContents); 1314 node.mCachedAddress = nodeOrigin; 1315 reverseNodeMap.put(node.mCachedAddress, node); 1316 return node; 1317 } 1318 1319 /** 1320 * Helper function to get the binary format version from the header. 1321 */ getFormatVersion(final RandomAccessFile source)1322 private static int getFormatVersion(final RandomAccessFile source) throws IOException { 1323 final int magic_v1 = source.readUnsignedShort(); 1324 if (VERSION_1_MAGIC_NUMBER == magic_v1) return source.readUnsignedByte(); 1325 final int magic_v2 = (magic_v1 << 16) + source.readUnsignedShort(); 1326 if (VERSION_2_MAGIC_NUMBER == magic_v2) return source.readUnsignedShort(); 1327 return NOT_A_VERSION_NUMBER; 1328 } 1329 1330 /** 1331 * Reads a random access file and returns the memory representation of the dictionary. 1332 * 1333 * This high-level method takes a binary file and reads its contents, populating a 1334 * FusionDictionary structure. The optional dict argument is an existing dictionary to 1335 * which words from the file should be added. If it is null, a new dictionary is created. 1336 * 1337 * @param source the file to read. 1338 * @param dict an optional dictionary to add words to, or null. 1339 * @return the created (or merged) dictionary. 1340 */ readDictionaryBinary(final RandomAccessFile source, final FusionDictionary dict)1341 public static FusionDictionary readDictionaryBinary(final RandomAccessFile source, 1342 final FusionDictionary dict) throws IOException, UnsupportedFormatException { 1343 // Check file version 1344 final int version = getFormatVersion(source); 1345 if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION ) { 1346 throw new UnsupportedFormatException("This file has version " + version 1347 + ", but this implementation does not support versions above " 1348 + MAXIMUM_SUPPORTED_VERSION); 1349 } 1350 1351 // Read options 1352 final int optionsFlags = source.readUnsignedShort(); 1353 1354 final long headerSize; 1355 final HashMap<String, String> options = new HashMap<String, String>(); 1356 if (version < FIRST_VERSION_WITH_HEADER_SIZE) { 1357 headerSize = source.getFilePointer(); 1358 } else { 1359 headerSize = (source.readUnsignedByte() << 24) + (source.readUnsignedByte() << 16) 1360 + (source.readUnsignedByte() << 8) + source.readUnsignedByte(); 1361 while (source.getFilePointer() < headerSize) { 1362 final String key = CharEncoding.readString(source); 1363 final String value = CharEncoding.readString(source); 1364 options.put(key, value); 1365 } 1366 source.seek(headerSize); 1367 } 1368 1369 Map<Integer, Node> reverseNodeMapping = new TreeMap<Integer, Node>(); 1370 Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>(); 1371 final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping); 1372 1373 FusionDictionary newDict = new FusionDictionary(root, 1374 new FusionDictionary.DictionaryOptions(options, 1375 0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG), 1376 0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG))); 1377 if (null != dict) { 1378 for (final Word w : dict) { 1379 newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets); 1380 } 1381 for (final Word w : dict) { 1382 // By construction a binary dictionary may not have bigrams pointing to 1383 // words that are not also registered as unigrams so we don't have to avoid 1384 // them explicitly here. 1385 for (final WeightedString bigram : w.mBigrams) { 1386 newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency); 1387 } 1388 } 1389 } 1390 1391 return newDict; 1392 } 1393 1394 /** 1395 * Basic test to find out whether the file is a binary dictionary or not. 1396 * 1397 * Concretely this only tests the magic number. 1398 * 1399 * @param filename The name of the file to test. 1400 * @return true if it's a binary dictionary, false otherwise 1401 */ isBinaryDictionary(final String filename)1402 public static boolean isBinaryDictionary(final String filename) { 1403 try { 1404 RandomAccessFile f = new RandomAccessFile(filename, "r"); 1405 final int version = getFormatVersion(f); 1406 return (version >= MINIMUM_SUPPORTED_VERSION && version <= MAXIMUM_SUPPORTED_VERSION); 1407 } catch (FileNotFoundException e) { 1408 return false; 1409 } catch (IOException e) { 1410 return false; 1411 } 1412 } 1413 } 1414