1 /*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef LATINIME_BINARY_FORMAT_H
18 #define LATINIME_BINARY_FORMAT_H
19
20 #include "unigram_dictionary.h"
21
22 namespace latinime {
23
24 class BinaryFormat {
25 private:
26 const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
27 const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
28 const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
29
30 public:
31 const static int UNKNOWN_FORMAT = -1;
32 const static int FORMAT_VERSION_1 = 1;
33 const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1;
34
35 static int detectFormat(const uint8_t* const dict);
36 static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
37 static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
38 static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
39 static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
40 static int skipOtherCharacters(const uint8_t* const dict, const int pos);
41 static int skipAttributes(const uint8_t* const dict, const int pos);
42 static int skipChildrenPosition(const uint8_t flags, const int pos);
43 static int skipFrequency(const uint8_t flags, const int pos);
44 static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
45 static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
46 const int pos);
47 static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
48 static bool hasChildrenInFlags(const uint8_t flags);
49 static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
50 int *pos);
51 static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
52 const int length);
53 static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
54 uint16_t* outWord);
55 };
56
detectFormat(const uint8_t * const dict)57 inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
58 const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian
59 if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1;
60 return UNKNOWN_FORMAT;
61 }
62
getGroupCountAndForwardPointer(const uint8_t * const dict,int * pos)63 inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
64 return dict[(*pos)++];
65 }
66
getFlagsAndForwardPointer(const uint8_t * const dict,int * pos)67 inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
68 return dict[(*pos)++];
69 }
70
getCharCodeAndForwardPointer(const uint8_t * const dict,int * pos)71 inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
72 const int origin = *pos;
73 const int32_t character = dict[origin];
74 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
75 if (character == CHARACTER_ARRAY_TERMINATOR) {
76 *pos = origin + 1;
77 return NOT_A_CHARACTER;
78 } else {
79 *pos = origin + 3;
80 const int32_t char_1 = character << 16;
81 const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
82 return char_2 + dict[origin + 2];
83 }
84 } else {
85 *pos = origin + 1;
86 return character;
87 }
88 }
89
readFrequencyWithoutMovingPointer(const uint8_t * const dict,const int pos)90 inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
91 const int pos) {
92 return dict[pos];
93 }
94
skipOtherCharacters(const uint8_t * const dict,const int pos)95 inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
96 int currentPos = pos;
97 int32_t character = dict[currentPos++];
98 while (CHARACTER_ARRAY_TERMINATOR != character) {
99 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
100 currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
101 }
102 character = dict[currentPos++];
103 }
104 return currentPos;
105 }
106
attributeAddressSize(const uint8_t flags)107 static inline int attributeAddressSize(const uint8_t flags) {
108 static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
109 return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
110 /* Note: this is a value-dependant optimization of what may probably be
111 more readably written this way:
112 switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
113 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
114 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
115 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
116 default: return 0;
117 }
118 */
119 }
120
skipAttributes(const uint8_t * const dict,const int pos)121 inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
122 int currentPos = pos;
123 uint8_t flags = getFlagsAndForwardPointer(dict, ¤tPos);
124 while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
125 currentPos += attributeAddressSize(flags);
126 flags = getFlagsAndForwardPointer(dict, ¤tPos);
127 }
128 currentPos += attributeAddressSize(flags);
129 return currentPos;
130 }
131
childrenAddressSize(const uint8_t flags)132 static inline int childrenAddressSize(const uint8_t flags) {
133 static const int CHILDREN_ADDRESS_SHIFT = 6;
134 return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
135 /* See the note in attributeAddressSize. The same applies here */
136 }
137
skipChildrenPosition(const uint8_t flags,const int pos)138 inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
139 return pos + childrenAddressSize(flags);
140 }
141
skipFrequency(const uint8_t flags,const int pos)142 inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
143 return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
144 }
145
skipAllAttributes(const uint8_t * const dict,const uint8_t flags,const int pos)146 inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
147 const int pos) {
148 // This function skips all attributes. The format makes provision for future extension
149 // with other attributes (notably shortcuts) but for the time being, bigrams are the
150 // only attributes that may be found in a character group, so we only look at bigrams
151 // in this version.
152 if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
153 return skipAttributes(dict, pos);
154 } else {
155 return pos;
156 }
157 }
158
skipChildrenPosAndAttributes(const uint8_t * const dict,const uint8_t flags,const int pos)159 inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
160 const uint8_t flags, const int pos) {
161 int currentPos = pos;
162 currentPos = skipChildrenPosition(flags, currentPos);
163 currentPos = skipAllAttributes(dict, flags, currentPos);
164 return currentPos;
165 }
166
readChildrenPosition(const uint8_t * const dict,const uint8_t flags,const int pos)167 inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
168 const int pos) {
169 int offset = 0;
170 switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
171 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
172 offset = dict[pos];
173 break;
174 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
175 offset = dict[pos] << 8;
176 offset += dict[pos + 1];
177 break;
178 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
179 offset = dict[pos] << 16;
180 offset += dict[pos + 1] << 8;
181 offset += dict[pos + 2];
182 break;
183 default:
184 // If we come here, it means we asked for the children of a word with
185 // no children.
186 return -1;
187 }
188 return pos + offset;
189 }
190
hasChildrenInFlags(const uint8_t flags)191 inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
192 return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
193 != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
194 }
195
getAttributeAddressAndForwardPointer(const uint8_t * const dict,const uint8_t flags,int * pos)196 inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
197 const uint8_t flags, int *pos) {
198 int offset = 0;
199 const int origin = *pos;
200 switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
201 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
202 offset = dict[origin];
203 *pos = origin + 1;
204 break;
205 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
206 offset = dict[origin] << 8;
207 offset += dict[origin + 1];
208 *pos = origin + 2;
209 break;
210 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
211 offset = dict[origin] << 16;
212 offset += dict[origin + 1] << 8;
213 offset += dict[origin + 2];
214 *pos = origin + 3;
215 break;
216 }
217 if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
218 return origin - offset;
219 } else {
220 return origin + offset;
221 }
222 }
223
224 // This function gets the byte position of the last chargroup of the exact matching word in the
225 // dictionary. If no match is found, it returns NOT_VALID_WORD.
getTerminalPosition(const uint8_t * const root,const uint16_t * const inWord,const int length)226 inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
227 const uint16_t* const inWord, const int length) {
228 int pos = 0;
229 int wordPos = 0;
230
231 while (true) {
232 // If we already traversed the tree further than the word is long, there means
233 // there was no match (or we would have found it).
234 if (wordPos > length) return NOT_VALID_WORD;
235 int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
236 const uint16_t wChar = inWord[wordPos];
237 while (true) {
238 // If there are no more character groups in this node, it means we could not
239 // find a matching character for this depth, therefore there is no match.
240 if (0 >= charGroupCount) return NOT_VALID_WORD;
241 const int charGroupPos = pos;
242 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
243 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
244 if (character == wChar) {
245 // This is the correct node. Only one character group may start with the same
246 // char within a node, so either we found our match in this node, or there is
247 // no match and we can return NOT_VALID_WORD. So we will check all the characters
248 // in this character group indeed does match.
249 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
250 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
251 while (NOT_A_CHARACTER != character) {
252 ++wordPos;
253 // If we shoot the length of the word we search for, or if we find a single
254 // character that does not match, as explained above, it means the word is
255 // not in the dictionary (by virtue of this chargroup being the only one to
256 // match the word on the first character, but not matching the whole word).
257 if (wordPos > length) return NOT_VALID_WORD;
258 if (inWord[wordPos] != character) return NOT_VALID_WORD;
259 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
260 }
261 }
262 // If we come here we know that so far, we do match. Either we are on a terminal
263 // and we match the length, in which case we found it, or we traverse children.
264 // If we don't match the length AND don't have children, then a word in the
265 // dictionary fully matches a prefix of the searched word but not the full word.
266 ++wordPos;
267 if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
268 if (wordPos == length) {
269 return charGroupPos;
270 }
271 pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
272 }
273 if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
274 == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
275 return NOT_VALID_WORD;
276 }
277 // We have children and we are still shorter than the word we are searching for, so
278 // we need to traverse children. Put the pointer on the children position, and
279 // break
280 pos = BinaryFormat::readChildrenPosition(root, flags, pos);
281 break;
282 } else {
283 // This chargroup does not match, so skip the remaining part and go to the next.
284 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
285 pos = BinaryFormat::skipOtherCharacters(root, pos);
286 }
287 pos = BinaryFormat::skipFrequency(flags, pos);
288 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
289 }
290 --charGroupCount;
291 }
292 }
293 }
294
295 // This function searches for a terminal in the dictionary by its address.
296 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
297 // it is possible to check for this with advantageous complexity. For each node, we search
298 // for groups with children and compare the children address with the address we look for.
299 // When we shoot the address we look for, it means the word we look for is in the children
300 // of the previous group. The only tricky part is the fact that if we arrive at the end of a
301 // node with the last group's children address still less than what we are searching for, we
302 // must descend the last group's children (for example, if the word we are searching for starts
303 // with a z, it's the last group of the root node, so all children addresses will be smaller
304 // than the address we look for, and we have to descend the z node).
305 /* Parameters :
306 * root: the dictionary buffer
307 * address: the byte position of the last chargroup of the word we are searching for (this is
308 * what is stored as the "bigram address" in each bigram)
309 * outword: an array to write the found word, with MAX_WORD_LENGTH size.
310 * Return value : the length of the word, of 0 if the word was not found.
311 */
getWordAtAddress(const uint8_t * const root,const int address,const int maxDepth,uint16_t * outWord)312 inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address,
313 const int maxDepth, uint16_t* outWord) {
314 int pos = 0;
315 int wordPos = 0;
316
317 // One iteration of the outer loop iterates through nodes. As stated above, we will only
318 // traverse nodes that are actually a part of the terminal we are searching, so each time
319 // we enter this loop we are one depth level further than last time.
320 // The only reason we count nodes is because we want to reduce the probability of infinite
321 // looping in case there is a bug. Since we know there is an upper bound to the depth we are
322 // supposed to traverse, it does not hurt to count iterations.
323 for (int loopCount = maxDepth; loopCount > 0; --loopCount) {
324 int lastCandidateGroupPos = 0;
325 // Let's loop through char groups in this node searching for either the terminal
326 // or one of its ascendants.
327 for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0;
328 --charGroupCount) {
329 const int startPos = pos;
330 const uint8_t flags = getFlagsAndForwardPointer(root, &pos);
331 const int32_t character = getCharCodeAndForwardPointer(root, &pos);
332 if (address == startPos) {
333 // We found the address. Copy the rest of the word in the buffer and return
334 // the length.
335 outWord[wordPos] = character;
336 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
337 int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
338 // We count chars in order to avoid infinite loops if the file is broken or
339 // if there is some other bug
340 int charCount = maxDepth;
341 while (-1 != nextChar && --charCount > 0) {
342 outWord[++wordPos] = nextChar;
343 nextChar = getCharCodeAndForwardPointer(root, &pos);
344 }
345 }
346 return ++wordPos;
347 }
348 // We need to skip past this char group, so skip any remaining chars after the
349 // first and possibly the frequency.
350 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
351 pos = skipOtherCharacters(root, pos);
352 }
353 pos = skipFrequency(flags, pos);
354
355 // The fact that this group has children is very important. Since we already know
356 // that this group does not match, if it has no children we know it is irrelevant
357 // to what we are searching for.
358 const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
359 (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
360 // We will write in `found' whether we have passed the children address we are
361 // searching for. For example if we search for "beer", the children of b are less
362 // than the address we are searching for and the children of c are greater. When we
363 // come here for c, we realize this is too big, and that we should descend b.
364 bool found;
365 if (hasChildren) {
366 // Here comes the tricky part. First, read the children position.
367 const int childrenPos = readChildrenPosition(root, flags, pos);
368 if (childrenPos > address) {
369 // If the children pos is greater than address, it means the previous chargroup,
370 // which address is stored in lastCandidateGroupPos, was the right one.
371 found = true;
372 } else if (1 >= charGroupCount) {
373 // However if we are on the LAST group of this node, and we have NOT shot the
374 // address we should descend THIS node. So we trick the lastCandidateGroupPos
375 // so that we will descend this node, not the previous one.
376 lastCandidateGroupPos = startPos;
377 found = true;
378 } else {
379 // Else, we should continue looking.
380 found = false;
381 }
382 } else {
383 // Even if we don't have children here, we could still be on the last group of this
384 // node. If this is the case, we should descend the last group that had children,
385 // and their address is already in lastCandidateGroup.
386 found = (1 >= charGroupCount);
387 }
388
389 if (found) {
390 // Okay, we found the group we should descend. Its address is in
391 // the lastCandidateGroupPos variable, so we just re-read it.
392 if (0 != lastCandidateGroupPos) {
393 const uint8_t lastFlags =
394 getFlagsAndForwardPointer(root, &lastCandidateGroupPos);
395 const int32_t lastChar =
396 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
397 // We copy all the characters in this group to the buffer
398 outWord[wordPos] = lastChar;
399 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
400 int32_t nextChar =
401 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
402 int charCount = maxDepth;
403 while (-1 != nextChar && --charCount > 0) {
404 outWord[++wordPos] = nextChar;
405 nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
406 }
407 }
408 ++wordPos;
409 // Now we only need to branch to the children address. Skip the frequency if
410 // it's there, read pos, and break to resume the search at pos.
411 lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos);
412 pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos);
413 break;
414 } else {
415 // Here is a little tricky part: we come here if we found out that all children
416 // addresses in this group are bigger than the address we are searching for.
417 // Should we conclude the word is not in the dictionary? No! It could still be
418 // one of the remaining chargroups in this node, so we have to keep looking in
419 // this node until we find it (or we realize it's not there either, in which
420 // case it's actually not in the dictionary). Pass the end of this group, ready
421 // to start the next one.
422 pos = skipChildrenPosAndAttributes(root, flags, pos);
423 }
424 } else {
425 // If we did not find it, we should record the last children address for the next
426 // iteration.
427 if (hasChildren) lastCandidateGroupPos = startPos;
428 // Now skip the end of this group (children pos and the attributes if any) so that
429 // our pos is after the end of this char group, at the start of the next one.
430 pos = skipChildrenPosAndAttributes(root, flags, pos);
431 }
432
433 }
434 }
435 // If we have looked through all the chargroups and found no match, the address is
436 // not the address of a terminal in this dictionary.
437 return 0;
438 }
439
440 } // namespace latinime
441
442 #endif // LATINIME_BINARY_FORMAT_H
443