• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LATINIME_BINARY_FORMAT_H
18 #define LATINIME_BINARY_FORMAT_H
19 
20 #include "unigram_dictionary.h"
21 
22 namespace latinime {
23 
24 class BinaryFormat {
25 private:
26     const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
27     const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
28     const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
29 
30 public:
31     const static int UNKNOWN_FORMAT = -1;
32     const static int FORMAT_VERSION_1 = 1;
33     const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1;
34 
35     static int detectFormat(const uint8_t* const dict);
36     static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
37     static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
38     static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
39     static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
40     static int skipOtherCharacters(const uint8_t* const dict, const int pos);
41     static int skipAttributes(const uint8_t* const dict, const int pos);
42     static int skipChildrenPosition(const uint8_t flags, const int pos);
43     static int skipFrequency(const uint8_t flags, const int pos);
44     static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
45     static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
46             const int pos);
47     static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
48     static bool hasChildrenInFlags(const uint8_t flags);
49     static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
50             int *pos);
51     static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
52             const int length);
53     static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
54             uint16_t* outWord);
55 };
56 
detectFormat(const uint8_t * const dict)57 inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
58     const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian
59     if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1;
60     return UNKNOWN_FORMAT;
61 }
62 
getGroupCountAndForwardPointer(const uint8_t * const dict,int * pos)63 inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
64     return dict[(*pos)++];
65 }
66 
getFlagsAndForwardPointer(const uint8_t * const dict,int * pos)67 inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
68     return dict[(*pos)++];
69 }
70 
getCharCodeAndForwardPointer(const uint8_t * const dict,int * pos)71 inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
72     const int origin = *pos;
73     const int32_t character = dict[origin];
74     if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
75         if (character == CHARACTER_ARRAY_TERMINATOR) {
76             *pos = origin + 1;
77             return NOT_A_CHARACTER;
78         } else {
79             *pos = origin + 3;
80             const int32_t char_1 = character << 16;
81             const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
82             return char_2 + dict[origin + 2];
83         }
84     } else {
85         *pos = origin + 1;
86         return character;
87     }
88 }
89 
readFrequencyWithoutMovingPointer(const uint8_t * const dict,const int pos)90 inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
91         const int pos) {
92     return dict[pos];
93 }
94 
skipOtherCharacters(const uint8_t * const dict,const int pos)95 inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
96     int currentPos = pos;
97     int32_t character = dict[currentPos++];
98     while (CHARACTER_ARRAY_TERMINATOR != character) {
99         if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
100             currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
101         }
102         character = dict[currentPos++];
103     }
104     return currentPos;
105 }
106 
attributeAddressSize(const uint8_t flags)107 static inline int attributeAddressSize(const uint8_t flags) {
108     static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
109     return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
110     /* Note: this is a value-dependant optimization of what may probably be
111        more readably written this way:
112        switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
113        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
114        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
115        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
116        default: return 0;
117        }
118     */
119 }
120 
skipAttributes(const uint8_t * const dict,const int pos)121 inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
122     int currentPos = pos;
123     uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
124     while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
125         currentPos += attributeAddressSize(flags);
126         flags = getFlagsAndForwardPointer(dict, &currentPos);
127     }
128     currentPos += attributeAddressSize(flags);
129     return currentPos;
130 }
131 
childrenAddressSize(const uint8_t flags)132 static inline int childrenAddressSize(const uint8_t flags) {
133     static const int CHILDREN_ADDRESS_SHIFT = 6;
134     return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
135     /* See the note in attributeAddressSize. The same applies here */
136 }
137 
skipChildrenPosition(const uint8_t flags,const int pos)138 inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
139     return pos + childrenAddressSize(flags);
140 }
141 
skipFrequency(const uint8_t flags,const int pos)142 inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
143     return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
144 }
145 
skipAllAttributes(const uint8_t * const dict,const uint8_t flags,const int pos)146 inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
147         const int pos) {
148     // This function skips all attributes. The format makes provision for future extension
149     // with other attributes (notably shortcuts) but for the time being, bigrams are the
150     // only attributes that may be found in a character group, so we only look at bigrams
151     // in this version.
152     if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
153         return skipAttributes(dict, pos);
154     } else {
155         return pos;
156     }
157 }
158 
skipChildrenPosAndAttributes(const uint8_t * const dict,const uint8_t flags,const int pos)159 inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
160         const uint8_t flags, const int pos) {
161     int currentPos = pos;
162     currentPos = skipChildrenPosition(flags, currentPos);
163     currentPos = skipAllAttributes(dict, flags, currentPos);
164     return currentPos;
165 }
166 
readChildrenPosition(const uint8_t * const dict,const uint8_t flags,const int pos)167 inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
168         const int pos) {
169     int offset = 0;
170     switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
171         case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
172             offset = dict[pos];
173             break;
174         case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
175             offset = dict[pos] << 8;
176             offset += dict[pos + 1];
177             break;
178         case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
179             offset = dict[pos] << 16;
180             offset += dict[pos + 1] << 8;
181             offset += dict[pos + 2];
182             break;
183         default:
184             // If we come here, it means we asked for the children of a word with
185             // no children.
186             return -1;
187     }
188     return pos + offset;
189 }
190 
hasChildrenInFlags(const uint8_t flags)191 inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
192     return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
193             != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
194 }
195 
getAttributeAddressAndForwardPointer(const uint8_t * const dict,const uint8_t flags,int * pos)196 inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
197         const uint8_t flags, int *pos) {
198     int offset = 0;
199     const int origin = *pos;
200     switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
201         case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
202             offset = dict[origin];
203             *pos = origin + 1;
204             break;
205         case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
206             offset = dict[origin] << 8;
207             offset += dict[origin + 1];
208             *pos = origin + 2;
209             break;
210         case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
211             offset = dict[origin] << 16;
212             offset += dict[origin + 1] << 8;
213             offset += dict[origin + 2];
214             *pos = origin + 3;
215             break;
216     }
217     if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
218         return origin - offset;
219     } else {
220         return origin + offset;
221     }
222 }
223 
224 // This function gets the byte position of the last chargroup of the exact matching word in the
225 // dictionary. If no match is found, it returns NOT_VALID_WORD.
getTerminalPosition(const uint8_t * const root,const uint16_t * const inWord,const int length)226 inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
227         const uint16_t* const inWord, const int length) {
228     int pos = 0;
229     int wordPos = 0;
230 
231     while (true) {
232         // If we already traversed the tree further than the word is long, there means
233         // there was no match (or we would have found it).
234         if (wordPos > length) return NOT_VALID_WORD;
235         int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
236         const uint16_t wChar = inWord[wordPos];
237         while (true) {
238             // If there are no more character groups in this node, it means we could not
239             // find a matching character for this depth, therefore there is no match.
240             if (0 >= charGroupCount) return NOT_VALID_WORD;
241             const int charGroupPos = pos;
242             const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
243             int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
244             if (character == wChar) {
245                 // This is the correct node. Only one character group may start with the same
246                 // char within a node, so either we found our match in this node, or there is
247                 // no match and we can return NOT_VALID_WORD. So we will check all the characters
248                 // in this character group indeed does match.
249                 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
250                     character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
251                     while (NOT_A_CHARACTER != character) {
252                         ++wordPos;
253                         // If we shoot the length of the word we search for, or if we find a single
254                         // character that does not match, as explained above, it means the word is
255                         // not in the dictionary (by virtue of this chargroup being the only one to
256                         // match the word on the first character, but not matching the whole word).
257                         if (wordPos > length) return NOT_VALID_WORD;
258                         if (inWord[wordPos] != character) return NOT_VALID_WORD;
259                         character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
260                     }
261                 }
262                 // If we come here we know that so far, we do match. Either we are on a terminal
263                 // and we match the length, in which case we found it, or we traverse children.
264                 // If we don't match the length AND don't have children, then a word in the
265                 // dictionary fully matches a prefix of the searched word but not the full word.
266                 ++wordPos;
267                 if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
268                     if (wordPos == length) {
269                         return charGroupPos;
270                     }
271                     pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
272                 }
273                 if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
274                         == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
275                     return NOT_VALID_WORD;
276                 }
277                 // We have children and we are still shorter than the word we are searching for, so
278                 // we need to traverse children. Put the pointer on the children position, and
279                 // break
280                 pos = BinaryFormat::readChildrenPosition(root, flags, pos);
281                 break;
282             } else {
283                 // This chargroup does not match, so skip the remaining part and go to the next.
284                 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
285                     pos = BinaryFormat::skipOtherCharacters(root, pos);
286                 }
287                 pos = BinaryFormat::skipFrequency(flags, pos);
288                 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
289             }
290             --charGroupCount;
291         }
292     }
293 }
294 
295 // This function searches for a terminal in the dictionary by its address.
296 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
297 // it is possible to check for this with advantageous complexity. For each node, we search
298 // for groups with children and compare the children address with the address we look for.
299 // When we shoot the address we look for, it means the word we look for is in the children
300 // of the previous group. The only tricky part is the fact that if we arrive at the end of a
301 // node with the last group's children address still less than what we are searching for, we
302 // must descend the last group's children (for example, if the word we are searching for starts
303 // with a z, it's the last group of the root node, so all children addresses will be smaller
304 // than the address we look for, and we have to descend the z node).
305 /* Parameters :
306  * root: the dictionary buffer
307  * address: the byte position of the last chargroup of the word we are searching for (this is
308  *   what is stored as the "bigram address" in each bigram)
309  * outword: an array to write the found word, with MAX_WORD_LENGTH size.
310  * Return value : the length of the word, of 0 if the word was not found.
311  */
getWordAtAddress(const uint8_t * const root,const int address,const int maxDepth,uint16_t * outWord)312 inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address,
313         const int maxDepth, uint16_t* outWord) {
314     int pos = 0;
315     int wordPos = 0;
316 
317     // One iteration of the outer loop iterates through nodes. As stated above, we will only
318     // traverse nodes that are actually a part of the terminal we are searching, so each time
319     // we enter this loop we are one depth level further than last time.
320     // The only reason we count nodes is because we want to reduce the probability of infinite
321     // looping in case there is a bug. Since we know there is an upper bound to the depth we are
322     // supposed to traverse, it does not hurt to count iterations.
323     for (int loopCount = maxDepth; loopCount > 0; --loopCount) {
324         int lastCandidateGroupPos = 0;
325         // Let's loop through char groups in this node searching for either the terminal
326         // or one of its ascendants.
327         for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0;
328                  --charGroupCount) {
329             const int startPos = pos;
330             const uint8_t flags = getFlagsAndForwardPointer(root, &pos);
331             const int32_t character = getCharCodeAndForwardPointer(root, &pos);
332             if (address == startPos) {
333                 // We found the address. Copy the rest of the word in the buffer and return
334                 // the length.
335                 outWord[wordPos] = character;
336                 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
337                     int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
338                     // We count chars in order to avoid infinite loops if the file is broken or
339                     // if there is some other bug
340                     int charCount = maxDepth;
341                     while (-1 != nextChar && --charCount > 0) {
342                         outWord[++wordPos] = nextChar;
343                         nextChar = getCharCodeAndForwardPointer(root, &pos);
344                     }
345                 }
346                 return ++wordPos;
347             }
348             // We need to skip past this char group, so skip any remaining chars after the
349             // first and possibly the frequency.
350             if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
351                 pos = skipOtherCharacters(root, pos);
352             }
353             pos = skipFrequency(flags, pos);
354 
355             // The fact that this group has children is very important. Since we already know
356             // that this group does not match, if it has no children we know it is irrelevant
357             // to what we are searching for.
358             const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
359                     (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
360             // We will write in `found' whether we have passed the children address we are
361             // searching for. For example if we search for "beer", the children of b are less
362             // than the address we are searching for and the children of c are greater. When we
363             // come here for c, we realize this is too big, and that we should descend b.
364             bool found;
365             if (hasChildren) {
366                 // Here comes the tricky part. First, read the children position.
367                 const int childrenPos = readChildrenPosition(root, flags, pos);
368                 if (childrenPos > address) {
369                     // If the children pos is greater than address, it means the previous chargroup,
370                     // which address is stored in lastCandidateGroupPos, was the right one.
371                     found = true;
372                 } else if (1 >= charGroupCount) {
373                     // However if we are on the LAST group of this node, and we have NOT shot the
374                     // address we should descend THIS node. So we trick the lastCandidateGroupPos
375                     // so that we will descend this node, not the previous one.
376                     lastCandidateGroupPos = startPos;
377                     found = true;
378                 } else {
379                     // Else, we should continue looking.
380                     found = false;
381                 }
382             } else {
383                 // Even if we don't have children here, we could still be on the last group of this
384                 // node. If this is the case, we should descend the last group that had children,
385                 // and their address is already in lastCandidateGroup.
386                 found = (1 >= charGroupCount);
387             }
388 
389             if (found) {
390                 // Okay, we found the group we should descend. Its address is in
391                 // the lastCandidateGroupPos variable, so we just re-read it.
392                 if (0 != lastCandidateGroupPos) {
393                     const uint8_t lastFlags =
394                             getFlagsAndForwardPointer(root, &lastCandidateGroupPos);
395                     const int32_t lastChar =
396                             getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
397                     // We copy all the characters in this group to the buffer
398                     outWord[wordPos] = lastChar;
399                     if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
400                         int32_t nextChar =
401                                 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
402                         int charCount = maxDepth;
403                         while (-1 != nextChar && --charCount > 0) {
404                             outWord[++wordPos] = nextChar;
405                             nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
406                         }
407                     }
408                     ++wordPos;
409                     // Now we only need to branch to the children address. Skip the frequency if
410                     // it's there, read pos, and break to resume the search at pos.
411                     lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos);
412                     pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos);
413                     break;
414                 } else {
415                     // Here is a little tricky part: we come here if we found out that all children
416                     // addresses in this group are bigger than the address we are searching for.
417                     // Should we conclude the word is not in the dictionary? No! It could still be
418                     // one of the remaining chargroups in this node, so we have to keep looking in
419                     // this node until we find it (or we realize it's not there either, in which
420                     // case it's actually not in the dictionary). Pass the end of this group, ready
421                     // to start the next one.
422                     pos = skipChildrenPosAndAttributes(root, flags, pos);
423                 }
424             } else {
425                 // If we did not find it, we should record the last children address for the next
426                 // iteration.
427                 if (hasChildren) lastCandidateGroupPos = startPos;
428                 // Now skip the end of this group (children pos and the attributes if any) so that
429                 // our pos is after the end of this char group, at the start of the next one.
430                 pos = skipChildrenPosAndAttributes(root, flags, pos);
431             }
432 
433         }
434     }
435     // If we have looked through all the chargroups and found no match, the address is
436     // not the address of a terminal in this dictionary.
437     return 0;
438 }
439 
440 } // namespace latinime
441 
442 #endif // LATINIME_BINARY_FORMAT_H
443