• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2010 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LATINIME_UNIGRAM_DICTIONARY_H
18 #define LATINIME_UNIGRAM_DICTIONARY_H
19 
20 #include <stdint.h>
21 #include "correction.h"
22 #include "correction_state.h"
23 #include "defines.h"
24 #include "proximity_info.h"
25 
// Fallback definition of NULL, used only when no previously included header has
// already defined it. NULL is not referenced anywhere else in this header.
26 #ifndef NULL
27 #define NULL 0
28 #endif
29 
30 namespace latinime {
31 
// UnigramDictionary: declared here, defined outside this header.
//
// An instance is constructed from a pointer to the start of a dictionary byte
// stream and exposes isValidWord(), getBigramPosition() and getSuggestions().
// The public constants describe bit fields of flag bytes; how those bytes are
// laid out in the stream is not visible from this header.
//
// NOTE(review): the class holds raw pointers and non-static const data members
// and declares no copy operations. The const members already make the implicit
// copy-assignment operator unusable, but copy construction is still implicitly
// available -- confirm that instances are never copied.
32 class UnigramDictionary {
33 
34 public:
35 
36     // Mask and flags for children address type selection.
    // The mask covers the two highest bits of a byte (0xC0); the four flags below are the
    // four possible values of that two-bit field.
37     static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
38     static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
39     static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
40     static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
41     static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
42 
43     // Flag for single/multiple char group
    // Single bit (0x20); it does not overlap MASK_GROUP_ADDRESS_TYPE.
44     static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
45 
46     // Flag for terminal groups
    // Single bit (0x10).
47     static const int FLAG_IS_TERMINAL = 0x10;
48 
49     // Flag for bigram presence
    // Single bit (0x04).
50     static const int FLAG_HAS_BIGRAMS = 0x04;
51 
52     // Attribute (bigram/shortcut) related flags:
53     // Flag for presence of more attributes
54     static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
55     // Flag for sign of offset. If this flag is set, the offset value must be negated.
56     static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
57 
58     // Mask for attribute frequency, stored on 4 bits inside the flags byte.
    // These are the four lowest bits (0x0F).
59     static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
60 
61     // Mask and flags for attribute address type selection.
    // The mask covers bits 0x30. Only three of the four possible field values are named
    // here; no constant is declared for the value 0x00.
62     static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
63     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
64     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
65     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
66 
    // Takes a pointer to the start of the dictionary byte stream plus tuning/limit values.
    // The arguments presumably initialize the like-named const members declared in the
    // private section (TYPED_LETTER_MULTIPLIER, FULL_WORD_MULTIPLIER, MAX_WORD_LENGTH,
    // MAX_WORDS, MAX_PROXIMITY_CHARS, IS_LATEST_DICT_VERSION) -- confirm in the definition.
    // NOTE(review): `typedLetterMultipler` is misspelled (missing 'i'). Parameter names in
    // a declaration are not part of the signature, so this is cosmetic only.
67     UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
68             int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
69             const bool isLatestDictVersion);
    // Const query on a word given as `length` 16-bit units in `inWord`.
    // NOTE(review): this header spells 16-bit word buffers three ways (`uint16_t`,
    // `unsigned short`, `short unsigned int`); presumably all the same type on the
    // target platforms -- confirm before unifying.
70     bool isValidWord(const uint16_t* const inWord, const int length) const;
    // Const query; the meaning of `pos`, `offset` and the return value is defined by the
    // implementation, which is not visible here.
71     int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
    // Non-const: writes through `outWords` and `frequencies`. The returned int is presumably
    // the number of suggestions produced -- confirm in the definition.
    // Note the argument order: `flags` comes BEFORE the output buffers here, whereas the
    // private getWordSuggestions() below takes `flags` LAST.
72     int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
73             const int *ycoordinates, const int *codes, const int codesSize, const int flags,
74             unsigned short *outWords, int *frequencies);
    // Virtual destructor; no other virtual member function is declared in this header.
75     virtual ~UnigramDictionary();
76 
77 private:
78 
    // Private helpers. Only their signatures are visible in this header; see the
    // definitions for behavior.
    // Same inputs as getSuggestions(), but with `flags` moved to the end of the list.
79     void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
80             const int *ycoordinates, const int *codes, const int codesSize,
81             unsigned short *outWords, int *frequencies, const int flags);
82     bool isDigraph(const int* codes, const int i, const int codesSize) const;
    // The "Rec" suffix and the `currentDepth` parameter suggest a recursive routine,
    // presumably bounded by MAX_UMLAUT_SEARCH_DEPTH -- confirm in the definition.
83     void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
84         const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
85         const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
86         const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
87     void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
88             const int *ycoordinates, const int *codes, const int codesSize,
89             unsigned short *outWords, int *frequencies);
90     void getSuggestionCandidates(const bool useFullEditDistance);
91     bool addWord(unsigned short *word, int length, int frequency);
92     void getSplitTwoWordsSuggestion(const int inputLength, Correction *correction);
93     void getMissingSpaceWords(const int inputLength, const int missingSpacePos,
94             Correction *correction, const bool useFullEditDistance);
95     void getMistypedSpaceWords(const int inputLength, const int spaceProximityPos,
96             Correction *correction, const bool useFullEditDistance);
97     void onTerminal(const int freq, Correction *correction);
98     bool needsToSkipCurrentNode(const unsigned short c,
99             const int inputIndex, const int skipPos, const int depth);
100     // Process a node by considering proximity, missing and excessive character
    // Results are reported through the three int* out-parameters.
101     bool processCurrentNode(const int initialPos,
102             Correction *correction, int *newCount,
103             int *newChildPosition, int *nextSiblingPosition);
104     int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
105             unsigned short *word);
106     int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
107             short unsigned int* outWord);
108 
    // Per-instance constants. Being non-static const members without in-class
    // initializers, they must be set in the constructor's member-initializer list, and
    // they are initialized in the order declared here regardless of the order written
    // in that list. Do not reorder these declarations without checking the constructor.
109     const uint8_t* const DICT_ROOT;
110     const int MAX_WORD_LENGTH;
111     const int MAX_WORDS;
112     const int MAX_PROXIMITY_CHARS;
113     const bool IS_LATEST_DICT_VERSION;
114     const int TYPED_LETTER_MULTIPLIER;
115     const int FULL_WORD_MULTIPLIER;
116     const int ROOT_POS;
117     const unsigned int BYTES_IN_ONE_CHAR;
118     const int MAX_UMLAUT_SEARCH_DEPTH;
119 
120     // Flags for special processing
121     // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
122     // or something very bad (like, the apocalypse) will happen.
123     // Please update both at the same time.
    // The two values are distinct single bits, so they can be combined with bitwise OR;
    // presumably they are tested against the `flags` arguments above -- confirm.
124     enum {
125         REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1,
126         USE_FULL_EDIT_DISTANCE = 0x2
127     };
    // Static table of (first, second) int pairs, declared here with unknown bound; its
    // definition and element count live outside this header.
128     static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];
129 
    // Mutable per-instance state.
    // NOTE(review): presumably set up by initSuggestions() and shared by the private
    // helpers during one getSuggestions() call, which would make concurrent calls on the
    // same instance unsafe -- confirm against the implementation.
130     int *mFrequencies;
131     unsigned short *mOutputChars;
132     ProximityInfo *mProximityInfo;
133     Correction *mCorrection;
134     int mInputLength;
135     // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
    // MAX_WORD_LENGTH_INTERNAL is not defined in this header (presumably it comes from
    // defines.h). It fixes the size of the buffers below at compile time, whereas
    // MAX_WORD_LENGTH is a run-time, per-instance value.
136     unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
137 
138     int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
139     int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
140     int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
141 };
142 } // namespace latinime
143 
144 #endif // LATINIME_UNIGRAM_DICTIONARY_H
145