1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__ 18 #define PINYINIME_INCLUDE_SPELLINGTRIE_H__ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include "./dictdef.h" 23 24 namespace ime_pinyin { 25 26 static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1; 27 28 // Node used for the trie of spellings 29 struct SpellingNode { 30 SpellingNode *first_son; 31 // The spelling id for each node. If you need more bits to store 32 // spelling id, please adjust this structure. 33 uint16 spelling_idx:11; 34 uint16 num_of_son:5; 35 char char_this_node; 36 unsigned char score; 37 }; 38 39 class SpellingTrie { 40 private: 41 static const int kMaxYmNum = 64; 42 static const size_t kValidSplCharNum = 26; 43 44 static const uint16 kHalfIdShengmuMask = 0x01; 45 static const uint16 kHalfIdYunmuMask = 0x02; 46 static const uint16 kHalfIdSzmMask = 0x04; 47 48 // Map from half spelling id to single char. 49 // For half ids of Zh/Ch/Sh, map to z/c/s (low case) respectively. 50 // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ..., 51 // 28 to 'Z', 29 to 'z'. 52 // [0] is not used to achieve better efficiency. 53 static const char kHalfId2Sc_[kFullSplIdStart + 1]; 54 55 static unsigned char char_flags_[]; 56 static SpellingTrie* instance_; 57 58 // The spelling table 59 char *spelling_buf_; 60 61 // The size of longest spelling string, includes '\0' and an extra char to 62 // store score. For example, "zhuang" is the longgest item in Pinyin list, 63 // so spelling_size_ is 8. 64 // Structure: The string ended with '\0' + score char. 65 // An item with a lower score has a higher probability. 66 size_t spelling_size_; 67 68 // Number of full spelling ids. 69 size_t spelling_num_; 70 71 float score_amplifier_; 72 unsigned char average_score_; 73 74 // The Yunmu id list for the spelling ids (for half ids of Shengmu, 75 // the Yunmu id is 0). 76 // The length of the list is spelling_num_ + kFullSplIdStart, 77 // so that spl_ym_ids_[splid] is the Yunmu id of the splid. 78 uint8 *spl_ym_ids_; 79 80 // The Yunmu table. 81 // Each Yunmu will be assigned with Yunmu id from 1. 82 char *ym_buf_; 83 size_t ym_size_; // The size of longest Yunmu string, '\0'included. 84 size_t ym_num_; 85 86 // The spelling string just queried 87 char *splstr_queried_; 88 89 // The spelling string just queried 90 char16 *splstr16_queried_; 91 92 // The root node of the spelling tree 93 SpellingNode* root_; 94 95 // If a none qwerty key such as a fnction key like ENTER is given, this node 96 // will be used to indicate that this is not a QWERTY node. 97 SpellingNode* dumb_node_; 98 99 // If a splitter key is pressed, this node will be used to indicate that this 100 // is a splitter key. 101 SpellingNode* splitter_node_; 102 103 // Used to get the first level sons. 104 SpellingNode* level1_sons_[kValidSplCharNum]; 105 106 // The full spl_id range for specific half id. 107 // h2f means half to full. 108 // A half id can be a ShouZiMu id (id to represent the first char of a full 109 // spelling, including Shengmu and Yunmu), or id of zh/ch/sh. 110 // [1..kFullSplIdStart-1] is the arrange of half id. 111 uint16 h2f_start_[kFullSplIdStart]; 112 uint16 h2f_num_[kFullSplIdStart]; 113 114 // Map from full id to half id. 115 uint16 *f2h_; 116 117 #ifdef ___BUILD_MODEL___ 118 // How many node used to build the trie. 119 size_t node_num_; 120 #endif 121 122 SpellingTrie(); 123 124 void free_son_trie(SpellingNode* node); 125 126 // Construct a subtree using a subset of the spelling array (from 127 // item_star to item_end). 128 // Member spelliing_buf_ and spelling_size_ should be valid. 129 // parent is used to update its num_of_son and score. 130 SpellingNode* construct_spellings_subset(size_t item_start, size_t item_end, 131 size_t level, SpellingNode *parent); 132 bool build_f2h(); 133 134 // The caller should guarantee ch >= 'A' && ch <= 'Z' 135 bool is_shengmu_char(char ch) const; 136 137 // The caller should guarantee ch >= 'A' && ch <= 'Z' 138 bool is_yunmu_char(char ch) const; 139 140 #ifdef ___BUILD_MODEL___ 141 // Given a spelling string, return its Yunmu string. 142 // The caller guaratees spl_str is valid. 143 const char* get_ym_str(const char *spl_str); 144 145 // Build the Yunmu list, and the mapping relation between the full ids and the 146 // Yunmu ids. This functin is called after the spelling trie is built. 147 bool build_ym_info(); 148 #endif 149 150 friend class SpellingParser; 151 friend class SmartSplParser; 152 friend class SmartSplParser2; 153 154 public: 155 ~SpellingTrie(); 156 is_valid_spl_char(char ch)157 inline static bool is_valid_spl_char(char ch) { 158 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); 159 } 160 161 // The caller guarantees that the two chars are valid spelling chars. is_same_spl_char(char ch1,char ch2)162 inline static bool is_same_spl_char(char ch1, char ch2) { 163 return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A'; 164 } 165 166 // Construct the tree from the input pinyin array 167 // The given string list should have been sorted. 168 // score_amplifier is used to convert a possibility value into score. 169 // average_score is the average_score of all spellings. The dumb node is 170 // assigned with this score. 171 bool construct(const char* spelling_arr, size_t item_size, size_t item_num, 172 float score_amplifier, unsigned char average_score); 173 174 // Test if the given id is a valid spelling id. 175 // If function returns true, the given splid may be updated like this: 176 // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is 177 // first given as a half id 1, but because 'A' is a one-char Yunmu and 178 // it is a valid id, it needs to updated to its corresponding full id. 179 bool if_valid_id_update(uint16 *splid) const; 180 181 // Test if the given id is a half id. 182 bool is_half_id(uint16 splid) const; 183 184 bool is_full_id(uint16 splid) const; 185 186 // Test if the given id is a one-char Yunmu id (obviously, it is also a half 187 // id), such as 'A', 'E' and 'O'. 188 bool is_half_id_yunmu(uint16 splid) const; 189 190 // Test if this char is a ShouZiMu char. This ShouZiMu char may be not enabled. 191 // For Pinyin, only i/u/v is not a ShouZiMu char. 192 // The caller should guarantee that ch >= 'A' && ch <= 'Z' 193 bool is_szm_char(char ch) const; 194 195 // Test If this char is enabled in ShouZiMu mode. 196 // The caller should guarantee that ch >= 'A' && ch <= 'Z' 197 bool szm_is_enabled(char ch) const; 198 199 // Enable/disable Shengmus in ShouZiMu mode(using the first char of a spelling 200 // to input). 201 void szm_enable_shm(bool enable); 202 203 // Enable/disable Yunmus in ShouZiMu mode. 204 void szm_enable_ym(bool enable); 205 206 // Test if this char is enabled in ShouZiMu mode. 207 // The caller should guarantee ch >= 'A' && ch <= 'Z' 208 bool is_szm_enabled(char ch) const; 209 210 // Return the number of full ids for the given half id. 211 uint16 half2full_num(uint16 half_id) const; 212 213 // Return the number of full ids for the given half id, and fill spl_id_start 214 // to return the first full id. 215 uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const; 216 217 // Return the corresponding half id for the given full id. 218 // Not frequently used, low efficient. 219 // Return 0 if fails. 220 uint16 full_to_half(uint16 full_id) const; 221 222 // To test whether a half id is compatible with a full id. 223 // Generally, when half_id == full_to_half(full_id), return true. 224 // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible 225 // with a full id like "Zhe". (Fussy mode is not ready). 226 bool half_full_compatible(uint16 half_id, uint16 full_id) const; 227 228 static const SpellingTrie* get_cpinstance(); 229 230 static SpellingTrie& get_instance(); 231 232 // Save to the file stream 233 bool save_spl_trie(FILE *fp); 234 235 // Load from the file stream 236 bool load_spl_trie(FILE *fp); 237 238 // Get the number of spellings 239 size_t get_spelling_num(); 240 241 // Return the Yunmu id for the given Yunmu string. 242 // If the string is not valid, return 0; 243 uint8 get_ym_id(const char* ym_str); 244 245 // Get the readonly Pinyin string for a given spelling id 246 const char* get_spelling_str(uint16 splid); 247 248 // Get the readonly Pinyin string for a given spelling id 249 const char16* get_spelling_str16(uint16 splid); 250 251 // Get Pinyin string for a given spelling id. Return the length of the 252 // string, and fill-in '\0' at the end. 253 size_t get_spelling_str16(uint16 splid, char16 *splstr16, 254 size_t splstr16_len); 255 }; 256 } 257 258 #endif // PINYINIME_INCLUDE_SPELLINGTRIE_H__ 259