• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__
18 #define PINYINIME_INCLUDE_SPELLINGTRIE_H__
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include "./dictdef.h"
23 
24 namespace ime_pinyin {
25 
26 static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1;
27 
28 // Node used for the trie of spellings
29 struct SpellingNode {
30   SpellingNode *first_son;
31   // The spelling id for each node. If you need more bits to store
32   // spelling id, please adjust this structure.
33   uint16 spelling_idx:11;
34   uint16  num_of_son:5;
35   char char_this_node;
36   unsigned char score;
37 };
38 
39 class SpellingTrie {
40  private:
41   static const int kMaxYmNum = 64;
42   static const size_t kValidSplCharNum = 26;
43 
44   static const uint16 kHalfIdShengmuMask = 0x01;
45   static const uint16 kHalfIdYunmuMask = 0x02;
46   static const uint16 kHalfIdSzmMask = 0x04;
47 
48   // Map from half spelling id to single char.
49   // For half ids of Zh/Ch/Sh, map to z/c/s (low case) respectively.
50   // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ...,
51   // 28 to 'Z', 29 to 'z'.
52   // [0] is not used to achieve better efficiency.
53   static const char kHalfId2Sc_[kFullSplIdStart + 1];
54 
55   static unsigned char char_flags_[];
56   static SpellingTrie* instance_;
57 
58   // The spelling table
59   char *spelling_buf_;
60 
61   // The size of longest spelling string, includes '\0' and an extra char to
62   // store score. For example, "zhuang" is the longgest item in Pinyin list,
63   // so spelling_size_ is 8.
64   // Structure: The string ended with '\0' + score char.
65   // An item with a lower score has a higher probability.
66   size_t spelling_size_;
67 
68   // Number of full spelling ids.
69   size_t spelling_num_;
70 
71   float score_amplifier_;
72   unsigned char average_score_;
73 
74   // The Yunmu id list for the spelling ids (for half ids of Shengmu,
75   // the Yunmu id is 0).
76   // The length of the list is spelling_num_ + kFullSplIdStart,
77   // so that spl_ym_ids_[splid] is the Yunmu id of the splid.
78   uint8 *spl_ym_ids_;
79 
80   // The Yunmu table.
81   // Each Yunmu will be assigned with Yunmu id from 1.
82   char *ym_buf_;
83   size_t ym_size_;  // The size of longest Yunmu string, '\0'included.
84   size_t ym_num_;
85 
86   // The spelling string just queried
87   char *splstr_queried_;
88 
89   // The spelling string just queried
90   char16 *splstr16_queried_;
91 
92   // The root node of the spelling tree
93   SpellingNode* root_;
94 
95   // If a none qwerty key such as a fnction key like ENTER is given, this node
96   // will be used to indicate that this is not a QWERTY node.
97   SpellingNode* dumb_node_;
98 
99   // If a splitter key is pressed, this node will be used to indicate that this
100   // is a splitter key.
101   SpellingNode* splitter_node_;
102 
103   // Used to get the first level sons.
104   SpellingNode* level1_sons_[kValidSplCharNum];
105 
106   // The full spl_id range for specific half id.
107   // h2f means half to full.
108   // A half id can be a ShouZiMu id (id to represent the first char of a full
109   // spelling, including Shengmu and Yunmu), or id of zh/ch/sh.
110   // [1..kFullSplIdStart-1] is the arrange of half id.
111   uint16 h2f_start_[kFullSplIdStart];
112   uint16 h2f_num_[kFullSplIdStart];
113 
114   // Map from full id to half id.
115   uint16 *f2h_;
116 
117 #ifdef ___BUILD_MODEL___
118   // How many node used to build the trie.
119   size_t node_num_;
120 #endif
121 
122   SpellingTrie();
123 
124   void free_son_trie(SpellingNode* node);
125 
126   // Construct a subtree using a subset of the spelling array (from
127   // item_star to item_end).
128   // Member spelliing_buf_ and spelling_size_ should be valid.
129   // parent is used to update its num_of_son and score.
130   SpellingNode* construct_spellings_subset(size_t item_start, size_t item_end,
131                                            size_t level, SpellingNode *parent);
132   bool build_f2h();
133 
134   // The caller should guarantee ch >= 'A' && ch <= 'Z'
135   bool is_shengmu_char(char ch) const;
136 
137   // The caller should guarantee ch >= 'A' && ch <= 'Z'
138   bool is_yunmu_char(char ch) const;
139 
140 #ifdef ___BUILD_MODEL___
141   // Given a spelling string, return its Yunmu string.
142   // The caller guaratees spl_str is valid.
143   const char* get_ym_str(const char *spl_str);
144 
145   // Build the Yunmu list, and the mapping relation between the full ids and the
146   // Yunmu ids. This functin is called after the spelling trie is built.
147   bool build_ym_info();
148 #endif
149 
150   friend class SpellingParser;
151   friend class SmartSplParser;
152   friend class SmartSplParser2;
153 
154  public:
155   ~SpellingTrie();
156 
is_valid_spl_char(char ch)157   inline static bool is_valid_spl_char(char ch) {
158     return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
159   }
160 
161   // The caller guarantees that the two chars are valid spelling chars.
is_same_spl_char(char ch1,char ch2)162   inline static bool is_same_spl_char(char ch1, char ch2) {
163     return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A';
164   }
165 
166   // Construct the tree from the input pinyin array
167   // The given string list should have been sorted.
168   // score_amplifier is used to convert a possibility value into score.
169   // average_score is the average_score of all spellings. The dumb node is
170   // assigned with this score.
171   bool construct(const char* spelling_arr, size_t item_size, size_t item_num,
172                  float score_amplifier, unsigned char average_score);
173 
174   // Test if the given id is a valid spelling id.
175   // If function returns true, the given splid may be updated like this:
176   // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is
177   // first given as a half id 1, but because 'A' is a one-char Yunmu and
178   // it is a valid id, it needs to updated to its corresponding full id.
179   bool if_valid_id_update(uint16 *splid) const;
180 
181   // Test if the given id is a half id.
182   bool is_half_id(uint16 splid) const;
183 
184   bool is_full_id(uint16 splid) const;
185 
186   // Test if the given id is a one-char Yunmu id (obviously, it is also a half
187   // id), such as 'A', 'E' and 'O'.
188   bool is_half_id_yunmu(uint16 splid) const;
189 
190   // Test if this char is a ShouZiMu char. This ShouZiMu char may be not enabled.
191   // For Pinyin, only i/u/v is not a ShouZiMu char.
192   // The caller should guarantee that ch >= 'A' && ch <= 'Z'
193   bool is_szm_char(char ch) const;
194 
195   // Test If this char is enabled in ShouZiMu mode.
196   // The caller should guarantee that ch >= 'A' && ch <= 'Z'
197   bool szm_is_enabled(char ch) const;
198 
199   // Enable/disable Shengmus in ShouZiMu mode(using the first char of a spelling
200   // to input).
201   void szm_enable_shm(bool enable);
202 
203   // Enable/disable Yunmus in ShouZiMu mode.
204   void szm_enable_ym(bool enable);
205 
206   // Test if this char is enabled in ShouZiMu mode.
207   // The caller should guarantee ch >= 'A' && ch <= 'Z'
208   bool is_szm_enabled(char ch) const;
209 
210   // Return the number of full ids for the given half id.
211   uint16 half2full_num(uint16 half_id) const;
212 
213   // Return the number of full ids for the given half id, and fill spl_id_start
214   // to return the first full id.
215   uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const;
216 
217   // Return the corresponding half id for the given full id.
218   // Not frequently used, low efficient.
219   // Return 0 if fails.
220   uint16 full_to_half(uint16 full_id) const;
221 
222   // To test whether a half id is compatible with a full id.
223   // Generally, when half_id == full_to_half(full_id), return true.
224   // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible
225   // with a full id like "Zhe". (Fussy mode is not ready).
226   bool half_full_compatible(uint16 half_id, uint16 full_id) const;
227 
228   static const SpellingTrie* get_cpinstance();
229 
230   static SpellingTrie& get_instance();
231 
232   // Save to the file stream
233   bool save_spl_trie(FILE *fp);
234 
235   // Load from the file stream
236   bool load_spl_trie(FILE *fp);
237 
238   // Get the number of spellings
239   size_t get_spelling_num();
240 
241   // Return the Yunmu id for the given Yunmu string.
242   // If the string is not valid, return 0;
243   uint8 get_ym_id(const char* ym_str);
244 
245   // Get the readonly Pinyin string for a given spelling id
246   const char* get_spelling_str(uint16 splid);
247 
248   // Get the readonly Pinyin string for a given spelling id
249   const char16* get_spelling_str16(uint16 splid);
250 
251   // Get Pinyin string for a given spelling id. Return the length of the
252   // string, and fill-in '\0' at the end.
253   size_t get_spelling_str16(uint16 splid, char16 *splstr16,
254                             size_t splstr16_len);
255 };
256 }
257 
258 #endif  // PINYINIME_INCLUDE_SPELLINGTRIE_H__
259