1 /*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <assert.h>
18 #include "../include/splparser.h"
19
20 namespace ime_pinyin {
21
SpellingParser()22 SpellingParser::SpellingParser() {
23 spl_trie_ = SpellingTrie::get_cpinstance();
24 }
25
is_valid_to_parse(char ch)26 bool SpellingParser::is_valid_to_parse(char ch) {
27 return SpellingTrie::is_valid_spl_char(ch);
28 }
29
splstr_to_idxs(const char * splstr,uint16 str_len,uint16 spl_idx[],uint16 start_pos[],uint16 max_size,bool & last_is_pre)30 uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len,
31 uint16 spl_idx[], uint16 start_pos[],
32 uint16 max_size, bool &last_is_pre) {
33 if (NULL == splstr || 0 == max_size || 0 == str_len)
34 return 0;
35
36 if (!SpellingTrie::is_valid_spl_char(splstr[0]))
37 return 0;
38
39 last_is_pre = false;
40
41 const SpellingNode *node_this = spl_trie_->root_;
42
43 uint16 str_pos = 0;
44 uint16 idx_num = 0;
45 if (NULL != start_pos)
46 start_pos[0] = 0;
47 bool last_is_splitter = false;
48
49 while (str_pos < str_len) {
50 char char_this = splstr[str_pos];
51 // all characters outside of [a, z] are considered as splitters
52 if (!SpellingTrie::is_valid_spl_char(char_this)) {
53 // test if the current node is endable
54 uint16 id_this = node_this->spelling_idx;
55 if (spl_trie_->if_valid_id_update(&id_this)) {
56 spl_idx[idx_num] = id_this;
57
58 idx_num++;
59 str_pos++;
60 if (NULL != start_pos)
61 start_pos[idx_num] = str_pos;
62 if (idx_num >= max_size)
63 return idx_num;
64
65 node_this = spl_trie_->root_;
66 last_is_splitter = true;
67 continue;
68 } else {
69 if (last_is_splitter) {
70 str_pos++;
71 if (NULL != start_pos)
72 start_pos[idx_num] = str_pos;
73 continue;
74 } else {
75 return idx_num;
76 }
77 }
78 }
79
80 last_is_splitter = false;
81
82 SpellingNode *found_son = NULL;
83
84 if (0 == str_pos) {
85 if (char_this >= 'a')
86 found_son = spl_trie_->level1_sons_[char_this - 'a'];
87 else
88 found_son = spl_trie_->level1_sons_[char_this - 'A'];
89 } else {
90 SpellingNode *first_son = node_this->first_son;
91 // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
92 // frequently used, so we scan from the end.
93 for (int i = 0; i < node_this->num_of_son; i++) {
94 SpellingNode *this_son = first_son + i;
95 if (SpellingTrie::is_same_spl_char(
96 this_son->char_this_node, char_this)) {
97 found_son = this_son;
98 break;
99 }
100 }
101 }
102
103 // found, just move the current node pointer to the the son
104 if (NULL != found_son) {
105 node_this = found_son;
106 } else {
107 // not found, test if it is endable
108 uint16 id_this = node_this->spelling_idx;
109 if (spl_trie_->if_valid_id_update(&id_this)) {
110 // endable, remember the index
111 spl_idx[idx_num] = id_this;
112
113 idx_num++;
114 if (NULL != start_pos)
115 start_pos[idx_num] = str_pos;
116 if (idx_num >= max_size)
117 return idx_num;
118 node_this = spl_trie_->root_;
119 continue;
120 } else {
121 return idx_num;
122 }
123 }
124
125 str_pos++;
126 }
127
128 uint16 id_this = node_this->spelling_idx;
129 if (spl_trie_->if_valid_id_update(&id_this)) {
130 // endable, remember the index
131 spl_idx[idx_num] = id_this;
132
133 idx_num++;
134 if (NULL != start_pos)
135 start_pos[idx_num] = str_pos;
136 }
137
138 last_is_pre = !last_is_splitter;
139
140 return idx_num;
141 }
142
splstr_to_idxs_f(const char * splstr,uint16 str_len,uint16 spl_idx[],uint16 start_pos[],uint16 max_size,bool & last_is_pre)143 uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len,
144 uint16 spl_idx[], uint16 start_pos[],
145 uint16 max_size, bool &last_is_pre) {
146 uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos,
147 max_size, last_is_pre);
148 for (uint16 pos = 0; pos < idx_num; pos++) {
149 if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
150 spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
151 if (pos == idx_num - 1) {
152 last_is_pre = false;
153 }
154 }
155 }
156 return idx_num;
157 }
158
splstr16_to_idxs(const char16 * splstr,uint16 str_len,uint16 spl_idx[],uint16 start_pos[],uint16 max_size,bool & last_is_pre)159 uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len,
160 uint16 spl_idx[], uint16 start_pos[],
161 uint16 max_size, bool &last_is_pre) {
162 if (NULL == splstr || 0 == max_size || 0 == str_len)
163 return 0;
164
165 if (!SpellingTrie::is_valid_spl_char(splstr[0]))
166 return 0;
167
168 last_is_pre = false;
169
170 const SpellingNode *node_this = spl_trie_->root_;
171
172 uint16 str_pos = 0;
173 uint16 idx_num = 0;
174 if (NULL != start_pos)
175 start_pos[0] = 0;
176 bool last_is_splitter = false;
177
178 while (str_pos < str_len) {
179 char16 char_this = splstr[str_pos];
180 // all characters outside of [a, z] are considered as splitters
181 if (!SpellingTrie::is_valid_spl_char(char_this)) {
182 // test if the current node is endable
183 uint16 id_this = node_this->spelling_idx;
184 if (spl_trie_->if_valid_id_update(&id_this)) {
185 spl_idx[idx_num] = id_this;
186
187 idx_num++;
188 str_pos++;
189 if (NULL != start_pos)
190 start_pos[idx_num] = str_pos;
191 if (idx_num >= max_size)
192 return idx_num;
193
194 node_this = spl_trie_->root_;
195 last_is_splitter = true;
196 continue;
197 } else {
198 if (last_is_splitter) {
199 str_pos++;
200 if (NULL != start_pos)
201 start_pos[idx_num] = str_pos;
202 continue;
203 } else {
204 return idx_num;
205 }
206 }
207 }
208
209 last_is_splitter = false;
210
211 SpellingNode *found_son = NULL;
212
213 if (0 == str_pos) {
214 if (char_this >= 'a')
215 found_son = spl_trie_->level1_sons_[char_this - 'a'];
216 else
217 found_son = spl_trie_->level1_sons_[char_this - 'A'];
218 } else {
219 SpellingNode *first_son = node_this->first_son;
220 // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
221 // frequently used, so we scan from the end.
222 for (int i = 0; i < node_this->num_of_son; i++) {
223 SpellingNode *this_son = first_son + i;
224 if (SpellingTrie::is_same_spl_char(
225 this_son->char_this_node, char_this)) {
226 found_son = this_son;
227 break;
228 }
229 }
230 }
231
232 // found, just move the current node pointer to the the son
233 if (NULL != found_son) {
234 node_this = found_son;
235 } else {
236 // not found, test if it is endable
237 uint16 id_this = node_this->spelling_idx;
238 if (spl_trie_->if_valid_id_update(&id_this)) {
239 // endable, remember the index
240 spl_idx[idx_num] = id_this;
241
242 idx_num++;
243 if (NULL != start_pos)
244 start_pos[idx_num] = str_pos;
245 if (idx_num >= max_size)
246 return idx_num;
247 node_this = spl_trie_->root_;
248 continue;
249 } else {
250 return idx_num;
251 }
252 }
253
254 str_pos++;
255 }
256
257 uint16 id_this = node_this->spelling_idx;
258 if (spl_trie_->if_valid_id_update(&id_this)) {
259 // endable, remember the index
260 spl_idx[idx_num] = id_this;
261
262 idx_num++;
263 if (NULL != start_pos)
264 start_pos[idx_num] = str_pos;
265 }
266
267 last_is_pre = !last_is_splitter;
268
269 return idx_num;
270 }
271
splstr16_to_idxs_f(const char16 * splstr,uint16 str_len,uint16 spl_idx[],uint16 start_pos[],uint16 max_size,bool & last_is_pre)272 uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len,
273 uint16 spl_idx[], uint16 start_pos[],
274 uint16 max_size, bool &last_is_pre) {
275 uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos,
276 max_size, last_is_pre);
277 for (uint16 pos = 0; pos < idx_num; pos++) {
278 if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
279 spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
280 if (pos == idx_num - 1) {
281 last_is_pre = false;
282 }
283 }
284 }
285 return idx_num;
286 }
287
get_splid_by_str(const char * splstr,uint16 str_len,bool * is_pre)288 uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len,
289 bool *is_pre) {
290 if (NULL == is_pre)
291 return 0;
292
293 uint16 spl_idx[2];
294 uint16 start_pos[3];
295
296 if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
297 return 0;
298
299 if (start_pos[1] != str_len)
300 return 0;
301 return spl_idx[0];
302 }
303
get_splid_by_str_f(const char * splstr,uint16 str_len,bool * is_pre)304 uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len,
305 bool *is_pre) {
306 if (NULL == is_pre)
307 return 0;
308
309 uint16 spl_idx[2];
310 uint16 start_pos[3];
311
312 if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
313 return 0;
314
315 if (start_pos[1] != str_len)
316 return 0;
317 if (spl_trie_->is_half_id_yunmu(spl_idx[0])) {
318 spl_trie_->half_to_full(spl_idx[0], spl_idx);
319 *is_pre = false;
320 }
321
322 return spl_idx[0];
323 }
324
get_splids_parallel(const char * splstr,uint16 str_len,uint16 splidx[],uint16 max_size,uint16 & full_id_num,bool & is_pre)325 uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len,
326 uint16 splidx[], uint16 max_size,
327 uint16 &full_id_num, bool &is_pre) {
328 if (max_size <= 0 || !is_valid_to_parse(splstr[0]))
329 return 0;
330
331 splidx[0] = get_splid_by_str(splstr, str_len, &is_pre);
332 full_id_num = 0;
333 if (0 != splidx[0]) {
334 if (splidx[0] >= kFullSplIdStart)
335 full_id_num = 1;
336 return 1;
337 }
338 return 0;
339 }
340
341 } // namespace ime_pinyin
342