• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <assert.h>
18 #include "../include/splparser.h"
19 
20 namespace ime_pinyin {
21 
SpellingParser()22 SpellingParser::SpellingParser() {
23   spl_trie_ = SpellingTrie::get_cpinstance();
24 }
25 
is_valid_to_parse(char ch)26 bool SpellingParser::is_valid_to_parse(char ch) {
27   return SpellingTrie::is_valid_spl_char(ch);
28 }
29 
splstr_to_idxs(const char * splstr,uint16 str_len,uint16 spl_idx[],uint16 start_pos[],uint16 max_size,bool & last_is_pre)30 uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len,
31                                       uint16 spl_idx[], uint16 start_pos[],
32                                       uint16 max_size, bool &last_is_pre) {
33   if (NULL == splstr || 0 == max_size || 0 == str_len)
34     return 0;
35 
36   if (!SpellingTrie::is_valid_spl_char(splstr[0]))
37     return 0;
38 
39   last_is_pre = false;
40 
41   const SpellingNode *node_this = spl_trie_->root_;
42 
43   uint16 str_pos = 0;
44   uint16 idx_num = 0;
45   if (NULL != start_pos)
46     start_pos[0] = 0;
47   bool last_is_splitter = false;
48 
49   while (str_pos < str_len) {
50     char char_this = splstr[str_pos];
51     // all characters outside of [a, z] are considered as splitters
52     if (!SpellingTrie::is_valid_spl_char(char_this)) {
53       // test if the current node is endable
54       uint16 id_this = node_this->spelling_idx;
55       if (spl_trie_->if_valid_id_update(&id_this)) {
56         spl_idx[idx_num] = id_this;
57 
58         idx_num++;
59         str_pos++;
60         if (NULL != start_pos)
61           start_pos[idx_num] = str_pos;
62         if (idx_num >= max_size)
63           return idx_num;
64 
65         node_this = spl_trie_->root_;
66         last_is_splitter = true;
67         continue;
68       } else {
69         if (last_is_splitter) {
70           str_pos++;
71           if (NULL != start_pos)
72             start_pos[idx_num] = str_pos;
73           continue;
74         } else {
75           return idx_num;
76         }
77       }
78     }
79 
80     last_is_splitter = false;
81 
82     SpellingNode *found_son = NULL;
83 
84     if (0 == str_pos) {
85       if (char_this >= 'a')
86         found_son = spl_trie_->level1_sons_[char_this - 'a'];
87       else
88         found_son = spl_trie_->level1_sons_[char_this - 'A'];
89     } else {
90       SpellingNode *first_son = node_this->first_son;
91       // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
92       // frequently used, so we scan from the end.
93       for (int i = 0; i < node_this->num_of_son; i++) {
94         SpellingNode *this_son = first_son + i;
95         if (SpellingTrie::is_same_spl_char(
96             this_son->char_this_node, char_this)) {
97           found_son = this_son;
98           break;
99         }
100       }
101     }
102 
103     // found, just move the current node pointer to the the son
104     if (NULL != found_son) {
105       node_this = found_son;
106     } else {
107       // not found, test if it is endable
108       uint16 id_this = node_this->spelling_idx;
109       if (spl_trie_->if_valid_id_update(&id_this)) {
110         // endable, remember the index
111         spl_idx[idx_num] = id_this;
112 
113         idx_num++;
114         if (NULL != start_pos)
115           start_pos[idx_num] = str_pos;
116         if (idx_num >= max_size)
117           return idx_num;
118         node_this = spl_trie_->root_;
119         continue;
120       } else {
121         return idx_num;
122       }
123     }
124 
125     str_pos++;
126   }
127 
128   uint16 id_this = node_this->spelling_idx;
129   if (spl_trie_->if_valid_id_update(&id_this)) {
130     // endable, remember the index
131     spl_idx[idx_num] = id_this;
132 
133     idx_num++;
134     if (NULL != start_pos)
135       start_pos[idx_num] = str_pos;
136   }
137 
138   last_is_pre = !last_is_splitter;
139 
140   return idx_num;
141 }
142 
splstr_to_idxs_f(const char * splstr,uint16 str_len,uint16 spl_idx[],uint16 start_pos[],uint16 max_size,bool & last_is_pre)143 uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len,
144                                         uint16 spl_idx[], uint16 start_pos[],
145                                         uint16 max_size, bool &last_is_pre) {
146   uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos,
147                                   max_size, last_is_pre);
148   for (uint16 pos = 0; pos < idx_num; pos++) {
149     if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
150       spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
151       if (pos == idx_num - 1) {
152         last_is_pre = false;
153       }
154     }
155   }
156   return idx_num;
157 }
158 
splstr16_to_idxs(const char16 * splstr,uint16 str_len,uint16 spl_idx[],uint16 start_pos[],uint16 max_size,bool & last_is_pre)159 uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len,
160                                         uint16 spl_idx[], uint16 start_pos[],
161                                         uint16 max_size, bool &last_is_pre) {
162   if (NULL == splstr || 0 == max_size || 0 == str_len)
163     return 0;
164 
165   if (!SpellingTrie::is_valid_spl_char(splstr[0]))
166     return 0;
167 
168   last_is_pre = false;
169 
170   const SpellingNode *node_this = spl_trie_->root_;
171 
172   uint16 str_pos = 0;
173   uint16 idx_num = 0;
174   if (NULL != start_pos)
175     start_pos[0] = 0;
176   bool last_is_splitter = false;
177 
178   while (str_pos < str_len) {
179     char16 char_this = splstr[str_pos];
180     // all characters outside of [a, z] are considered as splitters
181     if (!SpellingTrie::is_valid_spl_char(char_this)) {
182       // test if the current node is endable
183       uint16 id_this = node_this->spelling_idx;
184       if (spl_trie_->if_valid_id_update(&id_this)) {
185         spl_idx[idx_num] = id_this;
186 
187         idx_num++;
188         str_pos++;
189         if (NULL != start_pos)
190           start_pos[idx_num] = str_pos;
191         if (idx_num >= max_size)
192           return idx_num;
193 
194         node_this = spl_trie_->root_;
195         last_is_splitter = true;
196         continue;
197       } else {
198         if (last_is_splitter) {
199           str_pos++;
200           if (NULL != start_pos)
201             start_pos[idx_num] = str_pos;
202           continue;
203         } else {
204           return idx_num;
205         }
206       }
207     }
208 
209     last_is_splitter = false;
210 
211     SpellingNode *found_son = NULL;
212 
213     if (0 == str_pos) {
214       if (char_this >= 'a')
215         found_son = spl_trie_->level1_sons_[char_this - 'a'];
216       else
217         found_son = spl_trie_->level1_sons_[char_this - 'A'];
218     } else {
219       SpellingNode *first_son = node_this->first_son;
220       // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
221       // frequently used, so we scan from the end.
222       for (int i = 0; i < node_this->num_of_son; i++) {
223         SpellingNode *this_son = first_son + i;
224         if (SpellingTrie::is_same_spl_char(
225             this_son->char_this_node, char_this)) {
226           found_son = this_son;
227           break;
228         }
229       }
230     }
231 
232     // found, just move the current node pointer to the the son
233     if (NULL != found_son) {
234       node_this = found_son;
235     } else {
236       // not found, test if it is endable
237       uint16 id_this = node_this->spelling_idx;
238       if (spl_trie_->if_valid_id_update(&id_this)) {
239         // endable, remember the index
240         spl_idx[idx_num] = id_this;
241 
242         idx_num++;
243         if (NULL != start_pos)
244           start_pos[idx_num] = str_pos;
245         if (idx_num >= max_size)
246           return idx_num;
247         node_this = spl_trie_->root_;
248         continue;
249       } else {
250         return idx_num;
251       }
252     }
253 
254     str_pos++;
255   }
256 
257   uint16 id_this = node_this->spelling_idx;
258   if (spl_trie_->if_valid_id_update(&id_this)) {
259     // endable, remember the index
260     spl_idx[idx_num] = id_this;
261 
262     idx_num++;
263     if (NULL != start_pos)
264       start_pos[idx_num] = str_pos;
265   }
266 
267   last_is_pre = !last_is_splitter;
268 
269   return idx_num;
270 }
271 
splstr16_to_idxs_f(const char16 * splstr,uint16 str_len,uint16 spl_idx[],uint16 start_pos[],uint16 max_size,bool & last_is_pre)272 uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len,
273                                           uint16 spl_idx[], uint16 start_pos[],
274                                           uint16 max_size, bool &last_is_pre) {
275   uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos,
276                                     max_size, last_is_pre);
277   for (uint16 pos = 0; pos < idx_num; pos++) {
278     if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
279       spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
280       if (pos == idx_num - 1) {
281         last_is_pre = false;
282       }
283     }
284   }
285   return idx_num;
286 }
287 
get_splid_by_str(const char * splstr,uint16 str_len,bool * is_pre)288 uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len,
289                                         bool *is_pre) {
290   if (NULL == is_pre)
291     return 0;
292 
293   uint16 spl_idx[2];
294   uint16 start_pos[3];
295 
296   if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
297     return 0;
298 
299   if (start_pos[1] != str_len)
300     return 0;
301   return spl_idx[0];
302 }
303 
get_splid_by_str_f(const char * splstr,uint16 str_len,bool * is_pre)304 uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len,
305                                           bool *is_pre) {
306   if (NULL == is_pre)
307     return 0;
308 
309   uint16 spl_idx[2];
310   uint16 start_pos[3];
311 
312   if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
313     return 0;
314 
315   if (start_pos[1] != str_len)
316     return 0;
317   if (spl_trie_->is_half_id_yunmu(spl_idx[0])) {
318     spl_trie_->half_to_full(spl_idx[0], spl_idx);
319     *is_pre = false;
320   }
321 
322   return spl_idx[0];
323 }
324 
get_splids_parallel(const char * splstr,uint16 str_len,uint16 splidx[],uint16 max_size,uint16 & full_id_num,bool & is_pre)325 uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len,
326     uint16 splidx[], uint16 max_size,
327     uint16 &full_id_num, bool &is_pre) {
328   if (max_size <= 0 || !is_valid_to_parse(splstr[0]))
329     return 0;
330 
331   splidx[0] = get_splid_by_str(splstr, str_len, &is_pre);
332   full_id_num = 0;
333   if (0 != splidx[0]) {
334     if (splidx[0] >= kFullSplIdStart)
335       full_id_num = 1;
336     return 1;
337   }
338   return 0;
339 }
340 
341 }  // namespace ime_pinyin
342