1 /*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <assert.h>
18 #include <math.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include "../include/lpicache.h"
22 #include "../include/matrixsearch.h"
23 #include "../include/mystdlib.h"
24 #include "../include/ngram.h"
25 #include "../include/userdict.h"
26
27 namespace ime_pinyin {
28
29 #define PRUMING_SCORE 8000.0
30
MatrixSearch()31 MatrixSearch::MatrixSearch() {
32 inited_ = false;
33 spl_trie_ = SpellingTrie::get_cpinstance();
34
35 reset_pointers_to_null();
36
37 pys_decoded_len_ = 0;
38 mtrx_nd_pool_used_ = 0;
39 dmi_pool_used_ = 0;
40 xi_an_enabled_ = false;
41 dmi_c_phrase_ = false;
42
43 assert(kMaxSearchSteps > 0);
44 max_sps_len_ = kMaxSearchSteps - 1;
45 max_hzs_len_ = kMaxSearchSteps;
46 }
47
~MatrixSearch()48 MatrixSearch::~MatrixSearch() {
49 free_resource();
50 }
51
reset_pointers_to_null()52 void MatrixSearch::reset_pointers_to_null() {
53 dict_trie_ = NULL;
54 user_dict_ = NULL;
55 spl_parser_ = NULL;
56
57 share_buf_ = NULL;
58
59 // The following four buffers are used for decoding, and they are based on
60 // share_buf_, no need to delete them.
61 mtrx_nd_pool_ = NULL;
62 dmi_pool_ = NULL;
63 matrix_ = NULL;
64 dep_ = NULL;
65
66 // Based on share_buf_, no need to delete them.
67 npre_items_ = NULL;
68 }
69
alloc_resource()70 bool MatrixSearch::alloc_resource() {
71 free_resource();
72
73 dict_trie_ = new DictTrie();
74 user_dict_ = static_cast<AtomDictBase*>(new UserDict());
75 spl_parser_ = new SpellingParser();
76
77 size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize;
78 mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t);
79 size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize;
80 dmi_size = align_to_size_t(dmi_size) / sizeof(size_t);
81 size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum;
82 matrix_size = align_to_size_t(matrix_size) / sizeof(size_t);
83 size_t dep_size = sizeof(DictExtPara);
84 dep_size = align_to_size_t(dep_size) / sizeof(size_t);
85
86 // share_buf's size is determined by the buffers for search.
87 share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size];
88
89 if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ ||
90 NULL == share_buf_)
91 return false;
92
93 // The buffers for search are based on the share buffer
94 mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_);
95 dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size);
96 matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size);
97 dep_ = reinterpret_cast<DictExtPara*>
98 (share_buf_ + mtrx_nd_size + dmi_size + matrix_size);
99
100 // The prediction buffer is also based on the share buffer.
101 npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_);
102 npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) *
103 sizeof(size_t) / sizeof(NPredictItem);
104 return true;
105 }
106
free_resource()107 void MatrixSearch::free_resource() {
108 if (NULL != dict_trie_)
109 delete dict_trie_;
110
111 if (NULL != user_dict_)
112 delete user_dict_;
113
114 if (NULL != spl_parser_)
115 delete spl_parser_;
116
117 if (NULL != share_buf_)
118 delete [] share_buf_;
119
120 reset_pointers_to_null();
121 }
122
init(const char * fn_sys_dict,const char * fn_usr_dict)123 bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) {
124 if (NULL == fn_sys_dict || NULL == fn_usr_dict)
125 return false;
126
127 if (!alloc_resource())
128 return false;
129
130 if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd))
131 return false;
132
133 // If engine fails to load the user dictionary, reset the user dictionary
134 // to NULL.
135 if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
136 delete user_dict_;
137 user_dict_ = NULL;
138 } else{
139 user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
140 }
141
142 reset_search0();
143
144 inited_ = true;
145 return true;
146 }
147
init_fd(int sys_fd,long start_offset,long length,const char * fn_usr_dict)148 bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length,
149 const char *fn_usr_dict) {
150 if (NULL == fn_usr_dict)
151 return false;
152
153 if (!alloc_resource())
154 return false;
155
156 if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd))
157 return false;
158
159 if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
160 delete user_dict_;
161 user_dict_ = NULL;
162 } else {
163 user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
164 }
165
166 reset_search0();
167
168 inited_ = true;
169 return true;
170 }
171
set_max_lens(size_t max_sps_len,size_t max_hzs_len)172 void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) {
173 if (0 != max_sps_len)
174 max_sps_len_ = max_sps_len;
175 if (0 != max_hzs_len)
176 max_hzs_len_ = max_hzs_len;
177 }
178
close()179 void MatrixSearch::close() {
180 flush_cache();
181 free_resource();
182 inited_ = false;
183 }
184
flush_cache()185 void MatrixSearch::flush_cache() {
186 if (NULL != user_dict_)
187 user_dict_->flush_cache();
188 }
189
set_xi_an_switch(bool xi_an_enabled)190 void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) {
191 xi_an_enabled_ = xi_an_enabled;
192 }
193
get_xi_an_switch()194 bool MatrixSearch::get_xi_an_switch() {
195 return xi_an_enabled_;
196 }
197
reset_search()198 bool MatrixSearch::reset_search() {
199 if (!inited_)
200 return false;
201 return reset_search0();
202 }
203
reset_search0()204 bool MatrixSearch::reset_search0() {
205 if (!inited_)
206 return false;
207
208 pys_decoded_len_ = 0;
209 mtrx_nd_pool_used_ = 0;
210 dmi_pool_used_ = 0;
211
212 // Get a MatrixNode from the pool
213 matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_;
214 matrix_[0].mtrx_nd_num = 1;
215 mtrx_nd_pool_used_ += 1;
216
217 // Update the node, and make it to be a starting node
218 MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos;
219 node->id = 0;
220 node->score = 0;
221 node->from = NULL;
222 node->step = 0;
223 node->dmi_fr = (PoolPosType)-1;
224
225 matrix_[0].dmi_pos = 0;
226 matrix_[0].dmi_num = 0;
227 matrix_[0].dmi_has_full_id = 1;
228 matrix_[0].mtrx_nd_fixed = node;
229
230 lma_start_[0] = 0;
231 fixed_lmas_ = 0;
232 spl_start_[0] = 0;
233 fixed_hzs_ = 0;
234
235 dict_trie_->reset_milestones(0, 0);
236 if (NULL != user_dict_)
237 user_dict_->reset_milestones(0, 0);
238
239 return true;
240 }
241
reset_search(size_t ch_pos,bool clear_fixed_this_step,bool clear_dmi_this_step,bool clear_mtrx_this_step)242 bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step,
243 bool clear_dmi_this_step,
244 bool clear_mtrx_this_step) {
245 if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum)
246 return false;
247
248 if (0 == ch_pos) {
249 reset_search0();
250 } else {
251 // Prepare mile stones of this step to clear.
252 MileStoneHandle *dict_handles_to_clear = NULL;
253 if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) {
254 dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles;
255 }
256
257 // If there are more steps, and this step is not allowed to clear, find
258 // milestones of next step.
259 if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) {
260 dict_handles_to_clear = NULL;
261 if (matrix_[ch_pos + 1].dmi_num > 0) {
262 dict_handles_to_clear =
263 dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles;
264 }
265 }
266
267 if (NULL != dict_handles_to_clear) {
268 dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]);
269 if (NULL != user_dict_)
270 user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]);
271 }
272
273 pys_decoded_len_ = ch_pos;
274
275 if (clear_dmi_this_step) {
276 dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos
277 + matrix_[ch_pos - 1].dmi_num;
278 matrix_[ch_pos].dmi_num = 0;
279 } else {
280 dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num;
281 }
282
283 if (clear_mtrx_this_step) {
284 mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos
285 + matrix_[ch_pos - 1].mtrx_nd_num;
286 matrix_[ch_pos].mtrx_nd_num = 0;
287 } else {
288 mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos
289 + matrix_[ch_pos].mtrx_nd_num;
290 }
291
292 // Modify fixed_hzs_
293 if (fixed_hzs_ > 0 &&
294 ((kLemmaIdComposing != lma_id_[0]) ||
295 (kLemmaIdComposing == lma_id_[0] &&
296 spl_start_[c_phrase_.length] <= ch_pos))) {
297 size_t fixed_ch_pos = ch_pos;
298 if (clear_fixed_this_step)
299 fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0;
300 while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0)
301 fixed_ch_pos--;
302
303 fixed_lmas_ = 0;
304 fixed_hzs_ = 0;
305 if (fixed_ch_pos > 0) {
306 while (spl_start_[fixed_hzs_] < fixed_ch_pos)
307 fixed_hzs_++;
308 assert(spl_start_[fixed_hzs_] == fixed_ch_pos);
309
310 while (lma_start_[fixed_lmas_] < fixed_hzs_)
311 fixed_lmas_++;
312 assert(lma_start_[fixed_lmas_] == fixed_hzs_);
313 }
314
315 // Re-search the Pinyin string for the unlocked lemma
316 // which was previously fixed.
317 //
318 // Prepare mile stones of this step to clear.
319 MileStoneHandle *dict_handles_to_clear = NULL;
320 if (clear_dmi_this_step && ch_pos == fixed_ch_pos &&
321 matrix_[fixed_ch_pos].dmi_num > 0) {
322 dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles;
323 }
324
325 // If there are more steps, and this step is not allowed to clear, find
326 // milestones of next step.
327 if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) {
328 dict_handles_to_clear = NULL;
329 if (matrix_[fixed_ch_pos + 1].dmi_num > 0) {
330 dict_handles_to_clear =
331 dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles;
332 }
333 }
334
335 if (NULL != dict_handles_to_clear) {
336 dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]);
337 if (NULL != user_dict_)
338 user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]);
339 }
340
341
342 pys_decoded_len_ = fixed_ch_pos;
343
344 if (clear_dmi_this_step && ch_pos == fixed_ch_pos) {
345 dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos
346 + matrix_[fixed_ch_pos - 1].dmi_num;
347 matrix_[fixed_ch_pos].dmi_num = 0;
348 } else {
349 dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos +
350 matrix_[fixed_ch_pos].dmi_num;
351 }
352
353 if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) {
354 mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos
355 + matrix_[fixed_ch_pos - 1].mtrx_nd_num;
356 matrix_[fixed_ch_pos].mtrx_nd_num = 0;
357 } else {
358 mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos
359 + matrix_[fixed_ch_pos].mtrx_nd_num;
360 }
361
362 for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) {
363 add_char(pys_[re_pos]);
364 }
365 } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) {
366 for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) {
367 uint16 splpos_begin = c_phrase_.sublma_start[subpos];
368 uint16 splpos_end = c_phrase_.sublma_start[subpos + 1];
369 for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) {
370 // If ch_pos is in this spelling
371 uint16 spl_start = c_phrase_.spl_start[splpos];
372 uint16 spl_end = c_phrase_.spl_start[splpos + 1];
373 if (ch_pos >= spl_start && ch_pos < spl_end) {
374 // Clear everything after this position
375 c_phrase_.chn_str[splpos] = static_cast<char16>('\0');
376 c_phrase_.sublma_start[subpos + 1] = splpos;
377 c_phrase_.sublma_num = subpos + 1;
378 c_phrase_.length = splpos;
379
380 if (splpos == splpos_begin) {
381 c_phrase_.sublma_num = subpos;
382 }
383 }
384 }
385 }
386
387 // Extend the composing phrase.
388 reset_search0();
389 dmi_c_phrase_ = true;
390 uint16 c_py_pos = 0;
391 while (c_py_pos < spl_start_[c_phrase_.length]) {
392 bool b_ac_tmp = add_char(pys_[c_py_pos]);
393 assert(b_ac_tmp);
394 c_py_pos++;
395 }
396 dmi_c_phrase_ = false;
397
398 lma_id_num_ = 1;
399 fixed_lmas_ = 1;
400 fixed_lmas_no1_[0] = 0; // A composing string is always modified.
401 fixed_hzs_ = c_phrase_.length;
402 lma_start_[1] = fixed_hzs_;
403 lma_id_[0] = kLemmaIdComposing;
404 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
405 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
406 }
407 }
408
409 return true;
410 }
411
del_in_pys(size_t start,size_t len)412 void MatrixSearch::del_in_pys(size_t start, size_t len) {
413 while (start < kMaxRowNum - len && '\0' != pys_[start]) {
414 pys_[start] = pys_[start + len];
415 start++;
416 }
417 }
418
search(const char * py,size_t py_len)419 size_t MatrixSearch::search(const char *py, size_t py_len) {
420 if (!inited_ || NULL == py)
421 return 0;
422
423 // If the search Pinyin string is too long, it will be truncated.
424 if (py_len > kMaxRowNum - 1)
425 py_len = kMaxRowNum - 1;
426
427 // Compare the new string with the previous one. Find their prefix to
428 // increase search efficiency.
429 size_t ch_pos = 0;
430 for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) {
431 if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos])
432 break;
433 }
434
435 bool clear_fix = true;
436 if (ch_pos == pys_decoded_len_)
437 clear_fix = false;
438
439 reset_search(ch_pos, clear_fix, false, false);
440
441 memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos);
442 pys_[py_len] = '\0';
443
444 while ('\0' != pys_[ch_pos]) {
445 if (!add_char(py[ch_pos])) {
446 pys_decoded_len_ = ch_pos;
447 break;
448 }
449 ch_pos++;
450 }
451
452 // Get spelling ids and starting positions.
453 get_spl_start_id();
454
455 // If there are too many spellings, remove the last letter until the spelling
456 // number is acceptable.
457 while (spl_id_num_ > 9) {
458 py_len--;
459 reset_search(py_len, false, false, false);
460 pys_[py_len] = '\0';
461 get_spl_start_id();
462 }
463
464 prepare_candidates();
465
466 if (kPrintDebug0) {
467 printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_);
468 printf("--DMI Pool Used: %d\n", dmi_pool_used_);
469
470 if (kPrintDebug1) {
471 for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) {
472 debug_print_dmi(pos, 1);
473 }
474 }
475 }
476
477 return ch_pos;
478 }
479
delsearch(size_t pos,bool is_pos_in_splid,bool clear_fixed_this_step)480 size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid,
481 bool clear_fixed_this_step) {
482 if (!inited_)
483 return 0;
484
485 size_t reset_pos = pos;
486
487 // Out of range for both Pinyin mode and Spelling id mode.
488 if (pys_decoded_len_ <= pos) {
489 del_in_pys(pos, 1);
490
491 reset_pos = pys_decoded_len_;
492 // Decode the string after the un-decoded position
493 while ('\0' != pys_[reset_pos]) {
494 if (!add_char(pys_[reset_pos])) {
495 pys_decoded_len_ = reset_pos;
496 break;
497 }
498 reset_pos++;
499 }
500 get_spl_start_id();
501 prepare_candidates();
502 return pys_decoded_len_;
503 }
504
505 // Spelling id mode, but out of range.
506 if (is_pos_in_splid && spl_id_num_ <= pos)
507 return pys_decoded_len_;
508
509 // Begin to handle two modes respectively.
510 // Pinyin mode by default
511 size_t c_py_len = 0; // The length of composing phrase's Pinyin
512 size_t del_py_len = 1;
513 if (!is_pos_in_splid) {
514 // Pinyin mode is only allowed to delete beyond the fixed lemmas.
515 if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]])
516 return pys_decoded_len_;
517
518 del_in_pys(pos, 1);
519
520 // If the deleted character is just the one after the last fixed lemma
521 if (pos == spl_start_[lma_start_[fixed_lmas_]]) {
522 // If all fixed lemmas have been merged, and the caller of the function
523 // request to unlock the last fixed lemma.
524 if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) {
525 // Unlock the last sub lemma in the composing phrase. Because it is not
526 // easy to unlock it directly. Instead, we re-decode the modified
527 // composing phrase.
528 c_phrase_.sublma_num--;
529 c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num];
530 reset_pos = spl_start_[c_phrase_.length];
531 c_py_len = reset_pos;
532 }
533 }
534 } else {
535 del_py_len = spl_start_[pos + 1] - spl_start_[pos];
536
537 del_in_pys(spl_start_[pos], del_py_len);
538
539 if (pos >= lma_start_[fixed_lmas_]) {
540 c_py_len = 0;
541 reset_pos = spl_start_[pos + 1] - del_py_len;
542 } else {
543 c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len;
544 reset_pos = c_py_len;
545 if (c_py_len > 0)
546 merge_fixed_lmas(pos);
547 }
548 }
549
550 if (c_py_len > 0) {
551 assert(c_phrase_.length > 0 && c_py_len ==
552 c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]);
553 // The composing phrase is valid, reset all search space,
554 // and begin a new search which will only extend the composing
555 // phrase.
556 reset_search0();
557
558 dmi_c_phrase_ = true;
559 // Extend the composing phrase.
560 uint16 c_py_pos = 0;
561 while (c_py_pos < c_py_len) {
562 bool b_ac_tmp = add_char(pys_[c_py_pos]);
563 assert(b_ac_tmp);
564 c_py_pos++;
565 }
566 dmi_c_phrase_ = false;
567
568 // Fixd the composing phrase as the first choice.
569 lma_id_num_ = 1;
570 fixed_lmas_ = 1;
571 fixed_lmas_no1_[0] = 0; // A composing string is always modified.
572 fixed_hzs_ = c_phrase_.length;
573 lma_start_[1] = fixed_hzs_;
574 lma_id_[0] = kLemmaIdComposing;
575 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
576 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
577 } else {
578 // Reseting search only clear pys_decoded_len_, but the string is kept.
579 reset_search(reset_pos, clear_fixed_this_step, false, false);
580 }
581
582 // Decode the string after the delete position.
583 while ('\0' != pys_[reset_pos]) {
584 if (!add_char(pys_[reset_pos])) {
585 pys_decoded_len_ = reset_pos;
586 break;
587 }
588 reset_pos++;
589 }
590
591 get_spl_start_id();
592 prepare_candidates();
593 return pys_decoded_len_;
594 }
595
get_candidate_num()596 size_t MatrixSearch::get_candidate_num() {
597 if (!inited_ || 0 == pys_decoded_len_ ||
598 0 == matrix_[pys_decoded_len_].mtrx_nd_num)
599 return 0;
600
601 return 1 + lpi_total_;
602 }
603
get_candidate(size_t cand_id,char16 * cand_str,size_t max_len)604 char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str,
605 size_t max_len) {
606 if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str)
607 return NULL;
608
609 if (0 == cand_id) {
610 return get_candidate0(cand_str, max_len, NULL, false);
611 } else {
612 cand_id--;
613 }
614
615 // For this case: the current sentence is a word only, and the user fixed it,
616 // so the result will be fixed to the sentence space, and
617 // lpi_total_ will be set to 0.
618 if (0 == lpi_total_) {
619 return get_candidate0(cand_str, max_len, NULL, false);
620 }
621
622 LemmaIdType id = lpi_items_[cand_id].id;
623 char16 s[kMaxLemmaSize + 1];
624
625 uint16 s_len = lpi_items_[cand_id].lma_len;
626 if (s_len > 1) {
627 s_len = get_lemma_str(id, s, kMaxLemmaSize + 1);
628 } else {
629 // For a single character, Hanzi is ready.
630 s[0] = lpi_items_[cand_id].hanzi;
631 s[1] = static_cast<char16>(0);
632 }
633
634 if (s_len > 0 && max_len > s_len) {
635 utf16_strncpy(cand_str, s, s_len);
636 cand_str[s_len] = (char16)'\0';
637 return cand_str;
638 }
639
640 return NULL;
641 }
642
update_dict_freq()643 void MatrixSearch::update_dict_freq() {
644 if (NULL != user_dict_) {
645 // Update the total frequency of all lemmas, including system lemmas and
646 // user dictionary lemmas.
647 size_t total_freq = user_dict_->get_total_lemma_count();
648 dict_trie_->set_total_lemma_count_of_others(total_freq);
649 }
650 }
651
add_lma_to_userdict(uint16 lma_fr,uint16 lma_to,float score)652 bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to,
653 float score) {
654 if (lma_to - lma_fr <= 1 || NULL == user_dict_)
655 return false;
656
657 char16 word_str[kMaxLemmaSize + 1];
658 uint16 spl_ids[kMaxLemmaSize];
659
660 uint16 spl_id_fr = 0;
661
662 for (uint16 pos = lma_fr; pos < lma_to; pos++) {
663 LemmaIdType lma_id = lma_id_[pos];
664 if (is_user_lemma(lma_id)) {
665 user_dict_->update_lemma(lma_id, 1, true);
666 }
667 uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos];
668 utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len);
669
670 uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr,
671 kMaxLemmaSize + 1 - spl_id_fr);
672 assert(tmp == lma_len);
673
674 tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true);
675 if (tmp != lma_len) {
676 return false;
677 }
678
679 spl_id_fr += lma_len;
680 }
681
682 assert(spl_id_fr <= kMaxLemmaSize);
683
684 return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids,
685 spl_id_fr, 1);
686 }
687
debug_print_dmi(PoolPosType dmi_pos,uint16 nest_level)688 void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) {
689 if (dmi_pos >= dmi_pool_used_) return;
690
691 DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
692
693 if (1 == nest_level) {
694 printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos);
695 }
696 if (dmi->dict_level > 1) {
697 debug_print_dmi(dmi->dmi_fr, nest_level + 1);
698 }
699 printf("---%d\n", dmi->dict_level);
700 printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]);
701 printf(" Spelling : %s, %d\n", SpellingTrie::get_instance().
702 get_spelling_str(dmi->spl_id), dmi->spl_id);
703 printf(" Total Pinyin Len: %d\n", dmi->splstr_len);
704 if (1 == nest_level) {
705 printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos);
706 }
707 }
708
try_add_cand0_to_userdict()709 bool MatrixSearch::try_add_cand0_to_userdict() {
710 size_t new_cand_num = get_candidate_num();
711 if (fixed_hzs_ > 0 && 1 == new_cand_num) {
712 float score_from = 0;
713 uint16 lma_id_from = 0;
714 uint16 pos = 0;
715 bool modified = false;
716 while (pos < fixed_lmas_) {
717 if (lma_start_[pos + 1] - lma_start_[lma_id_from] >
718 static_cast<uint16>(kMaxLemmaSize)) {
719 float score_to_add =
720 mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
721 .mtrx_nd_pos].score - score_from;
722 if (modified) {
723 score_to_add += 1.0;
724 if (score_to_add > NGram::kMaxScore) {
725 score_to_add = NGram::kMaxScore;
726 }
727 add_lma_to_userdict(lma_id_from, pos, score_to_add);
728 }
729 lma_id_from = pos;
730 score_from += score_to_add;
731
732 // Clear the flag for next user lemma.
733 modified = false;
734 }
735
736 if (0 == fixed_lmas_no1_[pos]) {
737 modified = true;
738 }
739 pos++;
740 }
741
742 // Single-char word is not allowed to add to userdict.
743 if (lma_start_[pos] - lma_start_[lma_id_from] > 1) {
744 float score_to_add =
745 mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
746 .mtrx_nd_pos].score - score_from;
747 if (modified) {
748 score_to_add += 1.0;
749 if (score_to_add > NGram::kMaxScore) {
750 score_to_add = NGram::kMaxScore;
751 }
752 add_lma_to_userdict(lma_id_from, pos, score_to_add);
753 }
754 }
755 }
756 return true;
757 }
758
759 // Choose a candidate, and give new candidates for next step.
760 // If user finishes selection, we will try to communicate with user dictionary
761 // to add new items or update score of some existing items.
762 //
763 // Basic rule:
764 // 1. If user selects the first choice:
765 // 1.1. If the first choice is not a sentence, instead, it is a lemma:
766 // 1.1.1. If the first choice is a user lemma, notify the user
767 // dictionary that a user lemma is hit, and add occuring count
768 // by 1.
769 // 1.1.2. If the first choice is a system lemma, do nothing.
770 // 1.2. If the first choice is a sentence containing more than one lemma:
771 // 1.2.1. The whole sentence will be added as a user lemma. If the
772 // sentence contains user lemmas, -> hit, and add occuring count
773 // by 1.
choose(size_t cand_id)774 size_t MatrixSearch::choose(size_t cand_id) {
775 if (!inited_ || 0 == pys_decoded_len_)
776 return 0;
777
778 if (0 == cand_id) {
779 fixed_hzs_ = spl_id_num_;
780 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
781 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
782 for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) {
783 fixed_lmas_no1_[pos] = 1;
784 }
785 fixed_lmas_ = lma_id_num_;
786 lpi_total_ = 0; // Clean all other candidates.
787
788 // 1. It is the first choice
789 if (1 == lma_id_num_) {
790 // 1.1. The first choice is not a sentence but a lemma
791 if (is_user_lemma(lma_id_[0])) {
792 // 1.1.1. The first choice is a user lemma, notify the user dictionary
793 // that it is hit.
794 if (NULL != user_dict_)
795 user_dict_->update_lemma(lma_id_[0], 1, true);
796 } else {
797 // 1.1.2. do thing for a system lemma.
798 }
799 } else {
800 // 1.2. The first choice is a sentence.
801 // 1.2.1 Try to add the whole sentence to user dictionary, the whole
802 // sentence may be splitted into many items.
803 if (NULL != user_dict_) {
804 try_add_cand0_to_userdict();
805 }
806 }
807 update_dict_freq();
808 return 1;
809 } else {
810 cand_id--;
811 }
812
813 // 2. It is not the full sentence candidate.
814 // Find the length of the candidate.
815 LemmaIdType id_chosen = lpi_items_[cand_id].id;
816 LmaScoreType score_chosen = lpi_items_[cand_id].psb;
817 size_t cand_len = lpi_items_[cand_id].lma_len;
818
819 assert(cand_len > 0);
820
821 // Notify the atom dictionary that this item is hit.
822 if (is_user_lemma(id_chosen)) {
823 if (NULL != user_dict_) {
824 user_dict_->update_lemma(id_chosen, 1, true);
825 }
826 update_dict_freq();
827 }
828
829 // 3. Fixed the chosen item.
830 // 3.1 Get the steps number.
831 size_t step_fr = spl_start_[fixed_hzs_];
832 size_t step_to = spl_start_[fixed_hzs_ + cand_len];
833
834 // 3.2 Save the length of the original string.
835 size_t pys_decoded_len = pys_decoded_len_;
836
837 // 3.2 Reset the space of the fixed part.
838 reset_search(step_to, false, false, true);
839
840 // 3.3 For the last character of the fixed part, the previous DMI
841 // information will be kept, while the MTRX information will be re-extended,
842 // and only one node will be extended.
843 matrix_[step_to].mtrx_nd_num = 0;
844
845 LmaPsbItem lpi_item;
846 lpi_item.psb = score_chosen;
847 lpi_item.id = id_chosen;
848
849 PoolPosType step_to_dmi_fr = match_dmi(step_to,
850 spl_id_ + fixed_hzs_, cand_len);
851 assert(step_to_dmi_fr != static_cast<PoolPosType>(-1));
852
853 extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1,
854 step_to_dmi_fr, step_to);
855
856 matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos;
857 mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos +
858 matrix_[step_to].mtrx_nd_num;
859
860 if (id_chosen == lma_id_[fixed_lmas_])
861 fixed_lmas_no1_[fixed_lmas_] = 1;
862 else
863 fixed_lmas_no1_[fixed_lmas_] = 0;
864 lma_id_[fixed_lmas_] = id_chosen;
865 lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len;
866 fixed_lmas_++;
867 fixed_hzs_ = fixed_hzs_ + cand_len;
868
869 while (step_to != pys_decoded_len) {
870 bool b = add_char(pys_[step_to]);
871 assert(b);
872 step_to++;
873 }
874
875 if (fixed_hzs_ < spl_id_num_) {
876 prepare_candidates();
877 } else {
878 lpi_total_ = 0;
879 if (NULL != user_dict_) {
880 try_add_cand0_to_userdict();
881 }
882 }
883
884 return get_candidate_num();
885 }
886
cancel_last_choice()887 size_t MatrixSearch::cancel_last_choice() {
888 if (!inited_ || 0 == pys_decoded_len_)
889 return 0;
890
891 size_t step_start = 0;
892 if (fixed_hzs_ > 0) {
893 size_t step_end = spl_start_[fixed_hzs_];
894 MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed;
895 assert(NULL != end_node);
896
897 step_start = end_node->from->step;
898
899 if (step_start > 0) {
900 DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr;
901 fixed_hzs_ -= dmi->dict_level;
902 } else {
903 fixed_hzs_ = 0;
904 }
905
906 reset_search(step_start, false, false, false);
907
908 while (pys_[step_start] != '\0') {
909 bool b = add_char(pys_[step_start]);
910 assert(b);
911 step_start++;
912 }
913
914 prepare_candidates();
915 }
916 return get_candidate_num();
917 }
918
get_fixedlen()919 size_t MatrixSearch::get_fixedlen() {
920 if (!inited_ || 0 == pys_decoded_len_)
921 return 0;
922 return fixed_hzs_;
923 }
924
prepare_add_char(char ch)925 bool MatrixSearch::prepare_add_char(char ch) {
926 if (pys_decoded_len_ >= kMaxRowNum - 1 ||
927 (!spl_parser_->is_valid_to_parse(ch) && ch != '\''))
928 return false;
929
930 if (dmi_pool_used_ >= kDmiPoolSize) return false;
931
932 pys_[pys_decoded_len_] = ch;
933 pys_decoded_len_++;
934
935 MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_;
936 mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_;
937 mtrx_this_row->mtrx_nd_num = 0;
938 mtrx_this_row->dmi_pos = dmi_pool_used_;
939 mtrx_this_row->dmi_num = 0;
940 mtrx_this_row->dmi_has_full_id = 0;
941
942 return true;
943 }
944
is_split_at(uint16 pos)945 bool MatrixSearch::is_split_at(uint16 pos) {
946 return !spl_parser_->is_valid_to_parse(pys_[pos - 1]);
947 }
948
fill_dmi(DictMatchInfo * dmi,MileStoneHandle * handles,PoolPosType dmi_fr,uint16 spl_id,uint16 node_num,unsigned char dict_level,bool splid_end_split,unsigned char splstr_len,unsigned char all_full_id)949 void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
950 PoolPosType dmi_fr, uint16 spl_id,
951 uint16 node_num, unsigned char dict_level,
952 bool splid_end_split, unsigned char splstr_len,
953 unsigned char all_full_id) {
954 dmi->dict_handles[0] = handles[0];
955 dmi->dict_handles[1] = handles[1];
956 dmi->dmi_fr = dmi_fr;
957 dmi->spl_id = spl_id;
958 dmi->dict_level = dict_level;
959 dmi->splid_end_split = splid_end_split ? 1 : 0;
960 dmi->splstr_len = splstr_len;
961 dmi->all_full_id = all_full_id;
962 dmi->c_phrase = 0;
963 }
964
add_char(char ch)965 bool MatrixSearch::add_char(char ch) {
966 if (!prepare_add_char(ch))
967 return false;
968 return add_char_qwerty();
969 }
970
add_char_qwerty()971 bool MatrixSearch::add_char_qwerty() {
972 matrix_[pys_decoded_len_].mtrx_nd_num = 0;
973
974 bool spl_matched = false;
975 uint16 longest_ext = 0;
976 // Extend the search matrix, from the oldest unfixed row. ext_len means
977 // extending length.
978 for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) {
979 if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_])
980 continue;
981
982 // Refer to the declaration of the variable dmi_has_full_id for the
983 // explanation of this piece of code. In one word, it is used to prevent
984 // from the unwise extending of "shoud ou" but allow the reasonable
985 // extending of "heng ao", "lang a", etc.
986 if (ext_len > 1 && 0 != longest_ext &&
987 0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) {
988 if (xi_an_enabled_)
989 continue;
990 else
991 break;
992 }
993
994 uint16 oldrow = pys_decoded_len_ - ext_len;
995
996 // 0. If that row is before the last fixed step, ignore.
997 if (spl_start_[fixed_hzs_] > oldrow)
998 continue;
999
1000 // 1. Check if that old row has valid MatrixNode. If no, means that row is
1001 // not a boundary, either a word boundary or a spelling boundary.
1002 // If it is for extending composing phrase, it's OK to ignore the 0.
1003 if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_)
1004 continue;
1005
1006 // 2. Get spelling id(s) for the last ext_len chars.
1007 uint16 spl_idx;
1008 bool is_pre = false;
1009 spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow,
1010 ext_len, &is_pre);
1011 if (is_pre)
1012 spl_matched = true;
1013
1014 if (0 == spl_idx)
1015 continue;
1016
1017 bool splid_end_split = is_split_at(oldrow + ext_len);
1018
1019 // 3. Extend the DMI nodes of that old row
1020 // + 1 is to extend an extra node from the root
1021 for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos;
1022 dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1;
1023 dmi_pos++) {
1024 DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
1025 if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) {
1026 dmi = NULL; // The last one, NULL means extending from the root.
1027 } else {
1028 // If the dmi is covered by the fixed arrange, ignore it.
1029 if (fixed_hzs_ > 0 &&
1030 pys_decoded_len_ - ext_len - dmi->splstr_len <
1031 spl_start_[fixed_hzs_]) {
1032 continue;
1033 }
1034 // If it is not in mode for composing phrase, and the source DMI node
1035 // is marked for composing phrase, ignore this node.
1036 if (dmi->c_phrase != 0 && !dmi_c_phrase_) {
1037 continue;
1038 }
1039 }
1040
1041 // For example, if "gao" is extended, "g ao" is not allowed.
1042 // or "zh" has been passed, "z h" is not allowed.
1043 // Both word and word-connection will be prevented.
1044 if (longest_ext > ext_len) {
1045 if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) {
1046 continue;
1047 }
1048
1049 // "z h" is not allowed.
1050 if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) {
1051 continue;
1052 }
1053 }
1054
1055 dep_->splids_extended = 0;
1056 if (NULL != dmi) {
1057 uint16 prev_ids_num = dmi->dict_level;
1058 if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) ||
1059 (dmi_c_phrase_ && prev_ids_num >= kMaxRowNum)) {
1060 continue;
1061 }
1062
1063 DictMatchInfo *d = dmi;
1064 while (d) {
1065 dep_->splids[--prev_ids_num] = d->spl_id;
1066 if ((PoolPosType)-1 == d->dmi_fr)
1067 break;
1068 d = dmi_pool_ + d->dmi_fr;
1069 }
1070 assert(0 == prev_ids_num);
1071 dep_->splids_extended = dmi->dict_level;
1072 }
1073 dep_->splids[dep_->splids_extended] = spl_idx;
1074 dep_->ext_len = ext_len;
1075 dep_->splid_end_split = splid_end_split;
1076
1077 dep_->id_num = 1;
1078 dep_->id_start = spl_idx;
1079 if (spl_trie_->is_half_id(spl_idx)) {
1080 // Get the full id list
1081 dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start));
1082 assert(dep_->id_num > 0);
1083 }
1084
1085 uint16 new_dmi_num;
1086
1087 new_dmi_num = extend_dmi(dep_, dmi);
1088
1089 if (new_dmi_num > 0) {
1090 if (dmi_c_phrase_) {
1091 dmi_pool_[dmi_pool_used_].c_phrase = 1;
1092 }
1093 matrix_[pys_decoded_len_].dmi_num += new_dmi_num;
1094 dmi_pool_used_ += new_dmi_num;
1095
1096 if (!spl_trie_->is_half_id(spl_idx))
1097 matrix_[pys_decoded_len_].dmi_has_full_id = 1;
1098 }
1099
1100 // If get candiate lemmas, try to extend the path
1101 if (lpi_total_ > 0) {
1102 uint16 fr_row;
1103 if (NULL == dmi) {
1104 fr_row = oldrow;
1105 } else {
1106 assert(oldrow >= dmi->splstr_len);
1107 fr_row = oldrow - dmi->splstr_len;
1108 }
1109 for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos;
1110 mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos +
1111 matrix_[fr_row].mtrx_nd_num;
1112 mtrx_nd_pos++) {
1113 MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos;
1114
1115 extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_,
1116 dmi_pool_used_ - new_dmi_num, pys_decoded_len_);
1117 if (longest_ext == 0)
1118 longest_ext = ext_len;
1119 }
1120 }
1121 } // for dmi_pos
1122 } // for ext_len
1123 mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num;
1124
1125 if (dmi_c_phrase_)
1126 return true;
1127
1128 return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched);
1129 }
1130
prepare_candidates()1131 void MatrixSearch::prepare_candidates() {
1132 // Get candiates from the first un-fixed step.
1133 uint16 lma_size_max = kMaxLemmaSize;
1134 if (lma_size_max > spl_id_num_ - fixed_hzs_)
1135 lma_size_max = spl_id_num_ - fixed_hzs_;
1136
1137 uint16 lma_size = lma_size_max;
1138
1139 // If the full sentense candidate's unfixed part may be the same with a normal
1140 // lemma. Remove the lemma candidate in this case.
1141 char16 fullsent[kMaxLemmaSize + 1];
1142 char16 *pfullsent = NULL;
1143 uint16 sent_len;
1144 pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true);
1145
1146 // If the unfixed part contains more than one ids, it is not necessary to
1147 // check whether a lemma's string is the same to the unfixed part of the full
1148 // sentence candidate, so, set it to NULL;
1149 if (sent_len > kMaxLemmaSize)
1150 pfullsent = NULL;
1151
1152 lpi_total_ = 0;
1153 size_t lpi_num_full_match = 0; // Number of items which are fully-matched.
1154 while (lma_size > 0) {
1155 size_t lma_num;
1156 lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size,
1157 lpi_items_ + lpi_total_,
1158 size_t(kMaxLmaPsbItems - lpi_total_),
1159 pfullsent, lma_size == lma_size_max);
1160
1161 if (lma_num > 0) {
1162 lpi_total_ += lma_num;
1163 // For next lemma candidates which are not the longest, it is not
1164 // necessary to compare with the full sentence candiate.
1165 pfullsent = NULL;
1166 }
1167 if (lma_size == lma_size_max) {
1168 lpi_num_full_match = lpi_total_;
1169 }
1170 lma_size--;
1171 }
1172
1173 // Sort those partially-matched items by their unified scores.
1174 myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match,
1175 sizeof(LmaPsbItem), cmp_lpi_with_unified_psb);
1176
1177 if (kPrintDebug0) {
1178 printf("-----Prepare candidates, score:\n");
1179 for (size_t a = 0; a < lpi_total_; a++) {
1180 printf("[%03d]%d ", a, lpi_items_[a].psb);
1181 if ((a + 1) % 6 == 0) printf("\n");
1182 }
1183 printf("\n");
1184 }
1185
1186 if (kPrintDebug0) {
1187 printf("--- lpi_total_ = %d\n", lpi_total_);
1188 }
1189 }
1190
get_pystr(size_t * decoded_len)1191 const char* MatrixSearch::get_pystr(size_t *decoded_len) {
1192 if (!inited_ || NULL == decoded_len)
1193 return NULL;
1194
1195 *decoded_len = pys_decoded_len_;
1196 return pys_;
1197 }
1198
merge_fixed_lmas(size_t del_spl_pos)1199 void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) {
1200 if (fixed_lmas_ == 0)
1201 return;
1202 // Update spelling segmentation information first.
1203 spl_id_num_ -= 1;
1204 uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos];
1205 for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) {
1206 spl_start_[pos] = spl_start_[pos + 1] - del_py_len;
1207 if (pos == spl_id_num_)
1208 break;
1209 spl_id_[pos] = spl_id_[pos + 1];
1210 }
1211
1212 // Begin to merge.
1213 uint16 phrase_len = 0;
1214
1215 // Update the spelling ids to the composing phrase.
1216 // We need to convert these ids into full id in the future.
1217 memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16));
1218 memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16));
1219
1220 // If composing phrase has not been created, first merge all fixed
1221 // lemmas into a composing phrase without deletion.
1222 if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) {
1223 uint16 bp = 1; // Begin position of real fixed lemmas.
1224 // There is no existing composing phrase.
1225 if (kLemmaIdComposing != lma_id_[0]) {
1226 c_phrase_.sublma_num = 0;
1227 bp = 0;
1228 }
1229
1230 uint16 sub_num = c_phrase_.sublma_num;
1231 for (uint16 pos = bp; pos <= fixed_lmas_; pos++) {
1232 c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos];
1233 if (lma_start_[pos] > del_spl_pos) {
1234 c_phrase_.sublma_start[sub_num + pos - bp] -= 1;
1235 }
1236
1237 if (pos == fixed_lmas_)
1238 break;
1239
1240 uint16 lma_len;
1241 char16 *lma_str = c_phrase_.chn_str +
1242 c_phrase_.sublma_start[sub_num] + phrase_len;
1243
1244 lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len);
1245 assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]);
1246 phrase_len += lma_len;
1247 }
1248 assert(phrase_len == lma_start_[fixed_lmas_]);
1249 c_phrase_.length = phrase_len; // will be deleted by 1
1250 c_phrase_.sublma_num += fixed_lmas_ - bp;
1251 } else {
1252 for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) {
1253 if (c_phrase_.sublma_start[pos] > del_spl_pos) {
1254 c_phrase_.sublma_start[pos] -= 1;
1255 }
1256 }
1257 phrase_len = c_phrase_.length;
1258 }
1259
1260 assert(phrase_len > 0);
1261 if (1 == phrase_len) {
1262 // After the only one is deleted, nothing will be left.
1263 fixed_lmas_ = 0;
1264 return;
1265 }
1266
1267 // Delete the Chinese character in the merged phrase.
1268 // The corresponding elements in spl_ids and spl_start of the
1269 // phrase have been deleted.
1270 char16 *chn_str = c_phrase_.chn_str + del_spl_pos;
1271 for (uint16 pos = 0;
1272 pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos;
1273 pos++) {
1274 chn_str[pos] = chn_str[pos + 1];
1275 }
1276 c_phrase_.length -= 1;
1277
1278 // If the deleted spelling id is in a sub lemma which contains more than
1279 // one id, del_a_sub will be false; but if the deleted id is in a sub lemma
1280 // which only contains 1 id, the whole sub lemma needs to be deleted, so
1281 // del_a_sub will be true.
1282 bool del_a_sub = false;
1283 for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) {
1284 if (c_phrase_.sublma_start[pos - 1] ==
1285 c_phrase_.sublma_start[pos]) {
1286 del_a_sub = true;
1287 }
1288 if (del_a_sub) {
1289 c_phrase_.sublma_start[pos - 1] =
1290 c_phrase_.sublma_start[pos];
1291 }
1292 }
1293 if (del_a_sub)
1294 c_phrase_.sublma_num -= 1;
1295
1296 return;
1297 }
1298
get_spl_start_id()1299 void MatrixSearch::get_spl_start_id() {
1300 lma_id_num_ = 0;
1301 lma_start_[0] = 0;
1302
1303 spl_id_num_ = 0;
1304 spl_start_[0] = 0;
1305 if (!inited_ || 0 == pys_decoded_len_ ||
1306 0 == matrix_[pys_decoded_len_].mtrx_nd_num)
1307 return;
1308
1309 // Calculate number of lemmas and spellings
1310 // Only scan those part which is not fixed.
1311 lma_id_num_ = fixed_lmas_;
1312 spl_id_num_ = fixed_hzs_;
1313
1314 MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
1315 while (mtrx_nd != mtrx_nd_pool_) {
1316 if (fixed_hzs_ > 0) {
1317 if (mtrx_nd->step <= spl_start_[fixed_hzs_])
1318 break;
1319 }
1320
1321 // Update the spelling segamentation information
1322 unsigned char word_splstr_len = 0;
1323 PoolPosType dmi_fr = mtrx_nd->dmi_fr;
1324 if ((PoolPosType)-1 != dmi_fr)
1325 word_splstr_len = dmi_pool_[dmi_fr].splstr_len;
1326
1327 while ((PoolPosType)-1 != dmi_fr) {
1328 spl_start_[spl_id_num_ + 1] = mtrx_nd->step -
1329 (word_splstr_len - dmi_pool_[dmi_fr].splstr_len);
1330 spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id;
1331 spl_id_num_++;
1332 dmi_fr = dmi_pool_[dmi_fr].dmi_fr;
1333 }
1334
1335 // Update the lemma segmentation information
1336 lma_start_[lma_id_num_ + 1] = spl_id_num_;
1337 lma_id_[lma_id_num_] = mtrx_nd->id;
1338 lma_id_num_++;
1339
1340 mtrx_nd = mtrx_nd->from;
1341 }
1342
1343 // Reverse the result of spelling info
1344 for (size_t pos = fixed_hzs_;
1345 pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) {
1346 if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) {
1347 spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
1348 spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1];
1349 spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
1350
1351 spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1];
1352 spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos];
1353 spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1];
1354 }
1355 }
1356
1357 // Reverse the result of lemma info
1358 for (size_t pos = fixed_lmas_;
1359 pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) {
1360 assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos);
1361
1362 if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) {
1363 lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
1364 lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1];
1365 lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
1366
1367 lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
1368 lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos];
1369 lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
1370 }
1371 }
1372
1373 for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) {
1374 if (pos < lma_id_num_)
1375 lma_start_[pos] = lma_start_[pos - 1] +
1376 (lma_start_[pos] - lma_start_[pos + 1]);
1377 else
1378 lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] -
1379 lma_start_[fixed_lmas_];
1380 }
1381
1382 // Find the last fixed position
1383 fixed_hzs_ = 0;
1384 for (size_t pos = spl_id_num_; pos > 0; pos--) {
1385 if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) {
1386 fixed_hzs_ = pos;
1387 break;
1388 }
1389 }
1390
1391 return;
1392 }
1393
get_spl_start(const uint16 * & spl_start)1394 size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) {
1395 get_spl_start_id();
1396 spl_start = spl_start_;
1397 return spl_id_num_;
1398 }
1399
extend_dmi(DictExtPara * dep,DictMatchInfo * dmi_s)1400 size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) {
1401 if (dmi_pool_used_ >= kDmiPoolSize) return 0;
1402
1403 if (dmi_c_phrase_)
1404 return extend_dmi_c(dep, dmi_s);
1405
1406 LpiCache& lpi_cache = LpiCache::get_instance();
1407 uint16 splid = dep->splids[dep->splids_extended];
1408
1409 bool cached = false;
1410 if (0 == dep->splids_extended)
1411 cached = lpi_cache.is_cached(splid);
1412
1413 // 1. If this is a half Id, get its corresponding full starting Id and
1414 // number of full Id.
1415 size_t ret_val = 0;
1416 PoolPosType mtrx_dmi_fr = (PoolPosType)-1; // From which dmi node
1417
1418 lpi_total_ = 0;
1419
1420 MileStoneHandle from_h[3];
1421 from_h[0] = 0;
1422 from_h[1] = 0;
1423
1424 if (0 != dep->splids_extended) {
1425 from_h[0] = dmi_s->dict_handles[0];
1426 from_h[1] = dmi_s->dict_handles[1];
1427 }
1428
1429 // 2. Begin exgtending in the system dictionary
1430 size_t lpi_num = 0;
1431 MileStoneHandle handles[2];
1432 handles[0] = handles[1] = 0;
1433 if (from_h[0] > 0 || NULL == dmi_s) {
1434 handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_,
1435 kMaxLmaPsbItems, &lpi_num);
1436 }
1437 if (handles[0] > 0)
1438 lpi_total_ = lpi_num;
1439
1440 if (NULL == dmi_s) { // from root
1441 assert(0 != handles[0]);
1442 mtrx_dmi_fr = dmi_pool_used_;
1443 }
1444
1445 // 3. Begin extending in the user dictionary
1446 if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) {
1447 handles[1] = user_dict_->extend_dict(from_h[1], dep,
1448 lpi_items_ + lpi_total_,
1449 kMaxLmaPsbItems - lpi_total_,
1450 &lpi_num);
1451 if (handles[1] > 0) {
1452 if (kPrintDebug0) {
1453 for (size_t t = 0; t < lpi_num; t++) {
1454 printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id,
1455 lpi_items_[lpi_total_ + t].psb);
1456 }
1457 }
1458 lpi_total_ += lpi_num;
1459 }
1460 }
1461
1462 if (0 != handles[0] || 0 != handles[1]) {
1463 if (dmi_pool_used_ >= kDmiPoolSize) return 0;
1464
1465 DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
1466 if (NULL == dmi_s) {
1467 fill_dmi(dmi_add, handles,
1468 (PoolPosType)-1, splid,
1469 1, 1, dep->splid_end_split, dep->ext_len,
1470 spl_trie_->is_half_id(splid) ? 0 : 1);
1471 } else {
1472 fill_dmi(dmi_add, handles,
1473 dmi_s - dmi_pool_, splid, 1,
1474 dmi_s->dict_level + 1, dep->splid_end_split,
1475 dmi_s->splstr_len + dep->ext_len,
1476 spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
1477 }
1478
1479 ret_val = 1;
1480 }
1481
1482 if (!cached) {
1483 if (0 == lpi_total_)
1484 return ret_val;
1485
1486 if (kPrintDebug0) {
1487 printf("--- lpi_total_ = %d\n", lpi_total_);
1488 }
1489
1490 myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb);
1491 if (NULL == dmi_s && spl_trie_->is_half_id(splid))
1492 lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_);
1493 } else {
1494 assert(spl_trie_->is_half_id(splid));
1495 lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems);
1496 }
1497
1498 return ret_val;
1499 }
1500
extend_dmi_c(DictExtPara * dep,DictMatchInfo * dmi_s)1501 size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) {
1502 lpi_total_ = 0;
1503
1504 uint16 pos = dep->splids_extended;
1505 assert(dmi_c_phrase_);
1506 if (pos >= c_phrase_.length)
1507 return 0;
1508
1509 uint16 splid = dep->splids[pos];
1510 if (splid == c_phrase_.spl_ids[pos]) {
1511 DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
1512 MileStoneHandle handles[2]; // Actually never used.
1513 if (NULL == dmi_s)
1514 fill_dmi(dmi_add, handles,
1515 (PoolPosType)-1, splid,
1516 1, 1, dep->splid_end_split, dep->ext_len,
1517 spl_trie_->is_half_id(splid) ? 0 : 1);
1518 else
1519 fill_dmi(dmi_add, handles,
1520 dmi_s - dmi_pool_, splid, 1,
1521 dmi_s->dict_level + 1, dep->splid_end_split,
1522 dmi_s->splstr_len + dep->ext_len,
1523 spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
1524
1525 if (pos == c_phrase_.length - 1) {
1526 lpi_items_[0].id = kLemmaIdComposing;
1527 lpi_items_[0].psb = 0; // 0 is bigger than normal lemma score.
1528 lpi_total_ = 1;
1529 }
1530 return 1;
1531 }
1532 return 0;
1533 }
1534
extend_mtrx_nd(MatrixNode * mtrx_nd,LmaPsbItem lpi_items[],size_t lpi_num,PoolPosType dmi_fr,size_t res_row)1535 size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
1536 size_t lpi_num, PoolPosType dmi_fr,
1537 size_t res_row) {
1538 assert(NULL != mtrx_nd);
1539 matrix_[res_row].mtrx_nd_fixed = NULL;
1540
1541 if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow)
1542 return 0;
1543
1544 if (0 == mtrx_nd->step) {
1545 // Because the list is sorted, if the source step is 0, it is only
1546 // necessary to pick up the first kMaxNodeARow items.
1547 if (lpi_num > kMaxNodeARow)
1548 lpi_num = kMaxNodeARow;
1549 }
1550
1551 MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos;
1552 for (size_t pos = 0; pos < lpi_num; pos++) {
1553 float score = mtrx_nd->score + lpi_items[pos].psb;
1554 if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score)
1555 break;
1556
1557 // Try to add a new node
1558 size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num;
1559 MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num;
1560 bool replace = false;
1561 // Find its position
1562 while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) {
1563 if (static_cast<size_t>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow)
1564 *mtrx_nd_res = *(mtrx_nd_res - 1);
1565 mtrx_nd_res--;
1566 replace = true;
1567 }
1568 if (replace || (mtrx_nd_num < kMaxNodeARow &&
1569 matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) {
1570 mtrx_nd_res->id = lpi_items[pos].id;
1571 mtrx_nd_res->score = score;
1572 mtrx_nd_res->from = mtrx_nd;
1573 mtrx_nd_res->dmi_fr = dmi_fr;
1574 mtrx_nd_res->step = res_row;
1575 if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow)
1576 matrix_[res_row].mtrx_nd_num++;
1577 }
1578 }
1579 return matrix_[res_row].mtrx_nd_num;
1580 }
1581
match_dmi(size_t step_to,uint16 spl_ids[],uint16 spl_id_num)1582 PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[],
1583 uint16 spl_id_num) {
1584 if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) {
1585 return static_cast<PoolPosType>(-1);
1586 }
1587
1588 for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) {
1589 DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos;
1590
1591 if (dmi->dict_level != spl_id_num)
1592 continue;
1593
1594 bool matched = true;
1595 for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) {
1596 if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) {
1597 matched = false;
1598 break;
1599 }
1600
1601 dmi = dmi_pool_ + dmi->dmi_fr;
1602 }
1603 if (matched) {
1604 return matrix_[step_to].dmi_pos + dmi_pos;
1605 }
1606 }
1607
1608 return static_cast<PoolPosType>(-1);
1609 }
1610
get_candidate0(char16 * cand_str,size_t max_len,uint16 * retstr_len,bool only_unfixed)1611 char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len,
1612 uint16 *retstr_len,
1613 bool only_unfixed) {
1614 if (pys_decoded_len_ == 0 ||
1615 matrix_[pys_decoded_len_].mtrx_nd_num == 0)
1616 return NULL;
1617
1618 LemmaIdType idxs[kMaxRowNum];
1619 size_t id_num = 0;
1620
1621 MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
1622
1623 if (kPrintDebug0) {
1624 printf("--- sentence score: %f\n", mtrx_nd->score);
1625 }
1626
1627 if (kPrintDebug1) {
1628 printf("==============Sentence DMI (reverse order) begin===========>>\n");
1629 }
1630
1631 while (mtrx_nd != NULL) {
1632 idxs[id_num] = mtrx_nd->id;
1633 id_num++;
1634
1635 if (kPrintDebug1) {
1636 printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n",
1637 mtrx_nd->step, mtrx_nd->id, mtrx_nd->score);
1638 debug_print_dmi(mtrx_nd->dmi_fr, 1);
1639 }
1640
1641 mtrx_nd = mtrx_nd->from;
1642 }
1643
1644 if (kPrintDebug1) {
1645 printf("<<==============Sentence DMI (reverse order) end=============\n");
1646 }
1647
1648 size_t ret_pos = 0;
1649 do {
1650 id_num--;
1651 if (0 == idxs[id_num])
1652 continue;
1653
1654 char16 str[kMaxLemmaSize + 1];
1655 uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1);
1656 if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) ||
1657 (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) {
1658 if (!only_unfixed)
1659 utf16_strncpy(cand_str + ret_pos, str, str_len);
1660 else if (ret_pos >= fixed_hzs_)
1661 utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len);
1662
1663 ret_pos += str_len;
1664 } else {
1665 return NULL;
1666 }
1667 } while (id_num != 0);
1668
1669 if (!only_unfixed) {
1670 if (NULL != retstr_len)
1671 *retstr_len = ret_pos;
1672 cand_str[ret_pos] = (char16)'\0';
1673 } else {
1674 if (NULL != retstr_len)
1675 *retstr_len = ret_pos - fixed_hzs_;
1676 cand_str[ret_pos - fixed_hzs_] = (char16)'\0';
1677 }
1678 return cand_str;
1679 }
1680
get_lpis(const uint16 * splid_str,size_t splid_str_len,LmaPsbItem * lma_buf,size_t max_lma_buf,const char16 * pfullsent,bool sort_by_psb)1681 size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len,
1682 LmaPsbItem* lma_buf, size_t max_lma_buf,
1683 const char16 *pfullsent, bool sort_by_psb) {
1684 if (splid_str_len > kMaxLemmaSize)
1685 return 0;
1686
1687 size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len,
1688 lma_buf, max_lma_buf);
1689 size_t num2 = 0;
1690 if (NULL != user_dict_) {
1691 num2 = user_dict_->get_lpis(splid_str, splid_str_len,
1692 lma_buf + num1, max_lma_buf - num1);
1693 }
1694
1695 size_t num = num1 + num2;
1696
1697 if (0 == num)
1698 return 0;
1699
1700 // Remove repeated items.
1701 if (splid_str_len > 1) {
1702 LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num);
1703 size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) /
1704 sizeof(LmaPsbStrItem);
1705 assert(lpsi_num > num);
1706 if (num > lpsi_num) num = lpsi_num;
1707 lpsi_num = num;
1708
1709 for (size_t pos = 0; pos < lpsi_num; pos++) {
1710 lpsis[pos].lpi = lma_buf[pos];
1711 get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1);
1712 }
1713
1714 myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str);
1715
1716 size_t remain_num = 0;
1717 for (size_t pos = 0; pos < lpsi_num; pos++) {
1718 if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) {
1719 if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) {
1720 assert(remain_num > 0);
1721 lma_buf[remain_num - 1] = lpsis[pos].lpi;
1722 }
1723 continue;
1724 }
1725 if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0)
1726 continue;
1727
1728 lma_buf[remain_num] = lpsis[pos].lpi;
1729 remain_num++;
1730 }
1731
1732 // Update the result number
1733 num = remain_num;
1734 } else {
1735 // For single character, some characters have more than one spelling, for
1736 // example, "de" and "di" are all valid for a Chinese character, so when
1737 // the user input "d", repeated items are generated.
1738 // For single character lemmas, Hanzis will be gotten
1739 for (size_t pos = 0; pos < num; pos++) {
1740 char16 hanzis[2];
1741 get_lemma_str(lma_buf[pos].id, hanzis, 2);
1742 lma_buf[pos].hanzi = hanzis[0];
1743 }
1744
1745 myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi);
1746
1747 size_t remain_num = 0;
1748 for (size_t pos = 0; pos < num; pos++) {
1749 if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) {
1750 if (NULL != pfullsent &&
1751 static_cast<char16>(0) == pfullsent[1] &&
1752 lma_buf[pos].hanzi == pfullsent[0])
1753 continue;
1754
1755 if (lma_buf[pos].psb < lma_buf[pos - 1].psb) {
1756 assert(remain_num > 0);
1757 assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi);
1758 lma_buf[remain_num - 1] = lma_buf[pos];
1759 }
1760 continue;
1761 }
1762 if (NULL != pfullsent &&
1763 static_cast<char16>(0) == pfullsent[1] &&
1764 lma_buf[pos].hanzi == pfullsent[0])
1765 continue;
1766
1767 lma_buf[remain_num] = lma_buf[pos];
1768 remain_num++;
1769 }
1770
1771 num = remain_num;
1772 }
1773
1774 if (sort_by_psb) {
1775 myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb);
1776 }
1777 return num;
1778 }
1779
get_lemma_str(LemmaIdType id_lemma,char16 * str_buf,uint16 str_max)1780 uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
1781 uint16 str_max) {
1782 uint16 str_len = 0;
1783
1784 if (is_system_lemma(id_lemma)) {
1785 str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max);
1786 } else if (is_user_lemma(id_lemma)) {
1787 if (NULL != user_dict_) {
1788 str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max);
1789 } else {
1790 str_len = 0;
1791 str_buf[0] = static_cast<char16>('\0');
1792 }
1793 } else if (is_composing_lemma(id_lemma)) {
1794 if (str_max <= 1)
1795 return 0;
1796 str_len = c_phrase_.sublma_start[c_phrase_.sublma_num];
1797 if (str_len > str_max - 1)
1798 str_len = str_max - 1;
1799 utf16_strncpy(str_buf, c_phrase_.chn_str, str_len);
1800 str_buf[str_len] = (char16)'\0';
1801 return str_len;
1802 }
1803
1804 return str_len;
1805 }
1806
get_lemma_splids(LemmaIdType id_lemma,uint16 * splids,uint16 splids_max,bool arg_valid)1807 uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
1808 uint16 splids_max, bool arg_valid) {
1809 uint16 splid_num = 0;
1810
1811 if (arg_valid) {
1812 for (splid_num = 0; splid_num < splids_max; splid_num++) {
1813 if (spl_trie_->is_half_id(splids[splid_num]))
1814 break;
1815 }
1816 if (splid_num == splids_max)
1817 return splid_num;
1818 }
1819
1820 if (is_system_lemma(id_lemma)) {
1821 splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max,
1822 arg_valid);
1823 } else if (is_user_lemma(id_lemma)) {
1824 if (NULL != user_dict_) {
1825 splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max,
1826 arg_valid);
1827 } else {
1828 splid_num = 0;
1829 }
1830 } else if (is_composing_lemma(id_lemma)) {
1831 if (c_phrase_.length > splids_max) {
1832 return 0;
1833 }
1834 for (uint16 pos = 0; pos < c_phrase_.length; pos++) {
1835 splids[pos] = c_phrase_.spl_ids[pos];
1836 if (spl_trie_->is_half_id(splids[pos])) {
1837 return 0;
1838 }
1839 }
1840 }
1841 return splid_num;
1842 }
1843
inner_predict(const char16 * fixed_buf,uint16 fixed_len,char16 predict_buf[][kMaxPredictSize+1],size_t buf_len)1844 size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len,
1845 char16 predict_buf[][kMaxPredictSize + 1],
1846 size_t buf_len) {
1847 size_t res_total = 0;
1848 memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_);
1849 // In order to shorten the comments, j-character candidates predicted by
1850 // i-character prefix are called P(i,j). All candiates predicted by
1851 // i-character prefix are called P(i,*)
1852 // Step 1. Get P(kMaxPredictSize, *) and sort them, here
1853 // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1)
1854 for (size_t len = fixed_len; len >0; len--) {
1855 // How many blank items are available
1856 size_t this_max = npre_items_len_ - res_total;
1857 size_t res_this;
1858 // If the history is longer than 1, and we can not get prediction from
1859 // lemmas longer than 2, in this case, we will add lemmas with
1860 // highest scores as the prediction result.
1861 if (fixed_len > 1 && 1 == len && 0 == res_total) {
1862 // Try to find if recent n (n>1) characters can be a valid lemma in system
1863 // dictionary.
1864 bool nearest_n_word = false;
1865 for (size_t nlen = 2; nlen <= fixed_len; nlen++) {
1866 if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) {
1867 nearest_n_word = true;
1868 break;
1869 }
1870 }
1871 res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0,
1872 npre_items_ + res_total,
1873 this_max, res_total);
1874 res_total += res_this;
1875 }
1876
1877 // How many blank items are available
1878 this_max = npre_items_len_ - res_total;
1879 res_this = 0;
1880 if (!kOnlyUserDictPredict) {
1881 res_this =
1882 dict_trie_->predict(fixed_buf + fixed_len - len, len,
1883 npre_items_ + res_total, this_max,
1884 res_total);
1885 }
1886
1887 if (NULL != user_dict_) {
1888 res_this = res_this +
1889 user_dict_->predict(fixed_buf + fixed_len - len, len,
1890 npre_items_ + res_total + res_this,
1891 this_max - res_this, res_total + res_this);
1892 }
1893
1894 if (kPredictLimitGt1) {
1895 myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem),
1896 cmp_npre_by_score);
1897
1898 if (len > 3) {
1899 if (res_this > kMaxPredictNumByGt3)
1900 res_this = kMaxPredictNumByGt3;
1901 } else if (3 == len) {
1902 if (res_this > kMaxPredictNumBy3)
1903 res_this = kMaxPredictNumBy3;
1904 } else if (2 == len) {
1905 if (res_this > kMaxPredictNumBy2)
1906 res_this = kMaxPredictNumBy2;
1907 }
1908 }
1909
1910 res_total += res_this;
1911 }
1912
1913 res_total = remove_duplicate_npre(npre_items_, res_total);
1914
1915 if (kPreferLongHistoryPredict) {
1916 myqsort(npre_items_, res_total, sizeof(NPredictItem),
1917 cmp_npre_by_hislen_score);
1918 } else {
1919 myqsort(npre_items_, res_total, sizeof(NPredictItem),
1920 cmp_npre_by_score);
1921 }
1922
1923 if (buf_len < res_total) {
1924 res_total = buf_len;
1925 }
1926
1927 if (kPrintDebug2) {
1928 printf("/////////////////Predicted Items Begin////////////////////>>\n");
1929 for (size_t i = 0; i < res_total; i++) {
1930 printf("---");
1931 for (size_t j = 0; j < kMaxPredictSize; j++) {
1932 printf("%d ", npre_items_[i].pre_hzs[j]);
1933 }
1934 printf("\n");
1935 }
1936 printf("<<///////////////Predicted Items End////////////////////////\n");
1937 }
1938
1939 for (size_t i = 0; i < res_total; i++) {
1940 utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs,
1941 kMaxPredictSize);
1942 predict_buf[i][kMaxPredictSize] = '\0';
1943 }
1944
1945 return res_total;
1946 }
1947
get_predicts(const char16 fixed_buf[],char16 predict_buf[][kMaxPredictSize+1],size_t buf_len)1948 size_t MatrixSearch::get_predicts(const char16 fixed_buf[],
1949 char16 predict_buf[][kMaxPredictSize + 1],
1950 size_t buf_len) {
1951 size_t fixed_len = utf16_strlen(fixed_buf);
1952 if (0 ==fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len)
1953 return 0;
1954
1955 return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len);
1956 }
1957
1958 } // namespace ime_pinyin
1959