• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <assert.h>
18 #include <math.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include "../include/lpicache.h"
22 #include "../include/matrixsearch.h"
23 #include "../include/mystdlib.h"
24 #include "../include/ngram.h"
25 #include "../include/userdict.h"
26 
27 namespace ime_pinyin {
28 
29 #define PRUMING_SCORE 8000.0
30 
MatrixSearch()31 MatrixSearch::MatrixSearch() {
32   inited_ = false;
33   spl_trie_ = SpellingTrie::get_cpinstance();
34 
35   reset_pointers_to_null();
36 
37   pys_decoded_len_ = 0;
38   mtrx_nd_pool_used_ = 0;
39   dmi_pool_used_ = 0;
40   xi_an_enabled_ = false;
41   dmi_c_phrase_ = false;
42 
43   assert(kMaxSearchSteps > 0);
44   max_sps_len_ = kMaxSearchSteps - 1;
45   max_hzs_len_ = kMaxSearchSteps;
46 }
47 
~MatrixSearch()48 MatrixSearch::~MatrixSearch() {
49   free_resource();
50 }
51 
reset_pointers_to_null()52 void MatrixSearch::reset_pointers_to_null() {
53   dict_trie_ = NULL;
54   user_dict_ = NULL;
55   spl_parser_ = NULL;
56 
57   share_buf_ = NULL;
58 
59   // The following four buffers are used for decoding, and they are based on
60   // share_buf_, no need to delete them.
61   mtrx_nd_pool_ = NULL;
62   dmi_pool_ = NULL;
63   matrix_ = NULL;
64   dep_ = NULL;
65 
66   // Based on share_buf_, no need to delete them.
67   npre_items_ = NULL;
68 }
69 
alloc_resource()70 bool MatrixSearch::alloc_resource() {
71   free_resource();
72 
73   dict_trie_ = new DictTrie();
74   user_dict_ = static_cast<AtomDictBase*>(new UserDict());
75   spl_parser_ = new SpellingParser();
76 
77   size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize;
78   mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t);
79   size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize;
80   dmi_size = align_to_size_t(dmi_size) / sizeof(size_t);
81   size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum;
82   matrix_size = align_to_size_t(matrix_size) / sizeof(size_t);
83   size_t dep_size = sizeof(DictExtPara);
84   dep_size = align_to_size_t(dep_size) / sizeof(size_t);
85 
86   // share_buf's size is determined by the buffers for search.
87   share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size];
88 
89   if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ ||
90       NULL == share_buf_)
91     return false;
92 
93   // The buffers for search are based on the share buffer
94   mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_);
95   dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size);
96   matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size);
97   dep_ = reinterpret_cast<DictExtPara*>
98       (share_buf_ + mtrx_nd_size + dmi_size + matrix_size);
99 
100   // The prediction buffer is also based on the share buffer.
101   npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_);
102   npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) *
103       sizeof(size_t) / sizeof(NPredictItem);
104   return true;
105 }
106 
free_resource()107 void MatrixSearch::free_resource() {
108   if (NULL != dict_trie_)
109     delete dict_trie_;
110 
111   if (NULL != user_dict_)
112     delete user_dict_;
113 
114   if (NULL != spl_parser_)
115     delete spl_parser_;
116 
117   if (NULL != share_buf_)
118     delete [] share_buf_;
119 
120   reset_pointers_to_null();
121 }
122 
init(const char * fn_sys_dict,const char * fn_usr_dict)123 bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) {
124   if (NULL == fn_sys_dict || NULL == fn_usr_dict)
125     return false;
126 
127   if (!alloc_resource())
128     return false;
129 
130   if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd))
131     return false;
132 
133   // If engine fails to load the user dictionary, reset the user dictionary
134   // to NULL.
135   if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
136     delete user_dict_;
137     user_dict_ = NULL;
138   } else{
139     user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
140   }
141 
142   reset_search0();
143 
144   inited_ = true;
145   return true;
146 }
147 
init_fd(int sys_fd,long start_offset,long length,const char * fn_usr_dict)148 bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length,
149                            const char *fn_usr_dict) {
150   if (NULL == fn_usr_dict)
151     return false;
152 
153   if (!alloc_resource())
154     return false;
155 
156   if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd))
157     return false;
158 
159   if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
160     delete user_dict_;
161     user_dict_ = NULL;
162   } else {
163     user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
164   }
165 
166   reset_search0();
167 
168   inited_ = true;
169   return true;
170 }
171 
set_max_lens(size_t max_sps_len,size_t max_hzs_len)172 void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) {
173   if (0 != max_sps_len)
174     max_sps_len_ = max_sps_len;
175   if (0 != max_hzs_len)
176     max_hzs_len_ = max_hzs_len;
177 }
178 
close()179 void MatrixSearch::close() {
180   flush_cache();
181   free_resource();
182   inited_ = false;
183 }
184 
flush_cache()185 void MatrixSearch::flush_cache() {
186   if (NULL != user_dict_)
187     user_dict_->flush_cache();
188 }
189 
set_xi_an_switch(bool xi_an_enabled)190 void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) {
191   xi_an_enabled_ = xi_an_enabled;
192 }
193 
get_xi_an_switch()194 bool MatrixSearch::get_xi_an_switch() {
195   return xi_an_enabled_;
196 }
197 
reset_search()198 bool MatrixSearch::reset_search() {
199   if (!inited_)
200     return false;
201   return reset_search0();
202 }
203 
reset_search0()204 bool MatrixSearch::reset_search0() {
205     if (!inited_)
206         return false;
207 
208     pys_decoded_len_ = 0;
209     mtrx_nd_pool_used_ = 0;
210     dmi_pool_used_ = 0;
211 
212     // Get a MatrixNode from the pool
213     matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_;
214     matrix_[0].mtrx_nd_num = 1;
215     mtrx_nd_pool_used_ += 1;
216 
217     // Update the node, and make it to be a starting node
218     MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos;
219     node->id = 0;
220     node->score = 0;
221     node->from = NULL;
222     node->step = 0;
223     node->dmi_fr = (PoolPosType)-1;
224 
225     matrix_[0].dmi_pos = 0;
226     matrix_[0].dmi_num = 0;
227     matrix_[0].dmi_has_full_id = 1;
228     matrix_[0].mtrx_nd_fixed = node;
229 
230     lma_start_[0] = 0;
231     fixed_lmas_ = 0;
232     spl_start_[0] = 0;
233     fixed_hzs_ = 0;
234 
235     dict_trie_->reset_milestones(0, 0);
236     if (NULL != user_dict_)
237       user_dict_->reset_milestones(0, 0);
238 
239     return true;
240 }
241 
reset_search(size_t ch_pos,bool clear_fixed_this_step,bool clear_dmi_this_step,bool clear_mtrx_this_step)242 bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step,
243                                 bool clear_dmi_this_step,
244                                 bool clear_mtrx_this_step) {
245   if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum)
246     return false;
247 
248   if (0 == ch_pos) {
249     reset_search0();
250   } else {
251     // Prepare mile stones of this step to clear.
252     MileStoneHandle *dict_handles_to_clear = NULL;
253     if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) {
254       dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles;
255     }
256 
257     // If there are more steps, and this step is not allowed to clear, find
258     // milestones of next step.
259     if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) {
260       dict_handles_to_clear = NULL;
261       if (matrix_[ch_pos + 1].dmi_num > 0) {
262         dict_handles_to_clear =
263             dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles;
264       }
265     }
266 
267     if (NULL != dict_handles_to_clear) {
268       dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]);
269       if (NULL != user_dict_)
270         user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]);
271     }
272 
273     pys_decoded_len_ = ch_pos;
274 
275     if (clear_dmi_this_step) {
276       dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos
277                        + matrix_[ch_pos - 1].dmi_num;
278       matrix_[ch_pos].dmi_num = 0;
279     } else {
280       dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num;
281     }
282 
283     if (clear_mtrx_this_step) {
284       mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos
285                            + matrix_[ch_pos - 1].mtrx_nd_num;
286       matrix_[ch_pos].mtrx_nd_num = 0;
287     } else {
288       mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos
289                            + matrix_[ch_pos].mtrx_nd_num;
290     }
291 
292     // Modify fixed_hzs_
293     if (fixed_hzs_ > 0 &&
294         ((kLemmaIdComposing != lma_id_[0]) ||
295          (kLemmaIdComposing == lma_id_[0] &&
296           spl_start_[c_phrase_.length] <= ch_pos))) {
297       size_t fixed_ch_pos = ch_pos;
298       if (clear_fixed_this_step)
299         fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0;
300       while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0)
301         fixed_ch_pos--;
302 
303       fixed_lmas_ = 0;
304       fixed_hzs_ = 0;
305       if (fixed_ch_pos > 0) {
306         while (spl_start_[fixed_hzs_] < fixed_ch_pos)
307           fixed_hzs_++;
308         assert(spl_start_[fixed_hzs_] == fixed_ch_pos);
309 
310         while (lma_start_[fixed_lmas_] < fixed_hzs_)
311           fixed_lmas_++;
312         assert(lma_start_[fixed_lmas_] == fixed_hzs_);
313       }
314 
315       // Re-search the Pinyin string for the unlocked lemma
316       // which was previously fixed.
317       //
318       // Prepare mile stones of this step to clear.
319       MileStoneHandle *dict_handles_to_clear = NULL;
320       if (clear_dmi_this_step && ch_pos == fixed_ch_pos &&
321           matrix_[fixed_ch_pos].dmi_num > 0) {
322         dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles;
323       }
324 
325       // If there are more steps, and this step is not allowed to clear, find
326       // milestones of next step.
327       if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) {
328         dict_handles_to_clear = NULL;
329         if (matrix_[fixed_ch_pos + 1].dmi_num > 0) {
330           dict_handles_to_clear =
331               dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles;
332         }
333       }
334 
335       if (NULL != dict_handles_to_clear) {
336         dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]);
337         if (NULL != user_dict_)
338           user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]);
339       }
340 
341 
342       pys_decoded_len_ = fixed_ch_pos;
343 
344       if (clear_dmi_this_step && ch_pos == fixed_ch_pos) {
345         dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos
346                          + matrix_[fixed_ch_pos - 1].dmi_num;
347         matrix_[fixed_ch_pos].dmi_num = 0;
348       } else {
349         dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos +
350             matrix_[fixed_ch_pos].dmi_num;
351       }
352 
353       if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) {
354         mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos
355                              + matrix_[fixed_ch_pos - 1].mtrx_nd_num;
356         matrix_[fixed_ch_pos].mtrx_nd_num = 0;
357       } else {
358         mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos
359                              + matrix_[fixed_ch_pos].mtrx_nd_num;
360       }
361 
362       for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) {
363         add_char(pys_[re_pos]);
364       }
365     } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) {
366       for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) {
367         uint16 splpos_begin = c_phrase_.sublma_start[subpos];
368         uint16 splpos_end = c_phrase_.sublma_start[subpos + 1];
369         for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) {
370           // If ch_pos is in this spelling
371           uint16 spl_start = c_phrase_.spl_start[splpos];
372           uint16 spl_end = c_phrase_.spl_start[splpos + 1];
373           if (ch_pos >= spl_start && ch_pos < spl_end) {
374             // Clear everything after this position
375             c_phrase_.chn_str[splpos] = static_cast<char16>('\0');
376             c_phrase_.sublma_start[subpos + 1] = splpos;
377             c_phrase_.sublma_num = subpos + 1;
378             c_phrase_.length = splpos;
379 
380             if (splpos == splpos_begin) {
381               c_phrase_.sublma_num = subpos;
382             }
383           }
384         }
385       }
386 
387       // Extend the composing phrase.
388       reset_search0();
389       dmi_c_phrase_ = true;
390       uint16 c_py_pos = 0;
391       while (c_py_pos < spl_start_[c_phrase_.length]) {
392         bool b_ac_tmp = add_char(pys_[c_py_pos]);
393         assert(b_ac_tmp);
394         c_py_pos++;
395       }
396       dmi_c_phrase_ = false;
397 
398       lma_id_num_ = 1;
399       fixed_lmas_ = 1;
400       fixed_lmas_no1_[0] = 0;  // A composing string is always modified.
401       fixed_hzs_ = c_phrase_.length;
402       lma_start_[1] = fixed_hzs_;
403       lma_id_[0] = kLemmaIdComposing;
404       matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
405           matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
406     }
407   }
408 
409   return true;
410 }
411 
del_in_pys(size_t start,size_t len)412 void MatrixSearch::del_in_pys(size_t start, size_t len) {
413   while (start < kMaxRowNum - len && '\0' != pys_[start]) {
414     pys_[start] = pys_[start + len];
415     start++;
416   }
417 }
418 
search(const char * py,size_t py_len)419 size_t MatrixSearch::search(const char *py, size_t py_len) {
420   if (!inited_ || NULL == py)
421     return 0;
422 
423   // If the search Pinyin string is too long, it will be truncated.
424   if (py_len > kMaxRowNum - 1)
425     py_len = kMaxRowNum - 1;
426 
427   // Compare the new string with the previous one. Find their prefix to
428   // increase search efficiency.
429   size_t ch_pos = 0;
430   for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) {
431     if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos])
432       break;
433   }
434 
435   bool clear_fix = true;
436   if (ch_pos == pys_decoded_len_)
437     clear_fix = false;
438 
439   reset_search(ch_pos, clear_fix, false, false);
440 
441   memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos);
442   pys_[py_len] = '\0';
443 
444   while ('\0' != pys_[ch_pos]) {
445     if (!add_char(py[ch_pos])) {
446       pys_decoded_len_ = ch_pos;
447       break;
448     }
449     ch_pos++;
450   }
451 
452   // Get spelling ids and starting positions.
453   get_spl_start_id();
454 
455   // If there are too many spellings, remove the last letter until the spelling
456   // number is acceptable.
457   while (spl_id_num_ > 9) {
458     py_len--;
459     reset_search(py_len, false, false, false);
460     pys_[py_len] = '\0';
461     get_spl_start_id();
462   }
463 
464   prepare_candidates();
465 
466   if (kPrintDebug0) {
467     printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_);
468     printf("--DMI Pool Used: %d\n", dmi_pool_used_);
469 
470     if (kPrintDebug1) {
471       for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) {
472         debug_print_dmi(pos, 1);
473       }
474     }
475   }
476 
477   return ch_pos;
478 }
479 
delsearch(size_t pos,bool is_pos_in_splid,bool clear_fixed_this_step)480 size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid,
481                                bool clear_fixed_this_step) {
482   if (!inited_)
483     return 0;
484 
485   size_t reset_pos = pos;
486 
487   // Out of range for both Pinyin mode and Spelling id mode.
488   if (pys_decoded_len_ <= pos) {
489     del_in_pys(pos, 1);
490 
491     reset_pos = pys_decoded_len_;
492     // Decode the string after the un-decoded position
493     while ('\0' != pys_[reset_pos]) {
494       if (!add_char(pys_[reset_pos])) {
495         pys_decoded_len_ = reset_pos;
496         break;
497       }
498       reset_pos++;
499     }
500     get_spl_start_id();
501     prepare_candidates();
502     return pys_decoded_len_;
503   }
504 
505   // Spelling id mode, but out of range.
506   if (is_pos_in_splid && spl_id_num_ <= pos)
507     return pys_decoded_len_;
508 
509   // Begin to handle two modes respectively.
510   // Pinyin mode by default
511   size_t c_py_len = 0;  // The length of composing phrase's Pinyin
512   size_t del_py_len = 1;
513   if (!is_pos_in_splid) {
514     // Pinyin mode is only allowed to delete beyond the fixed lemmas.
515     if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]])
516       return pys_decoded_len_;
517 
518     del_in_pys(pos, 1);
519 
520     // If the deleted character is just the one after the last fixed lemma
521     if (pos == spl_start_[lma_start_[fixed_lmas_]]) {
522       // If all fixed lemmas have been merged, and the caller of the function
523       // request to unlock the last fixed lemma.
524       if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) {
525         // Unlock the last sub lemma in the composing phrase. Because it is not
526         // easy to unlock it directly. Instead, we re-decode the modified
527         // composing phrase.
528         c_phrase_.sublma_num--;
529         c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num];
530         reset_pos = spl_start_[c_phrase_.length];
531         c_py_len = reset_pos;
532       }
533     }
534   } else {
535     del_py_len = spl_start_[pos + 1] - spl_start_[pos];
536 
537     del_in_pys(spl_start_[pos], del_py_len);
538 
539     if (pos >= lma_start_[fixed_lmas_]) {
540       c_py_len = 0;
541       reset_pos = spl_start_[pos + 1] - del_py_len;
542     } else {
543       c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len;
544       reset_pos = c_py_len;
545       if (c_py_len > 0)
546         merge_fixed_lmas(pos);
547     }
548   }
549 
550   if (c_py_len > 0) {
551     assert(c_phrase_.length > 0 && c_py_len ==
552         c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]);
553     // The composing phrase is valid, reset all search space,
554     // and begin a new search which will only extend the composing
555     // phrase.
556     reset_search0();
557 
558     dmi_c_phrase_ = true;
559     // Extend the composing phrase.
560     uint16 c_py_pos = 0;
561     while (c_py_pos < c_py_len) {
562       bool b_ac_tmp = add_char(pys_[c_py_pos]);
563       assert(b_ac_tmp);
564       c_py_pos++;
565     }
566     dmi_c_phrase_ = false;
567 
568     // Fixd the composing phrase as the first choice.
569     lma_id_num_ = 1;
570     fixed_lmas_ = 1;
571     fixed_lmas_no1_[0] = 0;  // A composing string is always modified.
572     fixed_hzs_ = c_phrase_.length;
573     lma_start_[1] = fixed_hzs_;
574     lma_id_[0] = kLemmaIdComposing;
575     matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
576         matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
577   } else {
578     // Reseting search only clear pys_decoded_len_, but the string is kept.
579     reset_search(reset_pos, clear_fixed_this_step, false, false);
580   }
581 
582   // Decode the string after the delete position.
583   while ('\0' != pys_[reset_pos]) {
584     if (!add_char(pys_[reset_pos])) {
585       pys_decoded_len_ = reset_pos;
586       break;
587     }
588     reset_pos++;
589   }
590 
591   get_spl_start_id();
592   prepare_candidates();
593   return pys_decoded_len_;
594 }
595 
get_candidate_num()596 size_t MatrixSearch::get_candidate_num() {
597   if (!inited_ || 0 == pys_decoded_len_ ||
598       0 == matrix_[pys_decoded_len_].mtrx_nd_num)
599     return 0;
600 
601   return 1 + lpi_total_;
602 }
603 
get_candidate(size_t cand_id,char16 * cand_str,size_t max_len)604 char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str,
605                                     size_t max_len) {
606   if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str)
607     return NULL;
608 
609   if (0 == cand_id) {
610     return get_candidate0(cand_str, max_len, NULL, false);
611   } else {
612     cand_id--;
613   }
614 
615   // For this case: the current sentence is a word only, and the user fixed it,
616   // so the result will be fixed to the sentence space, and
617   // lpi_total_ will be set to 0.
618   if (0 == lpi_total_) {
619     return get_candidate0(cand_str, max_len, NULL, false);
620   }
621 
622   LemmaIdType id = lpi_items_[cand_id].id;
623   char16 s[kMaxLemmaSize + 1];
624 
625   uint16 s_len = lpi_items_[cand_id].lma_len;
626   if (s_len > 1) {
627     s_len = get_lemma_str(id, s, kMaxLemmaSize + 1);
628   } else {
629     // For a single character, Hanzi is ready.
630     s[0] = lpi_items_[cand_id].hanzi;
631     s[1] = static_cast<char16>(0);
632   }
633 
634   if (s_len > 0 &&  max_len > s_len) {
635     utf16_strncpy(cand_str, s, s_len);
636     cand_str[s_len] = (char16)'\0';
637     return cand_str;
638   }
639 
640   return NULL;
641 }
642 
update_dict_freq()643 void MatrixSearch::update_dict_freq() {
644   if (NULL != user_dict_) {
645     // Update the total frequency of all lemmas, including system lemmas and
646     // user dictionary lemmas.
647     size_t total_freq = user_dict_->get_total_lemma_count();
648     dict_trie_->set_total_lemma_count_of_others(total_freq);
649   }
650 }
651 
add_lma_to_userdict(uint16 lma_fr,uint16 lma_to,float score)652 bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to,
653                                        float score) {
654   if (lma_to - lma_fr <= 1 || NULL == user_dict_)
655     return false;
656 
657   char16 word_str[kMaxLemmaSize + 1];
658   uint16 spl_ids[kMaxLemmaSize];
659 
660   uint16 spl_id_fr = 0;
661 
662   for (uint16 pos = lma_fr; pos < lma_to; pos++) {
663     LemmaIdType lma_id = lma_id_[pos];
664     if (is_user_lemma(lma_id)) {
665       user_dict_->update_lemma(lma_id, 1, true);
666     }
667     uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos];
668     utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len);
669 
670     uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr,
671                                kMaxLemmaSize + 1 - spl_id_fr);
672     assert(tmp == lma_len);
673 
674     tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true);
675     if (tmp != lma_len) {
676       return false;
677     }
678 
679     spl_id_fr += lma_len;
680   }
681 
682   assert(spl_id_fr <= kMaxLemmaSize);
683 
684   return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids,
685                                  spl_id_fr, 1);
686 }
687 
debug_print_dmi(PoolPosType dmi_pos,uint16 nest_level)688 void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) {
689   if (dmi_pos >= dmi_pool_used_) return;
690 
691   DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
692 
693   if (1 == nest_level) {
694     printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos);
695   }
696   if (dmi->dict_level > 1) {
697     debug_print_dmi(dmi->dmi_fr, nest_level + 1);
698   }
699   printf("---%d\n", dmi->dict_level);
700   printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]);
701   printf(" Spelling : %s, %d\n", SpellingTrie::get_instance().
702          get_spelling_str(dmi->spl_id), dmi->spl_id);
703   printf(" Total Pinyin Len: %d\n", dmi->splstr_len);
704   if (1 == nest_level) {
705     printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos);
706   }
707 }
708 
try_add_cand0_to_userdict()709 bool MatrixSearch::try_add_cand0_to_userdict() {
710   size_t new_cand_num = get_candidate_num();
711   if (fixed_hzs_ > 0 && 1 == new_cand_num) {
712     float score_from = 0;
713     uint16 lma_id_from = 0;
714     uint16 pos = 0;
715     bool modified = false;
716     while (pos < fixed_lmas_) {
717       if (lma_start_[pos + 1] - lma_start_[lma_id_from] >
718           static_cast<uint16>(kMaxLemmaSize)) {
719         float score_to_add =
720             mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
721             .mtrx_nd_pos].score - score_from;
722         if (modified) {
723           score_to_add += 1.0;
724           if (score_to_add > NGram::kMaxScore) {
725             score_to_add = NGram::kMaxScore;
726           }
727           add_lma_to_userdict(lma_id_from, pos, score_to_add);
728         }
729         lma_id_from = pos;
730         score_from += score_to_add;
731 
732         // Clear the flag for next user lemma.
733         modified = false;
734       }
735 
736       if (0 == fixed_lmas_no1_[pos]) {
737         modified = true;
738       }
739       pos++;
740     }
741 
742     // Single-char word is not allowed to add to userdict.
743     if (lma_start_[pos] - lma_start_[lma_id_from] > 1) {
744       float score_to_add =
745           mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
746           .mtrx_nd_pos].score - score_from;
747       if (modified) {
748         score_to_add += 1.0;
749         if (score_to_add > NGram::kMaxScore) {
750           score_to_add = NGram::kMaxScore;
751         }
752         add_lma_to_userdict(lma_id_from, pos, score_to_add);
753       }
754     }
755   }
756   return true;
757 }
758 
759 // Choose a candidate, and give new candidates for next step.
760 // If user finishes selection, we will try to communicate with user dictionary
761 // to add new items or update score of some existing items.
762 //
763 // Basic rule:
764 // 1. If user selects the first choice:
765 //    1.1. If the first choice is not a sentence, instead, it is a lemma:
766 //         1.1.1. If the first choice is a user lemma, notify the user
767 //                dictionary that a user lemma is hit, and add occuring count
768 //                by 1.
769 //         1.1.2. If the first choice is a system lemma, do nothing.
770 //    1.2. If the first choice is a sentence containing more than one lemma:
771 //         1.2.1. The whole sentence will be added as a user lemma. If the
772 //                sentence contains user lemmas, -> hit, and add occuring count
773 //                by 1.
choose(size_t cand_id)774 size_t MatrixSearch::choose(size_t cand_id) {
775   if (!inited_ || 0 == pys_decoded_len_)
776     return 0;
777 
778   if (0 == cand_id) {
779     fixed_hzs_ = spl_id_num_;
780     matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
781         matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
782     for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) {
783       fixed_lmas_no1_[pos] = 1;
784     }
785     fixed_lmas_ = lma_id_num_;
786     lpi_total_ = 0;  // Clean all other candidates.
787 
788     // 1. It is the first choice
789     if (1 == lma_id_num_) {
790       // 1.1. The first choice is not a sentence but a lemma
791       if (is_user_lemma(lma_id_[0])) {
792         // 1.1.1. The first choice is a user lemma, notify the user dictionary
793         // that it is hit.
794         if (NULL != user_dict_)
795           user_dict_->update_lemma(lma_id_[0], 1, true);
796       } else {
797         // 1.1.2. do thing for a system lemma.
798       }
799     } else {
800       // 1.2. The first choice is a sentence.
801       // 1.2.1 Try to add the whole sentence to user dictionary, the whole
802       // sentence may be splitted into many items.
803       if (NULL != user_dict_) {
804         try_add_cand0_to_userdict();
805       }
806     }
807     update_dict_freq();
808     return 1;
809   } else {
810     cand_id--;
811   }
812 
813   // 2. It is not the full sentence candidate.
814   // Find the length of the candidate.
815   LemmaIdType id_chosen = lpi_items_[cand_id].id;
816   LmaScoreType score_chosen = lpi_items_[cand_id].psb;
817   size_t cand_len = lpi_items_[cand_id].lma_len;
818 
819   assert(cand_len > 0);
820 
821   // Notify the atom dictionary that this item is hit.
822   if (is_user_lemma(id_chosen)) {
823     if (NULL != user_dict_) {
824       user_dict_->update_lemma(id_chosen, 1, true);
825     }
826     update_dict_freq();
827   }
828 
829   // 3. Fixed the chosen item.
830   // 3.1 Get the steps number.
831   size_t step_fr = spl_start_[fixed_hzs_];
832   size_t step_to = spl_start_[fixed_hzs_ + cand_len];
833 
834   // 3.2 Save the length of the original string.
835   size_t pys_decoded_len = pys_decoded_len_;
836 
837   // 3.2 Reset the space of the fixed part.
838   reset_search(step_to, false, false, true);
839 
840   // 3.3 For the last character of the fixed part, the previous DMI
841   // information will be kept, while the MTRX information will be re-extended,
842   // and only one node will be extended.
843   matrix_[step_to].mtrx_nd_num = 0;
844 
845   LmaPsbItem lpi_item;
846   lpi_item.psb = score_chosen;
847   lpi_item.id = id_chosen;
848 
849   PoolPosType step_to_dmi_fr = match_dmi(step_to,
850                                          spl_id_ + fixed_hzs_, cand_len);
851   assert(step_to_dmi_fr != static_cast<PoolPosType>(-1));
852 
853   extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1,
854                  step_to_dmi_fr, step_to);
855 
856   matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos;
857   mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos +
858                        matrix_[step_to].mtrx_nd_num;
859 
860   if (id_chosen == lma_id_[fixed_lmas_])
861     fixed_lmas_no1_[fixed_lmas_] = 1;
862   else
863     fixed_lmas_no1_[fixed_lmas_] = 0;
864   lma_id_[fixed_lmas_] = id_chosen;
865   lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len;
866   fixed_lmas_++;
867   fixed_hzs_ = fixed_hzs_ + cand_len;
868 
869   while (step_to != pys_decoded_len) {
870     bool b = add_char(pys_[step_to]);
871     assert(b);
872     step_to++;
873   }
874 
875   if (fixed_hzs_ < spl_id_num_) {
876     prepare_candidates();
877   } else {
878     lpi_total_ = 0;
879     if (NULL != user_dict_) {
880       try_add_cand0_to_userdict();
881     }
882   }
883 
884   return get_candidate_num();
885 }
886 
cancel_last_choice()887 size_t MatrixSearch::cancel_last_choice() {
888   if (!inited_ || 0 == pys_decoded_len_)
889     return 0;
890 
891   size_t step_start = 0;
892   if (fixed_hzs_ > 0) {
893     size_t step_end = spl_start_[fixed_hzs_];
894     MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed;
895     assert(NULL != end_node);
896 
897     step_start = end_node->from->step;
898 
899     if (step_start > 0) {
900       DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr;
901       fixed_hzs_ -= dmi->dict_level;
902     } else {
903       fixed_hzs_ = 0;
904     }
905 
906     reset_search(step_start, false, false, false);
907 
908     while (pys_[step_start] != '\0') {
909       bool b = add_char(pys_[step_start]);
910       assert(b);
911       step_start++;
912     }
913 
914     prepare_candidates();
915   }
916   return get_candidate_num();
917 }
918 
get_fixedlen()919 size_t MatrixSearch::get_fixedlen() {
920   if (!inited_ || 0 == pys_decoded_len_)
921     return 0;
922   return fixed_hzs_;
923 }
924 
prepare_add_char(char ch)925 bool MatrixSearch::prepare_add_char(char ch) {
926   if (pys_decoded_len_ >= kMaxRowNum - 1 ||
927       (!spl_parser_->is_valid_to_parse(ch) && ch != '\''))
928     return false;
929 
930   if (dmi_pool_used_ >= kDmiPoolSize) return false;
931 
932   pys_[pys_decoded_len_] = ch;
933   pys_decoded_len_++;
934 
935   MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_;
936   mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_;
937   mtrx_this_row->mtrx_nd_num = 0;
938   mtrx_this_row->dmi_pos = dmi_pool_used_;
939   mtrx_this_row->dmi_num = 0;
940   mtrx_this_row->dmi_has_full_id = 0;
941 
942   return true;
943 }
944 
is_split_at(uint16 pos)945 bool MatrixSearch::is_split_at(uint16 pos) {
946   return !spl_parser_->is_valid_to_parse(pys_[pos - 1]);
947 }
948 
fill_dmi(DictMatchInfo * dmi,MileStoneHandle * handles,PoolPosType dmi_fr,uint16 spl_id,uint16 node_num,unsigned char dict_level,bool splid_end_split,unsigned char splstr_len,unsigned char all_full_id)949 void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
950                             PoolPosType dmi_fr, uint16 spl_id,
951                             uint16 node_num, unsigned char dict_level,
952                             bool splid_end_split, unsigned char splstr_len,
953                             unsigned char all_full_id) {
954   dmi->dict_handles[0] = handles[0];
955   dmi->dict_handles[1] = handles[1];
956   dmi->dmi_fr = dmi_fr;
957   dmi->spl_id = spl_id;
958   dmi->dict_level = dict_level;
959   dmi->splid_end_split = splid_end_split ? 1 : 0;
960   dmi->splstr_len = splstr_len;
961   dmi->all_full_id = all_full_id;
962   dmi->c_phrase = 0;
963 }
964 
add_char(char ch)965 bool MatrixSearch::add_char(char ch) {
966   if (!prepare_add_char(ch))
967     return false;
968   return add_char_qwerty();
969 }
970 
add_char_qwerty()971 bool MatrixSearch::add_char_qwerty() {
972   matrix_[pys_decoded_len_].mtrx_nd_num = 0;
973 
974   bool spl_matched = false;
975   uint16 longest_ext = 0;
976   // Extend the search matrix, from the oldest unfixed row. ext_len means
977   // extending length.
978   for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) {
979     if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_])
980       continue;
981 
982     // Refer to the declaration of the variable dmi_has_full_id for the
983     // explanation of this piece of code. In one word, it is used to prevent
984     // from the unwise extending of "shoud ou" but allow the reasonable
985     // extending of "heng ao", "lang a", etc.
986     if (ext_len > 1 && 0 != longest_ext &&
987         0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) {
988       if (xi_an_enabled_)
989         continue;
990       else
991         break;
992     }
993 
994     uint16 oldrow = pys_decoded_len_ - ext_len;
995 
996     // 0. If that row is before the last fixed step, ignore.
997     if (spl_start_[fixed_hzs_] > oldrow)
998       continue;
999 
1000     // 1. Check if that old row has valid MatrixNode. If no, means that row is
1001     // not a boundary, either a word boundary or a spelling boundary.
1002     // If it is for extending composing phrase, it's OK to ignore the 0.
1003     if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_)
1004       continue;
1005 
1006     // 2. Get spelling id(s) for the last ext_len chars.
1007     uint16 spl_idx;
1008     bool is_pre = false;
1009     spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow,
1010                                             ext_len, &is_pre);
1011     if (is_pre)
1012       spl_matched = true;
1013 
1014     if (0 == spl_idx)
1015       continue;
1016 
1017     bool splid_end_split = is_split_at(oldrow + ext_len);
1018 
1019     // 3. Extend the DMI nodes of that old row
1020     // + 1 is to extend an extra node from the root
1021     for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos;
1022          dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1;
1023          dmi_pos++) {
1024       DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
1025       if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) {
1026         dmi = NULL;  // The last one, NULL means extending from the root.
1027       } else {
1028         // If the dmi is covered by the fixed arrange, ignore it.
1029         if (fixed_hzs_ > 0 &&
1030             pys_decoded_len_ - ext_len - dmi->splstr_len <
1031             spl_start_[fixed_hzs_]) {
1032           continue;
1033         }
1034         // If it is not in mode for composing phrase, and the source DMI node
1035         // is marked for composing phrase, ignore this node.
1036         if (dmi->c_phrase != 0 && !dmi_c_phrase_) {
1037           continue;
1038         }
1039       }
1040 
1041       // For example, if "gao" is extended, "g ao" is not allowed.
1042       // or "zh" has been passed, "z h" is not allowed.
1043       // Both word and word-connection will be prevented.
1044       if (longest_ext > ext_len) {
1045         if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) {
1046           continue;
1047         }
1048 
1049         // "z h" is not allowed.
1050         if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) {
1051           continue;
1052         }
1053       }
1054 
1055       dep_->splids_extended = 0;
1056       if (NULL != dmi) {
1057         uint16 prev_ids_num = dmi->dict_level;
1058         if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) ||
1059             (dmi_c_phrase_ && prev_ids_num >=  kMaxRowNum)) {
1060           continue;
1061         }
1062 
1063         DictMatchInfo *d = dmi;
1064         while (d) {
1065           dep_->splids[--prev_ids_num] = d->spl_id;
1066           if ((PoolPosType)-1 == d->dmi_fr)
1067             break;
1068           d = dmi_pool_ + d->dmi_fr;
1069         }
1070         assert(0 == prev_ids_num);
1071         dep_->splids_extended = dmi->dict_level;
1072       }
1073       dep_->splids[dep_->splids_extended] = spl_idx;
1074       dep_->ext_len = ext_len;
1075       dep_->splid_end_split = splid_end_split;
1076 
1077       dep_->id_num = 1;
1078       dep_->id_start = spl_idx;
1079       if (spl_trie_->is_half_id(spl_idx)) {
1080         // Get the full id list
1081         dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start));
1082         assert(dep_->id_num > 0);
1083       }
1084 
1085       uint16 new_dmi_num;
1086 
1087       new_dmi_num = extend_dmi(dep_, dmi);
1088 
1089       if (new_dmi_num > 0) {
1090         if (dmi_c_phrase_) {
1091           dmi_pool_[dmi_pool_used_].c_phrase = 1;
1092         }
1093         matrix_[pys_decoded_len_].dmi_num += new_dmi_num;
1094         dmi_pool_used_ += new_dmi_num;
1095 
1096         if (!spl_trie_->is_half_id(spl_idx))
1097           matrix_[pys_decoded_len_].dmi_has_full_id = 1;
1098       }
1099 
1100       // If get candiate lemmas, try to extend the path
1101       if (lpi_total_ > 0) {
1102         uint16 fr_row;
1103         if (NULL == dmi) {
1104           fr_row = oldrow;
1105         } else {
1106           assert(oldrow >= dmi->splstr_len);
1107           fr_row = oldrow - dmi->splstr_len;
1108         }
1109         for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos;
1110              mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos +
1111              matrix_[fr_row].mtrx_nd_num;
1112              mtrx_nd_pos++) {
1113           MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos;
1114 
1115           extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_,
1116                          dmi_pool_used_ - new_dmi_num, pys_decoded_len_);
1117           if (longest_ext == 0)
1118             longest_ext = ext_len;
1119         }
1120       }
1121     }  // for dmi_pos
1122   }  // for ext_len
1123   mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num;
1124 
1125   if (dmi_c_phrase_)
1126     return true;
1127 
1128   return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched);
1129 }
1130 
prepare_candidates()1131 void MatrixSearch::prepare_candidates() {
1132   // Get candiates from the first un-fixed step.
1133   uint16 lma_size_max = kMaxLemmaSize;
1134   if (lma_size_max > spl_id_num_ - fixed_hzs_)
1135     lma_size_max = spl_id_num_ - fixed_hzs_;
1136 
1137   uint16 lma_size = lma_size_max;
1138 
1139   // If the full sentense candidate's unfixed part may be the same with a normal
1140   // lemma. Remove the lemma candidate in this case.
1141   char16 fullsent[kMaxLemmaSize + 1];
1142   char16 *pfullsent = NULL;
1143   uint16 sent_len;
1144   pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true);
1145 
1146   // If the unfixed part contains more than one ids, it is not necessary to
1147   // check whether a lemma's string is the same to the unfixed part of the full
1148   // sentence candidate, so, set it to NULL;
1149   if (sent_len > kMaxLemmaSize)
1150     pfullsent = NULL;
1151 
1152   lpi_total_ = 0;
1153   size_t lpi_num_full_match = 0;  // Number of items which are fully-matched.
1154   while (lma_size > 0) {
1155     size_t lma_num;
1156     lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size,
1157                        lpi_items_ + lpi_total_,
1158                        size_t(kMaxLmaPsbItems - lpi_total_),
1159                        pfullsent, lma_size == lma_size_max);
1160 
1161     if (lma_num > 0) {
1162       lpi_total_ += lma_num;
1163       // For next lemma candidates which are not the longest, it is not
1164       // necessary to compare with the full sentence candiate.
1165       pfullsent = NULL;
1166     }
1167     if (lma_size == lma_size_max) {
1168       lpi_num_full_match = lpi_total_;
1169     }
1170     lma_size--;
1171   }
1172 
1173   // Sort those partially-matched items by their unified scores.
1174   myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match,
1175           sizeof(LmaPsbItem), cmp_lpi_with_unified_psb);
1176 
1177   if (kPrintDebug0) {
1178     printf("-----Prepare candidates, score:\n");
1179     for (size_t a = 0; a < lpi_total_; a++) {
1180       printf("[%03d]%d    ", a, lpi_items_[a].psb);
1181       if ((a + 1) % 6 == 0) printf("\n");
1182     }
1183     printf("\n");
1184   }
1185 
1186   if (kPrintDebug0) {
1187     printf("--- lpi_total_ = %d\n", lpi_total_);
1188   }
1189 }
1190 
get_pystr(size_t * decoded_len)1191 const char* MatrixSearch::get_pystr(size_t *decoded_len) {
1192   if (!inited_ || NULL == decoded_len)
1193     return NULL;
1194 
1195   *decoded_len = pys_decoded_len_;
1196   return pys_;
1197 }
1198 
merge_fixed_lmas(size_t del_spl_pos)1199 void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) {
1200   if (fixed_lmas_ == 0)
1201     return;
1202   // Update spelling segmentation information first.
1203   spl_id_num_ -= 1;
1204   uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos];
1205   for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) {
1206     spl_start_[pos] = spl_start_[pos + 1] - del_py_len;
1207     if (pos == spl_id_num_)
1208       break;
1209     spl_id_[pos] = spl_id_[pos + 1];
1210   }
1211 
1212   // Begin to merge.
1213   uint16 phrase_len = 0;
1214 
1215   // Update the spelling ids to the composing phrase.
1216   // We need to convert these ids into full id in the future.
1217   memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16));
1218   memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16));
1219 
1220   // If composing phrase has not been created, first merge all fixed
1221   //  lemmas into a composing phrase without deletion.
1222   if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) {
1223     uint16 bp = 1;  // Begin position of real fixed lemmas.
1224     // There is no existing composing phrase.
1225     if (kLemmaIdComposing != lma_id_[0]) {
1226       c_phrase_.sublma_num = 0;
1227       bp = 0;
1228     }
1229 
1230     uint16 sub_num = c_phrase_.sublma_num;
1231     for (uint16 pos = bp; pos <= fixed_lmas_; pos++) {
1232       c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos];
1233       if (lma_start_[pos] > del_spl_pos) {
1234         c_phrase_.sublma_start[sub_num + pos - bp] -= 1;
1235       }
1236 
1237       if (pos == fixed_lmas_)
1238         break;
1239 
1240       uint16 lma_len;
1241       char16 *lma_str = c_phrase_.chn_str +
1242           c_phrase_.sublma_start[sub_num] + phrase_len;
1243 
1244       lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len);
1245       assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]);
1246       phrase_len += lma_len;
1247     }
1248     assert(phrase_len == lma_start_[fixed_lmas_]);
1249     c_phrase_.length = phrase_len;  // will be deleted by 1
1250     c_phrase_.sublma_num += fixed_lmas_ - bp;
1251   } else {
1252     for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) {
1253       if (c_phrase_.sublma_start[pos] > del_spl_pos) {
1254         c_phrase_.sublma_start[pos] -= 1;
1255       }
1256     }
1257     phrase_len = c_phrase_.length;
1258   }
1259 
1260   assert(phrase_len > 0);
1261   if (1 == phrase_len) {
1262     // After the only one is deleted, nothing will be left.
1263     fixed_lmas_ = 0;
1264     return;
1265   }
1266 
1267   // Delete the Chinese character in the merged phrase.
1268   // The corresponding elements in spl_ids and spl_start of the
1269   // phrase have been deleted.
1270   char16 *chn_str = c_phrase_.chn_str + del_spl_pos;
1271   for (uint16 pos = 0;
1272       pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos;
1273       pos++) {
1274     chn_str[pos] = chn_str[pos + 1];
1275   }
1276   c_phrase_.length -= 1;
1277 
1278   // If the deleted spelling id is in a sub lemma which contains more than
1279   // one id, del_a_sub will be false; but if the deleted id is in a sub lemma
1280   // which only contains 1 id, the whole sub lemma needs to be deleted, so
1281   // del_a_sub will be true.
1282   bool del_a_sub = false;
1283   for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) {
1284     if (c_phrase_.sublma_start[pos - 1] ==
1285         c_phrase_.sublma_start[pos]) {
1286       del_a_sub = true;
1287     }
1288     if (del_a_sub) {
1289       c_phrase_.sublma_start[pos - 1] =
1290           c_phrase_.sublma_start[pos];
1291     }
1292   }
1293   if (del_a_sub)
1294     c_phrase_.sublma_num -= 1;
1295 
1296   return;
1297 }
1298 
get_spl_start_id()1299 void MatrixSearch::get_spl_start_id() {
1300   lma_id_num_ = 0;
1301   lma_start_[0] = 0;
1302 
1303   spl_id_num_ = 0;
1304   spl_start_[0] = 0;
1305   if (!inited_ || 0 == pys_decoded_len_ ||
1306       0 == matrix_[pys_decoded_len_].mtrx_nd_num)
1307     return;
1308 
1309   // Calculate number of lemmas and spellings
1310   // Only scan those part which is not fixed.
1311   lma_id_num_ = fixed_lmas_;
1312   spl_id_num_ = fixed_hzs_;
1313 
1314   MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
1315   while (mtrx_nd != mtrx_nd_pool_) {
1316     if (fixed_hzs_ > 0) {
1317       if (mtrx_nd->step <= spl_start_[fixed_hzs_])
1318         break;
1319     }
1320 
1321     // Update the spelling segamentation information
1322     unsigned char word_splstr_len = 0;
1323     PoolPosType dmi_fr = mtrx_nd->dmi_fr;
1324     if ((PoolPosType)-1 != dmi_fr)
1325       word_splstr_len = dmi_pool_[dmi_fr].splstr_len;
1326 
1327     while ((PoolPosType)-1 != dmi_fr) {
1328       spl_start_[spl_id_num_ + 1] = mtrx_nd->step -
1329           (word_splstr_len - dmi_pool_[dmi_fr].splstr_len);
1330       spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id;
1331       spl_id_num_++;
1332       dmi_fr = dmi_pool_[dmi_fr].dmi_fr;
1333     }
1334 
1335     // Update the lemma segmentation information
1336     lma_start_[lma_id_num_ + 1] = spl_id_num_;
1337     lma_id_[lma_id_num_] = mtrx_nd->id;
1338     lma_id_num_++;
1339 
1340     mtrx_nd = mtrx_nd->from;
1341   }
1342 
1343   // Reverse the result of spelling info
1344   for (size_t pos = fixed_hzs_;
1345        pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) {
1346     if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) {
1347       spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
1348       spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1];
1349       spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
1350 
1351       spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1];
1352       spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos];
1353       spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1];
1354     }
1355   }
1356 
1357   // Reverse the result of lemma info
1358   for (size_t pos = fixed_lmas_;
1359        pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) {
1360     assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos);
1361 
1362     if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) {
1363       lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
1364       lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1];
1365       lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
1366 
1367       lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
1368       lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos];
1369       lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
1370     }
1371   }
1372 
1373   for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) {
1374     if (pos < lma_id_num_)
1375       lma_start_[pos] = lma_start_[pos - 1] +
1376           (lma_start_[pos] - lma_start_[pos + 1]);
1377     else
1378       lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] -
1379           lma_start_[fixed_lmas_];
1380   }
1381 
1382   // Find the last fixed position
1383   fixed_hzs_ = 0;
1384   for (size_t pos = spl_id_num_; pos > 0; pos--) {
1385     if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) {
1386       fixed_hzs_ = pos;
1387       break;
1388     }
1389   }
1390 
1391   return;
1392 }
1393 
get_spl_start(const uint16 * & spl_start)1394 size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) {
1395   get_spl_start_id();
1396   spl_start = spl_start_;
1397   return spl_id_num_;
1398 }
1399 
extend_dmi(DictExtPara * dep,DictMatchInfo * dmi_s)1400 size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) {
1401   if (dmi_pool_used_ >= kDmiPoolSize) return 0;
1402 
1403   if (dmi_c_phrase_)
1404     return extend_dmi_c(dep, dmi_s);
1405 
1406   LpiCache& lpi_cache = LpiCache::get_instance();
1407   uint16 splid = dep->splids[dep->splids_extended];
1408 
1409   bool cached = false;
1410   if (0 == dep->splids_extended)
1411     cached = lpi_cache.is_cached(splid);
1412 
1413   // 1. If this is a half Id, get its corresponding full starting Id and
1414   // number of full Id.
1415   size_t ret_val = 0;
1416   PoolPosType mtrx_dmi_fr = (PoolPosType)-1;  // From which dmi node
1417 
1418   lpi_total_ = 0;
1419 
1420   MileStoneHandle from_h[3];
1421   from_h[0] = 0;
1422   from_h[1] = 0;
1423 
1424   if (0 != dep->splids_extended) {
1425     from_h[0] = dmi_s->dict_handles[0];
1426     from_h[1] = dmi_s->dict_handles[1];
1427   }
1428 
1429   // 2. Begin exgtending in the system dictionary
1430   size_t lpi_num = 0;
1431   MileStoneHandle handles[2];
1432   handles[0] = handles[1] = 0;
1433   if (from_h[0] > 0 || NULL == dmi_s) {
1434     handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_,
1435                                          kMaxLmaPsbItems, &lpi_num);
1436   }
1437   if (handles[0] > 0)
1438     lpi_total_ = lpi_num;
1439 
1440   if (NULL == dmi_s) {  // from root
1441     assert(0 != handles[0]);
1442     mtrx_dmi_fr = dmi_pool_used_;
1443   }
1444 
1445   // 3. Begin extending in the user dictionary
1446   if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) {
1447     handles[1] = user_dict_->extend_dict(from_h[1], dep,
1448                                          lpi_items_ + lpi_total_,
1449                                          kMaxLmaPsbItems - lpi_total_,
1450                                          &lpi_num);
1451     if (handles[1] > 0) {
1452       if (kPrintDebug0) {
1453         for (size_t t = 0; t < lpi_num; t++) {
1454           printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id,
1455                  lpi_items_[lpi_total_ + t].psb);
1456         }
1457       }
1458       lpi_total_ += lpi_num;
1459     }
1460   }
1461 
1462   if (0 != handles[0] || 0 != handles[1]) {
1463     if (dmi_pool_used_ >= kDmiPoolSize) return 0;
1464 
1465     DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
1466     if (NULL == dmi_s) {
1467       fill_dmi(dmi_add, handles,
1468                (PoolPosType)-1, splid,
1469                1, 1, dep->splid_end_split, dep->ext_len,
1470                spl_trie_->is_half_id(splid) ? 0 : 1);
1471     } else {
1472       fill_dmi(dmi_add, handles,
1473                dmi_s - dmi_pool_, splid, 1,
1474                dmi_s->dict_level + 1, dep->splid_end_split,
1475                dmi_s->splstr_len + dep->ext_len,
1476                spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
1477     }
1478 
1479     ret_val = 1;
1480   }
1481 
1482   if (!cached) {
1483     if (0 == lpi_total_)
1484       return ret_val;
1485 
1486     if (kPrintDebug0) {
1487       printf("--- lpi_total_ = %d\n", lpi_total_);
1488     }
1489 
1490     myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb);
1491     if (NULL == dmi_s && spl_trie_->is_half_id(splid))
1492       lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_);
1493   } else {
1494     assert(spl_trie_->is_half_id(splid));
1495     lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems);
1496   }
1497 
1498   return ret_val;
1499 }
1500 
extend_dmi_c(DictExtPara * dep,DictMatchInfo * dmi_s)1501 size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) {
1502   lpi_total_ = 0;
1503 
1504   uint16 pos = dep->splids_extended;
1505   assert(dmi_c_phrase_);
1506   if (pos >= c_phrase_.length)
1507     return 0;
1508 
1509   uint16 splid = dep->splids[pos];
1510   if (splid == c_phrase_.spl_ids[pos]) {
1511     DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
1512     MileStoneHandle handles[2];  // Actually never used.
1513     if (NULL == dmi_s)
1514       fill_dmi(dmi_add, handles,
1515                (PoolPosType)-1, splid,
1516                1, 1, dep->splid_end_split, dep->ext_len,
1517                spl_trie_->is_half_id(splid) ? 0 : 1);
1518     else
1519       fill_dmi(dmi_add, handles,
1520                dmi_s - dmi_pool_, splid, 1,
1521                dmi_s->dict_level + 1, dep->splid_end_split,
1522                dmi_s->splstr_len + dep->ext_len,
1523                spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
1524 
1525     if (pos == c_phrase_.length - 1) {
1526       lpi_items_[0].id = kLemmaIdComposing;
1527       lpi_items_[0].psb = 0;  // 0 is bigger than normal lemma score.
1528       lpi_total_ = 1;
1529     }
1530     return 1;
1531   }
1532   return 0;
1533 }
1534 
extend_mtrx_nd(MatrixNode * mtrx_nd,LmaPsbItem lpi_items[],size_t lpi_num,PoolPosType dmi_fr,size_t res_row)1535 size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
1536                                     size_t lpi_num, PoolPosType dmi_fr,
1537                                     size_t res_row) {
1538   assert(NULL != mtrx_nd);
1539   matrix_[res_row].mtrx_nd_fixed = NULL;
1540 
1541   if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow)
1542     return 0;
1543 
1544   if (0 == mtrx_nd->step) {
1545     // Because the list is sorted, if the source step is 0, it is only
1546     // necessary to pick up the first kMaxNodeARow items.
1547     if (lpi_num > kMaxNodeARow)
1548       lpi_num = kMaxNodeARow;
1549   }
1550 
1551   MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos;
1552   for (size_t pos = 0; pos < lpi_num; pos++) {
1553     float score = mtrx_nd->score + lpi_items[pos].psb;
1554     if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score)
1555       break;
1556 
1557     // Try to add a new node
1558     size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num;
1559     MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num;
1560     bool replace = false;
1561     // Find its position
1562     while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) {
1563       if (static_cast<size_t>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow)
1564         *mtrx_nd_res = *(mtrx_nd_res - 1);
1565       mtrx_nd_res--;
1566       replace = true;
1567     }
1568     if (replace || (mtrx_nd_num < kMaxNodeARow &&
1569         matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) {
1570       mtrx_nd_res->id = lpi_items[pos].id;
1571       mtrx_nd_res->score = score;
1572       mtrx_nd_res->from = mtrx_nd;
1573       mtrx_nd_res->dmi_fr = dmi_fr;
1574       mtrx_nd_res->step = res_row;
1575       if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow)
1576         matrix_[res_row].mtrx_nd_num++;
1577     }
1578   }
1579   return matrix_[res_row].mtrx_nd_num;
1580 }
1581 
match_dmi(size_t step_to,uint16 spl_ids[],uint16 spl_id_num)1582 PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[],
1583                                     uint16 spl_id_num) {
1584   if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) {
1585     return static_cast<PoolPosType>(-1);
1586   }
1587 
1588   for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) {
1589     DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos;
1590 
1591     if (dmi->dict_level != spl_id_num)
1592       continue;
1593 
1594     bool matched = true;
1595     for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) {
1596       if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) {
1597         matched = false;
1598         break;
1599       }
1600 
1601       dmi = dmi_pool_ + dmi->dmi_fr;
1602     }
1603     if (matched) {
1604       return matrix_[step_to].dmi_pos + dmi_pos;
1605     }
1606   }
1607 
1608   return static_cast<PoolPosType>(-1);
1609 }
1610 
get_candidate0(char16 * cand_str,size_t max_len,uint16 * retstr_len,bool only_unfixed)1611 char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len,
1612                                      uint16 *retstr_len,
1613                                      bool only_unfixed) {
1614   if (pys_decoded_len_ == 0 ||
1615       matrix_[pys_decoded_len_].mtrx_nd_num == 0)
1616     return NULL;
1617 
1618   LemmaIdType idxs[kMaxRowNum];
1619   size_t id_num = 0;
1620 
1621   MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
1622 
1623   if (kPrintDebug0) {
1624     printf("--- sentence score: %f\n", mtrx_nd->score);
1625   }
1626 
1627   if (kPrintDebug1) {
1628     printf("==============Sentence DMI (reverse order) begin===========>>\n");
1629   }
1630 
1631   while (mtrx_nd != NULL) {
1632     idxs[id_num] = mtrx_nd->id;
1633     id_num++;
1634 
1635     if (kPrintDebug1) {
1636        printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n",
1637               mtrx_nd->step, mtrx_nd->id, mtrx_nd->score);
1638        debug_print_dmi(mtrx_nd->dmi_fr, 1);
1639     }
1640 
1641     mtrx_nd = mtrx_nd->from;
1642   }
1643 
1644   if (kPrintDebug1) {
1645     printf("<<==============Sentence DMI (reverse order) end=============\n");
1646   }
1647 
1648   size_t ret_pos = 0;
1649   do {
1650     id_num--;
1651     if (0 == idxs[id_num])
1652       continue;
1653 
1654     char16 str[kMaxLemmaSize + 1];
1655     uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1);
1656     if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) ||
1657         (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) {
1658       if (!only_unfixed)
1659         utf16_strncpy(cand_str + ret_pos, str, str_len);
1660       else if (ret_pos >= fixed_hzs_)
1661         utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len);
1662 
1663       ret_pos += str_len;
1664     } else {
1665       return NULL;
1666     }
1667   } while (id_num != 0);
1668 
1669   if (!only_unfixed) {
1670     if (NULL != retstr_len)
1671       *retstr_len = ret_pos;
1672     cand_str[ret_pos] = (char16)'\0';
1673   } else {
1674     if (NULL != retstr_len)
1675       *retstr_len = ret_pos - fixed_hzs_;
1676     cand_str[ret_pos - fixed_hzs_] = (char16)'\0';
1677   }
1678   return cand_str;
1679 }
1680 
get_lpis(const uint16 * splid_str,size_t splid_str_len,LmaPsbItem * lma_buf,size_t max_lma_buf,const char16 * pfullsent,bool sort_by_psb)1681 size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len,
1682                               LmaPsbItem* lma_buf, size_t max_lma_buf,
1683                               const char16 *pfullsent, bool sort_by_psb) {
1684   if (splid_str_len > kMaxLemmaSize)
1685     return 0;
1686 
1687   size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len,
1688                                      lma_buf, max_lma_buf);
1689   size_t num2 = 0;
1690   if (NULL != user_dict_) {
1691     num2 = user_dict_->get_lpis(splid_str, splid_str_len,
1692                          lma_buf + num1, max_lma_buf - num1);
1693   }
1694 
1695   size_t num = num1 + num2;
1696 
1697   if (0 == num)
1698     return 0;
1699 
1700   // Remove repeated items.
1701   if (splid_str_len > 1) {
1702     LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num);
1703     size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) /
1704         sizeof(LmaPsbStrItem);
1705     assert(lpsi_num > num);
1706     if (num > lpsi_num) num = lpsi_num;
1707     lpsi_num = num;
1708 
1709     for (size_t pos = 0; pos < lpsi_num; pos++) {
1710       lpsis[pos].lpi = lma_buf[pos];
1711       get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1);
1712     }
1713 
1714     myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str);
1715 
1716     size_t remain_num = 0;
1717     for (size_t pos = 0; pos < lpsi_num; pos++) {
1718       if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) {
1719         if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) {
1720           assert(remain_num > 0);
1721           lma_buf[remain_num - 1] = lpsis[pos].lpi;
1722         }
1723         continue;
1724       }
1725       if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0)
1726         continue;
1727 
1728       lma_buf[remain_num] = lpsis[pos].lpi;
1729       remain_num++;
1730     }
1731 
1732     // Update the result number
1733     num = remain_num;
1734   } else {
1735     // For single character, some characters have more than one spelling, for
1736     // example, "de" and "di" are all valid for a Chinese character, so when
1737     // the user input  "d", repeated items are generated.
1738     // For single character lemmas, Hanzis will be gotten
1739     for (size_t pos = 0; pos < num; pos++) {
1740       char16 hanzis[2];
1741       get_lemma_str(lma_buf[pos].id, hanzis, 2);
1742       lma_buf[pos].hanzi = hanzis[0];
1743     }
1744 
1745     myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi);
1746 
1747     size_t remain_num = 0;
1748     for (size_t pos = 0; pos < num; pos++) {
1749       if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) {
1750         if (NULL != pfullsent &&
1751             static_cast<char16>(0) == pfullsent[1] &&
1752             lma_buf[pos].hanzi == pfullsent[0])
1753           continue;
1754 
1755         if (lma_buf[pos].psb < lma_buf[pos - 1].psb) {
1756           assert(remain_num > 0);
1757           assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi);
1758           lma_buf[remain_num - 1] = lma_buf[pos];
1759         }
1760         continue;
1761       }
1762       if (NULL != pfullsent &&
1763           static_cast<char16>(0) == pfullsent[1] &&
1764           lma_buf[pos].hanzi == pfullsent[0])
1765           continue;
1766 
1767       lma_buf[remain_num] = lma_buf[pos];
1768       remain_num++;
1769     }
1770 
1771     num = remain_num;
1772   }
1773 
1774   if (sort_by_psb) {
1775     myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb);
1776   }
1777   return num;
1778 }
1779 
get_lemma_str(LemmaIdType id_lemma,char16 * str_buf,uint16 str_max)1780 uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
1781                                    uint16 str_max) {
1782   uint16 str_len = 0;
1783 
1784   if (is_system_lemma(id_lemma)) {
1785     str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max);
1786   } else if (is_user_lemma(id_lemma)) {
1787     if (NULL != user_dict_) {
1788       str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max);
1789     } else {
1790       str_len = 0;
1791       str_buf[0] = static_cast<char16>('\0');
1792     }
1793   } else if (is_composing_lemma(id_lemma)) {
1794     if (str_max <= 1)
1795       return 0;
1796     str_len = c_phrase_.sublma_start[c_phrase_.sublma_num];
1797     if (str_len > str_max - 1)
1798       str_len = str_max - 1;
1799     utf16_strncpy(str_buf, c_phrase_.chn_str, str_len);
1800     str_buf[str_len] = (char16)'\0';
1801     return str_len;
1802   }
1803 
1804   return str_len;
1805 }
1806 
get_lemma_splids(LemmaIdType id_lemma,uint16 * splids,uint16 splids_max,bool arg_valid)1807 uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
1808                                       uint16 splids_max, bool arg_valid) {
1809   uint16 splid_num = 0;
1810 
1811   if (arg_valid) {
1812     for (splid_num = 0; splid_num < splids_max; splid_num++) {
1813       if (spl_trie_->is_half_id(splids[splid_num]))
1814         break;
1815     }
1816     if (splid_num == splids_max)
1817       return splid_num;
1818   }
1819 
1820   if (is_system_lemma(id_lemma)) {
1821     splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max,
1822                                               arg_valid);
1823   } else if (is_user_lemma(id_lemma)) {
1824     if (NULL != user_dict_) {
1825       splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max,
1826                                                arg_valid);
1827     } else {
1828       splid_num = 0;
1829     }
1830   } else if (is_composing_lemma(id_lemma)) {
1831     if (c_phrase_.length > splids_max) {
1832       return 0;
1833     }
1834     for (uint16 pos = 0; pos < c_phrase_.length; pos++) {
1835       splids[pos] = c_phrase_.spl_ids[pos];
1836       if (spl_trie_->is_half_id(splids[pos])) {
1837         return 0;
1838       }
1839     }
1840   }
1841   return splid_num;
1842 }
1843 
inner_predict(const char16 * fixed_buf,uint16 fixed_len,char16 predict_buf[][kMaxPredictSize+1],size_t buf_len)1844 size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len,
1845                                    char16 predict_buf[][kMaxPredictSize + 1],
1846                                    size_t buf_len) {
1847   size_t res_total = 0;
1848   memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_);
1849   // In order to shorten the comments, j-character candidates predicted by
1850   // i-character prefix are called P(i,j). All candiates predicted by
1851   // i-character prefix are called P(i,*)
1852   // Step 1. Get P(kMaxPredictSize, *) and sort them, here
1853   // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1)
1854   for (size_t len = fixed_len; len >0; len--) {
1855     // How many blank items are available
1856     size_t this_max = npre_items_len_ - res_total;
1857     size_t res_this;
1858     // If the history is longer than 1, and we can not get prediction from
1859     // lemmas longer than 2, in this case, we will add lemmas with
1860     // highest scores as the prediction result.
1861     if (fixed_len > 1 && 1 == len && 0 == res_total) {
1862       // Try to find if recent n (n>1) characters can be a valid lemma in system
1863       // dictionary.
1864       bool nearest_n_word = false;
1865       for (size_t nlen = 2; nlen <= fixed_len; nlen++) {
1866         if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) {
1867           nearest_n_word = true;
1868           break;
1869         }
1870       }
1871       res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0,
1872                                               npre_items_ + res_total,
1873                                               this_max, res_total);
1874       res_total += res_this;
1875     }
1876 
1877     // How many blank items are available
1878     this_max = npre_items_len_ - res_total;
1879     res_this = 0;
1880     if (!kOnlyUserDictPredict) {
1881       res_this =
1882           dict_trie_->predict(fixed_buf + fixed_len - len, len,
1883                               npre_items_ + res_total, this_max,
1884                               res_total);
1885     }
1886 
1887     if (NULL != user_dict_) {
1888       res_this = res_this +
1889                  user_dict_->predict(fixed_buf + fixed_len - len, len,
1890                                      npre_items_ + res_total + res_this,
1891                                      this_max - res_this, res_total + res_this);
1892     }
1893 
1894     if (kPredictLimitGt1) {
1895       myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem),
1896               cmp_npre_by_score);
1897 
1898       if (len > 3) {
1899         if (res_this > kMaxPredictNumByGt3)
1900           res_this = kMaxPredictNumByGt3;
1901       } else if (3 == len) {
1902         if (res_this > kMaxPredictNumBy3)
1903           res_this = kMaxPredictNumBy3;
1904       } else if (2 == len) {
1905         if (res_this > kMaxPredictNumBy2)
1906           res_this = kMaxPredictNumBy2;
1907       }
1908     }
1909 
1910     res_total += res_this;
1911   }
1912 
1913   res_total = remove_duplicate_npre(npre_items_, res_total);
1914 
1915   if (kPreferLongHistoryPredict) {
1916     myqsort(npre_items_, res_total, sizeof(NPredictItem),
1917             cmp_npre_by_hislen_score);
1918   } else {
1919     myqsort(npre_items_, res_total, sizeof(NPredictItem),
1920             cmp_npre_by_score);
1921   }
1922 
1923   if (buf_len < res_total) {
1924     res_total = buf_len;
1925   }
1926 
1927   if (kPrintDebug2) {
1928     printf("/////////////////Predicted Items Begin////////////////////>>\n");
1929     for (size_t i = 0; i < res_total; i++) {
1930       printf("---");
1931       for (size_t j = 0; j < kMaxPredictSize; j++) {
1932         printf("%d  ", npre_items_[i].pre_hzs[j]);
1933       }
1934       printf("\n");
1935     }
1936     printf("<<///////////////Predicted Items End////////////////////////\n");
1937   }
1938 
1939   for (size_t i = 0; i < res_total; i++) {
1940     utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs,
1941                   kMaxPredictSize);
1942     predict_buf[i][kMaxPredictSize] = '\0';
1943   }
1944 
1945   return res_total;
1946 }
1947 
get_predicts(const char16 fixed_buf[],char16 predict_buf[][kMaxPredictSize+1],size_t buf_len)1948 size_t MatrixSearch::get_predicts(const char16 fixed_buf[],
1949                                   char16 predict_buf[][kMaxPredictSize + 1],
1950                                   size_t buf_len) {
1951   size_t fixed_len = utf16_strlen(fixed_buf);
1952   if (0 ==fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len)
1953     return 0;
1954 
1955   return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len);
1956 }
1957 
1958 }  // namespace ime_pinyin
1959