• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <assert.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 
22 #include "../include/dictbuilder.h"
23 #include "../include/dicttrie.h"
24 #include "../include/mystdlib.h"
25 #include "../include/ngram.h"
26 #include "../include/searchutility.h"
27 #include "../include/spellingtable.h"
28 #include "../include/spellingtrie.h"
29 #include "../include/splparser.h"
30 #include "../include/utf16reader.h"
31 
32 namespace ime_pinyin {
33 
34 #ifdef ___BUILD_MODEL___
35 
36 static const size_t kReadBufLen = 512;
37 static const size_t kSplTableHashLen = 2000;
38 
39 // Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by
40 // frequencies.
cmp_scis_hz_splid_freq(const void * p1,const void * p2)41 int cmp_scis_hz_splid_freq(const void* p1, const void* p2) {
42   const SingleCharItem *s1, *s2;
43   s1 = static_cast<const SingleCharItem*>(p1);
44   s2 = static_cast<const SingleCharItem*>(p2);
45 
46   if (s1->hz < s2->hz)
47     return -1;
48   if (s1->hz > s2->hz)
49     return 1;
50 
51   if (s1->splid.half_splid < s2->splid.half_splid)
52     return -1;
53   if (s1->splid.half_splid > s2->splid.half_splid)
54     return 1;
55 
56   if (s1->splid.full_splid < s2->splid.full_splid)
57     return -1;
58   if (s1->splid.full_splid > s2->splid.full_splid)
59     return 1;
60 
61   if (s1->freq > s2->freq)
62     return -1;
63   if (s1->freq < s2->freq)
64     return 1;
65   return 0;
66 }
67 
cmp_scis_hz_splid(const void * p1,const void * p2)68 int cmp_scis_hz_splid(const void* p1, const void* p2) {
69   const SingleCharItem *s1, *s2;
70   s1 = static_cast<const SingleCharItem*>(p1);
71   s2 = static_cast<const SingleCharItem*>(p2);
72 
73   if (s1->hz < s2->hz)
74     return -1;
75   if (s1->hz > s2->hz)
76     return 1;
77 
78   if (s1->splid.half_splid < s2->splid.half_splid)
79     return -1;
80   if (s1->splid.half_splid > s2->splid.half_splid)
81     return 1;
82 
83   if (s1->splid.full_splid < s2->splid.full_splid)
84     return -1;
85   if (s1->splid.full_splid > s2->splid.full_splid)
86     return 1;
87 
88   return 0;
89 }
90 
cmp_lemma_entry_hzs(const void * p1,const void * p2)91 int cmp_lemma_entry_hzs(const void* p1, const void* p2) {
92   size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
93   size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
94   if (size1 < size2)
95     return -1;
96   else if (size1 > size2)
97     return 1;
98 
99   return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
100                       ((const LemmaEntry*)p2)->hanzi_str);
101 }
102 
compare_char16(const void * p1,const void * p2)103 int compare_char16(const void* p1, const void* p2) {
104   if (*((const char16*)p1) < *((const char16*)p2))
105     return -1;
106   if (*((const char16*)p1) > *((const char16*)p2))
107     return 1;
108   return 0;
109 }
110 
compare_py(const void * p1,const void * p2)111 int compare_py(const void* p1, const void* p2) {
112   int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
113                          ((const LemmaEntry*)p2)->spl_idx_arr);
114 
115   if (0 != ret)
116     return ret;
117 
118   return static_cast<int>(((const LemmaEntry*)p2)->freq) -
119          static_cast<int>(((const LemmaEntry*)p1)->freq);
120 }
121 
122 // First hanzi, if the same, then Pinyin
cmp_lemma_entry_hzspys(const void * p1,const void * p2)123 int cmp_lemma_entry_hzspys(const void* p1, const void* p2) {
124   size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
125   size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
126   if (size1 < size2)
127     return -1;
128   else if (size1 > size2)
129     return 1;
130   int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
131                          ((const LemmaEntry*)p2)->hanzi_str);
132 
133   if (0 != ret)
134     return ret;
135 
136   ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
137                      ((const LemmaEntry*)p2)->spl_idx_arr);
138   return ret;
139 }
140 
compare_splid2(const void * p1,const void * p2)141 int compare_splid2(const void* p1, const void* p2) {
142   int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
143                          ((const LemmaEntry*)p2)->spl_idx_arr);
144   return ret;
145 }
146 
DictBuilder()147 DictBuilder::DictBuilder() {
148   lemma_arr_ = NULL;
149   lemma_num_ = 0;
150 
151   scis_ = NULL;
152   scis_num_ = 0;
153 
154   lma_nodes_le0_ = NULL;
155   lma_nodes_ge1_ = NULL;
156 
157   lma_nds_used_num_le0_ = 0;
158   lma_nds_used_num_ge1_ = 0;
159 
160   homo_idx_buf_ = NULL;
161   homo_idx_num_eq1_ = 0;
162   homo_idx_num_gt1_ = 0;
163 
164   top_lmas_ = NULL;
165   top_lmas_num_ = 0;
166 
167   spl_table_ = NULL;
168   spl_parser_ = NULL;
169 }
170 
~DictBuilder()171 DictBuilder::~DictBuilder() {
172   free_resource();
173 }
174 
alloc_resource(size_t lma_num)175 bool DictBuilder::alloc_resource(size_t lma_num) {
176   if (0 == lma_num)
177     return false;
178 
179   free_resource();
180 
181   lemma_num_ = lma_num;
182   lemma_arr_ = new LemmaEntry[lemma_num_];
183 
184   top_lmas_num_ = 0;
185   top_lmas_ = new LemmaEntry[kTopScoreLemmaNum];
186 
187   // New the scis_ buffer to the possible maximum size.
188   scis_num_ = lemma_num_ * kMaxLemmaSize;
189   scis_ = new SingleCharItem[scis_num_];
190 
191   // The root and first level nodes is less than kMaxSpellingNum + 1
192   lma_nds_used_num_le0_ = 0;
193   lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1];
194 
195   // Other nodes is less than lemma_num
196   lma_nds_used_num_ge1_ = 0;
197   lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_];
198 
199   homo_idx_buf_ = new LemmaIdType[lemma_num_];
200   spl_table_ = new SpellingTable();
201   spl_parser_ = new SpellingParser();
202 
203   if (NULL == lemma_arr_ || NULL == top_lmas_ ||
204       NULL == scis_ || NULL == spl_table_ ||
205       NULL == spl_parser_ || NULL == lma_nodes_le0_ ||
206       NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) {
207     free_resource();
208     return false;
209   }
210 
211   memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_);
212   memset(scis_, 0, sizeof(SingleCharItem) * scis_num_);
213   memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1));
214   memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_);
215   memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_);
216   spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true);
217 
218   return true;
219 }
220 
read_valid_hanzis(const char * fn_validhzs,size_t * num)221 char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) {
222   if (NULL == fn_validhzs || NULL == num)
223     return NULL;
224 
225   *num = 0;
226   FILE *fp = fopen(fn_validhzs, "rb");
227   if (NULL == fp)
228     return NULL;
229 
230   char16 utf16header;
231   if (fread(&utf16header, sizeof(char16), 1, fp) != 1 ||
232       0xfeff != utf16header) {
233     fclose(fp);
234     return NULL;
235   }
236 
237   fseek(fp, 0, SEEK_END);
238   *num = ftell(fp) / sizeof(char16);
239   assert(*num >= 1);
240   *num -= 1;
241 
242   char16 *hzs = new char16[*num];
243   if (NULL == hzs) {
244     fclose(fp);
245     return NULL;
246   }
247 
248   fseek(fp, 2, SEEK_SET);
249 
250   if (fread(hzs, sizeof(char16), *num, fp) != *num) {
251     fclose(fp);
252     delete [] hzs;
253     return NULL;
254   }
255   fclose(fp);
256 
257   myqsort(hzs, *num, sizeof(char16), compare_char16);
258   return hzs;
259 }
260 
hz_in_hanzis_list(const char16 * hzs,size_t hzs_len,char16 hz)261 bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len,
262                                     char16 hz) {
263   if (NULL == hzs)
264     return false;
265 
266   char16 *found;
267   found = static_cast<char16*>(
268       mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16));
269   if (NULL == found)
270     return false;
271 
272   assert(*found == hz);
273   return true;
274 }
275 
276 // The caller makes sure that the parameters are valid.
str_in_hanzis_list(const char16 * hzs,size_t hzs_len,const char16 * str,size_t str_len)277 bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
278                                      const char16 *str, size_t str_len) {
279   if (NULL == hzs || NULL == str)
280     return false;
281 
282   for (size_t pos = 0; pos < str_len; pos++) {
283     if (!hz_in_hanzis_list(hzs, hzs_len, str[pos]))
284       return false;
285   }
286   return true;
287 }
288 
get_top_lemmas()289 void DictBuilder::get_top_lemmas() {
290   top_lmas_num_ = 0;
291   if (NULL == lemma_arr_)
292     return;
293 
294   for (size_t pos = 0; pos < lemma_num_; pos++) {
295     if (0 == top_lmas_num_) {
296       top_lmas_[0] = lemma_arr_[pos];
297       top_lmas_num_ = 1;
298       continue;
299     }
300 
301     if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) {
302       if (kTopScoreLemmaNum > top_lmas_num_)
303         top_lmas_num_ += 1;
304 
305       size_t move_pos;
306       for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) {
307         top_lmas_[move_pos] = top_lmas_[move_pos - 1];
308         if (0 == move_pos - 1 ||
309             (move_pos - 1 > 0 &&
310              top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) {
311           break;
312         }
313       }
314       assert(move_pos > 0);
315       top_lmas_[move_pos - 1] = lemma_arr_[pos];
316     } else if (kTopScoreLemmaNum > top_lmas_num_) {
317       top_lmas_[top_lmas_num_] = lemma_arr_[pos];
318       top_lmas_num_ += 1;
319     }
320   }
321 
322   if (kPrintDebug0) {
323     printf("\n------Top Lemmas------------------\n");
324     for (size_t pos = 0; pos < top_lmas_num_; pos++) {
325       printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz,
326              top_lmas_[pos].freq);
327     }
328   }
329 }
330 
free_resource()331 void DictBuilder::free_resource() {
332   if (NULL != lemma_arr_)
333     delete [] lemma_arr_;
334 
335   if (NULL != scis_)
336     delete [] scis_;
337 
338   if (NULL != lma_nodes_le0_)
339     delete [] lma_nodes_le0_;
340 
341   if (NULL != lma_nodes_ge1_)
342     delete [] lma_nodes_ge1_;
343 
344   if (NULL != homo_idx_buf_)
345     delete [] homo_idx_buf_;
346 
347   if (NULL != spl_table_)
348     delete spl_table_;
349 
350   if (NULL != spl_parser_)
351     delete spl_parser_;
352 
353   lemma_arr_ = NULL;
354   scis_ = NULL;
355   lma_nodes_le0_ = NULL;
356   lma_nodes_ge1_ = NULL;
357   homo_idx_buf_ = NULL;
358   spl_table_ = NULL;
359   spl_parser_ = NULL;
360 
361   lemma_num_ = 0;
362   lma_nds_used_num_le0_ = 0;
363   lma_nds_used_num_ge1_ = 0;
364   homo_idx_num_eq1_ = 0;
365   homo_idx_num_gt1_ = 0;
366 }
367 
read_raw_dict(const char * fn_raw,const char * fn_validhzs,size_t max_item)368 size_t DictBuilder::read_raw_dict(const char* fn_raw,
369                                   const char *fn_validhzs,
370                                   size_t max_item) {
371   if (NULL == fn_raw) return 0;
372 
373   Utf16Reader utf16_reader;
374   if (!utf16_reader.open(fn_raw, kReadBufLen * 10))
375     return false;
376 
377   char16 read_buf[kReadBufLen];
378 
379   // Read the number of lemmas in the file
380   size_t lemma_num = 240000;
381 
382   // allocate resource required
383   if (!alloc_resource(lemma_num)) {
384     utf16_reader.close();
385   }
386 
387   // Read the valid Hanzi list.
388   char16 *valid_hzs = NULL;
389   size_t valid_hzs_num = 0;
390   valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num);
391 
392   // Begin reading the lemma entries
393   for (size_t i = 0; i < max_item; i++) {
394     // read next entry
395     if (!utf16_reader.readline(read_buf, kReadBufLen)) {
396       lemma_num = i;
397       break;
398     }
399 
400     size_t token_size;
401     char16 *token;
402     char16 *to_tokenize = read_buf;
403 
404     // Get the Hanzi string
405     token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
406     if (NULL == token) {
407       free_resource();
408       utf16_reader.close();
409       return false;
410     }
411 
412     size_t lemma_size = utf16_strlen(token);
413 
414     if (lemma_size > kMaxLemmaSize) {
415       i--;
416       continue;
417     }
418 
419     if (lemma_size > 4) {
420       i--;
421       continue;
422     }
423 
424     // Copy to the lemma entry
425     utf16_strcpy(lemma_arr_[i].hanzi_str, token);
426 
427     lemma_arr_[i].hz_str_len = token_size;
428 
429     // Get the freq string
430     token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
431     if (NULL == token) {
432       free_resource();
433       utf16_reader.close();
434       return false;
435     }
436     lemma_arr_[i].freq = utf16_atof(token);
437 
438     if (lemma_size > 1 && lemma_arr_[i].freq < 60) {
439       i--;
440       continue;
441     }
442 
443     // Get GBK mark, if no valid Hanzi list available, all items which contains
444     // GBK characters will be discarded. Otherwise, all items which contains
445     // characters outside of the valid Hanzi list will be discarded.
446     token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
447     assert(NULL != token);
448     int gbk_flag = utf16_atoi(token);
449     if (NULL == valid_hzs || 0 == valid_hzs_num) {
450       if (0 != gbk_flag) {
451         i--;
452         continue;
453       }
454     } else {
455       if (!str_in_hanzis_list(valid_hzs, valid_hzs_num,
456           lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) {
457         i--;
458         continue;
459       }
460     }
461 
462     // Get spelling String
463     bool spelling_not_support = false;
464     for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
465          hz_pos++) {
466       // Get a Pinyin
467       token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
468       if (NULL == token) {
469         free_resource();
470         utf16_reader.close();
471         return false;
472       }
473 
474       assert(utf16_strlen(token) <= kMaxPinyinSize);
475 
476       utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token);
477 
478       format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]);
479 
480       // Put the pinyin to the spelling table
481       if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos],
482                                     lemma_arr_[i].freq)) {
483         spelling_not_support = true;
484         break;
485       }
486     }
487 
488     // The whole line must have been parsed fully, otherwise discard this one.
489     token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
490     if (spelling_not_support || NULL != token) {
491       i--;
492       continue;
493     }
494   }
495 
496   delete [] valid_hzs;
497   utf16_reader.close();
498 
499   printf("read succesfully, lemma num: %d\n", lemma_num);
500 
501   return lemma_num;
502 }
503 
build_dict(const char * fn_raw,const char * fn_validhzs,DictTrie * dict_trie)504 bool DictBuilder::build_dict(const char *fn_raw,
505                              const char *fn_validhzs,
506                              DictTrie *dict_trie) {
507   if (NULL == fn_raw || NULL == dict_trie)
508     return false;
509 
510   lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000);
511   if (0 == lemma_num_)
512     return false;
513 
514   // Arrange the spelling table, and build a spelling tree
515   // The size of an spelling. '\0' is included. If the spelling table is
516   // initialized to calculate the spelling scores, the last char in the
517   // spelling string will be score, and it is also included in spl_item_size.
518   size_t spl_item_size;
519   size_t spl_num;
520   const char* spl_buf;
521   spl_buf = spl_table_->arrange(&spl_item_size, &spl_num);
522   if (NULL == spl_buf) {
523     free_resource();
524     return false;
525   }
526 
527   SpellingTrie &spl_trie = SpellingTrie::get_instance();
528 
529   if (!spl_trie.construct(spl_buf, spl_item_size, spl_num,
530                           spl_table_->get_score_amplifier(),
531                           spl_table_->get_average_score())) {
532     free_resource();
533     return false;
534   }
535 
536   printf("spelling tree construct successfully.\n");
537 
538   // Convert the spelling string to idxs
539   for (size_t i = 0; i < lemma_num_; i++) {
540     for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
541          hz_pos++) {
542       uint16 spl_idxs[2];
543       uint16 spl_start_pos[3];
544       bool is_pre = true;
545       int spl_idx_num =
546         spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos],
547                                     strlen(lemma_arr_[i].pinyin_str[hz_pos]),
548                                     spl_idxs, spl_start_pos, 2, is_pre);
549       assert(1 == spl_idx_num);
550 
551       if (spl_trie.is_half_id(spl_idxs[0])) {
552         uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs);
553         assert(0 != num);
554       }
555       lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0];
556     }
557   }
558 
559   // Sort the lemma items according to the hanzi, and give each unique item a
560   // id
561   sort_lemmas_by_hz();
562 
563   scis_num_ = build_scis();
564 
565   // Construct the dict list
566   dict_trie->dict_list_ = new DictList();
567   bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_,
568                                                      lemma_arr_, lemma_num_);
569   assert(dl_success);
570 
571   // Construct the NGram information
572   NGram& ngram = NGram::get_instance();
573   ngram.build_unigram(lemma_arr_, lemma_num_,
574                       lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);
575 
576   // sort the lemma items according to the spelling idx string
577   myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);
578 
579   get_top_lemmas();
580 
581 #ifdef ___DO_STATISTICS___
582   stat_init();
583 #endif
584 
585   lma_nds_used_num_le0_ = 1;  // The root node
586   bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_),
587                                      lemma_arr_, 0, lemma_num_, 0);
588   if (!dt_success) {
589     free_resource();
590     return false;
591   }
592 
593 #ifdef ___DO_STATISTICS___
594   stat_print();
595 #endif
596 
597   // Move the node data and homo data to the DictTrie
598   dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_];
599   dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_];
600   size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_;
601   dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize];
602   assert(NULL != dict_trie->root_);
603   assert(NULL != dict_trie->lma_idx_buf_);
604   dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_;
605   dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_;
606   dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize;
607   dict_trie->top_lmas_num_ = top_lmas_num_;
608 
609   memcpy(dict_trie->root_, lma_nodes_le0_,
610          sizeof(LmaNodeLE0) * lma_nds_used_num_le0_);
611   memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_,
612          sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_);
613 
614   for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) {
615     id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize,
616                   homo_idx_buf_[pos]);
617   }
618 
619   for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_;
620        pos < lma_idx_num; pos++) {
621     LemmaIdType idx =
622         top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz;
623     id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx);
624   }
625 
626   if (kPrintDebug0) {
627     printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_);
628     printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_);
629     printf("top_lmas_num_: %d\n", top_lmas_num_);
630   }
631 
632   free_resource();
633 
634   if (kPrintDebug0) {
635     printf("Building dict succeds\n");
636   }
637   return dt_success;
638 }
639 
id_to_charbuf(unsigned char * buf,LemmaIdType id)640 void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) {
641   if (NULL == buf) return;
642   for (size_t pos = 0; pos < kLemmaIdSize; pos++) {
643     (buf)[pos] = (unsigned char)(id >> (pos * 8));
644   }
645 }
646 
set_son_offset(LmaNodeGE1 * node,size_t offset)647 void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) {
648   node->son_1st_off_l = static_cast<uint16>(offset);
649   node->son_1st_off_h = static_cast<unsigned char>(offset >> 16);
650 }
651 
set_homo_id_buf_offset(LmaNodeGE1 * node,size_t offset)652 void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) {
653   node->homo_idx_buf_off_l = static_cast<uint16>(offset);
654   node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16);
655 
656 }
657 
658 // All spelling strings will be converted to upper case, except that
659 // spellings started with "ZH"/"CH"/"SH" will be converted to
660 // "Zh"/"Ch"/"Sh"
format_spelling_str(char * spl_str)661 void DictBuilder::format_spelling_str(char *spl_str) {
662   if (NULL == spl_str)
663     return;
664 
665   uint16 pos = 0;
666   while ('\0' != spl_str[pos]) {
667     if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z')
668       spl_str[pos] = spl_str[pos] - 'a' + 'A';
669 
670     if (1 == pos && 'H' == spl_str[pos]) {
671       if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) {
672         spl_str[pos] = 'h';
673       }
674     }
675     pos++;
676   }
677 }
678 
sort_lemmas_by_hz()679 LemmaIdType DictBuilder::sort_lemmas_by_hz() {
680   if (NULL == lemma_arr_ || 0 == lemma_num_)
681     return 0;
682 
683   myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs);
684 
685   lemma_arr_[0].idx_by_hz = 1;
686   LemmaIdType idx_max = 1;
687   for (size_t i = 1; i < lemma_num_; i++) {
688     if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) {
689       idx_max++;
690       lemma_arr_[i].idx_by_hz = idx_max;
691     } else {
692       idx_max++;
693       lemma_arr_[i].idx_by_hz = idx_max;
694     }
695   }
696   return idx_max + 1;
697 }
698 
build_scis()699 size_t DictBuilder::build_scis() {
700   if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_)
701     return 0;
702 
703   SpellingTrie &spl_trie = SpellingTrie::get_instance();
704 
705   // This first one is blank, because id 0 is invalid.
706   scis_[0].freq = 0;
707   scis_[0].hz = 0;
708   scis_[0].splid.full_splid = 0;
709   scis_[0].splid.half_splid = 0;
710   scis_num_ = 1;
711 
712   // Copy the hanzis to the buffer
713   for (size_t pos = 0; pos < lemma_num_; pos++) {
714     size_t hz_num = lemma_arr_[pos].hz_str_len;
715     for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
716       scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos];
717       scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
718       scis_[scis_num_].splid.half_splid =
719           spl_trie.full_to_half(scis_[scis_num_].splid.full_splid);
720       if (1 == hz_num)
721         scis_[scis_num_].freq = lemma_arr_[pos].freq;
722       else
723         scis_[scis_num_].freq = 0.000001;
724       scis_num_++;
725     }
726   }
727 
728   myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq);
729 
730   // Remove repeated items
731   size_t unique_scis_num = 1;
732   for (size_t pos = 1; pos < scis_num_; pos++) {
733     if (scis_[pos].hz == scis_[pos - 1].hz &&
734         scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid)
735       continue;
736     scis_[unique_scis_num] = scis_[pos];
737     scis_[unique_scis_num].splid.half_splid =
738         spl_trie.full_to_half(scis_[pos].splid.full_splid);
739     unique_scis_num++;
740   }
741 
742   scis_num_ = unique_scis_num;
743 
744   // Update the lemma list.
745   for (size_t pos = 0; pos < lemma_num_; pos++) {
746     size_t hz_num = lemma_arr_[pos].hz_str_len;
747     for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
748       SingleCharItem key;
749       key.hz = lemma_arr_[pos].hanzi_str[hzpos];
750       key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
751       key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid);
752 
753       SingleCharItem *found;
754       found = static_cast<SingleCharItem*>(mybsearch(&key, scis_,
755                                                      unique_scis_num,
756                                                      sizeof(SingleCharItem),
757                                                      cmp_scis_hz_splid));
758 
759       assert(found);
760 
761       lemma_arr_[pos].hanzi_scis_ids[hzpos] =
762           static_cast<uint16>(found - scis_);
763       lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid;
764     }
765   }
766 
767   return scis_num_;
768 }
769 
construct_subset(void * parent,LemmaEntry * lemma_arr,size_t item_start,size_t item_end,size_t level)770 bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr,
771                                    size_t item_start, size_t item_end,
772                                    size_t level) {
773   if (level >= kMaxLemmaSize || item_end <= item_start)
774     return false;
775 
776   // 1. Scan for how many sons
777   size_t parent_son_num = 0;
778   // LemmaNode *son_1st = NULL;
779   // parent.num_of_son = 0;
780 
781   LemmaEntry *lma_last_start = lemma_arr_ + item_start;
782   uint16 spl_idx_node = lma_last_start->spl_idx_arr[level];
783 
784   // Scan for how many sons to be allocaed
785   for (size_t i = item_start + 1; i< item_end; i++) {
786     LemmaEntry *lma_current = lemma_arr + i;
787     uint16 spl_idx_current = lma_current->spl_idx_arr[level];
788     if (spl_idx_current != spl_idx_node) {
789       parent_son_num++;
790       spl_idx_node = spl_idx_current;
791     }
792   }
793   parent_son_num++;
794 
795 #ifdef ___DO_STATISTICS___
796   // Use to indicate whether all nodes of this layer have no son.
797   bool allson_noson = true;
798 
799   assert(level < kMaxLemmaSize);
800   if (parent_son_num > max_sonbuf_len_[level])
801     max_sonbuf_len_[level] = parent_son_num;
802 
803   total_son_num_[level] += parent_son_num;
804   total_sonbuf_num_[level] += 1;
805 
806   if (parent_son_num == 1)
807     sonbufs_num1_++;
808   else
809     sonbufs_numgt1_++;
810   total_lma_node_num_ += parent_son_num;
811 #endif
812 
813   // 2. Update the parent's information
814   //    Update the parent's son list;
815   LmaNodeLE0 *son_1st_le0 = NULL;  // only one of le0 or ge1 is used
816   LmaNodeGE1 *son_1st_ge1 = NULL;  // only one of le0 or ge1 is used.
817   if (0 == level) {                 // the parent is root
818     (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
819       lma_nds_used_num_le0_;
820     son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_;
821     lma_nds_used_num_le0_ += parent_son_num;
822 
823     assert(parent_son_num <= 65535);
824     (static_cast<LmaNodeLE0*>(parent))->num_of_son =
825       static_cast<uint16>(parent_son_num);
826   } else if (1 == level) {  // the parent is a son of root
827     (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
828       lma_nds_used_num_ge1_;
829     son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
830     lma_nds_used_num_ge1_ += parent_son_num;
831 
832     assert(parent_son_num <= 65535);
833     (static_cast<LmaNodeLE0*>(parent))->num_of_son =
834       static_cast<uint16>(parent_son_num);
835   } else {
836     set_son_offset((static_cast<LmaNodeGE1*>(parent)),
837                    lma_nds_used_num_ge1_);
838     son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
839     lma_nds_used_num_ge1_ += parent_son_num;
840 
841     assert(parent_son_num <= 255);
842     (static_cast<LmaNodeGE1*>(parent))->num_of_son =
843       (unsigned char)parent_son_num;
844   }
845 
846   // 3. Now begin to construct the son one by one
847   size_t son_pos = 0;
848 
849   lma_last_start = lemma_arr_ + item_start;
850   spl_idx_node = lma_last_start->spl_idx_arr[level];
851 
852   size_t homo_num = 0;
853   if (lma_last_start->spl_idx_arr[level + 1] == 0)
854     homo_num = 1;
855 
856   size_t item_start_next = item_start;
857 
858   for (size_t i = item_start + 1; i < item_end; i++) {
859     LemmaEntry* lma_current = lemma_arr_ + i;
860     uint16 spl_idx_current = lma_current->spl_idx_arr[level];
861 
862     if (spl_idx_current == spl_idx_node) {
863       if (lma_current->spl_idx_arr[level + 1] == 0)
864         homo_num++;
865     } else {
866       // Construct a node
867       LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
868       LmaNodeGE1 *node_cur_ge1 = NULL;
869       if (0 == level) {
870         node_cur_le0 = son_1st_le0 + son_pos;
871         node_cur_le0->spl_idx = spl_idx_node;
872         node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
873         node_cur_le0->son_1st_off = 0;
874         homo_idx_num_eq1_ += homo_num;
875       } else {
876         node_cur_ge1 = son_1st_ge1 + son_pos;
877         node_cur_ge1->spl_idx = spl_idx_node;
878 
879         set_homo_id_buf_offset(node_cur_ge1,
880                                (homo_idx_num_eq1_ + homo_idx_num_gt1_));
881         set_son_offset(node_cur_ge1, 0);
882         homo_idx_num_gt1_ += homo_num;
883       }
884 
885       if (homo_num > 0) {
886         LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
887               homo_idx_num_gt1_ - homo_num;
888         if (0 == level) {
889           assert(homo_num <= 65535);
890           node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
891         } else {
892           assert(homo_num <= 255);
893           node_cur_ge1->num_of_homo = (unsigned char)homo_num;
894         }
895 
896         for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
897           idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz;
898         }
899 
900 #ifdef ___DO_STATISTICS___
901         if (homo_num > max_homobuf_len_[level])
902           max_homobuf_len_[level] = homo_num;
903 
904         total_homo_num_[level] += homo_num;
905 #endif
906       }
907 
908       if (i - item_start_next > homo_num) {
909         void *next_parent;
910         if (0 == level)
911           next_parent = static_cast<void*>(node_cur_le0);
912         else
913           next_parent = static_cast<void*>(node_cur_ge1);
914         construct_subset(next_parent, lemma_arr,
915                          item_start_next + homo_num, i, level + 1);
916 #ifdef ___DO_STATISTICS___
917 
918         total_node_hasson_[level] += 1;
919         allson_noson = false;
920 #endif
921       }
922 
923       // for the next son
924       lma_last_start = lma_current;
925       spl_idx_node = spl_idx_current;
926       item_start_next = i;
927       homo_num = 0;
928       if (lma_current->spl_idx_arr[level + 1] == 0)
929         homo_num = 1;
930 
931       son_pos++;
932     }
933   }
934 
935   // 4. The last one to construct
936   LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
937   LmaNodeGE1 *node_cur_ge1 = NULL;
938   if (0 == level) {
939     node_cur_le0 = son_1st_le0 + son_pos;
940     node_cur_le0->spl_idx = spl_idx_node;
941     node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
942     node_cur_le0->son_1st_off = 0;
943     homo_idx_num_eq1_ += homo_num;
944   } else {
945     node_cur_ge1 = son_1st_ge1 + son_pos;
946     node_cur_ge1->spl_idx = spl_idx_node;
947 
948     set_homo_id_buf_offset(node_cur_ge1,
949                            (homo_idx_num_eq1_ + homo_idx_num_gt1_));
950     set_son_offset(node_cur_ge1, 0);
951     homo_idx_num_gt1_ += homo_num;
952   }
953 
954   if (homo_num > 0) {
955     LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
956           homo_idx_num_gt1_ - homo_num;
957     if (0 == level) {
958       assert(homo_num <= 65535);
959       node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
960     } else {
961       assert(homo_num <= 255);
962       node_cur_ge1->num_of_homo = (unsigned char)homo_num;
963     }
964 
965     for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
966       idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz;
967     }
968 
969 #ifdef ___DO_STATISTICS___
970     if (homo_num > max_homobuf_len_[level])
971       max_homobuf_len_[level] = homo_num;
972 
973     total_homo_num_[level] += homo_num;
974 #endif
975   }
976 
977   if (item_end - item_start_next > homo_num) {
978     void *next_parent;
979     if (0 == level)
980       next_parent = static_cast<void*>(node_cur_le0);
981     else
982       next_parent = static_cast<void*>(node_cur_ge1);
983     construct_subset(next_parent, lemma_arr,
984                      item_start_next + homo_num, item_end, level + 1);
985 #ifdef ___DO_STATISTICS___
986 
987     total_node_hasson_[level] += 1;
988     allson_noson = false;
989 #endif
990   }
991 
992 #ifdef ___DO_STATISTICS___
993   if (allson_noson) {
994     total_sonbuf_allnoson_[level] += 1;
995     total_node_in_sonbuf_allnoson_[level] += parent_son_num;
996   }
997 #endif
998 
999   assert(son_pos + 1 == parent_son_num);
1000   return true;
1001 }
1002 
1003 #ifdef ___DO_STATISTICS___
stat_init()1004 void DictBuilder::stat_init() {
1005   memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
1006   memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
1007   memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize);
1008   memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize);
1009   memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize);
1010   memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
1011   memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
1012   memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize);
1013 
1014   sonbufs_num1_ = 0;
1015   sonbufs_numgt1_ = 0;
1016   total_lma_node_num_ = 0;
1017 }
1018 
stat_print()1019 void DictBuilder::stat_print() {
1020   printf("\n------------STAT INFO-------------\n");
1021   printf("[root is layer -1]\n");
1022   printf(".. max_sonbuf_len per layer(from layer 0):\n   ");
1023   for (size_t i = 0; i < kMaxLemmaSize; i++)
1024     printf("%d, ", max_sonbuf_len_[i]);
1025   printf("-, \n");
1026 
1027   printf(".. max_homobuf_len per layer:\n   -, ");
1028   for (size_t i = 0; i < kMaxLemmaSize; i++)
1029     printf("%d, ", max_homobuf_len_[i]);
1030   printf("\n");
1031 
1032   printf(".. total_son_num per layer:\n   ");
1033   for (size_t i = 0; i < kMaxLemmaSize; i++)
1034     printf("%d, ", total_son_num_[i]);
1035   printf("-, \n");
1036 
1037   printf(".. total_node_hasson per layer:\n   1, ");
1038   for (size_t i = 0; i < kMaxLemmaSize; i++)
1039     printf("%d, ", total_node_hasson_[i]);
1040   printf("\n");
1041 
1042   printf(".. total_sonbuf_num per layer:\n   ");
1043   for (size_t i = 0; i < kMaxLemmaSize; i++)
1044     printf("%d, ", total_sonbuf_num_[i]);
1045   printf("-, \n");
1046 
1047   printf(".. total_sonbuf_allnoson per layer:\n   ");
1048   for (size_t i = 0; i < kMaxLemmaSize; i++)
1049     printf("%d, ", total_sonbuf_allnoson_[i]);
1050   printf("-, \n");
1051 
1052   printf(".. total_node_in_sonbuf_allnoson per layer:\n   ");
1053   for (size_t i = 0; i < kMaxLemmaSize; i++)
1054     printf("%d, ", total_node_in_sonbuf_allnoson_[i]);
1055   printf("-, \n");
1056 
1057   printf(".. total_homo_num per layer:\n   0, ");
1058   for (size_t i = 0; i < kMaxLemmaSize; i++)
1059     printf("%d, ", total_homo_num_[i]);
1060   printf("\n");
1061 
1062   printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_);
1063   printf(".. son buf allocation number with more than 1 son: %d\n",
1064          sonbufs_numgt1_);
1065   printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1);
1066 }
1067 #endif  // ___DO_STATISTICS___
1068 
1069 #endif  // ___BUILD_MODEL___
1070 }  // namespace ime_pinyin
1071