1 /*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <assert.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21
22 #include "../include/dictbuilder.h"
23 #include "../include/dicttrie.h"
24 #include "../include/mystdlib.h"
25 #include "../include/ngram.h"
26 #include "../include/searchutility.h"
27 #include "../include/spellingtable.h"
28 #include "../include/spellingtrie.h"
29 #include "../include/splparser.h"
30 #include "../include/utf16reader.h"
31
32 namespace ime_pinyin {
33
34 #ifdef ___BUILD_MODEL___
35
36 static const size_t kReadBufLen = 512;
37 static const size_t kSplTableHashLen = 2000;
38
39 // Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by
40 // frequencies.
cmp_scis_hz_splid_freq(const void * p1,const void * p2)41 int cmp_scis_hz_splid_freq(const void* p1, const void* p2) {
42 const SingleCharItem *s1, *s2;
43 s1 = static_cast<const SingleCharItem*>(p1);
44 s2 = static_cast<const SingleCharItem*>(p2);
45
46 if (s1->hz < s2->hz)
47 return -1;
48 if (s1->hz > s2->hz)
49 return 1;
50
51 if (s1->splid.half_splid < s2->splid.half_splid)
52 return -1;
53 if (s1->splid.half_splid > s2->splid.half_splid)
54 return 1;
55
56 if (s1->splid.full_splid < s2->splid.full_splid)
57 return -1;
58 if (s1->splid.full_splid > s2->splid.full_splid)
59 return 1;
60
61 if (s1->freq > s2->freq)
62 return -1;
63 if (s1->freq < s2->freq)
64 return 1;
65 return 0;
66 }
67
cmp_scis_hz_splid(const void * p1,const void * p2)68 int cmp_scis_hz_splid(const void* p1, const void* p2) {
69 const SingleCharItem *s1, *s2;
70 s1 = static_cast<const SingleCharItem*>(p1);
71 s2 = static_cast<const SingleCharItem*>(p2);
72
73 if (s1->hz < s2->hz)
74 return -1;
75 if (s1->hz > s2->hz)
76 return 1;
77
78 if (s1->splid.half_splid < s2->splid.half_splid)
79 return -1;
80 if (s1->splid.half_splid > s2->splid.half_splid)
81 return 1;
82
83 if (s1->splid.full_splid < s2->splid.full_splid)
84 return -1;
85 if (s1->splid.full_splid > s2->splid.full_splid)
86 return 1;
87
88 return 0;
89 }
90
cmp_lemma_entry_hzs(const void * p1,const void * p2)91 int cmp_lemma_entry_hzs(const void* p1, const void* p2) {
92 size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
93 size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
94 if (size1 < size2)
95 return -1;
96 else if (size1 > size2)
97 return 1;
98
99 return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
100 ((const LemmaEntry*)p2)->hanzi_str);
101 }
102
compare_char16(const void * p1,const void * p2)103 int compare_char16(const void* p1, const void* p2) {
104 if (*((const char16*)p1) < *((const char16*)p2))
105 return -1;
106 if (*((const char16*)p1) > *((const char16*)p2))
107 return 1;
108 return 0;
109 }
110
compare_py(const void * p1,const void * p2)111 int compare_py(const void* p1, const void* p2) {
112 int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
113 ((const LemmaEntry*)p2)->spl_idx_arr);
114
115 if (0 != ret)
116 return ret;
117
118 return static_cast<int>(((const LemmaEntry*)p2)->freq) -
119 static_cast<int>(((const LemmaEntry*)p1)->freq);
120 }
121
122 // First hanzi, if the same, then Pinyin
cmp_lemma_entry_hzspys(const void * p1,const void * p2)123 int cmp_lemma_entry_hzspys(const void* p1, const void* p2) {
124 size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
125 size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
126 if (size1 < size2)
127 return -1;
128 else if (size1 > size2)
129 return 1;
130 int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
131 ((const LemmaEntry*)p2)->hanzi_str);
132
133 if (0 != ret)
134 return ret;
135
136 ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
137 ((const LemmaEntry*)p2)->spl_idx_arr);
138 return ret;
139 }
140
compare_splid2(const void * p1,const void * p2)141 int compare_splid2(const void* p1, const void* p2) {
142 int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
143 ((const LemmaEntry*)p2)->spl_idx_arr);
144 return ret;
145 }
146
DictBuilder()147 DictBuilder::DictBuilder() {
148 lemma_arr_ = NULL;
149 lemma_num_ = 0;
150
151 scis_ = NULL;
152 scis_num_ = 0;
153
154 lma_nodes_le0_ = NULL;
155 lma_nodes_ge1_ = NULL;
156
157 lma_nds_used_num_le0_ = 0;
158 lma_nds_used_num_ge1_ = 0;
159
160 homo_idx_buf_ = NULL;
161 homo_idx_num_eq1_ = 0;
162 homo_idx_num_gt1_ = 0;
163
164 top_lmas_ = NULL;
165 top_lmas_num_ = 0;
166
167 spl_table_ = NULL;
168 spl_parser_ = NULL;
169 }
170
~DictBuilder()171 DictBuilder::~DictBuilder() {
172 free_resource();
173 }
174
alloc_resource(size_t lma_num)175 bool DictBuilder::alloc_resource(size_t lma_num) {
176 if (0 == lma_num)
177 return false;
178
179 free_resource();
180
181 lemma_num_ = lma_num;
182 lemma_arr_ = new LemmaEntry[lemma_num_];
183
184 top_lmas_num_ = 0;
185 top_lmas_ = new LemmaEntry[kTopScoreLemmaNum];
186
187 // New the scis_ buffer to the possible maximum size.
188 scis_num_ = lemma_num_ * kMaxLemmaSize;
189 scis_ = new SingleCharItem[scis_num_];
190
191 // The root and first level nodes is less than kMaxSpellingNum + 1
192 lma_nds_used_num_le0_ = 0;
193 lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1];
194
195 // Other nodes is less than lemma_num
196 lma_nds_used_num_ge1_ = 0;
197 lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_];
198
199 homo_idx_buf_ = new LemmaIdType[lemma_num_];
200 spl_table_ = new SpellingTable();
201 spl_parser_ = new SpellingParser();
202
203 if (NULL == lemma_arr_ || NULL == top_lmas_ ||
204 NULL == scis_ || NULL == spl_table_ ||
205 NULL == spl_parser_ || NULL == lma_nodes_le0_ ||
206 NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) {
207 free_resource();
208 return false;
209 }
210
211 memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_);
212 memset(scis_, 0, sizeof(SingleCharItem) * scis_num_);
213 memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1));
214 memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_);
215 memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_);
216 spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true);
217
218 return true;
219 }
220
read_valid_hanzis(const char * fn_validhzs,size_t * num)221 char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) {
222 if (NULL == fn_validhzs || NULL == num)
223 return NULL;
224
225 *num = 0;
226 FILE *fp = fopen(fn_validhzs, "rb");
227 if (NULL == fp)
228 return NULL;
229
230 char16 utf16header;
231 if (fread(&utf16header, sizeof(char16), 1, fp) != 1 ||
232 0xfeff != utf16header) {
233 fclose(fp);
234 return NULL;
235 }
236
237 fseek(fp, 0, SEEK_END);
238 *num = ftell(fp) / sizeof(char16);
239 assert(*num >= 1);
240 *num -= 1;
241
242 char16 *hzs = new char16[*num];
243 if (NULL == hzs) {
244 fclose(fp);
245 return NULL;
246 }
247
248 fseek(fp, 2, SEEK_SET);
249
250 if (fread(hzs, sizeof(char16), *num, fp) != *num) {
251 fclose(fp);
252 delete [] hzs;
253 return NULL;
254 }
255 fclose(fp);
256
257 myqsort(hzs, *num, sizeof(char16), compare_char16);
258 return hzs;
259 }
260
hz_in_hanzis_list(const char16 * hzs,size_t hzs_len,char16 hz)261 bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len,
262 char16 hz) {
263 if (NULL == hzs)
264 return false;
265
266 char16 *found;
267 found = static_cast<char16*>(
268 mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16));
269 if (NULL == found)
270 return false;
271
272 assert(*found == hz);
273 return true;
274 }
275
276 // The caller makes sure that the parameters are valid.
str_in_hanzis_list(const char16 * hzs,size_t hzs_len,const char16 * str,size_t str_len)277 bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
278 const char16 *str, size_t str_len) {
279 if (NULL == hzs || NULL == str)
280 return false;
281
282 for (size_t pos = 0; pos < str_len; pos++) {
283 if (!hz_in_hanzis_list(hzs, hzs_len, str[pos]))
284 return false;
285 }
286 return true;
287 }
288
get_top_lemmas()289 void DictBuilder::get_top_lemmas() {
290 top_lmas_num_ = 0;
291 if (NULL == lemma_arr_)
292 return;
293
294 for (size_t pos = 0; pos < lemma_num_; pos++) {
295 if (0 == top_lmas_num_) {
296 top_lmas_[0] = lemma_arr_[pos];
297 top_lmas_num_ = 1;
298 continue;
299 }
300
301 if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) {
302 if (kTopScoreLemmaNum > top_lmas_num_)
303 top_lmas_num_ += 1;
304
305 size_t move_pos;
306 for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) {
307 top_lmas_[move_pos] = top_lmas_[move_pos - 1];
308 if (0 == move_pos - 1 ||
309 (move_pos - 1 > 0 &&
310 top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) {
311 break;
312 }
313 }
314 assert(move_pos > 0);
315 top_lmas_[move_pos - 1] = lemma_arr_[pos];
316 } else if (kTopScoreLemmaNum > top_lmas_num_) {
317 top_lmas_[top_lmas_num_] = lemma_arr_[pos];
318 top_lmas_num_ += 1;
319 }
320 }
321
322 if (kPrintDebug0) {
323 printf("\n------Top Lemmas------------------\n");
324 for (size_t pos = 0; pos < top_lmas_num_; pos++) {
325 printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz,
326 top_lmas_[pos].freq);
327 }
328 }
329 }
330
free_resource()331 void DictBuilder::free_resource() {
332 if (NULL != lemma_arr_)
333 delete [] lemma_arr_;
334
335 if (NULL != scis_)
336 delete [] scis_;
337
338 if (NULL != lma_nodes_le0_)
339 delete [] lma_nodes_le0_;
340
341 if (NULL != lma_nodes_ge1_)
342 delete [] lma_nodes_ge1_;
343
344 if (NULL != homo_idx_buf_)
345 delete [] homo_idx_buf_;
346
347 if (NULL != spl_table_)
348 delete spl_table_;
349
350 if (NULL != spl_parser_)
351 delete spl_parser_;
352
353 lemma_arr_ = NULL;
354 scis_ = NULL;
355 lma_nodes_le0_ = NULL;
356 lma_nodes_ge1_ = NULL;
357 homo_idx_buf_ = NULL;
358 spl_table_ = NULL;
359 spl_parser_ = NULL;
360
361 lemma_num_ = 0;
362 lma_nds_used_num_le0_ = 0;
363 lma_nds_used_num_ge1_ = 0;
364 homo_idx_num_eq1_ = 0;
365 homo_idx_num_gt1_ = 0;
366 }
367
read_raw_dict(const char * fn_raw,const char * fn_validhzs,size_t max_item)368 size_t DictBuilder::read_raw_dict(const char* fn_raw,
369 const char *fn_validhzs,
370 size_t max_item) {
371 if (NULL == fn_raw) return 0;
372
373 Utf16Reader utf16_reader;
374 if (!utf16_reader.open(fn_raw, kReadBufLen * 10))
375 return false;
376
377 char16 read_buf[kReadBufLen];
378
379 // Read the number of lemmas in the file
380 size_t lemma_num = 240000;
381
382 // allocate resource required
383 if (!alloc_resource(lemma_num)) {
384 utf16_reader.close();
385 }
386
387 // Read the valid Hanzi list.
388 char16 *valid_hzs = NULL;
389 size_t valid_hzs_num = 0;
390 valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num);
391
392 // Begin reading the lemma entries
393 for (size_t i = 0; i < max_item; i++) {
394 // read next entry
395 if (!utf16_reader.readline(read_buf, kReadBufLen)) {
396 lemma_num = i;
397 break;
398 }
399
400 size_t token_size;
401 char16 *token;
402 char16 *to_tokenize = read_buf;
403
404 // Get the Hanzi string
405 token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
406 if (NULL == token) {
407 free_resource();
408 utf16_reader.close();
409 return false;
410 }
411
412 size_t lemma_size = utf16_strlen(token);
413
414 if (lemma_size > kMaxLemmaSize) {
415 i--;
416 continue;
417 }
418
419 if (lemma_size > 4) {
420 i--;
421 continue;
422 }
423
424 // Copy to the lemma entry
425 utf16_strcpy(lemma_arr_[i].hanzi_str, token);
426
427 lemma_arr_[i].hz_str_len = token_size;
428
429 // Get the freq string
430 token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
431 if (NULL == token) {
432 free_resource();
433 utf16_reader.close();
434 return false;
435 }
436 lemma_arr_[i].freq = utf16_atof(token);
437
438 if (lemma_size > 1 && lemma_arr_[i].freq < 60) {
439 i--;
440 continue;
441 }
442
443 // Get GBK mark, if no valid Hanzi list available, all items which contains
444 // GBK characters will be discarded. Otherwise, all items which contains
445 // characters outside of the valid Hanzi list will be discarded.
446 token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
447 assert(NULL != token);
448 int gbk_flag = utf16_atoi(token);
449 if (NULL == valid_hzs || 0 == valid_hzs_num) {
450 if (0 != gbk_flag) {
451 i--;
452 continue;
453 }
454 } else {
455 if (!str_in_hanzis_list(valid_hzs, valid_hzs_num,
456 lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) {
457 i--;
458 continue;
459 }
460 }
461
462 // Get spelling String
463 bool spelling_not_support = false;
464 for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
465 hz_pos++) {
466 // Get a Pinyin
467 token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
468 if (NULL == token) {
469 free_resource();
470 utf16_reader.close();
471 return false;
472 }
473
474 assert(utf16_strlen(token) <= kMaxPinyinSize);
475
476 utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token);
477
478 format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]);
479
480 // Put the pinyin to the spelling table
481 if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos],
482 lemma_arr_[i].freq)) {
483 spelling_not_support = true;
484 break;
485 }
486 }
487
488 // The whole line must have been parsed fully, otherwise discard this one.
489 token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
490 if (spelling_not_support || NULL != token) {
491 i--;
492 continue;
493 }
494 }
495
496 delete [] valid_hzs;
497 utf16_reader.close();
498
499 printf("read succesfully, lemma num: %d\n", lemma_num);
500
501 return lemma_num;
502 }
503
build_dict(const char * fn_raw,const char * fn_validhzs,DictTrie * dict_trie)504 bool DictBuilder::build_dict(const char *fn_raw,
505 const char *fn_validhzs,
506 DictTrie *dict_trie) {
507 if (NULL == fn_raw || NULL == dict_trie)
508 return false;
509
510 lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000);
511 if (0 == lemma_num_)
512 return false;
513
514 // Arrange the spelling table, and build a spelling tree
515 // The size of an spelling. '\0' is included. If the spelling table is
516 // initialized to calculate the spelling scores, the last char in the
517 // spelling string will be score, and it is also included in spl_item_size.
518 size_t spl_item_size;
519 size_t spl_num;
520 const char* spl_buf;
521 spl_buf = spl_table_->arrange(&spl_item_size, &spl_num);
522 if (NULL == spl_buf) {
523 free_resource();
524 return false;
525 }
526
527 SpellingTrie &spl_trie = SpellingTrie::get_instance();
528
529 if (!spl_trie.construct(spl_buf, spl_item_size, spl_num,
530 spl_table_->get_score_amplifier(),
531 spl_table_->get_average_score())) {
532 free_resource();
533 return false;
534 }
535
536 printf("spelling tree construct successfully.\n");
537
538 // Convert the spelling string to idxs
539 for (size_t i = 0; i < lemma_num_; i++) {
540 for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
541 hz_pos++) {
542 uint16 spl_idxs[2];
543 uint16 spl_start_pos[3];
544 bool is_pre = true;
545 int spl_idx_num =
546 spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos],
547 strlen(lemma_arr_[i].pinyin_str[hz_pos]),
548 spl_idxs, spl_start_pos, 2, is_pre);
549 assert(1 == spl_idx_num);
550
551 if (spl_trie.is_half_id(spl_idxs[0])) {
552 uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs);
553 assert(0 != num);
554 }
555 lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0];
556 }
557 }
558
559 // Sort the lemma items according to the hanzi, and give each unique item a
560 // id
561 sort_lemmas_by_hz();
562
563 scis_num_ = build_scis();
564
565 // Construct the dict list
566 dict_trie->dict_list_ = new DictList();
567 bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_,
568 lemma_arr_, lemma_num_);
569 assert(dl_success);
570
571 // Construct the NGram information
572 NGram& ngram = NGram::get_instance();
573 ngram.build_unigram(lemma_arr_, lemma_num_,
574 lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);
575
576 // sort the lemma items according to the spelling idx string
577 myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);
578
579 get_top_lemmas();
580
581 #ifdef ___DO_STATISTICS___
582 stat_init();
583 #endif
584
585 lma_nds_used_num_le0_ = 1; // The root node
586 bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_),
587 lemma_arr_, 0, lemma_num_, 0);
588 if (!dt_success) {
589 free_resource();
590 return false;
591 }
592
593 #ifdef ___DO_STATISTICS___
594 stat_print();
595 #endif
596
597 // Move the node data and homo data to the DictTrie
598 dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_];
599 dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_];
600 size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_;
601 dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize];
602 assert(NULL != dict_trie->root_);
603 assert(NULL != dict_trie->lma_idx_buf_);
604 dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_;
605 dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_;
606 dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize;
607 dict_trie->top_lmas_num_ = top_lmas_num_;
608
609 memcpy(dict_trie->root_, lma_nodes_le0_,
610 sizeof(LmaNodeLE0) * lma_nds_used_num_le0_);
611 memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_,
612 sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_);
613
614 for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) {
615 id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize,
616 homo_idx_buf_[pos]);
617 }
618
619 for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_;
620 pos < lma_idx_num; pos++) {
621 LemmaIdType idx =
622 top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz;
623 id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx);
624 }
625
626 if (kPrintDebug0) {
627 printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_);
628 printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_);
629 printf("top_lmas_num_: %d\n", top_lmas_num_);
630 }
631
632 free_resource();
633
634 if (kPrintDebug0) {
635 printf("Building dict succeds\n");
636 }
637 return dt_success;
638 }
639
id_to_charbuf(unsigned char * buf,LemmaIdType id)640 void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) {
641 if (NULL == buf) return;
642 for (size_t pos = 0; pos < kLemmaIdSize; pos++) {
643 (buf)[pos] = (unsigned char)(id >> (pos * 8));
644 }
645 }
646
set_son_offset(LmaNodeGE1 * node,size_t offset)647 void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) {
648 node->son_1st_off_l = static_cast<uint16>(offset);
649 node->son_1st_off_h = static_cast<unsigned char>(offset >> 16);
650 }
651
set_homo_id_buf_offset(LmaNodeGE1 * node,size_t offset)652 void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) {
653 node->homo_idx_buf_off_l = static_cast<uint16>(offset);
654 node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16);
655
656 }
657
658 // All spelling strings will be converted to upper case, except that
659 // spellings started with "ZH"/"CH"/"SH" will be converted to
660 // "Zh"/"Ch"/"Sh"
format_spelling_str(char * spl_str)661 void DictBuilder::format_spelling_str(char *spl_str) {
662 if (NULL == spl_str)
663 return;
664
665 uint16 pos = 0;
666 while ('\0' != spl_str[pos]) {
667 if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z')
668 spl_str[pos] = spl_str[pos] - 'a' + 'A';
669
670 if (1 == pos && 'H' == spl_str[pos]) {
671 if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) {
672 spl_str[pos] = 'h';
673 }
674 }
675 pos++;
676 }
677 }
678
sort_lemmas_by_hz()679 LemmaIdType DictBuilder::sort_lemmas_by_hz() {
680 if (NULL == lemma_arr_ || 0 == lemma_num_)
681 return 0;
682
683 myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs);
684
685 lemma_arr_[0].idx_by_hz = 1;
686 LemmaIdType idx_max = 1;
687 for (size_t i = 1; i < lemma_num_; i++) {
688 if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) {
689 idx_max++;
690 lemma_arr_[i].idx_by_hz = idx_max;
691 } else {
692 idx_max++;
693 lemma_arr_[i].idx_by_hz = idx_max;
694 }
695 }
696 return idx_max + 1;
697 }
698
build_scis()699 size_t DictBuilder::build_scis() {
700 if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_)
701 return 0;
702
703 SpellingTrie &spl_trie = SpellingTrie::get_instance();
704
705 // This first one is blank, because id 0 is invalid.
706 scis_[0].freq = 0;
707 scis_[0].hz = 0;
708 scis_[0].splid.full_splid = 0;
709 scis_[0].splid.half_splid = 0;
710 scis_num_ = 1;
711
712 // Copy the hanzis to the buffer
713 for (size_t pos = 0; pos < lemma_num_; pos++) {
714 size_t hz_num = lemma_arr_[pos].hz_str_len;
715 for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
716 scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos];
717 scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
718 scis_[scis_num_].splid.half_splid =
719 spl_trie.full_to_half(scis_[scis_num_].splid.full_splid);
720 if (1 == hz_num)
721 scis_[scis_num_].freq = lemma_arr_[pos].freq;
722 else
723 scis_[scis_num_].freq = 0.000001;
724 scis_num_++;
725 }
726 }
727
728 myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq);
729
730 // Remove repeated items
731 size_t unique_scis_num = 1;
732 for (size_t pos = 1; pos < scis_num_; pos++) {
733 if (scis_[pos].hz == scis_[pos - 1].hz &&
734 scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid)
735 continue;
736 scis_[unique_scis_num] = scis_[pos];
737 scis_[unique_scis_num].splid.half_splid =
738 spl_trie.full_to_half(scis_[pos].splid.full_splid);
739 unique_scis_num++;
740 }
741
742 scis_num_ = unique_scis_num;
743
744 // Update the lemma list.
745 for (size_t pos = 0; pos < lemma_num_; pos++) {
746 size_t hz_num = lemma_arr_[pos].hz_str_len;
747 for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
748 SingleCharItem key;
749 key.hz = lemma_arr_[pos].hanzi_str[hzpos];
750 key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
751 key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid);
752
753 SingleCharItem *found;
754 found = static_cast<SingleCharItem*>(mybsearch(&key, scis_,
755 unique_scis_num,
756 sizeof(SingleCharItem),
757 cmp_scis_hz_splid));
758
759 assert(found);
760
761 lemma_arr_[pos].hanzi_scis_ids[hzpos] =
762 static_cast<uint16>(found - scis_);
763 lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid;
764 }
765 }
766
767 return scis_num_;
768 }
769
construct_subset(void * parent,LemmaEntry * lemma_arr,size_t item_start,size_t item_end,size_t level)770 bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr,
771 size_t item_start, size_t item_end,
772 size_t level) {
773 if (level >= kMaxLemmaSize || item_end <= item_start)
774 return false;
775
776 // 1. Scan for how many sons
777 size_t parent_son_num = 0;
778 // LemmaNode *son_1st = NULL;
779 // parent.num_of_son = 0;
780
781 LemmaEntry *lma_last_start = lemma_arr_ + item_start;
782 uint16 spl_idx_node = lma_last_start->spl_idx_arr[level];
783
784 // Scan for how many sons to be allocaed
785 for (size_t i = item_start + 1; i< item_end; i++) {
786 LemmaEntry *lma_current = lemma_arr + i;
787 uint16 spl_idx_current = lma_current->spl_idx_arr[level];
788 if (spl_idx_current != spl_idx_node) {
789 parent_son_num++;
790 spl_idx_node = spl_idx_current;
791 }
792 }
793 parent_son_num++;
794
795 #ifdef ___DO_STATISTICS___
796 // Use to indicate whether all nodes of this layer have no son.
797 bool allson_noson = true;
798
799 assert(level < kMaxLemmaSize);
800 if (parent_son_num > max_sonbuf_len_[level])
801 max_sonbuf_len_[level] = parent_son_num;
802
803 total_son_num_[level] += parent_son_num;
804 total_sonbuf_num_[level] += 1;
805
806 if (parent_son_num == 1)
807 sonbufs_num1_++;
808 else
809 sonbufs_numgt1_++;
810 total_lma_node_num_ += parent_son_num;
811 #endif
812
813 // 2. Update the parent's information
814 // Update the parent's son list;
815 LmaNodeLE0 *son_1st_le0 = NULL; // only one of le0 or ge1 is used
816 LmaNodeGE1 *son_1st_ge1 = NULL; // only one of le0 or ge1 is used.
817 if (0 == level) { // the parent is root
818 (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
819 lma_nds_used_num_le0_;
820 son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_;
821 lma_nds_used_num_le0_ += parent_son_num;
822
823 assert(parent_son_num <= 65535);
824 (static_cast<LmaNodeLE0*>(parent))->num_of_son =
825 static_cast<uint16>(parent_son_num);
826 } else if (1 == level) { // the parent is a son of root
827 (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
828 lma_nds_used_num_ge1_;
829 son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
830 lma_nds_used_num_ge1_ += parent_son_num;
831
832 assert(parent_son_num <= 65535);
833 (static_cast<LmaNodeLE0*>(parent))->num_of_son =
834 static_cast<uint16>(parent_son_num);
835 } else {
836 set_son_offset((static_cast<LmaNodeGE1*>(parent)),
837 lma_nds_used_num_ge1_);
838 son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
839 lma_nds_used_num_ge1_ += parent_son_num;
840
841 assert(parent_son_num <= 255);
842 (static_cast<LmaNodeGE1*>(parent))->num_of_son =
843 (unsigned char)parent_son_num;
844 }
845
846 // 3. Now begin to construct the son one by one
847 size_t son_pos = 0;
848
849 lma_last_start = lemma_arr_ + item_start;
850 spl_idx_node = lma_last_start->spl_idx_arr[level];
851
852 size_t homo_num = 0;
853 if (lma_last_start->spl_idx_arr[level + 1] == 0)
854 homo_num = 1;
855
856 size_t item_start_next = item_start;
857
858 for (size_t i = item_start + 1; i < item_end; i++) {
859 LemmaEntry* lma_current = lemma_arr_ + i;
860 uint16 spl_idx_current = lma_current->spl_idx_arr[level];
861
862 if (spl_idx_current == spl_idx_node) {
863 if (lma_current->spl_idx_arr[level + 1] == 0)
864 homo_num++;
865 } else {
866 // Construct a node
867 LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid
868 LmaNodeGE1 *node_cur_ge1 = NULL;
869 if (0 == level) {
870 node_cur_le0 = son_1st_le0 + son_pos;
871 node_cur_le0->spl_idx = spl_idx_node;
872 node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
873 node_cur_le0->son_1st_off = 0;
874 homo_idx_num_eq1_ += homo_num;
875 } else {
876 node_cur_ge1 = son_1st_ge1 + son_pos;
877 node_cur_ge1->spl_idx = spl_idx_node;
878
879 set_homo_id_buf_offset(node_cur_ge1,
880 (homo_idx_num_eq1_ + homo_idx_num_gt1_));
881 set_son_offset(node_cur_ge1, 0);
882 homo_idx_num_gt1_ += homo_num;
883 }
884
885 if (homo_num > 0) {
886 LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
887 homo_idx_num_gt1_ - homo_num;
888 if (0 == level) {
889 assert(homo_num <= 65535);
890 node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
891 } else {
892 assert(homo_num <= 255);
893 node_cur_ge1->num_of_homo = (unsigned char)homo_num;
894 }
895
896 for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
897 idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz;
898 }
899
900 #ifdef ___DO_STATISTICS___
901 if (homo_num > max_homobuf_len_[level])
902 max_homobuf_len_[level] = homo_num;
903
904 total_homo_num_[level] += homo_num;
905 #endif
906 }
907
908 if (i - item_start_next > homo_num) {
909 void *next_parent;
910 if (0 == level)
911 next_parent = static_cast<void*>(node_cur_le0);
912 else
913 next_parent = static_cast<void*>(node_cur_ge1);
914 construct_subset(next_parent, lemma_arr,
915 item_start_next + homo_num, i, level + 1);
916 #ifdef ___DO_STATISTICS___
917
918 total_node_hasson_[level] += 1;
919 allson_noson = false;
920 #endif
921 }
922
923 // for the next son
924 lma_last_start = lma_current;
925 spl_idx_node = spl_idx_current;
926 item_start_next = i;
927 homo_num = 0;
928 if (lma_current->spl_idx_arr[level + 1] == 0)
929 homo_num = 1;
930
931 son_pos++;
932 }
933 }
934
935 // 4. The last one to construct
936 LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid
937 LmaNodeGE1 *node_cur_ge1 = NULL;
938 if (0 == level) {
939 node_cur_le0 = son_1st_le0 + son_pos;
940 node_cur_le0->spl_idx = spl_idx_node;
941 node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
942 node_cur_le0->son_1st_off = 0;
943 homo_idx_num_eq1_ += homo_num;
944 } else {
945 node_cur_ge1 = son_1st_ge1 + son_pos;
946 node_cur_ge1->spl_idx = spl_idx_node;
947
948 set_homo_id_buf_offset(node_cur_ge1,
949 (homo_idx_num_eq1_ + homo_idx_num_gt1_));
950 set_son_offset(node_cur_ge1, 0);
951 homo_idx_num_gt1_ += homo_num;
952 }
953
954 if (homo_num > 0) {
955 LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
956 homo_idx_num_gt1_ - homo_num;
957 if (0 == level) {
958 assert(homo_num <= 65535);
959 node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
960 } else {
961 assert(homo_num <= 255);
962 node_cur_ge1->num_of_homo = (unsigned char)homo_num;
963 }
964
965 for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
966 idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz;
967 }
968
969 #ifdef ___DO_STATISTICS___
970 if (homo_num > max_homobuf_len_[level])
971 max_homobuf_len_[level] = homo_num;
972
973 total_homo_num_[level] += homo_num;
974 #endif
975 }
976
977 if (item_end - item_start_next > homo_num) {
978 void *next_parent;
979 if (0 == level)
980 next_parent = static_cast<void*>(node_cur_le0);
981 else
982 next_parent = static_cast<void*>(node_cur_ge1);
983 construct_subset(next_parent, lemma_arr,
984 item_start_next + homo_num, item_end, level + 1);
985 #ifdef ___DO_STATISTICS___
986
987 total_node_hasson_[level] += 1;
988 allson_noson = false;
989 #endif
990 }
991
992 #ifdef ___DO_STATISTICS___
993 if (allson_noson) {
994 total_sonbuf_allnoson_[level] += 1;
995 total_node_in_sonbuf_allnoson_[level] += parent_son_num;
996 }
997 #endif
998
999 assert(son_pos + 1 == parent_son_num);
1000 return true;
1001 }
1002
1003 #ifdef ___DO_STATISTICS___
stat_init()1004 void DictBuilder::stat_init() {
1005 memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
1006 memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
1007 memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize);
1008 memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize);
1009 memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize);
1010 memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
1011 memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
1012 memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize);
1013
1014 sonbufs_num1_ = 0;
1015 sonbufs_numgt1_ = 0;
1016 total_lma_node_num_ = 0;
1017 }
1018
stat_print()1019 void DictBuilder::stat_print() {
1020 printf("\n------------STAT INFO-------------\n");
1021 printf("[root is layer -1]\n");
1022 printf(".. max_sonbuf_len per layer(from layer 0):\n ");
1023 for (size_t i = 0; i < kMaxLemmaSize; i++)
1024 printf("%d, ", max_sonbuf_len_[i]);
1025 printf("-, \n");
1026
1027 printf(".. max_homobuf_len per layer:\n -, ");
1028 for (size_t i = 0; i < kMaxLemmaSize; i++)
1029 printf("%d, ", max_homobuf_len_[i]);
1030 printf("\n");
1031
1032 printf(".. total_son_num per layer:\n ");
1033 for (size_t i = 0; i < kMaxLemmaSize; i++)
1034 printf("%d, ", total_son_num_[i]);
1035 printf("-, \n");
1036
1037 printf(".. total_node_hasson per layer:\n 1, ");
1038 for (size_t i = 0; i < kMaxLemmaSize; i++)
1039 printf("%d, ", total_node_hasson_[i]);
1040 printf("\n");
1041
1042 printf(".. total_sonbuf_num per layer:\n ");
1043 for (size_t i = 0; i < kMaxLemmaSize; i++)
1044 printf("%d, ", total_sonbuf_num_[i]);
1045 printf("-, \n");
1046
1047 printf(".. total_sonbuf_allnoson per layer:\n ");
1048 for (size_t i = 0; i < kMaxLemmaSize; i++)
1049 printf("%d, ", total_sonbuf_allnoson_[i]);
1050 printf("-, \n");
1051
1052 printf(".. total_node_in_sonbuf_allnoson per layer:\n ");
1053 for (size_t i = 0; i < kMaxLemmaSize; i++)
1054 printf("%d, ", total_node_in_sonbuf_allnoson_[i]);
1055 printf("-, \n");
1056
1057 printf(".. total_homo_num per layer:\n 0, ");
1058 for (size_t i = 0; i < kMaxLemmaSize; i++)
1059 printf("%d, ", total_homo_num_[i]);
1060 printf("\n");
1061
1062 printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_);
1063 printf(".. son buf allocation number with more than 1 son: %d\n",
1064 sonbufs_numgt1_);
1065 printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1);
1066 }
1067 #endif // ___DO_STATISTICS___
1068
1069 #endif // ___BUILD_MODEL___
1070 } // namespace ime_pinyin
1071