1 /*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "../include/userdict.h"
18 #include "../include/splparser.h"
19 #include "../include/ngram.h"
20 #include <stdio.h>
21 #include <string.h>
22 #include <stdlib.h>
23 #include <cutils/log.h>
24 #include <unistd.h>
25 #include <fcntl.h>
26 #include <sys/stat.h>
27 #include <assert.h>
28 #include <ctype.h>
29 #include <sys/types.h>
30 #include <sys/time.h>
31 #include <time.h>
32 #include <pthread.h>
33 #include <math.h>
34
35 namespace ime_pinyin {
36
37 #ifdef ___DEBUG_PERF___
38 static uint64 _ellapse_ = 0;
39 static struct timeval _tv_start_, _tv_end_;
40 #define DEBUG_PERF_BEGIN \
41 do { \
42 gettimeofday(&_tv_start_, NULL); \
43 } while(0)
44 #define DEBUG_PERF_END \
45 do { \
46 gettimeofday(&_tv_end_, NULL); \
47 _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
48 (_tv_end_.tv_usec - _tv_start_.tv_usec); \
49 } while(0)
50 #define LOGD_PERF(message) \
51 ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
52 #else
53 #define DEBUG_PERF_BEGIN
54 #define DEBUG_PERF_END
55 #define LOGD_PERF(message)
56 #endif
57
58 // XXX File load and write are thread-safe by g_mutex_
59 static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
60 static struct timeval g_last_update_ = {0, 0};
61
get_dict_file_size(UserDictInfo * info)62 inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
63 return (4 + info->lemma_size + (info->lemma_count << 3)
64 #ifdef ___PREDICT_ENABLED___
65 + (info->lemma_count << 2)
66 #endif
67 #ifdef ___SYNC_ENABLED___
68 + (info->sync_count << 2)
69 #endif
70 + sizeof(*info));
71 }
72
translate_score(int raw_score)73 inline LmaScoreType UserDict::translate_score(int raw_score) {
74 // 1) ori_freq: original user frequency
75 uint32 ori_freq = extract_score_freq(raw_score);
76 // 2) lmt_off: lmt index (week offset for example)
77 uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
78 if (kUserDictLMTBitWidth < 16) {
79 uint64 mask = ~(1 << kUserDictLMTBitWidth);
80 lmt_off &= mask;
81 }
82 // 3) now_off: current time index (current week offset for example)
83 // assuming load_time_ is around current time
84 uint64 now_off = load_time_.tv_sec;
85 now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
86 now_off = (now_off << (64 - kUserDictLMTBitWidth));
87 now_off = (now_off >> (64 - kUserDictLMTBitWidth));
88 // 4) factor: decide expand-factor
89 int delta = now_off - lmt_off;
90 if (delta > 4)
91 delta = 4;
92 int factor = 80 - (delta << 4);
93
94 double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
95 return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
96 * NGram::kLogValueAmplifier);
97 }
98
extract_score_freq(int raw_score)99 inline int UserDict::extract_score_freq(int raw_score) {
100 // Frequence stored in lowest 16 bits
101 int freq = (raw_score & 0x0000ffff);
102 return freq;
103 }
104
extract_score_lmt(int raw_score)105 inline uint64 UserDict::extract_score_lmt(int raw_score) {
106 uint64 lmt = ((raw_score & 0xffff0000) >> 16);
107 if (kUserDictLMTBitWidth < 16) {
108 uint64 mask = ~(1 << kUserDictLMTBitWidth);
109 lmt &= mask;
110 }
111 lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
112 return lmt;
113 }
114
build_score(uint64 lmt,int freq)115 inline int UserDict::build_score(uint64 lmt, int freq) {
116 lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
117 lmt = (lmt << (64 - kUserDictLMTBitWidth));
118 lmt = (lmt >> (64 - kUserDictLMTBitWidth));
119 uint16 lmt16 = (uint16)lmt;
120 int s = freq;
121 s &= 0x0000ffff;
122 s = (lmt16 << 16) | s;
123 return s;
124 }
125
utf16le_atoll(uint16 * s,int len)126 inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
127 int64 ret = 0;
128 if (len <= 0)
129 return ret;
130
131 int flag = 1;
132 const uint16 * endp = s + len;
133 if (*s == '-') {
134 flag = -1;
135 s++;
136 } else if (*s == '+') {
137 s++;
138 }
139
140 while (*s >= '0' && *s <= '9' && s < endp) {
141 ret += ret * 10 + (*s) - '0';
142 s++;
143 }
144 return ret * flag;
145 }
146
utf16le_lltoa(int64 v,uint16 * s,int size)147 inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
148 if (!s || size <= 0)
149 return 0;
150 uint16 *endp = s + size;
151 int ret_len = 0;
152 if (v < 0) {
153 *(s++) = '-';
154 ++ret_len;
155 v *= -1;
156 }
157
158 uint16 *b = s;
159 while (s < endp && v != 0) {
160 *(s++) = '0' + (v % 10);
161 v = v / 10;
162 ++ret_len;
163 }
164
165 if (v != 0)
166 return 0;
167
168 --s;
169
170 while (b < s) {
171 *b = *s;
172 ++b, --s;
173 }
174
175 return ret_len;
176 }
177
set_lemma_flag(uint32 offset,uint8 flag)178 inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
179 offset &= kUserDictOffsetMask;
180 lemmas_[offset] |= flag;
181 }
182
get_lemma_flag(uint32 offset)183 inline char UserDict::get_lemma_flag(uint32 offset) {
184 offset &= kUserDictOffsetMask;
185 return (char)(lemmas_[offset]);
186 }
187
get_lemma_nchar(uint32 offset)188 inline char UserDict::get_lemma_nchar(uint32 offset) {
189 offset &= kUserDictOffsetMask;
190 return (char)(lemmas_[offset + 1]);
191 }
192
get_lemma_spell_ids(uint32 offset)193 inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
194 offset &= kUserDictOffsetMask;
195 return (uint16 *)(lemmas_ + offset + 2);
196 }
197
get_lemma_word(uint32 offset)198 inline uint16 * UserDict::get_lemma_word(uint32 offset) {
199 offset &= kUserDictOffsetMask;
200 uint8 nchar = get_lemma_nchar(offset);
201 return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
202 }
203
get_max_lemma_id()204 inline LemmaIdType UserDict::get_max_lemma_id() {
205 // When a lemma is deleted, we don't not claim its id back for
206 // simplicity and performance
207 return start_id_ + dict_info_.lemma_count - 1;
208 }
209
is_valid_lemma_id(LemmaIdType id)210 inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
211 if (id >= start_id_ && id <= get_max_lemma_id())
212 return true;
213 return false;
214 }
215
is_valid_state()216 inline bool UserDict::is_valid_state() {
217 if (state_ == USER_DICT_NONE)
218 return false;
219 return true;
220 }
221
UserDict()222 UserDict::UserDict()
223 : start_id_(0),
224 version_(0),
225 lemmas_(NULL),
226 offsets_(NULL),
227 scores_(NULL),
228 ids_(NULL),
229 #ifdef ___PREDICT_ENABLED___
230 predicts_(NULL),
231 #endif
232 #ifdef ___SYNC_ENABLED___
233 syncs_(NULL),
234 sync_count_size_(0),
235 #endif
236 offsets_by_id_(NULL),
237 lemma_count_left_(0),
238 lemma_size_left_(0),
239 dict_file_(NULL),
240 state_(USER_DICT_NONE) {
241 memset(&dict_info_, 0, sizeof(dict_info_));
242 memset(&load_time_, 0, sizeof(load_time_));
243 #ifdef ___CACHE_ENABLED___
244 cache_init();
245 #endif
246 }
247
~UserDict()248 UserDict::~UserDict() {
249 close_dict();
250 }
251
load_dict(const char * file_name,LemmaIdType start_id,LemmaIdType end_id)252 bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
253 LemmaIdType end_id) {
254 #ifdef ___DEBUG_PERF___
255 DEBUG_PERF_BEGIN;
256 #endif
257 dict_file_ = strdup(file_name);
258 if (!dict_file_)
259 return false;
260
261 start_id_ = start_id;
262
263 if (false == validate(file_name) && false == reset(file_name)) {
264 goto error;
265 }
266 if (false == load(file_name, start_id)) {
267 goto error;
268 }
269
270 state_ = USER_DICT_SYNC;
271
272 gettimeofday(&load_time_, NULL);
273
274 #ifdef ___DEBUG_PERF___
275 DEBUG_PERF_END;
276 LOGD_PERF("load_dict");
277 #endif
278 return true;
279 error:
280 free((void*)dict_file_);
281 start_id_ = 0;
282 return false;
283 }
284
close_dict()285 bool UserDict::close_dict() {
286 if (state_ == USER_DICT_NONE)
287 return true;
288 if (state_ == USER_DICT_SYNC)
289 goto out;
290
291 // If dictionary is written back by others,
292 // we can not simply write back here
293 // To do a safe flush, we have to discard all newly added
294 // lemmas and try to reload dict file.
295 pthread_mutex_lock(&g_mutex_);
296 if (load_time_.tv_sec > g_last_update_.tv_sec ||
297 (load_time_.tv_sec == g_last_update_.tv_sec &&
298 load_time_.tv_usec > g_last_update_.tv_usec)) {
299 write_back();
300 gettimeofday(&g_last_update_, NULL);
301 }
302 pthread_mutex_unlock(&g_mutex_);
303
304 out:
305 free((void*)dict_file_);
306 free(lemmas_);
307 free(offsets_);
308 free(offsets_by_id_);
309 free(scores_);
310 free(ids_);
311 #ifdef ___PREDICT_ENABLED___
312 free(predicts_);
313 #endif
314
315 version_ = 0;
316 dict_file_ = NULL;
317 lemmas_ = NULL;
318 #ifdef ___SYNC_ENABLED___
319 syncs_ = NULL;
320 sync_count_size_ = 0;
321 #endif
322 offsets_ = NULL;
323 offsets_by_id_ = NULL;
324 scores_ = NULL;
325 ids_ = NULL;
326 #ifdef ___PREDICT_ENABLED___
327 predicts_ = NULL;
328 #endif
329
330 memset(&dict_info_, 0, sizeof(dict_info_));
331 lemma_count_left_ = 0;
332 lemma_size_left_ = 0;
333 state_ = USER_DICT_NONE;
334
335 return true;
336 }
337
number_of_lemmas()338 size_t UserDict::number_of_lemmas() {
339 return dict_info_.lemma_count;
340 }
341
reset_milestones(uint16 from_step,MileStoneHandle from_handle)342 void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
343 return;
344 }
345
extend_dict(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)346 MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
347 const DictExtPara *dep,
348 LmaPsbItem *lpi_items,
349 size_t lpi_max, size_t *lpi_num) {
350 if (is_valid_state() == false)
351 return 0;
352
353 bool need_extend = false;
354
355 #ifdef ___DEBUG_PERF___
356 DEBUG_PERF_BEGIN;
357 #endif
358 *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
359 lpi_items, lpi_max, &need_extend);
360 #ifdef ___DEBUG_PERF___
361 DEBUG_PERF_END;
362 LOGD_PERF("extend_dict");
363 #endif
364 return ((*lpi_num > 0 || need_extend) ? 1 : 0);
365 }
366
is_fuzzy_prefix_spell_id(const uint16 * id1,uint16 len1,const UserDictSearchable * searchable)367 int UserDict::is_fuzzy_prefix_spell_id(
368 const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
369 if (len1 < searchable->splids_len)
370 return 0;
371
372 SpellingTrie &spl_trie = SpellingTrie::get_instance();
373 uint32 i = 0;
374 for (i = 0; i < searchable->splids_len; i++) {
375 const char py1 = *spl_trie.get_spelling_str(id1[i]);
376 uint16 off = 8 * (i % 4);
377 const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
378 if (py1 == py2)
379 continue;
380 return 0;
381 }
382 return 1;
383 }
384
fuzzy_compare_spell_id(const uint16 * id1,uint16 len1,const UserDictSearchable * searchable)385 int UserDict::fuzzy_compare_spell_id(
386 const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
387 if (len1 < searchable->splids_len)
388 return -1;
389 if (len1 > searchable->splids_len)
390 return 1;
391
392 SpellingTrie &spl_trie = SpellingTrie::get_instance();
393 uint32 i = 0;
394 for (i = 0; i < len1; i++) {
395 const char py1 = *spl_trie.get_spelling_str(id1[i]);
396 uint16 off = 8 * (i % 4);
397 const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
398 if (py1 == py2)
399 continue;
400 if (py1 > py2)
401 return 1;
402 return -1;
403 }
404 return 0;
405 }
406
is_prefix_spell_id(const uint16 * fullids,uint16 fulllen,const UserDictSearchable * searchable)407 bool UserDict::is_prefix_spell_id(
408 const uint16 * fullids, uint16 fulllen,
409 const UserDictSearchable *searchable) {
410 if (fulllen < searchable->splids_len)
411 return false;
412
413 uint32 i = 0;
414 for (; i < searchable->splids_len; i++) {
415 uint16 start_id = searchable->splid_start[i];
416 uint16 count = searchable->splid_count[i];
417 if (fullids[i] >= start_id && fullids[i] < start_id + count)
418 continue;
419 else
420 return false;
421 }
422 return true;
423 }
424
equal_spell_id(const uint16 * fullids,uint16 fulllen,const UserDictSearchable * searchable)425 bool UserDict::equal_spell_id(
426 const uint16 * fullids, uint16 fulllen,
427 const UserDictSearchable *searchable) {
428 if (fulllen != searchable->splids_len)
429 return false;
430
431 uint32 i = 0;
432 for (; i < fulllen; i++) {
433 uint16 start_id = searchable->splid_start[i];
434 uint16 count = searchable->splid_count[i];
435 if (fullids[i] >= start_id && fullids[i] < start_id + count)
436 continue;
437 else
438 return false;
439 }
440 return true;
441 }
442
locate_first_in_offsets(const UserDictSearchable * searchable)443 int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
444 int32 begin = 0;
445 int32 end = dict_info_.lemma_count - 1;
446 int32 middle = -1;
447
448 int32 first_prefix = middle;
449 int32 last_matched = middle;
450
451 while (begin <= end) {
452 middle = (begin + end) >> 1;
453 uint32 offset = offsets_[middle];
454 uint8 nchar = get_lemma_nchar(offset);
455 const uint16 * splids = get_lemma_spell_ids(offset);
456 int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
457 int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);
458
459 if (pre)
460 first_prefix = middle;
461
462 if (cmp < 0) {
463 begin = middle + 1;
464 } else if (cmp > 0) {
465 end = middle - 1;
466 } else {
467 end = middle - 1;
468 last_matched = middle;
469 }
470 }
471
472 return first_prefix;
473 }
474
prepare_locate(UserDictSearchable * searchable,const uint16 * splid_str,uint16 splid_str_len)475 void UserDict::prepare_locate(UserDictSearchable *searchable,
476 const uint16 *splid_str,
477 uint16 splid_str_len) {
478 searchable->splids_len = splid_str_len;
479 memset(searchable->signature, 0, sizeof(searchable->signature));
480
481 SpellingTrie &spl_trie = SpellingTrie::get_instance();
482 uint32 i = 0;
483 for (; i < splid_str_len; i++) {
484 if (spl_trie.is_half_id(splid_str[i])) {
485 searchable->splid_count[i] =
486 spl_trie.half_to_full(splid_str[i],
487 &(searchable->splid_start[i]));
488 } else {
489 searchable->splid_count[i] = 1;
490 searchable->splid_start[i] = splid_str[i];
491 }
492 const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
493 searchable->signature[i>>2] |= (py << (8 * (i % 4)));
494 }
495 }
496
get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lpi_items,size_t lpi_max)497 size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
498 LmaPsbItem *lpi_items, size_t lpi_max) {
499 return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
500 }
501
_get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lpi_items,size_t lpi_max,bool * need_extend)502 size_t UserDict::_get_lpis(const uint16 *splid_str,
503 uint16 splid_str_len, LmaPsbItem *lpi_items,
504 size_t lpi_max, bool * need_extend) {
505 bool tmp_extend;
506 if (!need_extend)
507 need_extend = &tmp_extend;
508
509 *need_extend = false;
510
511 if (is_valid_state() == false)
512 return 0;
513 if (lpi_max <= 0)
514 return 0;
515
516 if (0 == pthread_mutex_trylock(&g_mutex_)) {
517 if (load_time_.tv_sec < g_last_update_.tv_sec ||
518 (load_time_.tv_sec == g_last_update_.tv_sec &&
519 load_time_.tv_usec < g_last_update_.tv_usec)) {
520 // Others updated disk file, have to reload
521 pthread_mutex_unlock(&g_mutex_);
522 flush_cache();
523 } else {
524 pthread_mutex_unlock(&g_mutex_);
525 }
526 } else {
527 }
528
529 UserDictSearchable searchable;
530 prepare_locate(&searchable, splid_str, splid_str_len);
531
532 uint32 max_off = dict_info_.lemma_count;
533 #ifdef ___CACHE_ENABLED___
534 int32 middle;
535 uint32 start, count;
536 bool cached = cache_hit(&searchable, &start, &count);
537 if (cached) {
538 middle = start;
539 max_off = start + count;
540 } else {
541 middle = locate_first_in_offsets(&searchable);
542 start = middle;
543 }
544 #else
545 int32 middle = locate_first_in_offsets(&searchable);
546 #endif
547
548 if (middle == -1) {
549 #ifdef ___CACHE_ENABLED___
550 if (!cached)
551 cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
552 #endif
553 return 0;
554 }
555
556 size_t lpi_current = 0;
557
558 bool fuzzy_break = false;
559 bool prefix_break = false;
560 while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
561 if (lpi_current >= lpi_max)
562 break;
563 uint32 offset = offsets_[middle];
564 // Ignore deleted lemmas
565 if (offset & kUserDictOffsetFlagRemove) {
566 middle++;
567 continue;
568 }
569 uint8 nchar = get_lemma_nchar(offset);
570 uint16 * splids = get_lemma_spell_ids(offset);
571 #ifdef ___CACHE_ENABLED___
572 if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
573 #else
574 if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
575 #endif
576 fuzzy_break = true;
577 }
578
579 if (prefix_break == false) {
580 if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
581 if (*need_extend == false &&
582 is_prefix_spell_id(splids, nchar, &searchable)) {
583 *need_extend = true;
584 }
585 } else {
586 prefix_break = true;
587 }
588 }
589
590 if (equal_spell_id(splids, nchar, &searchable) == true) {
591 lpi_items[lpi_current].psb = translate_score(scores_[middle]);
592 lpi_items[lpi_current].id = ids_[middle];
593 lpi_items[lpi_current].lma_len = nchar;
594 lpi_current++;
595 }
596 middle++;
597 }
598
599 #ifdef ___CACHE_ENABLED___
600 if (!cached) {
601 count = middle - start;
602 cache_push(USER_DICT_CACHE, &searchable, start, count);
603 }
604 #endif
605
606 return lpi_current;
607 }
608
609 uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
610 uint16 str_max) {
611 if (is_valid_state() == false)
612 return 0;
613 if (is_valid_lemma_id(id_lemma) == false)
614 return 0;
615 uint32 offset = offsets_by_id_[id_lemma - start_id_];
616 uint8 nchar = get_lemma_nchar(offset);
617 char16 * str = get_lemma_word(offset);
618 uint16 m = nchar < str_max -1 ? nchar : str_max - 1;
619 int i = 0;
620 for (; i < m; i++) {
621 str_buf[i] = str[i];
622 }
623 str_buf[i] = 0;
624 return m;
625 }
626
627 uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
628 uint16 splids_max, bool arg_valid) {
629 if (is_valid_lemma_id(id_lemma) == false)
630 return 0;
631 uint32 offset = offsets_by_id_[id_lemma - start_id_];
632 uint8 nchar = get_lemma_nchar(offset);
633 const uint16 * ids = get_lemma_spell_ids(offset);
634 int i = 0;
635 for (; i < nchar && i < splids_max; i++)
636 splids[i] = ids[i];
637 return i;
638 }
639
640 size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
641 NPredictItem *npre_items, size_t npre_max,
642 size_t b4_used) {
643 uint32 new_added = 0;
644 #ifdef ___PREDICT_ENABLED___
645 int32 end = dict_info_.lemma_count - 1;
646 int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
647 if (j == -1)
648 return 0;
649
650 while (j <= end) {
651 uint32 offset = predicts_[j];
652 // Ignore deleted lemmas
653 if (offset & kUserDictOffsetFlagRemove) {
654 j++;
655 continue;
656 }
657 uint32 nchar = get_lemma_nchar(offset);
658 uint16 * words = get_lemma_word(offset);
659 uint16 * splids = get_lemma_spell_ids(offset);
660
661 if (nchar <= hzs_len) {
662 j++;
663 continue;
664 }
665
666 if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
667 if (new_added >= npre_max) {
668 return new_added;
669 }
670 uint32 cpy_len =
671 (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
672 - (hzs_len << 1);
673 npre_items[new_added].his_len = hzs_len;
674 npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
675 memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
676 if ((cpy_len >> 1) < kMaxPredictSize) {
677 npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
678 }
679 new_added++;
680 } else {
681 break;
682 }
683
684 j++;
685 }
686 #endif
687 return new_added;
688 }
689
690 int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
691 uint16 lemma_len) {
692 int32 max_off = dict_info_.lemma_count;
693
694 UserDictSearchable searchable;
695 prepare_locate(&searchable, splid_str, lemma_len);
696 #ifdef ___CACHE_ENABLED___
697 int32 off;
698 uint32 start, count;
699 bool cached = load_cache(&searchable, &start, &count);
700 if (cached) {
701 off = start;
702 max_off = start + count;
703 } else {
704 off = locate_first_in_offsets(&searchable);
705 start = off;
706 }
707 #else
708 int32 off = locate_first_in_offsets(&searchable);
709 #endif
710
711 if (off == -1) {
712 return off;
713 }
714
715 while (off < max_off) {
716 uint32 offset = offsets_[off];
717 if (offset & kUserDictOffsetFlagRemove) {
718 off++;
719 continue;
720 }
721 uint16 * splids = get_lemma_spell_ids(offset);
722 #ifdef ___CACHE_ENABLED___
723 if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
724 break;
725 #else
726 if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
727 break;
728 #endif
729 if (equal_spell_id(splids, lemma_len, &searchable) == true) {
730 uint16 * str = get_lemma_word(offset);
731 uint32 i = 0;
732 for (i = 0; i < lemma_len; i++) {
733 if (str[i] == lemma_str[i])
734 continue;
735 break;
736 }
737 if (i < lemma_len) {
738 off++;
739 continue;
740 }
741 #ifdef ___CACHE_ENABLED___
742 // No need to save_cache here, since current function is invoked by
743 // put_lemma. It's rarely possible for a user input same lemma twice.
744 // That means first time user type a new lemma, it is newly added into
745 // user dictionary, then it's possible that user type the same lemma
746 // again.
747 // Another reason save_cache can not be invoked here is this function
748 // aborts when lemma is found, and it never knows the count.
749 #endif
750 return off;
751 }
752 off++;
753 }
754
755 return -1;
756 }
757
758 #ifdef ___PREDICT_ENABLED___
759 uint32 UserDict::locate_where_to_insert_in_predicts(
760 const uint16 * words, int lemma_len) {
761 int32 begin = 0;
762 int32 end = dict_info_.lemma_count - 1;
763 int32 middle = end;
764
765 uint32 last_matched = middle;
766
767 while (begin <= end) {
768 middle = (begin + end) >> 1;
769 uint32 offset = offsets_[middle];
770 uint8 nchar = get_lemma_nchar(offset);
771 const uint16 * ws = get_lemma_word(offset);
772
773 uint32 minl = nchar < lemma_len ? nchar : lemma_len;
774 uint32 k = 0;
775 int cmp = 0;
776
777 for (; k < minl; k++) {
778 if (ws[k] < words[k]) {
779 cmp = -1;
780 break;
781 } else if (ws[k] > words[k]) {
782 cmp = 1;
783 break;
784 }
785 }
786 if (cmp == 0) {
787 if (nchar < lemma_len)
788 cmp = -1;
789 else if (nchar > lemma_len)
790 cmp = 1;
791 }
792
793 if (cmp < 0) {
794 begin = middle + 1;
795 last_matched = middle;
796 } else if (cmp > 0) {
797 end = middle - 1;
798 } else {
799 end = middle - 1;
800 last_matched = middle;
801 }
802 }
803
804 return last_matched;
805 }
806
807 int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
808 int32 begin = 0;
809 int32 end = dict_info_.lemma_count - 1;
810 int32 middle = -1;
811
812 int32 last_matched = middle;
813
814 while (begin <= end) {
815 middle = (begin + end) >> 1;
816 uint32 offset = offsets_[middle];
817 uint8 nchar = get_lemma_nchar(offset);
818 const uint16 * ws = get_lemma_word(offset);
819
820 uint32 minl = nchar < lemma_len ? nchar : lemma_len;
821 uint32 k = 0;
822 int cmp = 0;
823
824 for (; k < minl; k++) {
825 if (ws[k] < words[k]) {
826 cmp = -1;
827 break;
828 } else if (ws[k] > words[k]) {
829 cmp = 1;
830 break;
831 }
832 }
833 if (cmp == 0) {
834 if (nchar >= lemma_len)
835 last_matched = middle;
836 if (nchar < lemma_len)
837 cmp = -1;
838 else if (nchar > lemma_len)
839 cmp = 1;
840 }
841
842 if (cmp < 0) {
843 begin = middle + 1;
844 } else if (cmp > 0) {
845 end = middle - 1;
846 } else {
847 end = middle - 1;
848 }
849 }
850
851 return last_matched;
852 }
853
854 #endif
855
856 LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
857 uint16 lemma_len) {
858 int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
859 if (off == -1) {
860 return 0;
861 }
862
863 return ids_[off];
864 }
865
866 LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
867 if (is_valid_state() == false)
868 return 0;
869 if (is_valid_lemma_id(lemma_id) == false)
870 return 0;
871
872 return translate_score(_get_lemma_score(lemma_id));
873 }
874
875 LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
876 uint16 lemma_len) {
877 if (is_valid_state() == false)
878 return 0;
879 return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
880 }
881
882 int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
883 if (is_valid_state() == false)
884 return 0;
885 if (is_valid_lemma_id(lemma_id) == false)
886 return 0;
887
888 uint32 offset = offsets_by_id_[lemma_id - start_id_];
889
890 uint32 nchar = get_lemma_nchar(offset);
891 uint16 * spl = get_lemma_spell_ids(offset);
892 uint16 * wrd = get_lemma_word(offset);
893
894 int32 off = locate_in_offsets(wrd, spl, nchar);
895 if (off == -1) {
896 return 0;
897 }
898
899 return scores_[off];
900 }
901
902 int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
903 uint16 lemma_len) {
904 if (is_valid_state() == false)
905 return 0;
906
907 int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
908 if (off == -1) {
909 return 0;
910 }
911
912 return scores_[off];
913 }
914
915 #ifdef ___SYNC_ENABLED___
916 void UserDict::remove_lemma_from_sync_list(uint32 offset) {
917 offset &= kUserDictOffsetMask;
918 uint32 i = 0;
919 for (; i < dict_info_.sync_count; i++) {
920 unsigned int off = (syncs_[i] & kUserDictOffsetMask);
921 if (off == offset)
922 break;
923 }
924 if (i < dict_info_.sync_count) {
925 syncs_[i] = syncs_[dict_info_.sync_count - 1];
926 dict_info_.sync_count--;
927 }
928 }
929 #endif
930
931 #ifdef ___PREDICT_ENABLED___
932 void UserDict::remove_lemma_from_predict_list(uint32 offset) {
933 offset &= kUserDictOffsetMask;
934 uint32 i = 0;
935 for (; i < dict_info_.lemma_count; i++) {
936 unsigned int off = (predicts_[i] & kUserDictOffsetMask);
937 if (off == offset) {
938 predicts_[i] |= kUserDictOffsetFlagRemove;
939 break;
940 }
941 }
942 }
943 #endif
944
945 bool UserDict::remove_lemma_by_offset_index(int offset_index) {
946 if (is_valid_state() == false)
947 return 0;
948
949 int32 off = offset_index;
950 if (off == -1) {
951 return false;
952 }
953
954 uint32 offset = offsets_[off];
955 uint32 nchar = get_lemma_nchar(offset);
956
957 offsets_[off] |= kUserDictOffsetFlagRemove;
958
959 #ifdef ___SYNC_ENABLED___
960 // Remove corresponding sync item
961 remove_lemma_from_sync_list(offset);
962 #endif
963
964 #ifdef ___PREDICT_ENABLED___
965 remove_lemma_from_predict_list(offset);
966 #endif
967 dict_info_.free_count++;
968 dict_info_.free_size += (2 + (nchar << 2));
969
970 if (state_ < USER_DICT_OFFSET_DIRTY)
971 state_ = USER_DICT_OFFSET_DIRTY;
972 return true;
973 }
974
975 bool UserDict::remove_lemma(LemmaIdType lemma_id) {
976 if (is_valid_state() == false)
977 return 0;
978 if (is_valid_lemma_id(lemma_id) == false)
979 return false;
980 uint32 offset = offsets_by_id_[lemma_id - start_id_];
981
982 uint32 nchar = get_lemma_nchar(offset);
983 uint16 * spl = get_lemma_spell_ids(offset);
984 uint16 * wrd = get_lemma_word(offset);
985
986 int32 off = locate_in_offsets(wrd, spl, nchar);
987
988 return remove_lemma_by_offset_index(off);
989 }
990
991 void UserDict::flush_cache() {
992 LemmaIdType start_id = start_id_;
993 const char * file = strdup(dict_file_);
994 if (!file)
995 return;
996 close_dict();
997 load_dict(file, start_id, kUserDictIdEnd);
998 free((void*)file);
999 #ifdef ___CACHE_ENABLED___
1000 cache_init();
1001 #endif
1002 return;
1003 }
1004
1005 bool UserDict::reset(const char *file) {
1006 FILE *fp = fopen(file, "w+");
1007 if (!fp) {
1008 return false;
1009 }
1010 uint32 version = kUserDictVersion;
1011 size_t wred = fwrite(&version, 1, 4, fp);
1012 UserDictInfo info;
1013 memset(&info, 0, sizeof(info));
1014 // By default, no limitation for lemma count and size
1015 // thereby, reclaim_ratio is never used
1016 wred += fwrite(&info, 1, sizeof(info), fp);
1017 if (wred != sizeof(info) + sizeof(version)) {
1018 fclose(fp);
1019 unlink(file);
1020 return false;
1021 }
1022 fclose(fp);
1023 return true;
1024 }
1025
1026 bool UserDict::validate(const char *file) {
1027 // b is ignored in POSIX compatible os including Linux
1028 // while b is important flag for Windows to specify binary mode
1029 FILE *fp = fopen(file, "rb");
1030 if (!fp) {
1031 return false;
1032 }
1033
1034 size_t size;
1035 size_t readed;
1036 uint32 version;
1037 UserDictInfo dict_info;
1038
1039 // validate
1040 int err = fseek(fp, 0, SEEK_END);
1041 if (err) {
1042 goto error;
1043 }
1044
1045 size = ftell(fp);
1046 if (size < 4 + sizeof(dict_info)) {
1047 goto error;
1048 }
1049
1050 err = fseek(fp, 0, SEEK_SET);
1051 if (err) {
1052 goto error;
1053 }
1054
1055 readed = fread(&version, 1, sizeof(version), fp);
1056 if (readed < sizeof(version)) {
1057 goto error;
1058 }
1059 if (version != kUserDictVersion) {
1060 goto error;
1061 }
1062
1063 err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1064 if (err) {
1065 goto error;
1066 }
1067
1068 readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1069 if (readed != sizeof(dict_info)) {
1070 goto error;
1071 }
1072
1073 if (size != get_dict_file_size(&dict_info)) {
1074 goto error;
1075 }
1076
1077 fclose(fp);
1078 return true;
1079
1080 error:
1081 fclose(fp);
1082 return false;
1083 }
1084
1085 bool UserDict::load(const char *file, LemmaIdType start_id) {
1086 if (0 != pthread_mutex_trylock(&g_mutex_)) {
1087 return false;
1088 }
1089 // b is ignored in POSIX compatible os including Linux
1090 // while b is important flag for Windows to specify binary mode
1091 FILE *fp = fopen(file, "rb");
1092 if (!fp) {
1093 pthread_mutex_unlock(&g_mutex_);
1094 return false;
1095 }
1096
1097 size_t readed, toread;
1098 UserDictInfo dict_info;
1099 uint8 *lemmas = NULL;
1100 uint32 *offsets = NULL;
1101 #ifdef ___SYNC_ENABLED___
1102 uint32 *syncs = NULL;
1103 #endif
1104 uint32 *scores = NULL;
1105 uint32 *ids = NULL;
1106 uint32 *offsets_by_id = NULL;
1107 #ifdef ___PREDICT_ENABLED___
1108 uint32 *predicts = NULL;
1109 #endif
1110 size_t i;
1111 int err;
1112
1113 err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1114 if (err) goto error;
1115
1116 readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1117 if (readed != sizeof(dict_info)) goto error;
1118
1119 lemmas = (uint8 *)malloc(
1120 dict_info.lemma_size +
1121 (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));
1122
1123 if (!lemmas) goto error;
1124
1125 offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1126 if (!offsets) goto error;
1127
1128 #ifdef ___PREDICT_ENABLED___
1129 predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1130 if (!predicts) goto error;
1131 #endif
1132
1133 #ifdef ___SYNC_ENABLED___
1134 syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
1135 if (!syncs) goto error;
1136 #endif
1137
1138 scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1139 if (!scores) goto error;
1140
1141 ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1142 if (!ids) goto error;
1143
1144 offsets_by_id = (uint32 *)malloc(
1145 (dict_info.lemma_count + kUserDictPreAlloc) << 2);
1146 if (!offsets_by_id) goto error;
1147
1148 err = fseek(fp, 4, SEEK_SET);
1149 if (err) goto error;
1150
1151 readed = 0;
1152 while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
1153 readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
1154 }
1155 if (readed < dict_info.lemma_size)
1156 goto error;
1157
1158 toread = (dict_info.lemma_count << 2);
1159 readed = 0;
1160 while (readed < toread && !ferror(fp) && !feof(fp)) {
1161 readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
1162 }
1163 if (readed < toread)
1164 goto error;
1165
1166 #ifdef ___PREDICT_ENABLED___
1167 toread = (dict_info.lemma_count << 2);
1168 readed = 0;
1169 while (readed < toread && !ferror(fp) && !feof(fp)) {
1170 readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
1171 }
1172 if (readed < toread)
1173 goto error;
1174 #endif
1175
1176 readed = 0;
1177 while (readed < toread && !ferror(fp) && !feof(fp)) {
1178 readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
1179 }
1180 if (readed < toread)
1181 goto error;
1182
1183 #ifdef ___SYNC_ENABLED___
1184 toread = (dict_info.sync_count << 2);
1185 readed = 0;
1186 while (readed < toread && !ferror(fp) && !feof(fp)) {
1187 readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
1188 }
1189 if (readed < toread)
1190 goto error;
1191 #endif
1192
1193 for (i = 0; i < dict_info.lemma_count; i++) {
1194 ids[i] = start_id + i;
1195 offsets_by_id[i] = offsets[i];
1196 }
1197
1198 lemmas_ = lemmas;
1199 offsets_ = offsets;
1200 #ifdef ___SYNC_ENABLED___
1201 syncs_ = syncs;
1202 sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
1203 #endif
1204 offsets_by_id_ = offsets_by_id;
1205 scores_ = scores;
1206 ids_ = ids;
1207 #ifdef ___PREDICT_ENABLED___
1208 predicts_ = predicts;
1209 #endif
1210 lemma_count_left_ = kUserDictPreAlloc;
1211 lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
1212 memcpy(&dict_info_, &dict_info, sizeof(dict_info));
1213 state_ = USER_DICT_SYNC;
1214
1215 fclose(fp);
1216
1217 pthread_mutex_unlock(&g_mutex_);
1218 return true;
1219
1220 error:
1221 if (lemmas) free(lemmas);
1222 if (offsets) free(offsets);
1223 #ifdef ___SYNC_ENABLED___
1224 if (syncs) free(syncs);
1225 #endif
1226 if (scores) free(scores);
1227 if (ids) free(ids);
1228 if (offsets_by_id) free(offsets_by_id);
1229 #ifdef ___PREDICT_ENABLED___
1230 if (predicts) free(predicts);
1231 #endif
1232 fclose(fp);
1233 pthread_mutex_unlock(&g_mutex_);
1234 return false;
1235 }
1236
1237 void UserDict::write_back() {
1238 // XXX write back is only allowed from close_dict due to thread-safe sake
1239 if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
1240 return;
1241 int fd = open(dict_file_, O_WRONLY);
1242 if (fd == -1)
1243 return;
1244 switch (state_) {
1245 case USER_DICT_DEFRAGMENTED:
1246 write_back_all(fd);
1247 break;
1248 case USER_DICT_LEMMA_DIRTY:
1249 write_back_lemma(fd);
1250 break;
1251 case USER_DICT_OFFSET_DIRTY:
1252 write_back_offset(fd);
1253 break;
1254 case USER_DICT_SCORE_DIRTY:
1255 write_back_score(fd);
1256 break;
1257 #ifdef ___SYNC_ENABLED___
1258 case USER_DICT_SYNC_DIRTY:
1259 write_back_sync(fd);
1260 break;
1261 #endif
1262 default:
1263 break;
1264 }
1265 // It seems truncate is not need on Linux, Windows except Mac
1266 // I am doing it here anyway for safety.
1267 off_t cur = lseek(fd, 0, SEEK_CUR);
1268 ftruncate(fd, cur);
1269 close(fd);
1270 state_ = USER_DICT_SYNC;
1271 }
1272
1273 #ifdef ___SYNC_ENABLED___
1274 void UserDict::write_back_sync(int fd) {
1275 int err = lseek(fd, 4 + dict_info_.lemma_size
1276 + (dict_info_.lemma_count << 3)
1277 #ifdef ___PREDICT_ENABLED___
1278 + (dict_info_.lemma_count << 2)
1279 #endif
1280 , SEEK_SET);
1281 if (err == -1)
1282 return;
1283 write(fd, syncs_, dict_info_.sync_count << 2);
1284 write(fd, &dict_info_, sizeof(dict_info_));
1285 }
1286 #endif
1287
1288 void UserDict::write_back_offset(int fd) {
1289 int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
1290 if (err == -1)
1291 return;
1292 write(fd, offsets_, dict_info_.lemma_count << 2);
1293 #ifdef ___PREDICT_ENABLED___
1294 write(fd, predicts_, dict_info_.lemma_count << 2);
1295 #endif
1296 write(fd, scores_, dict_info_.lemma_count << 2);
1297 #ifdef ___SYNC_ENABLED___
1298 write(fd, syncs_, dict_info_.sync_count << 2);
1299 #endif
1300 write(fd, &dict_info_, sizeof(dict_info_));
1301 }
1302
1303 void UserDict::write_back_score(int fd) {
1304 int err = lseek(fd, 4 + dict_info_.lemma_size
1305 + (dict_info_.lemma_count << 2)
1306 #ifdef ___PREDICT_ENABLED___
1307 + (dict_info_.lemma_count << 2)
1308 #endif
1309 , SEEK_SET);
1310 if (err == -1)
1311 return;
1312 write(fd, scores_, dict_info_.lemma_count << 2);
1313 #ifdef ___SYNC_ENABLED___
1314 write(fd, syncs_, dict_info_.sync_count << 2);
1315 #endif
1316 write(fd, &dict_info_, sizeof(dict_info_));
1317 }
1318
1319 void UserDict::write_back_lemma(int fd) {
1320 int err = lseek(fd, 4, SEEK_SET);
1321 if (err == -1)
1322 return;
1323 // New lemmas are always appended, no need to write whole lemma block
1324 size_t need_write = kUserDictPreAlloc *
1325 (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
1326 err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
1327 if (err == -1)
1328 return;
1329 write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);
1330
1331 write(fd, offsets_, dict_info_.lemma_count << 2);
1332 #ifdef ___PREDICT_ENABLED___
1333 write(fd, predicts_, dict_info_.lemma_count << 2);
1334 #endif
1335 write(fd, scores_, dict_info_.lemma_count << 2);
1336 #ifdef ___SYNC_ENABLED___
1337 write(fd, syncs_, dict_info_.sync_count << 2);
1338 #endif
1339 write(fd, &dict_info_, sizeof(dict_info_));
1340 }
1341
1342 void UserDict::write_back_all(int fd) {
1343 // XXX lemma_size is handled differently in writeall
1344 // and writelemma. I update lemma_size and lemma_count in different
1345 // places for these two cases. Should fix it to make it consistent.
1346 int err = lseek(fd, 4, SEEK_SET);
1347 if (err == -1)
1348 return;
1349 write(fd, lemmas_, dict_info_.lemma_size);
1350 write(fd, offsets_, dict_info_.lemma_count << 2);
1351 #ifdef ___PREDICT_ENABLED___
1352 write(fd, predicts_, dict_info_.lemma_count << 2);
1353 #endif
1354 write(fd, scores_, dict_info_.lemma_count << 2);
1355 #ifdef ___SYNC_ENABLED___
1356 write(fd, syncs_, dict_info_.sync_count << 2);
1357 #endif
1358 write(fd, &dict_info_, sizeof(dict_info_));
1359 }
1360
1361 #ifdef ___CACHE_ENABLED___
1362 bool UserDict::load_cache(UserDictSearchable *searchable,
1363 uint32 *offset, uint32 *length) {
1364 UserDictCache *cache = &caches_[searchable->splids_len - 1];
1365 if (cache->head == cache->tail)
1366 return false;
1367
1368 uint16 j, sig_len = kMaxLemmaSize / 4;
1369 uint16 i = cache->head;
1370 while (1) {
1371 j = 0;
1372 for (; j < sig_len; j++) {
1373 if (cache->signatures[i][j] != searchable->signature[j])
1374 break;
1375 }
1376 if (j < sig_len) {
1377 i++;
1378 if (i >= kUserDictCacheSize)
1379 i -= kUserDictCacheSize;
1380 if (i == cache->tail)
1381 break;
1382 continue;
1383 }
1384 *offset = cache->offsets[i];
1385 *length = cache->lengths[i];
1386 return true;
1387 }
1388 return false;
1389 }
1390
1391 void UserDict::save_cache(UserDictSearchable *searchable,
1392 uint32 offset, uint32 length) {
1393 UserDictCache *cache = &caches_[searchable->splids_len - 1];
1394 uint16 next = cache->tail;
1395
1396 cache->offsets[next] = offset;
1397 cache->lengths[next] = length;
1398 uint16 sig_len = kMaxLemmaSize / 4;
1399 uint16 j = 0;
1400 for (; j < sig_len; j++) {
1401 cache->signatures[next][j] = searchable->signature[j];
1402 }
1403
1404 if (++next >= kUserDictCacheSize) {
1405 next -= kUserDictCacheSize;
1406 }
1407 if (next == cache->head) {
1408 cache->head++;
1409 if (cache->head >= kUserDictCacheSize) {
1410 cache->head -= kUserDictCacheSize;
1411 }
1412 }
1413 cache->tail = next;
1414 }
1415
1416 void UserDict::reset_cache() {
1417 memset(caches_, 0, sizeof(caches_));
1418 }
1419
1420 bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
1421 UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1422 if (cache->head == cache->tail)
1423 return false;
1424
1425 uint16 j, sig_len = kMaxLemmaSize / 4;
1426 uint16 i = cache->head;
1427 while (1) {
1428 j = 0;
1429 for (; j < sig_len; j++) {
1430 if (cache->signatures[i][j] != searchable->signature[j])
1431 break;
1432 }
1433 if (j < sig_len) {
1434 i++;
1435 if (i >= kUserDictMissCacheSize)
1436 i -= kUserDictMissCacheSize;
1437 if (i == cache->tail)
1438 break;
1439 continue;
1440 }
1441 return true;
1442 }
1443 return false;
1444 }
1445
1446 void UserDict::save_miss_cache(UserDictSearchable *searchable) {
1447 UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1448 uint16 next = cache->tail;
1449
1450 uint16 sig_len = kMaxLemmaSize / 4;
1451 uint16 j = 0;
1452 for (; j < sig_len; j++) {
1453 cache->signatures[next][j] = searchable->signature[j];
1454 }
1455
1456 if (++next >= kUserDictMissCacheSize) {
1457 next -= kUserDictMissCacheSize;
1458 }
1459 if (next == cache->head) {
1460 cache->head++;
1461 if (cache->head >= kUserDictMissCacheSize) {
1462 cache->head -= kUserDictMissCacheSize;
1463 }
1464 }
1465 cache->tail = next;
1466 }
1467
1468 void UserDict::reset_miss_cache() {
1469 memset(miss_caches_, 0, sizeof(miss_caches_));
1470 }
1471
1472 void UserDict::cache_init() {
1473 reset_cache();
1474 reset_miss_cache();
1475 }
1476
1477 bool UserDict::cache_hit(UserDictSearchable *searchable,
1478 uint32 *offset, uint32 *length) {
1479 bool hit = load_miss_cache(searchable);
1480 if (hit) {
1481 *offset = 0;
1482 *length = 0;
1483 return true;
1484 }
1485 hit = load_cache(searchable, offset, length);
1486 if (hit) {
1487 return true;
1488 }
1489 return false;
1490 }
1491
1492 void UserDict::cache_push(UserDictCacheType type,
1493 UserDictSearchable *searchable,
1494 uint32 offset, uint32 length) {
1495 switch (type) {
1496 case USER_DICT_MISS_CACHE:
1497 save_miss_cache(searchable);
1498 break;
1499 case USER_DICT_CACHE:
1500 save_cache(searchable, offset, length);
1501 break;
1502 default:
1503 break;
1504 }
1505 }
1506
1507 #endif
1508
1509 void UserDict::defragment(void) {
1510 #ifdef ___DEBUG_PERF___
1511 DEBUG_PERF_BEGIN;
1512 #endif
1513 if (is_valid_state() == false)
1514 return;
1515 // Fixup offsets_, set REMOVE flag to lemma's flag if needed
1516 size_t first_freed = 0;
1517 size_t first_inuse = 0;
1518 while (first_freed < dict_info_.lemma_count) {
1519 // Find first freed offset
1520 while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1521 first_freed < dict_info_.lemma_count) {
1522 first_freed++;
1523 }
1524 if (first_freed < dict_info_.lemma_count) {
1525 // Save REMOVE flag to lemma flag
1526 int off = offsets_[first_freed];
1527 set_lemma_flag(off, kUserDictLemmaFlagRemove);
1528 } else {
1529 break;
1530 }
1531 // Find first inuse offse after first_freed
1532 first_inuse = first_freed + 1;
1533 while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) &&
1534 (first_inuse < dict_info_.lemma_count)) {
1535 // Save REMOVE flag to lemma flag
1536 int off = offsets_[first_inuse];
1537 set_lemma_flag(off, kUserDictLemmaFlagRemove);
1538 first_inuse++;
1539 }
1540 if (first_inuse >= dict_info_.lemma_count) {
1541 break;
1542 }
1543 // Swap offsets_
1544 int tmp = offsets_[first_inuse];
1545 offsets_[first_inuse] = offsets_[first_freed];
1546 offsets_[first_freed] = tmp;
1547 // Move scores_, no need to swap
1548 tmp = scores_[first_inuse];
1549 scores_[first_inuse] = scores_[first_freed];
1550 scores_[first_freed] = tmp;
1551 // Swap ids_
1552 LemmaIdType tmpid = ids_[first_inuse];
1553 ids_[first_inuse] = ids_[first_freed];
1554 ids_[first_freed] = tmpid;
1555 // Go on
1556 first_freed++;
1557 }
1558 #ifdef ___PREDICT_ENABLED___
1559 // Fixup predicts_
1560 first_freed = 0;
1561 first_inuse = 0;
1562 while (first_freed < dict_info_.lemma_count) {
1563 // Find first freed offset
1564 while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1565 first_freed < dict_info_.lemma_count) {
1566 first_freed++;
1567 }
1568 if (first_freed >= dict_info_.lemma_count)
1569 break;
1570 // Find first inuse offse after first_freed
1571 first_inuse = first_freed + 1;
1572 while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove)
1573 && (first_inuse < dict_info_.lemma_count)) {
1574 first_inuse++;
1575 }
1576 if (first_inuse >= dict_info_.lemma_count) {
1577 break;
1578 }
1579 // Swap offsets_
1580 int tmp = predicts_[first_inuse];
1581 predicts_[first_inuse] = predicts_[first_freed];
1582 predicts_[first_freed] = tmp;
1583 // Go on
1584 first_freed++;
1585 }
1586 #endif
1587 dict_info_.lemma_count = first_freed;
1588 // Fixup lemmas_
1589 size_t begin = 0;
1590 size_t end = 0;
1591 size_t dst = 0;
1592 int total_size = dict_info_.lemma_size + lemma_size_left_;
1593 int total_count = dict_info_.lemma_count + lemma_count_left_;
1594 size_t real_size = total_size - lemma_size_left_;
1595 while (dst < real_size) {
1596 unsigned char flag = get_lemma_flag(dst);
1597 unsigned char nchr = get_lemma_nchar(dst);
1598 if ((flag & kUserDictLemmaFlagRemove) == 0) {
1599 dst += nchr * 4 + 2;
1600 continue;
1601 }
1602 break;
1603 }
1604 if (dst >= real_size)
1605 return;
1606
1607 end = dst;
1608 while (end < real_size) {
1609 begin = end + get_lemma_nchar(end) * 4 + 2;
1610 repeat:
1611 // not used any more
1612 if (begin >= real_size)
1613 break;
1614 unsigned char flag = get_lemma_flag(begin);
1615 unsigned char nchr = get_lemma_nchar(begin);
1616 if (flag & kUserDictLemmaFlagRemove) {
1617 begin += nchr * 4 + 2;
1618 goto repeat;
1619 }
1620 end = begin + nchr * 4 + 2;
1621 while (end < real_size) {
1622 unsigned char eflag = get_lemma_flag(end);
1623 unsigned char enchr = get_lemma_nchar(end);
1624 if ((eflag & kUserDictLemmaFlagRemove) == 0) {
1625 end += enchr * 4 + 2;
1626 continue;
1627 }
1628 break;
1629 }
1630 memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
1631 for (size_t j = 0; j < dict_info_.lemma_count; j++) {
1632 if (offsets_[j] >= begin && offsets_[j] < end) {
1633 offsets_[j] -= (begin - dst);
1634 offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
1635 }
1636 #ifdef ___PREDICT_ENABLED___
1637 if (predicts_[j] >= begin && predicts_[j] < end) {
1638 predicts_[j] -= (begin - dst);
1639 }
1640 #endif
1641 }
1642 #ifdef ___SYNC_ENABLED___
1643 for (size_t j = 0; j < dict_info_.sync_count; j++) {
1644 if (syncs_[j] >= begin && syncs_[j] < end) {
1645 syncs_[j] -= (begin - dst);
1646 }
1647 }
1648 #endif
1649 dst += (end - begin);
1650 }
1651
1652 dict_info_.free_count = 0;
1653 dict_info_.free_size = 0;
1654 dict_info_.lemma_size = dst;
1655 lemma_size_left_ = total_size - dict_info_.lemma_size;
1656 lemma_count_left_ = total_count - dict_info_.lemma_count;
1657
1658 // XXX Without following code,
1659 // offsets_by_id_ is not reordered.
1660 // That's to say, all removed lemmas' ids are not collected back.
1661 // There may not be room for addition of new lemmas due to
1662 // offsests_by_id_ reason, although lemma_size_left_ is fixed.
1663 // By default, we do want defrag as fast as possible, because
1664 // during defrag procedure, other peers can not write new lemmas
1665 // to user dictionary file.
1666 // XXX If write-back is invoked immediately after
1667 // this defragment, no need to fix up following in-mem data.
1668 for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
1669 ids_[i] = start_id_ + i;
1670 offsets_by_id_[i] = offsets_[i];
1671 }
1672
1673 state_ = USER_DICT_DEFRAGMENTED;
1674
1675 #ifdef ___DEBUG_PERF___
1676 DEBUG_PERF_END;
1677 LOGD_PERF("defragment");
1678 #endif
1679 }
1680
1681 #ifdef ___SYNC_ENABLED___
1682 void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
1683 if (is_valid_state() == false)
1684 return;
1685 if (end > dict_info_.sync_count)
1686 end = dict_info_.sync_count;
1687 memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
1688 dict_info_.sync_count -= (end - start);
1689 if (state_ < USER_DICT_SYNC_DIRTY)
1690 state_ = USER_DICT_SYNC_DIRTY;
1691 }
1692
1693 int UserDict::get_sync_count() {
1694 if (is_valid_state() == false)
1695 return 0;
1696 return dict_info_.sync_count;
1697 }
1698
1699 LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
1700 uint16 lemma_len, uint16 count, uint64 lmt) {
1701 int again = 0;
1702 begin:
1703 LemmaIdType id;
1704 uint32 * syncs_bak = syncs_;
1705 syncs_ = NULL;
1706 id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
1707 syncs_ = syncs_bak;
1708 if (id == 0 && again == 0) {
1709 if ((dict_info_.limit_lemma_count > 0 &&
1710 dict_info_.lemma_count >= dict_info_.limit_lemma_count)
1711 || (dict_info_.limit_lemma_size > 0 &&
1712 dict_info_.lemma_size + (2 + (lemma_len << 2))
1713 > dict_info_.limit_lemma_size)) {
1714 // XXX Always reclaim and defrag in sync code path
1715 // sync thread is background thread and ok with heavy work
1716 reclaim();
1717 defragment();
1718 flush_cache();
1719 again = 1;
1720 goto begin;
1721 }
1722 }
1723 return id;
1724 }
1725
1726 int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
1727 int newly_added = 0;
1728
1729 SpellingParser * spl_parser = new SpellingParser();
1730 if (!spl_parser) {
1731 return 0;
1732 }
1733 #ifdef ___DEBUG_PERF___
1734 DEBUG_PERF_BEGIN;
1735 #endif
1736 char16 *ptr = lemmas;
1737
1738 // Extract pinyin,words,frequence,last_mod_time
1739 char16 * p = ptr, * py16 = ptr;
1740 char16 * hz16 = NULL;
1741 int py16_len = 0;
1742 uint16 splid[kMaxLemmaSize];
1743 int splid_len = 0;
1744 int hz16_len = 0;
1745 char16 * fr16 = NULL;
1746 int fr16_len = 0;
1747
1748 while (p - ptr < len) {
1749 // Pinyin
1750 py16 = p;
1751 splid_len = 0;
1752 while (*p != 0x2c && (p - ptr) < len) {
1753 if (*p == 0x20)
1754 splid_len++;
1755 p++;
1756 }
1757 splid_len++;
1758 if (p - ptr == len)
1759 break;
1760 py16_len = p - py16;
1761 if (kMaxLemmaSize < splid_len) {
1762 break;
1763 }
1764 bool is_pre;
1765 int splidl = spl_parser->splstr16_to_idxs_f(
1766 py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
1767 if (splidl != splid_len)
1768 break;
1769 // Phrase
1770 hz16 = ++p;
1771 while (*p != 0x2c && (p - ptr) < len) {
1772 p++;
1773 }
1774 hz16_len = p - hz16;
1775 if (hz16_len != splid_len)
1776 break;
1777 // Frequency
1778 fr16 = ++p;
1779 fr16_len = 0;
1780 while (*p != 0x2c && (p - ptr) < len) {
1781 p++;
1782 }
1783 fr16_len = p - fr16;
1784 uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
1785 // Last modified time
1786 fr16 = ++p;
1787 fr16_len = 0;
1788 while (*p != 0x3b && (p - ptr) < len) {
1789 p++;
1790 }
1791 fr16_len = p - fr16;
1792 uint64 last_mod = utf16le_atoll(fr16, fr16_len);
1793
1794 put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
1795 newly_added++;
1796
1797 p++;
1798 }
1799
1800 #ifdef ___DEBUG_PERF___
1801 DEBUG_PERF_END;
1802 LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
1803 #endif
1804 return newly_added;
1805 }
1806
1807 int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
1808 char16 * str, int size, int * count) {
1809 int len = 0;
1810 *count = 0;
1811
1812 int left_len = size;
1813
1814 if (is_valid_state() == false)
1815 return len;
1816
1817 SpellingTrie * spl_trie = &SpellingTrie::get_instance();
1818 if (!spl_trie) {
1819 return 0;
1820 }
1821
1822 uint32 i;
1823 for (i = 0; i < dict_info_.sync_count; i++) {
1824 int offset = syncs_[i];
1825 uint32 nchar = get_lemma_nchar(offset);
1826 uint16 *spl = get_lemma_spell_ids(offset);
1827 uint16 *wrd = get_lemma_word(offset);
1828 int score = _get_lemma_score(wrd, spl, nchar);
1829
1830 static char score_temp[32], *pscore_temp = score_temp;
1831 static char16 temp[256], *ptemp = temp;
1832
1833 pscore_temp = score_temp;
1834 ptemp = temp;
1835
1836 uint32 j;
1837 // Add pinyin
1838 for (j = 0; j < nchar; j++) {
1839 int ret_len = spl_trie->get_spelling_str16(
1840 spl[j], ptemp, temp + sizeof(temp) - ptemp);
1841 if (ret_len <= 0)
1842 break;
1843 ptemp += ret_len;
1844 if (ptemp < temp + sizeof(temp) - 1) {
1845 *(ptemp++) = ' ';
1846 } else {
1847 j = 0;
1848 break;
1849 }
1850 }
1851 if (j < nchar) {
1852 continue;
1853 }
1854 ptemp--;
1855 if (ptemp < temp + sizeof(temp) - 1) {
1856 *(ptemp++) = ',';
1857 } else {
1858 continue;
1859 }
1860 // Add phrase
1861 for (j = 0; j < nchar; j++) {
1862 if (ptemp < temp + sizeof(temp) - 1) {
1863 *(ptemp++) = wrd[j];
1864 } else {
1865 break;
1866 }
1867 }
1868 if (j < nchar) {
1869 continue;
1870 }
1871 if (ptemp < temp + sizeof(temp) - 1) {
1872 *(ptemp++) = ',';
1873 } else {
1874 continue;
1875 }
1876 // Add frequency
1877 uint32 intf = extract_score_freq(score);
1878 int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp);
1879 if (ret_len <= 0)
1880 continue;
1881 ptemp += ret_len;
1882 if (ptemp < temp + sizeof(temp) - 1) {
1883 *(ptemp++) = ',';
1884 } else {
1885 continue;
1886 }
1887 // Add last modified time
1888 uint64 last_mod = extract_score_lmt(score);
1889 ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp);
1890 if (ret_len <= 0)
1891 continue;
1892 ptemp += ret_len;
1893 if (ptemp < temp + sizeof(temp) - 1) {
1894 *(ptemp++) = ';';
1895 } else {
1896 continue;
1897 }
1898
1899 // Write to string
1900 int need_len = ptemp - temp;
1901 if (need_len > left_len)
1902 break;
1903 memcpy(str + len, temp, need_len * 2);
1904 left_len -= need_len;
1905
1906 len += need_len;
1907 (*count)++;
1908 }
1909
1910 if (len > 0) {
1911 if (state_ < USER_DICT_SYNC_DIRTY)
1912 state_ = USER_DICT_SYNC_DIRTY;
1913 }
1914 return len;
1915 }
1916
1917 #endif
1918
1919 bool UserDict::state(UserDictStat * stat) {
1920 if (is_valid_state() == false)
1921 return false;
1922 if (!stat)
1923 return false;
1924 stat->version = version_;
1925 stat->file_name = dict_file_;
1926 stat->load_time.tv_sec = load_time_.tv_sec;
1927 stat->load_time.tv_usec = load_time_.tv_usec;
1928 pthread_mutex_lock(&g_mutex_);
1929 stat->last_update.tv_sec = g_last_update_.tv_sec;
1930 stat->last_update.tv_usec = g_last_update_.tv_usec;
1931 pthread_mutex_unlock(&g_mutex_);
1932 stat->disk_size = get_dict_file_size(&dict_info_);
1933 stat->lemma_count = dict_info_.lemma_count;
1934 stat->lemma_size = dict_info_.lemma_size;
1935 stat->delete_count = dict_info_.free_count;
1936 stat->delete_size = dict_info_.free_size;
1937 #ifdef ___SYNC_ENABLED___
1938 stat->sync_count = dict_info_.sync_count;
1939 #endif
1940 stat->limit_lemma_count = dict_info_.limit_lemma_count;
1941 stat->limit_lemma_size = dict_info_.limit_lemma_size;
1942 stat->reclaim_ratio = dict_info_.reclaim_ratio;
1943 return true;
1944 }
1945
1946 void UserDict::set_limit(uint32 max_lemma_count,
1947 uint32 max_lemma_size, uint32 reclaim_ratio) {
1948 dict_info_.limit_lemma_count = max_lemma_count;
1949 dict_info_.limit_lemma_size = max_lemma_size;
1950 if (reclaim_ratio > 100)
1951 reclaim_ratio = 100;
1952 dict_info_.reclaim_ratio = reclaim_ratio;
1953 }
1954
1955 void UserDict::reclaim() {
1956 if (is_valid_state() == false)
1957 return;
1958
1959 switch (dict_info_.reclaim_ratio) {
1960 case 0:
1961 return;
1962 case 100:
1963 // TODO: CLEAR to be implemented
1964 assert(false);
1965 return;
1966 default:
1967 break;
1968 }
1969
1970 // XXX Reclaim is only based on count, not size
1971 uint32 count = dict_info_.lemma_count;
1972 int rc = count * dict_info_.reclaim_ratio / 100;
1973
1974 UserDictScoreOffsetPair * score_offset_pairs = NULL;
1975 score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
1976 sizeof(UserDictScoreOffsetPair) * rc);
1977 if (score_offset_pairs == NULL) {
1978 return;
1979 }
1980
1981 for (int i = 0; i < rc; i++) {
1982 int s = scores_[i];
1983 score_offset_pairs[i].score = s;
1984 score_offset_pairs[i].offset_index = i;
1985 }
1986
1987 for (int i = (rc + 1) / 2; i >= 0; i--)
1988 shift_down(score_offset_pairs, i, rc);
1989
1990 for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
1991 int s = scores_[i];
1992 if (s < score_offset_pairs[0].score) {
1993 score_offset_pairs[0].score = s;
1994 score_offset_pairs[0].offset_index = i;
1995 shift_down(score_offset_pairs, 0, rc);
1996 }
1997 }
1998
1999 for (int i = 0; i < rc; i++) {
2000 int off = score_offset_pairs[i].offset_index;
2001 remove_lemma_by_offset_index(off);
2002 }
2003 if (rc > 0) {
2004 if (state_ < USER_DICT_OFFSET_DIRTY)
2005 state_ = USER_DICT_OFFSET_DIRTY;
2006 }
2007
2008 free(score_offset_pairs);
2009 }
2010
2011 inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
2012 int s = sop[i].score;
2013 int p = sop[i].offset_index;
2014 sop[i].score = sop[j].score;
2015 sop[i].offset_index = sop[j].offset_index;
2016 sop[j].score = s;
2017 sop[j].offset_index = p;
2018 }
2019
2020 void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
2021 int par = i;
2022 while (par < n) {
2023 int left = par * 2 + 1;
2024 int right = left + 1;
2025 if (left >= n && right >= n)
2026 break;
2027 if (right >= n) {
2028 if (sop[left].score > sop[par].score) {
2029 swap(sop, left, par);
2030 par = left;
2031 continue;
2032 }
2033 } else if (sop[left].score > sop[right].score &&
2034 sop[left].score > sop[par].score) {
2035 swap(sop, left, par);
2036 par = left;
2037 continue;
2038 } else if (sop[right].score > sop[left].score &&
2039 sop[right].score > sop[par].score) {
2040 swap(sop, right, par);
2041 par = right;
2042 continue;
2043 }
2044 break;
2045 }
2046 }
2047
2048 LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
2049 uint16 lemma_len, uint16 count) {
2050 return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL));
2051 }
2052
2053 LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
2054 uint16 lemma_len, uint16 count, uint64 lmt) {
2055 #ifdef ___DEBUG_PERF___
2056 DEBUG_PERF_BEGIN;
2057 #endif
2058 if (is_valid_state() == false)
2059 return 0;
2060 int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2061 if (off != -1) {
2062 int delta_score = count - scores_[off];
2063 dict_info_.total_nfreq += delta_score;
2064 scores_[off] = build_score(lmt, count);
2065 if (state_ < USER_DICT_SCORE_DIRTY)
2066 state_ = USER_DICT_SCORE_DIRTY;
2067 #ifdef ___DEBUG_PERF___
2068 DEBUG_PERF_END;
2069 LOGD_PERF("_put_lemma(update)");
2070 #endif
2071 return ids_[off];
2072 } else {
2073 if ((dict_info_.limit_lemma_count > 0 &&
2074 dict_info_.lemma_count >= dict_info_.limit_lemma_count)
2075 || (dict_info_.limit_lemma_size > 0 &&
2076 dict_info_.lemma_size + (2 + (lemma_len << 2))
2077 > dict_info_.limit_lemma_size)) {
2078 // XXX Don't defragment here, it's too time-consuming.
2079 return 0;
2080 }
2081 int flushed = 0;
2082 if (lemma_count_left_ == 0 ||
2083 lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {
2084
2085 // XXX When there is no space for new lemma, we flush to disk
2086 // flush_cache() may be called by upper user
2087 // and better place shoule be found instead of here
2088 flush_cache();
2089 flushed = 1;
2090 // Or simply return and do nothing
2091 // return 0;
2092 }
2093 #ifdef ___DEBUG_PERF___
2094 DEBUG_PERF_END;
2095 LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
2096 #endif
2097 LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
2098 #ifdef ___SYNC_ENABLED___
2099 if (syncs_ && id != 0) {
2100 queue_lemma_for_sync(id);
2101 }
2102 #endif
2103 return id;
2104 }
2105 return 0;
2106 }
2107
2108 #ifdef ___SYNC_ENABLED___
2109 void UserDict::queue_lemma_for_sync(LemmaIdType id) {
2110 if (dict_info_.sync_count < sync_count_size_) {
2111 syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2112 } else {
2113 uint32 * syncs = (uint32*)realloc(
2114 syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
2115 if (syncs) {
2116 sync_count_size_ += kUserDictPreAlloc;
2117 syncs_ = syncs;
2118 syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2119 }
2120 }
2121 }
2122 #endif
2123
2124 LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
2125 bool selected) {
2126 #ifdef ___DEBUG_PERF___
2127 DEBUG_PERF_BEGIN;
2128 #endif
2129 if (is_valid_state() == false)
2130 return 0;
2131 if (is_valid_lemma_id(lemma_id) == false)
2132 return 0;
2133 uint32 offset = offsets_by_id_[lemma_id - start_id_];
2134 uint8 lemma_len = get_lemma_nchar(offset);
2135 char16 * lemma_str = get_lemma_word(offset);
2136 uint16 * splids = get_lemma_spell_ids(offset);
2137
2138 int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2139 if (off != -1) {
2140 int score = scores_[off];
2141 int count = extract_score_freq(score);
2142 uint64 lmt = extract_score_lmt(score);
2143 if (count + delta_count > kUserDictMaxFrequency ||
2144 count + delta_count < count) {
2145 delta_count = kUserDictMaxFrequency - count;
2146 }
2147 count += delta_count;
2148 dict_info_.total_nfreq += delta_count;
2149 if (selected) {
2150 lmt = time(NULL);
2151 }
2152 scores_[off] = build_score(lmt, count);
2153 if (state_ < USER_DICT_SCORE_DIRTY)
2154 state_ = USER_DICT_SCORE_DIRTY;
2155 #ifdef ___DEBUG_PERF___
2156 DEBUG_PERF_END;
2157 LOGD_PERF("update_lemma");
2158 #endif
2159 #ifdef ___SYNC_ENABLED___
2160 queue_lemma_for_sync(ids_[off]);
2161 #endif
2162 return ids_[off];
2163 }
2164 return 0;
2165 }
2166
2167 size_t UserDict::get_total_lemma_count() {
2168 return dict_info_.total_nfreq;
2169 }
2170
2171 void UserDict::set_total_lemma_count_of_others(size_t count) {
2172 total_other_nfreq_ = count;
2173 }
2174
2175 LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
2176 uint16 lemma_len, uint16 count, uint64 lmt) {
2177 LemmaIdType id = get_max_lemma_id() + 1;
2178 size_t offset = dict_info_.lemma_size;
2179 if (offset > kUserDictOffsetMask)
2180 return 0;
2181
2182 lemmas_[offset] = 0;
2183 lemmas_[offset + 1] = (uint8)lemma_len;
2184 for (size_t i = 0; i < lemma_len; i++) {
2185 *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
2186 *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
2187 = lemma_str[i];
2188 }
2189 uint32 off = dict_info_.lemma_count;
2190 offsets_[off] = offset;
2191 scores_[off] = build_score(lmt, count);
2192 ids_[off] = id;
2193 #ifdef ___PREDICT_ENABLED___
2194 predicts_[off] = offset;
2195 #endif
2196
2197 offsets_by_id_[id - start_id_] = offset;
2198
2199 dict_info_.lemma_count++;
2200 dict_info_.lemma_size += (2 + (lemma_len << 2));
2201 lemma_count_left_--;
2202 lemma_size_left_ -= (2 + (lemma_len << 2));
2203
2204 // Sort
2205
2206 UserDictSearchable searchable;
2207 prepare_locate(&searchable, splids, lemma_len);
2208
2209 size_t i = 0;
2210 while (i < off) {
2211 offset = offsets_[i];
2212 uint32 nchar = get_lemma_nchar(offset);
2213 uint16 * spl = get_lemma_spell_ids(offset);
2214
2215 if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
2216 break;
2217 i++;
2218 }
2219 if (i != off) {
2220 uint32 temp = offsets_[off];
2221 memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
2222 offsets_[i] = temp;
2223
2224 temp = scores_[off];
2225 memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
2226 scores_[i] = temp;
2227
2228 temp = ids_[off];
2229 memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
2230 ids_[i] = temp;
2231 }
2232
2233 #ifdef ___PREDICT_ENABLED___
2234 uint32 j = 0;
2235 uint16 * words_new = get_lemma_word(predicts_[off]);
2236 j = locate_where_to_insert_in_predicts(words_new, lemma_len);
2237 if (j != off) {
2238 uint32 temp = predicts_[off];
2239 memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
2240 predicts_[j] = temp;
2241 }
2242 #endif
2243
2244 if (state_ < USER_DICT_LEMMA_DIRTY)
2245 state_ = USER_DICT_LEMMA_DIRTY;
2246
2247 #ifdef ___CACHE_ENABLED___
2248 cache_init();
2249 #endif
2250
2251 dict_info_.total_nfreq += count;
2252 return id;
2253 }
2254 }
2255