• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "../include/userdict.h"
18 #include "../include/splparser.h"
19 #include "../include/ngram.h"
20 #include <stdio.h>
21 #include <string.h>
22 #include <stdlib.h>
23 #include <cutils/log.h>
24 #include <unistd.h>
25 #include <fcntl.h>
26 #include <sys/stat.h>
27 #include <assert.h>
28 #include <ctype.h>
29 #include <sys/types.h>
30 #include <sys/time.h>
31 #include <time.h>
32 #include <pthread.h>
33 #include <math.h>
34 
35 namespace ime_pinyin {
36 
37 #ifdef ___DEBUG_PERF___
38 static uint64 _ellapse_ = 0;
39 static struct timeval _tv_start_, _tv_end_;
40 #define DEBUG_PERF_BEGIN \
41     do { \
42       gettimeofday(&_tv_start_, NULL); \
43     } while(0)
44 #define DEBUG_PERF_END \
45     do { \
46       gettimeofday(&_tv_end_, NULL); \
47       _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
48                   (_tv_end_.tv_usec - _tv_start_.tv_usec); \
49     } while(0)
50 #define LOGD_PERF(message) \
51     ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
52 #else
53 #define DEBUG_PERF_BEGIN
54 #define DEBUG_PERF_END
55 #define LOGD_PERF(message)
56 #endif
57 
58 // XXX File load and write are thread-safe by g_mutex_
59 static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
60 static struct timeval g_last_update_ = {0, 0};
61 
get_dict_file_size(UserDictInfo * info)62 inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
63   return (4 + info->lemma_size + (info->lemma_count << 3)
64 #ifdef ___PREDICT_ENABLED___
65           + (info->lemma_count << 2)
66 #endif
67 #ifdef ___SYNC_ENABLED___
68           + (info->sync_count << 2)
69 #endif
70           + sizeof(*info));
71 }
72 
translate_score(int raw_score)73 inline LmaScoreType UserDict::translate_score(int raw_score) {
74   // 1) ori_freq: original user frequency
75   uint32 ori_freq = extract_score_freq(raw_score);
76   // 2) lmt_off: lmt index (week offset for example)
77   uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
78   if (kUserDictLMTBitWidth < 16) {
79     uint64 mask = ~(1 << kUserDictLMTBitWidth);
80     lmt_off &= mask;
81   }
82   // 3) now_off: current time index (current week offset for example)
83   // assuming load_time_ is around current time
84   uint64 now_off = load_time_.tv_sec;
85   now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
86   now_off = (now_off << (64 - kUserDictLMTBitWidth));
87   now_off = (now_off >> (64 - kUserDictLMTBitWidth));
88   // 4) factor: decide expand-factor
89   int delta = now_off - lmt_off;
90   if (delta > 4)
91     delta = 4;
92   int factor = 80 - (delta << 4);
93 
94   double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
95   return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
96                         * NGram::kLogValueAmplifier);
97 }
98 
extract_score_freq(int raw_score)99 inline int UserDict::extract_score_freq(int raw_score) {
100   // Frequence stored in lowest 16 bits
101   int freq = (raw_score & 0x0000ffff);
102   return freq;
103 }
104 
extract_score_lmt(int raw_score)105 inline uint64 UserDict::extract_score_lmt(int raw_score) {
106   uint64 lmt = ((raw_score & 0xffff0000) >> 16);
107   if (kUserDictLMTBitWidth < 16) {
108     uint64 mask = ~(1 << kUserDictLMTBitWidth);
109     lmt &= mask;
110   }
111   lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
112   return lmt;
113 }
114 
build_score(uint64 lmt,int freq)115 inline int UserDict::build_score(uint64 lmt, int freq) {
116   lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
117   lmt = (lmt << (64 - kUserDictLMTBitWidth));
118   lmt = (lmt >> (64 - kUserDictLMTBitWidth));
119   uint16 lmt16 = (uint16)lmt;
120   int s = freq;
121   s &= 0x0000ffff;
122   s = (lmt16 << 16) | s;
123   return s;
124 }
125 
utf16le_atoll(uint16 * s,int len)126 inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
127   int64 ret = 0;
128   if (len <= 0)
129     return ret;
130 
131   int flag = 1;
132   const uint16 * endp = s + len;
133   if (*s == '-') {
134     flag = -1;
135     s++;
136   } else if (*s == '+') {
137     s++;
138   }
139 
140   while (*s >= '0' && *s <= '9' && s < endp) {
141     ret += ret * 10 + (*s) - '0';
142     s++;
143   }
144   return ret * flag;
145 }
146 
utf16le_lltoa(int64 v,uint16 * s,int size)147 inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
148   if (!s || size <= 0)
149     return 0;
150   uint16 *endp = s + size;
151   int ret_len = 0;
152   if (v < 0) {
153     *(s++) = '-';
154     ++ret_len;
155     v *= -1;
156   }
157 
158   uint16 *b = s;
159   while (s < endp && v != 0) {
160     *(s++) = '0' + (v % 10);
161     v = v / 10;
162     ++ret_len;
163   }
164 
165   if (v != 0)
166     return 0;
167 
168   --s;
169 
170   while (b < s) {
171     *b = *s;
172     ++b, --s;
173   }
174 
175   return ret_len;
176 }
177 
set_lemma_flag(uint32 offset,uint8 flag)178 inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
179   offset &= kUserDictOffsetMask;
180   lemmas_[offset] |= flag;
181 }
182 
get_lemma_flag(uint32 offset)183 inline char UserDict::get_lemma_flag(uint32 offset) {
184   offset &= kUserDictOffsetMask;
185   return (char)(lemmas_[offset]);
186 }
187 
get_lemma_nchar(uint32 offset)188 inline char UserDict::get_lemma_nchar(uint32 offset) {
189   offset &= kUserDictOffsetMask;
190   return (char)(lemmas_[offset + 1]);
191 }
192 
get_lemma_spell_ids(uint32 offset)193 inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
194   offset &= kUserDictOffsetMask;
195   return (uint16 *)(lemmas_ + offset + 2);
196 }
197 
get_lemma_word(uint32 offset)198 inline uint16 * UserDict::get_lemma_word(uint32 offset) {
199   offset &= kUserDictOffsetMask;
200   uint8 nchar = get_lemma_nchar(offset);
201   return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
202 }
203 
get_max_lemma_id()204 inline LemmaIdType UserDict::get_max_lemma_id() {
205   // When a lemma is deleted, we don't not claim its id back for
206   // simplicity and performance
207   return start_id_ + dict_info_.lemma_count - 1;
208 }
209 
is_valid_lemma_id(LemmaIdType id)210 inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
211   if (id >= start_id_ && id <= get_max_lemma_id())
212     return true;
213   return false;
214 }
215 
is_valid_state()216 inline bool UserDict::is_valid_state() {
217   if (state_ == USER_DICT_NONE)
218     return false;
219   return true;
220 }
221 
UserDict()222 UserDict::UserDict()
223     : start_id_(0),
224       version_(0),
225       lemmas_(NULL),
226       offsets_(NULL),
227       scores_(NULL),
228       ids_(NULL),
229 #ifdef ___PREDICT_ENABLED___
230       predicts_(NULL),
231 #endif
232 #ifdef ___SYNC_ENABLED___
233       syncs_(NULL),
234       sync_count_size_(0),
235 #endif
236       offsets_by_id_(NULL),
237       lemma_count_left_(0),
238       lemma_size_left_(0),
239       dict_file_(NULL),
240       state_(USER_DICT_NONE) {
241   memset(&dict_info_, 0, sizeof(dict_info_));
242   memset(&load_time_, 0, sizeof(load_time_));
243 #ifdef ___CACHE_ENABLED___
244   cache_init();
245 #endif
246 }
247 
~UserDict()248 UserDict::~UserDict() {
249   close_dict();
250 }
251 
load_dict(const char * file_name,LemmaIdType start_id,LemmaIdType end_id)252 bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
253                          LemmaIdType end_id) {
254 #ifdef ___DEBUG_PERF___
255   DEBUG_PERF_BEGIN;
256 #endif
257   dict_file_ = strdup(file_name);
258   if (!dict_file_)
259     return false;
260 
261   start_id_ = start_id;
262 
263   if (false == validate(file_name) && false == reset(file_name)) {
264     goto error;
265   }
266   if (false == load(file_name, start_id)) {
267     goto error;
268   }
269 
270   state_ = USER_DICT_SYNC;
271 
272   gettimeofday(&load_time_, NULL);
273 
274 #ifdef ___DEBUG_PERF___
275   DEBUG_PERF_END;
276   LOGD_PERF("load_dict");
277 #endif
278   return true;
279  error:
280   free((void*)dict_file_);
281   start_id_ = 0;
282   return false;
283 }
284 
close_dict()285 bool UserDict::close_dict() {
286   if (state_ == USER_DICT_NONE)
287     return true;
288   if (state_ == USER_DICT_SYNC)
289     goto out;
290 
291   // If dictionary is written back by others,
292   // we can not simply write back here
293   // To do a safe flush, we have to discard all newly added
294   // lemmas and try to reload dict file.
295   pthread_mutex_lock(&g_mutex_);
296   if (load_time_.tv_sec > g_last_update_.tv_sec ||
297     (load_time_.tv_sec == g_last_update_.tv_sec &&
298      load_time_.tv_usec > g_last_update_.tv_usec)) {
299     write_back();
300     gettimeofday(&g_last_update_, NULL);
301   }
302   pthread_mutex_unlock(&g_mutex_);
303 
304  out:
305   free((void*)dict_file_);
306   free(lemmas_);
307   free(offsets_);
308   free(offsets_by_id_);
309   free(scores_);
310   free(ids_);
311 #ifdef ___PREDICT_ENABLED___
312   free(predicts_);
313 #endif
314 
315   version_ = 0;
316   dict_file_ = NULL;
317   lemmas_ = NULL;
318 #ifdef ___SYNC_ENABLED___
319   syncs_ = NULL;
320   sync_count_size_ = 0;
321 #endif
322   offsets_ = NULL;
323   offsets_by_id_ = NULL;
324   scores_ = NULL;
325   ids_ = NULL;
326 #ifdef ___PREDICT_ENABLED___
327   predicts_ = NULL;
328 #endif
329 
330   memset(&dict_info_, 0, sizeof(dict_info_));
331   lemma_count_left_ = 0;
332   lemma_size_left_ = 0;
333   state_ = USER_DICT_NONE;
334 
335   return true;
336 }
337 
number_of_lemmas()338 size_t UserDict::number_of_lemmas() {
339   return dict_info_.lemma_count;
340 }
341 
reset_milestones(uint16 from_step,MileStoneHandle from_handle)342 void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
343   return;
344 }
345 
extend_dict(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)346 MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
347                                       const DictExtPara *dep,
348                                       LmaPsbItem *lpi_items,
349                                       size_t lpi_max, size_t *lpi_num) {
350   if (is_valid_state() == false)
351     return 0;
352 
353   bool need_extend = false;
354 
355 #ifdef ___DEBUG_PERF___
356   DEBUG_PERF_BEGIN;
357 #endif
358   *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
359                        lpi_items, lpi_max, &need_extend);
360 #ifdef ___DEBUG_PERF___
361   DEBUG_PERF_END;
362   LOGD_PERF("extend_dict");
363 #endif
364   return ((*lpi_num > 0 || need_extend) ? 1 : 0);
365 }
366 
is_fuzzy_prefix_spell_id(const uint16 * id1,uint16 len1,const UserDictSearchable * searchable)367 int UserDict::is_fuzzy_prefix_spell_id(
368     const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
369   if (len1 < searchable->splids_len)
370     return 0;
371 
372   SpellingTrie &spl_trie = SpellingTrie::get_instance();
373   uint32 i = 0;
374   for (i = 0; i < searchable->splids_len; i++) {
375     const char py1 = *spl_trie.get_spelling_str(id1[i]);
376     uint16 off = 8 * (i % 4);
377     const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
378     if (py1 == py2)
379       continue;
380     return 0;
381   }
382   return 1;
383 }
384 
fuzzy_compare_spell_id(const uint16 * id1,uint16 len1,const UserDictSearchable * searchable)385 int UserDict::fuzzy_compare_spell_id(
386     const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
387   if (len1 < searchable->splids_len)
388     return -1;
389   if (len1 > searchable->splids_len)
390     return 1;
391 
392   SpellingTrie &spl_trie = SpellingTrie::get_instance();
393   uint32 i = 0;
394   for (i = 0; i < len1; i++) {
395     const char py1 = *spl_trie.get_spelling_str(id1[i]);
396     uint16 off = 8 * (i % 4);
397     const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
398     if (py1 == py2)
399       continue;
400     if (py1 > py2)
401       return 1;
402     return -1;
403   }
404   return 0;
405 }
406 
is_prefix_spell_id(const uint16 * fullids,uint16 fulllen,const UserDictSearchable * searchable)407 bool UserDict::is_prefix_spell_id(
408     const uint16 * fullids, uint16 fulllen,
409     const UserDictSearchable *searchable) {
410   if (fulllen < searchable->splids_len)
411     return false;
412 
413   uint32 i = 0;
414   for (; i < searchable->splids_len; i++) {
415     uint16 start_id = searchable->splid_start[i];
416     uint16 count = searchable->splid_count[i];
417     if (fullids[i] >= start_id && fullids[i] < start_id + count)
418       continue;
419     else
420       return false;
421   }
422   return true;
423 }
424 
equal_spell_id(const uint16 * fullids,uint16 fulllen,const UserDictSearchable * searchable)425 bool UserDict::equal_spell_id(
426     const uint16 * fullids, uint16 fulllen,
427     const UserDictSearchable *searchable) {
428   if (fulllen != searchable->splids_len)
429     return false;
430 
431   uint32 i = 0;
432   for (; i < fulllen; i++) {
433     uint16 start_id = searchable->splid_start[i];
434     uint16 count = searchable->splid_count[i];
435     if (fullids[i] >= start_id && fullids[i] < start_id + count)
436       continue;
437     else
438       return false;
439   }
440   return true;
441 }
442 
locate_first_in_offsets(const UserDictSearchable * searchable)443 int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
444   int32 begin = 0;
445   int32 end = dict_info_.lemma_count - 1;
446   int32 middle = -1;
447 
448   int32 first_prefix = middle;
449   int32 last_matched = middle;
450 
451   while (begin <= end) {
452     middle = (begin + end) >> 1;
453     uint32 offset = offsets_[middle];
454     uint8 nchar = get_lemma_nchar(offset);
455     const uint16 * splids = get_lemma_spell_ids(offset);
456     int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
457     int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);
458 
459     if (pre)
460       first_prefix = middle;
461 
462     if (cmp < 0) {
463       begin = middle + 1;
464     } else if (cmp > 0) {
465       end = middle - 1;
466     } else {
467       end = middle - 1;
468       last_matched = middle;
469     }
470   }
471 
472   return first_prefix;
473 }
474 
prepare_locate(UserDictSearchable * searchable,const uint16 * splid_str,uint16 splid_str_len)475 void UserDict::prepare_locate(UserDictSearchable *searchable,
476                              const uint16 *splid_str,
477                              uint16 splid_str_len) {
478   searchable->splids_len = splid_str_len;
479   memset(searchable->signature, 0, sizeof(searchable->signature));
480 
481   SpellingTrie &spl_trie = SpellingTrie::get_instance();
482   uint32 i = 0;
483   for (; i < splid_str_len; i++) {
484     if (spl_trie.is_half_id(splid_str[i])) {
485       searchable->splid_count[i] =
486           spl_trie.half_to_full(splid_str[i],
487                                 &(searchable->splid_start[i]));
488     } else {
489       searchable->splid_count[i] = 1;
490       searchable->splid_start[i] = splid_str[i];
491     }
492     const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
493     searchable->signature[i>>2] |= (py << (8 * (i % 4)));
494   }
495 }
496 
get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lpi_items,size_t lpi_max)497 size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
498                           LmaPsbItem *lpi_items, size_t lpi_max) {
499   return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
500 }
501 
_get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lpi_items,size_t lpi_max,bool * need_extend)502 size_t UserDict::_get_lpis(const uint16 *splid_str,
503                            uint16 splid_str_len, LmaPsbItem *lpi_items,
504                            size_t lpi_max, bool * need_extend) {
505   bool tmp_extend;
506   if (!need_extend)
507     need_extend = &tmp_extend;
508 
509   *need_extend = false;
510 
511   if (is_valid_state() == false)
512     return 0;
513   if (lpi_max <= 0)
514     return 0;
515 
516   if (0 == pthread_mutex_trylock(&g_mutex_)) {
517     if (load_time_.tv_sec < g_last_update_.tv_sec ||
518       (load_time_.tv_sec == g_last_update_.tv_sec &&
519        load_time_.tv_usec < g_last_update_.tv_usec)) {
520       // Others updated disk file, have to reload
521       pthread_mutex_unlock(&g_mutex_);
522       flush_cache();
523     } else {
524       pthread_mutex_unlock(&g_mutex_);
525     }
526   } else {
527   }
528 
529   UserDictSearchable searchable;
530   prepare_locate(&searchable, splid_str, splid_str_len);
531 
532   uint32 max_off = dict_info_.lemma_count;
533 #ifdef ___CACHE_ENABLED___
534   int32 middle;
535   uint32 start, count;
536   bool cached = cache_hit(&searchable, &start, &count);
537   if (cached) {
538     middle = start;
539     max_off = start + count;
540   } else {
541     middle = locate_first_in_offsets(&searchable);
542     start = middle;
543   }
544 #else
545   int32 middle = locate_first_in_offsets(&searchable);
546 #endif
547 
548   if (middle == -1) {
549 #ifdef ___CACHE_ENABLED___
550     if (!cached)
551       cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
552 #endif
553     return 0;
554   }
555 
556   size_t lpi_current = 0;
557 
558   bool fuzzy_break = false;
559   bool prefix_break = false;
560   while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
561     if (lpi_current >= lpi_max)
562       break;
563     uint32 offset = offsets_[middle];
564     // Ignore deleted lemmas
565     if (offset & kUserDictOffsetFlagRemove) {
566       middle++;
567       continue;
568     }
569     uint8 nchar = get_lemma_nchar(offset);
570     uint16 * splids = get_lemma_spell_ids(offset);
571 #ifdef ___CACHE_ENABLED___
572     if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
573 #else
574     if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
575 #endif
576       fuzzy_break = true;
577     }
578 
579     if (prefix_break == false) {
580       if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
581         if (*need_extend == false &&
582             is_prefix_spell_id(splids, nchar, &searchable)) {
583           *need_extend = true;
584         }
585       } else {
586         prefix_break = true;
587       }
588     }
589 
590     if (equal_spell_id(splids, nchar, &searchable) == true) {
591       lpi_items[lpi_current].psb = translate_score(scores_[middle]);
592       lpi_items[lpi_current].id = ids_[middle];
593       lpi_items[lpi_current].lma_len = nchar;
594       lpi_current++;
595     }
596     middle++;
597   }
598 
599 #ifdef ___CACHE_ENABLED___
600   if (!cached) {
601     count = middle - start;
602     cache_push(USER_DICT_CACHE, &searchable, start, count);
603   }
604 #endif
605 
606   return lpi_current;
607 }
608 
609 uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
610                                uint16 str_max) {
611   if (is_valid_state() == false)
612     return 0;
613   if (is_valid_lemma_id(id_lemma) == false)
614     return 0;
615   uint32 offset = offsets_by_id_[id_lemma - start_id_];
616   uint8 nchar = get_lemma_nchar(offset);
617   char16 * str = get_lemma_word(offset);
618   uint16 m = nchar < str_max -1 ? nchar : str_max - 1;
619   int i = 0;
620   for (; i < m; i++) {
621     str_buf[i] = str[i];
622   }
623   str_buf[i] = 0;
624   return m;
625 }
626 
627 uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
628                                   uint16 splids_max, bool arg_valid) {
629   if (is_valid_lemma_id(id_lemma) == false)
630     return 0;
631   uint32 offset = offsets_by_id_[id_lemma - start_id_];
632   uint8 nchar = get_lemma_nchar(offset);
633   const uint16 * ids = get_lemma_spell_ids(offset);
634   int i = 0;
635   for (; i < nchar && i < splids_max; i++)
636     splids[i] = ids[i];
637   return i;
638 }
639 
640 size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
641                          NPredictItem *npre_items, size_t npre_max,
642                          size_t b4_used) {
643   uint32 new_added = 0;
644 #ifdef ___PREDICT_ENABLED___
645   int32 end = dict_info_.lemma_count - 1;
646   int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
647   if (j == -1)
648     return 0;
649 
650   while (j <= end) {
651     uint32 offset = predicts_[j];
652     // Ignore deleted lemmas
653     if (offset & kUserDictOffsetFlagRemove) {
654       j++;
655       continue;
656     }
657     uint32 nchar = get_lemma_nchar(offset);
658     uint16 * words = get_lemma_word(offset);
659     uint16 * splids = get_lemma_spell_ids(offset);
660 
661     if (nchar <= hzs_len) {
662       j++;
663       continue;
664     }
665 
666     if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
667       if (new_added >= npre_max) {
668         return new_added;
669       }
670       uint32 cpy_len =
671           (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
672           - (hzs_len << 1);
673       npre_items[new_added].his_len = hzs_len;
674       npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
675       memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
676       if ((cpy_len >> 1) < kMaxPredictSize) {
677         npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
678       }
679       new_added++;
680     } else {
681       break;
682     }
683 
684     j++;
685   }
686 #endif
687   return new_added;
688 }
689 
690 int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
691                                   uint16 lemma_len) {
692   int32 max_off = dict_info_.lemma_count;
693 
694   UserDictSearchable searchable;
695   prepare_locate(&searchable, splid_str, lemma_len);
696 #ifdef ___CACHE_ENABLED___
697   int32 off;
698   uint32 start, count;
699   bool cached = load_cache(&searchable, &start, &count);
700   if (cached) {
701     off = start;
702     max_off = start + count;
703   } else {
704     off = locate_first_in_offsets(&searchable);
705     start = off;
706   }
707 #else
708   int32 off = locate_first_in_offsets(&searchable);
709 #endif
710 
711   if (off == -1) {
712     return off;
713   }
714 
715   while (off < max_off) {
716     uint32 offset = offsets_[off];
717     if (offset & kUserDictOffsetFlagRemove) {
718       off++;
719       continue;
720     }
721     uint16 * splids = get_lemma_spell_ids(offset);
722 #ifdef ___CACHE_ENABLED___
723     if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
724       break;
725 #else
726     if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
727       break;
728 #endif
729     if (equal_spell_id(splids, lemma_len, &searchable) == true) {
730       uint16 * str = get_lemma_word(offset);
731       uint32 i = 0;
732       for (i = 0; i < lemma_len; i++) {
733         if (str[i] == lemma_str[i])
734           continue;
735         break;
736       }
737       if (i < lemma_len) {
738         off++;
739         continue;
740       }
741 #ifdef ___CACHE_ENABLED___
742       // No need to save_cache here, since current function is invoked by
743       // put_lemma. It's rarely possible for a user input same lemma twice.
744       // That means first time user type a new lemma, it is newly added into
745       // user dictionary, then it's possible that user type the same lemma
746       // again.
747       // Another reason save_cache can not be invoked here is this function
748       // aborts when lemma is found, and it never knows the count.
749 #endif
750       return off;
751     }
752     off++;
753   }
754 
755   return -1;
756 }
757 
758 #ifdef ___PREDICT_ENABLED___
759 uint32 UserDict::locate_where_to_insert_in_predicts(
760     const uint16 * words, int lemma_len) {
761   int32 begin = 0;
762   int32 end = dict_info_.lemma_count - 1;
763   int32 middle = end;
764 
765   uint32 last_matched = middle;
766 
767   while (begin <= end) {
768     middle = (begin + end) >> 1;
769     uint32 offset = offsets_[middle];
770     uint8 nchar = get_lemma_nchar(offset);
771     const uint16 * ws = get_lemma_word(offset);
772 
773     uint32 minl = nchar < lemma_len ? nchar : lemma_len;
774     uint32 k = 0;
775     int cmp = 0;
776 
777     for (; k < minl; k++) {
778       if (ws[k] < words[k]) {
779         cmp = -1;
780         break;
781       } else if (ws[k] > words[k]) {
782         cmp = 1;
783         break;
784       }
785     }
786     if (cmp == 0) {
787       if (nchar < lemma_len)
788         cmp = -1;
789       else if (nchar > lemma_len)
790         cmp = 1;
791     }
792 
793     if (cmp < 0) {
794       begin = middle + 1;
795       last_matched = middle;
796     } else if (cmp > 0) {
797       end = middle - 1;
798     } else {
799       end = middle - 1;
800       last_matched = middle;
801     }
802   }
803 
804   return last_matched;
805 }
806 
807 int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
808   int32 begin = 0;
809   int32 end = dict_info_.lemma_count - 1;
810   int32 middle = -1;
811 
812   int32 last_matched = middle;
813 
814   while (begin <= end) {
815     middle = (begin + end) >> 1;
816     uint32 offset = offsets_[middle];
817     uint8 nchar = get_lemma_nchar(offset);
818     const uint16 * ws = get_lemma_word(offset);
819 
820     uint32 minl = nchar < lemma_len ? nchar : lemma_len;
821     uint32 k = 0;
822     int cmp = 0;
823 
824     for (; k < minl; k++) {
825       if (ws[k] < words[k]) {
826         cmp = -1;
827         break;
828       } else if (ws[k] > words[k]) {
829         cmp = 1;
830         break;
831       }
832     }
833     if (cmp == 0) {
834       if (nchar >= lemma_len)
835         last_matched = middle;
836       if (nchar < lemma_len)
837         cmp = -1;
838       else if (nchar > lemma_len)
839         cmp = 1;
840     }
841 
842     if (cmp < 0) {
843       begin = middle + 1;
844     } else if (cmp > 0) {
845       end = middle - 1;
846     } else {
847       end = middle - 1;
848     }
849   }
850 
851   return last_matched;
852 }
853 
854 #endif
855 
856 LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
857                                    uint16 lemma_len) {
858   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
859   if (off == -1) {
860     return 0;
861   }
862 
863   return ids_[off];
864 }
865 
866 LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
867   if (is_valid_state() == false)
868     return 0;
869   if (is_valid_lemma_id(lemma_id) == false)
870     return 0;
871 
872   return translate_score(_get_lemma_score(lemma_id));
873 }
874 
875 LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
876                                 uint16 lemma_len) {
877   if (is_valid_state() == false)
878     return 0;
879   return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
880 }
881 
882 int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
883   if (is_valid_state() == false)
884     return 0;
885   if (is_valid_lemma_id(lemma_id) == false)
886     return 0;
887 
888   uint32 offset = offsets_by_id_[lemma_id - start_id_];
889 
890   uint32 nchar = get_lemma_nchar(offset);
891   uint16 * spl = get_lemma_spell_ids(offset);
892   uint16 * wrd = get_lemma_word(offset);
893 
894   int32 off = locate_in_offsets(wrd, spl, nchar);
895   if (off == -1) {
896     return 0;
897   }
898 
899   return scores_[off];
900 }
901 
902 int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
903                                 uint16 lemma_len) {
904   if (is_valid_state() == false)
905     return 0;
906 
907   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
908   if (off == -1) {
909     return 0;
910   }
911 
912   return scores_[off];
913 }
914 
915 #ifdef ___SYNC_ENABLED___
916 void UserDict::remove_lemma_from_sync_list(uint32 offset) {
917   offset &= kUserDictOffsetMask;
918   uint32 i = 0;
919   for (; i < dict_info_.sync_count; i++) {
920     unsigned int off = (syncs_[i] & kUserDictOffsetMask);
921     if (off == offset)
922       break;
923   }
924   if (i < dict_info_.sync_count) {
925     syncs_[i] = syncs_[dict_info_.sync_count - 1];
926     dict_info_.sync_count--;
927   }
928 }
929 #endif
930 
931 #ifdef ___PREDICT_ENABLED___
932 void UserDict::remove_lemma_from_predict_list(uint32 offset) {
933   offset &= kUserDictOffsetMask;
934   uint32 i = 0;
935   for (; i < dict_info_.lemma_count; i++) {
936     unsigned int off = (predicts_[i] & kUserDictOffsetMask);
937     if (off == offset) {
938       predicts_[i] |= kUserDictOffsetFlagRemove;
939       break;
940     }
941   }
942 }
943 #endif
944 
945 bool UserDict::remove_lemma_by_offset_index(int offset_index) {
946   if (is_valid_state() == false)
947     return 0;
948 
949   int32 off = offset_index;
950   if (off == -1) {
951     return false;
952   }
953 
954   uint32 offset = offsets_[off];
955   uint32 nchar = get_lemma_nchar(offset);
956 
957   offsets_[off] |= kUserDictOffsetFlagRemove;
958 
959 #ifdef ___SYNC_ENABLED___
960   // Remove corresponding sync item
961   remove_lemma_from_sync_list(offset);
962 #endif
963 
964 #ifdef ___PREDICT_ENABLED___
965   remove_lemma_from_predict_list(offset);
966 #endif
967   dict_info_.free_count++;
968   dict_info_.free_size += (2 + (nchar << 2));
969 
970   if (state_ < USER_DICT_OFFSET_DIRTY)
971     state_ = USER_DICT_OFFSET_DIRTY;
972   return true;
973 }
974 
975 bool UserDict::remove_lemma(LemmaIdType lemma_id) {
976   if (is_valid_state() == false)
977     return 0;
978   if (is_valid_lemma_id(lemma_id) == false)
979     return false;
980   uint32 offset = offsets_by_id_[lemma_id - start_id_];
981 
982   uint32 nchar = get_lemma_nchar(offset);
983   uint16 * spl = get_lemma_spell_ids(offset);
984   uint16 * wrd = get_lemma_word(offset);
985 
986   int32 off = locate_in_offsets(wrd, spl, nchar);
987 
988   return remove_lemma_by_offset_index(off);
989 }
990 
991 void UserDict::flush_cache() {
992   LemmaIdType start_id = start_id_;
993   const char * file = strdup(dict_file_);
994   if (!file)
995     return;
996   close_dict();
997   load_dict(file, start_id, kUserDictIdEnd);
998   free((void*)file);
999 #ifdef ___CACHE_ENABLED___
1000   cache_init();
1001 #endif
1002   return;
1003 }
1004 
1005 bool UserDict::reset(const char *file) {
1006   FILE *fp = fopen(file, "w+");
1007   if (!fp) {
1008     return false;
1009   }
1010   uint32 version = kUserDictVersion;
1011   size_t wred = fwrite(&version, 1, 4, fp);
1012   UserDictInfo info;
1013   memset(&info, 0, sizeof(info));
1014   // By default, no limitation for lemma count and size
1015   // thereby, reclaim_ratio is never used
1016   wred += fwrite(&info, 1, sizeof(info), fp);
1017   if (wred != sizeof(info) + sizeof(version)) {
1018     fclose(fp);
1019     unlink(file);
1020     return false;
1021   }
1022   fclose(fp);
1023   return true;
1024 }
1025 
1026 bool UserDict::validate(const char *file) {
1027   // b is ignored in POSIX compatible os including Linux
1028   // while b is important flag for Windows to specify binary mode
1029   FILE *fp = fopen(file, "rb");
1030   if (!fp) {
1031     return false;
1032   }
1033 
1034   size_t size;
1035   size_t readed;
1036   uint32 version;
1037   UserDictInfo dict_info;
1038 
1039   // validate
1040   int err = fseek(fp, 0, SEEK_END);
1041   if (err) {
1042     goto error;
1043   }
1044 
1045   size = ftell(fp);
1046   if (size < 4 + sizeof(dict_info)) {
1047     goto error;
1048   }
1049 
1050   err = fseek(fp, 0, SEEK_SET);
1051   if (err) {
1052     goto error;
1053   }
1054 
1055   readed = fread(&version, 1, sizeof(version), fp);
1056   if (readed < sizeof(version)) {
1057     goto error;
1058   }
1059   if (version != kUserDictVersion) {
1060     goto error;
1061   }
1062 
1063   err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1064   if (err) {
1065     goto error;
1066   }
1067 
1068   readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1069   if (readed != sizeof(dict_info)) {
1070     goto error;
1071   }
1072 
1073   if (size != get_dict_file_size(&dict_info)) {
1074     goto error;
1075   }
1076 
1077   fclose(fp);
1078   return true;
1079 
1080  error:
1081   fclose(fp);
1082   return false;
1083 }
1084 
1085 bool UserDict::load(const char *file, LemmaIdType start_id) {
1086   if (0 != pthread_mutex_trylock(&g_mutex_)) {
1087     return false;
1088   }
1089   // b is ignored in POSIX compatible os including Linux
1090   // while b is important flag for Windows to specify binary mode
1091   FILE *fp = fopen(file, "rb");
1092   if (!fp) {
1093     pthread_mutex_unlock(&g_mutex_);
1094     return false;
1095   }
1096 
1097   size_t readed, toread;
1098   UserDictInfo dict_info;
1099   uint8 *lemmas = NULL;
1100   uint32 *offsets = NULL;
1101 #ifdef ___SYNC_ENABLED___
1102   uint32 *syncs = NULL;
1103 #endif
1104   uint32 *scores = NULL;
1105   uint32 *ids = NULL;
1106   uint32 *offsets_by_id = NULL;
1107 #ifdef ___PREDICT_ENABLED___
1108   uint32 *predicts = NULL;
1109 #endif
1110   size_t i;
1111   int err;
1112 
1113   err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1114   if (err) goto error;
1115 
1116   readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1117   if (readed != sizeof(dict_info)) goto error;
1118 
1119   lemmas = (uint8 *)malloc(
1120       dict_info.lemma_size +
1121       (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));
1122 
1123   if (!lemmas) goto error;
1124 
1125   offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1126   if (!offsets) goto error;
1127 
1128 #ifdef ___PREDICT_ENABLED___
1129   predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1130   if (!predicts) goto error;
1131 #endif
1132 
1133 #ifdef ___SYNC_ENABLED___
1134   syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
1135   if (!syncs) goto error;
1136 #endif
1137 
1138   scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1139   if (!scores) goto error;
1140 
1141   ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1142   if (!ids) goto error;
1143 
1144   offsets_by_id = (uint32 *)malloc(
1145       (dict_info.lemma_count + kUserDictPreAlloc) << 2);
1146   if (!offsets_by_id) goto error;
1147 
1148   err = fseek(fp, 4, SEEK_SET);
1149   if (err) goto error;
1150 
1151   readed = 0;
1152   while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
1153     readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
1154   }
1155   if (readed < dict_info.lemma_size)
1156     goto error;
1157 
1158   toread = (dict_info.lemma_count << 2);
1159   readed = 0;
1160   while (readed < toread && !ferror(fp) && !feof(fp)) {
1161     readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
1162   }
1163   if (readed < toread)
1164     goto error;
1165 
1166 #ifdef ___PREDICT_ENABLED___
1167   toread = (dict_info.lemma_count << 2);
1168   readed = 0;
1169   while (readed < toread && !ferror(fp) && !feof(fp)) {
1170     readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
1171   }
1172   if (readed < toread)
1173     goto error;
1174 #endif
1175 
1176   readed = 0;
1177   while (readed < toread && !ferror(fp) && !feof(fp)) {
1178     readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
1179   }
1180   if (readed < toread)
1181     goto error;
1182 
1183 #ifdef ___SYNC_ENABLED___
1184   toread = (dict_info.sync_count << 2);
1185   readed = 0;
1186   while (readed < toread && !ferror(fp) && !feof(fp)) {
1187     readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
1188   }
1189   if (readed < toread)
1190     goto error;
1191 #endif
1192 
1193   for (i = 0; i < dict_info.lemma_count; i++) {
1194     ids[i] = start_id + i;
1195     offsets_by_id[i] = offsets[i];
1196   }
1197 
1198   lemmas_ = lemmas;
1199   offsets_ = offsets;
1200 #ifdef ___SYNC_ENABLED___
1201   syncs_ = syncs;
1202   sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
1203 #endif
1204   offsets_by_id_ = offsets_by_id;
1205   scores_ = scores;
1206   ids_ = ids;
1207 #ifdef ___PREDICT_ENABLED___
1208   predicts_ = predicts;
1209 #endif
1210   lemma_count_left_ = kUserDictPreAlloc;
1211   lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
1212   memcpy(&dict_info_, &dict_info, sizeof(dict_info));
1213   state_ = USER_DICT_SYNC;
1214 
1215   fclose(fp);
1216 
1217   pthread_mutex_unlock(&g_mutex_);
1218   return true;
1219 
1220  error:
1221   if (lemmas) free(lemmas);
1222   if (offsets) free(offsets);
1223 #ifdef ___SYNC_ENABLED___
1224   if (syncs) free(syncs);
1225 #endif
1226   if (scores) free(scores);
1227   if (ids) free(ids);
1228   if (offsets_by_id) free(offsets_by_id);
1229 #ifdef ___PREDICT_ENABLED___
1230   if (predicts) free(predicts);
1231 #endif
1232   fclose(fp);
1233   pthread_mutex_unlock(&g_mutex_);
1234   return false;
1235 }
1236 
1237 void UserDict::write_back() {
1238   // XXX write back is only allowed from close_dict due to thread-safe sake
1239   if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
1240     return;
1241   int fd = open(dict_file_, O_WRONLY);
1242   if (fd == -1)
1243     return;
1244   switch (state_) {
1245     case USER_DICT_DEFRAGMENTED:
1246       write_back_all(fd);
1247       break;
1248     case USER_DICT_LEMMA_DIRTY:
1249       write_back_lemma(fd);
1250       break;
1251     case USER_DICT_OFFSET_DIRTY:
1252       write_back_offset(fd);
1253       break;
1254     case USER_DICT_SCORE_DIRTY:
1255       write_back_score(fd);
1256       break;
1257 #ifdef ___SYNC_ENABLED___
1258     case USER_DICT_SYNC_DIRTY:
1259       write_back_sync(fd);
1260       break;
1261 #endif
1262     default:
1263       break;
1264   }
1265   // It seems truncate is not need on Linux, Windows except Mac
1266   // I am doing it here anyway for safety.
1267   off_t cur = lseek(fd, 0, SEEK_CUR);
1268   ftruncate(fd, cur);
1269   close(fd);
1270   state_ = USER_DICT_SYNC;
1271 }
1272 
1273 #ifdef ___SYNC_ENABLED___
1274 void UserDict::write_back_sync(int fd) {
1275   int err = lseek(fd, 4 + dict_info_.lemma_size
1276                   + (dict_info_.lemma_count << 3)
1277 #ifdef ___PREDICT_ENABLED___
1278                   + (dict_info_.lemma_count << 2)
1279 #endif
1280                   , SEEK_SET);
1281   if (err == -1)
1282     return;
1283   write(fd, syncs_, dict_info_.sync_count << 2);
1284   write(fd, &dict_info_, sizeof(dict_info_));
1285 }
1286 #endif
1287 
1288 void UserDict::write_back_offset(int fd) {
1289   int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
1290   if (err == -1)
1291     return;
1292   write(fd, offsets_, dict_info_.lemma_count << 2);
1293 #ifdef ___PREDICT_ENABLED___
1294   write(fd, predicts_, dict_info_.lemma_count << 2);
1295 #endif
1296   write(fd, scores_, dict_info_.lemma_count << 2);
1297 #ifdef ___SYNC_ENABLED___
1298   write(fd, syncs_, dict_info_.sync_count << 2);
1299 #endif
1300   write(fd, &dict_info_, sizeof(dict_info_));
1301 }
1302 
1303 void UserDict::write_back_score(int fd) {
1304   int err = lseek(fd, 4 + dict_info_.lemma_size
1305                   + (dict_info_.lemma_count << 2)
1306 #ifdef ___PREDICT_ENABLED___
1307                   + (dict_info_.lemma_count << 2)
1308 #endif
1309                   , SEEK_SET);
1310   if (err == -1)
1311     return;
1312   write(fd, scores_, dict_info_.lemma_count << 2);
1313 #ifdef ___SYNC_ENABLED___
1314   write(fd, syncs_, dict_info_.sync_count << 2);
1315 #endif
1316   write(fd, &dict_info_, sizeof(dict_info_));
1317 }
1318 
1319 void UserDict::write_back_lemma(int fd) {
1320   int err = lseek(fd, 4, SEEK_SET);
1321   if (err == -1)
1322     return;
1323   // New lemmas are always appended, no need to write whole lemma block
1324   size_t need_write = kUserDictPreAlloc *
1325       (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
1326   err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
1327   if (err == -1)
1328     return;
1329   write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);
1330 
1331   write(fd, offsets_,  dict_info_.lemma_count << 2);
1332 #ifdef ___PREDICT_ENABLED___
1333   write(fd, predicts_,  dict_info_.lemma_count << 2);
1334 #endif
1335   write(fd, scores_, dict_info_.lemma_count << 2);
1336 #ifdef ___SYNC_ENABLED___
1337   write(fd, syncs_, dict_info_.sync_count << 2);
1338 #endif
1339   write(fd, &dict_info_, sizeof(dict_info_));
1340 }
1341 
1342 void UserDict::write_back_all(int fd) {
1343   // XXX lemma_size is handled differently in writeall
1344   // and writelemma. I update lemma_size and lemma_count in different
1345   // places for these two cases. Should fix it to make it consistent.
1346   int err = lseek(fd, 4, SEEK_SET);
1347   if (err == -1)
1348     return;
1349   write(fd, lemmas_, dict_info_.lemma_size);
1350   write(fd, offsets_, dict_info_.lemma_count << 2);
1351 #ifdef ___PREDICT_ENABLED___
1352   write(fd, predicts_, dict_info_.lemma_count << 2);
1353 #endif
1354   write(fd, scores_, dict_info_.lemma_count << 2);
1355 #ifdef ___SYNC_ENABLED___
1356   write(fd, syncs_, dict_info_.sync_count << 2);
1357 #endif
1358   write(fd, &dict_info_, sizeof(dict_info_));
1359 }
1360 
1361 #ifdef ___CACHE_ENABLED___
1362 bool UserDict::load_cache(UserDictSearchable *searchable,
1363                           uint32 *offset, uint32 *length) {
1364   UserDictCache *cache = &caches_[searchable->splids_len - 1];
1365   if (cache->head == cache->tail)
1366     return false;
1367 
1368   uint16 j, sig_len = kMaxLemmaSize / 4;
1369   uint16 i = cache->head;
1370   while (1) {
1371     j = 0;
1372     for (; j < sig_len; j++) {
1373       if (cache->signatures[i][j] != searchable->signature[j])
1374         break;
1375     }
1376     if (j < sig_len) {
1377       i++;
1378       if (i >= kUserDictCacheSize)
1379         i -= kUserDictCacheSize;
1380       if (i == cache->tail)
1381         break;
1382       continue;
1383     }
1384     *offset = cache->offsets[i];
1385     *length = cache->lengths[i];
1386     return true;
1387   }
1388   return false;
1389 }
1390 
1391 void UserDict::save_cache(UserDictSearchable *searchable,
1392                           uint32 offset, uint32 length) {
1393   UserDictCache *cache = &caches_[searchable->splids_len - 1];
1394   uint16 next = cache->tail;
1395 
1396   cache->offsets[next] = offset;
1397   cache->lengths[next] = length;
1398   uint16 sig_len = kMaxLemmaSize / 4;
1399   uint16 j = 0;
1400   for (; j < sig_len; j++) {
1401     cache->signatures[next][j] = searchable->signature[j];
1402   }
1403 
1404   if (++next >= kUserDictCacheSize) {
1405     next -= kUserDictCacheSize;
1406   }
1407   if (next == cache->head) {
1408     cache->head++;
1409     if (cache->head >= kUserDictCacheSize) {
1410       cache->head -= kUserDictCacheSize;
1411     }
1412   }
1413   cache->tail = next;
1414 }
1415 
1416 void UserDict::reset_cache() {
1417   memset(caches_, 0, sizeof(caches_));
1418 }
1419 
1420 bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
1421   UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1422   if (cache->head == cache->tail)
1423     return false;
1424 
1425   uint16 j, sig_len = kMaxLemmaSize / 4;
1426   uint16 i = cache->head;
1427   while (1) {
1428     j = 0;
1429     for (; j < sig_len; j++) {
1430       if (cache->signatures[i][j] != searchable->signature[j])
1431         break;
1432     }
1433     if (j < sig_len) {
1434       i++;
1435       if (i >= kUserDictMissCacheSize)
1436         i -= kUserDictMissCacheSize;
1437       if (i == cache->tail)
1438         break;
1439       continue;
1440     }
1441     return true;
1442   }
1443   return false;
1444 }
1445 
1446 void UserDict::save_miss_cache(UserDictSearchable *searchable) {
1447   UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1448   uint16 next = cache->tail;
1449 
1450   uint16 sig_len = kMaxLemmaSize / 4;
1451   uint16 j = 0;
1452   for (; j < sig_len; j++) {
1453     cache->signatures[next][j] = searchable->signature[j];
1454   }
1455 
1456   if (++next >= kUserDictMissCacheSize) {
1457     next -= kUserDictMissCacheSize;
1458   }
1459   if (next == cache->head) {
1460     cache->head++;
1461     if (cache->head >= kUserDictMissCacheSize) {
1462       cache->head -= kUserDictMissCacheSize;
1463     }
1464   }
1465   cache->tail = next;
1466 }
1467 
1468 void UserDict::reset_miss_cache() {
1469   memset(miss_caches_, 0, sizeof(miss_caches_));
1470 }
1471 
1472 void UserDict::cache_init() {
1473   reset_cache();
1474   reset_miss_cache();
1475 }
1476 
1477 bool UserDict::cache_hit(UserDictSearchable *searchable,
1478                          uint32 *offset, uint32 *length) {
1479   bool hit = load_miss_cache(searchable);
1480   if (hit) {
1481     *offset = 0;
1482     *length = 0;
1483     return true;
1484   }
1485   hit = load_cache(searchable, offset, length);
1486   if (hit) {
1487     return true;
1488   }
1489   return false;
1490 }
1491 
1492 void UserDict::cache_push(UserDictCacheType type,
1493                          UserDictSearchable *searchable,
1494                          uint32 offset, uint32 length) {
1495   switch (type) {
1496     case USER_DICT_MISS_CACHE:
1497       save_miss_cache(searchable);
1498       break;
1499     case USER_DICT_CACHE:
1500       save_cache(searchable, offset, length);
1501       break;
1502     default:
1503       break;
1504   }
1505 }
1506 
1507 #endif
1508 
1509 void UserDict::defragment(void) {
1510 #ifdef ___DEBUG_PERF___
1511   DEBUG_PERF_BEGIN;
1512 #endif
1513   if (is_valid_state() == false)
1514     return;
1515   // Fixup offsets_, set REMOVE flag to lemma's flag if needed
1516   size_t first_freed = 0;
1517   size_t first_inuse = 0;
1518   while (first_freed < dict_info_.lemma_count) {
1519     // Find first freed offset
1520     while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1521             first_freed < dict_info_.lemma_count) {
1522       first_freed++;
1523     }
1524     if (first_freed < dict_info_.lemma_count) {
1525       // Save REMOVE flag to lemma flag
1526       int off = offsets_[first_freed];
1527       set_lemma_flag(off, kUserDictLemmaFlagRemove);
1528     } else {
1529       break;
1530     }
1531     // Find first inuse offse after first_freed
1532     first_inuse = first_freed + 1;
1533     while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) &&
1534            (first_inuse < dict_info_.lemma_count)) {
1535       // Save REMOVE flag to lemma flag
1536       int off = offsets_[first_inuse];
1537       set_lemma_flag(off, kUserDictLemmaFlagRemove);
1538       first_inuse++;
1539     }
1540     if (first_inuse >= dict_info_.lemma_count) {
1541       break;
1542     }
1543     // Swap offsets_
1544     int tmp = offsets_[first_inuse];
1545     offsets_[first_inuse] = offsets_[first_freed];
1546     offsets_[first_freed] = tmp;
1547     // Move scores_, no need to swap
1548     tmp = scores_[first_inuse];
1549     scores_[first_inuse] = scores_[first_freed];
1550     scores_[first_freed] = tmp;
1551     // Swap ids_
1552     LemmaIdType tmpid = ids_[first_inuse];
1553     ids_[first_inuse] = ids_[first_freed];
1554     ids_[first_freed] = tmpid;
1555     // Go on
1556     first_freed++;
1557   }
1558 #ifdef ___PREDICT_ENABLED___
1559   // Fixup predicts_
1560   first_freed = 0;
1561   first_inuse = 0;
1562   while (first_freed < dict_info_.lemma_count) {
1563     // Find first freed offset
1564     while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1565             first_freed < dict_info_.lemma_count) {
1566       first_freed++;
1567     }
1568     if (first_freed >= dict_info_.lemma_count)
1569       break;
1570     // Find first inuse offse after first_freed
1571     first_inuse = first_freed + 1;
1572     while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove)
1573            && (first_inuse < dict_info_.lemma_count)) {
1574       first_inuse++;
1575     }
1576     if (first_inuse >= dict_info_.lemma_count) {
1577       break;
1578     }
1579     // Swap offsets_
1580     int tmp = predicts_[first_inuse];
1581     predicts_[first_inuse] = predicts_[first_freed];
1582     predicts_[first_freed] = tmp;
1583     // Go on
1584     first_freed++;
1585   }
1586 #endif
1587   dict_info_.lemma_count = first_freed;
1588   // Fixup lemmas_
1589   size_t begin = 0;
1590   size_t end = 0;
1591   size_t dst = 0;
1592   int total_size = dict_info_.lemma_size + lemma_size_left_;
1593   int total_count = dict_info_.lemma_count + lemma_count_left_;
1594   size_t real_size = total_size - lemma_size_left_;
1595   while (dst < real_size) {
1596     unsigned char flag = get_lemma_flag(dst);
1597     unsigned char nchr = get_lemma_nchar(dst);
1598     if ((flag & kUserDictLemmaFlagRemove) == 0) {
1599       dst += nchr * 4 + 2;
1600       continue;
1601     }
1602     break;
1603   }
1604   if (dst >= real_size)
1605     return;
1606 
1607   end = dst;
1608   while (end < real_size) {
1609     begin = end + get_lemma_nchar(end) * 4 + 2;
1610  repeat:
1611     // not used any more
1612     if (begin >= real_size)
1613       break;
1614     unsigned char flag = get_lemma_flag(begin);
1615     unsigned char nchr = get_lemma_nchar(begin);
1616     if (flag & kUserDictLemmaFlagRemove) {
1617       begin += nchr * 4 + 2;
1618       goto repeat;
1619     }
1620     end = begin + nchr * 4 + 2;
1621     while (end < real_size) {
1622       unsigned char eflag = get_lemma_flag(end);
1623       unsigned char enchr = get_lemma_nchar(end);
1624       if ((eflag & kUserDictLemmaFlagRemove) == 0) {
1625         end += enchr * 4 + 2;
1626         continue;
1627       }
1628       break;
1629     }
1630     memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
1631     for (size_t j = 0; j < dict_info_.lemma_count; j++) {
1632       if (offsets_[j] >= begin && offsets_[j] < end) {
1633         offsets_[j] -= (begin - dst);
1634         offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
1635       }
1636 #ifdef ___PREDICT_ENABLED___
1637       if (predicts_[j] >= begin && predicts_[j] < end) {
1638         predicts_[j] -= (begin - dst);
1639       }
1640 #endif
1641     }
1642 #ifdef ___SYNC_ENABLED___
1643     for (size_t j = 0; j < dict_info_.sync_count; j++) {
1644       if (syncs_[j] >= begin && syncs_[j] < end) {
1645         syncs_[j] -= (begin - dst);
1646       }
1647     }
1648 #endif
1649     dst += (end - begin);
1650   }
1651 
1652   dict_info_.free_count = 0;
1653   dict_info_.free_size = 0;
1654   dict_info_.lemma_size = dst;
1655   lemma_size_left_ = total_size - dict_info_.lemma_size;
1656   lemma_count_left_ = total_count - dict_info_.lemma_count;
1657 
1658   // XXX Without following code,
1659   // offsets_by_id_ is not reordered.
1660   // That's to say, all removed lemmas' ids are not collected back.
1661   // There may not be room for addition of new lemmas due to
1662   // offsests_by_id_ reason, although lemma_size_left_ is fixed.
1663   // By default, we do want defrag as fast as possible, because
1664   // during defrag procedure, other peers can not write new lemmas
1665   // to user dictionary file.
1666   // XXX If write-back is invoked immediately after
1667   // this defragment, no need to fix up following in-mem data.
1668   for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
1669     ids_[i] = start_id_ + i;
1670     offsets_by_id_[i] = offsets_[i];
1671   }
1672 
1673   state_ = USER_DICT_DEFRAGMENTED;
1674 
1675 #ifdef ___DEBUG_PERF___
1676   DEBUG_PERF_END;
1677   LOGD_PERF("defragment");
1678 #endif
1679 }
1680 
1681 #ifdef ___SYNC_ENABLED___
1682 void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
1683   if (is_valid_state() == false)
1684     return;
1685   if (end > dict_info_.sync_count)
1686     end = dict_info_.sync_count;
1687   memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
1688   dict_info_.sync_count -= (end - start);
1689   if (state_ < USER_DICT_SYNC_DIRTY)
1690     state_ = USER_DICT_SYNC_DIRTY;
1691 }
1692 
1693 int UserDict::get_sync_count() {
1694   if (is_valid_state() == false)
1695     return 0;
1696   return dict_info_.sync_count;
1697 }
1698 
1699 LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
1700                         uint16 lemma_len, uint16 count, uint64 lmt) {
1701   int again = 0;
1702  begin:
1703   LemmaIdType id;
1704   uint32 * syncs_bak = syncs_;
1705   syncs_ = NULL;
1706   id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
1707   syncs_ = syncs_bak;
1708   if (id == 0 && again == 0) {
1709     if ((dict_info_.limit_lemma_count > 0 &&
1710         dict_info_.lemma_count >= dict_info_.limit_lemma_count)
1711         || (dict_info_.limit_lemma_size > 0 &&
1712             dict_info_.lemma_size + (2 + (lemma_len << 2))
1713             > dict_info_.limit_lemma_size)) {
1714       // XXX Always reclaim and defrag in sync code path
1715       //     sync thread is background thread and ok with heavy work
1716       reclaim();
1717       defragment();
1718       flush_cache();
1719       again = 1;
1720       goto begin;
1721     }
1722   }
1723   return id;
1724 }
1725 
1726 int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
1727   int newly_added = 0;
1728 
1729   SpellingParser * spl_parser = new SpellingParser();
1730   if (!spl_parser) {
1731     return 0;
1732   }
1733 #ifdef ___DEBUG_PERF___
1734   DEBUG_PERF_BEGIN;
1735 #endif
1736   char16 *ptr = lemmas;
1737 
1738   // Extract pinyin,words,frequence,last_mod_time
1739   char16 * p = ptr, * py16 = ptr;
1740   char16 * hz16 = NULL;
1741   int py16_len = 0;
1742   uint16 splid[kMaxLemmaSize];
1743   int splid_len = 0;
1744   int hz16_len = 0;
1745   char16 * fr16 = NULL;
1746   int fr16_len = 0;
1747 
1748   while (p - ptr < len) {
1749     // Pinyin
1750     py16 = p;
1751     splid_len = 0;
1752     while (*p != 0x2c && (p - ptr) < len) {
1753       if (*p == 0x20)
1754         splid_len++;
1755       p++;
1756     }
1757     splid_len++;
1758     if (p - ptr == len)
1759       break;
1760     py16_len = p - py16;
1761     if (kMaxLemmaSize < splid_len) {
1762       break;
1763     }
1764     bool is_pre;
1765     int splidl = spl_parser->splstr16_to_idxs_f(
1766         py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
1767     if (splidl != splid_len)
1768       break;
1769     // Phrase
1770     hz16 = ++p;
1771     while (*p != 0x2c && (p - ptr) < len) {
1772       p++;
1773     }
1774     hz16_len = p - hz16;
1775     if (hz16_len != splid_len)
1776       break;
1777     // Frequency
1778     fr16 = ++p;
1779     fr16_len = 0;
1780     while (*p != 0x2c && (p - ptr) < len) {
1781       p++;
1782     }
1783     fr16_len = p - fr16;
1784     uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
1785     // Last modified time
1786     fr16 = ++p;
1787     fr16_len = 0;
1788     while (*p != 0x3b && (p - ptr) < len) {
1789       p++;
1790     }
1791     fr16_len = p - fr16;
1792     uint64 last_mod = utf16le_atoll(fr16, fr16_len);
1793 
1794     put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
1795     newly_added++;
1796 
1797     p++;
1798   }
1799 
1800 #ifdef ___DEBUG_PERF___
1801   DEBUG_PERF_END;
1802   LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
1803 #endif
1804   return newly_added;
1805 }
1806 
1807 int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
1808     char16 * str, int size, int * count) {
1809   int len = 0;
1810   *count = 0;
1811 
1812   int left_len = size;
1813 
1814   if (is_valid_state() == false)
1815     return len;
1816 
1817   SpellingTrie * spl_trie = &SpellingTrie::get_instance();
1818   if (!spl_trie) {
1819     return 0;
1820   }
1821 
1822   uint32 i;
1823   for (i = 0; i < dict_info_.sync_count; i++) {
1824     int offset = syncs_[i];
1825     uint32 nchar = get_lemma_nchar(offset);
1826     uint16 *spl = get_lemma_spell_ids(offset);
1827     uint16 *wrd = get_lemma_word(offset);
1828     int score = _get_lemma_score(wrd, spl, nchar);
1829 
1830     static char score_temp[32], *pscore_temp = score_temp;
1831     static char16 temp[256], *ptemp = temp;
1832 
1833     pscore_temp = score_temp;
1834     ptemp = temp;
1835 
1836     uint32 j;
1837     // Add pinyin
1838     for (j = 0; j < nchar; j++) {
1839       int ret_len = spl_trie->get_spelling_str16(
1840           spl[j], ptemp, temp + sizeof(temp) - ptemp);
1841       if (ret_len <= 0)
1842         break;
1843       ptemp += ret_len;
1844       if (ptemp < temp + sizeof(temp) - 1) {
1845         *(ptemp++) = ' ';
1846       } else {
1847         j = 0;
1848         break;
1849       }
1850     }
1851     if (j < nchar) {
1852       continue;
1853     }
1854     ptemp--;
1855     if (ptemp < temp + sizeof(temp) - 1) {
1856       *(ptemp++) = ',';
1857     } else {
1858       continue;
1859     }
1860     // Add phrase
1861     for (j = 0; j < nchar; j++) {
1862       if (ptemp < temp + sizeof(temp) - 1) {
1863         *(ptemp++) = wrd[j];
1864       } else {
1865         break;
1866       }
1867     }
1868     if (j < nchar) {
1869       continue;
1870     }
1871     if (ptemp < temp + sizeof(temp) - 1) {
1872       *(ptemp++) = ',';
1873     } else {
1874       continue;
1875     }
1876     // Add frequency
1877     uint32 intf = extract_score_freq(score);
1878     int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp);
1879     if (ret_len <= 0)
1880       continue;
1881     ptemp += ret_len;
1882     if (ptemp < temp + sizeof(temp) - 1) {
1883       *(ptemp++) = ',';
1884     } else {
1885       continue;
1886     }
1887     // Add last modified time
1888     uint64 last_mod = extract_score_lmt(score);
1889     ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp);
1890     if (ret_len <= 0)
1891       continue;
1892     ptemp += ret_len;
1893     if (ptemp < temp + sizeof(temp) - 1) {
1894       *(ptemp++) = ';';
1895     } else {
1896       continue;
1897     }
1898 
1899     // Write to string
1900     int need_len = ptemp - temp;
1901     if (need_len > left_len)
1902       break;
1903     memcpy(str + len, temp, need_len * 2);
1904     left_len -= need_len;
1905 
1906     len += need_len;
1907     (*count)++;
1908   }
1909 
1910   if (len > 0) {
1911     if (state_ < USER_DICT_SYNC_DIRTY)
1912       state_ = USER_DICT_SYNC_DIRTY;
1913   }
1914   return len;
1915 }
1916 
1917 #endif
1918 
1919 bool UserDict::state(UserDictStat * stat) {
1920   if (is_valid_state() == false)
1921     return false;
1922   if (!stat)
1923     return false;
1924   stat->version = version_;
1925   stat->file_name = dict_file_;
1926   stat->load_time.tv_sec = load_time_.tv_sec;
1927   stat->load_time.tv_usec = load_time_.tv_usec;
1928   pthread_mutex_lock(&g_mutex_);
1929   stat->last_update.tv_sec = g_last_update_.tv_sec;
1930   stat->last_update.tv_usec = g_last_update_.tv_usec;
1931   pthread_mutex_unlock(&g_mutex_);
1932   stat->disk_size = get_dict_file_size(&dict_info_);
1933   stat->lemma_count = dict_info_.lemma_count;
1934   stat->lemma_size = dict_info_.lemma_size;
1935   stat->delete_count = dict_info_.free_count;
1936   stat->delete_size = dict_info_.free_size;
1937 #ifdef ___SYNC_ENABLED___
1938   stat->sync_count = dict_info_.sync_count;
1939 #endif
1940   stat->limit_lemma_count = dict_info_.limit_lemma_count;
1941   stat->limit_lemma_size = dict_info_.limit_lemma_size;
1942   stat->reclaim_ratio = dict_info_.reclaim_ratio;
1943   return true;
1944 }
1945 
1946 void UserDict::set_limit(uint32 max_lemma_count,
1947                          uint32 max_lemma_size, uint32 reclaim_ratio) {
1948   dict_info_.limit_lemma_count = max_lemma_count;
1949   dict_info_.limit_lemma_size = max_lemma_size;
1950   if (reclaim_ratio > 100)
1951     reclaim_ratio = 100;
1952   dict_info_.reclaim_ratio = reclaim_ratio;
1953 }
1954 
1955 void UserDict::reclaim() {
1956   if (is_valid_state() == false)
1957     return;
1958 
1959   switch (dict_info_.reclaim_ratio) {
1960     case 0:
1961       return;
1962     case 100:
1963       // TODO: CLEAR to be implemented
1964       assert(false);
1965       return;
1966     default:
1967       break;
1968   }
1969 
1970   // XXX Reclaim is only based on count, not size
1971   uint32 count = dict_info_.lemma_count;
1972   int rc = count * dict_info_.reclaim_ratio / 100;
1973 
1974   UserDictScoreOffsetPair * score_offset_pairs = NULL;
1975   score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
1976       sizeof(UserDictScoreOffsetPair) * rc);
1977   if (score_offset_pairs == NULL) {
1978     return;
1979   }
1980 
1981   for (int i = 0; i < rc; i++) {
1982     int s = scores_[i];
1983     score_offset_pairs[i].score = s;
1984     score_offset_pairs[i].offset_index = i;
1985   }
1986 
1987   for (int i = (rc + 1) / 2; i >= 0; i--)
1988     shift_down(score_offset_pairs, i, rc);
1989 
1990   for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
1991     int s = scores_[i];
1992     if (s < score_offset_pairs[0].score) {
1993       score_offset_pairs[0].score = s;
1994       score_offset_pairs[0].offset_index = i;
1995       shift_down(score_offset_pairs, 0, rc);
1996     }
1997   }
1998 
1999   for (int i = 0; i < rc; i++) {
2000     int off = score_offset_pairs[i].offset_index;
2001     remove_lemma_by_offset_index(off);
2002   }
2003   if (rc > 0) {
2004     if (state_ < USER_DICT_OFFSET_DIRTY)
2005       state_ = USER_DICT_OFFSET_DIRTY;
2006   }
2007 
2008   free(score_offset_pairs);
2009 }
2010 
2011 inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
2012   int s = sop[i].score;
2013   int p = sop[i].offset_index;
2014   sop[i].score = sop[j].score;
2015   sop[i].offset_index = sop[j].offset_index;
2016   sop[j].score = s;
2017   sop[j].offset_index = p;
2018 }
2019 
2020 void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
2021   int par = i;
2022   while (par < n) {
2023     int left = par * 2 + 1;
2024     int right = left + 1;
2025     if (left >= n && right >= n)
2026       break;
2027     if (right >= n) {
2028       if (sop[left].score > sop[par].score) {
2029         swap(sop, left, par);
2030         par = left;
2031         continue;
2032       }
2033     } else if (sop[left].score > sop[right].score &&
2034                sop[left].score > sop[par].score) {
2035       swap(sop, left, par);
2036       par = left;
2037       continue;
2038     } else if (sop[right].score > sop[left].score &&
2039                sop[right].score > sop[par].score) {
2040       swap(sop, right, par);
2041       par = right;
2042       continue;
2043     }
2044     break;
2045   }
2046 }
2047 
2048 LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
2049                                 uint16 lemma_len, uint16 count) {
2050   return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL));
2051 }
2052 
2053 LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
2054                                  uint16 lemma_len, uint16 count, uint64 lmt) {
2055 #ifdef ___DEBUG_PERF___
2056   DEBUG_PERF_BEGIN;
2057 #endif
2058   if (is_valid_state() == false)
2059     return 0;
2060   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2061   if (off != -1) {
2062     int delta_score = count - scores_[off];
2063     dict_info_.total_nfreq += delta_score;
2064     scores_[off] = build_score(lmt, count);
2065     if (state_ < USER_DICT_SCORE_DIRTY)
2066       state_ = USER_DICT_SCORE_DIRTY;
2067 #ifdef ___DEBUG_PERF___
2068     DEBUG_PERF_END;
2069     LOGD_PERF("_put_lemma(update)");
2070 #endif
2071     return ids_[off];
2072   } else {
2073     if ((dict_info_.limit_lemma_count > 0 &&
2074         dict_info_.lemma_count >= dict_info_.limit_lemma_count)
2075         || (dict_info_.limit_lemma_size > 0 &&
2076             dict_info_.lemma_size + (2 + (lemma_len << 2))
2077             > dict_info_.limit_lemma_size)) {
2078       // XXX Don't defragment here, it's too time-consuming.
2079       return 0;
2080     }
2081     int flushed = 0;
2082     if (lemma_count_left_ == 0 ||
2083         lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {
2084 
2085       // XXX When there is no space for new lemma, we flush to disk
2086       // flush_cache() may be called by upper user
2087       // and better place shoule be found instead of here
2088       flush_cache();
2089       flushed = 1;
2090       // Or simply return and do nothing
2091       // return 0;
2092     }
2093 #ifdef ___DEBUG_PERF___
2094     DEBUG_PERF_END;
2095     LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
2096 #endif
2097     LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
2098 #ifdef ___SYNC_ENABLED___
2099     if (syncs_ && id != 0) {
2100       queue_lemma_for_sync(id);
2101     }
2102 #endif
2103     return id;
2104   }
2105   return 0;
2106 }
2107 
2108 #ifdef ___SYNC_ENABLED___
2109 void UserDict::queue_lemma_for_sync(LemmaIdType id) {
2110   if (dict_info_.sync_count < sync_count_size_) {
2111     syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2112   } else {
2113     uint32 * syncs = (uint32*)realloc(
2114         syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
2115     if (syncs) {
2116       sync_count_size_ += kUserDictPreAlloc;
2117       syncs_ = syncs;
2118       syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2119     }
2120   }
2121 }
2122 #endif
2123 
2124 LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
2125                                    bool selected) {
2126 #ifdef ___DEBUG_PERF___
2127   DEBUG_PERF_BEGIN;
2128 #endif
2129   if (is_valid_state() == false)
2130     return 0;
2131   if (is_valid_lemma_id(lemma_id) == false)
2132     return 0;
2133   uint32 offset = offsets_by_id_[lemma_id - start_id_];
2134   uint8 lemma_len = get_lemma_nchar(offset);
2135   char16 * lemma_str = get_lemma_word(offset);
2136   uint16 * splids = get_lemma_spell_ids(offset);
2137 
2138   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2139   if (off != -1) {
2140     int score = scores_[off];
2141     int count = extract_score_freq(score);
2142     uint64 lmt = extract_score_lmt(score);
2143     if (count + delta_count > kUserDictMaxFrequency ||
2144         count + delta_count < count) {
2145       delta_count = kUserDictMaxFrequency - count;
2146     }
2147     count += delta_count;
2148     dict_info_.total_nfreq += delta_count;
2149     if (selected) {
2150       lmt = time(NULL);
2151     }
2152     scores_[off] = build_score(lmt, count);
2153     if (state_ < USER_DICT_SCORE_DIRTY)
2154       state_ = USER_DICT_SCORE_DIRTY;
2155 #ifdef ___DEBUG_PERF___
2156     DEBUG_PERF_END;
2157     LOGD_PERF("update_lemma");
2158 #endif
2159 #ifdef ___SYNC_ENABLED___
2160     queue_lemma_for_sync(ids_[off]);
2161 #endif
2162     return ids_[off];
2163   }
2164   return 0;
2165 }
2166 
2167 size_t UserDict::get_total_lemma_count() {
2168   return dict_info_.total_nfreq;
2169 }
2170 
2171 void UserDict::set_total_lemma_count_of_others(size_t count) {
2172   total_other_nfreq_ = count;
2173 }
2174 
2175 LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
2176                                    uint16 lemma_len, uint16 count, uint64 lmt) {
2177   LemmaIdType id = get_max_lemma_id() + 1;
2178   size_t offset = dict_info_.lemma_size;
2179   if (offset > kUserDictOffsetMask)
2180     return 0;
2181 
2182   lemmas_[offset] = 0;
2183   lemmas_[offset + 1] = (uint8)lemma_len;
2184   for (size_t i = 0; i < lemma_len; i++) {
2185     *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
2186     *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
2187         = lemma_str[i];
2188   }
2189   uint32 off = dict_info_.lemma_count;
2190   offsets_[off] = offset;
2191   scores_[off] = build_score(lmt, count);
2192   ids_[off] = id;
2193 #ifdef ___PREDICT_ENABLED___
2194   predicts_[off] = offset;
2195 #endif
2196 
2197   offsets_by_id_[id - start_id_] = offset;
2198 
2199   dict_info_.lemma_count++;
2200   dict_info_.lemma_size += (2 + (lemma_len << 2));
2201   lemma_count_left_--;
2202   lemma_size_left_ -= (2 + (lemma_len << 2));
2203 
2204   // Sort
2205 
2206   UserDictSearchable searchable;
2207   prepare_locate(&searchable, splids, lemma_len);
2208 
2209   size_t i = 0;
2210   while (i < off) {
2211     offset = offsets_[i];
2212     uint32 nchar = get_lemma_nchar(offset);
2213     uint16 * spl = get_lemma_spell_ids(offset);
2214 
2215     if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
2216       break;
2217     i++;
2218   }
2219   if (i != off) {
2220     uint32 temp = offsets_[off];
2221     memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
2222     offsets_[i] = temp;
2223 
2224     temp = scores_[off];
2225     memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
2226     scores_[i] = temp;
2227 
2228     temp = ids_[off];
2229     memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
2230     ids_[i] = temp;
2231   }
2232 
2233 #ifdef ___PREDICT_ENABLED___
2234   uint32 j = 0;
2235   uint16 * words_new = get_lemma_word(predicts_[off]);
2236   j = locate_where_to_insert_in_predicts(words_new, lemma_len);
2237   if (j != off) {
2238     uint32 temp = predicts_[off];
2239     memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
2240     predicts_[j] = temp;
2241   }
2242 #endif
2243 
2244   if (state_ < USER_DICT_LEMMA_DIRTY)
2245     state_ = USER_DICT_LEMMA_DIRTY;
2246 
2247 #ifdef ___CACHE_ENABLED___
2248   cache_init();
2249 #endif
2250 
2251   dict_info_.total_nfreq += count;
2252   return id;
2253 }
2254 }
2255