• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2   * Copyright (C) 2009 The Android Open Source Project
3   *
4   * Licensed under the Apache License, Version 2.0 (the "License");
5   * you may not use this file except in compliance with the License.
6   * You may obtain a copy of the License at
7   *
8   *      http://www.apache.org/licenses/LICENSE-2.0
9   *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  #include "../include/userdict.h"
18  #include "../include/splparser.h"
19  #include "../include/ngram.h"
20  #include <stdio.h>
21  #include <string.h>
22  #include <stdlib.h>
23  #include <cutils/log.h>
24  #include <unistd.h>
25  #include <fcntl.h>
26  #include <sys/stat.h>
27  #include <assert.h>
28  #include <ctype.h>
29  #include <sys/types.h>
30  #include <sys/time.h>
31  #include <time.h>
32  #include <pthread.h>
33  #include <math.h>
34  
35  namespace ime_pinyin {
36  
37  #ifdef ___DEBUG_PERF___
38  static uint64 _ellapse_ = 0;
39  static struct timeval _tv_start_, _tv_end_;
40  #define DEBUG_PERF_BEGIN \
41      do { \
42        gettimeofday(&_tv_start_, NULL); \
43      } while(0)
44  #define DEBUG_PERF_END \
45      do { \
46        gettimeofday(&_tv_end_, NULL); \
47        _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
48                    (_tv_end_.tv_usec - _tv_start_.tv_usec); \
49      } while(0)
50  #define LOGD_PERF(message) \
51      LOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
52  #else
53  #define DEBUG_PERF_BEGIN
54  #define DEBUG_PERF_END
55  #define LOGD_PERF(message)
56  #endif
57  
58  // XXX File load and write are thread-safe by g_mutex_
59  static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
60  static struct timeval g_last_update_ = {0, 0};
61  
get_dict_file_size(UserDictInfo * info)62  inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
63    return (4 + info->lemma_size + (info->lemma_count << 3)
64  #ifdef ___PREDICT_ENABLED___
65            + (info->lemma_count << 2)
66  #endif
67  #ifdef ___SYNC_ENABLED___
68            + (info->sync_count << 2)
69  #endif
70            + sizeof(*info));
71  }
72  
translate_score(int raw_score)73  inline LmaScoreType UserDict::translate_score(int raw_score) {
74    // 1) ori_freq: original user frequency
75    uint32 ori_freq = extract_score_freq(raw_score);
76    // 2) lmt_off: lmt index (week offset for example)
77    uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
78    if (kUserDictLMTBitWidth < 16) {
79      uint64 mask = ~(1 << kUserDictLMTBitWidth);
80      lmt_off &= mask;
81    }
82    // 3) now_off: current time index (current week offset for example)
83    // assuming load_time_ is around current time
84    uint64 now_off = load_time_.tv_sec;
85    now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
86    now_off = (now_off << (64 - kUserDictLMTBitWidth));
87    now_off = (now_off >> (64 - kUserDictLMTBitWidth));
88    // 4) factor: decide expand-factor
89    int delta = now_off - lmt_off;
90    if (delta > 4)
91      delta = 4;
92    int factor = 80 - (delta << 4);
93  
94    double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
95    return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
96                          * NGram::kLogValueAmplifier);
97  }
98  
extract_score_freq(int raw_score)99  inline int UserDict::extract_score_freq(int raw_score) {
100    // Frequence stored in lowest 16 bits
101    int freq = (raw_score & 0x0000ffff);
102    return freq;
103  }
104  
extract_score_lmt(int raw_score)105  inline uint64 UserDict::extract_score_lmt(int raw_score) {
106    uint64 lmt = ((raw_score & 0xffff0000) >> 16);
107    if (kUserDictLMTBitWidth < 16) {
108      uint64 mask = ~(1 << kUserDictLMTBitWidth);
109      lmt &= mask;
110    }
111    lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
112    return lmt;
113  }
114  
build_score(uint64 lmt,int freq)115  inline int UserDict::build_score(uint64 lmt, int freq) {
116    lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
117    lmt = (lmt << (64 - kUserDictLMTBitWidth));
118    lmt = (lmt >> (64 - kUserDictLMTBitWidth));
119    uint16 lmt16 = (uint16)lmt;
120    int s = freq;
121    s &= 0x0000ffff;
122    s = (lmt16 << 16) | s;
123    return s;
124  }
125  
utf16le_atoll(uint16 * s,int len)126  inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
127    int64 ret = 0;
128    if (len <= 0)
129      return ret;
130  
131    int flag = 1;
132    const uint16 * endp = s + len;
133    if (*s == '-') {
134      flag = -1;
135      s++;
136    } else if (*s == '+') {
137      s++;
138    }
139  
140    while (*s >= '0' && *s <= '9' && s < endp) {
141      ret += ret * 10 + (*s) - '0';
142      s++;
143    }
144    return ret * flag;
145  }
146  
utf16le_lltoa(int64 v,uint16 * s,int size)147  inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
148    if (!s || size <= 0)
149      return 0;
150    uint16 *endp = s + size;
151    int ret_len = 0;
152    if (v < 0) {
153      *(s++) = '-';
154      ++ret_len;
155      v *= -1;
156    }
157  
158    uint16 *b = s;
159    while (s < endp && v != 0) {
160      *(s++) = '0' + (v % 10);
161      v = v / 10;
162      ++ret_len;
163    }
164  
165    if (v != 0)
166      return 0;
167  
168    --s;
169  
170    while (b < s) {
171      *b = *s;
172      ++b, --s;
173    }
174  
175    return ret_len;
176  }
177  
set_lemma_flag(uint32 offset,uint8 flag)178  inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
179    offset &= kUserDictOffsetMask;
180    lemmas_[offset] |= flag;
181  }
182  
get_lemma_flag(uint32 offset)183  inline char UserDict::get_lemma_flag(uint32 offset) {
184    offset &= kUserDictOffsetMask;
185    return (char)(lemmas_[offset]);
186  }
187  
get_lemma_nchar(uint32 offset)188  inline char UserDict::get_lemma_nchar(uint32 offset) {
189    offset &= kUserDictOffsetMask;
190    return (char)(lemmas_[offset + 1]);
191  }
192  
get_lemma_spell_ids(uint32 offset)193  inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
194    offset &= kUserDictOffsetMask;
195    return (uint16 *)(lemmas_ + offset + 2);
196  }
197  
get_lemma_word(uint32 offset)198  inline uint16 * UserDict::get_lemma_word(uint32 offset) {
199    offset &= kUserDictOffsetMask;
200    uint8 nchar = get_lemma_nchar(offset);
201    return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
202  }
203  
get_max_lemma_id()204  inline LemmaIdType UserDict::get_max_lemma_id() {
205    // When a lemma is deleted, we don't not claim its id back for
206    // simplicity and performance
207    return start_id_ + dict_info_.lemma_count - 1;
208  }
209  
is_valid_lemma_id(LemmaIdType id)210  inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
211    if (id >= start_id_ && id <= get_max_lemma_id())
212      return true;
213    return false;
214  }
215  
is_valid_state()216  inline bool UserDict::is_valid_state() {
217    if (state_ == USER_DICT_NONE)
218      return false;
219    return true;
220  }
221  
UserDict()222  UserDict::UserDict()
223      : start_id_(0),
224        version_(0),
225        lemmas_(NULL),
226        offsets_(NULL),
227        scores_(NULL),
228        ids_(NULL),
229  #ifdef ___PREDICT_ENABLED___
230        predicts_(NULL),
231  #endif
232  #ifdef ___SYNC_ENABLED___
233        syncs_(NULL),
234        sync_count_size_(0),
235  #endif
236        offsets_by_id_(NULL),
237        lemma_count_left_(0),
238        lemma_size_left_(0),
239        dict_file_(NULL),
240        state_(USER_DICT_NONE) {
241    memset(&dict_info_, 0, sizeof(dict_info_));
242    memset(&load_time_, 0, sizeof(load_time_));
243  #ifdef ___CACHE_ENABLED___
244    cache_init();
245  #endif
246  }
247  
~UserDict()248  UserDict::~UserDict() {
249    close_dict();
250  }
251  
load_dict(const char * file_name,LemmaIdType start_id,LemmaIdType end_id)252  bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
253                           LemmaIdType end_id) {
254  #ifdef ___DEBUG_PERF___
255    DEBUG_PERF_BEGIN;
256  #endif
257    dict_file_ = strdup(file_name);
258    if (!dict_file_)
259      return false;
260  
261    start_id_ = start_id;
262  
263    if (false == validate(file_name) && false == reset(file_name)) {
264      goto error;
265    }
266    if (false == load(file_name, start_id)) {
267      goto error;
268    }
269  
270    state_ = USER_DICT_SYNC;
271  
272    gettimeofday(&load_time_, NULL);
273  
274  #ifdef ___DEBUG_PERF___
275    DEBUG_PERF_END;
276    LOGD_PERF("load_dict");
277  #endif
278    return true;
279   error:
280    free((void*)dict_file_);
281    start_id_ = 0;
282    return false;
283  }
284  
close_dict()285  bool UserDict::close_dict() {
286    if (state_ == USER_DICT_NONE)
287      return true;
288    if (state_ == USER_DICT_SYNC)
289      goto out;
290  
291    // If dictionary is written back by others,
292    // we can not simply write back here
293    // To do a safe flush, we have to discard all newly added
294    // lemmas and try to reload dict file.
295    pthread_mutex_lock(&g_mutex_);
296    if (load_time_.tv_sec > g_last_update_.tv_sec ||
297      (load_time_.tv_sec == g_last_update_.tv_sec &&
298       load_time_.tv_usec > g_last_update_.tv_usec)) {
299      write_back();
300      gettimeofday(&g_last_update_, NULL);
301    }
302    pthread_mutex_unlock(&g_mutex_);
303  
304   out:
305    free((void*)dict_file_);
306    free(lemmas_);
307    free(offsets_);
308    free(offsets_by_id_);
309    free(scores_);
310    free(ids_);
311  #ifdef ___PREDICT_ENABLED___
312    free(predicts_);
313  #endif
314  
315    version_ = 0;
316    dict_file_ = NULL;
317    lemmas_ = NULL;
318  #ifdef ___SYNC_ENABLED___
319    syncs_ = NULL;
320    sync_count_size_ = 0;
321  #endif
322    offsets_ = NULL;
323    offsets_by_id_ = NULL;
324    scores_ = NULL;
325    ids_ = NULL;
326  #ifdef ___PREDICT_ENABLED___
327    predicts_ = NULL;
328  #endif
329  
330    memset(&dict_info_, 0, sizeof(dict_info_));
331    lemma_count_left_ = 0;
332    lemma_size_left_ = 0;
333    state_ = USER_DICT_NONE;
334  
335    return true;
336  }
337  
number_of_lemmas()338  size_t UserDict::number_of_lemmas() {
339    return dict_info_.lemma_count;
340  }
341  
reset_milestones(uint16 from_step,MileStoneHandle from_handle)342  void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
343    return;
344  }
345  
extend_dict(MileStoneHandle from_handle,const DictExtPara * dep,LmaPsbItem * lpi_items,size_t lpi_max,size_t * lpi_num)346  MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
347                                        const DictExtPara *dep,
348                                        LmaPsbItem *lpi_items,
349                                        size_t lpi_max, size_t *lpi_num) {
350    if (is_valid_state() == false)
351      return 0;
352  
353    bool need_extend = false;
354  
355  #ifdef ___DEBUG_PERF___
356    DEBUG_PERF_BEGIN;
357  #endif
358    *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
359                         lpi_items, lpi_max, &need_extend);
360  #ifdef ___DEBUG_PERF___
361    DEBUG_PERF_END;
362    LOGD_PERF("extend_dict");
363  #endif
364    return ((*lpi_num > 0 || need_extend) ? 1 : 0);
365  }
366  
is_fuzzy_prefix_spell_id(const uint16 * id1,uint16 len1,const UserDictSearchable * searchable)367  int UserDict::is_fuzzy_prefix_spell_id(
368      const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
369    if (len1 < searchable->splids_len)
370      return 0;
371  
372    SpellingTrie &spl_trie = SpellingTrie::get_instance();
373    uint32 i = 0;
374    for (i = 0; i < searchable->splids_len; i++) {
375      const char py1 = *spl_trie.get_spelling_str(id1[i]);
376      uint16 off = 8 * (i % 4);
377      const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
378      if (py1 == py2)
379        continue;
380      return 0;
381    }
382    return 1;
383  }
384  
fuzzy_compare_spell_id(const uint16 * id1,uint16 len1,const UserDictSearchable * searchable)385  int UserDict::fuzzy_compare_spell_id(
386      const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
387    if (len1 < searchable->splids_len)
388      return -1;
389    if (len1 > searchable->splids_len)
390      return 1;
391  
392    SpellingTrie &spl_trie = SpellingTrie::get_instance();
393    uint32 i = 0;
394    for (i = 0; i < len1; i++) {
395      const char py1 = *spl_trie.get_spelling_str(id1[i]);
396      uint16 off = 8 * (i % 4);
397      const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
398      if (py1 == py2)
399        continue;
400      if (py1 > py2)
401        return 1;
402      return -1;
403    }
404    return 0;
405  }
406  
is_prefix_spell_id(const uint16 * fullids,uint16 fulllen,const UserDictSearchable * searchable)407  bool UserDict::is_prefix_spell_id(
408      const uint16 * fullids, uint16 fulllen,
409      const UserDictSearchable *searchable) {
410    if (fulllen < searchable->splids_len)
411      return false;
412  
413    uint32 i = 0;
414    for (; i < searchable->splids_len; i++) {
415      uint16 start_id = searchable->splid_start[i];
416      uint16 count = searchable->splid_count[i];
417      if (fullids[i] >= start_id && fullids[i] < start_id + count)
418        continue;
419      else
420        return false;
421    }
422    return true;
423  }
424  
equal_spell_id(const uint16 * fullids,uint16 fulllen,const UserDictSearchable * searchable)425  bool UserDict::equal_spell_id(
426      const uint16 * fullids, uint16 fulllen,
427      const UserDictSearchable *searchable) {
428    if (fulllen != searchable->splids_len)
429      return false;
430  
431    uint32 i = 0;
432    for (; i < fulllen; i++) {
433      uint16 start_id = searchable->splid_start[i];
434      uint16 count = searchable->splid_count[i];
435      if (fullids[i] >= start_id && fullids[i] < start_id + count)
436        continue;
437      else
438        return false;
439    }
440    return true;
441  }
442  
locate_first_in_offsets(const UserDictSearchable * searchable)443  int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
444    int32 begin = 0;
445    int32 end = dict_info_.lemma_count - 1;
446    int32 middle = -1;
447  
448    int32 first_prefix = middle;
449    int32 last_matched = middle;
450  
451    while (begin <= end) {
452      middle = (begin + end) >> 1;
453      uint32 offset = offsets_[middle];
454      uint8 nchar = get_lemma_nchar(offset);
455      const uint16 * splids = get_lemma_spell_ids(offset);
456      int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
457      int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);
458  
459      if (pre)
460        first_prefix = middle;
461  
462      if (cmp < 0) {
463        begin = middle + 1;
464      } else if (cmp > 0) {
465        end = middle - 1;
466      } else {
467        end = middle - 1;
468        last_matched = middle;
469      }
470    }
471  
472    return first_prefix;
473  }
474  
prepare_locate(UserDictSearchable * searchable,const uint16 * splid_str,uint16 splid_str_len)475  void UserDict::prepare_locate(UserDictSearchable *searchable,
476                               const uint16 *splid_str,
477                               uint16 splid_str_len) {
478    searchable->splids_len = splid_str_len;
479    memset(searchable->signature, 0, sizeof(searchable->signature));
480  
481    SpellingTrie &spl_trie = SpellingTrie::get_instance();
482    uint32 i = 0;
483    for (; i < splid_str_len; i++) {
484      if (spl_trie.is_half_id(splid_str[i])) {
485        searchable->splid_count[i] =
486            spl_trie.half_to_full(splid_str[i],
487                                  &(searchable->splid_start[i]));
488      } else {
489        searchable->splid_count[i] = 1;
490        searchable->splid_start[i] = splid_str[i];
491      }
492      const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
493      searchable->signature[i>>2] |= (py << (8 * (i % 4)));
494    }
495  }
496  
get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lpi_items,size_t lpi_max)497  size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
498                            LmaPsbItem *lpi_items, size_t lpi_max) {
499    return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
500  }
501  
_get_lpis(const uint16 * splid_str,uint16 splid_str_len,LmaPsbItem * lpi_items,size_t lpi_max,bool * need_extend)502  size_t UserDict::_get_lpis(const uint16 *splid_str,
503                             uint16 splid_str_len, LmaPsbItem *lpi_items,
504                             size_t lpi_max, bool * need_extend) {
505    bool tmp_extend;
506    if (!need_extend)
507      need_extend = &tmp_extend;
508  
509    *need_extend = false;
510  
511    if (is_valid_state() == false)
512      return 0;
513    if (lpi_max <= 0)
514      return 0;
515  
516    if (0 == pthread_mutex_trylock(&g_mutex_)) {
517      if (load_time_.tv_sec < g_last_update_.tv_sec ||
518        (load_time_.tv_sec == g_last_update_.tv_sec &&
519         load_time_.tv_usec < g_last_update_.tv_usec)) {
520        // Others updated disk file, have to reload
521        pthread_mutex_unlock(&g_mutex_);
522        flush_cache();
523      } else {
524        pthread_mutex_unlock(&g_mutex_);
525      }
526    } else {
527    }
528  
529    UserDictSearchable searchable;
530    prepare_locate(&searchable, splid_str, splid_str_len);
531  
532    uint32 max_off = dict_info_.lemma_count;
533  #ifdef ___CACHE_ENABLED___
534    int32 middle;
535    uint32 start, count;
536    bool cached = cache_hit(&searchable, &start, &count);
537    if (cached) {
538      middle = start;
539      max_off = start + count;
540    } else {
541      middle = locate_first_in_offsets(&searchable);
542      start = middle;
543    }
544  #else
545    int32 middle = locate_first_in_offsets(&searchable);
546  #endif
547  
548    if (middle == -1) {
549  #ifdef ___CACHE_ENABLED___
550      if (!cached)
551        cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
552  #endif
553      return 0;
554    }
555  
556    size_t lpi_current = 0;
557  
558    bool fuzzy_break = false;
559    bool prefix_break = false;
560    while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
561      if (lpi_current >= lpi_max)
562        break;
563      uint32 offset = offsets_[middle];
564      // Ignore deleted lemmas
565      if (offset & kUserDictOffsetFlagRemove) {
566        middle++;
567        continue;
568      }
569      uint8 nchar = get_lemma_nchar(offset);
570      uint16 * splids = get_lemma_spell_ids(offset);
571  #ifdef ___CACHE_ENABLED___
572      if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
573  #else
574      if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
575  #endif
576        fuzzy_break = true;
577      }
578  
579      if (prefix_break == false) {
580        if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
581          if (*need_extend == false &&
582              is_prefix_spell_id(splids, nchar, &searchable)) {
583            *need_extend = true;
584          }
585        } else {
586          prefix_break = true;
587        }
588      }
589  
590      if (equal_spell_id(splids, nchar, &searchable) == true) {
591        lpi_items[lpi_current].psb = translate_score(scores_[middle]);
592        lpi_items[lpi_current].id = ids_[middle];
593        lpi_items[lpi_current].lma_len = nchar;
594        lpi_current++;
595      }
596      middle++;
597    }
598  
599  #ifdef ___CACHE_ENABLED___
600    if (!cached) {
601      count = middle - start;
602      cache_push(USER_DICT_CACHE, &searchable, start, count);
603    }
604  #endif
605  
606    return lpi_current;
607  }
608  
609  uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
610                                 uint16 str_max) {
611    if (is_valid_state() == false)
612      return 0;
613    if (is_valid_lemma_id(id_lemma) == false)
614      return 0;
615    uint32 offset = offsets_by_id_[id_lemma - start_id_];
616    uint8 nchar = get_lemma_nchar(offset);
617    char16 * str = get_lemma_word(offset);
618    uint16 m = nchar < str_max -1 ? nchar : str_max - 1;
619    int i = 0;
620    for (; i < m; i++) {
621      str_buf[i] = str[i];
622    }
623    str_buf[i] = 0;
624    return m;
625  }
626  
627  uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
628                                    uint16 splids_max, bool arg_valid) {
629    if (is_valid_lemma_id(id_lemma) == false)
630      return 0;
631    uint32 offset = offsets_by_id_[id_lemma - start_id_];
632    uint8 nchar = get_lemma_nchar(offset);
633    const uint16 * ids = get_lemma_spell_ids(offset);
634    int i = 0;
635    for (; i < nchar && i < splids_max; i++)
636      splids[i] = ids[i];
637    return i;
638  }
639  
640  size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
641                           NPredictItem *npre_items, size_t npre_max,
642                           size_t b4_used) {
643    uint32 new_added = 0;
644  #ifdef ___PREDICT_ENABLED___
645    int32 end = dict_info_.lemma_count - 1;
646    int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
647    if (j == -1)
648      return 0;
649  
650    while (j <= end) {
651      uint32 offset = predicts_[j];
652      // Ignore deleted lemmas
653      if (offset & kUserDictOffsetFlagRemove) {
654        j++;
655        continue;
656      }
657      uint32 nchar = get_lemma_nchar(offset);
658      uint16 * words = get_lemma_word(offset);
659      uint16 * splids = get_lemma_spell_ids(offset);
660  
661      if (nchar <= hzs_len) {
662        j++;
663        continue;
664      }
665  
666      if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
667        if (new_added >= npre_max) {
668          return new_added;
669        }
670        uint32 cpy_len =
671            (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
672            - (hzs_len << 1);
673        npre_items[new_added].his_len = hzs_len;
674        npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
675        memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
676        if ((cpy_len >> 1) < kMaxPredictSize) {
677          npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
678        }
679        new_added++;
680      } else {
681        break;
682      }
683  
684      j++;
685    }
686  #endif
687    return new_added;
688  }
689  
690  int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
691                                    uint16 lemma_len) {
692    int32 max_off = dict_info_.lemma_count;
693  
694    UserDictSearchable searchable;
695    prepare_locate(&searchable, splid_str, lemma_len);
696  #ifdef ___CACHE_ENABLED___
697    int32 off;
698    uint32 start, count;
699    bool cached = load_cache(&searchable, &start, &count);
700    if (cached) {
701      off = start;
702      max_off = start + count;
703    } else {
704      off = locate_first_in_offsets(&searchable);
705      start = off;
706    }
707  #else
708    int32 off = locate_first_in_offsets(&searchable);
709  #endif
710  
711    if (off == -1) {
712      return off;
713    }
714  
715    while (off < max_off) {
716      uint32 offset = offsets_[off];
717      if (offset & kUserDictOffsetFlagRemove) {
718        off++;
719        continue;
720      }
721      uint16 * splids = get_lemma_spell_ids(offset);
722  #ifdef ___CACHE_ENABLED___
723      if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
724        break;
725  #else
726      if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
727        break;
728  #endif
729      if (equal_spell_id(splids, lemma_len, &searchable) == true) {
730        uint16 * str = get_lemma_word(offset);
731        uint32 i = 0;
732        for (i = 0; i < lemma_len; i++) {
733          if (str[i] == lemma_str[i])
734            continue;
735          break;
736        }
737        if (i < lemma_len) {
738          off++;
739          continue;
740        }
741  #ifdef ___CACHE_ENABLED___
742        // No need to save_cache here, since current function is invoked by
743        // put_lemma. It's rarely possible for a user input same lemma twice.
744        // That means first time user type a new lemma, it is newly added into
745        // user dictionary, then it's possible that user type the same lemma
746        // again.
747        // Another reason save_cache can not be invoked here is this function
748        // aborts when lemma is found, and it never knows the count.
749  #endif
750        return off;
751      }
752      off++;
753    }
754  
755    return -1;
756  }
757  
758  #ifdef ___PREDICT_ENABLED___
759  uint32 UserDict::locate_where_to_insert_in_predicts(
760      const uint16 * words, int lemma_len) {
761    int32 begin = 0;
762    int32 end = dict_info_.lemma_count - 1;
763    int32 middle = end;
764  
765    uint32 last_matched = middle;
766  
767    while (begin <= end) {
768      middle = (begin + end) >> 1;
769      uint32 offset = offsets_[middle];
770      uint8 nchar = get_lemma_nchar(offset);
771      const uint16 * ws = get_lemma_word(offset);
772  
773      uint32 minl = nchar < lemma_len ? nchar : lemma_len;
774      uint32 k = 0;
775      int cmp = 0;
776  
777      for (; k < minl; k++) {
778        if (ws[k] < words[k]) {
779          cmp = -1;
780          break;
781        } else if (ws[k] > words[k]) {
782          cmp = 1;
783          break;
784        }
785      }
786      if (cmp == 0) {
787        if (nchar < lemma_len)
788          cmp = -1;
789        else if (nchar > lemma_len)
790          cmp = 1;
791      }
792  
793      if (cmp < 0) {
794        begin = middle + 1;
795        last_matched = middle;
796      } else if (cmp > 0) {
797        end = middle - 1;
798      } else {
799        end = middle - 1;
800        last_matched = middle;
801      }
802    }
803  
804    return last_matched;
805  }
806  
807  int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
808    int32 begin = 0;
809    int32 end = dict_info_.lemma_count - 1;
810    int32 middle = -1;
811  
812    int32 last_matched = middle;
813  
814    while (begin <= end) {
815      middle = (begin + end) >> 1;
816      uint32 offset = offsets_[middle];
817      uint8 nchar = get_lemma_nchar(offset);
818      const uint16 * ws = get_lemma_word(offset);
819  
820      uint32 minl = nchar < lemma_len ? nchar : lemma_len;
821      uint32 k = 0;
822      int cmp = 0;
823  
824      for (; k < minl; k++) {
825        if (ws[k] < words[k]) {
826          cmp = -1;
827          break;
828        } else if (ws[k] > words[k]) {
829          cmp = 1;
830          break;
831        }
832      }
833      if (cmp == 0) {
834        if (nchar >= lemma_len)
835          last_matched = middle;
836        if (nchar < lemma_len)
837          cmp = -1;
838        else if (nchar > lemma_len)
839          cmp = 1;
840      }
841  
842      if (cmp < 0) {
843        begin = middle + 1;
844      } else if (cmp > 0) {
845        end = middle - 1;
846      } else {
847        end = middle - 1;
848      }
849    }
850  
851    return last_matched;
852  }
853  
854  #endif
855  
856  LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
857                                     uint16 lemma_len) {
858    int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
859    if (off == -1) {
860      return 0;
861    }
862  
863    return ids_[off];
864  }
865  
866  LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
867    if (is_valid_state() == false)
868      return 0;
869    if (is_valid_lemma_id(lemma_id) == false)
870      return 0;
871  
872    return translate_score(_get_lemma_score(lemma_id));
873  }
874  
875  LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
876                                  uint16 lemma_len) {
877    if (is_valid_state() == false)
878      return 0;
879    return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
880  }
881  
882  int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
883    if (is_valid_state() == false)
884      return 0;
885    if (is_valid_lemma_id(lemma_id) == false)
886      return 0;
887  
888    uint32 offset = offsets_by_id_[lemma_id - start_id_];
889  
890    uint32 nchar = get_lemma_nchar(offset);
891    uint16 * spl = get_lemma_spell_ids(offset);
892    uint16 * wrd = get_lemma_word(offset);
893  
894    int32 off = locate_in_offsets(wrd, spl, nchar);
895    if (off == -1) {
896      return 0;
897    }
898  
899    return scores_[off];
900  }
901  
902  int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
903                                  uint16 lemma_len) {
904    if (is_valid_state() == false)
905      return 0;
906  
907    int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
908    if (off == -1) {
909      return 0;
910    }
911  
912    return scores_[off];
913  }
914  
915  #ifdef ___SYNC_ENABLED___
916  void UserDict::remove_lemma_from_sync_list(uint32 offset) {
917    offset &= kUserDictOffsetMask;
918    uint32 i = 0;
919    for (; i < dict_info_.sync_count; i++) {
920      unsigned int off = (syncs_[i] & kUserDictOffsetMask);
921      if (off == offset)
922        break;
923    }
924    if (i < dict_info_.sync_count) {
925      syncs_[i] = syncs_[dict_info_.sync_count - 1];
926      dict_info_.sync_count--;
927    }
928  }
929  #endif
930  
931  #ifdef ___PREDICT_ENABLED___
932  void UserDict::remove_lemma_from_predict_list(uint32 offset) {
933    offset &= kUserDictOffsetMask;
934    uint32 i = 0;
935    for (; i < dict_info_.lemma_count; i++) {
936      unsigned int off = (predicts_[i] & kUserDictOffsetMask);
937      if (off == offset) {
938        predicts_[i] |= kUserDictOffsetFlagRemove;
939        break;
940      }
941    }
942  }
943  #endif
944  
945  bool UserDict::remove_lemma_by_offset_index(int offset_index) {
946    if (is_valid_state() == false)
947      return 0;
948  
949    int32 off = offset_index;
950    if (off == -1) {
951      return false;
952    }
953  
954    uint32 offset = offsets_[off];
955    uint32 nchar = get_lemma_nchar(offset);
956  
957    offsets_[off] |= kUserDictOffsetFlagRemove;
958  
959  #ifdef ___SYNC_ENABLED___
960    // Remove corresponding sync item
961    remove_lemma_from_sync_list(offset);
962  #endif
963  
964  #ifdef ___PREDICT_ENABLED___
965    remove_lemma_from_predict_list(offset);
966  #endif
967    dict_info_.free_count++;
968    dict_info_.free_size += (2 + (nchar << 2));
969  
970    if (state_ < USER_DICT_OFFSET_DIRTY)
971      state_ = USER_DICT_OFFSET_DIRTY;
972    return true;
973  }
974  
975  bool UserDict::remove_lemma(LemmaIdType lemma_id) {
976    if (is_valid_state() == false)
977      return 0;
978    if (is_valid_lemma_id(lemma_id) == false)
979      return false;
980    uint32 offset = offsets_by_id_[lemma_id - start_id_];
981  
982    uint32 nchar = get_lemma_nchar(offset);
983    uint16 * spl = get_lemma_spell_ids(offset);
984    uint16 * wrd = get_lemma_word(offset);
985  
986    int32 off = locate_in_offsets(wrd, spl, nchar);
987  
988    return remove_lemma_by_offset_index(off);
989  }
990  
991  void UserDict::flush_cache() {
992    LemmaIdType start_id = start_id_;
993    const char * file = strdup(dict_file_);
994    if (!file)
995      return;
996    close_dict();
997    load_dict(file, start_id, kUserDictIdEnd);
998    free((void*)file);
999  #ifdef ___CACHE_ENABLED___
1000    cache_init();
1001  #endif
1002    return;
1003  }
1004  
1005  bool UserDict::reset(const char *file) {
1006    FILE *fp = fopen(file, "w+");
1007    if (!fp) {
1008      return false;
1009    }
1010    uint32 version = kUserDictVersion;
1011    size_t wred = fwrite(&version, 1, 4, fp);
1012    UserDictInfo info;
1013    memset(&info, 0, sizeof(info));
1014    // By default, no limitation for lemma count and size
1015    // thereby, reclaim_ratio is never used
1016    wred += fwrite(&info, 1, sizeof(info), fp);
1017    if (wred != sizeof(info) + sizeof(version)) {
1018      fclose(fp);
1019      unlink(file);
1020      return false;
1021    }
1022    fclose(fp);
1023    return true;
1024  }
1025  
1026  bool UserDict::validate(const char *file) {
1027    // b is ignored in POSIX compatible os including Linux
1028    // while b is important flag for Windows to specify binary mode
1029    FILE *fp = fopen(file, "rb");
1030    if (!fp) {
1031      return false;
1032    }
1033  
1034    size_t size;
1035    size_t readed;
1036    uint32 version;
1037    UserDictInfo dict_info;
1038  
1039    // validate
1040    int err = fseek(fp, 0, SEEK_END);
1041    if (err) {
1042      goto error;
1043    }
1044  
1045    size = ftell(fp);
1046    if (size < 4 + sizeof(dict_info)) {
1047      goto error;
1048    }
1049  
1050    err = fseek(fp, 0, SEEK_SET);
1051    if (err) {
1052      goto error;
1053    }
1054  
1055    readed = fread(&version, 1, sizeof(version), fp);
1056    if (readed < sizeof(version)) {
1057      goto error;
1058    }
1059    if (version != kUserDictVersion) {
1060      goto error;
1061    }
1062  
1063    err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1064    if (err) {
1065      goto error;
1066    }
1067  
1068    readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1069    if (readed != sizeof(dict_info)) {
1070      goto error;
1071    }
1072  
1073    if (size != get_dict_file_size(&dict_info)) {
1074      goto error;
1075    }
1076  
1077    fclose(fp);
1078    return true;
1079  
1080   error:
1081    fclose(fp);
1082    return false;
1083  }
1084  
1085  bool UserDict::load(const char *file, LemmaIdType start_id) {
1086    if (0 != pthread_mutex_trylock(&g_mutex_)) {
1087      return false;
1088    }
1089    // b is ignored in POSIX compatible os including Linux
1090    // while b is important flag for Windows to specify binary mode
1091    FILE *fp = fopen(file, "rb");
1092    if (!fp) {
1093      pthread_mutex_unlock(&g_mutex_);
1094      return false;
1095    }
1096  
1097    size_t readed, toread;
1098    UserDictInfo dict_info;
1099    uint8 *lemmas = NULL;
1100    uint32 *offsets = NULL;
1101  #ifdef ___SYNC_ENABLED___
1102    uint32 *syncs = NULL;
1103  #endif
1104    uint32 *scores = NULL;
1105    uint32 *ids = NULL;
1106    uint32 *offsets_by_id = NULL;
1107  #ifdef ___PREDICT_ENABLED___
1108    uint32 *predicts = NULL;
1109  #endif
1110    size_t i;
1111    int err;
1112  
1113    err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1114    if (err) goto error;
1115  
1116    readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1117    if (readed != sizeof(dict_info)) goto error;
1118  
1119    lemmas = (uint8 *)malloc(
1120        dict_info.lemma_size +
1121        (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));
1122  
1123    if (!lemmas) goto error;
1124  
1125    offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1126    if (!offsets) goto error;
1127  
1128  #ifdef ___PREDICT_ENABLED___
1129    predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1130    if (!predicts) goto error;
1131  #endif
1132  
1133  #ifdef ___SYNC_ENABLED___
1134    syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
1135    if (!syncs) goto error;
1136  #endif
1137  
1138    scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1139    if (!scores) goto error;
1140  
1141    ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1142    if (!ids) goto error;
1143  
1144    offsets_by_id = (uint32 *)malloc(
1145        (dict_info.lemma_count + kUserDictPreAlloc) << 2);
1146    if (!offsets_by_id) goto error;
1147  
1148    err = fseek(fp, 4, SEEK_SET);
1149    if (err) goto error;
1150  
1151    readed = 0;
1152    while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
1153      readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
1154    }
1155    if (readed < dict_info.lemma_size)
1156      goto error;
1157  
1158    toread = (dict_info.lemma_count << 2);
1159    readed = 0;
1160    while (readed < toread && !ferror(fp) && !feof(fp)) {
1161      readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
1162    }
1163    if (readed < toread)
1164      goto error;
1165  
1166  #ifdef ___PREDICT_ENABLED___
1167    toread = (dict_info.lemma_count << 2);
1168    readed = 0;
1169    while (readed < toread && !ferror(fp) && !feof(fp)) {
1170      readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
1171    }
1172    if (readed < toread)
1173      goto error;
1174  #endif
1175  
1176    readed = 0;
1177    while (readed < toread && !ferror(fp) && !feof(fp)) {
1178      readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
1179    }
1180    if (readed < toread)
1181      goto error;
1182  
1183  #ifdef ___SYNC_ENABLED___
1184    toread = (dict_info.sync_count << 2);
1185    readed = 0;
1186    while (readed < toread && !ferror(fp) && !feof(fp)) {
1187      readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
1188    }
1189    if (readed < toread)
1190      goto error;
1191  #endif
1192  
1193    for (i = 0; i < dict_info.lemma_count; i++) {
1194      ids[i] = start_id + i;
1195      offsets_by_id[i] = offsets[i];
1196    }
1197  
1198    lemmas_ = lemmas;
1199    offsets_ = offsets;
1200  #ifdef ___SYNC_ENABLED___
1201    syncs_ = syncs;
1202    sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
1203  #endif
1204    offsets_by_id_ = offsets_by_id;
1205    scores_ = scores;
1206    ids_ = ids;
1207  #ifdef ___PREDICT_ENABLED___
1208    predicts_ = predicts;
1209  #endif
1210    lemma_count_left_ = kUserDictPreAlloc;
1211    lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
1212    memcpy(&dict_info_, &dict_info, sizeof(dict_info));
1213    state_ = USER_DICT_SYNC;
1214  
1215    fclose(fp);
1216  
1217    pthread_mutex_unlock(&g_mutex_);
1218    return true;
1219  
1220   error:
1221    if (lemmas) free(lemmas);
1222    if (offsets) free(offsets);
1223  #ifdef ___SYNC_ENABLED___
1224    if (syncs) free(syncs);
1225  #endif
1226    if (scores) free(scores);
1227    if (ids) free(ids);
1228    if (offsets_by_id) free(offsets_by_id);
1229  #ifdef ___PREDICT_ENABLED___
1230    if (predicts) free(predicts);
1231  #endif
1232    fclose(fp);
1233    pthread_mutex_unlock(&g_mutex_);
1234    return false;
1235  }
1236  
1237  void UserDict::write_back() {
1238    // XXX write back is only allowed from close_dict due to thread-safe sake
1239    if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
1240      return;
1241    int fd = open(dict_file_, O_WRONLY);
1242    if (fd == -1)
1243      return;
1244    switch (state_) {
1245      case USER_DICT_DEFRAGMENTED:
1246        write_back_all(fd);
1247        break;
1248      case USER_DICT_LEMMA_DIRTY:
1249        write_back_lemma(fd);
1250        break;
1251      case USER_DICT_OFFSET_DIRTY:
1252        write_back_offset(fd);
1253        break;
1254      case USER_DICT_SCORE_DIRTY:
1255        write_back_score(fd);
1256        break;
1257  #ifdef ___SYNC_ENABLED___
1258      case USER_DICT_SYNC_DIRTY:
1259        write_back_sync(fd);
1260        break;
1261  #endif
1262      default:
1263        break;
1264    }
1265    // It seems truncate is not need on Linux, Windows except Mac
1266    // I am doing it here anyway for safety.
1267    off_t cur = lseek(fd, 0, SEEK_CUR);
1268    ftruncate(fd, cur);
1269    close(fd);
1270    state_ = USER_DICT_SYNC;
1271  }
1272  
1273  #ifdef ___SYNC_ENABLED___
1274  void UserDict::write_back_sync(int fd) {
1275    int err = lseek(fd, 4 + dict_info_.lemma_size
1276                    + (dict_info_.lemma_count << 3)
1277  #ifdef ___PREDICT_ENABLED___
1278                    + (dict_info_.lemma_count << 2)
1279  #endif
1280                    , SEEK_SET);
1281    if (err == -1)
1282      return;
1283    write(fd, syncs_, dict_info_.sync_count << 2);
1284    write(fd, &dict_info_, sizeof(dict_info_));
1285  }
1286  #endif
1287  
1288  void UserDict::write_back_offset(int fd) {
1289    int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
1290    if (err == -1)
1291      return;
1292    write(fd, offsets_, dict_info_.lemma_count << 2);
1293  #ifdef ___PREDICT_ENABLED___
1294    write(fd, predicts_, dict_info_.lemma_count << 2);
1295  #endif
1296    write(fd, scores_, dict_info_.lemma_count << 2);
1297  #ifdef ___SYNC_ENABLED___
1298    write(fd, syncs_, dict_info_.sync_count << 2);
1299  #endif
1300    write(fd, &dict_info_, sizeof(dict_info_));
1301  }
1302  
1303  void UserDict::write_back_score(int fd) {
1304    int err = lseek(fd, 4 + dict_info_.lemma_size
1305                    + (dict_info_.lemma_count << 2)
1306  #ifdef ___PREDICT_ENABLED___
1307                    + (dict_info_.lemma_count << 2)
1308  #endif
1309                    , SEEK_SET);
1310    if (err == -1)
1311      return;
1312    write(fd, scores_, dict_info_.lemma_count << 2);
1313  #ifdef ___SYNC_ENABLED___
1314    write(fd, syncs_, dict_info_.sync_count << 2);
1315  #endif
1316    write(fd, &dict_info_, sizeof(dict_info_));
1317  }
1318  
1319  void UserDict::write_back_lemma(int fd) {
1320    int err = lseek(fd, 4, SEEK_SET);
1321    if (err == -1)
1322      return;
1323    // New lemmas are always appended, no need to write whole lemma block
1324    size_t need_write = kUserDictPreAlloc *
1325        (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
1326    err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
1327    if (err == -1)
1328      return;
1329    write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);
1330  
1331    write(fd, offsets_,  dict_info_.lemma_count << 2);
1332  #ifdef ___PREDICT_ENABLED___
1333    write(fd, predicts_,  dict_info_.lemma_count << 2);
1334  #endif
1335    write(fd, scores_, dict_info_.lemma_count << 2);
1336  #ifdef ___SYNC_ENABLED___
1337    write(fd, syncs_, dict_info_.sync_count << 2);
1338  #endif
1339    write(fd, &dict_info_, sizeof(dict_info_));
1340  }
1341  
1342  void UserDict::write_back_all(int fd) {
1343    // XXX lemma_size is handled differently in writeall
1344    // and writelemma. I update lemma_size and lemma_count in different
1345    // places for these two cases. Should fix it to make it consistent.
1346    int err = lseek(fd, 4, SEEK_SET);
1347    if (err == -1)
1348      return;
1349    write(fd, lemmas_, dict_info_.lemma_size);
1350    write(fd, offsets_, dict_info_.lemma_count << 2);
1351  #ifdef ___PREDICT_ENABLED___
1352    write(fd, predicts_, dict_info_.lemma_count << 2);
1353  #endif
1354    write(fd, scores_, dict_info_.lemma_count << 2);
1355  #ifdef ___SYNC_ENABLED___
1356    write(fd, syncs_, dict_info_.sync_count << 2);
1357  #endif
1358    write(fd, &dict_info_, sizeof(dict_info_));
1359  }
1360  
1361  #ifdef ___CACHE_ENABLED___
1362  bool UserDict::load_cache(UserDictSearchable *searchable,
1363                            uint32 *offset, uint32 *length) {
1364    UserDictCache *cache = &caches_[searchable->splids_len - 1];
1365    if (cache->head == cache->tail)
1366      return false;
1367  
1368    uint16 j, sig_len = kMaxLemmaSize / 4;
1369    uint16 i = cache->head;
1370    while (1) {
1371      j = 0;
1372      for (; j < sig_len; j++) {
1373        if (cache->signatures[i][j] != searchable->signature[j])
1374          break;
1375      }
1376      if (j < sig_len) {
1377        i++;
1378        if (i >= kUserDictCacheSize)
1379          i -= kUserDictCacheSize;
1380        if (i == cache->tail)
1381          break;
1382        continue;
1383      }
1384      *offset = cache->offsets[i];
1385      *length = cache->lengths[i];
1386      return true;
1387    }
1388    return false;
1389  }
1390  
1391  void UserDict::save_cache(UserDictSearchable *searchable,
1392                            uint32 offset, uint32 length) {
1393    UserDictCache *cache = &caches_[searchable->splids_len - 1];
1394    uint16 next = cache->tail;
1395  
1396    cache->offsets[next] = offset;
1397    cache->lengths[next] = length;
1398    uint16 sig_len = kMaxLemmaSize / 4;
1399    uint16 j = 0;
1400    for (; j < sig_len; j++) {
1401      cache->signatures[next][j] = searchable->signature[j];
1402    }
1403  
1404    if (++next >= kUserDictCacheSize) {
1405      next -= kUserDictCacheSize;
1406    }
1407    if (next == cache->head) {
1408      cache->head++;
1409      if (cache->head >= kUserDictCacheSize) {
1410        cache->head -= kUserDictCacheSize;
1411      }
1412    }
1413    cache->tail = next;
1414  }
1415  
1416  void UserDict::reset_cache() {
1417    memset(caches_, 0, sizeof(caches_));
1418  }
1419  
1420  bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
1421    UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1422    if (cache->head == cache->tail)
1423      return false;
1424  
1425    uint16 j, sig_len = kMaxLemmaSize / 4;
1426    uint16 i = cache->head;
1427    while (1) {
1428      j = 0;
1429      for (; j < sig_len; j++) {
1430        if (cache->signatures[i][j] != searchable->signature[j])
1431          break;
1432      }
1433      if (j < sig_len) {
1434        i++;
1435        if (i >= kUserDictMissCacheSize)
1436          i -= kUserDictMissCacheSize;
1437        if (i == cache->tail)
1438          break;
1439        continue;
1440      }
1441      return true;
1442    }
1443    return false;
1444  }
1445  
1446  void UserDict::save_miss_cache(UserDictSearchable *searchable) {
1447    UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1448    uint16 next = cache->tail;
1449  
1450    uint16 sig_len = kMaxLemmaSize / 4;
1451    uint16 j = 0;
1452    for (; j < sig_len; j++) {
1453      cache->signatures[next][j] = searchable->signature[j];
1454    }
1455  
1456    if (++next >= kUserDictMissCacheSize) {
1457      next -= kUserDictMissCacheSize;
1458    }
1459    if (next == cache->head) {
1460      cache->head++;
1461      if (cache->head >= kUserDictMissCacheSize) {
1462        cache->head -= kUserDictMissCacheSize;
1463      }
1464    }
1465    cache->tail = next;
1466  }
1467  
1468  void UserDict::reset_miss_cache() {
1469    memset(miss_caches_, 0, sizeof(miss_caches_));
1470  }
1471  
1472  void UserDict::cache_init() {
1473    reset_cache();
1474    reset_miss_cache();
1475  }
1476  
1477  bool UserDict::cache_hit(UserDictSearchable *searchable,
1478                           uint32 *offset, uint32 *length) {
1479    bool hit = load_miss_cache(searchable);
1480    if (hit) {
1481      *offset = 0;
1482      *length = 0;
1483      return true;
1484    }
1485    hit = load_cache(searchable, offset, length);
1486    if (hit) {
1487      return true;
1488    }
1489    return false;
1490  }
1491  
1492  void UserDict::cache_push(UserDictCacheType type,
1493                           UserDictSearchable *searchable,
1494                           uint32 offset, uint32 length) {
1495    switch (type) {
1496      case USER_DICT_MISS_CACHE:
1497        save_miss_cache(searchable);
1498        break;
1499      case USER_DICT_CACHE:
1500        save_cache(searchable, offset, length);
1501        break;
1502      default:
1503        break;
1504    }
1505  }
1506  
1507  #endif
1508  
1509  void UserDict::defragment(void) {
1510  #ifdef ___DEBUG_PERF___
1511    DEBUG_PERF_BEGIN;
1512  #endif
1513    if (is_valid_state() == false)
1514      return;
1515    // Fixup offsets_, set REMOVE flag to lemma's flag if needed
1516    size_t first_freed = 0;
1517    size_t first_inuse = 0;
1518    while (first_freed < dict_info_.lemma_count) {
1519      // Find first freed offset
1520      while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1521              first_freed < dict_info_.lemma_count) {
1522        first_freed++;
1523      }
1524      if (first_freed < dict_info_.lemma_count) {
1525        // Save REMOVE flag to lemma flag
1526        int off = offsets_[first_freed];
1527        set_lemma_flag(off, kUserDictLemmaFlagRemove);
1528      } else {
1529        break;
1530      }
1531      // Find first inuse offse after first_freed
1532      first_inuse = first_freed + 1;
1533      while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) &&
1534             (first_inuse < dict_info_.lemma_count)) {
1535        // Save REMOVE flag to lemma flag
1536        int off = offsets_[first_inuse];
1537        set_lemma_flag(off, kUserDictLemmaFlagRemove);
1538        first_inuse++;
1539      }
1540      if (first_inuse >= dict_info_.lemma_count) {
1541        break;
1542      }
1543      // Swap offsets_
1544      int tmp = offsets_[first_inuse];
1545      offsets_[first_inuse] = offsets_[first_freed];
1546      offsets_[first_freed] = tmp;
1547      // Move scores_, no need to swap
1548      tmp = scores_[first_inuse];
1549      scores_[first_inuse] = scores_[first_freed];
1550      scores_[first_freed] = tmp;
1551      // Swap ids_
1552      LemmaIdType tmpid = ids_[first_inuse];
1553      ids_[first_inuse] = ids_[first_freed];
1554      ids_[first_freed] = tmpid;
1555      // Go on
1556      first_freed++;
1557    }
1558  #ifdef ___PREDICT_ENABLED___
1559    // Fixup predicts_
1560    first_freed = 0;
1561    first_inuse = 0;
1562    while (first_freed < dict_info_.lemma_count) {
1563      // Find first freed offset
1564      while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1565              first_freed < dict_info_.lemma_count) {
1566        first_freed++;
1567      }
1568      if (first_freed >= dict_info_.lemma_count)
1569        break;
1570      // Find first inuse offse after first_freed
1571      first_inuse = first_freed + 1;
1572      while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove)
1573             && (first_inuse < dict_info_.lemma_count)) {
1574        first_inuse++;
1575      }
1576      if (first_inuse >= dict_info_.lemma_count) {
1577        break;
1578      }
1579      // Swap offsets_
1580      int tmp = predicts_[first_inuse];
1581      predicts_[first_inuse] = predicts_[first_freed];
1582      predicts_[first_freed] = tmp;
1583      // Go on
1584      first_freed++;
1585    }
1586  #endif
1587    dict_info_.lemma_count = first_freed;
1588    // Fixup lemmas_
1589    size_t begin = 0;
1590    size_t end = 0;
1591    size_t dst = 0;
1592    int total_size = dict_info_.lemma_size + lemma_size_left_;
1593    int total_count = dict_info_.lemma_count + lemma_count_left_;
1594    size_t real_size = total_size - lemma_size_left_;
1595    while (dst < real_size) {
1596      unsigned char flag = get_lemma_flag(dst);
1597      unsigned char nchr = get_lemma_nchar(dst);
1598      if ((flag & kUserDictLemmaFlagRemove) == 0) {
1599        dst += nchr * 4 + 2;
1600        continue;
1601      }
1602      break;
1603    }
1604    if (dst >= real_size)
1605      return;
1606  
1607    end = dst;
1608    while (end < real_size) {
1609      begin = end + get_lemma_nchar(end) * 4 + 2;
1610   repeat:
1611      // not used any more
1612      if (begin >= real_size)
1613        break;
1614      unsigned char flag = get_lemma_flag(begin);
1615      unsigned char nchr = get_lemma_nchar(begin);
1616      if (flag & kUserDictLemmaFlagRemove) {
1617        begin += nchr * 4 + 2;
1618        goto repeat;
1619      }
1620      end = begin + nchr * 4 + 2;
1621      while (end < real_size) {
1622        unsigned char eflag = get_lemma_flag(end);
1623        unsigned char enchr = get_lemma_nchar(end);
1624        if ((eflag & kUserDictLemmaFlagRemove) == 0) {
1625          end += enchr * 4 + 2;
1626          continue;
1627        }
1628        break;
1629      }
1630      memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
1631      for (size_t j = 0; j < dict_info_.lemma_count; j++) {
1632        if (offsets_[j] >= begin && offsets_[j] < end) {
1633          offsets_[j] -= (begin - dst);
1634          offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
1635        }
1636  #ifdef ___PREDICT_ENABLED___
1637        if (predicts_[j] >= begin && predicts_[j] < end) {
1638          predicts_[j] -= (begin - dst);
1639        }
1640  #endif
1641      }
1642  #ifdef ___SYNC_ENABLED___
1643      for (size_t j = 0; j < dict_info_.sync_count; j++) {
1644        if (syncs_[j] >= begin && syncs_[j] < end) {
1645          syncs_[j] -= (begin - dst);
1646        }
1647      }
1648  #endif
1649      dst += (end - begin);
1650    }
1651  
1652    dict_info_.free_count = 0;
1653    dict_info_.free_size = 0;
1654    dict_info_.lemma_size = dst;
1655    lemma_size_left_ = total_size - dict_info_.lemma_size;
1656    lemma_count_left_ = total_count - dict_info_.lemma_count;
1657  
1658    // XXX Without following code,
1659    // offsets_by_id_ is not reordered.
1660    // That's to say, all removed lemmas' ids are not collected back.
1661    // There may not be room for addition of new lemmas due to
1662    // offsests_by_id_ reason, although lemma_size_left_ is fixed.
1663    // By default, we do want defrag as fast as possible, because
1664    // during defrag procedure, other peers can not write new lemmas
1665    // to user dictionary file.
1666    // XXX If write-back is invoked immediately after
1667    // this defragment, no need to fix up following in-mem data.
1668    for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
1669      ids_[i] = start_id_ + i;
1670      offsets_by_id_[i] = offsets_[i];
1671    }
1672  
1673    state_ = USER_DICT_DEFRAGMENTED;
1674  
1675  #ifdef ___DEBUG_PERF___
1676    DEBUG_PERF_END;
1677    LOGD_PERF("defragment");
1678  #endif
1679  }
1680  
1681  #ifdef ___SYNC_ENABLED___
1682  void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
1683    if (is_valid_state() == false)
1684      return;
1685    if (end > dict_info_.sync_count)
1686      end = dict_info_.sync_count;
1687    memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
1688    dict_info_.sync_count -= (end - start);
1689    if (state_ < USER_DICT_SYNC_DIRTY)
1690      state_ = USER_DICT_SYNC_DIRTY;
1691  }
1692  
1693  int UserDict::get_sync_count() {
1694    if (is_valid_state() == false)
1695      return 0;
1696    return dict_info_.sync_count;
1697  }
1698  
1699  LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
1700                          uint16 lemma_len, uint16 count, uint64 lmt) {
1701    int again = 0;
1702   begin:
1703    LemmaIdType id;
1704    uint32 * syncs_bak = syncs_;
1705    syncs_ = NULL;
1706    id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
1707    syncs_ = syncs_bak;
1708    if (id == 0 && again == 0) {
1709      if ((dict_info_.limit_lemma_count > 0 &&
1710          dict_info_.lemma_count >= dict_info_.limit_lemma_count)
1711          || (dict_info_.limit_lemma_size > 0 &&
1712              dict_info_.lemma_size + (2 + (lemma_len << 2))
1713              > dict_info_.limit_lemma_size)) {
1714        // XXX Always reclaim and defrag in sync code path
1715        //     sync thread is background thread and ok with heavy work
1716        reclaim();
1717        defragment();
1718        flush_cache();
1719        again = 1;
1720        goto begin;
1721      }
1722    }
1723    return id;
1724  }
1725  
1726  int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
1727    int newly_added = 0;
1728  
1729    SpellingParser * spl_parser = new SpellingParser();
1730    if (!spl_parser) {
1731      return 0;
1732    }
1733  #ifdef ___DEBUG_PERF___
1734    DEBUG_PERF_BEGIN;
1735  #endif
1736    char16 *ptr = lemmas;
1737  
1738    // Extract pinyin,words,frequence,last_mod_time
1739    char16 * p = ptr, * py16 = ptr;
1740    char16 * hz16 = NULL;
1741    int py16_len = 0;
1742    uint16 splid[kMaxLemmaSize];
1743    int splid_len = 0;
1744    int hz16_len = 0;
1745    char16 * fr16 = NULL;
1746    int fr16_len = 0;
1747  
1748    while (p - ptr < len) {
1749      // Pinyin
1750      py16 = p;
1751      splid_len = 0;
1752      while (*p != 0x2c && (p - ptr) < len) {
1753        if (*p == 0x20)
1754          splid_len++;
1755        p++;
1756      }
1757      splid_len++;
1758      if (p - ptr == len)
1759        break;
1760      py16_len = p - py16;
1761      if (kMaxLemmaSize < splid_len) {
1762        break;
1763      }
1764      bool is_pre;
1765      int splidl = spl_parser->splstr16_to_idxs_f(
1766          py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
1767      if (splidl != splid_len)
1768        break;
1769      // Phrase
1770      hz16 = ++p;
1771      while (*p != 0x2c && (p - ptr) < len) {
1772        p++;
1773      }
1774      hz16_len = p - hz16;
1775      if (hz16_len != splid_len)
1776        break;
1777      // Frequency
1778      fr16 = ++p;
1779      fr16_len = 0;
1780      while (*p != 0x2c && (p - ptr) < len) {
1781        p++;
1782      }
1783      fr16_len = p - fr16;
1784      uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
1785      // Last modified time
1786      fr16 = ++p;
1787      fr16_len = 0;
1788      while (*p != 0x3b && (p - ptr) < len) {
1789        p++;
1790      }
1791      fr16_len = p - fr16;
1792      uint64 last_mod = utf16le_atoll(fr16, fr16_len);
1793  
1794      put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
1795      newly_added++;
1796  
1797      p++;
1798    }
1799  
1800  #ifdef ___DEBUG_PERF___
1801    DEBUG_PERF_END;
1802    LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
1803  #endif
1804    return newly_added;
1805  }
1806  
1807  int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
1808      char16 * str, int size, int * count) {
1809    int len = 0;
1810    *count = 0;
1811  
1812    int left_len = size;
1813  
1814    if (is_valid_state() == false)
1815      return len;
1816  
1817    SpellingTrie * spl_trie = &SpellingTrie::get_instance();
1818    if (!spl_trie) {
1819      return 0;
1820    }
1821  
1822    uint32 i;
1823    for (i = 0; i < dict_info_.sync_count; i++) {
1824      int offset = syncs_[i];
1825      uint32 nchar = get_lemma_nchar(offset);
1826      uint16 *spl = get_lemma_spell_ids(offset);
1827      uint16 *wrd = get_lemma_word(offset);
1828      int score = _get_lemma_score(wrd, spl, nchar);
1829  
1830      static char score_temp[32], *pscore_temp = score_temp;
1831      static char16 temp[256], *ptemp = temp;
1832  
1833      pscore_temp = score_temp;
1834      ptemp = temp;
1835  
1836      uint32 j;
1837      // Add pinyin
1838      for (j = 0; j < nchar; j++) {
1839        int ret_len = spl_trie->get_spelling_str16(
1840            spl[j], ptemp, temp + sizeof(temp) - ptemp);
1841        if (ret_len <= 0)
1842          break;
1843        ptemp += ret_len;
1844        if (ptemp < temp + sizeof(temp) - 1) {
1845          *(ptemp++) = ' ';
1846        } else {
1847          j = 0;
1848          break;
1849        }
1850      }
1851      if (j < nchar) {
1852        continue;
1853      }
1854      ptemp--;
1855      if (ptemp < temp + sizeof(temp) - 1) {
1856        *(ptemp++) = ',';
1857      } else {
1858        continue;
1859      }
1860      // Add phrase
1861      for (j = 0; j < nchar; j++) {
1862        if (ptemp < temp + sizeof(temp) - 1) {
1863          *(ptemp++) = wrd[j];
1864        } else {
1865          break;
1866        }
1867      }
1868      if (j < nchar) {
1869        continue;
1870      }
1871      if (ptemp < temp + sizeof(temp) - 1) {
1872        *(ptemp++) = ',';
1873      } else {
1874        continue;
1875      }
1876      // Add frequency
1877      uint32 intf = extract_score_freq(score);
1878      int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp);
1879      if (ret_len <= 0)
1880        continue;
1881      ptemp += ret_len;
1882      if (ptemp < temp + sizeof(temp) - 1) {
1883        *(ptemp++) = ',';
1884      } else {
1885        continue;
1886      }
1887      // Add last modified time
1888      uint64 last_mod = extract_score_lmt(score);
1889      ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp);
1890      if (ret_len <= 0)
1891        continue;
1892      ptemp += ret_len;
1893      if (ptemp < temp + sizeof(temp) - 1) {
1894        *(ptemp++) = ';';
1895      } else {
1896        continue;
1897      }
1898  
1899      // Write to string
1900      int need_len = ptemp - temp;
1901      if (need_len > left_len)
1902        break;
1903      memcpy(str + len, temp, need_len * 2);
1904      left_len -= need_len;
1905  
1906      len += need_len;
1907      (*count)++;
1908    }
1909  
1910    if (len > 0) {
1911      if (state_ < USER_DICT_SYNC_DIRTY)
1912        state_ = USER_DICT_SYNC_DIRTY;
1913    }
1914    return len;
1915  }
1916  
1917  #endif
1918  
1919  bool UserDict::state(UserDictStat * stat) {
1920    if (is_valid_state() == false)
1921      return false;
1922    if (!stat)
1923      return false;
1924    stat->version = version_;
1925    stat->file_name = dict_file_;
1926    stat->load_time.tv_sec = load_time_.tv_sec;
1927    stat->load_time.tv_usec = load_time_.tv_usec;
1928    pthread_mutex_lock(&g_mutex_);
1929    stat->last_update.tv_sec = g_last_update_.tv_sec;
1930    stat->last_update.tv_usec = g_last_update_.tv_usec;
1931    pthread_mutex_unlock(&g_mutex_);
1932    stat->disk_size = get_dict_file_size(&dict_info_);
1933    stat->lemma_count = dict_info_.lemma_count;
1934    stat->lemma_size = dict_info_.lemma_size;
1935    stat->delete_count = dict_info_.free_count;
1936    stat->delete_size = dict_info_.free_size;
1937  #ifdef ___SYNC_ENABLED___
1938    stat->sync_count = dict_info_.sync_count;
1939  #endif
1940    stat->limit_lemma_count = dict_info_.limit_lemma_count;
1941    stat->limit_lemma_size = dict_info_.limit_lemma_size;
1942    stat->reclaim_ratio = dict_info_.reclaim_ratio;
1943    return true;
1944  }
1945  
1946  void UserDict::set_limit(uint32 max_lemma_count,
1947                           uint32 max_lemma_size, uint32 reclaim_ratio) {
1948    dict_info_.limit_lemma_count = max_lemma_count;
1949    dict_info_.limit_lemma_size = max_lemma_size;
1950    if (reclaim_ratio > 100)
1951      reclaim_ratio = 100;
1952    dict_info_.reclaim_ratio = reclaim_ratio;
1953  }
1954  
1955  void UserDict::reclaim() {
1956    if (is_valid_state() == false)
1957      return;
1958  
1959    switch (dict_info_.reclaim_ratio) {
1960      case 0:
1961        return;
1962      case 100:
1963        // TODO: CLEAR to be implemented
1964        assert(false);
1965        return;
1966      default:
1967        break;
1968    }
1969  
1970    // XXX Reclaim is only based on count, not size
1971    uint32 count = dict_info_.lemma_count;
1972    int rc = count * dict_info_.reclaim_ratio / 100;
1973  
1974    UserDictScoreOffsetPair * score_offset_pairs = NULL;
1975    score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
1976        sizeof(UserDictScoreOffsetPair) * rc);
1977    if (score_offset_pairs == NULL) {
1978      return;
1979    }
1980  
1981    for (int i = 0; i < rc; i++) {
1982      int s = scores_[i];
1983      score_offset_pairs[i].score = s;
1984      score_offset_pairs[i].offset_index = i;
1985    }
1986  
1987    for (int i = (rc + 1) / 2; i >= 0; i--)
1988      shift_down(score_offset_pairs, i, rc);
1989  
1990    for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
1991      int s = scores_[i];
1992      if (s < score_offset_pairs[0].score) {
1993        score_offset_pairs[0].score = s;
1994        score_offset_pairs[0].offset_index = i;
1995        shift_down(score_offset_pairs, 0, rc);
1996      }
1997    }
1998  
1999    for (int i = 0; i < rc; i++) {
2000      int off = score_offset_pairs[i].offset_index;
2001      remove_lemma_by_offset_index(off);
2002    }
2003    if (rc > 0) {
2004      if (state_ < USER_DICT_OFFSET_DIRTY)
2005        state_ = USER_DICT_OFFSET_DIRTY;
2006    }
2007  
2008    free(score_offset_pairs);
2009  }
2010  
2011  inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
2012    int s = sop[i].score;
2013    int p = sop[i].offset_index;
2014    sop[i].score = sop[j].score;
2015    sop[i].offset_index = sop[j].offset_index;
2016    sop[j].score = s;
2017    sop[j].offset_index = p;
2018  }
2019  
2020  void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
2021    int par = i;
2022    while (par < n) {
2023      int left = par * 2 + 1;
2024      int right = left + 1;
2025      if (left >= n && right >= n)
2026        break;
2027      if (right >= n) {
2028        if (sop[left].score > sop[par].score) {
2029          swap(sop, left, par);
2030          par = left;
2031          continue;
2032        }
2033      } else if (sop[left].score > sop[right].score &&
2034                 sop[left].score > sop[par].score) {
2035        swap(sop, left, par);
2036        par = left;
2037        continue;
2038      } else if (sop[right].score > sop[left].score &&
2039                 sop[right].score > sop[par].score) {
2040        swap(sop, right, par);
2041        par = right;
2042        continue;
2043      }
2044      break;
2045    }
2046  }
2047  
2048  LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
2049                                  uint16 lemma_len, uint16 count) {
2050    return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL));
2051  }
2052  
2053  LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
2054                                   uint16 lemma_len, uint16 count, uint64 lmt) {
2055  #ifdef ___DEBUG_PERF___
2056    DEBUG_PERF_BEGIN;
2057  #endif
2058    if (is_valid_state() == false)
2059      return 0;
2060    int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2061    if (off != -1) {
2062      int delta_score = count - scores_[off];
2063      dict_info_.total_nfreq += delta_score;
2064      scores_[off] = build_score(lmt, count);
2065      if (state_ < USER_DICT_SCORE_DIRTY)
2066        state_ = USER_DICT_SCORE_DIRTY;
2067  #ifdef ___DEBUG_PERF___
2068      DEBUG_PERF_END;
2069      LOGD_PERF("_put_lemma(update)");
2070  #endif
2071      return ids_[off];
2072    } else {
2073      if ((dict_info_.limit_lemma_count > 0 &&
2074          dict_info_.lemma_count >= dict_info_.limit_lemma_count)
2075          || (dict_info_.limit_lemma_size > 0 &&
2076              dict_info_.lemma_size + (2 + (lemma_len << 2))
2077              > dict_info_.limit_lemma_size)) {
2078        // XXX Don't defragment here, it's too time-consuming.
2079        return 0;
2080      }
2081      int flushed = 0;
2082      if (lemma_count_left_ == 0 ||
2083          lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {
2084  
2085        // XXX When there is no space for new lemma, we flush to disk
2086        // flush_cache() may be called by upper user
2087        // and better place shoule be found instead of here
2088        flush_cache();
2089        flushed = 1;
2090        // Or simply return and do nothing
2091        // return 0;
2092      }
2093  #ifdef ___DEBUG_PERF___
2094      DEBUG_PERF_END;
2095      LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
2096  #endif
2097      LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
2098  #ifdef ___SYNC_ENABLED___
2099      if (syncs_ && id != 0) {
2100        queue_lemma_for_sync(id);
2101      }
2102  #endif
2103      return id;
2104    }
2105    return 0;
2106  }
2107  
2108  #ifdef ___SYNC_ENABLED___
2109  void UserDict::queue_lemma_for_sync(LemmaIdType id) {
2110    if (dict_info_.sync_count < sync_count_size_) {
2111      syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2112    } else {
2113      uint32 * syncs = (uint32*)realloc(
2114          syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
2115      if (syncs) {
2116        sync_count_size_ += kUserDictPreAlloc;
2117        syncs_ = syncs;
2118        syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2119      }
2120    }
2121  }
2122  #endif
2123  
2124  LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
2125                                     bool selected) {
2126  #ifdef ___DEBUG_PERF___
2127    DEBUG_PERF_BEGIN;
2128  #endif
2129    if (is_valid_state() == false)
2130      return 0;
2131    if (is_valid_lemma_id(lemma_id) == false)
2132      return 0;
2133    uint32 offset = offsets_by_id_[lemma_id - start_id_];
2134    uint8 lemma_len = get_lemma_nchar(offset);
2135    char16 * lemma_str = get_lemma_word(offset);
2136    uint16 * splids = get_lemma_spell_ids(offset);
2137  
2138    int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2139    if (off != -1) {
2140      int score = scores_[off];
2141      int count = extract_score_freq(score);
2142      uint64 lmt = extract_score_lmt(score);
2143      if (count + delta_count > kUserDictMaxFrequency ||
2144          count + delta_count < count) {
2145        delta_count = kUserDictMaxFrequency - count;
2146      }
2147      count += delta_count;
2148      dict_info_.total_nfreq += delta_count;
2149      if (selected) {
2150        lmt = time(NULL);
2151      }
2152      scores_[off] = build_score(lmt, count);
2153      if (state_ < USER_DICT_SCORE_DIRTY)
2154        state_ = USER_DICT_SCORE_DIRTY;
2155  #ifdef ___DEBUG_PERF___
2156      DEBUG_PERF_END;
2157      LOGD_PERF("update_lemma");
2158  #endif
2159  #ifdef ___SYNC_ENABLED___
2160      queue_lemma_for_sync(ids_[off]);
2161  #endif
2162      return ids_[off];
2163    }
2164    return 0;
2165  }
2166  
2167  size_t UserDict::get_total_lemma_count() {
2168    return dict_info_.total_nfreq;
2169  }
2170  
2171  void UserDict::set_total_lemma_count_of_others(size_t count) {
2172    total_other_nfreq_ = count;
2173  }
2174  
2175  LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
2176                                     uint16 lemma_len, uint16 count, uint64 lmt) {
2177    LemmaIdType id = get_max_lemma_id() + 1;
2178    size_t offset = dict_info_.lemma_size;
2179    if (offset > kUserDictOffsetMask)
2180      return 0;
2181  
2182    lemmas_[offset] = 0;
2183    lemmas_[offset + 1] = (uint8)lemma_len;
2184    for (size_t i = 0; i < lemma_len; i++) {
2185      *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
2186      *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
2187          = lemma_str[i];
2188    }
2189    uint32 off = dict_info_.lemma_count;
2190    offsets_[off] = offset;
2191    scores_[off] = build_score(lmt, count);
2192    ids_[off] = id;
2193  #ifdef ___PREDICT_ENABLED___
2194    predicts_[off] = offset;
2195  #endif
2196  
2197    offsets_by_id_[id - start_id_] = offset;
2198  
2199    dict_info_.lemma_count++;
2200    dict_info_.lemma_size += (2 + (lemma_len << 2));
2201    lemma_count_left_--;
2202    lemma_size_left_ -= (2 + (lemma_len << 2));
2203  
2204    // Sort
2205  
2206    UserDictSearchable searchable;
2207    prepare_locate(&searchable, splids, lemma_len);
2208  
2209    size_t i = 0;
2210    while (i < off) {
2211      offset = offsets_[i];
2212      uint32 nchar = get_lemma_nchar(offset);
2213      uint16 * spl = get_lemma_spell_ids(offset);
2214  
2215      if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
2216        break;
2217      i++;
2218    }
2219    if (i != off) {
2220      uint32 temp = offsets_[off];
2221      memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
2222      offsets_[i] = temp;
2223  
2224      temp = scores_[off];
2225      memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
2226      scores_[i] = temp;
2227  
2228      temp = ids_[off];
2229      memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
2230      ids_[i] = temp;
2231    }
2232  
2233  #ifdef ___PREDICT_ENABLED___
2234    uint32 j = 0;
2235    uint16 * words_new = get_lemma_word(predicts_[off]);
2236    j = locate_where_to_insert_in_predicts(words_new, lemma_len);
2237    if (j != off) {
2238      uint32 temp = predicts_[off];
2239      memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
2240      predicts_[j] = temp;
2241    }
2242  #endif
2243  
2244    if (state_ < USER_DICT_LEMMA_DIRTY)
2245      state_ = USER_DICT_LEMMA_DIRTY;
2246  
2247  #ifdef ___CACHE_ENABLED___
2248    cache_init();
2249  #endif
2250  
2251    dict_info_.total_nfreq += count;
2252    return id;
2253  }
2254  }
2255