• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2   * Copyright (C) 2009 The Android Open Source Project
3   *
4   * Licensed under the Apache License, Version 2.0 (the "License");
5   * you may not use this file except in compliance with the License.
6   * You may obtain a copy of the License at
7   *
8   *      http://www.apache.org/licenses/LICENSE-2.0
9   *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  #ifndef PINYINIME_INCLUDE_USERDICT_H__
18  #define PINYINIME_INCLUDE_USERDICT_H__
19  
20  #define ___CACHE_ENABLED___
21  #define ___SYNC_ENABLED___
22  #define ___PREDICT_ENABLED___
23  
24  // Debug performance for operations
25  // #define ___DEBUG_PERF___
26  
27  #include <pthread.h>
28  #include "atomdictbase.h"
29  
30  namespace ime_pinyin {
31  
32  class UserDict : public AtomDictBase {
33   public:
34    UserDict();
35    ~UserDict();
36  
37    bool load_dict(const char *file_name, LemmaIdType start_id,
38                   LemmaIdType end_id);
39  
40    bool close_dict();
41  
42    size_t number_of_lemmas();
43  
44    void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
45  
46    MileStoneHandle extend_dict(MileStoneHandle from_handle,
47                                const DictExtPara *dep, LmaPsbItem *lpi_items,
48                                size_t lpi_max, size_t *lpi_num);
49  
50    size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
51                    LmaPsbItem *lpi_items, size_t lpi_max);
52  
53    uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
54                         uint16 str_max);
55  
56    uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
57                            uint16 splids_max, bool arg_valid);
58  
59    size_t predict(const char16 last_hzs[], uint16 hzs_len,
60                   NPredictItem *npre_items, size_t npre_max,
61                   size_t b4_used);
62  
63    // Full spelling ids are required
64    LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
65                          uint16 lemma_len, uint16 count);
66  
67    LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
68                             bool selected);
69  
70    LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
71                             uint16 lemma_len);
72  
73    LmaScoreType get_lemma_score(LemmaIdType lemma_id);
74  
75    LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
76                          uint16 lemma_len);
77  
78    bool remove_lemma(LemmaIdType lemma_id);
79  
80    size_t get_total_lemma_count();
81    void set_total_lemma_count_of_others(size_t count);
82  
83    void flush_cache();
84  
85    void set_limit(uint32 max_lemma_count, uint32 max_lemma_size,
86                   uint32 reclaim_ratio);
87  
88    void reclaim();
89  
90    void defragment();
91  
92  #ifdef ___SYNC_ENABLED___
93    void clear_sync_lemmas(unsigned int start, unsigned int end);
94  
95    int get_sync_count();
96  
97    LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
98                          uint16 lemma_len, uint16 count, uint64 lmt);
99     /**
100      * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag.
101      *
102      * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12'
103      * @param len length of lemmas string in UTF-16LE
104      * @return newly added lemma count
105      */
106    int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len);
107  
108    /**
109     * Get lemmas need sync to a UTF-16LE string of above format.
110     * Note: input buffer (str) must not be too small. If str is too small to
111     *       contain single one lemma, there might be a dead loop.
112     *
113     * @param str buffer to write lemmas
114     * @param size buffer size in UTF-16LE
115     * @param count output value of lemma returned
116     * @return UTF-16LE string length
117     */
118    int get_sync_lemmas_in_utf16le_string_from_beginning(
119        char16 * str, int size, int * count);
120  
121  #endif
122  
123    struct UserDictStat {
124      uint32 version;
125      const char * file_name;
126      struct timeval load_time;
127      struct timeval last_update;
128      uint32 disk_size;
129      uint32 lemma_count;
130      uint32 lemma_size;
131      uint32 delete_count;
132      uint32 delete_size;
133  #ifdef ___SYNC_ENABLED___
134      uint32 sync_count;
135  #endif
136      uint32 reclaim_ratio;
137      uint32 limit_lemma_count;
138      uint32 limit_lemma_size;
139    };
140  
141    bool state(UserDictStat * stat);
142  
143   private:
144    uint32 total_other_nfreq_;
145    struct timeval load_time_;
146    LemmaIdType start_id_;
147    uint32 version_;
148    uint8 * lemmas_;
149  
150    // In-Memory-Only flag for each lemma
151    static const uint8 kUserDictLemmaFlagRemove = 1;
152    // Inuse lemmas' offset
153    uint32 * offsets_;
154    // Highest bit in offset tells whether corresponding lemma is removed
155    static const uint32 kUserDictOffsetFlagRemove = (1 << 31);
156    // Maximum possible for the offset
157    static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
158    // Bit width for last modified time, from 1 to 16
159    static const uint32 kUserDictLMTBitWidth = 16;
160    // Granularity for last modified time in second
161    static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
162    // Maximum frequency count
163    static const uint16 kUserDictMaxFrequency = 0xFFFF;
164  
165  #define COARSE_UTC(year, month, day, hour, minute, second) \
166    ( \
167      (year - 1970) * 365 * 24 * 60 * 60 + \
168      (month - 1) * 30 * 24 * 60 * 60 + \
169      (day - 1) * 24 * 60 * 60 + \
170      (hour - 0) * 60 * 60 + \
171      (minute - 0) * 60 + \
172      (second - 0) \
173    )
174    static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);
175  
176    // Correspond to offsets_
177    uint32 * scores_;
178    // Following two fields are only valid in memory
179    uint32 * ids_;
180  #ifdef ___PREDICT_ENABLED___
181    uint32 * predicts_;
182  #endif
183  #ifdef ___SYNC_ENABLED___
184    uint32 * syncs_;
185    size_t sync_count_size_;
186  #endif
187    uint32 * offsets_by_id_;
188  
189    size_t lemma_count_left_;
190    size_t lemma_size_left_;
191  
192    const char * dict_file_;
193  
194    // Be sure size is 4xN
195    struct UserDictInfo {
196      // When limitation reached, how much percentage will be reclaimed (1 ~ 100)
197      uint32 reclaim_ratio;
198      // maximum lemma count, 0 means no limitation
199      uint32 limit_lemma_count;
200      // Maximum lemma size, it's different from
201      // whole disk file size or in-mem dict size
202      // 0 means no limitation
203      uint32 limit_lemma_size;
204      // Total lemma count including deleted and inuse
205      // Also indicate offsets_ size
206      uint32 lemma_count;
207      // Total size of lemmas including used and freed
208      uint32 lemma_size;
209      // Freed lemma count
210      uint32 free_count;
211      // Freed lemma size in byte
212      uint32 free_size;
213  #ifdef ___SYNC_ENABLED___
214      uint32 sync_count;
215  #endif
216      int32 total_nfreq;
217    } dict_info_;
218  
219    static const uint32 kUserDictVersion = 0x0ABCDEF0;
220  
221    static const uint32 kUserDictPreAlloc = 32;
222    static const uint32 kUserDictAverageNchar = 8;
223  
224    enum UserDictState {
225      // Keep in order
226      USER_DICT_NONE = 0,
227      USER_DICT_SYNC,
228  #ifdef ___SYNC_ENABLED___
229      USER_DICT_SYNC_DIRTY,
230  #endif
231      USER_DICT_SCORE_DIRTY,
232      USER_DICT_OFFSET_DIRTY,
233      USER_DICT_LEMMA_DIRTY,
234  
235      USER_DICT_DEFRAGMENTED,
236    } state_;
237  
238    struct UserDictSearchable {
239      uint16 splids_len;
240      uint16 splid_start[kMaxLemmaSize];
241      uint16 splid_count[kMaxLemmaSize];
242      // Compact inital letters for both FuzzyCompareSpellId and cache system
243      uint32 signature[kMaxLemmaSize / 4];
244    };
245  
246  #ifdef ___CACHE_ENABLED___
247    enum UserDictCacheType {
248      USER_DICT_CACHE,
249      USER_DICT_MISS_CACHE,
250    };
251  
252    static const int kUserDictCacheSize = 4;
253    static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
254  
255    struct UserDictMissCache {
256      uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
257      uint16 head, tail;
258    } miss_caches_[kMaxLemmaSize];
259  
260    struct UserDictCache {
261      uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
262      uint32 offsets[kUserDictCacheSize];
263      uint32 lengths[kUserDictCacheSize];
264      // Ring buffer
265      uint16 head, tail;
266    } caches_[kMaxLemmaSize];
267  
268    void cache_init();
269  
270    void cache_push(UserDictCacheType type,
271                   UserDictSearchable *searchable,
272                   uint32 offset, uint32 length);
273  
274    bool cache_hit(UserDictSearchable *searchable,
275                   uint32 *offset, uint32 *length);
276  
277    bool load_cache(UserDictSearchable *searchable,
278                    uint32 *offset, uint32 *length);
279  
280    void save_cache(UserDictSearchable *searchable,
281                    uint32 offset, uint32 length);
282  
283    void reset_cache();
284  
285    bool load_miss_cache(UserDictSearchable *searchable);
286  
287    void save_miss_cache(UserDictSearchable *searchable);
288  
289    void reset_miss_cache();
290  #endif
291  
292    LmaScoreType translate_score(int f);
293  
294    int extract_score_freq(int raw_score);
295  
296    uint64 extract_score_lmt(int raw_score);
297  
298    inline int build_score(uint64 lmt, int freq);
299  
300    inline int64 utf16le_atoll(uint16 *s, int len);
301  
302    inline int utf16le_lltoa(int64 v, uint16 *s, int size);
303  
304    LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[],
305                          uint16 lemma_len, uint16 count, uint64 lmt);
306  
307    size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len,
308                     LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend);
309  
310    int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
311  
312    int _get_lemma_score(LemmaIdType lemma_id);
313  
314    int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1,
315                                 const UserDictSearchable *searchable);
316  
317    bool is_prefix_spell_id(const uint16 * fullids,
318                            uint16 fulllen, const UserDictSearchable *searchable);
319  
320    uint32 get_dict_file_size(UserDictInfo * info);
321  
322    bool reset(const char *file);
323  
324    bool validate(const char *file);
325  
326    bool load(const char *file, LemmaIdType start_id);
327  
328    bool is_valid_state();
329  
330    bool is_valid_lemma_id(LemmaIdType id);
331  
332    LemmaIdType get_max_lemma_id();
333  
334    void set_lemma_flag(uint32 offset, uint8 flag);
335  
336    char get_lemma_flag(uint32 offset);
337  
338    char get_lemma_nchar(uint32 offset);
339  
340    uint16 * get_lemma_spell_ids(uint32 offset);
341  
342    uint16 * get_lemma_word(uint32 offset);
343  
344    // Prepare searchable to fasten locate process
345    void prepare_locate(UserDictSearchable *searchable,
346                        const uint16 * splids, uint16 len);
347  
348    // Compare initial letters only
349    int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1,
350                                 const UserDictSearchable *searchable);
351  
352    // Compare exactly two spell ids
353    // First argument must be a full id spell id
354    bool equal_spell_id(const uint16 * fullids,
355                        uint16 fulllen, const UserDictSearchable *searchable);
356  
357    // Find first item by initial letters
358    int32 locate_first_in_offsets(const UserDictSearchable *searchable);
359  
360    LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[],
361                             uint16 lemma_len, uint16 count, uint64 lmt);
362  
363    // Check if a lemma is in dictionary
364    int32 locate_in_offsets(char16 lemma_str[],
365                            uint16 splid_str[], uint16 lemma_len);
366  
367    bool remove_lemma_by_offset_index(int offset_index);
368  #ifdef ___PREDICT_ENABLED___
369    uint32 locate_where_to_insert_in_predicts(const uint16 * words,
370                                              int lemma_len);
371  
372    int32 locate_first_in_predicts(const uint16 * words, int lemma_len);
373  
374    void remove_lemma_from_predict_list(uint32 offset);
375  #endif
376  #ifdef ___SYNC_ENABLED___
377    void queue_lemma_for_sync(LemmaIdType id);
378  
379    void remove_lemma_from_sync_list(uint32 offset);
380  
381    void write_back_sync(int fd);
382  #endif
383    void write_back_score(int fd);
384    void write_back_offset(int fd);
385    void write_back_lemma(int fd);
386    void write_back_all(int fd);
387    void write_back();
388  
389    struct UserDictScoreOffsetPair {
390      int score;
391      uint32 offset_index;
392    };
393  
394    inline void swap(UserDictScoreOffsetPair * sop, int i, int j);
395  
396    void shift_down(UserDictScoreOffsetPair * sop, int i, int n);
397  
398    // On-disk format for each lemma
399    // +-------------+
400    // | Version (4) |
401    // +-------------+
402    // +-----------+-----------+--------------------+-------------------+
403    // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
404    // +-----------+-----------+--------------------+-------------------+
405    // ...
406    // +-----------------------+     +-------------+      <---Offset of offset
407    // | Offset1 by_splids (4) | ... | OffsetN (4) |
408    // +-----------------------+     +-------------+
409  #ifdef ___PREDICT_ENABLED___
410    // +----------------------+     +-------------+
411    // | Offset1 by_lemma (4) | ... | OffsetN (4) |
412    // +----------------------+     +-------------+
413  #endif
414    // +------------+     +------------+
415    // | Score1 (4) | ... | ScoreN (4) |
416    // +------------+     +------------+
417  #ifdef ___SYNC_ENABLED___
418    // +-------------+     +-------------+
419    // | NewAdd1 (4) | ... | NewAddN (4) |
420    // +-------------+     +-------------+
421  #endif
422    // +----------------+
423    // | Dict Info (4x) |
424    // +----------------+
425  };
426  }
427  
428  #endif
429