• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef PINYINIME_INCLUDE_USERDICT_H__
18 #define PINYINIME_INCLUDE_USERDICT_H__
19 
20 #define ___CACHE_ENABLED___
21 #define ___SYNC_ENABLED___
22 #define ___PREDICT_ENABLED___
23 
24 // Debug performance for operations
25 // #define ___DEBUG_PERF___
26 
27 #include <pthread.h>
28 #include "atomdictbase.h"
29 
30 namespace ime_pinyin {
31 
32 class UserDict : public AtomDictBase {
33  public:
34   UserDict();
35   ~UserDict();
36 
37   bool load_dict(const char *file_name, LemmaIdType start_id,
38                  LemmaIdType end_id);
39 
40   bool close_dict();
41 
42   size_t number_of_lemmas();
43 
44   void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
45 
46   MileStoneHandle extend_dict(MileStoneHandle from_handle,
47                               const DictExtPara *dep, LmaPsbItem *lpi_items,
48                               size_t lpi_max, size_t *lpi_num);
49 
50   size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
51                   LmaPsbItem *lpi_items, size_t lpi_max);
52 
53   uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
54                        uint16 str_max);
55 
56   uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
57                           uint16 splids_max, bool arg_valid);
58 
59   size_t predict(const char16 last_hzs[], uint16 hzs_len,
60                  NPredictItem *npre_items, size_t npre_max,
61                  size_t b4_used);
62 
63   // Full spelling ids are required
64   LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
65                         uint16 lemma_len, uint16 count);
66 
67   LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
68                            bool selected);
69 
70   LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
71                            uint16 lemma_len);
72 
73   LmaScoreType get_lemma_score(LemmaIdType lemma_id);
74 
75   LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
76                         uint16 lemma_len);
77 
78   bool remove_lemma(LemmaIdType lemma_id);
79 
80   size_t get_total_lemma_count();
81   void set_total_lemma_count_of_others(size_t count);
82 
83   void flush_cache();
84 
85   void set_limit(uint32 max_lemma_count, uint32 max_lemma_size,
86                  uint32 reclaim_ratio);
87 
88   void reclaim();
89 
90   void defragment();
91 
92 #ifdef ___SYNC_ENABLED___
93   void clear_sync_lemmas(unsigned int start, unsigned int end);
94 
95   int get_sync_count();
96 
97   LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
98                         uint16 lemma_len, uint16 count, uint64 lmt);
99    /**
100     * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag.
101     *
102     * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12'
103     * @param len length of lemmas string in UTF-16LE
104     * @return newly added lemma count
105     */
106   int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len);
107 
108   /**
109    * Get lemmas need sync to a UTF-16LE string of above format.
110    * Note: input buffer (str) must not be too small. If str is too small to
111    *       contain single one lemma, there might be a dead loop.
112    *
113    * @param str buffer to write lemmas
114    * @param size buffer size in UTF-16LE
115    * @param count output value of lemma returned
116    * @return UTF-16LE string length
117    */
118   int get_sync_lemmas_in_utf16le_string_from_beginning(
119       char16 * str, int size, int * count);
120 
121 #endif
122 
123   struct UserDictStat {
124     uint32 version;
125     const char * file_name;
126     struct timeval load_time;
127     struct timeval last_update;
128     uint32 disk_size;
129     uint32 lemma_count;
130     uint32 lemma_size;
131     uint32 delete_count;
132     uint32 delete_size;
133 #ifdef ___SYNC_ENABLED___
134     uint32 sync_count;
135 #endif
136     uint32 reclaim_ratio;
137     uint32 limit_lemma_count;
138     uint32 limit_lemma_size;
139   };
140 
141   bool state(UserDictStat * stat);
142 
143  private:
144   uint32 total_other_nfreq_;
145   struct timeval load_time_;
146   LemmaIdType start_id_;
147   uint32 version_;
148   uint8 * lemmas_;
149 
150   // In-Memory-Only flag for each lemma
151   static const uint8 kUserDictLemmaFlagRemove = 1;
152   // Inuse lemmas' offset
153   uint32 * offsets_;
154   // Highest bit in offset tells whether corresponding lemma is removed
155   static const uint32 kUserDictOffsetFlagRemove = (1 << 31);
156   // Maximum possible for the offset
157   static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
158   // Bit width for last modified time, from 1 to 16
159   static const uint32 kUserDictLMTBitWidth = 16;
160   // Granularity for last modified time in second
161   static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
162   // Maximum frequency count
163   static const uint16 kUserDictMaxFrequency = 0xFFFF;
164 
165 #define COARSE_UTC(year, month, day, hour, minute, second) \
166   ( \
167     (year - 1970) * 365 * 24 * 60 * 60 + \
168     (month - 1) * 30 * 24 * 60 * 60 + \
169     (day - 1) * 24 * 60 * 60 + \
170     (hour - 0) * 60 * 60 + \
171     (minute - 0) * 60 + \
172     (second - 0) \
173   )
174   static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);
175 
176   // Correspond to offsets_
177   uint32 * scores_;
178   // Following two fields are only valid in memory
179   uint32 * ids_;
180 #ifdef ___PREDICT_ENABLED___
181   uint32 * predicts_;
182 #endif
183 #ifdef ___SYNC_ENABLED___
184   uint32 * syncs_;
185   size_t sync_count_size_;
186 #endif
187   uint32 * offsets_by_id_;
188 
189   size_t lemma_count_left_;
190   size_t lemma_size_left_;
191 
192   const char * dict_file_;
193 
194   // Be sure size is 4xN
195   struct UserDictInfo {
196     // When limitation reached, how much percentage will be reclaimed (1 ~ 100)
197     uint32 reclaim_ratio;
198     // maximum lemma count, 0 means no limitation
199     uint32 limit_lemma_count;
200     // Maximum lemma size, it's different from
201     // whole disk file size or in-mem dict size
202     // 0 means no limitation
203     uint32 limit_lemma_size;
204     // Total lemma count including deleted and inuse
205     // Also indicate offsets_ size
206     uint32 lemma_count;
207     // Total size of lemmas including used and freed
208     uint32 lemma_size;
209     // Freed lemma count
210     uint32 free_count;
211     // Freed lemma size in byte
212     uint32 free_size;
213 #ifdef ___SYNC_ENABLED___
214     uint32 sync_count;
215 #endif
216     int32 total_nfreq;
217   } dict_info_;
218 
219   static const uint32 kUserDictVersion = 0x0ABCDEF0;
220 
221   static const uint32 kUserDictPreAlloc = 32;
222   static const uint32 kUserDictAverageNchar = 8;
223 
224   enum UserDictState {
225     // Keep in order
226     USER_DICT_NONE = 0,
227     USER_DICT_SYNC,
228 #ifdef ___SYNC_ENABLED___
229     USER_DICT_SYNC_DIRTY,
230 #endif
231     USER_DICT_SCORE_DIRTY,
232     USER_DICT_OFFSET_DIRTY,
233     USER_DICT_LEMMA_DIRTY,
234 
235     USER_DICT_DEFRAGMENTED,
236   } state_;
237 
238   struct UserDictSearchable {
239     uint16 splids_len;
240     uint16 splid_start[kMaxLemmaSize];
241     uint16 splid_count[kMaxLemmaSize];
242     // Compact inital letters for both FuzzyCompareSpellId and cache system
243     uint32 signature[kMaxLemmaSize / 4];
244   };
245 
246 #ifdef ___CACHE_ENABLED___
247   enum UserDictCacheType {
248     USER_DICT_CACHE,
249     USER_DICT_MISS_CACHE,
250   };
251 
252   static const int kUserDictCacheSize = 4;
253   static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
254 
255   struct UserDictMissCache {
256     uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
257     uint16 head, tail;
258   } miss_caches_[kMaxLemmaSize];
259 
260   struct UserDictCache {
261     uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
262     uint32 offsets[kUserDictCacheSize];
263     uint32 lengths[kUserDictCacheSize];
264     // Ring buffer
265     uint16 head, tail;
266   } caches_[kMaxLemmaSize];
267 
268   void cache_init();
269 
270   void cache_push(UserDictCacheType type,
271                  UserDictSearchable *searchable,
272                  uint32 offset, uint32 length);
273 
274   bool cache_hit(UserDictSearchable *searchable,
275                  uint32 *offset, uint32 *length);
276 
277   bool load_cache(UserDictSearchable *searchable,
278                   uint32 *offset, uint32 *length);
279 
280   void save_cache(UserDictSearchable *searchable,
281                   uint32 offset, uint32 length);
282 
283   void reset_cache();
284 
285   bool load_miss_cache(UserDictSearchable *searchable);
286 
287   void save_miss_cache(UserDictSearchable *searchable);
288 
289   void reset_miss_cache();
290 #endif
291 
292   LmaScoreType translate_score(int f);
293 
294   int extract_score_freq(int raw_score);
295 
296   uint64 extract_score_lmt(int raw_score);
297 
298   inline int build_score(uint64 lmt, int freq);
299 
300   inline int64 utf16le_atoll(uint16 *s, int len);
301 
302   inline int utf16le_lltoa(int64 v, uint16 *s, int size);
303 
304   LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[],
305                         uint16 lemma_len, uint16 count, uint64 lmt);
306 
307   size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len,
308                    LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend);
309 
310   int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
311 
312   int _get_lemma_score(LemmaIdType lemma_id);
313 
314   int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1,
315                                const UserDictSearchable *searchable);
316 
317   bool is_prefix_spell_id(const uint16 * fullids,
318                           uint16 fulllen, const UserDictSearchable *searchable);
319 
320   uint32 get_dict_file_size(UserDictInfo * info);
321 
322   bool reset(const char *file);
323 
324   bool validate(const char *file);
325 
326   bool load(const char *file, LemmaIdType start_id);
327 
328   bool is_valid_state();
329 
330   bool is_valid_lemma_id(LemmaIdType id);
331 
332   LemmaIdType get_max_lemma_id();
333 
334   void set_lemma_flag(uint32 offset, uint8 flag);
335 
336   char get_lemma_flag(uint32 offset);
337 
338   char get_lemma_nchar(uint32 offset);
339 
340   uint16 * get_lemma_spell_ids(uint32 offset);
341 
342   uint16 * get_lemma_word(uint32 offset);
343 
344   // Prepare searchable to fasten locate process
345   void prepare_locate(UserDictSearchable *searchable,
346                       const uint16 * splids, uint16 len);
347 
348   // Compare initial letters only
349   int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1,
350                                const UserDictSearchable *searchable);
351 
352   // Compare exactly two spell ids
353   // First argument must be a full id spell id
354   bool equal_spell_id(const uint16 * fullids,
355                       uint16 fulllen, const UserDictSearchable *searchable);
356 
357   // Find first item by initial letters
358   int32 locate_first_in_offsets(const UserDictSearchable *searchable);
359 
360   LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[],
361                            uint16 lemma_len, uint16 count, uint64 lmt);
362 
363   // Check if a lemma is in dictionary
364   int32 locate_in_offsets(char16 lemma_str[],
365                           uint16 splid_str[], uint16 lemma_len);
366 
367   bool remove_lemma_by_offset_index(int offset_index);
368 #ifdef ___PREDICT_ENABLED___
369   uint32 locate_where_to_insert_in_predicts(const uint16 * words,
370                                             int lemma_len);
371 
372   int32 locate_first_in_predicts(const uint16 * words, int lemma_len);
373 
374   void remove_lemma_from_predict_list(uint32 offset);
375 #endif
376 #ifdef ___SYNC_ENABLED___
377   void queue_lemma_for_sync(LemmaIdType id);
378 
379   void remove_lemma_from_sync_list(uint32 offset);
380 
381   void write_back_sync(int fd);
382 #endif
383   void write_back_score(int fd);
384   void write_back_offset(int fd);
385   void write_back_lemma(int fd);
386   void write_back_all(int fd);
387   void write_back();
388 
389   struct UserDictScoreOffsetPair {
390     int score;
391     uint32 offset_index;
392   };
393 
394   inline void swap(UserDictScoreOffsetPair * sop, int i, int j);
395 
396   void shift_down(UserDictScoreOffsetPair * sop, int i, int n);
397 
398   // On-disk format for each lemma
399   // +-------------+
400   // | Version (4) |
401   // +-------------+
402   // +-----------+-----------+--------------------+-------------------+
403   // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
404   // +-----------+-----------+--------------------+-------------------+
405   // ...
406   // +-----------------------+     +-------------+      <---Offset of offset
407   // | Offset1 by_splids (4) | ... | OffsetN (4) |
408   // +-----------------------+     +-------------+
409 #ifdef ___PREDICT_ENABLED___
410   // +----------------------+     +-------------+
411   // | Offset1 by_lemma (4) | ... | OffsetN (4) |
412   // +----------------------+     +-------------+
413 #endif
414   // +------------+     +------------+
415   // | Score1 (4) | ... | ScoreN (4) |
416   // +------------+     +------------+
417 #ifdef ___SYNC_ENABLED___
418   // +-------------+     +-------------+
419   // | NewAdd1 (4) | ... | NewAddN (4) |
420   // +-------------+     +-------------+
421 #endif
422   // +----------------+
423   // | Dict Info (4x) |
424   // +----------------+
425 };
426 }
427 
428 #endif
429