• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef PINYINIME_INCLUDE_NGRAM_H__
18 #define PINYINIME_INCLUDE_NGRAM_H__
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include "./dictdef.h"
23 
24 namespace ime_pinyin {
25 
// Element type used for codebook indices (see NGram::lma_freq_idx_, which is
// declared as a CODEBOOK_TYPE pointer).
26 typedef unsigned char CODEBOOK_TYPE;
27 
// Number of entries in the codebook.
// NOTE(review): 256 presumably matches the number of distinct values an 8-bit
// CODEBOOK_TYPE can index -- confirm against the implementation file.
28 static const size_t kCodeBookSize = 256;
29 
// Declares the unigram language-model interface of the IME: a per-lemma lookup
// (get_uni_psb), a probability-to-score conversion (convert_psb_to_score), and
// saving/loading of the model data through a FILE stream.
// get_instance() returns a reference to an NGram object; the constructor is
// also public.
30 class NGram {
31  public:
32   // The maximum score of a lemma item.
33   static const LmaScoreType kMaxScore = 0x3fff;
34 
35   // In order to reduce the storage size, the original log value is amplified by
36   // kLogValueAmplifier, and the result is stored as an LmaScoreType.
37   // After this process, an item with a lower score has a higher frequency.
38   static const int kLogValueAmplifier = -800;
39 
40   // System words' total frequency. It is not the real total frequency; instead,
41   // it is only used to adjust system lemmas' scores when the user dictionary's
42   // total frequency changes.
43   // In this version, frequencies of system lemmas are fixed. We are considering
44   // making them changeable in the next version.
45   static const size_t kSysDictTotalFreq = 100000000;
46 
47  private:
48 
     // Class-wide instance pointer.
     // NOTE(review): presumably the object handed out by get_instance(); the
     // definition is not visible in this header -- confirm.
49   static NGram* instance_;
50 
     // NOTE(review): presumably true once model data has been loaded or built,
     // and idx_num_ is presumably the number of entries in lma_freq_idx_ --
     // confirm both against the implementation file.
51   bool initialized_;
52   size_t idx_num_;
53 
     // Total frequency of all non-system dictionaries.
     // NOTE(review): presumably the value passed to set_total_freq_none_sys()
     // -- confirm.
54   size_t total_freq_none_sys_;
55 
56   // Score compensation for system dictionary lemmas.
57   // After the user adds some user lemmas, the total frequency changes, and
58   // this value is used to normalize the score.
59   float sys_score_compensation_;
60 
     // freq_codes_df_ exists only when ___BUILD_MODEL___ is defined.
     // NOTE(review): looks like a double-precision counterpart of freq_codes_
     // used while building the model -- confirm.
61 #ifdef ___BUILD_MODEL___
62   double *freq_codes_df_;
63 #endif
     // NOTE(review): presumably freq_codes_ is the codebook of scores (up to
     // kCodeBookSize entries) and lma_freq_idx_ holds one codebook index per
     // lemma -- confirm. Ownership and allocation are not shown in this header.
64   LmaScoreType *freq_codes_;
65   CODEBOOK_TYPE *lma_freq_idx_;
66 
67  public:
68   NGram();
69   ~NGram();
70 
     // Returns a reference to an NGram object.
     // NOTE(review): presumably a lazily created shared instance backed by
     // instance_ -- confirm.
71   static NGram& get_instance();
72 
     // Save/load the model data through the given FILE stream.
     // NOTE(review): the bool result presumably reports success -- confirm.
73   bool save_ngram(FILE *fp);
74   bool load_ngram(FILE *fp);
75 
76   // Set the total frequency of all non-system dictionaries.
77   void set_total_freq_none_sys(size_t freq_none_sys);
78 
     // Returns a float value for the lemma identified by lma_id.
     // NOTE(review): the name suggests a unigram probability, but the value may
     // actually be a score -- confirm against the implementation file.
79   float get_uni_psb(LemmaIdType lma_id);
80 
81   // Convert a probability to a score. Actually, the score will be limited to
82   // kMaxScore, but at runtime we also need a float expression to get an
83   // accurate value of the score.
84   // After the conversion, a lower score indicates a higher probability of the
85   // item.
86   static float convert_psb_to_score(double psb);
87 
88 #ifdef ___BUILD_MODEL___
89   // For constructing the unigram model. Only declared when ___BUILD_MODEL___
     // is defined.
90   bool build_unigram(LemmaEntry *lemma_arr, size_t num,
91                      LemmaIdType next_idx_unused);
92 #endif
93 };
94 }
95 
96 #endif  // PINYINIME_INCLUDE_NGRAM_H__
97