1 /*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <assert.h>
18 #include <math.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include <time.h>
22 #include "../include/mystdlib.h"
23 #include "../include/ngram.h"
24
25 namespace ime_pinyin {
26
27 #define ADD_COUNT 0.3
28
comp_double(const void * p1,const void * p2)29 int comp_double(const void *p1, const void *p2) {
30 if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2))
31 return -1;
32 if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2))
33 return 1;
34 return 0;
35 }
36
distance(double freq,double code)37 inline double distance(double freq, double code) {
38 // return fabs(freq - code);
39 return freq * fabs(log(freq) - log(code));
40 }
41
42 // Find the index of the code value which is nearest to the given freq
qsearch_nearest(double code_book[],double freq,int start,int end)43 int qsearch_nearest(double code_book[], double freq, int start, int end) {
44 if (start == end)
45 return start;
46
47 if (start + 1 == end) {
48 if (distance(freq, code_book[end]) > distance(freq, code_book[start]))
49 return start;
50 return end;
51 }
52
53 int mid = (start + end) / 2;
54
55 if (code_book[mid] > freq)
56 return qsearch_nearest(code_book, freq, start, mid);
57 else
58 return qsearch_nearest(code_book, freq, mid, end);
59 }
60
update_code_idx(double freqs[],size_t num,double code_book[],CODEBOOK_TYPE * code_idx)61 size_t update_code_idx(double freqs[], size_t num, double code_book[],
62 CODEBOOK_TYPE *code_idx) {
63 size_t changed = 0;
64 for (size_t pos = 0; pos < num; pos++) {
65 CODEBOOK_TYPE idx;
66 idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1);
67 if (idx != code_idx[pos])
68 changed++;
69 code_idx[pos] = idx;
70 }
71 return changed;
72 }
73
recalculate_kernel(double freqs[],size_t num,double code_book[],CODEBOOK_TYPE * code_idx)74 double recalculate_kernel(double freqs[], size_t num, double code_book[],
75 CODEBOOK_TYPE *code_idx) {
76 double ret = 0;
77
78 size_t *item_num = new size_t[kCodeBookSize];
79 assert(item_num);
80 memset(item_num, 0, sizeof(size_t) * kCodeBookSize);
81
82 double *cb_new = new double[kCodeBookSize];
83 assert(cb_new);
84 memset(cb_new, 0, sizeof(double) * kCodeBookSize);
85
86 for (size_t pos = 0; pos < num; pos++) {
87 ret += distance(freqs[pos], code_book[code_idx[pos]]);
88
89 cb_new[code_idx[pos]] += freqs[pos];
90 item_num[code_idx[pos]] += 1;
91 }
92
93 for (size_t code = 0; code < kCodeBookSize; code++) {
94 assert(item_num[code] > 0);
95 code_book[code] = cb_new[code] / item_num[code];
96 }
97
98 delete [] item_num;
99 delete [] cb_new;
100
101 return ret;
102 }
103
iterate_codes(double freqs[],size_t num,double code_book[],CODEBOOK_TYPE * code_idx)104 void iterate_codes(double freqs[], size_t num, double code_book[],
105 CODEBOOK_TYPE *code_idx) {
106 size_t iter_num = 0;
107 double delta_last = 0;
108 do {
109 size_t changed = update_code_idx(freqs, num, code_book, code_idx);
110
111 double delta = recalculate_kernel(freqs, num, code_book, code_idx);
112
113 if (kPrintDebug0) {
114 printf("---Unigram codebook iteration: %d : %d, %.9f\n",
115 iter_num, changed, delta);
116 }
117 iter_num++;
118
119 if (iter_num > 1 &&
120 (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001))
121 break;
122 delta_last = delta;
123 } while (true);
124 }
125
126
127 NGram* NGram::instance_ = NULL;
128
NGram()129 NGram::NGram() {
130 initialized_ = false;
131 idx_num_ = 0;
132 lma_freq_idx_ = NULL;
133 sys_score_compensation_ = 0;
134
135 #ifdef ___BUILD_MODEL___
136 freq_codes_df_ = NULL;
137 #endif
138 freq_codes_ = NULL;
139 }
140
~NGram()141 NGram::~NGram() {
142 if (NULL != lma_freq_idx_)
143 free(lma_freq_idx_);
144
145 #ifdef ___BUILD_MODEL___
146 if (NULL != freq_codes_df_)
147 free(freq_codes_df_);
148 #endif
149
150 if (NULL != freq_codes_)
151 free(freq_codes_);
152 }
153
get_instance()154 NGram& NGram::get_instance() {
155 if (NULL == instance_)
156 instance_ = new NGram();
157 return *instance_;
158 }
159
save_ngram(FILE * fp)160 bool NGram::save_ngram(FILE *fp) {
161 if (!initialized_ || NULL == fp)
162 return false;
163
164 if (0 == idx_num_ || NULL == freq_codes_ || NULL == lma_freq_idx_)
165 return false;
166
167 if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 1)
168 return false;
169
170 if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
171 kCodeBookSize)
172 return false;
173
174 if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
175 return false;
176
177 return true;
178 }
179
load_ngram(FILE * fp)180 bool NGram::load_ngram(FILE *fp) {
181 if (NULL == fp)
182 return false;
183
184 initialized_ = false;
185
186 if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 )
187 return false;
188
189 if (NULL != lma_freq_idx_)
190 free(lma_freq_idx_);
191
192 if (NULL != freq_codes_)
193 free(freq_codes_);
194
195 lma_freq_idx_ = static_cast<CODEBOOK_TYPE*>
196 (malloc(idx_num_ * sizeof(CODEBOOK_TYPE)));
197 freq_codes_ = static_cast<LmaScoreType*>
198 (malloc(kCodeBookSize * sizeof(LmaScoreType)));
199
200 if (NULL == lma_freq_idx_ || NULL == freq_codes_)
201 return false;
202
203 if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
204 kCodeBookSize)
205 return false;
206
207 if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
208 return false;
209
210 initialized_ = true;
211
212 total_freq_none_sys_ = 0;
213 return true;
214 }
215
set_total_freq_none_sys(size_t freq_none_sys)216 void NGram::set_total_freq_none_sys(size_t freq_none_sys) {
217 total_freq_none_sys_ = freq_none_sys;
218 if (0 == total_freq_none_sys_) {
219 sys_score_compensation_ = 0;
220 } else {
221 double factor = static_cast<double>(kSysDictTotalFreq) / (
222 kSysDictTotalFreq + total_freq_none_sys_);
223 sys_score_compensation_ = static_cast<float>(
224 log(factor) * kLogValueAmplifier);
225 }
226 }
227
228 // The caller makes sure this oject is initialized.
get_uni_psb(LemmaIdType lma_id)229 float NGram::get_uni_psb(LemmaIdType lma_id) {
230 return static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) +
231 sys_score_compensation_;
232 }
233
convert_psb_to_score(double psb)234 float NGram::convert_psb_to_score(double psb) {
235 float score = static_cast<float>(
236 log(psb) * static_cast<double>(kLogValueAmplifier));
237 if (score > static_cast<float>(kMaxScore)) {
238 score = static_cast<float>(kMaxScore);
239 }
240 return score;
241 }
242
243 #ifdef ___BUILD_MODEL___
build_unigram(LemmaEntry * lemma_arr,size_t lemma_num,LemmaIdType next_idx_unused)244 bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num,
245 LemmaIdType next_idx_unused) {
246 if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1)
247 return false;
248
249 double total_freq = 0;
250 double *freqs = new double[next_idx_unused];
251 if (NULL == freqs)
252 return false;
253
254 freqs[0] = ADD_COUNT;
255 total_freq += freqs[0];
256 LemmaIdType idx_now = 0;
257 for (size_t pos = 0; pos < lemma_num; pos++) {
258 if (lemma_arr[pos].idx_by_hz == idx_now)
259 continue;
260 idx_now++;
261
262 assert(lemma_arr[pos].idx_by_hz == idx_now);
263
264 freqs[idx_now] = lemma_arr[pos].freq;
265 if (freqs[idx_now] <= 0)
266 freqs[idx_now] = 0.3;
267
268 total_freq += freqs[idx_now];
269 }
270
271 double max_freq = 0;
272 idx_num_ = idx_now + 1;
273 assert(idx_now + 1 == next_idx_unused);
274
275 for (size_t pos = 0; pos < idx_num_; pos++) {
276 freqs[pos] = freqs[pos] / total_freq;
277 assert(freqs[pos] > 0);
278 if (freqs[pos] > max_freq)
279 max_freq = freqs[pos];
280 }
281
282 // calculate the code book
283 if (NULL == freq_codes_df_)
284 freq_codes_df_ = new double[kCodeBookSize];
285 assert(freq_codes_df_);
286 memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize);
287
288 if (NULL == freq_codes_)
289 freq_codes_ = new LmaScoreType[kCodeBookSize];
290 assert(freq_codes_);
291 memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize);
292
293 size_t freq_pos = 0;
294 for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
295 bool found = true;
296
297 while (found) {
298 found = false;
299 double cand = freqs[freq_pos];
300 for (size_t i = 0; i < code_pos; i++)
301 if (freq_codes_df_[i] == cand) {
302 found = true;
303 break;
304 }
305 if (found)
306 freq_pos++;
307 }
308
309 freq_codes_df_[code_pos] = freqs[freq_pos];
310 freq_pos++;
311 }
312
313 myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);
314
315 if (NULL == lma_freq_idx_)
316 lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_];
317 assert(lma_freq_idx_);
318
319 iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_);
320
321 delete [] freqs;
322
323 if (kPrintDebug0) {
324 printf("\n------Language Model Unigram Codebook------\n");
325 }
326
327 for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
328 double log_score = log(freq_codes_df_[code_pos]);
329 float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
330 if (kPrintDebug0) {
331 printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n",
332 code_pos, freq_codes_df_[code_pos], log_score, final_score);
333 }
334 freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
335 }
336
337 initialized_ = true;
338 return true;
339 }
340 #endif
341
342 } // namespace ime_pinyin
343