1 /*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <assert.h>
18 #include <stdlib.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include <math.h>
22 #include "../include/spellingtable.h"
23
24 namespace ime_pinyin {
25
26 #ifdef ___BUILD_MODEL___
27
28 const char SpellingTable::
29 kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"};
30
31 // "" is the biggest, so that all empty strings will be moved to the end
32 // _eb mean empty is biggest
compare_raw_spl_eb(const void * p1,const void * p2)33 int compare_raw_spl_eb(const void* p1, const void* p2) {
34 if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0])
35 return 1;
36
37 if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0])
38 return -1;
39
40 return strcmp((static_cast<const RawSpelling*>(p1))->str,
41 (static_cast<const RawSpelling*>(p2))->str);
42 }
43
get_odd_next(size_t value)44 size_t get_odd_next(size_t value) {
45 size_t v_next = value;
46 while (true) {
47 size_t v_next_sqrt = (size_t)sqrt(v_next);
48
49 bool is_odd = true;
50 for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) {
51 if (v_next % v_dv == 0) {
52 is_odd = false;
53 break;
54 }
55 }
56
57 if (is_odd)
58 return v_next;
59
60 v_next++;
61 }
62
63 // never reach here
64 return 0;
65 }
66
SpellingTable()67 SpellingTable::SpellingTable() {
68 need_score_ = false;
69 raw_spellings_ = NULL;
70 spelling_buf_ = NULL;
71 spelling_num_ = 0;
72 total_freq_ = 0;
73 frozen_ = true;
74 }
75
~SpellingTable()76 SpellingTable::~SpellingTable() {
77 free_resource();
78 }
79
get_hash_pos(const char * spelling_str)80 size_t SpellingTable::get_hash_pos(const char* spelling_str) {
81 size_t hash_pos = 0;
82 for (size_t pos = 0; pos < spelling_size_; pos++) {
83 if ('\0' == spelling_str[pos])
84 break;
85 hash_pos += (size_t)spelling_str[pos];
86 }
87
88 hash_pos = hash_pos % spelling_max_num_;
89 return hash_pos;
90 }
91
hash_pos_next(size_t hash_pos)92 size_t SpellingTable::hash_pos_next(size_t hash_pos) {
93 hash_pos += 123;
94 hash_pos = hash_pos % spelling_max_num_;
95 return hash_pos;
96 }
97
free_resource()98 void SpellingTable::free_resource() {
99 if (NULL != raw_spellings_)
100 delete [] raw_spellings_;
101 raw_spellings_ = NULL;
102
103 if (NULL != spelling_buf_)
104 delete [] spelling_buf_;
105 spelling_buf_ = NULL;
106 }
107
init_table(size_t pure_spl_size,size_t spl_max_num,bool need_score)108 bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num,
109 bool need_score) {
110 if (pure_spl_size == 0 || spl_max_num ==0)
111 return false;
112
113 need_score_ = need_score;
114
115 free_resource();
116
117 spelling_size_ = pure_spl_size + 1;
118 if (need_score)
119 spelling_size_ += 1;
120 spelling_max_num_ = get_odd_next(spl_max_num);
121 spelling_num_ = 0;
122
123 raw_spellings_ = new RawSpelling[spelling_max_num_];
124 spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)];
125 if (NULL == raw_spellings_ || NULL == spelling_buf_) {
126 free_resource();
127 return false;
128 }
129
130 memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling));
131 memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_));
132 frozen_ = false;
133 total_freq_ = 0;
134 return true;
135 }
136
put_spelling(const char * spelling_str,double freq)137 bool SpellingTable::put_spelling(const char* spelling_str, double freq) {
138 if (frozen_ || NULL == spelling_str)
139 return false;
140
141 for (size_t pos = 0; pos < kNotSupportNum; pos++) {
142 if (strcmp(spelling_str, kNotSupportList[pos]) == 0) {
143 return false;
144 }
145 }
146
147 total_freq_ += freq;
148
149 size_t hash_pos = get_hash_pos(spelling_str);
150
151 raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
152
153 if (strncmp(raw_spellings_[hash_pos].str, spelling_str,
154 spelling_size_ - 1) == 0) {
155 raw_spellings_[hash_pos].freq += freq;
156 return true;
157 }
158
159 size_t hash_pos_ori = hash_pos;
160
161 while (true) {
162 if (strncmp(raw_spellings_[hash_pos].str,
163 spelling_str, spelling_size_ - 1) == 0) {
164 raw_spellings_[hash_pos].freq += freq;
165 return true;
166 }
167
168 if ('\0' == raw_spellings_[hash_pos].str[0]) {
169 raw_spellings_[hash_pos].freq += freq;
170 strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1);
171 raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
172 spelling_num_++;
173 return true;
174 }
175
176 hash_pos = hash_pos_next(hash_pos);
177 if (hash_pos_ori == hash_pos)
178 return false;
179 }
180
181 // never reach here
182 return false;
183 }
184
contain(const char * spelling_str)185 bool SpellingTable::contain(const char* spelling_str) {
186 if (NULL == spelling_str || NULL == spelling_buf_ || frozen_)
187 return false;
188
189 size_t hash_pos = get_hash_pos(spelling_str);
190
191 if ('\0' == raw_spellings_[hash_pos].str[0])
192 return false;
193
194 if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
195 == 0)
196 return true;
197
198 size_t hash_pos_ori = hash_pos;
199
200 while (true) {
201 hash_pos = hash_pos_next(hash_pos);
202 if (hash_pos_ori == hash_pos)
203 return false;
204
205 if ('\0' == raw_spellings_[hash_pos].str[0])
206 return false;
207
208 if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
209 == 0)
210 return true;
211 }
212
213 // never reach here
214 return false;
215 }
216
arrange(size_t * item_size,size_t * spl_num)217 const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) {
218 if (NULL == raw_spellings_ || NULL == spelling_buf_ ||
219 NULL == item_size || NULL == spl_num)
220 return NULL;
221
222 qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling),
223 compare_raw_spl_eb);
224
225 // After sorting, only the first spelling_num_ items are valid.
226 // Copy them to the destination buffer.
227 for (size_t pos = 0; pos < spelling_num_; pos++) {
228 strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str,
229 spelling_size_);
230 }
231
232 if (need_score_) {
233 if (kPrintDebug0)
234 printf("------------Spelling Possiblities--------------\n");
235
236 double max_score = 0;
237 double min_score = 0;
238
239 // After sorting, only the first spelling_num_ items are valid.
240 for (size_t pos = 0; pos < spelling_num_; pos++) {
241 raw_spellings_[pos].freq /= total_freq_;
242 if (need_score_) {
243 if (0 == pos) {
244 max_score = raw_spellings_[0].freq;
245 min_score = max_score;
246 } else {
247 if (raw_spellings_[pos].freq > max_score)
248 max_score = raw_spellings_[pos].freq;
249 if (raw_spellings_[pos].freq < min_score)
250 min_score = raw_spellings_[pos].freq;
251 }
252 }
253 }
254
255 if (kPrintDebug0)
256 printf("-----max psb: %f, min psb: %f\n", max_score, min_score);
257
258 max_score = log(max_score);
259 min_score = log(min_score);
260
261 if (kPrintDebug0)
262 printf("-----max log value: %f, min log value: %f\n",
263 max_score, min_score);
264
265 // The absolute value of min_score is bigger than that of max_score because
266 // both of them are negative after log function.
267 score_amplifier_ = 1.0 * 255 / min_score;
268
269 double average_score = 0;
270 for (size_t pos = 0; pos < spelling_num_; pos++) {
271 double score = log(raw_spellings_[pos].freq) * score_amplifier_;
272 assert(score >= 0);
273
274 average_score += score;
275
276 // Because of calculation precision issue, score might be a little bigger
277 // than 255 after being amplified.
278 if (score > 255)
279 score = 255;
280 char *this_spl_buf = spelling_buf_ + pos * spelling_size_;
281 this_spl_buf[spelling_size_ - 1] =
282 static_cast<char>((unsigned char)score);
283
284 if (kPrintDebug0) {
285 printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf,
286 (unsigned char)this_spl_buf[spelling_size_ -1]);
287 }
288 }
289 average_score /= spelling_num_;
290 assert(average_score <= 255);
291 average_score_ = static_cast<uint8>(average_score);
292
293 if (kPrintDebug0)
294 printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_,
295 average_score_);
296 }
297
298 *item_size = spelling_size_;
299 *spl_num = spelling_num_;
300 frozen_ = true;
301 return spelling_buf_;
302 }
303
get_score_amplifier()304 float SpellingTable::get_score_amplifier() {
305 return static_cast<float>(score_amplifier_);
306 }
307
get_average_score()308 unsigned char SpellingTable::get_average_score() {
309 return average_score_;
310 }
311
312 #endif // ___BUILD_MODEL___
313 } // namespace ime_pinyin
314