• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <assert.h>
18 #include <stdlib.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include <math.h>
22 #include "../include/spellingtable.h"
23 
24 namespace ime_pinyin {
25 
26 #ifdef ___BUILD_MODEL___
27 
28 const char SpellingTable::
29     kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"};
30 
31 // "" is the biggest, so that all empty strings will be moved to the end
32 // _eb mean empty is biggest
compare_raw_spl_eb(const void * p1,const void * p2)33 int compare_raw_spl_eb(const void* p1, const void* p2) {
34   if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0])
35     return 1;
36 
37   if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0])
38     return -1;
39 
40   return strcmp((static_cast<const RawSpelling*>(p1))->str,
41                 (static_cast<const RawSpelling*>(p2))->str);
42 }
43 
get_odd_next(size_t value)44 size_t get_odd_next(size_t value) {
45   size_t v_next = value;
46   while (true) {
47     size_t v_next_sqrt = (size_t)sqrt(v_next);
48 
49     bool is_odd = true;
50     for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) {
51       if (v_next % v_dv == 0) {
52         is_odd = false;
53         break;
54       }
55     }
56 
57     if (is_odd)
58       return v_next;
59 
60     v_next++;
61   }
62 
63   // never reach here
64   return 0;
65 }
66 
SpellingTable()67 SpellingTable::SpellingTable() {
68   need_score_ = false;
69   raw_spellings_ = NULL;
70   spelling_buf_ = NULL;
71   spelling_num_ = 0;
72   total_freq_ = 0;
73   frozen_ = true;
74 }
75 
~SpellingTable()76 SpellingTable::~SpellingTable() {
77   free_resource();
78 }
79 
get_hash_pos(const char * spelling_str)80 size_t SpellingTable::get_hash_pos(const char* spelling_str) {
81   size_t hash_pos = 0;
82   for (size_t pos = 0; pos < spelling_size_; pos++) {
83     if ('\0' == spelling_str[pos])
84       break;
85     hash_pos += (size_t)spelling_str[pos];
86   }
87 
88   hash_pos = hash_pos % spelling_max_num_;
89   return hash_pos;
90 }
91 
hash_pos_next(size_t hash_pos)92 size_t SpellingTable::hash_pos_next(size_t hash_pos) {
93   hash_pos += 123;
94   hash_pos = hash_pos % spelling_max_num_;
95   return hash_pos;
96 }
97 
free_resource()98 void SpellingTable::free_resource() {
99   if (NULL != raw_spellings_)
100     delete [] raw_spellings_;
101   raw_spellings_ = NULL;
102 
103   if (NULL != spelling_buf_)
104     delete [] spelling_buf_;
105   spelling_buf_ = NULL;
106 }
107 
init_table(size_t pure_spl_size,size_t spl_max_num,bool need_score)108 bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num,
109                                bool need_score) {
110   if (pure_spl_size == 0 || spl_max_num ==0)
111     return false;
112 
113   need_score_ = need_score;
114 
115   free_resource();
116 
117   spelling_size_ = pure_spl_size + 1;
118   if (need_score)
119     spelling_size_ += 1;
120   spelling_max_num_ = get_odd_next(spl_max_num);
121   spelling_num_ = 0;
122 
123   raw_spellings_ = new RawSpelling[spelling_max_num_];
124   spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)];
125   if (NULL == raw_spellings_ || NULL == spelling_buf_) {
126     free_resource();
127     return false;
128   }
129 
130   memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling));
131   memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_));
132   frozen_ = false;
133   total_freq_ = 0;
134   return true;
135 }
136 
put_spelling(const char * spelling_str,double freq)137 bool SpellingTable::put_spelling(const char* spelling_str, double freq) {
138   if (frozen_ || NULL == spelling_str)
139     return false;
140 
141   for (size_t pos = 0; pos < kNotSupportNum; pos++) {
142     if (strcmp(spelling_str, kNotSupportList[pos]) == 0) {
143       return false;
144     }
145   }
146 
147   total_freq_ += freq;
148 
149   size_t hash_pos = get_hash_pos(spelling_str);
150 
151   raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
152 
153   if (strncmp(raw_spellings_[hash_pos].str, spelling_str,
154               spelling_size_ - 1) == 0) {
155     raw_spellings_[hash_pos].freq += freq;
156     return true;
157   }
158 
159   size_t hash_pos_ori = hash_pos;
160 
161   while (true) {
162     if (strncmp(raw_spellings_[hash_pos].str,
163                 spelling_str, spelling_size_ - 1) == 0) {
164       raw_spellings_[hash_pos].freq += freq;
165       return true;
166     }
167 
168     if ('\0' == raw_spellings_[hash_pos].str[0]) {
169       raw_spellings_[hash_pos].freq += freq;
170       strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1);
171       raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
172       spelling_num_++;
173       return true;
174     }
175 
176     hash_pos = hash_pos_next(hash_pos);
177     if (hash_pos_ori == hash_pos)
178       return false;
179   }
180 
181   // never reach here
182   return false;
183 }
184 
contain(const char * spelling_str)185 bool SpellingTable::contain(const char* spelling_str) {
186   if (NULL == spelling_str || NULL == spelling_buf_ || frozen_)
187     return false;
188 
189   size_t hash_pos = get_hash_pos(spelling_str);
190 
191   if ('\0' == raw_spellings_[hash_pos].str[0])
192     return false;
193 
194   if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
195       == 0)
196     return true;
197 
198   size_t hash_pos_ori = hash_pos;
199 
200   while (true) {
201     hash_pos = hash_pos_next(hash_pos);
202     if (hash_pos_ori == hash_pos)
203       return false;
204 
205     if ('\0' == raw_spellings_[hash_pos].str[0])
206       return false;
207 
208     if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
209         == 0)
210       return true;
211   }
212 
213   // never reach here
214   return false;
215 }
216 
arrange(size_t * item_size,size_t * spl_num)217 const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) {
218   if (NULL == raw_spellings_ || NULL == spelling_buf_ ||
219       NULL == item_size || NULL == spl_num)
220     return NULL;
221 
222   qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling),
223         compare_raw_spl_eb);
224 
225   // After sorting, only the first spelling_num_ items are valid.
226   // Copy them to the destination buffer.
227   for (size_t pos = 0; pos < spelling_num_; pos++) {
228     strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str,
229             spelling_size_);
230   }
231 
232   if (need_score_) {
233     if (kPrintDebug0)
234       printf("------------Spelling Possiblities--------------\n");
235 
236     double max_score = 0;
237     double min_score = 0;
238 
239     // After sorting, only the first spelling_num_ items are valid.
240     for (size_t pos = 0; pos < spelling_num_; pos++) {
241       raw_spellings_[pos].freq /= total_freq_;
242       if (need_score_) {
243         if (0 == pos) {
244           max_score = raw_spellings_[0].freq;
245           min_score = max_score;
246         } else {
247           if (raw_spellings_[pos].freq > max_score)
248             max_score = raw_spellings_[pos].freq;
249           if (raw_spellings_[pos].freq < min_score)
250             min_score = raw_spellings_[pos].freq;
251         }
252       }
253     }
254 
255     if (kPrintDebug0)
256       printf("-----max psb: %f, min psb: %f\n", max_score, min_score);
257 
258     max_score = log(max_score);
259     min_score = log(min_score);
260 
261     if (kPrintDebug0)
262       printf("-----max log value: %f, min log value: %f\n",
263              max_score, min_score);
264 
265     // The absolute value of min_score is bigger than that of max_score because
266     // both of them are negative after log function.
267     score_amplifier_ = 1.0 * 255 / min_score;
268 
269     double average_score = 0;
270     for (size_t pos = 0; pos < spelling_num_; pos++) {
271       double score = log(raw_spellings_[pos].freq) * score_amplifier_;
272       assert(score >= 0);
273 
274       average_score += score;
275 
276       // Because of calculation precision issue, score might be a little bigger
277       // than 255 after being amplified.
278       if (score > 255)
279         score = 255;
280       char *this_spl_buf = spelling_buf_ + pos * spelling_size_;
281       this_spl_buf[spelling_size_ - 1] =
282           static_cast<char>((unsigned char)score);
283 
284       if (kPrintDebug0) {
285         printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf,
286                (unsigned char)this_spl_buf[spelling_size_ -1]);
287       }
288     }
289     average_score /= spelling_num_;
290     assert(average_score <= 255);
291     average_score_ = static_cast<uint8>(average_score);
292 
293     if (kPrintDebug0)
294       printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_,
295              average_score_);
296   }
297 
298   *item_size = spelling_size_;
299   *spl_num = spelling_num_;
300   frozen_ = true;
301   return spelling_buf_;
302 }
303 
get_score_amplifier()304 float SpellingTable::get_score_amplifier() {
305   return static_cast<float>(score_amplifier_);
306 }
307 
get_average_score()308 unsigned char SpellingTable::get_average_score() {
309   return average_score_;
310 }
311 
312 #endif  // ___BUILD_MODEL___
313 }  // namespace ime_pinyin
314