1 /* -*-C-*-
2 ********************************************************************************
3 *
4 * File: permdawg.c (Formerly permdawg.c)
5 * Description: Scale word choices by a dictionary
6 * Author: Mark Seaman, OCR Technology
7 * Created: Fri Oct 16 14:37:00 1987
8 * Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
9 * Language: C
10 * Package: N/A
11 * Status: Reusable Software Component
12 *
13 * (c) Copyright 1987, Hewlett-Packard Company.
14 ** Licensed under the Apache License, Version 2.0 (the "License");
15 ** you may not use this file except in compliance with the License.
16 ** You may obtain a copy of the License at
17 ** http://www.apache.org/licenses/LICENSE-2.0
18 ** Unless required by applicable law or agreed to in writing, software
19 ** distributed under the License is distributed on an "AS IS" BASIS,
20 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 ** See the License for the specific language governing permissions and
22 ** limitations under the License.
23 *
24 *********************************************************************************/
25 /*----------------------------------------------------------------------
26 I n c l u d e s
27 ----------------------------------------------------------------------*/
28
29 #include "context.h"
30 #include "conversion.h"
31 #include "cutil.h"
32 #include "dawg.h"
33 #include "freelist.h"
34 #include "globals.h"
35 #include "ndminx.h"
36 #include "permdawg.h"
37 #include "permute.h"
38 #include "stopper.h"
39 #include "tordvars.h"
40 #include "tprintf.h"
41 #include "varable.h"
42
43 #include <ctype.h>
44 #include "dict.h"
45 #include "image.h"
46
47 /*----------------------------------------------------------------------
48 V a r i a b l e s
49 ----------------------------------------------------------------------*/
50 BOOL_VAR(segment_dawg_debug, 0, "Debug mode for word segmentation");
51
52 double_VAR(segment_penalty_dict_case_bad, OK_WERD,
53 "Default score multiplier for word matches, which may have "
54 "case issues (lower is better).");
55
56 double_VAR(segment_penalty_dict_case_ok, GOOD_WERD,
57 "Score multiplier for word matches that have good case "
58 "(lower is better).");
59
60 double_VAR(segment_penalty_dict_frequent_word, FREQ_WERD,
61 "Score multiplier for word matches which have good case and are "
62 "frequent in the given language (lower is better).");
63
64 /*----------------------------------------------------------------------
65 F u n c t i o n s
66 ----------------------------------------------------------------------*/
67 namespace tesseract {
68
69 static const float kPermDawgRatingPad = 5.0;
70
71 /**********************************************************************
72 * adjust_word
73 *
74 * Assign an adjusted value to a string that is a word. The value
75 * that this word choice has is based on case and punctuation rules.
76 **********************************************************************/
adjust_word(WERD_CHOICE * word,float * certainty_array)77 void Dict::adjust_word(WERD_CHOICE *word,
78 float *certainty_array) {
79 float adjust_factor;
80 float new_rating = word->rating();
81
82 if (segment_dawg_debug) {
83 tprintf ("Word: %s %4.2f ",
84 word->debug_string(getUnicharset()).string(), word->rating());
85 }
86
87 new_rating += RATING_PAD;
88 if (Context::case_ok(*word, getUnicharset())) {
89 if (freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
90 word->set_permuter(FREQ_DAWG_PERM);
91 new_rating *= segment_penalty_dict_frequent_word;
92 adjust_factor = segment_penalty_dict_frequent_word;
93 if (segment_dawg_debug)
94 tprintf(", F, %4.2f ", (double)segment_penalty_dict_frequent_word);
95 } else {
96 new_rating *= segment_penalty_dict_case_ok;
97 adjust_factor = segment_penalty_dict_case_ok;
98 if (segment_dawg_debug)
99 tprintf(", %4.2f ", (double)segment_penalty_dict_case_ok);
100 }
101 } else {
102 new_rating *= segment_penalty_dict_case_bad;
103 adjust_factor = segment_penalty_dict_case_bad;
104 if (segment_dawg_debug) {
105 tprintf(", C %4.2f ", (double)segment_penalty_dict_case_bad);
106 }
107 }
108 new_rating -= RATING_PAD;
109 word->set_rating(new_rating);
110
111 LogNewChoice(*word, adjust_factor, certainty_array, false);
112
113 if (segment_dawg_debug)
114 tprintf(" --> %4.2f\n", new_rating);
115 }
116
117 /**********************************************************************
118 * go_deeper_dawg_fxn
119 *
120 * If the choice being composed so far could be a dictionary word
121 * keep exploring choices.
122 **********************************************************************/
go_deeper_dawg_fxn(const char * debug,const BLOB_CHOICE_LIST_VECTOR & char_choices,int char_choice_index,const CHAR_FRAGMENT_INFO * prev_char_frag_info,bool word_ending,WERD_CHOICE * word,float certainties[],float * limit,WERD_CHOICE * best_choice,void * void_more_args)123 void Dict::go_deeper_dawg_fxn(
124 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
125 int char_choice_index,
126 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
127 bool word_ending, WERD_CHOICE *word, float certainties[],
128 float *limit, WERD_CHOICE *best_choice, void *void_more_args) {
129 DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
130 int word_index = word->length() - 1;
131
132 // There are two modes for deciding whether to go deeper: regular dawg
133 // permuter mode and the special ambigs mode. If *limit is <= 0.0 the
134 // function switches to the ambigs mode (this is the case when
135 // dawg_permute_and_select() function is called from NoDangerousAmbigs()) and
136 // only searches for the first choice that has a rating better than *limit
137 // (in this case ratings are fake, since the real ratings can not be < 0).
138 // Modification of the hyphen state is turned off in the ambigs mode.
139 // When in the regular dawg permuter mode, the function explores all the
140 // possible words and chooses the one with the best rating. The letters with
141 // ratings that are far worse than the ones seen so far are pruned out.
142 bool ambigs_mode = (*limit <= 0.0);
143 if (ambigs_mode) {
144 if (best_choice->rating() < *limit) return;
145 } else {
146 // Prune bad subwords
147 if (more_args->rating_array[word_index] == NO_RATING) {
148 more_args->rating_array[word_index] = word->rating();
149 } else {
150 float permdawg_limit = more_args->rating_array[word_index] *
151 more_args->rating_margin + kPermDawgRatingPad;
152 if (permdawg_limit < word->rating()) {
153 if (segment_dawg_debug) {
154 tprintf("early pruned word rating=%4.2f,"
155 " permdawg_limit=%4.2f, word=%s\n", word->rating(),
156 permdawg_limit, word->debug_string(getUnicharset()).string());
157 }
158 return;
159 }
160 }
161 }
162 // Deal with hyphens
163 if (word_ending && has_hyphen_end(*word) && !ambigs_mode) {
164 if (segment_dawg_debug)
165 tprintf("new hyphen choice = %s\n",
166 word->debug_string(getUnicharset()).string());
167 word->set_permuter(more_args->permuter);
168 adjust_word(word, certainties);
169 set_hyphen_word(*word, *(more_args->active_dawgs),
170 *(more_args->constraints));
171 update_best_choice(*word, best_choice);
172 } else { // Look up char in DAWG
173 // TODO(daria): update the rest of the code that specifies alternative
174 // letter_is_okay_ functions (e.g. TessCharNgram class) to work with
175 // multi-byte unichars and/or unichar ids.
176
177 // If the current unichar is an ngram first try calling
178 // letter_is_okay() for each unigram it contains separately.
179 UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
180 bool checked_unigrams = false;
181 if (getUnicharset().get_isngram(orig_uch_id)) {
182 if (segment_dawg_debug) {
183 tprintf("checking unigrams in an ngram %s\n",
184 getUnicharset().debug_str(orig_uch_id).string());
185 }
186 int orig_num_fragments = word->fragment_length(word_index);
187 int num_unigrams = 0;
188 word->remove_last_unichar_id();
189 const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
190 const char *ngram_str_end = ngram_str + strlen(ngram_str);
191 const char *ngram_ptr = ngram_str;
192 bool unigrams_ok = true;
193 // Construct DawgArgs that reflect the current state.
194 DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs);
195 DawgInfoVector unigram_constraints = *(more_args->constraints);
196 DawgInfoVector unigram_updated_active_dawgs;
197 DawgInfoVector unigram_updated_constraints;
198 DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_constraints,
199 &unigram_updated_active_dawgs,
200 &unigram_updated_constraints, 0.0);
201 unigram_dawg_args.permuter = more_args->permuter;
202 // Check unigrams in the ngram with letter_is_okay().
203 while (unigrams_ok && ngram_ptr < ngram_str_end) {
204 int step = getUnicharset().step(ngram_ptr);
205 UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID :
206 getUnicharset().unichar_to_id(ngram_ptr, step);
207 ngram_ptr += step;
208 ++num_unigrams;
209 word->append_unichar_id(uch_id, 1, 0.0, 0.0);
210 unigrams_ok = unigrams_ok && (this->*letter_is_okay_)(
211 &unigram_dawg_args, word_index+num_unigrams-1, word,
212 word_ending && (ngram_ptr == ngram_str_end));
213 (*unigram_dawg_args.active_dawgs) =
214 *(unigram_dawg_args.updated_active_dawgs);
215 (*unigram_dawg_args.constraints) =
216 *(unigram_dawg_args.updated_constraints);
217 if (segment_dawg_debug) {
218 tprintf("unigram %s is %s\n",
219 getUnicharset().debug_str(uch_id).string(),
220 unigrams_ok ? "OK" : "not OK");
221 }
222 }
223 // Restore the word and copy the updated dawg state if needed.
224 while (num_unigrams-- > 0) word->remove_last_unichar_id();
225 word->append_unichar_id_space_allocated(
226 orig_uch_id, orig_num_fragments, 0.0, 0.0);
227 if (unigrams_ok) {
228 checked_unigrams = true;
229 more_args->permuter = unigram_dawg_args.permuter;
230 *(more_args->updated_active_dawgs) =
231 *(unigram_dawg_args.updated_active_dawgs);
232 *(more_args->updated_constraints) =
233 *(unigram_dawg_args.updated_constraints);
234 }
235 }
236
237 // Check which dawgs from dawgs_ vector contain the word
238 // up to and including the current unichar.
239 if (checked_unigrams ||
240 (this->*letter_is_okay_)(more_args, word_index, word, word_ending)) {
241 // Add a new word choice
242 if (word_ending) {
243 if (segment_dawg_debug) {
244 tprintf("found word = %s\n",
245 word->debug_string(getUnicharset()).string());
246 }
247 WERD_CHOICE *adjusted_word = word;
248 WERD_CHOICE hyphen_tail_word;
249 if (!ambigs_mode && hyphen_base_size() > 0) {
250 hyphen_tail_word = *word;
251 remove_hyphen_head(&hyphen_tail_word);
252 adjusted_word = &hyphen_tail_word;
253 }
254 adjusted_word->set_permuter(more_args->permuter);
255 if (!ambigs_mode) {
256 adjust_word(adjusted_word, &certainties[hyphen_base_size()]);
257 }
258 update_best_choice(*adjusted_word, best_choice);
259 } else { // search the next letter
260 // Make updated_* point to the next entries in the DawgInfoVector
261 // arrays (that were originally created in dawg_permute_and_select)
262 ++(more_args->updated_active_dawgs);
263 ++(more_args->updated_constraints);
264 // Make active_dawgs and constraints point to the updated ones.
265 ++(more_args->active_dawgs);
266 ++(more_args->constraints);
267 permute_choices(debug, char_choices, char_choice_index + 1,
268 prev_char_frag_info, word, certainties, limit,
269 best_choice, more_args);
270 // Restore previous state to explore another letter in this position.
271 --(more_args->updated_active_dawgs);
272 --(more_args->updated_constraints);
273 --(more_args->active_dawgs);
274 --(more_args->constraints);
275 }
276 } else {
277 if (segment_dawg_debug) {
278 tprintf("last unichar not OK at index %d in %s\n",
279 word_index, word->debug_string(getUnicharset()).string());
280 }
281 }
282 }
283 }
284
285 /**********************************************************************
286 * dawg_permute_and_select
287 *
288 * Recursively explore all the possible character combinations in
289 * the given char_choices. Use go_deeper_dawg_fxn() to search all the
290 * dawgs in the dawgs_ vector in parallel and discard invalid words.
291 *
292 * Allocate and return a WERD_CHOICE with the best valid word found.
293 * **********************************************************************/
dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_limit)294 WERD_CHOICE *Dict::dawg_permute_and_select(
295 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
296 WERD_CHOICE *best_choice = new WERD_CHOICE();
297 best_choice->make_bad();
298 best_choice->set_rating(rating_limit);
299 if (char_choices.length() == 0) return best_choice;
300 DawgInfoVector *active_dawgs = new DawgInfoVector[char_choices.length() + 1];
301 DawgInfoVector *constraints = new DawgInfoVector[char_choices.length() + 1];
302 init_active_dawgs(&(active_dawgs[0]));
303 init_constraints(&(constraints[0]));
304 DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
305 &(active_dawgs[1]), &(constraints[1]),
306 (segment_penalty_dict_case_bad /
307 segment_penalty_dict_case_ok));
308 WERD_CHOICE word(MAX_WERD_LENGTH);
309 copy_hyphen_info(&word);
310 // Discard rating and certainty of the hyphen base (if any).
311 word.set_rating(0.0);
312 word.set_certainty(0.0);
313 if (word.length() + char_choices.length() > MAX_WERD_LENGTH) {
314 return best_choice; // the word is too long to permute
315 }
316 float certainties[MAX_WERD_LENGTH];
317 this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
318 permute_choices(segment_dawg_debug ? "segment_dawg_debug" : NULL,
319 char_choices, 0, NULL, &word, certainties,
320 &rating_limit, best_choice, &dawg_args);
321 delete[] active_dawgs;
322 delete[] constraints;
323 return best_choice;
324 }
325
326 // Fill the given active_dawgs vector with dawgs that could contain the
327 // beginning of the word. If hyphenated() returns true, copy the entries
328 // from hyphen_active_dawgs_ instead.
init_active_dawgs(DawgInfoVector * active_dawgs)329 void Dict::init_active_dawgs(DawgInfoVector *active_dawgs) {
330 int i;
331 if (hyphenated()) {
332 *active_dawgs = hyphen_active_dawgs_;
333 if (dawg_debug_level >= 3) {
334 for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
335 tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
336 hyphen_active_dawgs_[i].dawg_index,
337 hyphen_active_dawgs_[i].ref);
338 }
339 }
340 } else {
341 for (i = 0; i < dawgs_.length(); ++i) {
342 if (kBeginningDawgsType[(dawgs_[i])->type()]) {
343 *active_dawgs += DawgInfo(i, NO_EDGE);
344 if (dawg_debug_level >= 3) {
345 tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
346 }
347 }
348 }
349 }
350 }
351
352 // If hyphenated() returns true, copy the entries from hyphen_constraints_
353 // into the given constraints vector.
init_constraints(DawgInfoVector * constraints)354 void Dict::init_constraints(DawgInfoVector *constraints) {
355 if (hyphenated()) {
356 *constraints = hyphen_constraints_;
357 if (dawg_debug_level >= 3) {
358 for (int i = 0; i < hyphen_constraints_.size(); ++i) {
359 tprintf("Adding hyphen constraint [%d, " REFFORMAT "]\n",
360 hyphen_constraints_[i].dawg_index,
361 hyphen_constraints_[i].ref);
362 }
363 }
364 }
365 }
366
367 } // namespace tesseract
368