• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File:        permdawg.c  (Formerly permdawg.c)
5  * Description:  Scale word choices by a dictionary
6  * Author:       Mark Seaman, OCR Technology
7  * Created:      Fri Oct 16 14:37:00 1987
8  * Modified:     Tue Jul  9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
9  * Language:     C
10  * Package:      N/A
11  * Status:       Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26               I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #include "context.h"
30 #include "conversion.h"
31 #include "cutil.h"
32 #include "dawg.h"
33 #include "freelist.h"
34 #include "globals.h"
35 #include "ndminx.h"
36 #include "permdawg.h"
37 #include "permute.h"
38 #include "stopper.h"
39 #include "tordvars.h"
40 #include "tprintf.h"
41 #include "varable.h"
42 
43 #include <ctype.h>
44 #include "dict.h"
45 #include "image.h"
46 
47 /*----------------------------------------------------------------------
48               V a r i a b l e s
49 ----------------------------------------------------------------------*/
50 BOOL_VAR(segment_dawg_debug, 0, "Debug mode for word segmentation");
51 
52 double_VAR(segment_penalty_dict_case_bad, OK_WERD,
53            "Default score multiplier for word matches, which may have "
54            "case issues (lower is better).");
55 
56 double_VAR(segment_penalty_dict_case_ok, GOOD_WERD,
57            "Score multiplier for word matches that have good case "
58            "(lower is better).");
59 
60 double_VAR(segment_penalty_dict_frequent_word, FREQ_WERD,
61            "Score multiplier for word matches which have good case and are "
62            "frequent in the given language (lower is better).");
63 
64 /*----------------------------------------------------------------------
65               F u n c t i o n s
66 ----------------------------------------------------------------------*/
67 namespace tesseract {
68 
69 static const float kPermDawgRatingPad = 5.0;
70 
71 /**********************************************************************
72  * adjust_word
73  *
74  * Assign an adjusted value to a string that is a word.	The value
75  * that this word choice has is based on case and punctuation rules.
76  **********************************************************************/
adjust_word(WERD_CHOICE * word,float * certainty_array)77 void Dict::adjust_word(WERD_CHOICE *word,
78                        float *certainty_array) {
79   float adjust_factor;
80   float new_rating = word->rating();
81 
82   if (segment_dawg_debug) {
83     tprintf ("Word: %s %4.2f ",
84             word->debug_string(getUnicharset()).string(), word->rating());
85   }
86 
87   new_rating += RATING_PAD;
88   if (Context::case_ok(*word, getUnicharset())) {
89     if (freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
90       word->set_permuter(FREQ_DAWG_PERM);
91       new_rating *= segment_penalty_dict_frequent_word;
92       adjust_factor = segment_penalty_dict_frequent_word;
93       if (segment_dawg_debug)
94         tprintf(", F, %4.2f ", (double)segment_penalty_dict_frequent_word);
95     } else {
96       new_rating *= segment_penalty_dict_case_ok;
97       adjust_factor = segment_penalty_dict_case_ok;
98       if (segment_dawg_debug)
99         tprintf(", %4.2f ", (double)segment_penalty_dict_case_ok);
100     }
101   } else {
102     new_rating *= segment_penalty_dict_case_bad;
103     adjust_factor = segment_penalty_dict_case_bad;
104     if (segment_dawg_debug) {
105       tprintf(", C %4.2f ", (double)segment_penalty_dict_case_bad);
106     }
107   }
108   new_rating -= RATING_PAD;
109   word->set_rating(new_rating);
110 
111   LogNewChoice(*word, adjust_factor, certainty_array, false);
112 
113   if (segment_dawg_debug)
114     tprintf(" --> %4.2f\n", new_rating);
115 }
116 
117 /**********************************************************************
118  * go_deeper_dawg_fxn
119  *
120  * If the choice being composed so far could be a dictionary word
121  * keep exploring choices.
122  **********************************************************************/
go_deeper_dawg_fxn(const char * debug,const BLOB_CHOICE_LIST_VECTOR & char_choices,int char_choice_index,const CHAR_FRAGMENT_INFO * prev_char_frag_info,bool word_ending,WERD_CHOICE * word,float certainties[],float * limit,WERD_CHOICE * best_choice,void * void_more_args)123 void Dict::go_deeper_dawg_fxn(
124     const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
125                               int char_choice_index,
126                               const CHAR_FRAGMENT_INFO *prev_char_frag_info,
127     bool word_ending, WERD_CHOICE *word, float certainties[],
128     float *limit, WERD_CHOICE *best_choice, void *void_more_args) {
129   DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
130   int word_index = word->length() - 1;
131 
132   // There are two modes for deciding whether to go deeper: regular dawg
133   // permuter mode and the special ambigs mode. If *limit is <= 0.0 the
134   // function switches to the ambigs mode (this is the case when
135   // dawg_permute_and_select() function is called from NoDangerousAmbigs()) and
136   // only searches for the first choice that has a rating better than *limit
137   // (in this case ratings are fake, since the real ratings can not be < 0).
138   // Modification of the hyphen state is turned off in the ambigs mode.
139   // When in the regular dawg permuter mode, the function explores all the
140   // possible words and chooses the one with the best rating. The letters with
141   // ratings that are far worse than the ones seen so far are pruned out.
142   bool ambigs_mode = (*limit <= 0.0);
143   if (ambigs_mode) {
144     if (best_choice->rating() < *limit) return;
145   } else {
146     // Prune bad subwords
147     if (more_args->rating_array[word_index] == NO_RATING) {
148       more_args->rating_array[word_index] = word->rating();
149   } else {
150       float permdawg_limit = more_args->rating_array[word_index] *
151         more_args->rating_margin + kPermDawgRatingPad;
152       if (permdawg_limit < word->rating()) {
153       if (segment_dawg_debug) {
154           tprintf("early pruned word rating=%4.2f,"
155                   " permdawg_limit=%4.2f, word=%s\n", word->rating(),
156                   permdawg_limit, word->debug_string(getUnicharset()).string());
157       }
158       return;
159     }
160   }
161   }
162   // Deal with hyphens
163   if (word_ending && has_hyphen_end(*word) && !ambigs_mode) {
164     if (segment_dawg_debug)
165       tprintf("new hyphen choice = %s\n",
166               word->debug_string(getUnicharset()).string());
167     word->set_permuter(more_args->permuter);
168     adjust_word(word, certainties);
169     set_hyphen_word(*word, *(more_args->active_dawgs),
170                     *(more_args->constraints));
171     update_best_choice(*word, best_choice);
172   } else {  // Look up char in DAWG
173     // TODO(daria): update the rest of the code that specifies alternative
174     // letter_is_okay_ functions (e.g. TessCharNgram class) to work with
175     // multi-byte unichars and/or unichar ids.
176 
177     // If the current unichar is an ngram first try calling
178     // letter_is_okay() for each unigram it contains separately.
179     UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
180     bool checked_unigrams = false;
181     if (getUnicharset().get_isngram(orig_uch_id)) {
182       if (segment_dawg_debug) {
183         tprintf("checking unigrams in an ngram %s\n",
184                 getUnicharset().debug_str(orig_uch_id).string());
185       }
186       int orig_num_fragments = word->fragment_length(word_index);
187       int num_unigrams = 0;
188       word->remove_last_unichar_id();
189       const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
190       const char *ngram_str_end = ngram_str + strlen(ngram_str);
191       const char *ngram_ptr = ngram_str;
192       bool unigrams_ok = true;
193       // Construct DawgArgs that reflect the current state.
194       DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs);
195       DawgInfoVector unigram_constraints = *(more_args->constraints);
196       DawgInfoVector unigram_updated_active_dawgs;
197       DawgInfoVector unigram_updated_constraints;
198       DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_constraints,
199                                  &unigram_updated_active_dawgs,
200                                  &unigram_updated_constraints, 0.0);
201       unigram_dawg_args.permuter = more_args->permuter;
202       // Check unigrams in the ngram with letter_is_okay().
203       while (unigrams_ok && ngram_ptr < ngram_str_end) {
204         int step = getUnicharset().step(ngram_ptr);
205         UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID :
206             getUnicharset().unichar_to_id(ngram_ptr, step);
207         ngram_ptr += step;
208         ++num_unigrams;
209         word->append_unichar_id(uch_id, 1, 0.0, 0.0);
210         unigrams_ok = unigrams_ok && (this->*letter_is_okay_)(
211             &unigram_dawg_args, word_index+num_unigrams-1, word,
212             word_ending && (ngram_ptr == ngram_str_end));
213         (*unigram_dawg_args.active_dawgs) =
214           *(unigram_dawg_args.updated_active_dawgs);
215         (*unigram_dawg_args.constraints) =
216           *(unigram_dawg_args.updated_constraints);
217         if (segment_dawg_debug) {
218           tprintf("unigram %s is %s\n",
219                   getUnicharset().debug_str(uch_id).string(),
220                   unigrams_ok ? "OK" : "not OK");
221         }
222       }
223       // Restore the word and copy the updated dawg state if needed.
224       while (num_unigrams-- > 0) word->remove_last_unichar_id();
225       word->append_unichar_id_space_allocated(
226           orig_uch_id, orig_num_fragments, 0.0, 0.0);
227       if (unigrams_ok) {
228         checked_unigrams = true;
229         more_args->permuter = unigram_dawg_args.permuter;
230         *(more_args->updated_active_dawgs) =
231           *(unigram_dawg_args.updated_active_dawgs);
232         *(more_args->updated_constraints) =
233           *(unigram_dawg_args.updated_constraints);
234       }
235     }
236 
237     // Check which dawgs from dawgs_ vector contain the word
238     // up to and including the current unichar.
239     if (checked_unigrams ||
240         (this->*letter_is_okay_)(more_args, word_index, word, word_ending)) {
241       // Add a new word choice
242       if (word_ending) {
243         if (segment_dawg_debug) {
244           tprintf("found word = %s\n",
245                   word->debug_string(getUnicharset()).string());
246         }
247         WERD_CHOICE *adjusted_word = word;
248         WERD_CHOICE hyphen_tail_word;
249         if (!ambigs_mode && hyphen_base_size() > 0) {
250           hyphen_tail_word = *word;
251           remove_hyphen_head(&hyphen_tail_word);
252           adjusted_word = &hyphen_tail_word;
253         }
254         adjusted_word->set_permuter(more_args->permuter);
255         if (!ambigs_mode) {
256           adjust_word(adjusted_word, &certainties[hyphen_base_size()]);
257         }
258         update_best_choice(*adjusted_word, best_choice);
259       } else {  // search the next letter
260         // Make updated_* point to the next entries in the DawgInfoVector
261         // arrays (that were originally created in dawg_permute_and_select)
262         ++(more_args->updated_active_dawgs);
263         ++(more_args->updated_constraints);
264         // Make active_dawgs and constraints point to the updated ones.
265         ++(more_args->active_dawgs);
266         ++(more_args->constraints);
267         permute_choices(debug, char_choices, char_choice_index + 1,
268                         prev_char_frag_info, word, certainties, limit,
269                         best_choice, more_args);
270         // Restore previous state to explore another letter in this position.
271         --(more_args->updated_active_dawgs);
272         --(more_args->updated_constraints);
273         --(more_args->active_dawgs);
274         --(more_args->constraints);
275       }
276     } else {
277       if (segment_dawg_debug) {
278         tprintf("last unichar not OK at index %d in %s\n",
279                 word_index, word->debug_string(getUnicharset()).string());
280       }
281     }
282   }
283 }
284 
285 /**********************************************************************
286  * dawg_permute_and_select
287  *
288  * Recursively explore all the possible character combinations in
289  * the given char_choices. Use go_deeper_dawg_fxn() to search all the
290  * dawgs in the dawgs_ vector in parallel and discard invalid words.
291  *
292  * Allocate and return a WERD_CHOICE with the best valid word found.
293  * **********************************************************************/
dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_limit)294 WERD_CHOICE *Dict::dawg_permute_and_select(
295     const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
296   WERD_CHOICE *best_choice = new WERD_CHOICE();
297   best_choice->make_bad();
298   best_choice->set_rating(rating_limit);
299   if (char_choices.length() == 0) return best_choice;
300   DawgInfoVector *active_dawgs = new DawgInfoVector[char_choices.length() + 1];
301   DawgInfoVector *constraints =  new DawgInfoVector[char_choices.length() + 1];
302   init_active_dawgs(&(active_dawgs[0]));
303   init_constraints(&(constraints[0]));
304   DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
305                      &(active_dawgs[1]), &(constraints[1]),
306                      (segment_penalty_dict_case_bad /
307                       segment_penalty_dict_case_ok));
308   WERD_CHOICE word(MAX_WERD_LENGTH);
309   copy_hyphen_info(&word);
310   // Discard rating and certainty of the hyphen base (if any).
311   word.set_rating(0.0);
312   word.set_certainty(0.0);
313   if (word.length() + char_choices.length() > MAX_WERD_LENGTH) {
314     return best_choice;  // the word is too long to permute
315   }
316   float certainties[MAX_WERD_LENGTH];
317   this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
318   permute_choices(segment_dawg_debug ? "segment_dawg_debug" : NULL,
319                   char_choices, 0, NULL, &word, certainties,
320                   &rating_limit, best_choice, &dawg_args);
321   delete[] active_dawgs;
322   delete[] constraints;
323   return best_choice;
324 }
325 
326 // Fill the given active_dawgs vector with dawgs that could contain the
327 // beginning of the word. If hyphenated() returns true, copy the entries
328 // from hyphen_active_dawgs_ instead.
init_active_dawgs(DawgInfoVector * active_dawgs)329 void Dict::init_active_dawgs(DawgInfoVector *active_dawgs) {
330   int i;
331   if (hyphenated()) {
332     *active_dawgs = hyphen_active_dawgs_;
333     if (dawg_debug_level >= 3) {
334       for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
335         tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
336                 hyphen_active_dawgs_[i].dawg_index,
337                 hyphen_active_dawgs_[i].ref);
338   }
339   }
340   } else {
341     for (i = 0; i < dawgs_.length(); ++i) {
342       if (kBeginningDawgsType[(dawgs_[i])->type()]) {
343         *active_dawgs += DawgInfo(i, NO_EDGE);
344         if (dawg_debug_level >= 3) {
345           tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
346     }
347   }
348 }
349   }
350   }
351 
352 // If hyphenated() returns true, copy the entries from hyphen_constraints_
353 // into the given constraints vector.
init_constraints(DawgInfoVector * constraints)354 void Dict::init_constraints(DawgInfoVector *constraints) {
355   if (hyphenated()) {
356     *constraints = hyphen_constraints_;
357     if (dawg_debug_level >= 3) {
358       for (int i = 0; i < hyphen_constraints_.size(); ++i) {
359         tprintf("Adding hyphen constraint [%d, " REFFORMAT "]\n",
360                 hyphen_constraints_[i].dawg_index,
361                 hyphen_constraints_[i].ref);
362   }
363 }
364 }
365 }
366 
367 }  // namespace tesseract
368