• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File:        permute.c  (Formerly permute.c)
5  * Description:  Choose OCR text given character-probability maps
6  *               for sequences of glyph fragments and a dictionary provided as
7  *               a Dual Acyclic Word Graph.
8  *               In this file, "permute" should be read "combine."
9  * Author:       Mark Seaman, OCR Technology
10  * Created:      Fri Sep 22 14:05:51 1989
11  * Modified:     Thu Jan  3 16:38:46 1991 (Mark Seaman) marks@hpgrlt
12  * Language:     C
13  * Package:      N/A
14  * Status:       Experimental (Do Not Distribute)
15  *
16  * (c) Copyright 1989, Hewlett-Packard Company.
17  ** Licensed under the Apache License, Version 2.0 (the "License");
18  ** you may not use this file except in compliance with the License.
19  ** You may obtain a copy of the License at
20  ** http://www.apache.org/licenses/LICENSE-2.0
21  ** Unless required by applicable law or agreed to in writing, software
22  ** distributed under the License is distributed on an "AS IS" BASIS,
23  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24  ** See the License for the specific language governing permissions and
25  ** limitations under the License.
26  *
27  *********************************************************************************/
28 /*----------------------------------------------------------------------
29             I n c l u d e s
30 ---------------------------------------------------------------------*/
31 
32 #include <assert.h>
33 #include <math.h>
34 
35 #include "const.h"
36 
37 #include "permute.h"
38 
39 #include "callcpp.h"
40 #include "context.h"
41 #include "conversion.h"
42 #include "freelist.h"
43 #include "globals.h"
44 #include "ndminx.h"
45 #include "permdawg.h"
46 #include "permngram.h"
47 #include "ratngs.h"
48 #include "stopper.h"
49 #include "tordvars.h"
50 #include "tprintf.h"
51 #include "trie.h"
52 #include "varable.h"
53 #include "unicharset.h"
54 #include "dict.h"
55 #include "image.h"
56 #include "ccutil.h"
57 
58 int permutation_count;           // Used in metrics.cpp.
59 /*----------------------------------------------------------------------
60               V a r i a b l e s
61 ----------------------------------------------------------------------*/
62 // TODO(tkielbus) Choose a value for the MAX_NUM_EDGES constant
63 // (or make it dynamic)
64 #define MAX_NUM_EDGES          2000000
65 #define MAX_DOC_EDGES          250000
66 #define MAX_USER_EDGES         50000
67                                  /* Weights for adjustment */
68 #define NON_WERD               1.25
69 #define GARBAGE_STRING         1.5
70 #define MAX_PERM_LENGTH         128
71 
72 // debugging flags
73 INT_VAR(fragments_debug, 0, "Debug character fragments");
74 
75 BOOL_VAR(segment_debug, 0, "Debug the whole segmentation process");
76 
77 BOOL_VAR(permute_debug, 0, "Debug char permutation process");
78 
79 
80 // control parameters
81 double_VAR(bestrate_pruning_factor, 2.0,
82            "Multiplying factor of current best rate to prune other hypotheses");
83 
84 BOOL_VAR(permute_script_word, 0,
85          "Turn on word script consistency permuter");
86 
87 BOOL_VAR(segment_segcost_rating, 0,
88          "incorporate segmentation cost in word rating?");
89 
90 double_VAR(segment_reward_script, 0.95,
91            "Score multipler for script consistency within a word. "
92            "Being a 'reward' factor, it should be <= 1. "
93            "Smaller value implies bigger reward.");
94 
95 double_VAR(segment_penalty_dict_nonword, NON_WERD,
96            "Score multiplier for glyph fragment segmentations which do not "
97            "match a dictionary word (lower is better).");
98 
99 double_VAR(segment_penalty_garbage, GARBAGE_STRING,
100            "Score multiplier for poorly cased strings that are not in the "
101            "dictionary and generally look like garbage (lower is better).");
102 
103 BOOL_VAR(save_doc_words, 0, "Save Document Words");
104 
105 BOOL_VAR(doc_dict_enable, 1, "Enable Document Dictionary ");
106 
107 BOOL_VAR(ngram_permuter_activated, FALSE,
108          "Activate character-level n-gram-based permuter");
109 
110 STRING_VAR(global_user_words_suffix, "", "A list of user-provided words.");
111 
112 // This is an ugly way to incorporate segmentation cost in word rating.
113 // See comments in incorporate_segcost.
114 float wordseg_rating_adjust_factor;
115 
116 int permute_only_top = 0;
117 
118 #define SIM_CERTAINTY_SCALE  -10.0   /* Similarity matcher values */
119 #define SIM_CERTAINTY_OFFSET -10.0   /* Similarity matcher values */
120 #define SIMILARITY_FLOOR     100.0   /* Worst E*L product to stop on */
121 
122 // TODO(daria): If hyphens are different in different languages and can be
123 // inferred from training data we should load their values dynamically.
124 static const char kHyphenSymbol[] = "-";
125 
126 /*----------------------------------------------------------------------
127               F u n c t i o n s
128 ----------------------------------------------------------------------*/
129 
130 /**********************************************************************
131  * get_best_delete_other
132  *
133  * Returns the best of two choices and deletes the other (worse) choice.
134  * A choice is better if it has a non-empty string and has a lower
135  * rating than the other choice. If the ratings are the same,
136  * choice2 is preferred over choice1.
137  **********************************************************************/
get_best_delete_other(WERD_CHOICE * choice1,WERD_CHOICE * choice2)138 WERD_CHOICE *get_best_delete_other(WERD_CHOICE *choice1,
139                                    WERD_CHOICE *choice2) {
140   if (!choice1) return choice2;
141   if (!choice2) return choice1;
142   if (choice1->rating() < choice2->rating() || choice2->length() == 0) {
143     delete choice2;
144     return choice1;
145   } else {
146     delete choice1;
147     return choice2;
148   }
149 }
150 
151 
152 /**********************************************************************
153  * good_choice
154  *
155  * Return TRUE if a good answer is found for the unknown blob rating.
156  **********************************************************************/
good_choice(const WERD_CHOICE & choice)157 int good_choice(const WERD_CHOICE &choice) {
158   register float certainty;
159   if (tord_similarity_enable) {
160     if ((choice.rating() + 1) * choice.certainty() > SIMILARITY_FLOOR)
161       return false;
162     certainty =
163       SIM_CERTAINTY_OFFSET + choice.rating() * SIM_CERTAINTY_SCALE;
164   } else {
165     certainty = choice.certainty();
166   }
167 
168   return (certainty > tord_certainty_threshold) ? true : false;
169 }
170 
171 
172 /**********************************************************************
173  * add_document_word
174  *
175  * Add a word found on this document to the document specific
176  * dictionary.
177  **********************************************************************/
178 namespace tesseract {
add_document_word(const WERD_CHOICE & best_choice)179 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
180   // Do not add hyphenated word parts to the document dawg.
181   // hyphen_word_ will be non-NULL after the set_hyphen_word() is
182   // called when the first part of the hyphenated word is
183   // discovered and while the second part of the word is recognized.
184   // hyphen_word_ is cleared in cc_recg() before the next word on
185   // the line is recognized.
186   if (hyphen_word_) return;
187 
188   char filename[CHARS_PER_LINE];
189   FILE *doc_word_file;
190   int stringlen = best_choice.length();
191 
192   if (!doc_dict_enable || valid_word(best_choice) ||
193       CurrentWordAmbig() || stringlen < 2)
194     return;
195 
196   if (!good_choice(best_choice) || stringlen == 2) {
197     if (best_choice.certainty() < permuter_pending_threshold)
198       return;
199 
200     if (!pending_words_->word_in_dawg(best_choice)) {
201       if (stringlen > 2 ||
202           (stringlen == 2 &&
203            getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
204            getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
205         pending_words_->add_word_to_dawg(best_choice);
206       }
207       return;
208     }
209   }
210 
211   if (save_doc_words) {
212     strcpy(filename, getImage()->getCCUtil()->imagefile.string());
213     strcat (filename, ".doc");
214     doc_word_file = open_file (filename, "a");
215     fprintf (doc_word_file, "%s\n",
216              best_choice.debug_string(getUnicharset()).string());
217     fclose(doc_word_file);
218   }
219   document_words_->add_word_to_dawg(best_choice);
220 }
221 
222 
223 /**********************************************************************
224  * adjust_non_word
225  *
226  * Assign an adjusted value to a string that is a non-word.  The value
227  * that this word choice has is based on case and punctuation rules.
228  * The adjustment value applied is stored in adjust_factor upon return.
229  **********************************************************************/
adjust_non_word(WERD_CHOICE * word,float * adjust_factor)230 void Dict::adjust_non_word(WERD_CHOICE *word, float *adjust_factor) {
231   float new_rating;
232   if (permute_debug)
233     cprintf("Non-word: %s %4.2f ",
234             word->debug_string(getUnicharset()).string(), word->rating());
235 
236   new_rating = word->rating() + RATING_PAD;
237   if (Context::case_ok(*word, getUnicharset()) && valid_punctuation(*word)) {
238     new_rating *= segment_penalty_dict_nonword;
239     *adjust_factor = segment_penalty_dict_nonword;
240     if (permute_debug) tprintf(", W");
241   } else {
242     new_rating *= segment_penalty_garbage;
243     *adjust_factor = segment_penalty_garbage;
244     if (permute_debug) {
245       if (!Context::case_ok(*word, getUnicharset())) tprintf(", C");
246       if (!valid_punctuation(*word)) tprintf(", P");
247     }
248   }
249   new_rating -= RATING_PAD;
250   word->set_rating(new_rating);
251   if (permute_debug)
252     cprintf (" %4.2f --> %4.2f\n", *adjust_factor, new_rating);
253 }
254 
255 
256 /**********************************************************************
257  * init_permute
258  *
259  * Initialize anything that needs to be set up for the permute
260  * functions.
261  **********************************************************************/
init_permute()262 void Dict::init_permute() {
263   STRING name;
264   STRING &lang = getImage()->getCCUtil()->lang;
265 
266   if (dawgs_.length() != 0) end_permute();
267 
268   hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
269   TessdataManager &tessdata_manager =
270     getImage()->getCCUtil()->tessdata_manager;
271 
272   // Load dawgs_.
273   if (global_load_punc_dawg &&
274       tessdata_manager.SeekToStart(TESSDATA_PUNC_DAWG)) {
275     dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(),
276                                DAWG_TYPE_PUNCTUATION, lang, PUNC_PERM);
277   }
278   if (global_load_system_dawg &&
279       tessdata_manager.SeekToStart(TESSDATA_SYSTEM_DAWG)) {
280     dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(),
281                                DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM);
282   }
283   if (global_load_number_dawg &&
284       tessdata_manager.SeekToStart(TESSDATA_NUMBER_DAWG)) {
285     dawgs_ +=
286       new SquishedDawg(tessdata_manager.GetDataFilePtr(),
287                        DAWG_TYPE_NUMBER, lang, NUMBER_PERM);
288   }
289   if (((STRING &)global_user_words_suffix).length() > 0) {
290     Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
291                               MAX_USER_EDGES, getUnicharset().size());
292   name = getImage()->getCCUtil()->language_data_path_prefix;
293     name += global_user_words_suffix;
294     if (!trie_ptr->read_word_list(name.string(), getUnicharset())) {
295       tprintf("Error: failed to load %s\n", name.string());
296       exit(1);
297     }
298     dawgs_ += trie_ptr;
299   }
300   document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
301                              MAX_DOC_EDGES, getUnicharset().size());
302   dawgs_ += document_words_;
303 
304   // This dawg is temporary and should not be searched by letter_is_ok.
305   pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
306                             MAX_DOC_EDGES, getUnicharset().size());
307 
308   // The frequent words dawg is only searched when a word
309   // is found in any of the other dawgs.
310   if (tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
311     freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
312                                   DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM);
313   }
314 
315   // Construct a list of corresponding successors for each dawg. Each entry i
316   // in the successors_ vector is a vector of integers that represent the
317   // indices into the dawgs_ vector of the successors for dawg i.
318   successors_.reserve(dawgs_.length());
319   for (int i = 0; i < dawgs_.length(); ++i) {
320     const Dawg *dawg = dawgs_[i];
321     SuccessorList *lst = new SuccessorList();
322     for (int j = 0; j < dawgs_.length(); ++j) {
323       const Dawg *other = dawgs_[j];
324       if (dawg->lang() == other->lang() &&
325           kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
326     }
327     successors_ += lst;
328   }
329 }
330 
end_permute()331 void Dict::end_permute() {
332   if (dawgs_.length() == 0)
333     return;  // Not safe to call twice.
334   dawgs_.delete_data_pointers();
335   successors_.delete_data_pointers();
336   dawgs_.clear();
337   successors_.clear();
338   document_words_ = NULL;
339   if (pending_words_ != NULL) delete pending_words_;
340   pending_words_ = NULL;
341   if (freq_dawg_ != NULL) delete freq_dawg_;
342   freq_dawg_ = NULL;
343 }
344 
345 
346 /**********************************************************************
347  * permute_all
348  *
349  * Permute all the characters together using all of the different types
350  * of permuters/selectors available.  Each of the characters must have
351  * a non-NULL choice list.
352  *
353  * Note: order of applying permuters does matter, since the latter
354  * permuter will be recorded if the resulting word ratings are the same.
355  **********************************************************************/
permute_all(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_limit,WERD_CHOICE * raw_choice)356 WERD_CHOICE *Dict::permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
357                                float rating_limit,
358                                WERD_CHOICE *raw_choice) {
359   WERD_CHOICE *result1;
360   WERD_CHOICE *result2 = NULL;
361   BOOL8 any_alpha;
362   float top_choice_rating_limit = rating_limit;
363 
364   // Initialize result1 from the result of permute_top_choice.
365   result1 = permute_top_choice(char_choices, &top_choice_rating_limit,
366                                raw_choice, &any_alpha);
367 
368   // Enforce script consistency within a word on some scripts
369   if (permute_script_word &&
370       !word_script_eq(char_choices, getUnicharset().common_sid()) &&
371       !word_script_eq(char_choices, getUnicharset().latin_sid())) {
372     result2 = permute_script_words(char_choices);
373     // TODO(dsl): incorporate segmentation cost into word rating.
374     // This should only be turned on for scripts that we have a segmentation
375     // cost model for, such as CJK.
376     if (segment_segcost_rating)
377       incorporate_segcost(result2);
378     result1 = get_best_delete_other(result1, result2);
379   }
380 
381   // Permute character fragments if necessary.
382   if (result1 == NULL || result1->fragment_mark()) {
383     result2 = top_fragments_permute_and_select(char_choices,
384                                                top_choice_rating_limit);
385     result1 = get_best_delete_other(result1, result2);
386   }
387 
388   // TODO(daria): update ngram permuter code.
389   if (ngram_permuter_activated) {
390     tprintf("Error: ngram permuter functionality is not available\n");
391     exit(1);
392     // A_CHOICE *ngram_choice =
393     //  ngram_permute_and_select(old_char_choices, rating_limit, word_dawg_);
394     // return ngram_choice;
395   }
396 
397   if (result1 == NULL)
398     return (NULL);
399   if (permute_only_top)
400     return result1;
401 
402   result2 = dawg_permute_and_select(char_choices, rating_limit);
403   result1 = get_best_delete_other(result1, result2);
404 
405   result2 = permute_compound_words(char_choices, rating_limit);
406   result1 = get_best_delete_other(result1, result2);
407 
408   return (result1);
409     }
410 
411 // Returns the top choice char id.  A helper function to make code cleaner.
get_top_choice_uid(BLOB_CHOICE_LIST * blob_list)412 UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list) {
413   BLOB_CHOICE_IT blob_choice_it;
414   blob_choice_it.set_to_list(blob_list);
415   return (blob_choice_it.data()) ? blob_choice_it.data()->unichar_id()
416                                  : INVALID_UNICHAR_ID;
417   }
418 
419 // Return the "dominant" script ID for the word.  By "dominant", the script
420 // must account for at least half the characters.  Otherwise, it returns 0.
get_top_word_script(const BLOB_CHOICE_LIST_VECTOR & char_choices,const UNICHARSET & unicharset)421 int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices,
422                         const UNICHARSET &unicharset) {
423   int max_script = unicharset.get_script_table_size();
424   int *sid = new int[max_script];
425   int x;
426   for (x = 0; x < max_script; x++) sid[x] = 0;
427   for (x = 0; x < char_choices.length(); ++x) {
428     BLOB_CHOICE_IT blob_choice_it;
429     blob_choice_it.set_to_list(char_choices.get(x));
430     sid[blob_choice_it.data()->script_id()]++;
431   }
432   // Note that high script ID overrides lower one on a tie, thus biasing
433   // towards non-Common script (if sorted that way in unicharset file).
434   int max_sid = 0;
435   for (x = 1; x < max_script; x++)
436     if (sid[x] >= sid[max_sid]) max_sid = x;
437   if (sid[max_sid] < char_choices.length() / 2)
438     max_sid = unicharset.null_sid();
439   delete[] sid;
440   return max_sid;
441   }
442 
443 /**********************************************************************
444  * Checks whether the dominant word script, if there is one, matches
445  * the given target script ID.
446  **********************************************************************/
word_script_eq(const BLOB_CHOICE_LIST_VECTOR & char_choices,int target_sid)447 bool Dict::word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices,
448                           int target_sid) {
449   int max_sid = get_top_word_script(char_choices, getUnicharset());
450   // If "Latin" is not a loaded script, then latin_sid() would return 0.
451   // max_sid could also be 0 if there is no dominant script.
452   // This is faster than
453   // strcmp(getUnicharset().get_script_from_script_id(max_sid), "Latin")
454   return (max_sid > 0 && max_sid == target_sid);
455 }
456 
457 /**********************************************************************
458  * Iterate through all the character choices (for a single blob) and
459  * return the first that matches the given type, which is one of 'aA0px*',
460  * for lower, upper, digit, punctuation, other, and 'any', respectively.
461  * If not match is found, a NULL is returned.
462  **********************************************************************/
find_choice_by_type(BLOB_CHOICE_LIST * char_choices,char target_type,const UNICHARSET & unicharset)463 BLOB_CHOICE* find_choice_by_type(
464     BLOB_CHOICE_LIST *char_choices,
465     char target_type,
466     const UNICHARSET &unicharset) {
467   BLOB_CHOICE_IT c_it;
468   c_it.set_to_list(char_choices);
469   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
470     bool found = false;
471     UNICHAR_ID unichar_id = c_it.data()->unichar_id();
472     switch (target_type) {
473       case '*': found = true;  break;
474       case 'A': found = unicharset.get_isupper(unichar_id); break;
475       case 'a': found = unicharset.get_islower(unichar_id); break;
476       case '0': found = unicharset.get_isdigit(unichar_id); break;
477       case 'p': found = unicharset.get_ispunctuation(unichar_id); break;
478       case 'x': found = !unicharset.get_isupper(unichar_id) &&
479                         !unicharset.get_islower(unichar_id) &&
480                         !unicharset.get_isdigit(unichar_id) &&
481                         !unicharset.get_ispunctuation(unichar_id);
482                 break;
483     }
484     if (found) return c_it.data();
485   }
486   return NULL;
487 }
488 
489 /**********************************************************************
490  * Iterate through all the character choices (for a single blob) and
491  * return the first that matches the target script ID.  If backup_sid
492  * is not 0, then a match on either the target or backup sid is allowed.
493  * Note that there is no preference between a target or backup sid.
494  * To search for another sid only if no target_sid matched, use
495  * secondary_sid.
496  * So for example, to find first Han or Common char choice, do
497  *   find_choice_by_script(cchoice, han_sid, common_sid, 0);
498  * To find first Han choice, but allow Common if none is found, do
499  *   find_choice_by_script(cchoice, han_sid, 0, common_sid);
500  **********************************************************************/
find_choice_by_script(BLOB_CHOICE_LIST * char_choices,int target_sid,int backup_sid,int secondary_sid)501 BLOB_CHOICE* find_choice_by_script(
502     BLOB_CHOICE_LIST *char_choices,
503     int target_sid,
504     int backup_sid,
505     int secondary_sid) {
506   BLOB_CHOICE_IT c_it;
507   c_it.set_to_list(char_choices);
508   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
509     bool found = false;
510     if (c_it.data()->script_id() == 0) continue;
511     if (c_it.data()->script_id() == target_sid) found = true;
512     if (backup_sid > 0 && c_it.data()->script_id() == backup_sid) found = true;
513     if (found) return c_it.data();
514   }
515   if (secondary_sid > 0) {
516     c_it.set_to_list(char_choices);
517     for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
518       if (c_it.data()->script_id() == 0) continue;
519       if (c_it.data()->script_id() == secondary_sid)
520         return c_it.data();
521     }
522   }
523   return NULL;
524 }
525 
526 /**********************************************************************
527  * Incorporate segmentation cost into the word rating.  This is done
528  * through a mutliplier wordseg_rating_adjust_factor which is determined
529  * in bestfirst.cpp during state evaluation.  This is not the cleanest
530  * way to do this.  It would be better to reorganize the SEARCH_STATE
531  * to keep track of associated states, or do the rating adjustment
532  * outside the permuter in evalaute_state.
533  **********************************************************************/
incorporate_segcost(WERD_CHOICE * word)534 void Dict::incorporate_segcost(WERD_CHOICE *word) {
535   if (!word || wordseg_rating_adjust_factor <= 0) return;
536 
537   float old_rating = word->rating();
538   float new_rating = old_rating * wordseg_rating_adjust_factor;
539   word->set_rating(new_rating);
540   if (permute_debug)
541     tprintf("Permute segadjust %f * %f --> %f\n",
542             old_rating, wordseg_rating_adjust_factor, new_rating);
543 }
544 
545 /**********************************************************************
546  * Try flipping characters in a word to get better script consistency.
547  * Similar to how upper/lower case checking is done in top_choice_permuter,
548  * this permuter tries to suggest a more script-consistent choice AND
549  * modifieds the rating.  So it combines both the case_ok check and
550  * adjust_non_word functionality.  However, instead of penalizing an
551  * inconsistent word with a > 1 multiplier, we reward the script-consistent
552  * choice with a < 1 multiplier.
553  **********************************************************************/
permute_script_words(const BLOB_CHOICE_LIST_VECTOR & char_choices)554 WERD_CHOICE* Dict::permute_script_words(
555     const BLOB_CHOICE_LIST_VECTOR &char_choices) {
556   if (char_choices.length() > MAX_WERD_LENGTH)
557     return NULL;
558 
559   int word_sid = get_top_word_script(char_choices, getUnicharset());
560   if (word_sid == getUnicharset().null_sid())
561     return NULL;
562 
563   if (permute_debug) {
564     tprintf("\n\nPermuteScript %s\n",
565             getUnicharset().get_script_from_script_id(word_sid));
566     print_char_choices_list("", char_choices, getUnicharset(),
567                             permute_debug > 1);
568   }
569 
570   WERD_CHOICE *current_word = new WERD_CHOICE(MAX_WERD_LENGTH);
571   BLOB_CHOICE_IT blob_choice_it;
572   bool replaced = false;
573   bool prev_is_consistent = false;
574   for (int x = 0; x < char_choices.length(); ++x) {
575     blob_choice_it.set_to_list(char_choices.get(x));
576     BLOB_CHOICE *first_choice = blob_choice_it.data();
577     if (!first_choice) return NULL;
578     UNICHAR_ID unichar_id = first_choice->unichar_id();
579     bool sid_consistent = (first_choice->script_id() == word_sid);
580     bool this_is_punct = getUnicharset().get_ispunctuation(unichar_id);
581 
582     if (!sid_consistent && !this_is_punct && prev_is_consistent) {
583       // If the previous char is CJK, we prefer a cjk over non-cjk char
584       if (permute_debug) {
585         tprintf("Checking %s r%g\n", getUnicharset().id_to_unichar(unichar_id),
586                                      first_choice->rating());
587         print_ratings_list("\t", char_choices.get(x), getUnicharset());
588       }
589       // prefer a script consistent choice
590       BLOB_CHOICE* c_it = find_choice_by_script(char_choices.get(x),
591                                                 word_sid, 0, 0);
592       // make this a separate check
593       // otherwise, prefer a punctuation
594       if (c_it == NULL)
595         c_it = find_choice_by_type(char_choices.get(x), 'p', getUnicharset());
596 
597       if (c_it != NULL) {
598         if (permute_debug)
599           tprintf("Replacing %d r%g ==> %d r%g\n",
600                   first_choice->unichar_id(), first_choice->rating(),
601                   c_it->unichar_id(), c_it->rating());
602         first_choice = c_it;
603         replaced = true;
604       }
605     }
606     current_word->append_unichar_id_space_allocated(
607       first_choice->unichar_id(), 1,
608       first_choice->rating(), first_choice->certainty());
609     prev_is_consistent = sid_consistent;
610   }
611   if (replaced) {
612     // When we replace a word choice (usually top choice) with
613     // another for the sake of script consistency, we need to improve its
614     // rating so that it will replace the best choice.  How much we modify
615     // the rating determines how strong is the script consistency constraint.
616     // We need a more consistent solution for all contextual constraints
617     // like case, punct pattern, script, etc.  Right now, this does the same
618     // thing as adjust_non_words for case and punctuation rules.
619     float rating = current_word->rating();
620     rating *= segment_reward_script;
621     current_word->set_rating(rating);
622   }
623   current_word->populate_unichars(getUnicharset());
624   if (permute_debug && replaced)
625     current_word->print("<== permute_script_word **");
626   return current_word;
627 }
628 
629 /**********************************************************************
630  * permute_characters
631  *
632  * Permute these characters together according to each of the different
633  * permuters that are enabled.
634  **********************************************************************/
permute_characters(const BLOB_CHOICE_LIST_VECTOR & char_choices,float limit,WERD_CHOICE * best_choice,WERD_CHOICE * raw_choice)635 void Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
636                               float limit,
637                               WERD_CHOICE *best_choice,
638                               WERD_CHOICE *raw_choice) {
639   float old_raw_choice_rating = raw_choice->rating();
640   permutation_count++;           /* Global counter */
641   if (tord_display_ratings > 1) {
642     cprintf("\nchar_choices in permute_characters:\n");
643     print_char_choices_list("\n==> Input CharChoices", char_choices,
644                             getUnicharset(), true);
645   }
646 
647   if (char_choices.length() == 1 &&
648       get_top_choice_uid(char_choices.get(0)) == 0)
649     return;
650   WERD_CHOICE *this_choice = permute_all(char_choices, limit, raw_choice);
651 
652   if (raw_choice->rating() < old_raw_choice_rating) {
653     // Populate unichars_ and unichar_lengths_ of raw_choice. This is
654     // needed for various components that still work with unichars rather
655     // than unichar ids (e.g. AdaptToWord).
656     raw_choice->populate_unichars(getUnicharset());
657   }
658   if (this_choice && this_choice->rating() < best_choice->rating()) {
659     *best_choice = *this_choice;
660     // Populate unichars_ and unichar_lengths_ of best_choice. This is
661     // needed for various components that still work with unichars rather
662     // than unichar ids (dawg, *_ok functions, various hard-coded hacks).
663     best_choice->populate_unichars(getUnicharset());
664 
665     if (tord_display_ratings) {
666       cprintf("permute_characters: %s\n",
667               best_choice->debug_string(getUnicharset()).string());
668     }
669   }
670   delete this_choice;
671 }
672 
673 /**********************************************************************
674  * permute_compound_words
675  *
676  * Return the top choice for each character as the choice for the word.
677  **********************************************************************/
permute_compound_words(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_limit)678 WERD_CHOICE *Dict::permute_compound_words(
679     const BLOB_CHOICE_LIST_VECTOR &char_choices,
680     float rating_limit) {
681   BLOB_CHOICE *first_choice;
682   WERD_CHOICE *best_choice = NULL;
683   WERD_CHOICE current_word(MAX_WERD_LENGTH);
684   int first_index = 0;
685   int x;
686   BLOB_CHOICE_IT blob_choice_it;
687 
688   if (char_choices.length() > MAX_WERD_LENGTH) {
689     WERD_CHOICE *bad_word_choice = new WERD_CHOICE();
690     bad_word_choice->make_bad();
691     return bad_word_choice;
692   }
693 
694   UNICHAR_ID slash = getUnicharset().unichar_to_id("/");
695   UNICHAR_ID dash = getUnicharset().unichar_to_id("-");
696   for (x = 0; x < char_choices.length(); ++x) {
697     blob_choice_it.set_to_list(char_choices.get(x));
698     first_choice = blob_choice_it.data();
699     if (first_choice->unichar_id() == slash ||
700         first_choice->unichar_id() == dash) {
701       if (x > first_index) {
702         if (segment_debug)
703           cprintf ("Hyphenated word found\n");
704         permute_subword(char_choices, rating_limit, first_index,
705                         x - 1, &current_word);
706         if (current_word.rating() > rating_limit)
707           break;
708       }
709       // Append hyphen/slash separator to current_word.
710       current_word.append_unichar_id_space_allocated(
711           first_choice->unichar_id(), 1,
712           first_choice->rating(), first_choice->certainty());
713 
714       first_index = x + 1;  // update first_index
715     }
716   }
717 
718   if (first_index > 0 && first_index < x &&
719       current_word.rating() <= rating_limit) {
720     permute_subword(char_choices, rating_limit, first_index,
721                     x - 1, &current_word);
722     best_choice = new WERD_CHOICE(current_word);
723     best_choice->set_permuter(COMPOUND_PERM);
724   }
725   return (best_choice);
726 }
727 
728 
729 /**********************************************************************
730  * permute_subword
731  *
732  * Permute a part of a compound word this subword is bounded by hyphens
733  * and the start and end of the word.  Call the standard word permute
734  * function on a set of choices covering only part of the original
735  * word.  When it is done reclaim the memory that was used in the
736  * excercise.
737  **********************************************************************/
permute_subword(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_limit,int start,int end,WERD_CHOICE * current_word)738 void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
739                            float rating_limit,
740                            int start,
741                            int end,
742                            WERD_CHOICE *current_word) {
743   int x;
744   BLOB_CHOICE_LIST_VECTOR subchoices;
745   WERD_CHOICE *best_choice = NULL;
746   WERD_CHOICE raw_choice;
747   raw_choice.make_bad();
748 
749   DisableChoiceAccum();
750 
751   for (x = start; x <= end; x++) {
752     if (char_choices.get(x) != NULL) {
753       subchoices += char_choices.get(x);
754     }
755   }
756 
757   if (!subchoices.empty()) {
758     bool old_segment_dawg_debug = segment_dawg_debug;
759     if (segment_debug) segment_dawg_debug.set_value(true);
760     best_choice = permute_all(subchoices, rating_limit, &raw_choice);
761 
762     if (segment_debug) {
763       segment_dawg_debug.set_value(old_segment_dawg_debug);
764     }
765     if (best_choice && best_choice->length() > 0) {
766       *current_word += *best_choice;
767     } else {
768       current_word->set_rating(MAX_FLOAT32);
769     }
770   } else {
771     current_word->set_rating(MAX_FLOAT32);
772   }
773 
774   if (best_choice)
775     delete best_choice;
776 
777   if (segment_debug && current_word->rating() < MAX_FLOAT32) {
778     cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
779              current_word->debug_string(getUnicharset()).string(),
780              current_word->rating(), current_word->certainty());
781   }
782 
783   EnableChoiceAccum();
784 }
785 
786 /**********************************************************************
787  * permute_top_choice
788  *
789  * Return the top choice for each character as the choice for the word.
790  * In addition a choice is created for the best lower and upper case
791  * non-words.  In each character position the best lower (or upper) case
792  * character is substituted for the best overall character.
793  **********************************************************************/
permute_top_choice(const BLOB_CHOICE_LIST_VECTOR & char_choices,float * rating_limit,WERD_CHOICE * raw_choice,BOOL8 * any_alpha)794 WERD_CHOICE *Dict::permute_top_choice(
795     const BLOB_CHOICE_LIST_VECTOR &char_choices,
796     float* rating_limit,
797     WERD_CHOICE *raw_choice,
798     BOOL8 *any_alpha) {
799   BLOB_CHOICE *first_choice;
800   const char *first_char;             //first choice
801   const char *second_char;            //second choice
802   const char *third_char;             //third choice
803   char prev_char[UNICHAR_LEN + 1];    //prev in word
804   const char *next_char = "";         //next in word
805   const char *next_next_char = "";    //after next next in word
806 
807   WERD_CHOICE word(MAX_PERM_LENGTH);
808   word.set_permuter(TOP_CHOICE_PERM);
809   WERD_CHOICE capital_word(MAX_PERM_LENGTH);
810   capital_word.set_permuter(UPPER_CASE_PERM);
811   WERD_CHOICE lower_word(MAX_PERM_LENGTH);
812   lower_word.set_permuter(LOWER_CASE_PERM);
813 
814   int x;
815   BOOL8 char_alpha;
816   float first_rating = 0;
817   float adjust_factor;
818 
819   float certainties[MAX_PERM_LENGTH + 1];
820   float lower_certainties[MAX_PERM_LENGTH + 1];
821   float upper_certainties[MAX_PERM_LENGTH + 1];
822 
823   BLOB_CHOICE_IT blob_choice_it;
824   UNICHAR_ID temp_id;
825   UNICHAR_ID unichar_id;
826   UNICHAR_ID space = getUnicharset().unichar_to_id(" ");
827   register const char* ch;
828   register inT8 lower_done;
829   register inT8 upper_done;
830 
831   prev_char[0] = '\0';
832 
833   if (any_alpha != NULL)
834     *any_alpha = FALSE;
835 
836   if (char_choices.length() > MAX_PERM_LENGTH) {
837     return (NULL);
838   }
839 
840   for (x = 0; x < char_choices.length(); ++x) {
841     if (x + 1 < char_choices.length()) {
842       unichar_id = get_top_choice_uid(char_choices.get(x+1));
843       next_char = unichar_id != INVALID_UNICHAR_ID ?
844         getUnicharset().id_to_unichar(unichar_id) : "";
845     } else {
846       next_char = "";
847     }
848 
849     if (x + 2 < char_choices.length()) {
850       unichar_id = get_top_choice_uid(char_choices.get(x+2));
851       next_next_char = unichar_id != INVALID_UNICHAR_ID ?
852         getUnicharset().id_to_unichar(unichar_id) : "";
853     } else {
854       next_next_char = "";
855     }
856 
857     blob_choice_it.set_to_list(char_choices.get(x));
858     ASSERT_HOST(!blob_choice_it.empty());
859     first_choice = NULL;
860     for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
861          blob_choice_it.forward()) {  // find the best non-fragment char choice
862       temp_id = blob_choice_it.data()->unichar_id();
863       if (!(getUnicharset().get_fragment(temp_id))) {
864         first_choice = blob_choice_it.data();
865         break;
866       } else if (char_choices.length() > 1) {
867         word.set_fragment_mark(true);
868         capital_word.set_fragment_mark(true);
869         lower_word.set_fragment_mark(true);
870       }
871     }
872     if (first_choice == NULL) {
873       cprintf("Permuter found only fragments for"
874               " character at position %d; word=%s\n",
875               x, word.debug_string(getUnicharset()).string());
876     }
877     ASSERT_HOST(first_choice != NULL);
878 
879     unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ?
880       first_choice->unichar_id() : space;
881     first_char = getUnicharset().id_to_unichar(unichar_id);
882     first_rating = first_choice->rating();
883     word.append_unichar_id_space_allocated(
884         unichar_id, 1, first_choice->rating(), first_choice->certainty());
885     capital_word.append_unichar_id_space_allocated(
886         unichar_id, 1, first_choice->rating(), first_choice->certainty());
887     lower_word.append_unichar_id_space_allocated(
888         unichar_id, 1, first_choice->rating(), first_choice->certainty());
889 
890     certainties[x] = first_choice->certainty();
891     lower_certainties[x] = first_choice->certainty();
892     upper_certainties[x] = first_choice->certainty();
893 
894     lower_done = FALSE;
895     upper_done = FALSE;
896     char_alpha = FALSE;
897     second_char = "";
898     third_char = "";
899     for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
900       unichar_id = blob_choice_it.data()->unichar_id();
901       if (getUnicharset().eq(unichar_id, "l") && !blob_choice_it.at_last() &&
902           blob_choice_it.data_relative(1)->rating() == first_rating) {
903         temp_id = blob_choice_it.data_relative(1)->unichar_id();
904         if (getUnicharset().eq(temp_id, "1") ||
905             getUnicharset().eq(temp_id, "I")) {
906           second_char = getUnicharset().id_to_unichar(temp_id);
907           blob_choice_it.forward();
908           if (!blob_choice_it.at_last() &&
909               blob_choice_it.data_relative(1)->rating() == first_rating) {
910             temp_id = blob_choice_it.data_relative(1)->unichar_id();
911             if (getUnicharset().eq(temp_id, "1") ||
912                 getUnicharset().eq(temp_id, "I")) {
913               third_char = getUnicharset().id_to_unichar(temp_id);
914               blob_choice_it.forward();
915             }
916           }
917           ch = choose_il1 (first_char, second_char, third_char,
918             prev_char, next_char, next_next_char);
919           unichar_id = (ch != NULL && *ch != '\0') ?
920             getUnicharset().unichar_to_id(ch) : INVALID_UNICHAR_ID;
921           if (strcmp(ch, "l") != 0 &&
922               getUnicharset().eq(word.unichar_id(x), "l")) {
923             word.set_unichar_id(unichar_id, x);
924             lower_word.set_unichar_id(unichar_id, x);
925             capital_word.set_unichar_id(unichar_id, x);
926           }
927         }
928       }
929       if (unichar_id != INVALID_UNICHAR_ID) {
930         /* Find lower case */
931         if (!lower_done &&
932             (getUnicharset().get_islower(unichar_id) ||
933              (getUnicharset().get_isupper(unichar_id) && x == 0))) {
934           lower_word.set_unichar_id(unichar_id, x);
935           lower_word.set_rating(lower_word.rating() -
936             first_choice->rating() + blob_choice_it.data()->rating());
937           if (blob_choice_it.data()->certainty() < lower_word.certainty()) {
938             lower_word.set_certainty(blob_choice_it.data()->certainty());
939           }
940           lower_certainties[x] = blob_choice_it.data()->certainty();
941           lower_done = TRUE;
942         }
943         /* Find upper case */
944         if (!upper_done && getUnicharset().get_isupper(unichar_id)) {
945           capital_word.set_unichar_id(unichar_id, x);
946           capital_word.set_rating(capital_word.rating() -
947             first_choice->rating() + blob_choice_it.data()->rating());
948           if (blob_choice_it.data()->certainty() < capital_word.certainty()) {
949             capital_word.set_certainty(blob_choice_it.data()->certainty());
950           }
951           upper_certainties[x] = blob_choice_it.data()->certainty();
952           upper_done = TRUE;
953         }
954         if (!char_alpha) {
955           const CHAR_FRAGMENT *fragment =
956             getUnicharset().get_fragment(unichar_id);
957           temp_id = !fragment ? unichar_id :
958             getUnicharset().unichar_to_id(fragment->get_unichar());
959           if (getUnicharset().get_isalpha(temp_id)) {
960             char_alpha = TRUE;
961           }
962         }
963         if (lower_done && upper_done)
964           break;
965       }
966     }
967     if (char_alpha && any_alpha != NULL)
968       *any_alpha = TRUE;
969 
970     if (word.rating() > bestrate_pruning_factor * *rating_limit) {
971       if (permute_debug)
972         tprintf("\n***** Aborting high-cost word: %g > limit %g \n",
973                 word.rating(), bestrate_pruning_factor * *rating_limit);
974       return (NULL);
975     }
976 
977     *prev_char = '\0';
978     temp_id = word.unichar_id(word.length()-1);
979     if (temp_id != INVALID_UNICHAR_ID) {
980       strcpy(prev_char, getUnicharset().id_to_unichar(temp_id));
981     }
982   }
983 
984   if (word.rating() < raw_choice->rating()) {
985     *raw_choice = word;
986     LogNewChoice(*raw_choice, 1.0, certainties, true);
987   }
988 
989   if (ngram_permuter_activated)
990     return NULL;
991 
992   float rating = word.rating();
993   adjust_non_word(&word, &adjust_factor);
994   LogNewChoice(word, adjust_factor, certainties, false);
995 
996   float lower_rating = lower_word.rating();
997   adjust_non_word(&lower_word, &adjust_factor);
998   LogNewChoice(lower_word, adjust_factor, lower_certainties, false);
999 
1000   float upper_rating = capital_word.rating();
1001   adjust_non_word(&capital_word, &adjust_factor);
1002   LogNewChoice(capital_word, adjust_factor, upper_certainties, false);
1003 
1004   WERD_CHOICE *best_choice = &word;
1005   *rating_limit = rating;
1006   if (lower_word.rating() < best_choice->rating()) {
1007     best_choice = &lower_word;
1008     *rating_limit = lower_rating;
1009   }
1010   if (capital_word.rating() < best_choice->rating()) {
1011     best_choice = &capital_word;
1012     *rating_limit = upper_rating;
1013   }
1014   return new WERD_CHOICE(*best_choice);
1015 }
1016 
1017 
1018 /**********************************************************************
1019  * choose_il1
1020  *
1021  * Choose between the candidate il1 chars.
1022  **********************************************************************/
choose_il1(const char * first_char,const char * second_char,const char * third_char,const char * prev_char,const char * next_char,const char * next_next_char)1023 const char* Dict::choose_il1(const char *first_char,        //first choice
1024                              const char *second_char,       //second choice
1025                              const char *third_char,        //third choice
1026                              const char *prev_char,         //prev in word
1027                              const char *next_char,         //next in word
1028                              const char *next_next_char) {  //after next next in word
1029   inT32 type1;                   //1/I/l type of first choice
1030   inT32 type2;                   //1/I/l type of second choice
1031   inT32 type3;                   //1/I/l type of third choice
1032 
1033   int first_char_length = strlen(first_char);
1034   int prev_char_length = strlen(prev_char);
1035   int next_char_length = strlen(next_char);
1036   int next_next_char_length = strlen(next_next_char);
1037 
1038   if (*first_char == 'l' && *second_char != '\0') {
1039     if (*second_char == 'I'
1040         && (((prev_char_length != 0 &&
1041             getUnicharset().get_isupper (prev_char, prev_char_length)) &&
1042             (next_char_length == 0 ||
1043              !getUnicharset().get_islower (next_char, next_char_length)) &&
1044             (next_char_length == 0 ||
1045              !getUnicharset().get_isdigit (next_char, next_char_length))) ||
1046             ((next_char_length != 0 &&
1047              getUnicharset().get_isupper (next_char, next_char_length)) &&
1048             (prev_char_length == 0 ||
1049              !getUnicharset().get_islower (prev_char, prev_char_length)) &&
1050             (prev_char_length == 0 ||
1051              !getUnicharset().get_isdigit (prev_char, prev_char_length)))))
1052       first_char = second_char;  //override
1053     else if (*second_char == '1' || *third_char == '1') {
1054       if ((next_char_length != 0 &&
1055            getUnicharset().get_isdigit (next_char, next_char_length)) ||
1056           (prev_char_length != 0 &&
1057            getUnicharset().get_isdigit (prev_char, prev_char_length))
1058           || (*next_char == 'l' &&
1059           (next_next_char_length != 0 &&
1060            getUnicharset().get_isdigit (next_next_char,
1061                                         next_next_char_length)))) {
1062         first_char = "1";
1063         first_char_length = 1;
1064       }
1065       else if ((prev_char_length == 0 ||
1066                 !getUnicharset().get_islower (prev_char, prev_char_length)) &&
1067                ((next_char_length == 0 ||
1068                  !getUnicharset().get_islower (next_char, next_char_length)) ||
1069                 (*next_char == 's' &&
1070                 *next_next_char == 't'))) {
1071         if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0')
1072             && ((*next_char != '\'' && *next_char != '`')
1073                 || *prev_char != '\0')) {
1074           first_char = "1";
1075           first_char_length = 1;
1076         }
1077       }
1078     }
1079     if (*first_char == 'l' && *next_char != '\0' &&
1080         (prev_char_length == 0 ||
1081          !getUnicharset().get_isalpha (prev_char, prev_char_length))) {
1082       type1 = 2;
1083 
1084       if (*second_char == '1')
1085         type2 = 0;
1086       else if (*second_char == 'I')
1087         type2 = 1;
1088       else if (*second_char == 'l')
1089         type2 = 2;
1090       else
1091         type2 = type1;
1092 
1093       if (*third_char == '1')
1094         type3 = 0;
1095       else if (*third_char == 'I')
1096         type3 = 1;
1097       else if (*third_char == 'l')
1098         type3 = 2;
1099       else
1100         type3 = type1;
1101 
1102 #if 0
1103       if (bigram_counts[*next_char][type2] >
1104       bigram_counts[*next_char][type1]) {
1105         first_char = second_char;
1106         type1 = type2;
1107       }
1108       if (bigram_counts[*next_char][type3] >
1109       bigram_counts[*next_char][type1]) {
1110         first_char = third_char;
1111       }
1112 #endif
1113     }
1114   }
1115   return first_char;
1116 }
1117 
1118 //
1119 // Check all the DAWGs to see if this word is in any of them.
1120 //
valid_word(const WERD_CHOICE & word,bool numbers_ok)1121 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
1122   const WERD_CHOICE *word_ptr = &word;
1123   WERD_CHOICE temp_word;
1124   if (hyphenated()) {
1125     copy_hyphen_info(&temp_word);
1126     temp_word += word;
1127     word_ptr = &temp_word;
1128   }
1129   if (word_ptr->length() == 0) return NO_PERM;
1130   // Allocate vectors for holding current and updated
1131   // active_dawgs and constraints and initialize them.
1132   DawgInfoVector *active_dawgs = new DawgInfoVector[2];
1133   DawgInfoVector *constraints = new DawgInfoVector[2];
1134   init_active_dawgs(&(active_dawgs[0]));
1135   init_constraints(&(constraints[0]));
1136   DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
1137                      &(active_dawgs[1]), &(constraints[1]), 0.0);
1138   int last_index = word_ptr->length() - 1;
1139   // Call leter_is_okay for each letter in the word.
1140   for (int i = hyphen_base_size(); i <= last_index; ++i) {
1141     if (!((this->*letter_is_okay_)(&dawg_args, i, word_ptr,
1142                                    i == last_index))) break;
1143     // Swap active_dawgs, constraints with the corresponding updated vector.
1144     if (dawg_args.updated_active_dawgs == &(active_dawgs[1])) {
1145       dawg_args.updated_active_dawgs = &(active_dawgs[0]);
1146       dawg_args.updated_constraints = &(constraints[0]);
1147       ++(dawg_args.active_dawgs);
1148       ++(dawg_args.constraints);
1149     } else {
1150       ++(dawg_args.updated_active_dawgs);
1151       ++(dawg_args.updated_constraints);
1152       dawg_args.active_dawgs = &(active_dawgs[0]);
1153       dawg_args.constraints = &(constraints[0]);
1154     }
1155   }
1156   delete[] active_dawgs;
1157   delete[] constraints;
1158   if (dawg_args.permuter == SYSTEM_DAWG_PERM ||
1159       dawg_args.permuter == DOC_DAWG_PERM ||
1160       dawg_args.permuter == USER_DAWG_PERM ||
1161       (numbers_ok && dawg_args.permuter == NUMBER_PERM)){
1162     return dawg_args.permuter;
1163   } else {
1164     return NO_PERM;
1165   }
1166 }
1167 
1168 //
1169 // Return true if the word contains a valid punctuation pattern.
1170 //
1171 // Note: Since the domains of punctuation symbols and symblos
1172 // used in numbers are not disjoint, a valid number might contain
1173 // an invalid punctuation pattern (e.g. .99).
1174 //
valid_punctuation(const WERD_CHOICE & word)1175 bool Dict::valid_punctuation(const WERD_CHOICE &word) {
1176   if (word.length() == 0) return NO_PERM;
1177   int i;
1178   WERD_CHOICE new_word;
1179   int last_index = word.length() - 1;
1180   int new_len = 0;
1181   for (i = 0; i <= last_index; ++i) {
1182     UNICHAR_ID unichar_id = (word.unichar_id(i));
1183     if (getUnicharset().get_ispunctuation(unichar_id)) {
1184       new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
1185     } else if (!getUnicharset().get_isalpha(unichar_id) &&
1186                !getUnicharset().get_isdigit(unichar_id)) {
1187       return false;  // neither punc, nor alpha, nor digit
1188     } else if ((new_len = new_word.length()) == 0 ||
1189                new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
1190       new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
1191     }
1192   }
1193   for (i = 0; i < dawgs_.size(); ++i) {
1194     if (dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
1195         dawgs_[i]->word_in_dawg(new_word)) return true;
1196   }
1197   return false;
1198 }
1199 
1200 /**********************************************************************
1201  * fragment_state
1202  *
1203  * Given the current char choice and information about previously seen
1204  * fragments, determines whether adjacent character fragments are
1205  * present and whether they can be concatenated.
1206  *
1207  * The given prev_char_frag_info contains:
1208  *  -- fragment: if not NULL contains information about immediately
1209  *     preceeding fragmented character choice
1210  *  -- num_fragments: number of fragments that have been used so far
1211  *     to construct a character
1212  *  -- certainty: certainty of the current choice or minimum
1213  *     certainty of all fragments concatenated so far
1214  *  -- rating: rating of the current choice or sum of fragment
1215  *     ratings concatenated so far
1216  *
1217  * The output char_frag_info is filled in as follows:
1218  * -- character: is set to be NULL if the choice is a non-matching
1219  *    or non-ending fragment piece; is set to unichar of the given choice
1220  *    if it represents a regular character or a matching ending fragment
1221  * -- fragment,num_fragments,certainty,rating are set as described above
1222  *
1223  * Returns false if a non-matching fragment is discovered, true otherwise.
1224  **********************************************************************/
fragment_state_okay(UNICHAR_ID curr_unichar_id,float curr_rating,float curr_certainty,const CHAR_FRAGMENT_INFO * prev_char_frag_info,const char * debug,int word_ending,CHAR_FRAGMENT_INFO * char_frag_info)1225 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
1226                                float curr_rating, float curr_certainty,
1227                                const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1228                                const char *debug, int word_ending,
1229                                CHAR_FRAGMENT_INFO *char_frag_info) {
1230   const CHAR_FRAGMENT *this_fragment =
1231     getUnicharset().get_fragment(curr_unichar_id);
1232   const CHAR_FRAGMENT *prev_fragment =
1233     prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
1234 
1235   // Print debug info for fragments.
1236   if (debug && (prev_fragment || this_fragment)) {
1237     cprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
1238             getUnicharset().debug_str(curr_unichar_id).string(),
1239             word_ending);
1240     if (prev_fragment) {
1241       cprintf("prev_fragment %s\n", prev_fragment->to_string().string());
1242     }
1243     if (this_fragment) {
1244       cprintf("this_fragment %s\n", this_fragment->to_string().string());
1245     }
1246   }
1247 
1248   char_frag_info->unichar_id = curr_unichar_id;
1249   char_frag_info->fragment = this_fragment;
1250   char_frag_info->rating = curr_rating;
1251   char_frag_info->certainty = curr_certainty;
1252   char_frag_info->num_fragments = 1;
1253   if (prev_fragment && !this_fragment) {
1254     if (debug) tprintf("Skip choice with incomplete fragment\n");
1255     return false;
1256   }
1257   if (this_fragment) {
1258     // We are dealing with a fragment.
1259     char_frag_info->unichar_id = INVALID_UNICHAR_ID;
1260     if (prev_fragment) {
1261       if (!this_fragment->is_continuation_of(prev_fragment)) {
1262         if (debug) tprintf("Non-matching fragment piece\n");
1263         return false;
1264       }
1265       if (this_fragment->is_ending()) {
1266         char_frag_info->unichar_id =
1267           getUnicharset().unichar_to_id(this_fragment->get_unichar());
1268         char_frag_info->fragment = NULL;
1269         if (debug) {
1270           tprintf("Built character %s from fragments\n",
1271                   getUnicharset().debug_str(
1272                       char_frag_info->unichar_id).string());
1273         }
1274       } else {
1275         if (debug) tprintf("Record fragment continuation\n");
1276         char_frag_info->fragment = this_fragment;
1277       }
1278       // Update certainty and rating.
1279       char_frag_info->rating =
1280         prev_char_frag_info->rating + curr_rating;
1281       char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
1282       char_frag_info->certainty =
1283         MIN(curr_certainty, prev_char_frag_info->certainty);
1284     } else {
1285       if (this_fragment->is_beginning()) {
1286         if (debug) cprintf("Record fragment beginning\n");
1287       } else {
1288         if (debug) {
1289           tprintf("Non-starting fragment piece with no prev_fragment\n");
1290         }
1291         return false;
1292       }
1293     }
1294   }
1295   if (word_ending && char_frag_info->fragment) {
1296     if (debug) tprintf("Word can not end with a fragment\n");
1297     return false;
1298   }
1299   return true;
1300 }
1301 /**********************************************************************
1302  * top_fragments_permute_and_select
1303  *
1304  * Creates a copy of character choices list that contain only fragments
1305  * and the best non-fragmented character choice.
1306  * Permutes character in this shortened list, builds characters from
1307  * fragments if possible and returns a better choice if found.
1308  **********************************************************************/
top_fragments_permute_and_select(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_limit)1309 WERD_CHOICE *Dict::top_fragments_permute_and_select(
1310     const BLOB_CHOICE_LIST_VECTOR &char_choices,
1311     float rating_limit) {
1312   if (char_choices.length() <= 1 ||
1313       char_choices.length() > MAX_PERM_LENGTH) {
1314     return NULL;
1315   }
1316   // See it would be possible to benefit from permuting fragments.
1317   int x;
1318   float min_rating = 0.0;
1319   BLOB_CHOICE_IT blob_choice_it;
1320   for (x = 0; x < char_choices.length(); ++x) {
1321     blob_choice_it.set_to_list(char_choices.get(x));
1322     if (blob_choice_it.data()) {
1323       min_rating += blob_choice_it.data()->rating();
1324     }
1325     if (min_rating >= rating_limit) {
1326       return NULL;
1327     }
1328   }
1329   if (fragments_debug > 1) {
1330     tprintf("A choice with fragment beats top choice\n");
1331     tprintf("Running fragment permuter...\n");
1332   }
1333 
1334   // Construct a modified choices list that contains (for each position):
1335   // the best choice, all fragments and at least one choice for
1336   // a non-fragmented character.
1337   BLOB_CHOICE_LIST_VECTOR frag_char_choices(char_choices.length());
1338   for (x = 0; x < char_choices.length(); ++x) {
1339     bool need_nonfrag_char = true;
1340     BLOB_CHOICE_LIST *frag_choices = new BLOB_CHOICE_LIST();
1341     BLOB_CHOICE_IT frag_choices_it;
1342     frag_choices_it.set_to_list(frag_choices);
1343     blob_choice_it.set_to_list(char_choices.get(x));
1344     for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1345          blob_choice_it.forward()) {
1346       if (getUnicharset().get_fragment(blob_choice_it.data()->unichar_id())) {
1347         frag_choices_it.add_after_then_move(
1348             new BLOB_CHOICE(*(blob_choice_it.data())));
1349       } else if (need_nonfrag_char) {
1350         frag_choices_it.add_after_then_move(
1351             new BLOB_CHOICE(*(blob_choice_it.data())));
1352         need_nonfrag_char = false;
1353       }
1354     }
1355     frag_char_choices += frag_choices;
1356   }
1357 
1358   WERD_CHOICE *best_choice = new WERD_CHOICE();
1359   best_choice->make_bad();
1360   WERD_CHOICE word(MAX_PERM_LENGTH);
1361   word.set_permuter(TOP_CHOICE_PERM);
1362   float certainties[MAX_PERM_LENGTH];
1363   this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn;
1364   permute_choices((fragments_debug > 1) ? "fragments_debug" : NULL,
1365                   frag_char_choices, 0, NULL, &word, certainties,
1366                   &rating_limit, best_choice, NULL);
1367 
1368   frag_char_choices.delete_data_pointers();
1369   return best_choice;
1370 }
1371 
1372 /**********************************************************************
1373  * permute_choices
1374  *
1375  * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
1376  * with the given char_choice_index in char_choices.
1377  **********************************************************************/
permute_choices(const char * debug,const BLOB_CHOICE_LIST_VECTOR & char_choices,int char_choice_index,const CHAR_FRAGMENT_INFO * prev_char_frag_info,WERD_CHOICE * word,float certainties[],float * limit,WERD_CHOICE * best_choice,void * more_args)1378 void Dict::permute_choices(
1379     const char *debug,
1380     const BLOB_CHOICE_LIST_VECTOR &char_choices,
1381     int char_choice_index,
1382     const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1383     WERD_CHOICE *word,
1384     float certainties[],
1385     float *limit,
1386     WERD_CHOICE *best_choice,
1387     void *more_args) {
1388   if (debug) {
1389     tprintf("%s permute_choices: char_choice_index=%d"
1390             " limit=%4.2f rating=%4.2f, certainty=%4.2f word=%s\n",
1391             debug, char_choice_index, *limit, word->rating(),
1392             word->certainty(), word->debug_string(getUnicharset()).string());
1393   }
1394   if (char_choice_index < char_choices.length()) {
1395     BLOB_CHOICE_IT blob_choice_it;
1396     blob_choice_it.set_to_list(char_choices.get(char_choice_index));
1397     for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1398          blob_choice_it.forward()) {
1399       append_choices(debug, char_choices, *(blob_choice_it.data()),
1400                      char_choice_index, prev_char_frag_info, word,
1401                      certainties, limit, best_choice, more_args);
1402 
1403     }
1404   }
1405 }
1406 
1407 /**********************************************************************
1408  * append_choices
1409  *
1410  * Check to see whether or not the next choice is worth appending to
1411  * the word being generated. If so then keep going deeper into the word.
1412  *
1413  * This function assumes that Dict::go_deeper_fxn_ is set.
1414  **********************************************************************/
append_choices(const char * debug,const BLOB_CHOICE_LIST_VECTOR & char_choices,const BLOB_CHOICE & blob_choice,int char_choice_index,const CHAR_FRAGMENT_INFO * prev_char_frag_info,WERD_CHOICE * word,float certainties[],float * limit,WERD_CHOICE * best_choice,void * more_args)1415 void Dict::append_choices(
1416     const char *debug,
1417     const BLOB_CHOICE_LIST_VECTOR &char_choices,
1418     const BLOB_CHOICE &blob_choice,
1419     int char_choice_index,
1420     const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1421     WERD_CHOICE *word,
1422     float certainties[],
1423     float *limit,
1424     WERD_CHOICE *best_choice,
1425     void *more_args) {
1426   int word_ending =
1427     (char_choice_index == char_choices.length() - 1) ? true : false;
1428 
1429   // Deal with fragments.
1430   CHAR_FRAGMENT_INFO char_frag_info;
1431   if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
1432                            blob_choice.certainty(), prev_char_frag_info, debug,
1433                            word_ending, &char_frag_info)) {
1434     return;  // blob_choice must be an invalid fragment
1435   }
1436   // Search the next letter if this character is a fragment.
1437   if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
1438     permute_choices(debug, char_choices, char_choice_index + 1,
1439                     &char_frag_info, word, certainties, limit,
1440                     best_choice, more_args);
1441     return;
1442   }
1443 
1444   // Add the next unichar.
1445   float old_rating = word->rating();
1446   float old_certainty = word->certainty();
1447   uinT8 old_permuter = word->permuter();
1448   certainties[word->length()] = char_frag_info.certainty;
1449   word->append_unichar_id_space_allocated(
1450       char_frag_info.unichar_id, char_frag_info.num_fragments,
1451       char_frag_info.rating, char_frag_info.certainty);
1452 
1453   // Explore the next unichar.
1454   (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
1455                           &char_frag_info, word_ending, word, certainties,
1456                           limit, best_choice, more_args);
1457 
1458   // Remove the unichar we added to explore other choices in it's place.
1459   word->remove_last_unichar_id();
1460   word->set_rating(old_rating);
1461   word->set_certainty(old_certainty);
1462   word->set_permuter(old_permuter);
1463 }
1464 
1465 /**********************************************************************
1466  * go_deeper_top_fragments_fxn
1467  *
1468  * If the choice being composed so far could be better
1469  * than best_choice keep exploring choices.
1470  **********************************************************************/
go_deeper_top_fragments_fxn(const char * debug,const BLOB_CHOICE_LIST_VECTOR & char_choices,int char_choice_index,const CHAR_FRAGMENT_INFO * prev_char_frag_info,bool word_ending,WERD_CHOICE * word,float certainties[],float * limit,WERD_CHOICE * best_choice,void * more_args)1471 void Dict::go_deeper_top_fragments_fxn(
1472     const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
1473     int char_choice_index,
1474     const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1475     bool word_ending, WERD_CHOICE *word, float certainties[],
1476     float *limit, WERD_CHOICE *best_choice, void *more_args) {
1477   if (word->rating() < *limit) {
1478     if (word_ending) {
1479       if (fragments_debug > 1) {
1480         tprintf("fragments_debug new choice = %s\n",
1481                 word->debug_string(getUnicharset()).string());
1482       }
1483       *limit = word->rating();
1484 
1485       float adjust_factor;
1486       adjust_non_word(word, &adjust_factor);
1487       LogNewChoice(*word, adjust_factor, certainties, false);
1488 
1489       if (word->rating() < best_choice->rating()) {
1490         *best_choice = *word;
1491       }
1492     } else {  // search the next letter
1493       permute_choices(debug, char_choices, char_choice_index + 1,
1494                       prev_char_frag_info, word, certainties, limit,
1495                       best_choice, more_args);
1496     }
1497   } else {
1498     if (fragments_debug > 1) {
1499       tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
1500               word->debug_string(getUnicharset()).string(),
1501               word->rating(), *limit);
1502     }
1503   }
1504 }
1505 
1506 }  // namespace tesseract
1507