1 /* -*-C-*-
2 ********************************************************************************
3 *
4 * File: permute.c (Formerly permute.c)
5 * Description: Choose OCR text given character-probability maps
6 * for sequences of glyph fragments and a dictionary provided as
7 * a Dual Acyclic Word Graph.
8 * In this file, "permute" should be read "combine."
9 * Author: Mark Seaman, OCR Technology
10 * Created: Fri Sep 22 14:05:51 1989
11 * Modified: Thu Jan 3 16:38:46 1991 (Mark Seaman) marks@hpgrlt
12 * Language: C
13 * Package: N/A
14 * Status: Experimental (Do Not Distribute)
15 *
16 * (c) Copyright 1989, Hewlett-Packard Company.
17 ** Licensed under the Apache License, Version 2.0 (the "License");
18 ** you may not use this file except in compliance with the License.
19 ** You may obtain a copy of the License at
20 ** http://www.apache.org/licenses/LICENSE-2.0
21 ** Unless required by applicable law or agreed to in writing, software
22 ** distributed under the License is distributed on an "AS IS" BASIS,
23 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 ** See the License for the specific language governing permissions and
25 ** limitations under the License.
26 *
27 *********************************************************************************/
28 /*----------------------------------------------------------------------
29 I n c l u d e s
30 ---------------------------------------------------------------------*/
31
32 #include <assert.h>
33 #include <math.h>
34
35 #include "const.h"
36
37 #include "permute.h"
38
39 #include "callcpp.h"
40 #include "context.h"
41 #include "conversion.h"
42 #include "freelist.h"
43 #include "globals.h"
44 #include "ndminx.h"
45 #include "permdawg.h"
46 #include "permngram.h"
47 #include "ratngs.h"
48 #include "stopper.h"
49 #include "tordvars.h"
50 #include "tprintf.h"
51 #include "trie.h"
52 #include "varable.h"
53 #include "unicharset.h"
54 #include "dict.h"
55 #include "image.h"
56 #include "ccutil.h"
57
58 int permutation_count; // Used in metrics.cpp.
59 /*----------------------------------------------------------------------
60 V a r i a b l e s
61 ----------------------------------------------------------------------*/
62 // TODO(tkielbus) Choose a value for the MAX_NUM_EDGES constant
63 // (or make it dynamic)
64 #define MAX_NUM_EDGES 2000000
65 #define MAX_DOC_EDGES 250000
66 #define MAX_USER_EDGES 50000
67 /* Weights for adjustment */
68 #define NON_WERD 1.25
69 #define GARBAGE_STRING 1.5
70 #define MAX_PERM_LENGTH 128
71
72 // debugging flags
73 INT_VAR(fragments_debug, 0, "Debug character fragments");
74
75 BOOL_VAR(segment_debug, 0, "Debug the whole segmentation process");
76
77 BOOL_VAR(permute_debug, 0, "Debug char permutation process");
78
79
80 // control parameters
81 double_VAR(bestrate_pruning_factor, 2.0,
82 "Multiplying factor of current best rate to prune other hypotheses");
83
84 BOOL_VAR(permute_script_word, 0,
85 "Turn on word script consistency permuter");
86
87 BOOL_VAR(segment_segcost_rating, 0,
88 "incorporate segmentation cost in word rating?");
89
90 double_VAR(segment_reward_script, 0.95,
91 "Score multipler for script consistency within a word. "
92 "Being a 'reward' factor, it should be <= 1. "
93 "Smaller value implies bigger reward.");
94
95 double_VAR(segment_penalty_dict_nonword, NON_WERD,
96 "Score multiplier for glyph fragment segmentations which do not "
97 "match a dictionary word (lower is better).");
98
99 double_VAR(segment_penalty_garbage, GARBAGE_STRING,
100 "Score multiplier for poorly cased strings that are not in the "
101 "dictionary and generally look like garbage (lower is better).");
102
103 BOOL_VAR(save_doc_words, 0, "Save Document Words");
104
105 BOOL_VAR(doc_dict_enable, 1, "Enable Document Dictionary ");
106
107 BOOL_VAR(ngram_permuter_activated, FALSE,
108 "Activate character-level n-gram-based permuter");
109
110 STRING_VAR(global_user_words_suffix, "", "A list of user-provided words.");
111
112 // This is an ugly way to incorporate segmentation cost in word rating.
113 // See comments in incorporate_segcost.
114 float wordseg_rating_adjust_factor;
115
116 int permute_only_top = 0;
117
118 #define SIM_CERTAINTY_SCALE -10.0 /* Similarity matcher values */
119 #define SIM_CERTAINTY_OFFSET -10.0 /* Similarity matcher values */
120 #define SIMILARITY_FLOOR 100.0 /* Worst E*L product to stop on */
121
122 // TODO(daria): If hyphens are different in different languages and can be
123 // inferred from training data we should load their values dynamically.
124 static const char kHyphenSymbol[] = "-";
125
126 /*----------------------------------------------------------------------
127 F u n c t i o n s
128 ----------------------------------------------------------------------*/
129
130 /**********************************************************************
131 * get_best_delete_other
132 *
133 * Returns the best of two choices and deletes the other (worse) choice.
134 * A choice is better if it has a non-empty string and has a lower
135 * rating than the other choice. If the ratings are the same,
136 * choice2 is preferred over choice1.
137 **********************************************************************/
get_best_delete_other(WERD_CHOICE * choice1,WERD_CHOICE * choice2)138 WERD_CHOICE *get_best_delete_other(WERD_CHOICE *choice1,
139 WERD_CHOICE *choice2) {
140 if (!choice1) return choice2;
141 if (!choice2) return choice1;
142 if (choice1->rating() < choice2->rating() || choice2->length() == 0) {
143 delete choice2;
144 return choice1;
145 } else {
146 delete choice1;
147 return choice2;
148 }
149 }
150
151
152 /**********************************************************************
153 * good_choice
154 *
155 * Return TRUE if a good answer is found for the unknown blob rating.
156 **********************************************************************/
good_choice(const WERD_CHOICE & choice)157 int good_choice(const WERD_CHOICE &choice) {
158 register float certainty;
159 if (tord_similarity_enable) {
160 if ((choice.rating() + 1) * choice.certainty() > SIMILARITY_FLOOR)
161 return false;
162 certainty =
163 SIM_CERTAINTY_OFFSET + choice.rating() * SIM_CERTAINTY_SCALE;
164 } else {
165 certainty = choice.certainty();
166 }
167
168 return (certainty > tord_certainty_threshold) ? true : false;
169 }
170
171
172 /**********************************************************************
173 * add_document_word
174 *
175 * Add a word found on this document to the document specific
176 * dictionary.
177 **********************************************************************/
178 namespace tesseract {
add_document_word(const WERD_CHOICE & best_choice)179 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
180 // Do not add hyphenated word parts to the document dawg.
181 // hyphen_word_ will be non-NULL after the set_hyphen_word() is
182 // called when the first part of the hyphenated word is
183 // discovered and while the second part of the word is recognized.
184 // hyphen_word_ is cleared in cc_recg() before the next word on
185 // the line is recognized.
186 if (hyphen_word_) return;
187
188 char filename[CHARS_PER_LINE];
189 FILE *doc_word_file;
190 int stringlen = best_choice.length();
191
192 if (!doc_dict_enable || valid_word(best_choice) ||
193 CurrentWordAmbig() || stringlen < 2)
194 return;
195
196 if (!good_choice(best_choice) || stringlen == 2) {
197 if (best_choice.certainty() < permuter_pending_threshold)
198 return;
199
200 if (!pending_words_->word_in_dawg(best_choice)) {
201 if (stringlen > 2 ||
202 (stringlen == 2 &&
203 getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
204 getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
205 pending_words_->add_word_to_dawg(best_choice);
206 }
207 return;
208 }
209 }
210
211 if (save_doc_words) {
212 strcpy(filename, getImage()->getCCUtil()->imagefile.string());
213 strcat (filename, ".doc");
214 doc_word_file = open_file (filename, "a");
215 fprintf (doc_word_file, "%s\n",
216 best_choice.debug_string(getUnicharset()).string());
217 fclose(doc_word_file);
218 }
219 document_words_->add_word_to_dawg(best_choice);
220 }
221
222
223 /**********************************************************************
224 * adjust_non_word
225 *
226 * Assign an adjusted value to a string that is a non-word. The value
227 * that this word choice has is based on case and punctuation rules.
228 * The adjustment value applied is stored in adjust_factor upon return.
229 **********************************************************************/
adjust_non_word(WERD_CHOICE * word,float * adjust_factor)230 void Dict::adjust_non_word(WERD_CHOICE *word, float *adjust_factor) {
231 float new_rating;
232 if (permute_debug)
233 cprintf("Non-word: %s %4.2f ",
234 word->debug_string(getUnicharset()).string(), word->rating());
235
236 new_rating = word->rating() + RATING_PAD;
237 if (Context::case_ok(*word, getUnicharset()) && valid_punctuation(*word)) {
238 new_rating *= segment_penalty_dict_nonword;
239 *adjust_factor = segment_penalty_dict_nonword;
240 if (permute_debug) tprintf(", W");
241 } else {
242 new_rating *= segment_penalty_garbage;
243 *adjust_factor = segment_penalty_garbage;
244 if (permute_debug) {
245 if (!Context::case_ok(*word, getUnicharset())) tprintf(", C");
246 if (!valid_punctuation(*word)) tprintf(", P");
247 }
248 }
249 new_rating -= RATING_PAD;
250 word->set_rating(new_rating);
251 if (permute_debug)
252 cprintf (" %4.2f --> %4.2f\n", *adjust_factor, new_rating);
253 }
254
255
256 /**********************************************************************
257 * init_permute
258 *
259 * Initialize anything that needs to be set up for the permute
260 * functions.
261 **********************************************************************/
init_permute()262 void Dict::init_permute() {
263 STRING name;
264 STRING &lang = getImage()->getCCUtil()->lang;
265
266 if (dawgs_.length() != 0) end_permute();
267
268 hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
269 TessdataManager &tessdata_manager =
270 getImage()->getCCUtil()->tessdata_manager;
271
272 // Load dawgs_.
273 if (global_load_punc_dawg &&
274 tessdata_manager.SeekToStart(TESSDATA_PUNC_DAWG)) {
275 dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(),
276 DAWG_TYPE_PUNCTUATION, lang, PUNC_PERM);
277 }
278 if (global_load_system_dawg &&
279 tessdata_manager.SeekToStart(TESSDATA_SYSTEM_DAWG)) {
280 dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(),
281 DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM);
282 }
283 if (global_load_number_dawg &&
284 tessdata_manager.SeekToStart(TESSDATA_NUMBER_DAWG)) {
285 dawgs_ +=
286 new SquishedDawg(tessdata_manager.GetDataFilePtr(),
287 DAWG_TYPE_NUMBER, lang, NUMBER_PERM);
288 }
289 if (((STRING &)global_user_words_suffix).length() > 0) {
290 Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
291 MAX_USER_EDGES, getUnicharset().size());
292 name = getImage()->getCCUtil()->language_data_path_prefix;
293 name += global_user_words_suffix;
294 if (!trie_ptr->read_word_list(name.string(), getUnicharset())) {
295 tprintf("Error: failed to load %s\n", name.string());
296 exit(1);
297 }
298 dawgs_ += trie_ptr;
299 }
300 document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
301 MAX_DOC_EDGES, getUnicharset().size());
302 dawgs_ += document_words_;
303
304 // This dawg is temporary and should not be searched by letter_is_ok.
305 pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
306 MAX_DOC_EDGES, getUnicharset().size());
307
308 // The frequent words dawg is only searched when a word
309 // is found in any of the other dawgs.
310 if (tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
311 freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
312 DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM);
313 }
314
315 // Construct a list of corresponding successors for each dawg. Each entry i
316 // in the successors_ vector is a vector of integers that represent the
317 // indices into the dawgs_ vector of the successors for dawg i.
318 successors_.reserve(dawgs_.length());
319 for (int i = 0; i < dawgs_.length(); ++i) {
320 const Dawg *dawg = dawgs_[i];
321 SuccessorList *lst = new SuccessorList();
322 for (int j = 0; j < dawgs_.length(); ++j) {
323 const Dawg *other = dawgs_[j];
324 if (dawg->lang() == other->lang() &&
325 kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
326 }
327 successors_ += lst;
328 }
329 }
330
end_permute()331 void Dict::end_permute() {
332 if (dawgs_.length() == 0)
333 return; // Not safe to call twice.
334 dawgs_.delete_data_pointers();
335 successors_.delete_data_pointers();
336 dawgs_.clear();
337 successors_.clear();
338 document_words_ = NULL;
339 if (pending_words_ != NULL) delete pending_words_;
340 pending_words_ = NULL;
341 if (freq_dawg_ != NULL) delete freq_dawg_;
342 freq_dawg_ = NULL;
343 }
344
345
346 /**********************************************************************
347 * permute_all
348 *
349 * Permute all the characters together using all of the different types
350 * of permuters/selectors available. Each of the characters must have
351 * a non-NULL choice list.
352 *
353 * Note: order of applying permuters does matter, since the latter
354 * permuter will be recorded if the resulting word ratings are the same.
355 **********************************************************************/
permute_all(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_limit,WERD_CHOICE * raw_choice)356 WERD_CHOICE *Dict::permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
357 float rating_limit,
358 WERD_CHOICE *raw_choice) {
359 WERD_CHOICE *result1;
360 WERD_CHOICE *result2 = NULL;
361 BOOL8 any_alpha;
362 float top_choice_rating_limit = rating_limit;
363
364 // Initialize result1 from the result of permute_top_choice.
365 result1 = permute_top_choice(char_choices, &top_choice_rating_limit,
366 raw_choice, &any_alpha);
367
368 // Enforce script consistency within a word on some scripts
369 if (permute_script_word &&
370 !word_script_eq(char_choices, getUnicharset().common_sid()) &&
371 !word_script_eq(char_choices, getUnicharset().latin_sid())) {
372 result2 = permute_script_words(char_choices);
373 // TODO(dsl): incorporate segmentation cost into word rating.
374 // This should only be turned on for scripts that we have a segmentation
375 // cost model for, such as CJK.
376 if (segment_segcost_rating)
377 incorporate_segcost(result2);
378 result1 = get_best_delete_other(result1, result2);
379 }
380
381 // Permute character fragments if necessary.
382 if (result1 == NULL || result1->fragment_mark()) {
383 result2 = top_fragments_permute_and_select(char_choices,
384 top_choice_rating_limit);
385 result1 = get_best_delete_other(result1, result2);
386 }
387
388 // TODO(daria): update ngram permuter code.
389 if (ngram_permuter_activated) {
390 tprintf("Error: ngram permuter functionality is not available\n");
391 exit(1);
392 // A_CHOICE *ngram_choice =
393 // ngram_permute_and_select(old_char_choices, rating_limit, word_dawg_);
394 // return ngram_choice;
395 }
396
397 if (result1 == NULL)
398 return (NULL);
399 if (permute_only_top)
400 return result1;
401
402 result2 = dawg_permute_and_select(char_choices, rating_limit);
403 result1 = get_best_delete_other(result1, result2);
404
405 result2 = permute_compound_words(char_choices, rating_limit);
406 result1 = get_best_delete_other(result1, result2);
407
408 return (result1);
409 }
410
411 // Returns the top choice char id. A helper function to make code cleaner.
get_top_choice_uid(BLOB_CHOICE_LIST * blob_list)412 UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list) {
413 BLOB_CHOICE_IT blob_choice_it;
414 blob_choice_it.set_to_list(blob_list);
415 return (blob_choice_it.data()) ? blob_choice_it.data()->unichar_id()
416 : INVALID_UNICHAR_ID;
417 }
418
419 // Return the "dominant" script ID for the word. By "dominant", the script
420 // must account for at least half the characters. Otherwise, it returns 0.
get_top_word_script(const BLOB_CHOICE_LIST_VECTOR & char_choices,const UNICHARSET & unicharset)421 int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices,
422 const UNICHARSET &unicharset) {
423 int max_script = unicharset.get_script_table_size();
424 int *sid = new int[max_script];
425 int x;
426 for (x = 0; x < max_script; x++) sid[x] = 0;
427 for (x = 0; x < char_choices.length(); ++x) {
428 BLOB_CHOICE_IT blob_choice_it;
429 blob_choice_it.set_to_list(char_choices.get(x));
430 sid[blob_choice_it.data()->script_id()]++;
431 }
432 // Note that high script ID overrides lower one on a tie, thus biasing
433 // towards non-Common script (if sorted that way in unicharset file).
434 int max_sid = 0;
435 for (x = 1; x < max_script; x++)
436 if (sid[x] >= sid[max_sid]) max_sid = x;
437 if (sid[max_sid] < char_choices.length() / 2)
438 max_sid = unicharset.null_sid();
439 delete[] sid;
440 return max_sid;
441 }
442
443 /**********************************************************************
444 * Checks whether the dominant word script, if there is one, matches
445 * the given target script ID.
446 **********************************************************************/
word_script_eq(const BLOB_CHOICE_LIST_VECTOR & char_choices,int target_sid)447 bool Dict::word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices,
448 int target_sid) {
449 int max_sid = get_top_word_script(char_choices, getUnicharset());
450 // If "Latin" is not a loaded script, then latin_sid() would return 0.
451 // max_sid could also be 0 if there is no dominant script.
452 // This is faster than
453 // strcmp(getUnicharset().get_script_from_script_id(max_sid), "Latin")
454 return (max_sid > 0 && max_sid == target_sid);
455 }
456
457 /**********************************************************************
458 * Iterate through all the character choices (for a single blob) and
459 * return the first that matches the given type, which is one of 'aA0px*',
460 * for lower, upper, digit, punctuation, other, and 'any', respectively.
461 * If not match is found, a NULL is returned.
462 **********************************************************************/
find_choice_by_type(BLOB_CHOICE_LIST * char_choices,char target_type,const UNICHARSET & unicharset)463 BLOB_CHOICE* find_choice_by_type(
464 BLOB_CHOICE_LIST *char_choices,
465 char target_type,
466 const UNICHARSET &unicharset) {
467 BLOB_CHOICE_IT c_it;
468 c_it.set_to_list(char_choices);
469 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
470 bool found = false;
471 UNICHAR_ID unichar_id = c_it.data()->unichar_id();
472 switch (target_type) {
473 case '*': found = true; break;
474 case 'A': found = unicharset.get_isupper(unichar_id); break;
475 case 'a': found = unicharset.get_islower(unichar_id); break;
476 case '0': found = unicharset.get_isdigit(unichar_id); break;
477 case 'p': found = unicharset.get_ispunctuation(unichar_id); break;
478 case 'x': found = !unicharset.get_isupper(unichar_id) &&
479 !unicharset.get_islower(unichar_id) &&
480 !unicharset.get_isdigit(unichar_id) &&
481 !unicharset.get_ispunctuation(unichar_id);
482 break;
483 }
484 if (found) return c_it.data();
485 }
486 return NULL;
487 }
488
489 /**********************************************************************
490 * Iterate through all the character choices (for a single blob) and
491 * return the first that matches the target script ID. If backup_sid
492 * is not 0, then a match on either the target or backup sid is allowed.
493 * Note that there is no preference between a target or backup sid.
494 * To search for another sid only if no target_sid matched, use
495 * secondary_sid.
496 * So for example, to find first Han or Common char choice, do
497 * find_choice_by_script(cchoice, han_sid, common_sid, 0);
498 * To find first Han choice, but allow Common if none is found, do
499 * find_choice_by_script(cchoice, han_sid, 0, common_sid);
500 **********************************************************************/
find_choice_by_script(BLOB_CHOICE_LIST * char_choices,int target_sid,int backup_sid,int secondary_sid)501 BLOB_CHOICE* find_choice_by_script(
502 BLOB_CHOICE_LIST *char_choices,
503 int target_sid,
504 int backup_sid,
505 int secondary_sid) {
506 BLOB_CHOICE_IT c_it;
507 c_it.set_to_list(char_choices);
508 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
509 bool found = false;
510 if (c_it.data()->script_id() == 0) continue;
511 if (c_it.data()->script_id() == target_sid) found = true;
512 if (backup_sid > 0 && c_it.data()->script_id() == backup_sid) found = true;
513 if (found) return c_it.data();
514 }
515 if (secondary_sid > 0) {
516 c_it.set_to_list(char_choices);
517 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
518 if (c_it.data()->script_id() == 0) continue;
519 if (c_it.data()->script_id() == secondary_sid)
520 return c_it.data();
521 }
522 }
523 return NULL;
524 }
525
526 /**********************************************************************
527 * Incorporate segmentation cost into the word rating. This is done
528 * through a mutliplier wordseg_rating_adjust_factor which is determined
529 * in bestfirst.cpp during state evaluation. This is not the cleanest
530 * way to do this. It would be better to reorganize the SEARCH_STATE
531 * to keep track of associated states, or do the rating adjustment
532 * outside the permuter in evalaute_state.
533 **********************************************************************/
incorporate_segcost(WERD_CHOICE * word)534 void Dict::incorporate_segcost(WERD_CHOICE *word) {
535 if (!word || wordseg_rating_adjust_factor <= 0) return;
536
537 float old_rating = word->rating();
538 float new_rating = old_rating * wordseg_rating_adjust_factor;
539 word->set_rating(new_rating);
540 if (permute_debug)
541 tprintf("Permute segadjust %f * %f --> %f\n",
542 old_rating, wordseg_rating_adjust_factor, new_rating);
543 }
544
545 /**********************************************************************
546 * Try flipping characters in a word to get better script consistency.
547 * Similar to how upper/lower case checking is done in top_choice_permuter,
548 * this permuter tries to suggest a more script-consistent choice AND
549 * modifieds the rating. So it combines both the case_ok check and
550 * adjust_non_word functionality. However, instead of penalizing an
551 * inconsistent word with a > 1 multiplier, we reward the script-consistent
552 * choice with a < 1 multiplier.
553 **********************************************************************/
permute_script_words(const BLOB_CHOICE_LIST_VECTOR & char_choices)554 WERD_CHOICE* Dict::permute_script_words(
555 const BLOB_CHOICE_LIST_VECTOR &char_choices) {
556 if (char_choices.length() > MAX_WERD_LENGTH)
557 return NULL;
558
559 int word_sid = get_top_word_script(char_choices, getUnicharset());
560 if (word_sid == getUnicharset().null_sid())
561 return NULL;
562
563 if (permute_debug) {
564 tprintf("\n\nPermuteScript %s\n",
565 getUnicharset().get_script_from_script_id(word_sid));
566 print_char_choices_list("", char_choices, getUnicharset(),
567 permute_debug > 1);
568 }
569
570 WERD_CHOICE *current_word = new WERD_CHOICE(MAX_WERD_LENGTH);
571 BLOB_CHOICE_IT blob_choice_it;
572 bool replaced = false;
573 bool prev_is_consistent = false;
574 for (int x = 0; x < char_choices.length(); ++x) {
575 blob_choice_it.set_to_list(char_choices.get(x));
576 BLOB_CHOICE *first_choice = blob_choice_it.data();
577 if (!first_choice) return NULL;
578 UNICHAR_ID unichar_id = first_choice->unichar_id();
579 bool sid_consistent = (first_choice->script_id() == word_sid);
580 bool this_is_punct = getUnicharset().get_ispunctuation(unichar_id);
581
582 if (!sid_consistent && !this_is_punct && prev_is_consistent) {
583 // If the previous char is CJK, we prefer a cjk over non-cjk char
584 if (permute_debug) {
585 tprintf("Checking %s r%g\n", getUnicharset().id_to_unichar(unichar_id),
586 first_choice->rating());
587 print_ratings_list("\t", char_choices.get(x), getUnicharset());
588 }
589 // prefer a script consistent choice
590 BLOB_CHOICE* c_it = find_choice_by_script(char_choices.get(x),
591 word_sid, 0, 0);
592 // make this a separate check
593 // otherwise, prefer a punctuation
594 if (c_it == NULL)
595 c_it = find_choice_by_type(char_choices.get(x), 'p', getUnicharset());
596
597 if (c_it != NULL) {
598 if (permute_debug)
599 tprintf("Replacing %d r%g ==> %d r%g\n",
600 first_choice->unichar_id(), first_choice->rating(),
601 c_it->unichar_id(), c_it->rating());
602 first_choice = c_it;
603 replaced = true;
604 }
605 }
606 current_word->append_unichar_id_space_allocated(
607 first_choice->unichar_id(), 1,
608 first_choice->rating(), first_choice->certainty());
609 prev_is_consistent = sid_consistent;
610 }
611 if (replaced) {
612 // When we replace a word choice (usually top choice) with
613 // another for the sake of script consistency, we need to improve its
614 // rating so that it will replace the best choice. How much we modify
615 // the rating determines how strong is the script consistency constraint.
616 // We need a more consistent solution for all contextual constraints
617 // like case, punct pattern, script, etc. Right now, this does the same
618 // thing as adjust_non_words for case and punctuation rules.
619 float rating = current_word->rating();
620 rating *= segment_reward_script;
621 current_word->set_rating(rating);
622 }
623 current_word->populate_unichars(getUnicharset());
624 if (permute_debug && replaced)
625 current_word->print("<== permute_script_word **");
626 return current_word;
627 }
628
629 /**********************************************************************
630 * permute_characters
631 *
632 * Permute these characters together according to each of the different
633 * permuters that are enabled.
634 **********************************************************************/
permute_characters(const BLOB_CHOICE_LIST_VECTOR & char_choices,float limit,WERD_CHOICE * best_choice,WERD_CHOICE * raw_choice)635 void Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
636 float limit,
637 WERD_CHOICE *best_choice,
638 WERD_CHOICE *raw_choice) {
639 float old_raw_choice_rating = raw_choice->rating();
640 permutation_count++; /* Global counter */
641 if (tord_display_ratings > 1) {
642 cprintf("\nchar_choices in permute_characters:\n");
643 print_char_choices_list("\n==> Input CharChoices", char_choices,
644 getUnicharset(), true);
645 }
646
647 if (char_choices.length() == 1 &&
648 get_top_choice_uid(char_choices.get(0)) == 0)
649 return;
650 WERD_CHOICE *this_choice = permute_all(char_choices, limit, raw_choice);
651
652 if (raw_choice->rating() < old_raw_choice_rating) {
653 // Populate unichars_ and unichar_lengths_ of raw_choice. This is
654 // needed for various components that still work with unichars rather
655 // than unichar ids (e.g. AdaptToWord).
656 raw_choice->populate_unichars(getUnicharset());
657 }
658 if (this_choice && this_choice->rating() < best_choice->rating()) {
659 *best_choice = *this_choice;
660 // Populate unichars_ and unichar_lengths_ of best_choice. This is
661 // needed for various components that still work with unichars rather
662 // than unichar ids (dawg, *_ok functions, various hard-coded hacks).
663 best_choice->populate_unichars(getUnicharset());
664
665 if (tord_display_ratings) {
666 cprintf("permute_characters: %s\n",
667 best_choice->debug_string(getUnicharset()).string());
668 }
669 }
670 delete this_choice;
671 }
672
673 /**********************************************************************
674 * permute_compound_words
675 *
676 * Return the top choice for each character as the choice for the word.
677 **********************************************************************/
permute_compound_words(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_limit)678 WERD_CHOICE *Dict::permute_compound_words(
679 const BLOB_CHOICE_LIST_VECTOR &char_choices,
680 float rating_limit) {
681 BLOB_CHOICE *first_choice;
682 WERD_CHOICE *best_choice = NULL;
683 WERD_CHOICE current_word(MAX_WERD_LENGTH);
684 int first_index = 0;
685 int x;
686 BLOB_CHOICE_IT blob_choice_it;
687
688 if (char_choices.length() > MAX_WERD_LENGTH) {
689 WERD_CHOICE *bad_word_choice = new WERD_CHOICE();
690 bad_word_choice->make_bad();
691 return bad_word_choice;
692 }
693
694 UNICHAR_ID slash = getUnicharset().unichar_to_id("/");
695 UNICHAR_ID dash = getUnicharset().unichar_to_id("-");
696 for (x = 0; x < char_choices.length(); ++x) {
697 blob_choice_it.set_to_list(char_choices.get(x));
698 first_choice = blob_choice_it.data();
699 if (first_choice->unichar_id() == slash ||
700 first_choice->unichar_id() == dash) {
701 if (x > first_index) {
702 if (segment_debug)
703 cprintf ("Hyphenated word found\n");
704 permute_subword(char_choices, rating_limit, first_index,
705 x - 1, ¤t_word);
706 if (current_word.rating() > rating_limit)
707 break;
708 }
709 // Append hyphen/slash separator to current_word.
710 current_word.append_unichar_id_space_allocated(
711 first_choice->unichar_id(), 1,
712 first_choice->rating(), first_choice->certainty());
713
714 first_index = x + 1; // update first_index
715 }
716 }
717
718 if (first_index > 0 && first_index < x &&
719 current_word.rating() <= rating_limit) {
720 permute_subword(char_choices, rating_limit, first_index,
721 x - 1, ¤t_word);
722 best_choice = new WERD_CHOICE(current_word);
723 best_choice->set_permuter(COMPOUND_PERM);
724 }
725 return (best_choice);
726 }
727
728
729 /**********************************************************************
730 * permute_subword
731 *
732 * Permute a part of a compound word this subword is bounded by hyphens
733 * and the start and end of the word. Call the standard word permute
734 * function on a set of choices covering only part of the original
735 * word. When it is done reclaim the memory that was used in the
736 * excercise.
737 **********************************************************************/
permute_subword(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_limit,int start,int end,WERD_CHOICE * current_word)738 void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
739 float rating_limit,
740 int start,
741 int end,
742 WERD_CHOICE *current_word) {
743 int x;
744 BLOB_CHOICE_LIST_VECTOR subchoices;
745 WERD_CHOICE *best_choice = NULL;
746 WERD_CHOICE raw_choice;
747 raw_choice.make_bad();
748
749 DisableChoiceAccum();
750
751 for (x = start; x <= end; x++) {
752 if (char_choices.get(x) != NULL) {
753 subchoices += char_choices.get(x);
754 }
755 }
756
757 if (!subchoices.empty()) {
758 bool old_segment_dawg_debug = segment_dawg_debug;
759 if (segment_debug) segment_dawg_debug.set_value(true);
760 best_choice = permute_all(subchoices, rating_limit, &raw_choice);
761
762 if (segment_debug) {
763 segment_dawg_debug.set_value(old_segment_dawg_debug);
764 }
765 if (best_choice && best_choice->length() > 0) {
766 *current_word += *best_choice;
767 } else {
768 current_word->set_rating(MAX_FLOAT32);
769 }
770 } else {
771 current_word->set_rating(MAX_FLOAT32);
772 }
773
774 if (best_choice)
775 delete best_choice;
776
777 if (segment_debug && current_word->rating() < MAX_FLOAT32) {
778 cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
779 current_word->debug_string(getUnicharset()).string(),
780 current_word->rating(), current_word->certainty());
781 }
782
783 EnableChoiceAccum();
784 }
785
786 /**********************************************************************
787 * permute_top_choice
788 *
789 * Return the top choice for each character as the choice for the word.
790 * In addition a choice is created for the best lower and upper case
791 * non-words. In each character position the best lower (or upper) case
792 * character is substituted for the best overall character.
793 **********************************************************************/
permute_top_choice(const BLOB_CHOICE_LIST_VECTOR & char_choices,float * rating_limit,WERD_CHOICE * raw_choice,BOOL8 * any_alpha)794 WERD_CHOICE *Dict::permute_top_choice(
795 const BLOB_CHOICE_LIST_VECTOR &char_choices,
796 float* rating_limit,
797 WERD_CHOICE *raw_choice,
798 BOOL8 *any_alpha) {
799 BLOB_CHOICE *first_choice;
800 const char *first_char; //first choice
801 const char *second_char; //second choice
802 const char *third_char; //third choice
803 char prev_char[UNICHAR_LEN + 1]; //prev in word
804 const char *next_char = ""; //next in word
805 const char *next_next_char = ""; //after next next in word
806
807 WERD_CHOICE word(MAX_PERM_LENGTH);
808 word.set_permuter(TOP_CHOICE_PERM);
809 WERD_CHOICE capital_word(MAX_PERM_LENGTH);
810 capital_word.set_permuter(UPPER_CASE_PERM);
811 WERD_CHOICE lower_word(MAX_PERM_LENGTH);
812 lower_word.set_permuter(LOWER_CASE_PERM);
813
814 int x;
815 BOOL8 char_alpha;
816 float first_rating = 0;
817 float adjust_factor;
818
819 float certainties[MAX_PERM_LENGTH + 1];
820 float lower_certainties[MAX_PERM_LENGTH + 1];
821 float upper_certainties[MAX_PERM_LENGTH + 1];
822
823 BLOB_CHOICE_IT blob_choice_it;
824 UNICHAR_ID temp_id;
825 UNICHAR_ID unichar_id;
826 UNICHAR_ID space = getUnicharset().unichar_to_id(" ");
827 register const char* ch;
828 register inT8 lower_done;
829 register inT8 upper_done;
830
831 prev_char[0] = '\0';
832
833 if (any_alpha != NULL)
834 *any_alpha = FALSE;
835
836 if (char_choices.length() > MAX_PERM_LENGTH) {
837 return (NULL);
838 }
839
840 for (x = 0; x < char_choices.length(); ++x) {
841 if (x + 1 < char_choices.length()) {
842 unichar_id = get_top_choice_uid(char_choices.get(x+1));
843 next_char = unichar_id != INVALID_UNICHAR_ID ?
844 getUnicharset().id_to_unichar(unichar_id) : "";
845 } else {
846 next_char = "";
847 }
848
849 if (x + 2 < char_choices.length()) {
850 unichar_id = get_top_choice_uid(char_choices.get(x+2));
851 next_next_char = unichar_id != INVALID_UNICHAR_ID ?
852 getUnicharset().id_to_unichar(unichar_id) : "";
853 } else {
854 next_next_char = "";
855 }
856
857 blob_choice_it.set_to_list(char_choices.get(x));
858 ASSERT_HOST(!blob_choice_it.empty());
859 first_choice = NULL;
860 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
861 blob_choice_it.forward()) { // find the best non-fragment char choice
862 temp_id = blob_choice_it.data()->unichar_id();
863 if (!(getUnicharset().get_fragment(temp_id))) {
864 first_choice = blob_choice_it.data();
865 break;
866 } else if (char_choices.length() > 1) {
867 word.set_fragment_mark(true);
868 capital_word.set_fragment_mark(true);
869 lower_word.set_fragment_mark(true);
870 }
871 }
872 if (first_choice == NULL) {
873 cprintf("Permuter found only fragments for"
874 " character at position %d; word=%s\n",
875 x, word.debug_string(getUnicharset()).string());
876 }
877 ASSERT_HOST(first_choice != NULL);
878
879 unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ?
880 first_choice->unichar_id() : space;
881 first_char = getUnicharset().id_to_unichar(unichar_id);
882 first_rating = first_choice->rating();
883 word.append_unichar_id_space_allocated(
884 unichar_id, 1, first_choice->rating(), first_choice->certainty());
885 capital_word.append_unichar_id_space_allocated(
886 unichar_id, 1, first_choice->rating(), first_choice->certainty());
887 lower_word.append_unichar_id_space_allocated(
888 unichar_id, 1, first_choice->rating(), first_choice->certainty());
889
890 certainties[x] = first_choice->certainty();
891 lower_certainties[x] = first_choice->certainty();
892 upper_certainties[x] = first_choice->certainty();
893
894 lower_done = FALSE;
895 upper_done = FALSE;
896 char_alpha = FALSE;
897 second_char = "";
898 third_char = "";
899 for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
900 unichar_id = blob_choice_it.data()->unichar_id();
901 if (getUnicharset().eq(unichar_id, "l") && !blob_choice_it.at_last() &&
902 blob_choice_it.data_relative(1)->rating() == first_rating) {
903 temp_id = blob_choice_it.data_relative(1)->unichar_id();
904 if (getUnicharset().eq(temp_id, "1") ||
905 getUnicharset().eq(temp_id, "I")) {
906 second_char = getUnicharset().id_to_unichar(temp_id);
907 blob_choice_it.forward();
908 if (!blob_choice_it.at_last() &&
909 blob_choice_it.data_relative(1)->rating() == first_rating) {
910 temp_id = blob_choice_it.data_relative(1)->unichar_id();
911 if (getUnicharset().eq(temp_id, "1") ||
912 getUnicharset().eq(temp_id, "I")) {
913 third_char = getUnicharset().id_to_unichar(temp_id);
914 blob_choice_it.forward();
915 }
916 }
917 ch = choose_il1 (first_char, second_char, third_char,
918 prev_char, next_char, next_next_char);
919 unichar_id = (ch != NULL && *ch != '\0') ?
920 getUnicharset().unichar_to_id(ch) : INVALID_UNICHAR_ID;
921 if (strcmp(ch, "l") != 0 &&
922 getUnicharset().eq(word.unichar_id(x), "l")) {
923 word.set_unichar_id(unichar_id, x);
924 lower_word.set_unichar_id(unichar_id, x);
925 capital_word.set_unichar_id(unichar_id, x);
926 }
927 }
928 }
929 if (unichar_id != INVALID_UNICHAR_ID) {
930 /* Find lower case */
931 if (!lower_done &&
932 (getUnicharset().get_islower(unichar_id) ||
933 (getUnicharset().get_isupper(unichar_id) && x == 0))) {
934 lower_word.set_unichar_id(unichar_id, x);
935 lower_word.set_rating(lower_word.rating() -
936 first_choice->rating() + blob_choice_it.data()->rating());
937 if (blob_choice_it.data()->certainty() < lower_word.certainty()) {
938 lower_word.set_certainty(blob_choice_it.data()->certainty());
939 }
940 lower_certainties[x] = blob_choice_it.data()->certainty();
941 lower_done = TRUE;
942 }
943 /* Find upper case */
944 if (!upper_done && getUnicharset().get_isupper(unichar_id)) {
945 capital_word.set_unichar_id(unichar_id, x);
946 capital_word.set_rating(capital_word.rating() -
947 first_choice->rating() + blob_choice_it.data()->rating());
948 if (blob_choice_it.data()->certainty() < capital_word.certainty()) {
949 capital_word.set_certainty(blob_choice_it.data()->certainty());
950 }
951 upper_certainties[x] = blob_choice_it.data()->certainty();
952 upper_done = TRUE;
953 }
954 if (!char_alpha) {
955 const CHAR_FRAGMENT *fragment =
956 getUnicharset().get_fragment(unichar_id);
957 temp_id = !fragment ? unichar_id :
958 getUnicharset().unichar_to_id(fragment->get_unichar());
959 if (getUnicharset().get_isalpha(temp_id)) {
960 char_alpha = TRUE;
961 }
962 }
963 if (lower_done && upper_done)
964 break;
965 }
966 }
967 if (char_alpha && any_alpha != NULL)
968 *any_alpha = TRUE;
969
970 if (word.rating() > bestrate_pruning_factor * *rating_limit) {
971 if (permute_debug)
972 tprintf("\n***** Aborting high-cost word: %g > limit %g \n",
973 word.rating(), bestrate_pruning_factor * *rating_limit);
974 return (NULL);
975 }
976
977 *prev_char = '\0';
978 temp_id = word.unichar_id(word.length()-1);
979 if (temp_id != INVALID_UNICHAR_ID) {
980 strcpy(prev_char, getUnicharset().id_to_unichar(temp_id));
981 }
982 }
983
984 if (word.rating() < raw_choice->rating()) {
985 *raw_choice = word;
986 LogNewChoice(*raw_choice, 1.0, certainties, true);
987 }
988
989 if (ngram_permuter_activated)
990 return NULL;
991
992 float rating = word.rating();
993 adjust_non_word(&word, &adjust_factor);
994 LogNewChoice(word, adjust_factor, certainties, false);
995
996 float lower_rating = lower_word.rating();
997 adjust_non_word(&lower_word, &adjust_factor);
998 LogNewChoice(lower_word, adjust_factor, lower_certainties, false);
999
1000 float upper_rating = capital_word.rating();
1001 adjust_non_word(&capital_word, &adjust_factor);
1002 LogNewChoice(capital_word, adjust_factor, upper_certainties, false);
1003
1004 WERD_CHOICE *best_choice = &word;
1005 *rating_limit = rating;
1006 if (lower_word.rating() < best_choice->rating()) {
1007 best_choice = &lower_word;
1008 *rating_limit = lower_rating;
1009 }
1010 if (capital_word.rating() < best_choice->rating()) {
1011 best_choice = &capital_word;
1012 *rating_limit = upper_rating;
1013 }
1014 return new WERD_CHOICE(*best_choice);
1015 }
1016
1017
1018 /**********************************************************************
1019 * choose_il1
1020 *
1021 * Choose between the candidate il1 chars.
1022 **********************************************************************/
choose_il1(const char * first_char,const char * second_char,const char * third_char,const char * prev_char,const char * next_char,const char * next_next_char)1023 const char* Dict::choose_il1(const char *first_char, //first choice
1024 const char *second_char, //second choice
1025 const char *third_char, //third choice
1026 const char *prev_char, //prev in word
1027 const char *next_char, //next in word
1028 const char *next_next_char) { //after next next in word
1029 inT32 type1; //1/I/l type of first choice
1030 inT32 type2; //1/I/l type of second choice
1031 inT32 type3; //1/I/l type of third choice
1032
1033 int first_char_length = strlen(first_char);
1034 int prev_char_length = strlen(prev_char);
1035 int next_char_length = strlen(next_char);
1036 int next_next_char_length = strlen(next_next_char);
1037
1038 if (*first_char == 'l' && *second_char != '\0') {
1039 if (*second_char == 'I'
1040 && (((prev_char_length != 0 &&
1041 getUnicharset().get_isupper (prev_char, prev_char_length)) &&
1042 (next_char_length == 0 ||
1043 !getUnicharset().get_islower (next_char, next_char_length)) &&
1044 (next_char_length == 0 ||
1045 !getUnicharset().get_isdigit (next_char, next_char_length))) ||
1046 ((next_char_length != 0 &&
1047 getUnicharset().get_isupper (next_char, next_char_length)) &&
1048 (prev_char_length == 0 ||
1049 !getUnicharset().get_islower (prev_char, prev_char_length)) &&
1050 (prev_char_length == 0 ||
1051 !getUnicharset().get_isdigit (prev_char, prev_char_length)))))
1052 first_char = second_char; //override
1053 else if (*second_char == '1' || *third_char == '1') {
1054 if ((next_char_length != 0 &&
1055 getUnicharset().get_isdigit (next_char, next_char_length)) ||
1056 (prev_char_length != 0 &&
1057 getUnicharset().get_isdigit (prev_char, prev_char_length))
1058 || (*next_char == 'l' &&
1059 (next_next_char_length != 0 &&
1060 getUnicharset().get_isdigit (next_next_char,
1061 next_next_char_length)))) {
1062 first_char = "1";
1063 first_char_length = 1;
1064 }
1065 else if ((prev_char_length == 0 ||
1066 !getUnicharset().get_islower (prev_char, prev_char_length)) &&
1067 ((next_char_length == 0 ||
1068 !getUnicharset().get_islower (next_char, next_char_length)) ||
1069 (*next_char == 's' &&
1070 *next_next_char == 't'))) {
1071 if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0')
1072 && ((*next_char != '\'' && *next_char != '`')
1073 || *prev_char != '\0')) {
1074 first_char = "1";
1075 first_char_length = 1;
1076 }
1077 }
1078 }
1079 if (*first_char == 'l' && *next_char != '\0' &&
1080 (prev_char_length == 0 ||
1081 !getUnicharset().get_isalpha (prev_char, prev_char_length))) {
1082 type1 = 2;
1083
1084 if (*second_char == '1')
1085 type2 = 0;
1086 else if (*second_char == 'I')
1087 type2 = 1;
1088 else if (*second_char == 'l')
1089 type2 = 2;
1090 else
1091 type2 = type1;
1092
1093 if (*third_char == '1')
1094 type3 = 0;
1095 else if (*third_char == 'I')
1096 type3 = 1;
1097 else if (*third_char == 'l')
1098 type3 = 2;
1099 else
1100 type3 = type1;
1101
1102 #if 0
1103 if (bigram_counts[*next_char][type2] >
1104 bigram_counts[*next_char][type1]) {
1105 first_char = second_char;
1106 type1 = type2;
1107 }
1108 if (bigram_counts[*next_char][type3] >
1109 bigram_counts[*next_char][type1]) {
1110 first_char = third_char;
1111 }
1112 #endif
1113 }
1114 }
1115 return first_char;
1116 }
1117
1118 //
1119 // Check all the DAWGs to see if this word is in any of them.
1120 //
valid_word(const WERD_CHOICE & word,bool numbers_ok)1121 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
1122 const WERD_CHOICE *word_ptr = &word;
1123 WERD_CHOICE temp_word;
1124 if (hyphenated()) {
1125 copy_hyphen_info(&temp_word);
1126 temp_word += word;
1127 word_ptr = &temp_word;
1128 }
1129 if (word_ptr->length() == 0) return NO_PERM;
1130 // Allocate vectors for holding current and updated
1131 // active_dawgs and constraints and initialize them.
1132 DawgInfoVector *active_dawgs = new DawgInfoVector[2];
1133 DawgInfoVector *constraints = new DawgInfoVector[2];
1134 init_active_dawgs(&(active_dawgs[0]));
1135 init_constraints(&(constraints[0]));
1136 DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
1137 &(active_dawgs[1]), &(constraints[1]), 0.0);
1138 int last_index = word_ptr->length() - 1;
1139 // Call leter_is_okay for each letter in the word.
1140 for (int i = hyphen_base_size(); i <= last_index; ++i) {
1141 if (!((this->*letter_is_okay_)(&dawg_args, i, word_ptr,
1142 i == last_index))) break;
1143 // Swap active_dawgs, constraints with the corresponding updated vector.
1144 if (dawg_args.updated_active_dawgs == &(active_dawgs[1])) {
1145 dawg_args.updated_active_dawgs = &(active_dawgs[0]);
1146 dawg_args.updated_constraints = &(constraints[0]);
1147 ++(dawg_args.active_dawgs);
1148 ++(dawg_args.constraints);
1149 } else {
1150 ++(dawg_args.updated_active_dawgs);
1151 ++(dawg_args.updated_constraints);
1152 dawg_args.active_dawgs = &(active_dawgs[0]);
1153 dawg_args.constraints = &(constraints[0]);
1154 }
1155 }
1156 delete[] active_dawgs;
1157 delete[] constraints;
1158 if (dawg_args.permuter == SYSTEM_DAWG_PERM ||
1159 dawg_args.permuter == DOC_DAWG_PERM ||
1160 dawg_args.permuter == USER_DAWG_PERM ||
1161 (numbers_ok && dawg_args.permuter == NUMBER_PERM)){
1162 return dawg_args.permuter;
1163 } else {
1164 return NO_PERM;
1165 }
1166 }
1167
1168 //
1169 // Return true if the word contains a valid punctuation pattern.
1170 //
1171 // Note: Since the domains of punctuation symbols and symblos
1172 // used in numbers are not disjoint, a valid number might contain
1173 // an invalid punctuation pattern (e.g. .99).
1174 //
valid_punctuation(const WERD_CHOICE & word)1175 bool Dict::valid_punctuation(const WERD_CHOICE &word) {
1176 if (word.length() == 0) return NO_PERM;
1177 int i;
1178 WERD_CHOICE new_word;
1179 int last_index = word.length() - 1;
1180 int new_len = 0;
1181 for (i = 0; i <= last_index; ++i) {
1182 UNICHAR_ID unichar_id = (word.unichar_id(i));
1183 if (getUnicharset().get_ispunctuation(unichar_id)) {
1184 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
1185 } else if (!getUnicharset().get_isalpha(unichar_id) &&
1186 !getUnicharset().get_isdigit(unichar_id)) {
1187 return false; // neither punc, nor alpha, nor digit
1188 } else if ((new_len = new_word.length()) == 0 ||
1189 new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
1190 new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
1191 }
1192 }
1193 for (i = 0; i < dawgs_.size(); ++i) {
1194 if (dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
1195 dawgs_[i]->word_in_dawg(new_word)) return true;
1196 }
1197 return false;
1198 }
1199
1200 /**********************************************************************
1201 * fragment_state
1202 *
1203 * Given the current char choice and information about previously seen
1204 * fragments, determines whether adjacent character fragments are
1205 * present and whether they can be concatenated.
1206 *
1207 * The given prev_char_frag_info contains:
1208 * -- fragment: if not NULL contains information about immediately
1209 * preceeding fragmented character choice
1210 * -- num_fragments: number of fragments that have been used so far
1211 * to construct a character
1212 * -- certainty: certainty of the current choice or minimum
1213 * certainty of all fragments concatenated so far
1214 * -- rating: rating of the current choice or sum of fragment
1215 * ratings concatenated so far
1216 *
1217 * The output char_frag_info is filled in as follows:
1218 * -- character: is set to be NULL if the choice is a non-matching
1219 * or non-ending fragment piece; is set to unichar of the given choice
1220 * if it represents a regular character or a matching ending fragment
1221 * -- fragment,num_fragments,certainty,rating are set as described above
1222 *
1223 * Returns false if a non-matching fragment is discovered, true otherwise.
1224 **********************************************************************/
fragment_state_okay(UNICHAR_ID curr_unichar_id,float curr_rating,float curr_certainty,const CHAR_FRAGMENT_INFO * prev_char_frag_info,const char * debug,int word_ending,CHAR_FRAGMENT_INFO * char_frag_info)1225 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
1226 float curr_rating, float curr_certainty,
1227 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1228 const char *debug, int word_ending,
1229 CHAR_FRAGMENT_INFO *char_frag_info) {
1230 const CHAR_FRAGMENT *this_fragment =
1231 getUnicharset().get_fragment(curr_unichar_id);
1232 const CHAR_FRAGMENT *prev_fragment =
1233 prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
1234
1235 // Print debug info for fragments.
1236 if (debug && (prev_fragment || this_fragment)) {
1237 cprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
1238 getUnicharset().debug_str(curr_unichar_id).string(),
1239 word_ending);
1240 if (prev_fragment) {
1241 cprintf("prev_fragment %s\n", prev_fragment->to_string().string());
1242 }
1243 if (this_fragment) {
1244 cprintf("this_fragment %s\n", this_fragment->to_string().string());
1245 }
1246 }
1247
1248 char_frag_info->unichar_id = curr_unichar_id;
1249 char_frag_info->fragment = this_fragment;
1250 char_frag_info->rating = curr_rating;
1251 char_frag_info->certainty = curr_certainty;
1252 char_frag_info->num_fragments = 1;
1253 if (prev_fragment && !this_fragment) {
1254 if (debug) tprintf("Skip choice with incomplete fragment\n");
1255 return false;
1256 }
1257 if (this_fragment) {
1258 // We are dealing with a fragment.
1259 char_frag_info->unichar_id = INVALID_UNICHAR_ID;
1260 if (prev_fragment) {
1261 if (!this_fragment->is_continuation_of(prev_fragment)) {
1262 if (debug) tprintf("Non-matching fragment piece\n");
1263 return false;
1264 }
1265 if (this_fragment->is_ending()) {
1266 char_frag_info->unichar_id =
1267 getUnicharset().unichar_to_id(this_fragment->get_unichar());
1268 char_frag_info->fragment = NULL;
1269 if (debug) {
1270 tprintf("Built character %s from fragments\n",
1271 getUnicharset().debug_str(
1272 char_frag_info->unichar_id).string());
1273 }
1274 } else {
1275 if (debug) tprintf("Record fragment continuation\n");
1276 char_frag_info->fragment = this_fragment;
1277 }
1278 // Update certainty and rating.
1279 char_frag_info->rating =
1280 prev_char_frag_info->rating + curr_rating;
1281 char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
1282 char_frag_info->certainty =
1283 MIN(curr_certainty, prev_char_frag_info->certainty);
1284 } else {
1285 if (this_fragment->is_beginning()) {
1286 if (debug) cprintf("Record fragment beginning\n");
1287 } else {
1288 if (debug) {
1289 tprintf("Non-starting fragment piece with no prev_fragment\n");
1290 }
1291 return false;
1292 }
1293 }
1294 }
1295 if (word_ending && char_frag_info->fragment) {
1296 if (debug) tprintf("Word can not end with a fragment\n");
1297 return false;
1298 }
1299 return true;
1300 }
1301 /**********************************************************************
1302 * top_fragments_permute_and_select
1303 *
1304 * Creates a copy of character choices list that contain only fragments
1305 * and the best non-fragmented character choice.
1306 * Permutes character in this shortened list, builds characters from
1307 * fragments if possible and returns a better choice if found.
1308 **********************************************************************/
top_fragments_permute_and_select(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_limit)1309 WERD_CHOICE *Dict::top_fragments_permute_and_select(
1310 const BLOB_CHOICE_LIST_VECTOR &char_choices,
1311 float rating_limit) {
1312 if (char_choices.length() <= 1 ||
1313 char_choices.length() > MAX_PERM_LENGTH) {
1314 return NULL;
1315 }
1316 // See it would be possible to benefit from permuting fragments.
1317 int x;
1318 float min_rating = 0.0;
1319 BLOB_CHOICE_IT blob_choice_it;
1320 for (x = 0; x < char_choices.length(); ++x) {
1321 blob_choice_it.set_to_list(char_choices.get(x));
1322 if (blob_choice_it.data()) {
1323 min_rating += blob_choice_it.data()->rating();
1324 }
1325 if (min_rating >= rating_limit) {
1326 return NULL;
1327 }
1328 }
1329 if (fragments_debug > 1) {
1330 tprintf("A choice with fragment beats top choice\n");
1331 tprintf("Running fragment permuter...\n");
1332 }
1333
1334 // Construct a modified choices list that contains (for each position):
1335 // the best choice, all fragments and at least one choice for
1336 // a non-fragmented character.
1337 BLOB_CHOICE_LIST_VECTOR frag_char_choices(char_choices.length());
1338 for (x = 0; x < char_choices.length(); ++x) {
1339 bool need_nonfrag_char = true;
1340 BLOB_CHOICE_LIST *frag_choices = new BLOB_CHOICE_LIST();
1341 BLOB_CHOICE_IT frag_choices_it;
1342 frag_choices_it.set_to_list(frag_choices);
1343 blob_choice_it.set_to_list(char_choices.get(x));
1344 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1345 blob_choice_it.forward()) {
1346 if (getUnicharset().get_fragment(blob_choice_it.data()->unichar_id())) {
1347 frag_choices_it.add_after_then_move(
1348 new BLOB_CHOICE(*(blob_choice_it.data())));
1349 } else if (need_nonfrag_char) {
1350 frag_choices_it.add_after_then_move(
1351 new BLOB_CHOICE(*(blob_choice_it.data())));
1352 need_nonfrag_char = false;
1353 }
1354 }
1355 frag_char_choices += frag_choices;
1356 }
1357
1358 WERD_CHOICE *best_choice = new WERD_CHOICE();
1359 best_choice->make_bad();
1360 WERD_CHOICE word(MAX_PERM_LENGTH);
1361 word.set_permuter(TOP_CHOICE_PERM);
1362 float certainties[MAX_PERM_LENGTH];
1363 this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn;
1364 permute_choices((fragments_debug > 1) ? "fragments_debug" : NULL,
1365 frag_char_choices, 0, NULL, &word, certainties,
1366 &rating_limit, best_choice, NULL);
1367
1368 frag_char_choices.delete_data_pointers();
1369 return best_choice;
1370 }
1371
1372 /**********************************************************************
1373 * permute_choices
1374 *
1375 * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
1376 * with the given char_choice_index in char_choices.
1377 **********************************************************************/
permute_choices(const char * debug,const BLOB_CHOICE_LIST_VECTOR & char_choices,int char_choice_index,const CHAR_FRAGMENT_INFO * prev_char_frag_info,WERD_CHOICE * word,float certainties[],float * limit,WERD_CHOICE * best_choice,void * more_args)1378 void Dict::permute_choices(
1379 const char *debug,
1380 const BLOB_CHOICE_LIST_VECTOR &char_choices,
1381 int char_choice_index,
1382 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1383 WERD_CHOICE *word,
1384 float certainties[],
1385 float *limit,
1386 WERD_CHOICE *best_choice,
1387 void *more_args) {
1388 if (debug) {
1389 tprintf("%s permute_choices: char_choice_index=%d"
1390 " limit=%4.2f rating=%4.2f, certainty=%4.2f word=%s\n",
1391 debug, char_choice_index, *limit, word->rating(),
1392 word->certainty(), word->debug_string(getUnicharset()).string());
1393 }
1394 if (char_choice_index < char_choices.length()) {
1395 BLOB_CHOICE_IT blob_choice_it;
1396 blob_choice_it.set_to_list(char_choices.get(char_choice_index));
1397 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1398 blob_choice_it.forward()) {
1399 append_choices(debug, char_choices, *(blob_choice_it.data()),
1400 char_choice_index, prev_char_frag_info, word,
1401 certainties, limit, best_choice, more_args);
1402
1403 }
1404 }
1405 }
1406
1407 /**********************************************************************
1408 * append_choices
1409 *
1410 * Check to see whether or not the next choice is worth appending to
1411 * the word being generated. If so then keep going deeper into the word.
1412 *
1413 * This function assumes that Dict::go_deeper_fxn_ is set.
1414 **********************************************************************/
append_choices(const char * debug,const BLOB_CHOICE_LIST_VECTOR & char_choices,const BLOB_CHOICE & blob_choice,int char_choice_index,const CHAR_FRAGMENT_INFO * prev_char_frag_info,WERD_CHOICE * word,float certainties[],float * limit,WERD_CHOICE * best_choice,void * more_args)1415 void Dict::append_choices(
1416 const char *debug,
1417 const BLOB_CHOICE_LIST_VECTOR &char_choices,
1418 const BLOB_CHOICE &blob_choice,
1419 int char_choice_index,
1420 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1421 WERD_CHOICE *word,
1422 float certainties[],
1423 float *limit,
1424 WERD_CHOICE *best_choice,
1425 void *more_args) {
1426 int word_ending =
1427 (char_choice_index == char_choices.length() - 1) ? true : false;
1428
1429 // Deal with fragments.
1430 CHAR_FRAGMENT_INFO char_frag_info;
1431 if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
1432 blob_choice.certainty(), prev_char_frag_info, debug,
1433 word_ending, &char_frag_info)) {
1434 return; // blob_choice must be an invalid fragment
1435 }
1436 // Search the next letter if this character is a fragment.
1437 if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
1438 permute_choices(debug, char_choices, char_choice_index + 1,
1439 &char_frag_info, word, certainties, limit,
1440 best_choice, more_args);
1441 return;
1442 }
1443
1444 // Add the next unichar.
1445 float old_rating = word->rating();
1446 float old_certainty = word->certainty();
1447 uinT8 old_permuter = word->permuter();
1448 certainties[word->length()] = char_frag_info.certainty;
1449 word->append_unichar_id_space_allocated(
1450 char_frag_info.unichar_id, char_frag_info.num_fragments,
1451 char_frag_info.rating, char_frag_info.certainty);
1452
1453 // Explore the next unichar.
1454 (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
1455 &char_frag_info, word_ending, word, certainties,
1456 limit, best_choice, more_args);
1457
1458 // Remove the unichar we added to explore other choices in it's place.
1459 word->remove_last_unichar_id();
1460 word->set_rating(old_rating);
1461 word->set_certainty(old_certainty);
1462 word->set_permuter(old_permuter);
1463 }
1464
1465 /**********************************************************************
1466 * go_deeper_top_fragments_fxn
1467 *
1468 * If the choice being composed so far could be better
1469 * than best_choice keep exploring choices.
1470 **********************************************************************/
go_deeper_top_fragments_fxn(const char * debug,const BLOB_CHOICE_LIST_VECTOR & char_choices,int char_choice_index,const CHAR_FRAGMENT_INFO * prev_char_frag_info,bool word_ending,WERD_CHOICE * word,float certainties[],float * limit,WERD_CHOICE * best_choice,void * more_args)1471 void Dict::go_deeper_top_fragments_fxn(
1472 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
1473 int char_choice_index,
1474 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1475 bool word_ending, WERD_CHOICE *word, float certainties[],
1476 float *limit, WERD_CHOICE *best_choice, void *more_args) {
1477 if (word->rating() < *limit) {
1478 if (word_ending) {
1479 if (fragments_debug > 1) {
1480 tprintf("fragments_debug new choice = %s\n",
1481 word->debug_string(getUnicharset()).string());
1482 }
1483 *limit = word->rating();
1484
1485 float adjust_factor;
1486 adjust_non_word(word, &adjust_factor);
1487 LogNewChoice(*word, adjust_factor, certainties, false);
1488
1489 if (word->rating() < best_choice->rating()) {
1490 *best_choice = *word;
1491 }
1492 } else { // search the next letter
1493 permute_choices(debug, char_choices, char_choice_index + 1,
1494 prev_char_frag_info, word, certainties, limit,
1495 best_choice, more_args);
1496 }
1497 } else {
1498 if (fragments_debug > 1) {
1499 tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
1500 word->debug_string(getUnicharset()).string(),
1501 word->rating(), *limit);
1502 }
1503 }
1504 }
1505
1506 } // namespace tesseract
1507