1 /******************************************************************************
2 ** Filename: stopper.c
3 ** Purpose: Stopping criteria for word classifier.
4 ** Author: Dan Johnson
5 ** History: Mon Apr 29 14:56:49 1991, DSJ, Created.
6 **
7 ** (c) Copyright Hewlett-Packard Company, 1988.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 ******************************************************************************/
18 /**----------------------------------------------------------------------------
19 Include Files and Type Defines
20 ----------------------------------------------------------------------------**/
21 #include "stopper.h"
22 #include "emalloc.h"
23 #include "matchdefs.h"
24 #include "callcpp.h"
25 #include "permute.h"
26 #include "context.h"
27 #include "danerror.h"
28 #include "const.h"
29 #include "freelist.h"
30 #include "efio.h"
31 #include "scanutils.h"
32 #include "unichar.h"
33 #include "varable.h"
34 #include "dict.h"
35 #include "image.h"
36 #include "ccutil.h"
37 #include "ratngs.h"
38 #include "ambigs.h"
39
40 #include <stdio.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <math.h>
44 #ifdef __UNIX__
45 #include <assert.h>
46 #endif
47
48 /* these are kludges - add appropriate .h file later */
49 /* from adaptmatch.cpp */
50 #define MAX_WERD_SIZE 100
51
52 typedef struct
53 {
54 VIABLE_CHOICE Choice;
55 float ChunkCertainty[MAX_NUM_CHUNKS];
56 UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
57 } EXPANDED_CHOICE;
58
59 /**----------------------------------------------------------------------------
60 Macros
61 ----------------------------------------------------------------------------**/
62 #define BestCertainty(Choices) (((VIABLE_CHOICE) first_node (Choices))->Certainty)
63 #define BestRating(Choices) (((VIABLE_CHOICE) first_node (Choices))->Rating)
64 #define BestFactor(Choices) (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)
65
66 #define AmbigThreshold(F1,F2) (((F2) - (F1)) * stopper_ambiguity_threshold_gain - \
67 stopper_ambiguity_threshold_offset)
68
69 /*---------------------------------------------------------------------------
70 Private Function Prototoypes
71 ----------------------------------------------------------------------------*/
72 void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
73
74 int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1,
75 void *arg2); //VIABLE_CHOICE Choice2);
76
77 void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice);
78
79 int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice,
80 void *item2); //EXPANDED_CHOICE *BestChoice);
81
82 int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
83 const WERD_CHOICE &BestChoice);
84
85 /**----------------------------------------------------------------------
86 V a r i a b l e s
87 ----------------------------------------------------------------------**/
88 double_VAR(certainty_scale, 20.0, "Certainty scaling factor");
89
90 double_VAR(stopper_nondict_certainty_base, -2.50,
91 "Certainty threshold for non-dict words");
92
93 double_VAR(stopper_phase2_certainty_rejection_offset, 1.0,
94 "Reject certainty offset");
95
96 INT_VAR(stopper_smallword_size, 2,
97 "Size of dict word to be treated as non-dict word");
98
99 double_VAR(stopper_certainty_per_char, -0.50,
100 "Certainty to add for each dict char above small word size.");
101
102 double_VAR(stopper_allowable_character_badness, 3.0,
103 "Max certaintly variation allowed in a word (in sigma)");
104
105 INT_VAR(stopper_debug_level, 0, "Stopper debug level");
106
107 double_VAR(stopper_ambiguity_threshold_gain, 8.0,
108 "Gain factor for ambiguity threshold");
109
110 double_VAR(stopper_ambiguity_threshold_offset, 1.5,
111 "Certainty offset for ambiguity threshold");
112
113 BOOL_VAR(stopper_no_acceptable_choices, false,
114 "Make AcceptableChoice() always return false. Useful"
115 " when there is a need to explore all segmentations");
116
117 BOOL_VAR(save_raw_choices, false, "Save all explored raw choices");
118
119 INT_VAR (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
120
121 STRING_VAR(word_to_debug, "", "Word for which stopper debug information"
122 " should be printed to stdout");
123
124 STRING_VAR(word_to_debug_lengths, "", "Lengths of unichars in word_to_debug");
125
126 /**----------------------------------------------------------------------------
127 Public Code
128 ----------------------------------------------------------------------------**/
129 /*---------------------------------------------------------------------------*/
130 namespace tesseract {
AcceptableChoice(BLOB_CHOICE_LIST_VECTOR * Choices,WERD_CHOICE * BestChoice,const WERD_CHOICE & RawChoice,DANGERR * fixpt,ACCEPTABLE_CHOICE_CALLER caller,bool * modified_blobs)131 int Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
132 WERD_CHOICE *BestChoice,
133 const WERD_CHOICE &RawChoice,
134 DANGERR *fixpt,
135 ACCEPTABLE_CHOICE_CALLER caller,
136 bool *modified_blobs) {
137 /*
138 ** Parameters:
139 ** Choices choices for current segmentation
140 ** BestChoice best choice for current segmentation
141 ** RawChoice best raw choice for current segmentation
142 ** Variables Used:
143 ** stopper_nondict_certainty_base certainty for a non-dict word
144 ** stopper_smallword_size size of word to be treated as non-word
145 ** stopper_certainty_per_char certainty to add for each dict char
146 ** Operation: Return TRUE if the results from this segmentation are
147 ** good enough to stop. Otherwise return FALSE.
148 ** Return: TRUE or FALSE.
149 ** Exceptions: none
150 ** History: Mon Apr 29 14:57:32 1991, DSJ, Created.
151 */
152 float CertaintyThreshold = stopper_nondict_certainty_base;
153 int WordSize;
154
155 if (stopper_no_acceptable_choices) return false;
156
157 if (fixpt != NULL)
158 fixpt->index = -1;
159 if (BestChoice->length() == 0)
160 return (FALSE);
161 if (caller == CHOPPER_CALLER && BestChoice->fragment_mark()) {
162 if (stopper_debug_level >= 1) {
163 cprintf("AcceptableChoice(): a choice with fragments beats BestChoice");
164 }
165 return false;
166 }
167
168 bool no_dang_ambigs =
169 NoDangerousAmbig(BestChoice, fixpt, true, Choices, modified_blobs);
170
171 if (stopper_debug_level >= 1)
172 tprintf("\nStopper: %s (word=%c, case=%c)\n",
173 BestChoice->debug_string(getUnicharset()).string(),
174 (valid_word(*BestChoice) ? 'y' : 'n'),
175 (Context::case_ok(*BestChoice, getUnicharset()) ? 'y' : 'n'));
176
177 if (valid_word(*BestChoice) &&
178 Context::case_ok(*BestChoice, getUnicharset())) {
179 WordSize = LengthOfShortestAlphaRun(*BestChoice);
180 WordSize -= stopper_smallword_size;
181 if (WordSize < 0)
182 WordSize = 0;
183 CertaintyThreshold += WordSize * stopper_certainty_per_char;
184 }
185
186 if (stopper_debug_level >= 1)
187 tprintf("Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
188 BestChoice->certainty(), CertaintyThreshold);
189
190 if (no_dang_ambigs &&
191 BestChoice->certainty() > CertaintyThreshold &&
192 UniformCertainties(*Choices, *BestChoice)) {
193 return (TRUE);
194 } else {
195 return (FALSE);
196 }
197 } /* AcceptableChoice */
198
199
200 /*---------------------------------------------------------------------------*/
AcceptableResult(const WERD_CHOICE & BestChoice,const WERD_CHOICE & RawChoice)201 int Dict::AcceptableResult(const WERD_CHOICE &BestChoice,
202 const WERD_CHOICE &RawChoice) {
203 /*
204 ** Parameters:
205 ** BestChoice best choice for current word
206 ** RawChoice best raw choice for current word
207 ** Variables Used:
208 ** stopper_nondict_certainty_base certainty for a non-dict word
209 ** stopper_smallword_size size of word to be treated as non-word
210 ** stopper_certainty_per_char certainty to add for each dict char
211 ** best_choices_ list of all good choices found
212 ** reject_offset_ allowed offset before a word is rejected
213 ** Operation: Return FALSE if the best choice for the current word
214 ** is questionable and should be tried again on the second
215 ** pass or should be flagged to the user.
216 ** Return: TRUE or FALSE.
217 ** Exceptions: none
218 ** History: Thu May 9 14:05:05 1991, DSJ, Created.
219 */
220 float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
221 int WordSize;
222
223 if (stopper_debug_level >= 1) {
224 tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
225 BestChoice.debug_string(getUnicharset()).string(),
226 (valid_word(BestChoice) ? 'y' : 'n'),
227 (Context::case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
228 ((rest (best_choices_) != NIL) ? 'n' : 'y'));
229 }
230
231 if (BestChoice.length() == 0 || CurrentWordAmbig())
232 return (FALSE);
233 if (BestChoice.fragment_mark()) {
234 if (stopper_debug_level >= 1) {
235 cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n");
236 }
237 return false;
238 }
239 if (valid_word(BestChoice) &&
240 Context::case_ok(BestChoice, getUnicharset())) {
241 WordSize = LengthOfShortestAlphaRun(BestChoice);
242 WordSize -= stopper_smallword_size;
243 if (WordSize < 0)
244 WordSize = 0;
245 CertaintyThreshold += WordSize * stopper_certainty_per_char;
246 }
247
248 if (stopper_debug_level >= 1)
249 cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
250 BestChoice.certainty(), CertaintyThreshold);
251
252 if (BestChoice.certainty() > CertaintyThreshold &&
253 !stopper_no_acceptable_choices) {
254 if (stopper_debug_level >= 1)
255 cprintf("ACCEPTED\n");
256 return (TRUE);
257 }
258 else {
259 if (stopper_debug_level >= 1)
260 cprintf("REJECTED\n");
261 return (FALSE);
262 }
263 } /* AcceptableResult */
264
265
266 /*---------------------------------------------------------------------------*/
AlternativeChoicesWorseThan(FLOAT32 Threshold)267 int Dict::AlternativeChoicesWorseThan(FLOAT32 Threshold) {
268 /*
269 ** Parameters:
270 ** Threshold minimum adjust factor for alternative choices
271 ** Variables Used:
272 ** best_choices_ alternative choices for current word
273 ** Operation: This routine returns TRUE if there are no alternative
274 ** choices for the current word OR if all alternatives have
275 ** an adjust factor worse than Threshold.
276 ** Return: TRUE or FALSE.
277 ** Exceptions: none
278 ** History: Mon Jun 3 09:36:31 1991, DSJ, Created.
279 */
280 LIST Alternatives;
281 VIABLE_CHOICE Choice;
282
283 Alternatives = rest (best_choices_);
284 iterate(Alternatives) {
285 Choice = (VIABLE_CHOICE) first_node (Alternatives);
286 if (Choice->AdjustFactor <= Threshold)
287 return (FALSE);
288 }
289
290 return (TRUE);
291
292 } /* AlternativeChoicesWorseThan */
293
294
295 /*---------------------------------------------------------------------------*/
CurrentBestChoiceIs(const WERD_CHOICE & WordChoice)296 int Dict::CurrentBestChoiceIs(const WERD_CHOICE &WordChoice) {
297 /*
298 ** Parameters:
299 ** Word word that will be compared to the best choice
300 ** Variables Used:
301 ** best_choices_ set of best choices for current word
302 ** Operation: Returns TRUE if Word is the same as the current best
303 ** choice, FALSE otherwise.
304 ** Return: TRUE or FALSE
305 ** Exceptions: none
306 ** History: Thu May 30 14:44:22 1991, DSJ, Created.
307 */
308 return (best_choices_ != NIL &&
309 StringSameAs(WordChoice, (VIABLE_CHOICE)first_node(best_choices_)));
310 } /* CurrentBestChoiceIs */
311
312
313 /*---------------------------------------------------------------------------*/
CurrentBestChoiceAdjustFactor()314 FLOAT32 Dict::CurrentBestChoiceAdjustFactor() {
315 /*
316 ** Parameters: none
317 ** Variables Used:
318 ** best_choices_ set of best choices for current word
319 ** Operation: Return the adjustment factor for the best choice for
320 ** the current word.
321 ** Return: Adjust factor for current best choice.
322 ** Exceptions: none
323 ** History: Thu May 30 14:48:24 1991, DSJ, Created.
324 */
325 VIABLE_CHOICE BestChoice;
326
327 if (best_choices_ == NIL)
328 return (MAX_FLOAT32);
329
330 BestChoice = (VIABLE_CHOICE) first_node (best_choices_);
331 return (BestChoice->AdjustFactor);
332
333 } /* CurrentBestChoiceAdjustFactor */
334
335
336 /*---------------------------------------------------------------------------*/
CurrentWordAmbig()337 int Dict::CurrentWordAmbig() {
338 /*
339 ** Parameters: none
340 ** Variables Used:
341 ** best_choices_ set of best choices for current word
342 ** Operation: This routine returns TRUE if there are multiple good
343 ** choices for the current word and FALSE otherwise.
344 ** Return: TRUE or FALSE
345 ** Exceptions: none
346 ** History: Wed May 22 15:38:38 1991, DSJ, Created.
347 */
348 return (rest (best_choices_) != NIL);
349
350 } /* CurrentWordAmbig */
351
352
353 /*---------------------------------------------------------------------------*/
DebugWordChoices()354 void Dict::DebugWordChoices() {
355 /*
356 ** Parameters: none
357 ** Variables Used:
358 ** best_raw_choice_
359 ** best_choices_
360 ** Operation: Print the current choices for this word to stdout.
361 ** Return: none
362 ** Exceptions: none
363 ** History: Wed May 15 13:52:08 1991, DSJ, Created.
364 */
365 LIST Choices;
366 int i;
367 char LabelString[80];
368 VIABLE_CHOICE VChoice = (VIABLE_CHOICE)first_node(best_choices_);
369 bool force_debug =
370 fragments_debug && VChoice != NULL && VChoice->ComposedFromCharFragments;
371
372 if (stopper_debug_level >= 1 || force_debug ||
373 (((STRING)word_to_debug).length() > 0 && best_choices_ &&
374 StringSameAs(word_to_debug.string(), word_to_debug_lengths.string(),
375 (VIABLE_CHOICE)first_node(best_choices_)))) {
376 if (best_raw_choice_)
377 PrintViableChoice(stderr, "\nBest Raw Choice: ", best_raw_choice_);
378
379 i = 1;
380 Choices = best_choices_;
381 if (Choices)
382 cprintf("\nBest Cooked Choices:\n");
383 iterate(Choices) {
384 sprintf(LabelString, "Cooked Choice #%d: ", i);
385 PrintViableChoice(stderr, LabelString,
386 (VIABLE_CHOICE)first_node(Choices));
387 i++;
388 }
389 }
390 } /* DebugWordChoices */
391
392 // Print all the choices in raw_choices_ list for non 1-1 ambiguities.
PrintAmbigAlternatives(FILE * file,const char * label,int label_num_unichars)393 void Dict::PrintAmbigAlternatives(FILE *file, const char *label,
394 int label_num_unichars) {
395 iterate(raw_choices_) {
396 VIABLE_CHOICE Choice = (VIABLE_CHOICE)first_node(raw_choices_);
397 if (Choice->Length > 0 &&
398 (label_num_unichars > 1 || Choice->Length > 1)) {
399 for (int i = 0; i < Choice->Length; i++) {
400 fprintf(file, "%s",
401 getUnicharset().id_to_unichar(Choice->Blob[i].Class));
402 }
403 fflush(file);
404 fprintf(file, "\t%s\t%.4f\t%.4f\n", label,
405 Choice->Rating, Choice->Certainty);
406 }
407 }
408 }
409
410 /*---------------------------------------------------------------------------*/
FilterWordChoices()411 void Dict::FilterWordChoices() {
412 /*
413 ** Parameters: none
414 ** Variables Used:
415 ** best_choices_ set of choices for current word
416 ** Operation: This routine removes from best_choices_ all choices which
417 ** are not within a reasonable range of the best choice.
418 ** Return: none
419 ** Exceptions: none
420 ** History: Wed May 15 13:08:24 1991, DSJ, Created.
421 */
422 EXPANDED_CHOICE BestChoice;
423
424 if (best_choices_ == NIL || second_node (best_choices_) == NIL)
425 return;
426
427 /* compute certainties and class for each chunk in best choice */
428 ExpandChoice((VIABLE_CHOICE_STRUCT *)first_node(best_choices_), &BestChoice);
429
430 set_rest (best_choices_, delete_d (rest (best_choices_),
431 &BestChoice, FreeBadChoice));
432
433 } /* FilterWordChoices */
434
435 /*---------------------------------------------------------------------------*/
FindClassifierErrors(FLOAT32 MinRating,FLOAT32 MaxRating,FLOAT32 RatingMargin,FLOAT32 Thresholds[])436 void Dict::FindClassifierErrors(FLOAT32 MinRating,
437 FLOAT32 MaxRating,
438 FLOAT32 RatingMargin,
439 FLOAT32 Thresholds[]) {
440 /*
441 ** Parameters:
442 ** MinRating limits how tight to make a template
443 ** MaxRating limits how loose to make a template
444 ** RatingMargin amount of margin to put in template
445 ** Thresholds[] place to put error thresholds
446 ** Operation: This routine compares the best choice for the current
447 ** word to the best raw choice to determine which characters
448 ** were classified incorrectly by the classifier. It then
449 ** places a separate threshold into Thresholds for each
450 ** character in the word. If the classifier was correct,
451 ** MaxRating is placed into Thresholds. If the
452 ** classifier was incorrect, the avg. match rating (error
453 ** percentage) of the classifier's incorrect choice minus
454 ** some margin is
455 ** placed into thresholds. This can then be used by the
456 ** caller to try to create a new template for the desired
457 ** class that will classify the character with a rating better
458 ** than the threshold value. The match rating placed into
459 ** Thresholds is never allowed to be below MinRating in order
460 ** to prevent trying to make overly tight templates.
461 ** Return: none (results are placed in Thresholds)
462 ** Exceptions: none
463 ** History: Fri May 31 16:02:57 1991, DSJ, Created.
464 */
465 EXPANDED_CHOICE BestRaw;
466 VIABLE_CHOICE Choice;
467 int i, j, Chunk;
468 FLOAT32 AvgRating;
469 int NumErrorChunks;
470
471 assert (best_choices_ != NIL);
472 assert (best_raw_choice_ != NULL);
473
474 ExpandChoice(best_raw_choice_, &BestRaw);
475 Choice = (VIABLE_CHOICE) first_node (best_choices_);
476
477 for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) {
478 AvgRating = 0.0;
479 NumErrorChunks = 0;
480
481 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
482 if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) {
483 AvgRating += BestRaw.ChunkCertainty[Chunk];
484 NumErrorChunks++;
485 }
486 }
487
488 if (NumErrorChunks > 0) {
489 AvgRating /= NumErrorChunks;
490 *Thresholds = (AvgRating / -certainty_scale) * (1.0 - RatingMargin);
491 }
492 else
493 *Thresholds = MaxRating;
494
495 if (*Thresholds > MaxRating)
496 *Thresholds = MaxRating;
497 if (*Thresholds < MinRating)
498 *Thresholds = MinRating;
499 }
500 } /* FindClassifierErrors */
501
502
503 /*---------------------------------------------------------------------------*/
InitChoiceAccum()504 void Dict::InitChoiceAccum() {
505 /*
506 ** Parameters: none
507 ** Operation: This routine initializes the data structures used to
508 ** keep track the good word choices found for a word.
509 ** Return: none
510 ** Exceptions: none
511 ** History: Fri May 17 07:59:00 1991, DSJ, Created.
512 */
513 BLOB_WIDTH *BlobWidth, *End;
514
515 if (best_raw_choice_)
516 memfree(best_raw_choice_);
517 best_raw_choice_ = NULL;
518
519 if (best_choices_)
520 destroy_nodes(best_choices_, memfree);
521 best_choices_ = NIL;
522
523 if (raw_choices_)
524 destroy_nodes(raw_choices_, memfree);
525 raw_choices_ = NIL;
526
527 EnableChoiceAccum();
528
529 for (BlobWidth = current_segmentation_,
530 End = current_segmentation_ + MAX_NUM_CHUNKS;
531 BlobWidth < End; *BlobWidth++ = 1);
532
533 } /* InitChoiceAccum */
534
535
536 /*---------------------------------------------------------------------------*/
LogNewSegmentation(PIECES_STATE BlobWidth)537 void Dict::LogNewSegmentation(PIECES_STATE BlobWidth) {
538 /*
539 ** Parameters:
540 ** BlobWidth[] number of chunks in each blob in segmentation
541 ** Variables Used:
542 ** current_segmentation blob widths for current segmentation
543 ** Operation: This routine updates the blob widths in current_segmentation
544 ** to be the same as provided in BlobWidth.
545 ** Return: none
546 ** Exceptions: none
547 ** History: Mon May 20 11:52:26 1991, DSJ, Created.
548 */
549 BLOB_WIDTH *Segmentation;
550
551 for (Segmentation = current_segmentation_; *BlobWidth != 0;
552 BlobWidth++, Segmentation++)
553 *Segmentation = *BlobWidth;
554 *Segmentation = 0;
555
556 } /* LogNewSegmentation */
557
558
559 /*---------------------------------------------------------------------------*/
LogNewSplit(int Blob)560 void Dict::LogNewSplit(int Blob) {
561 /*
562 ** Parameters:
563 ** Blob index of blob that was split
564 ** Variables Used:
565 ** best_raw_choice_ current best raw choice
566 ** best_choices_ list of best choices found so far
567 ** Operation: This routine adds 1 chunk to the specified blob for each
568 ** choice in best_choices_ and for the best_raw_choice_.
569 ** Return: none
570 ** Exceptions: none
571 ** History: Mon May 20 11:38:56 1991, DSJ, Created.
572 */
573 LIST Choices;
574
575 if (best_raw_choice_) {
576 AddNewChunk(best_raw_choice_, Blob);
577 }
578
579 Choices = best_choices_;
580 iterate(Choices) {
581 AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
582 }
583 Choices = raw_choices_;
584 iterate(Choices) {
585 AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
586 }
587 } /* LogNewSplit */
588
589
590 /*---------------------------------------------------------------------------*/
LogNewChoice(const WERD_CHOICE & WordChoice,FLOAT32 AdjustFactor,const float Certainties[],bool raw_choice)591 void Dict::LogNewChoice(const WERD_CHOICE &WordChoice,
592 FLOAT32 AdjustFactor,
593 const float Certainties[],
594 bool raw_choice) {
595 /*
596 ** Parameters:
597 ** Choice new choice for current word
598 ** AdjustFactor adjustment factor which was applied to choice
599 ** Certainties certainties for each char in new choice
600 ** ChoicesList list with choices seen so far
601 ** Variables Used:
602 ** best_raw_choice_ best raw choice so far for current word
603 ** Operation: This routine adds Choice to ChoicesList if the
604 ** adjusted certainty for Choice is within a reasonable range
605 ** of the best choice in ChoicesList. The ChoicesList
606 ** list is kept in sorted order by rating. Duplicates are
607 ** removed.
608 ** Return: none
609 ** Exceptions: none
610 ** History: Wed May 15 09:57:19 1991, DSJ, Created.
611 */
612 VIABLE_CHOICE NewChoice;
613 LIST ChoicesList;
614 LIST Choices;
615 FLOAT32 Threshold;
616
617 if (!keep_word_choices_)
618 return;
619
620 if (raw_choice) {
621 if (!best_raw_choice_)
622 best_raw_choice_ = NewViableChoice(WordChoice, AdjustFactor, Certainties);
623 else if (WordChoice.rating() < best_raw_choice_->Rating) {
624 if (ChoiceSameAs(WordChoice, best_raw_choice_))
625 FillViableChoice(WordChoice, AdjustFactor, Certainties, true,
626 best_raw_choice_);
627 else {
628 memfree(best_raw_choice_);
629 best_raw_choice_ =
630 NewViableChoice(WordChoice, AdjustFactor, Certainties);
631 }
632 }
633 if (!save_raw_choices) return;
634 ChoicesList = raw_choices_;
635 } else {
636 ChoicesList = best_choices_;
637 }
638
639 /* throw out obviously bad choices to save some work */
640 if (ChoicesList != NIL) {
641 Threshold = AmbigThreshold (BestFactor (ChoicesList), AdjustFactor);
642 if (Threshold > -stopper_ambiguity_threshold_offset)
643 Threshold = -stopper_ambiguity_threshold_offset;
644 if (WordChoice.certainty() - BestCertainty (ChoicesList) < Threshold)
645 return;
646 }
647
648 /* see if a choice with the same text string has already been found */
649 NewChoice = NULL;
650 Choices = ChoicesList;
651
652 iterate(Choices) {
653 if (ChoiceSameAs (WordChoice, (VIABLE_CHOICE) first_node (Choices))) {
654 if (WordChoice.rating() < BestRating (Choices)) {
655 NewChoice = (VIABLE_CHOICE) first_node (Choices);
656 } else {
657 return;
658 }
659 }
660 }
661
662 if (NewChoice) {
663 FillViableChoice(WordChoice, AdjustFactor, Certainties, true, NewChoice);
664 ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node);
665 }
666 else {
667 NewChoice = NewViableChoice (WordChoice, AdjustFactor, Certainties);
668 }
669
670 ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
671 if (stopper_debug_level >= 2)
672 raw_choice ? PrintViableChoice (stderr, "New Raw Choice: ", NewChoice) :
673 PrintViableChoice (stderr, "New Word Choice: ", NewChoice);
674 if (count (ChoicesList) > tessedit_truncate_wordchoice_log) {
675 Choices =
676 (LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log);
677 destroy_nodes (rest (Choices), Efree);
678 set_rest(Choices, NIL);
679 }
680
681 // Update raw_choices_/best_choices_ pointer.
682 if (raw_choice) {
683 raw_choices_ = ChoicesList;
684 } else {
685 best_choices_ = ChoicesList;
686 }
687 } /* LogNewChoice */
688
689
690 /*---------------------------------------------------------------------------*/
NoDangerousAmbig(WERD_CHOICE * best_choice,DANGERR * fix_pt,bool fix_replaceable,BLOB_CHOICE_LIST_VECTOR * blob_choices,bool * modified_blobs)691 int Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
692 DANGERR *fix_pt,
693 bool fix_replaceable,
694 BLOB_CHOICE_LIST_VECTOR *blob_choices,
695 bool *modified_blobs) {
696 if (stopper_debug_level > 2) {
697 tprintf("\nRunning NoDangerousAmbig() for %s\n",
698 best_choice->debug_string(getUnicharset()).string());
699 }
700
701 // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
702 // for each unichar id in BestChoice.
703 BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
704 int i;
705 bool modified_best_choice = false;
706 bool ambigs_found = false;
707 // For each position in best_choice:
708 // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
709 // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
710 // -- look for ambiguities corresponding to wrong_ngram in the list while
711 // adding the following unichar_ids from best_choice to wrong_ngram
712 //
713 // Repeat the above procedure twice: first time look through
714 // ambigs to be replaced and replace all the ambiguities found;
715 // second time look through dangerous ambiguities and construct
716 // ambig_blob_choices with fake a blob choice for each ambiguity
717 // and pass them to dawg_permute_and_select() to search for
718 // ambiguous words in the dictionaries.
719 //
720 // Note that during the execution of the for loop (on the first pass)
721 // if replacements are made the length of best_choice might change.
722 for (int pass = 0; pass < 2; ++pass) {
723 bool replace = (pass == 0);
724 const UnicharAmbigsVector &table = replace ?
725 getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
726 if (!replace) {
727 // Initialize ambig_blob_choices with lists containing a single
728 // unichar id for the correspoding position in best_choice.
729 // best_choice consisting from only the original letters will
730 // have a rating of 0.0.
731 for (i = 0; i < best_choice->length(); ++i) {
732 BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
733 BLOB_CHOICE_IT lst_it(lst);
734 lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
735 0.0, 0.0, 0, -1));
736 ambig_blob_choices.push_back(lst);
737 }
738 }
739 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
740 int wrong_ngram_index;
741 int next_index;
742 for (i = 0; i < best_choice->length(); ++i) {
743 UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
744 if (stopper_debug_level > 2) {
745 tprintf("Looking for %s ngrams starting with %s:\n",
746 replace ? "replaceable" : "ambiguous",
747 getUnicharset().debug_str(curr_unichar_id).string());
748 }
749 wrong_ngram_index = 0;
750 wrong_ngram[wrong_ngram_index] = curr_unichar_id;
751 if (curr_unichar_id == INVALID_UNICHAR_ID ||
752 curr_unichar_id >= table.size() ||
753 table[curr_unichar_id] == NULL) {
754 continue; // there is no ambig spec for this unichar id
755 }
756 AmbigSpec_IT spec_it(table[curr_unichar_id]);
757 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
758 const AmbigSpec *ambig_spec = spec_it.data();
759 wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
760 int compare = UnicharIdArrayUtils::compare(wrong_ngram,
761 ambig_spec->wrong_ngram);
762 if (stopper_debug_level > 2) {
763 tprintf("candidate ngram: ");
764 UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
765 tprintf("current ngram from spec: ");
766 UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
767 tprintf("comparison result: %d\n", compare);
768 }
769 if (compare == 0) {
770 if (replace) {
771 if (stopper_debug_level > 2) {
772 tprintf("replace ambiguity with: ");
773 UnicharIdArrayUtils::print(
774 ambig_spec->correct_fragments, getUnicharset());
775 }
776 ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
777 ambig_spec->correct_ngram_id,
778 best_choice, blob_choices, modified_blobs);
779 modified_best_choice = true;
780 } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
781 // We found dang ambig - update ambig_blob_choices.
782 if (stopper_debug_level > 2) {
783 tprintf("found ambiguity: ");
784 UnicharIdArrayUtils::print(
785 ambig_spec->correct_fragments, getUnicharset());
786 }
787 ambigs_found = true;
788 for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
789 ++tmp_index) {
790 // Add a blob choice for the corresponding fragment of the
791 // ambiguity. These fake blob choices are initialized with
792 // negative ratings (which are not possible for real blob
793 // choices), so that dawg_permute_and_select() considers any
794 // word not consisting of only the original letters a better
795 // choice and stops searching for alternatives once such a
796 // choice is found.
797 BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
798 bc_it.add_to_end(new BLOB_CHOICE(
799 ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, 0, -1));
800 }
801 }
802 spec_it.forward();
803 } else if (compare == -1) {
804 if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
805 ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
806 // Add the next unichar id to wrong_ngram and keep looking for
807 // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
808 wrong_ngram[++wrong_ngram_index] =
809 best_choice->unichar_id(next_index);
810 } else {
811 break; // no more matching ambigs in this AMBIG_SPEC_LIST
812 }
813 } else {
814 spec_it.forward();
815 }
816 } // end searching AmbigSpec_LIST
817 } // end searching best_choice
818 } // end searching replace and dangerous ambigs
819 if (modified_best_choice) best_choice->populate_unichars(getUnicharset());
820 // If any ambiguities were found permute the constructed ambig_blob_choices
821 // to see if an alternative dictionary word can be found.
822 if (ambigs_found) {
823 if (stopper_debug_level > 2) {
824 tprintf("\nResulting ambig_blob_choices:\n");
825 for (i = 0; i < ambig_blob_choices.length(); ++i) {
826 print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
827 tprintf("\n");
828 }
829 }
830 WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
831 ambigs_found = (alt_word->rating() < 0.0);
832 if (ambigs_found && stopper_debug_level >= 1) {
833 tprintf ("Stopper: Possible ambiguous word = %s\n",
834 alt_word->debug_string(getUnicharset()).string());
835 }
836 delete alt_word;
837 }
838 ambig_blob_choices.delete_data_pointers();
839 return !ambigs_found;
840 }
841
EndDangerousAmbigs()842 void Dict::EndDangerousAmbigs() {}
843
844 /*---------------------------------------------------------------------------*/
SettupStopperPass1()845 void Dict::SettupStopperPass1() {
846 /*
847 ** Parameters: none
848 ** Variables Used:
849 ** reject_offset_ offset allowed before word is rejected
850 ** Operation: This routine performs any settup of stopper variables
851 ** that is needed in preparation for the first pass.
852 ** Return: none
853 ** Exceptions: none
854 ** History: Mon Jun 3 12:32:00 1991, DSJ, Created.
855 */
856 reject_offset_ = 0.0;
857 } /* SettupStopperPass1 */
858
859
860 /*---------------------------------------------------------------------------*/
SettupStopperPass2()861 void Dict::SettupStopperPass2() {
862 /*
863 ** Parameters: none
864 ** Variables Used:
865 ** reject_offset_ offset allowed before word is rejected
866 ** Operation: This routine performs any settup of stopper variables
867 ** that is needed in preparation for the second pass.
868 ** Return: none
869 ** Exceptions: none
870 ** History: Mon Jun 3 12:32:00 1991, DSJ, Created.
871 */
872 reject_offset_ = stopper_phase2_certainty_rejection_offset;
873 } /* SettupStopperPass2 */
874 } // namespace tesseract
875
876
877 /**----------------------------------------------------------------------------
878 Private Code
879 ----------------------------------------------------------------------------**/
880 /*---------------------------------------------------------------------------*/
AddNewChunk(VIABLE_CHOICE Choice,int Blob)881 void AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
882 /*
883 ** Parameters:
884 ** Choice choice to add a new chunk to
885 ** Blob index of blob being split
886 ** Operation: This routine increments the chunk count of the character
887 ** in Choice which corresponds to Blob.
888 ** Return: none
889 ** Exceptions: none
890 ** History: Mon May 20 11:43:27 1991, DSJ, Created.
891 */
892 int i, LastChunk;
893
894 for (i = 0, LastChunk = 0; i < Choice->Length; i++) {
895 LastChunk += Choice->Blob[i].NumChunks;
896 if (Blob < LastChunk) {
897 (Choice->Blob[i].NumChunks)++;
898 return;
899 }
900 }
901 mem_tidy (1);
902 cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
903 Choice->Length, LastChunk, Blob);
904 assert(FALSE); /* this should never get executed */
905
906 } /* AddNewChunk */
907
908
909 /*---------------------------------------------------------------------------*/
910 namespace tesseract {
911 // Replaces the corresponding wrong ngram in werd_choice with the correct one.
912 // We indicate that this newly inserted ngram unichar is composed from several
913 // fragments and modify the corresponding entries in blob_choices to contain
914 // fragments of the correct ngram unichar instead of the original unichars.
915 // Ratings and certainties of entries in blob_choices and werd_choice are
916 // unichaged. E.g. for werd_choice mystring'' and ambiguity ''->":
917 // werd_choice becomes mystring", first ' in blob_choices becomes |"|0|2,
918 // second one is set to |"|1|2.
ReplaceAmbig(int wrong_ngram_begin_index,int wrong_ngram_size,UNICHAR_ID correct_ngram_id,WERD_CHOICE * werd_choice,BLOB_CHOICE_LIST_VECTOR * blob_choices,bool * modified_blobs)919 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
920 UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
921 BLOB_CHOICE_LIST_VECTOR *blob_choices,
922 bool *modified_blobs) {
923 int num_blobs_to_replace = 0;
924 int begin_blob_index = 0;
925 int i;
926 for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
927 if (i >= wrong_ngram_begin_index) {
928 num_blobs_to_replace += werd_choice->fragment_length(i);
929 } else {
930 begin_blob_index += werd_choice->fragment_length(i);
931 }
932 }
933 BLOB_CHOICE_IT bit;
934 int temp_blob_index = begin_blob_index;
935 const char *temp_uch = NULL;
936 const char *correct_ngram_str =
937 getUnicharset().id_to_unichar(correct_ngram_id);
938 for (int replaced_count = 0; replaced_count < wrong_ngram_size;
939 ++replaced_count) {
940 if (blob_choices != NULL) {
941 UNICHAR_ID uch_id = werd_choice->unichar_id(wrong_ngram_begin_index);
942 int fraglen = werd_choice->fragment_length(wrong_ngram_begin_index);
943 if (fraglen > 1) temp_uch = getUnicharset().id_to_unichar(uch_id);
944 for (i = 0; i < fraglen; ++i) {
945 if (fraglen > 1) {
946 STRING frag_str =
947 CHAR_FRAGMENT::to_string(temp_uch, i, fraglen);
948 getUnicharset().unichar_insert(frag_str.string());
949 uch_id = getUnicharset().unichar_to_id(frag_str.string());
950 }
951 bit.set_to_list(blob_choices->get(temp_blob_index));
952 STRING correct_frag_uch =
953 CHAR_FRAGMENT::to_string(correct_ngram_str,
954 temp_blob_index - begin_blob_index,
955 num_blobs_to_replace);
956 getUnicharset().unichar_insert(correct_frag_uch.string());
957 UNICHAR_ID correct_frag_uch_id =
958 getUnicharset().unichar_to_id(correct_frag_uch.string());
959 // Find the WERD_CHOICE corresponding to the original unichar in
960 // the list of blob choices, add the derived character fragment
961 // before it with the same rating and certainty.
962 for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
963 if (bit.data()->unichar_id() == correct_frag_uch_id) {
964 break; // the unichar we want to insert is already there
965 }
966 if (bit.data()->unichar_id() == uch_id) {
967 bit.add_before_then_move(new BLOB_CHOICE(*(bit.data())));
968 bit.data()->set_unichar_id(correct_frag_uch_id);
969 if (modified_blobs != NULL) *modified_blobs = true;
970 break;
971 }
972 }
973 temp_blob_index++;
974 }
975 }
976 // Remove current unichar from werd_choice. On the last iteration
977 // set the correct replacement unichar instead of removing a unichar.
978 if (replaced_count + 1 == wrong_ngram_size) {
979 werd_choice->set_unichar_id(correct_ngram_id,
980 num_blobs_to_replace, 0.0, 0.0, wrong_ngram_begin_index);
981 } else {
982 werd_choice->remove_unichar_id(wrong_ngram_begin_index);
983 }
984 }
985 if (stopper_debug_level >= 1) {
986 tprintf("ReplaceAmbigs() modified werd_choice: %s\n",
987 werd_choice->debug_string(getUnicharset()).string());
988 werd_choice->print();
989 if (modified_blobs != NULL && *modified_blobs && blob_choices != NULL) {
990 tprintf("Modified blob_choices: ");
991 for (int i = 0; i < blob_choices->size(); ++i) {
992 print_ratings_list("\n", blob_choices->get(i), getUnicharset());
993 }
994 }
995 }
996 }
997
998
999 /*---------------------------------------------------------------------------*/
ChoiceSameAs(const WERD_CHOICE & WordChoice,VIABLE_CHOICE ViableChoice)1000 int Dict::ChoiceSameAs(const WERD_CHOICE &WordChoice,
1001 VIABLE_CHOICE ViableChoice) {
1002 /*
1003 ** Parameters:
1004 ** Choice choice to compare to ViableChoice
1005 ** ViableChoice viable choice to compare to Choice
1006 ** Operation: This routine compares the corresponding strings of
1007 ** Choice and ViableChoice and returns TRUE if they are the
1008 ** same, FALSE otherwise.
1009 ** Return: TRUE or FALSE.
1010 ** Exceptions: none
1011 ** History: Fri May 17 08:48:04 1991, DSJ, Created.
1012 */
1013 return (StringSameAs(WordChoice, ViableChoice));
1014
1015 } /* ChoiceSameAs */
1016 } // namespace tesseract
1017
1018
1019 /*---------------------------------------------------------------------------*/
CmpChoiceRatings(void * arg1,void * arg2)1020 int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1,
1021 void *arg2) { //VIABLE_CHOICE Choice2)
1022 /*
1023 ** Parameters:
1024 ** Choice1, Choice2 choices to compare ratings for
1025 ** Operation: Return -1 if the rating for Choice1 is less than the
1026 ** rating for Choice2, otherwise return (1).
1027 ** Return: -1 or 1
1028 ** Exceptions: none
1029 ** History: Wed May 15 13:02:37 1991, DSJ, Created.
1030 */
1031 float R1, R2;
1032 VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
1033 VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;
1034
1035 R1 = Choice1->Rating;
1036 R2 = Choice2->Rating;
1037
1038 if (R1 < R2)
1039 return (-1);
1040 else
1041 return (1);
1042
1043 } /* CmpChoiceRatings */
1044
1045
1046 /*---------------------------------------------------------------------------*/
ExpandChoice(VIABLE_CHOICE Choice,EXPANDED_CHOICE * ExpandedChoice)1047 void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice) {
1048 /*
1049 ** Parameters:
1050 ** Choice choice to be expanded
1051 ** ExpandedChoice place to put resulting expanded choice
1052 ** Operation: This routine expands Choice and places the results
1053 ** in ExpandedChoice. The primary function of expansion
1054 ** is to create an two arrays, one which holds the corresponding
1055 ** certainty for each chunk in Choice, and one which holds
1056 ** the class for each chunk.
1057 ** Return: none (results are placed in ExpandedChoice)
1058 ** Exceptions: none
1059 ** History: Fri May 31 15:21:57 1991, DSJ, Created.
1060 */
1061 int i, j, Chunk;
1062
1063 ExpandedChoice->Choice = Choice;
1064 for (i = 0, Chunk = 0; i < Choice->Length; i++)
1065 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
1066 ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty;
1067 ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class;
1068 }
1069 } /* ExpandChoice */
1070
1071 /*---------------------------------------------------------------------------*/
FreeBadChoice(void * item1,void * item2)1072 int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice,
1073 void *item2) { //EXPANDED_CHOICE *BestChoice)
1074 /*
1075 ** Parameters:
1076 ** Choice choice to be tested
1077 ** BestChoice best choice found
1078 ** Variables Used:
1079 ** stopper_ambiguity_threshold_gain
1080 ** stopper_ambiguity_threshold_offset
1081 ** Operation: If the certainty of any chunk in Choice is not ambiguous
1082 ** with the corresponding chunk in the best choice, free
1083 ** Choice and return TRUE. Otherwise, return FALSE.
1084 ** Return: TRUE or FALSE.
1085 ** Exceptions: none
1086 ** History: Wed May 15 13:20:26 1991, DSJ, Created.
1087 */
1088 int i, j, Chunk;
1089 FLOAT32 Threshold;
1090 VIABLE_CHOICE Choice;
1091 EXPANDED_CHOICE *BestChoice;
1092
1093 Choice = (VIABLE_CHOICE) item1;
1094 BestChoice = (EXPANDED_CHOICE *) item2;
1095
1096 Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor,
1097 Choice->AdjustFactor);
1098
1099 for (i = 0, Chunk = 0; i < Choice->Length; i++)
1100 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
1101 if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
1102 Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
1103 Threshold) {
1104 memfree(Choice);
1105 return (TRUE);
1106 }
1107
1108 return (FALSE);
1109
1110 } /* FreeBadChoice */
1111
1112
1113 /*---------------------------------------------------------------------------*/
1114 namespace tesseract {
LengthOfShortestAlphaRun(const WERD_CHOICE & WordChoice)1115 int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
1116 /*
1117 ** Parameters:
1118 ** Word word to be tested
1119 ** Operation: Return the length of the shortest alpha run in Word.
1120 ** Return: Return the length of the shortest alpha run in Word.
1121 ** Exceptions: none
1122 ** History: Tue May 14 07:50:45 1991, DSJ, Created.
1123 */
1124 register int Shortest = MAX_INT32;
1125 register int Length;
1126 int x;
1127 int y;
1128
1129 for (x = 0; x < WordChoice.length(); ++x) {
1130 if (getUnicharset().get_isalpha(WordChoice.unichar_id(x))) {
1131 for (y = x + 1, Length = 1;
1132 y < WordChoice.length() &&
1133 getUnicharset().get_isalpha(WordChoice.unichar_id(y));
1134 ++y, ++Length);
1135 if (Length < Shortest) {
1136 Shortest = Length;
1137 }
1138 if (y == WordChoice.length()) {
1139 break;
1140 }
1141 }
1142 }
1143 if (Shortest == MAX_INT32)
1144 Shortest = 0;
1145
1146 return (Shortest);
1147
1148 } /* LengthOfShortestAlphaRun */
1149
1150
1151 /*---------------------------------------------------------------------------*/
NewViableChoice(const WERD_CHOICE & WordChoice,FLOAT32 AdjustFactor,const float Certainties[])1152 VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice,
1153 FLOAT32 AdjustFactor,
1154 const float Certainties[]) {
1155 /*
1156 ** Parameters:
1157 ** Choice choice to be converted to a viable choice
1158 ** AdjustFactor factor used to adjust ratings for Choice
1159 ** Certainties certainty for each character in Choice
1160 ** Variables Used:
1161 ** current_segmentation segmentation corresponding to Choice
1162 ** Operation: Allocate a new viable choice data structure, copy
1163 ** Choice, Certainties, and current_segmentation_ into it,
1164 ** and return a pointer to it.
1165 ** Return: Ptr to new viable choice.
1166 ** Exceptions: none
1167 ** History: Thu May 16 15:28:29 1991, DSJ, Created.
1168 */
1169 int Length = WordChoice.length();
1170 assert (Length <= MAX_NUM_CHUNKS && Length > 0);
1171 VIABLE_CHOICE NewChoice = (VIABLE_CHOICE) Emalloc (
1172 sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof (CHAR_CHOICE));
1173 FillViableChoice(WordChoice, AdjustFactor, Certainties, false, NewChoice);
1174 return (NewChoice);
1175 } /* NewViableChoice */
1176
1177
1178 /*---------------------------------------------------------------------------*/
PrintViableChoice(FILE * File,const char * Label,VIABLE_CHOICE Choice)1179 void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
1180 /*
1181 ** Parameters:
1182 ** File open text file to print Choice to
1183 ** Label text label to be printed with Choice
1184 ** Choice choice to be printed
1185 ** Operation: This routine dumps a text representation of the
1186 ** specified Choice to File.
1187 ** Return: none
1188 ** Exceptions: none
1189 ** History: Mon May 20 11:16:44 1991, DSJ, Created.
1190 */
1191 int i, j;
1192
1193 fprintf (File, "%s", Label);
1194
1195 fprintf(File, "(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d) ",
1196 Choice->Rating, Choice->Certainty,
1197 Choice->AdjustFactor, Choice->ComposedFromCharFragments);
1198
1199 for (i = 0; i < Choice->Length; i++)
1200 fprintf(File, "%s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
1201 fprintf(File, "\n");
1202
1203 for (i = 0; i < Choice->Length; i++) {
1204 fprintf(File, " %s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
1205 for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
1206 fprintf(File, " ");
1207 }
1208 fprintf(File, "\n");
1209
1210 for (i = 0; i < Choice->Length; i++) {
1211 for (j = 0; j < Choice->Blob[i].NumChunks; j++)
1212 fprintf(File, "%3d ", (int) (Choice->Blob[i].Certainty * -10.0));
1213 }
1214 fprintf(File, "\n");
1215
1216 for (i = 0; i < Choice->Length; i++) {
1217 for (j = 0; j < Choice->Blob[i].NumChunks; j++)
1218 fprintf(File, "%3d ", Choice->Blob[i].NumChunks);
1219 }
1220 fprintf(File, "\n");
1221 } /* PrintViableChoice */
1222
1223
1224 /*---------------------------------------------------------------------------*/
FillViableChoice(const WERD_CHOICE & WordChoice,FLOAT32 AdjustFactor,const float Certainties[],bool SameString,VIABLE_CHOICE ViableChoice)1225 void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
1226 FLOAT32 AdjustFactor, const float Certainties[],
1227 bool SameString, VIABLE_CHOICE ViableChoice) {
1228 /*
1229 ** Parameters:
1230 ** WordChoice a choice with info that will be copied
1231 ** AdjustFactor factor used to adjust ratings for AChoice
1232 ** Certainties certainty for each character in AChoice
1233 ** SameString if true the string in the viable choice
1234 ** will not be changed
1235 ** ViableChoice existing viable choice to fill in
1236 ** Variables Used:
1237 ** current_segmentation_ segmentation for NewChoice
1238 ** Operation:
1239 ** Fill ViableChoice with information from AChoice,
1240 ** AdjustFactor, and Certainties.
1241 ** Return: none
1242 ** Exceptions: none
1243 ** History: Fri May 17 13:35:58 1991, DSJ, Created.
1244 */
1245 CHAR_CHOICE *NewChar;
1246 BLOB_WIDTH *BlobWidth;
1247 int x;
1248
1249 ViableChoice->Rating = WordChoice.rating();
1250 ViableChoice->Certainty = WordChoice.certainty();
1251 ViableChoice->AdjustFactor = AdjustFactor;
1252 ViableChoice->ComposedFromCharFragments = false;
1253 if (!SameString) {
1254 ViableChoice->Length = WordChoice.length();
1255 }
1256 for (x = 0,
1257 NewChar = &(ViableChoice->Blob[0]),
1258 BlobWidth = current_segmentation_;
1259 x < WordChoice.length();
1260 x++, NewChar++, Certainties++, BlobWidth++) {
1261 if (!SameString) {
1262 NewChar->Class = WordChoice.unichar_id(x);
1263 }
1264 NewChar->NumChunks = *BlobWidth;
1265 NewChar->Certainty = *Certainties;
1266 for (int i = 1; i < WordChoice.fragment_length(x); ++i) {
1267 BlobWidth++;
1268 assert(*BlobWidth > 0);
1269 NewChar->NumChunks += *BlobWidth;
1270 ViableChoice->ComposedFromCharFragments = true;
1271 }
1272 }
1273 } /* FillViableChoice */
1274
1275
1276 // Compares unichar ids in word_choice to those in viable_choice,
1277 // returns true if they are the same, false otherwise.
StringSameAs(const WERD_CHOICE & WordChoice,VIABLE_CHOICE ViableChoice)1278 bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,
1279 VIABLE_CHOICE ViableChoice) {
1280 if (WordChoice.length() != ViableChoice->Length) {
1281 return false;
1282 }
1283 int i;
1284 CHAR_CHOICE *CharChoice;
1285 for (i = 0, CharChoice = &(ViableChoice->Blob[0]);
1286 i < ViableChoice->Length; CharChoice++, i++) {
1287 if (CharChoice->Class != WordChoice.unichar_id(i)) {
1288 return false;
1289 }
1290 }
1291 return true;
1292 }
1293
1294 /*---------------------------------------------------------------------------*/
StringSameAs(const char * String,const char * String_lengths,VIABLE_CHOICE ViableChoice)1295 int Dict::StringSameAs(const char *String,
1296 const char *String_lengths,
1297 VIABLE_CHOICE ViableChoice) {
1298 /*
1299 ** Parameters:
1300 ** String string to compare to ViableChoice
1301 ** String_lengths lengths of unichars in String
1302 ** ViableChoice viable choice to compare to String
1303 ** Operation: This routine compares String to ViableChoice and
1304 ** returns TRUE if they are the same, FALSE otherwise.
1305 ** Return: TRUE or FALSE.
1306 ** Exceptions: none
1307 ** History: Fri May 17 08:48:04 1991, DSJ, Created.
1308 */
1309 CHAR_CHOICE *Char;
1310 int i;
1311 int current_unichar_length;
1312
1313 for (Char = &(ViableChoice->Blob[0]), i = 0;
1314 i < ViableChoice->Length;
1315 String += *(String_lengths++), Char++, i++) {
1316 current_unichar_length = strlen(getUnicharset().id_to_unichar(Char->Class));
1317 if (current_unichar_length != *String_lengths ||
1318 strncmp(String, getUnicharset().id_to_unichar(Char->Class),
1319 current_unichar_length) != 0)
1320 return (FALSE);
1321 }
1322
1323 if (*String == 0)
1324 return (TRUE);
1325 else
1326 return (FALSE);
1327
1328 } /* StringSameAs */
1329 } // namespace tesseract
1330
1331 /*---------------------------------------------------------------------------*/
UniformCertainties(const BLOB_CHOICE_LIST_VECTOR & Choices,const WERD_CHOICE & BestChoice)1332 int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
1333 const WERD_CHOICE &BestChoice) {
1334 /*
1335 ** Parameters:
1336 ** Choices choices for current segmentation
1337 ** BestChoice best choice for current segmentation
1338 ** Variables Used:
1339 ** stopper_allowable_character_badness
1340 ** max allowed certainty variation
1341 ** Operation: This routine returns TRUE if the certainty of the
1342 ** BestChoice word is within a reasonable range of the average
1343 ** certainties for the best choices for each character in
1344 ** the segmentation. This test is used to catch words in which
1345 ** one character is much worse than the other characters in
1346 ** the word (i.e. FALSE will be returned in that case).
1347 ** The algorithm computes the mean and std deviation of the
1348 ** certainties in the word with the worst certainty thrown out.
1349 ** Return: TRUE or FALSE.
1350 ** Exceptions: none
1351 ** History: Tue May 14 08:23:21 1991, DSJ, Created.
1352 */
1353 float Certainty;
1354 float WorstCertainty = MAX_FLOAT32;
1355 float CertaintyThreshold;
1356 FLOAT64 TotalCertainty;
1357 FLOAT64 TotalCertaintySquared;
1358 FLOAT64 Variance;
1359 FLOAT32 Mean, StdDev;
1360 int WordLength;
1361
1362 WordLength = Choices.length();
1363 if (WordLength < 3)
1364 return (TRUE);
1365
1366 TotalCertainty = TotalCertaintySquared = 0.0;
1367 BLOB_CHOICE_IT BlobChoiceIt;
1368 for (int i = 0; i < Choices.length(); ++i) {
1369 BlobChoiceIt.set_to_list(Choices.get(i));
1370 Certainty = BlobChoiceIt.data()->certainty();
1371 TotalCertainty += Certainty;
1372 TotalCertaintySquared += Certainty * Certainty;
1373 if (Certainty < WorstCertainty)
1374 WorstCertainty = Certainty;
1375 }
1376
1377 /* subtract off worst certainty from statistics */
1378 WordLength--;
1379 TotalCertainty -= WorstCertainty;
1380 TotalCertaintySquared -= WorstCertainty * WorstCertainty;
1381
1382 Mean = TotalCertainty / WordLength;
1383 Variance = ((WordLength * TotalCertaintySquared -
1384 TotalCertainty * TotalCertainty) /
1385 (WordLength * (WordLength - 1)));
1386 if (Variance < 0.0)
1387 Variance = 0.0;
1388 StdDev = sqrt (Variance);
1389
1390 CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
1391 if (CertaintyThreshold > stopper_nondict_certainty_base)
1392 CertaintyThreshold = stopper_nondict_certainty_base;
1393
1394 if (BestChoice.certainty() < CertaintyThreshold) {
1395 if (stopper_debug_level >= 1)
1396 cprintf("Stopper: Non-uniform certainty = %4.1f"
1397 " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
1398 BestChoice.certainty(), Mean, StdDev, CertaintyThreshold);
1399 return (FALSE);
1400 } else {
1401 return (TRUE);
1402 }
1403 } /* UniformCertainties */
1404