• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************************
2  **	Filename:    stopper.c
3  **	Purpose:     Stopping criteria for word classifier.
4  **	Author:      Dan Johnson
5  **	History:     Mon Apr 29 14:56:49 1991, DSJ, Created.
6  **
7  **	(c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 /**----------------------------------------------------------------------------
19           Include Files and Type Defines
20 ----------------------------------------------------------------------------**/
21 #include "stopper.h"
22 #include "emalloc.h"
23 #include "matchdefs.h"
24 #include "callcpp.h"
25 #include "permute.h"
26 #include "context.h"
27 #include "danerror.h"
28 #include "const.h"
29 #include "freelist.h"
30 #include "efio.h"
31 #include "scanutils.h"
32 #include "unichar.h"
33 #include "varable.h"
34 #include "dict.h"
35 #include "image.h"
36 #include "ccutil.h"
37 #include "ratngs.h"
38 #include "ambigs.h"
39 
40 #include <stdio.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <math.h>
44 #ifdef __UNIX__
45 #include <assert.h>
46 #endif
47 
48 /* these are kludges - add appropriate .h file later */
49 /* from adaptmatch.cpp */
50 #define MAX_WERD_SIZE   100
51 
52 typedef struct
53 {
54   VIABLE_CHOICE Choice;
55   float ChunkCertainty[MAX_NUM_CHUNKS];
56   UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
57 } EXPANDED_CHOICE;
58 
59 /**----------------------------------------------------------------------------
60           Macros
61 ----------------------------------------------------------------------------**/
62 #define BestCertainty(Choices)  (((VIABLE_CHOICE) first_node (Choices))->Certainty)
63 #define BestRating(Choices) (((VIABLE_CHOICE) first_node (Choices))->Rating)
64 #define BestFactor(Choices) (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)
65 
66 #define AmbigThreshold(F1,F2)	(((F2) - (F1)) * stopper_ambiguity_threshold_gain - \
67 				stopper_ambiguity_threshold_offset)
68 
69 /*---------------------------------------------------------------------------
70           Private Function Prototoypes
71 ----------------------------------------------------------------------------*/
72 void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
73 
74 int CmpChoiceRatings(void *arg1,   //VIABLE_CHOICE         Choice1,
75                      void *arg2);  //VIABLE_CHOICE         Choice2);
76 
77 void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice);
78 
79 int FreeBadChoice(void *item1,   //VIABLE_CHOICE                 Choice,
80                   void *item2);  //EXPANDED_CHOICE                       *BestChoice);
81 
82 int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
83                        const WERD_CHOICE &BestChoice);
84 
85 /**----------------------------------------------------------------------
86                      V a r i a b l e s
87 ----------------------------------------------------------------------**/
88 double_VAR(certainty_scale, 20.0, "Certainty scaling factor");
89 
90 double_VAR(stopper_nondict_certainty_base, -2.50,
91            "Certainty threshold for non-dict words");
92 
93 double_VAR(stopper_phase2_certainty_rejection_offset, 1.0,
94            "Reject certainty offset");
95 
96 INT_VAR(stopper_smallword_size, 2,
97         "Size of dict word to be treated as non-dict word");
98 
99 double_VAR(stopper_certainty_per_char, -0.50,
100            "Certainty to add for each dict char above small word size.");
101 
102 double_VAR(stopper_allowable_character_badness, 3.0,
103            "Max certaintly variation allowed in a word (in sigma)");
104 
105 INT_VAR(stopper_debug_level, 0, "Stopper debug level");
106 
107 double_VAR(stopper_ambiguity_threshold_gain, 8.0,
108            "Gain factor for ambiguity threshold");
109 
110 double_VAR(stopper_ambiguity_threshold_offset, 1.5,
111            "Certainty offset for ambiguity threshold");
112 
113 BOOL_VAR(stopper_no_acceptable_choices, false,
114          "Make AcceptableChoice() always return false. Useful"
115          " when there is a need to explore all segmentations");
116 
117 BOOL_VAR(save_raw_choices, false, "Save all explored raw choices");
118 
119 INT_VAR (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
120 
121 STRING_VAR(word_to_debug, "", "Word for which stopper debug information"
122            " should be printed to stdout");
123 
124 STRING_VAR(word_to_debug_lengths, "", "Lengths of unichars in word_to_debug");
125 
126 /**----------------------------------------------------------------------------
127               Public Code
128 ----------------------------------------------------------------------------**/
129 /*---------------------------------------------------------------------------*/
130 namespace tesseract {
AcceptableChoice(BLOB_CHOICE_LIST_VECTOR * Choices,WERD_CHOICE * BestChoice,const WERD_CHOICE & RawChoice,DANGERR * fixpt,ACCEPTABLE_CHOICE_CALLER caller,bool * modified_blobs)131 int Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
132                            WERD_CHOICE *BestChoice,
133                            const WERD_CHOICE &RawChoice,
134                            DANGERR *fixpt,
135                            ACCEPTABLE_CHOICE_CALLER caller,
136                            bool *modified_blobs) {
137 /*
138  **	Parameters:
139  **		Choices		choices for current segmentation
140  **		BestChoice	best choice for current segmentation
141  **		RawChoice	best raw choice for current segmentation
142  **	Variables Used:
143  **		stopper_nondict_certainty_base	certainty for a non-dict word
144  **		stopper_smallword_size		size of word to be treated as non-word
145  **		stopper_certainty_per_char	certainty to add for each dict char
146  **	Operation: Return TRUE if the results from this segmentation are
147  **		good enough to stop.  Otherwise return FALSE.
148  **	Return: TRUE or FALSE.
149  **	Exceptions: none
150  **	History: Mon Apr 29 14:57:32 1991, DSJ, Created.
151  */
152   float CertaintyThreshold = stopper_nondict_certainty_base;
153   int WordSize;
154 
155   if (stopper_no_acceptable_choices) return false;
156 
157   if (fixpt != NULL)
158     fixpt->index = -1;
159   if (BestChoice->length() == 0)
160     return (FALSE);
161   if (caller == CHOPPER_CALLER && BestChoice->fragment_mark()) {
162     if (stopper_debug_level >= 1) {
163       cprintf("AcceptableChoice(): a choice with fragments beats BestChoice");
164     }
165     return false;
166   }
167 
168   bool no_dang_ambigs =
169     NoDangerousAmbig(BestChoice, fixpt, true, Choices, modified_blobs);
170 
171   if (stopper_debug_level >= 1)
172     tprintf("\nStopper:  %s (word=%c, case=%c)\n",
173             BestChoice->debug_string(getUnicharset()).string(),
174             (valid_word(*BestChoice) ? 'y' : 'n'),
175             (Context::case_ok(*BestChoice, getUnicharset()) ? 'y' : 'n'));
176 
177   if (valid_word(*BestChoice) &&
178       Context::case_ok(*BestChoice, getUnicharset())) {
179     WordSize = LengthOfShortestAlphaRun(*BestChoice);
180     WordSize -= stopper_smallword_size;
181     if (WordSize < 0)
182       WordSize = 0;
183     CertaintyThreshold += WordSize * stopper_certainty_per_char;
184   }
185 
186   if (stopper_debug_level >= 1)
187     tprintf("Stopper:  Certainty = %4.1f, Threshold = %4.1f\n",
188             BestChoice->certainty(), CertaintyThreshold);
189 
190   if (no_dang_ambigs &&
191       BestChoice->certainty() > CertaintyThreshold &&
192       UniformCertainties(*Choices, *BestChoice)) {
193     return (TRUE);
194   } else {
195     return (FALSE);
196   }
197 }                                /* AcceptableChoice */
198 
199 
200 /*---------------------------------------------------------------------------*/
AcceptableResult(const WERD_CHOICE & BestChoice,const WERD_CHOICE & RawChoice)201 int Dict::AcceptableResult(const WERD_CHOICE &BestChoice,
202                            const WERD_CHOICE &RawChoice) {
203 /*
204  **	Parameters:
205  **		BestChoice	best choice for current word
206  **		RawChoice	best raw choice for current word
207  **	Variables Used:
208  **		stopper_nondict_certainty_base	certainty for a non-dict word
209  **		stopper_smallword_size		size of word to be treated as non-word
210  **		stopper_certainty_per_char	certainty to add for each dict char
211  **		best_choices_		list of all good choices found
212  **		reject_offset_		allowed offset before a word is rejected
213  **	Operation: Return FALSE if the best choice for the current word
214  **		is questionable and should be tried again on the second
215  **		pass or should be flagged to the user.
216  **	Return: TRUE or FALSE.
217  **	Exceptions: none
218  **	History: Thu May  9 14:05:05 1991, DSJ, Created.
219  */
220   float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
221   int WordSize;
222 
223   if (stopper_debug_level >= 1) {
224     tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
225             BestChoice.debug_string(getUnicharset()).string(),
226             (valid_word(BestChoice) ? 'y' : 'n'),
227             (Context::case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
228             ((rest (best_choices_) != NIL) ? 'n' : 'y'));
229   }
230 
231   if (BestChoice.length() == 0 || CurrentWordAmbig())
232     return (FALSE);
233   if (BestChoice.fragment_mark()) {
234     if (stopper_debug_level >= 1) {
235       cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n");
236     }
237     return false;
238   }
239   if (valid_word(BestChoice) &&
240       Context::case_ok(BestChoice, getUnicharset())) {
241     WordSize = LengthOfShortestAlphaRun(BestChoice);
242     WordSize -= stopper_smallword_size;
243     if (WordSize < 0)
244       WordSize = 0;
245     CertaintyThreshold += WordSize * stopper_certainty_per_char;
246   }
247 
248   if (stopper_debug_level >= 1)
249     cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ",
250       BestChoice.certainty(), CertaintyThreshold);
251 
252   if (BestChoice.certainty() > CertaintyThreshold &&
253       !stopper_no_acceptable_choices) {
254     if (stopper_debug_level >= 1)
255       cprintf("ACCEPTED\n");
256     return (TRUE);
257   }
258   else {
259     if (stopper_debug_level >= 1)
260       cprintf("REJECTED\n");
261     return (FALSE);
262   }
263 }                                /* AcceptableResult */
264 
265 
266 /*---------------------------------------------------------------------------*/
AlternativeChoicesWorseThan(FLOAT32 Threshold)267 int Dict::AlternativeChoicesWorseThan(FLOAT32 Threshold) {
268 /*
269  **	Parameters:
270  **		Threshold	minimum adjust factor for alternative choices
271  **	Variables Used:
272  **		best_choices_	alternative choices for current word
273  **	Operation: This routine returns TRUE if there are no alternative
274  **		choices for the current word OR if all alternatives have
275  **		an adjust factor worse than Threshold.
276  **	Return: TRUE or FALSE.
277  **	Exceptions: none
278  **	History: Mon Jun  3 09:36:31 1991, DSJ, Created.
279  */
280   LIST Alternatives;
281   VIABLE_CHOICE Choice;
282 
283   Alternatives = rest (best_choices_);
284   iterate(Alternatives) {
285     Choice = (VIABLE_CHOICE) first_node (Alternatives);
286     if (Choice->AdjustFactor <= Threshold)
287       return (FALSE);
288   }
289 
290   return (TRUE);
291 
292 }                                /* AlternativeChoicesWorseThan */
293 
294 
295 /*---------------------------------------------------------------------------*/
CurrentBestChoiceIs(const WERD_CHOICE & WordChoice)296 int Dict::CurrentBestChoiceIs(const WERD_CHOICE &WordChoice) {
297 /*
298  **	Parameters:
299  **             Word            word that will be compared to the best choice
300  **	Variables Used:
301  **		best_choices_	set of best choices for current word
302  **	Operation: Returns TRUE if Word is the same as the current best
303  **		choice, FALSE otherwise.
304  **	Return: TRUE or FALSE
305  **	Exceptions: none
306  **	History: Thu May 30 14:44:22 1991, DSJ, Created.
307  */
308   return (best_choices_ != NIL &&
309           StringSameAs(WordChoice, (VIABLE_CHOICE)first_node(best_choices_)));
310 }                                /* CurrentBestChoiceIs */
311 
312 
313 /*---------------------------------------------------------------------------*/
CurrentBestChoiceAdjustFactor()314 FLOAT32 Dict::CurrentBestChoiceAdjustFactor() {
315 /*
316  **	Parameters: none
317  **	Variables Used:
318  **		best_choices_	set of best choices for current word
319  **	Operation: Return the adjustment factor for the best choice for
320  **		the current word.
321  **	Return: Adjust factor for current best choice.
322  **	Exceptions: none
323  **	History: Thu May 30 14:48:24 1991, DSJ, Created.
324  */
325   VIABLE_CHOICE BestChoice;
326 
327   if (best_choices_ == NIL)
328     return (MAX_FLOAT32);
329 
330   BestChoice = (VIABLE_CHOICE) first_node (best_choices_);
331   return (BestChoice->AdjustFactor);
332 
333 }                                /* CurrentBestChoiceAdjustFactor */
334 
335 
336 /*---------------------------------------------------------------------------*/
CurrentWordAmbig()337 int Dict::CurrentWordAmbig() {
338 /*
339  **	Parameters: none
340  **	Variables Used:
341  **		best_choices_	set of best choices for current word
342  **	Operation: This routine returns TRUE if there are multiple good
343  **		choices for the current word and FALSE otherwise.
344  **	Return: TRUE or FALSE
345  **	Exceptions: none
346  **	History: Wed May 22 15:38:38 1991, DSJ, Created.
347  */
348   return (rest (best_choices_) != NIL);
349 
350 }                                /* CurrentWordAmbig */
351 
352 
353 /*---------------------------------------------------------------------------*/
DebugWordChoices()354 void Dict::DebugWordChoices() {
355 /*
356  **	Parameters: none
357  **	Variables Used:
358  **		best_raw_choice_
359  **		best_choices_
360  **	Operation: Print the current choices for this word to stdout.
361  **	Return: none
362  **	Exceptions: none
363  **	History: Wed May 15 13:52:08 1991, DSJ, Created.
364  */
365   LIST Choices;
366   int i;
367   char LabelString[80];
368   VIABLE_CHOICE VChoice = (VIABLE_CHOICE)first_node(best_choices_);
369   bool force_debug =
370     fragments_debug && VChoice != NULL && VChoice->ComposedFromCharFragments;
371 
372   if (stopper_debug_level >= 1 || force_debug ||
373   (((STRING)word_to_debug).length() > 0 && best_choices_ &&
374        StringSameAs(word_to_debug.string(), word_to_debug_lengths.string(),
375                     (VIABLE_CHOICE)first_node(best_choices_)))) {
376     if (best_raw_choice_)
377       PrintViableChoice(stderr, "\nBest Raw Choice:   ", best_raw_choice_);
378 
379     i = 1;
380     Choices = best_choices_;
381     if (Choices)
382       cprintf("\nBest Cooked Choices:\n");
383     iterate(Choices) {
384       sprintf(LabelString, "Cooked Choice #%d:  ", i);
385       PrintViableChoice(stderr, LabelString,
386                         (VIABLE_CHOICE)first_node(Choices));
387       i++;
388     }
389   }
390 }                                /* DebugWordChoices */
391 
392 // Print all the choices in raw_choices_ list for non 1-1 ambiguities.
PrintAmbigAlternatives(FILE * file,const char * label,int label_num_unichars)393 void Dict::PrintAmbigAlternatives(FILE *file, const char *label,
394                                   int label_num_unichars) {
395   iterate(raw_choices_) {
396     VIABLE_CHOICE Choice = (VIABLE_CHOICE)first_node(raw_choices_);
397     if (Choice->Length > 0 &&
398         (label_num_unichars > 1 || Choice->Length > 1)) {
399       for (int i = 0; i < Choice->Length; i++) {
400         fprintf(file, "%s",
401                 getUnicharset().id_to_unichar(Choice->Blob[i].Class));
402       }
403       fflush(file);
404       fprintf(file, "\t%s\t%.4f\t%.4f\n", label,
405               Choice->Rating, Choice->Certainty);
406     }
407   }
408 }
409 
410 /*---------------------------------------------------------------------------*/
FilterWordChoices()411 void Dict::FilterWordChoices() {
412 /*
413  **	Parameters: none
414  **	Variables Used:
415  **		best_choices_	set of choices for current word
416  **	Operation: This routine removes from best_choices_ all choices which
417  **		are not within a reasonable range of the best choice.
418  **	Return: none
419  **	Exceptions: none
420  **	History: Wed May 15 13:08:24 1991, DSJ, Created.
421  */
422   EXPANDED_CHOICE BestChoice;
423 
424   if (best_choices_ == NIL || second_node (best_choices_) == NIL)
425     return;
426 
427   /* compute certainties and class for each chunk in best choice */
428   ExpandChoice((VIABLE_CHOICE_STRUCT *)first_node(best_choices_), &BestChoice);
429 
430   set_rest (best_choices_, delete_d (rest (best_choices_),
431     &BestChoice, FreeBadChoice));
432 
433 }                                /* FilterWordChoices */
434 
435 /*---------------------------------------------------------------------------*/
FindClassifierErrors(FLOAT32 MinRating,FLOAT32 MaxRating,FLOAT32 RatingMargin,FLOAT32 Thresholds[])436 void Dict::FindClassifierErrors(FLOAT32 MinRating,
437                           FLOAT32 MaxRating,
438                           FLOAT32 RatingMargin,
439                           FLOAT32 Thresholds[]) {
440 /*
441  **	Parameters:
442  **		MinRating		limits how tight to make a template
443  **		MaxRating		limits how loose to make a template
444  **		RatingMargin		amount of margin to put in template
445  **		Thresholds[]		place to put error thresholds
446  **	Operation: This routine compares the best choice for the current
447  **		word to the best raw choice to determine which characters
448  **		were classified incorrectly by the classifier.  It then
449  **		places a separate threshold into Thresholds for each
450  **		character in the word.  If the classifier was correct,
451  **		MaxRating is placed into Thresholds.  If the
452  **		classifier was incorrect, the avg. match rating (error
453  **		percentage) of the classifier's incorrect choice minus
454  **		some margin is
455  **		placed into thresholds.  This can then be used by the
456  **		caller to try to create a new template for the desired
457  **		class that will classify the character with a rating better
458  **		than the threshold value.  The match rating placed into
459  **		Thresholds is never allowed to be below MinRating in order
460  **		to prevent trying to make overly tight templates.
461  **	Return: none (results are placed in Thresholds)
462  **	Exceptions: none
463  **	History: Fri May 31 16:02:57 1991, DSJ, Created.
464  */
465   EXPANDED_CHOICE BestRaw;
466   VIABLE_CHOICE Choice;
467   int i, j, Chunk;
468   FLOAT32 AvgRating;
469   int NumErrorChunks;
470 
471   assert (best_choices_ != NIL);
472   assert (best_raw_choice_ != NULL);
473 
474   ExpandChoice(best_raw_choice_, &BestRaw);
475   Choice = (VIABLE_CHOICE) first_node (best_choices_);
476 
477   for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) {
478     AvgRating = 0.0;
479     NumErrorChunks = 0;
480 
481     for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
482       if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) {
483         AvgRating += BestRaw.ChunkCertainty[Chunk];
484         NumErrorChunks++;
485       }
486     }
487 
488     if (NumErrorChunks > 0) {
489       AvgRating /= NumErrorChunks;
490       *Thresholds = (AvgRating / -certainty_scale) * (1.0 - RatingMargin);
491     }
492     else
493       *Thresholds = MaxRating;
494 
495     if (*Thresholds > MaxRating)
496       *Thresholds = MaxRating;
497     if (*Thresholds < MinRating)
498       *Thresholds = MinRating;
499   }
500 }                                /* FindClassifierErrors */
501 
502 
503 /*---------------------------------------------------------------------------*/
InitChoiceAccum()504 void Dict::InitChoiceAccum() {
505 /*
506  **	Parameters: none
507  **	Operation: This routine initializes the data structures used to
508  **		keep track the good word choices found for a word.
509  **	Return: none
510  **	Exceptions: none
511  **	History: Fri May 17 07:59:00 1991, DSJ, Created.
512  */
513   BLOB_WIDTH *BlobWidth, *End;
514 
515   if (best_raw_choice_)
516     memfree(best_raw_choice_);
517   best_raw_choice_ = NULL;
518 
519   if (best_choices_)
520     destroy_nodes(best_choices_, memfree);
521   best_choices_ = NIL;
522 
523   if (raw_choices_)
524     destroy_nodes(raw_choices_, memfree);
525   raw_choices_ = NIL;
526 
527   EnableChoiceAccum();
528 
529   for (BlobWidth = current_segmentation_,
530     End = current_segmentation_ + MAX_NUM_CHUNKS;
531     BlobWidth < End; *BlobWidth++ = 1);
532 
533 }                                /* InitChoiceAccum */
534 
535 
536 /*---------------------------------------------------------------------------*/
LogNewSegmentation(PIECES_STATE BlobWidth)537 void Dict::LogNewSegmentation(PIECES_STATE BlobWidth) {
538 /*
539  **	Parameters:
540  **		BlobWidth[]	number of chunks in each blob in segmentation
541  **	Variables Used:
542  **		current_segmentation	blob widths for current segmentation
543  **	Operation: This routine updates the blob widths in current_segmentation
544  **		to be the same as provided in BlobWidth.
545  **	Return: none
546  **	Exceptions: none
547  **	History: Mon May 20 11:52:26 1991, DSJ, Created.
548  */
549   BLOB_WIDTH *Segmentation;
550 
551   for (Segmentation = current_segmentation_; *BlobWidth != 0;
552     BlobWidth++, Segmentation++)
553   *Segmentation = *BlobWidth;
554   *Segmentation = 0;
555 
556 }                                /* LogNewSegmentation */
557 
558 
559 /*---------------------------------------------------------------------------*/
LogNewSplit(int Blob)560 void Dict::LogNewSplit(int Blob) {
561 /*
562  **	Parameters:
563  **		Blob	index of blob that was split
564  **	Variables Used:
565  **		best_raw_choice_	current best raw choice
566  **		best_choices_	list of best choices found so far
567  **	Operation: This routine adds 1 chunk to the specified blob for each
568  **		choice in best_choices_ and for the best_raw_choice_.
569  **	Return: none
570  **	Exceptions: none
571  **	History: Mon May 20 11:38:56 1991, DSJ, Created.
572  */
573   LIST Choices;
574 
575   if (best_raw_choice_) {
576     AddNewChunk(best_raw_choice_, Blob);
577   }
578 
579   Choices = best_choices_;
580   iterate(Choices) {
581     AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
582   }
583   Choices = raw_choices_;
584   iterate(Choices) {
585     AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
586   }
587 }                                /* LogNewSplit */
588 
589 
590 /*---------------------------------------------------------------------------*/
LogNewChoice(const WERD_CHOICE & WordChoice,FLOAT32 AdjustFactor,const float Certainties[],bool raw_choice)591 void Dict::LogNewChoice(const WERD_CHOICE &WordChoice,
592                             FLOAT32 AdjustFactor,
593                         const float Certainties[],
594                         bool raw_choice) {
595 /*
596  **	Parameters:
597  **		Choice		new choice for current word
598  **		AdjustFactor	adjustment factor which was applied to choice
599  **		Certainties	certainties for each char in new choice
600  **		ChoicesList	list with choices seen so far
601  **     Variables Used:
602  **		best_raw_choice_	best raw choice so far for current word
603  **	Operation: This routine adds Choice to ChoicesList if the
604  **		adjusted certainty for Choice is within a reasonable range
605  **		of the best choice in ChoicesList.  The ChoicesList
606  **		list is kept in sorted order by rating. Duplicates are
607  **		removed.
608  **	Return: none
609  **	Exceptions: none
610  **	History: Wed May 15 09:57:19 1991, DSJ, Created.
611  */
612   VIABLE_CHOICE NewChoice;
613   LIST ChoicesList;
614   LIST Choices;
615   FLOAT32 Threshold;
616 
617   if (!keep_word_choices_)
618     return;
619 
620   if (raw_choice) {
621     if (!best_raw_choice_)
622       best_raw_choice_ = NewViableChoice(WordChoice, AdjustFactor, Certainties);
623     else if (WordChoice.rating() < best_raw_choice_->Rating) {
624       if (ChoiceSameAs(WordChoice, best_raw_choice_))
625         FillViableChoice(WordChoice, AdjustFactor, Certainties, true,
626                          best_raw_choice_);
627       else {
628         memfree(best_raw_choice_);
629         best_raw_choice_ =
630           NewViableChoice(WordChoice, AdjustFactor, Certainties);
631       }
632     }
633     if (!save_raw_choices) return;
634     ChoicesList = raw_choices_;
635   } else {
636     ChoicesList = best_choices_;
637   }
638 
639   /* throw out obviously bad choices to save some work */
640   if (ChoicesList != NIL) {
641     Threshold = AmbigThreshold (BestFactor (ChoicesList), AdjustFactor);
642     if (Threshold > -stopper_ambiguity_threshold_offset)
643       Threshold = -stopper_ambiguity_threshold_offset;
644     if (WordChoice.certainty() - BestCertainty (ChoicesList) < Threshold)
645       return;
646   }
647 
648   /* see if a choice with the same text string has already been found */
649   NewChoice = NULL;
650   Choices = ChoicesList;
651 
652   iterate(Choices) {
653     if (ChoiceSameAs (WordChoice, (VIABLE_CHOICE) first_node (Choices))) {
654       if (WordChoice.rating() < BestRating (Choices)) {
655         NewChoice = (VIABLE_CHOICE) first_node (Choices);
656       } else {
657         return;
658       }
659     }
660   }
661 
662   if (NewChoice) {
663     FillViableChoice(WordChoice, AdjustFactor, Certainties, true, NewChoice);
664     ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node);
665   }
666   else {
667     NewChoice = NewViableChoice (WordChoice, AdjustFactor, Certainties);
668   }
669 
670   ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
671   if (stopper_debug_level >= 2)
672     raw_choice ? PrintViableChoice (stderr, "New Raw Choice:  ", NewChoice) :
673     PrintViableChoice (stderr, "New Word Choice:  ", NewChoice);
674   if (count (ChoicesList) > tessedit_truncate_wordchoice_log) {
675     Choices =
676       (LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log);
677     destroy_nodes (rest (Choices), Efree);
678     set_rest(Choices, NIL);
679   }
680 
681   // Update raw_choices_/best_choices_ pointer.
682   if (raw_choice) {
683     raw_choices_ = ChoicesList;
684   } else {
685     best_choices_ = ChoicesList;
686   }
687 }                                /* LogNewChoice */
688 
689 
690 /*---------------------------------------------------------------------------*/
NoDangerousAmbig(WERD_CHOICE * best_choice,DANGERR * fix_pt,bool fix_replaceable,BLOB_CHOICE_LIST_VECTOR * blob_choices,bool * modified_blobs)691 int Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
692                            DANGERR *fix_pt,
693                            bool fix_replaceable,
694                            BLOB_CHOICE_LIST_VECTOR *blob_choices,
695                            bool *modified_blobs) {
696   if (stopper_debug_level > 2) {
697     tprintf("\nRunning NoDangerousAmbig() for %s\n",
698             best_choice->debug_string(getUnicharset()).string());
699   }
700 
701   // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
702   // for each unichar id in BestChoice.
703   BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
704   int i;
705   bool modified_best_choice = false;
706   bool ambigs_found = false;
707   // For each position in best_choice:
708   // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
709   // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
710   // -- look for ambiguities corresponding to wrong_ngram in the list while
711   //    adding the following unichar_ids from best_choice to wrong_ngram
712   //
713   // Repeat the above procedure twice: first time look through
714   // ambigs to be replaced and replace all the ambiguities found;
715   // second time look through dangerous ambiguities and construct
716   // ambig_blob_choices with fake a blob choice for each ambiguity
717   // and pass them to dawg_permute_and_select() to search for
718   // ambiguous words in the dictionaries.
719   //
720   // Note that during the execution of the for loop (on the first pass)
721   // if replacements are made the length of best_choice might change.
722   for (int pass = 0; pass < 2; ++pass) {
723     bool replace = (pass == 0);
724     const UnicharAmbigsVector &table = replace ?
725       getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
726     if (!replace) {
727       // Initialize ambig_blob_choices with lists containing a single
728       // unichar id for the correspoding position in best_choice.
729       // best_choice consisting from only the original letters will
730       // have a rating of 0.0.
731       for (i = 0; i < best_choice->length(); ++i) {
732         BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
733         BLOB_CHOICE_IT lst_it(lst);
734         lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
735                                           0.0, 0.0, 0, -1));
736         ambig_blob_choices.push_back(lst);
737       }
738     }
739     UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
740     int wrong_ngram_index;
741     int next_index;
742     for (i = 0; i < best_choice->length(); ++i) {
743       UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
744       if (stopper_debug_level > 2) {
745         tprintf("Looking for %s ngrams starting with %s:\n",
746                 replace ? "replaceable" : "ambiguous",
747                 getUnicharset().debug_str(curr_unichar_id).string());
748       }
749       wrong_ngram_index = 0;
750       wrong_ngram[wrong_ngram_index] = curr_unichar_id;
751       if (curr_unichar_id == INVALID_UNICHAR_ID ||
752           curr_unichar_id >= table.size() ||
753           table[curr_unichar_id] == NULL) {
754         continue;  // there is no ambig spec for this unichar id
755       }
756       AmbigSpec_IT spec_it(table[curr_unichar_id]);
757       for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
758         const AmbigSpec *ambig_spec = spec_it.data();
759         wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
760         int compare = UnicharIdArrayUtils::compare(wrong_ngram,
761                                                    ambig_spec->wrong_ngram);
762         if (stopper_debug_level > 2) {
763           tprintf("candidate ngram: ");
764           UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
765           tprintf("current ngram from spec: ");
766           UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
767           tprintf("comparison result: %d\n", compare);
768         }
769         if (compare == 0) {
770           if (replace) {
771             if (stopper_debug_level > 2) {
772               tprintf("replace ambiguity with: ");
773               UnicharIdArrayUtils::print(
774                   ambig_spec->correct_fragments, getUnicharset());
775             }
776             ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
777                          ambig_spec->correct_ngram_id,
778                          best_choice, blob_choices, modified_blobs);
779             modified_best_choice = true;
780           } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
781             // We found dang ambig - update ambig_blob_choices.
782             if (stopper_debug_level > 2) {
783               tprintf("found ambiguity: ");
784               UnicharIdArrayUtils::print(
785                   ambig_spec->correct_fragments, getUnicharset());
786             }
787             ambigs_found = true;
788             for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
789                  ++tmp_index) {
790               // Add a blob choice for the corresponding fragment of the
791               // ambiguity. These fake blob choices are initialized with
792               // negative ratings (which are not possible for real blob
793               // choices), so that dawg_permute_and_select() considers any
794               // word not consisting of only the original letters a better
795               // choice and stops searching for alternatives once such a
796               // choice is found.
797               BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
798               bc_it.add_to_end(new BLOB_CHOICE(
799                   ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, 0, -1));
800             }
801           }
802           spec_it.forward();
803         } else if (compare == -1) {
804           if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
805               ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
806             // Add the next unichar id to wrong_ngram and keep looking for
807             // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
808             wrong_ngram[++wrong_ngram_index] =
809               best_choice->unichar_id(next_index);
810     } else {
811             break;  // no more matching ambigs in this AMBIG_SPEC_LIST
812     }
813         } else {
814           spec_it.forward();
815   }
816       }  // end searching AmbigSpec_LIST
817     }  // end searching best_choice
818   }  // end searching replace and dangerous ambigs
819   if (modified_best_choice) best_choice->populate_unichars(getUnicharset());
820   // If any ambiguities were found permute the constructed ambig_blob_choices
821   // to see if an alternative dictionary word can be found.
822   if (ambigs_found) {
823     if (stopper_debug_level > 2) {
824       tprintf("\nResulting ambig_blob_choices:\n");
825       for (i = 0; i < ambig_blob_choices.length(); ++i) {
826         print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
827         tprintf("\n");
828     }
829   }
830     WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
831     ambigs_found = (alt_word->rating() < 0.0);
832     if (ambigs_found && stopper_debug_level >= 1) {
833       tprintf ("Stopper: Possible ambiguous word = %s\n",
834                alt_word->debug_string(getUnicharset()).string());
835     }
836     delete alt_word;
837   }
838   ambig_blob_choices.delete_data_pointers();
839   return !ambigs_found;
840 }
841 
EndDangerousAmbigs()842 void Dict::EndDangerousAmbigs() {}
843 
844 /*---------------------------------------------------------------------------*/
SettupStopperPass1()845 void Dict::SettupStopperPass1() {
846 /*
847  **	Parameters: none
848  **	Variables Used:
849  **		reject_offset_	offset allowed before word is rejected
850  **	Operation: This routine performs any settup of stopper variables
851  **		that is needed in preparation for the first pass.
852  **	Return: none
853  **	Exceptions: none
854  **	History: Mon Jun  3 12:32:00 1991, DSJ, Created.
855  */
856   reject_offset_ = 0.0;
857 }                                /* SettupStopperPass1 */
858 
859 
860 /*---------------------------------------------------------------------------*/
SettupStopperPass2()861 void Dict::SettupStopperPass2() {
862 /*
863  **	Parameters: none
864  **	Variables Used:
865  **		reject_offset_	offset allowed before word is rejected
866  **	Operation: This routine performs any settup of stopper variables
867  **		that is needed in preparation for the second pass.
868  **	Return: none
869  **	Exceptions: none
870  **	History: Mon Jun  3 12:32:00 1991, DSJ, Created.
871  */
872   reject_offset_ = stopper_phase2_certainty_rejection_offset;
873 }                                /* SettupStopperPass2 */
874 }  // namespace tesseract
875 
876 
877 /**----------------------------------------------------------------------------
878               Private Code
879 ----------------------------------------------------------------------------**/
880 /*---------------------------------------------------------------------------*/
AddNewChunk(VIABLE_CHOICE Choice,int Blob)881 void AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
882 /*
883  **	Parameters:
884  **		Choice	choice to add a new chunk to
885  **		Blob	index of blob being split
886  **	Operation: This routine increments the chunk count of the character
887  **		in Choice which corresponds to Blob.
888  **	Return: none
889  **	Exceptions: none
890  **	History: Mon May 20 11:43:27 1991, DSJ, Created.
891  */
892   int i, LastChunk;
893 
894   for (i = 0, LastChunk = 0; i < Choice->Length; i++) {
895     LastChunk += Choice->Blob[i].NumChunks;
896     if (Blob < LastChunk) {
897       (Choice->Blob[i].NumChunks)++;
898       return;
899     }
900   }
901   mem_tidy (1);
902   cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
903            Choice->Length, LastChunk, Blob);
904   assert(FALSE);  /* this should never get executed */
905 
906 }                                /* AddNewChunk */
907 
908 
909 /*---------------------------------------------------------------------------*/
910 namespace tesseract {
911 // Replaces the corresponding wrong ngram in werd_choice with the correct one.
912 // We indicate that this newly inserted ngram unichar is composed from several
913 // fragments and modify the corresponding entries in blob_choices to contain
914 // fragments of the correct ngram unichar instead of the original unichars.
915 // Ratings and certainties of entries in blob_choices and werd_choice are
916 // unichaged. E.g. for werd_choice mystring'' and ambiguity ''->":
917 // werd_choice becomes mystring", first ' in blob_choices becomes |"|0|2,
918 // second one is set to |"|1|2.
ReplaceAmbig(int wrong_ngram_begin_index,int wrong_ngram_size,UNICHAR_ID correct_ngram_id,WERD_CHOICE * werd_choice,BLOB_CHOICE_LIST_VECTOR * blob_choices,bool * modified_blobs)919 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
920                         UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
921                         BLOB_CHOICE_LIST_VECTOR *blob_choices,
922                         bool *modified_blobs) {
923   int num_blobs_to_replace = 0;
924   int begin_blob_index = 0;
925   int i;
926   for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
927     if (i >= wrong_ngram_begin_index) {
928       num_blobs_to_replace +=  werd_choice->fragment_length(i);
929     } else {
930       begin_blob_index += werd_choice->fragment_length(i);
931       }
932         }
933   BLOB_CHOICE_IT bit;
934   int temp_blob_index = begin_blob_index;
935   const char *temp_uch = NULL;
936   const char *correct_ngram_str =
937     getUnicharset().id_to_unichar(correct_ngram_id);
938   for (int replaced_count = 0; replaced_count < wrong_ngram_size;
939        ++replaced_count) {
940     if (blob_choices != NULL) {
941       UNICHAR_ID uch_id = werd_choice->unichar_id(wrong_ngram_begin_index);
942       int fraglen = werd_choice->fragment_length(wrong_ngram_begin_index);
943       if (fraglen > 1) temp_uch = getUnicharset().id_to_unichar(uch_id);
944       for (i = 0; i < fraglen; ++i) {
945         if (fraglen > 1) {
946           STRING frag_str =
947             CHAR_FRAGMENT::to_string(temp_uch, i, fraglen);
948           getUnicharset().unichar_insert(frag_str.string());
949           uch_id = getUnicharset().unichar_to_id(frag_str.string());
950         }
951         bit.set_to_list(blob_choices->get(temp_blob_index));
952         STRING correct_frag_uch =
953           CHAR_FRAGMENT::to_string(correct_ngram_str,
954                                    temp_blob_index - begin_blob_index,
955                                    num_blobs_to_replace);
956         getUnicharset().unichar_insert(correct_frag_uch.string());
957         UNICHAR_ID correct_frag_uch_id =
958           getUnicharset().unichar_to_id(correct_frag_uch.string());
959         // Find the WERD_CHOICE corresponding to the original unichar in
960         // the list of blob choices, add the derived character fragment
961         // before it with the same rating and certainty.
962         for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
963           if (bit.data()->unichar_id() == correct_frag_uch_id) {
964             break;  // the unichar we want to insert is already there
965           }
966           if (bit.data()->unichar_id() == uch_id) {
967             bit.add_before_then_move(new BLOB_CHOICE(*(bit.data())));
968             bit.data()->set_unichar_id(correct_frag_uch_id);
969             if (modified_blobs != NULL) *modified_blobs = true;
970             break;
971           }
972         }
973         temp_blob_index++;
974       }
975     }
976     // Remove current unichar from werd_choice. On the last iteration
977     // set the correct replacement unichar instead of removing a unichar.
978     if (replaced_count + 1 == wrong_ngram_size) {
979       werd_choice->set_unichar_id(correct_ngram_id,
980           num_blobs_to_replace, 0.0, 0.0, wrong_ngram_begin_index);
981     } else {
982       werd_choice->remove_unichar_id(wrong_ngram_begin_index);
983     }
984   }
985   if (stopper_debug_level >= 1) {
986     tprintf("ReplaceAmbigs() modified werd_choice: %s\n",
987             werd_choice->debug_string(getUnicharset()).string());
988     werd_choice->print();
989     if (modified_blobs != NULL && *modified_blobs && blob_choices != NULL) {
990       tprintf("Modified blob_choices: ");
991       for (int i = 0; i < blob_choices->size(); ++i) {
992         print_ratings_list("\n", blob_choices->get(i), getUnicharset());
993       }
994       }
995     }
996   }
997 
998 
999 /*---------------------------------------------------------------------------*/
ChoiceSameAs(const WERD_CHOICE & WordChoice,VIABLE_CHOICE ViableChoice)1000 int Dict::ChoiceSameAs(const WERD_CHOICE &WordChoice,
1001                        VIABLE_CHOICE ViableChoice) {
1002 /*
1003  **	Parameters:
1004  **		Choice		choice to compare to ViableChoice
1005  **		ViableChoice	viable choice to compare to Choice
1006  **	Operation: This routine compares the corresponding strings of
1007  **		Choice and ViableChoice and returns TRUE if they are the
1008  **		same, FALSE otherwise.
1009  **	Return: TRUE or FALSE.
1010  **	Exceptions: none
1011  **	History: Fri May 17 08:48:04 1991, DSJ, Created.
1012  */
1013   return (StringSameAs(WordChoice, ViableChoice));
1014 
1015 }                                /* ChoiceSameAs */
1016 }  // namespace tesseract
1017 
1018 
1019 /*---------------------------------------------------------------------------*/
CmpChoiceRatings(void * arg1,void * arg2)1020 int CmpChoiceRatings(void *arg1,    //VIABLE_CHOICE                 Choice1,
1021                      void *arg2) {  //VIABLE_CHOICE                 Choice2)
1022 /*
1023  **	Parameters:
1024  **		Choice1, Choice2	choices to compare ratings for
1025  **	Operation: Return -1 if the rating for Choice1 is less than the
1026  **		rating for Choice2, otherwise return (1).
1027  **	Return: -1 or 1
1028  **	Exceptions: none
1029  **	History: Wed May 15 13:02:37 1991, DSJ, Created.
1030  */
1031   float R1, R2;
1032   VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
1033   VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;
1034 
1035   R1 = Choice1->Rating;
1036   R2 = Choice2->Rating;
1037 
1038   if (R1 < R2)
1039     return (-1);
1040   else
1041     return (1);
1042 
1043 }                                /* CmpChoiceRatings */
1044 
1045 
1046 /*---------------------------------------------------------------------------*/
ExpandChoice(VIABLE_CHOICE Choice,EXPANDED_CHOICE * ExpandedChoice)1047 void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice) {
1048 /*
1049  **	Parameters:
1050  **		Choice		choice to be expanded
1051  **		ExpandedChoice	place to put resulting expanded choice
1052  **	Operation: This routine expands Choice and places the results
1053  **		in ExpandedChoice.  The primary function of expansion
1054  **		is to create an two arrays, one which holds the corresponding
1055  **		certainty for each chunk in Choice, and one which holds
1056  **		the class for each chunk.
1057  **	Return: none (results are placed in ExpandedChoice)
1058  **	Exceptions: none
1059  **	History: Fri May 31 15:21:57 1991, DSJ, Created.
1060  */
1061   int i, j, Chunk;
1062 
1063   ExpandedChoice->Choice = Choice;
1064   for (i = 0, Chunk = 0; i < Choice->Length; i++)
1065   for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
1066     ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty;
1067     ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class;
1068   }
1069 }                                /* ExpandChoice */
1070 
1071 /*---------------------------------------------------------------------------*/
FreeBadChoice(void * item1,void * item2)1072 int FreeBadChoice(void *item1,    //VIABLE_CHOICE                 Choice,
1073                   void *item2) {  //EXPANDED_CHOICE                       *BestChoice)
1074 /*
1075  **	Parameters:
1076  **		Choice			choice to be tested
1077  **		BestChoice		best choice found
1078  **	Variables Used:
1079  **		stopper_ambiguity_threshold_gain
1080  **		stopper_ambiguity_threshold_offset
1081  **	Operation: If the certainty of any chunk in Choice is not ambiguous
1082  **		with the corresponding chunk in the best choice, free
1083  **		Choice and return TRUE.  Otherwise, return FALSE.
1084  **	Return: TRUE or FALSE.
1085  **	Exceptions: none
1086  **	History: Wed May 15 13:20:26 1991, DSJ, Created.
1087  */
1088   int i, j, Chunk;
1089   FLOAT32 Threshold;
1090   VIABLE_CHOICE Choice;
1091   EXPANDED_CHOICE *BestChoice;
1092 
1093   Choice = (VIABLE_CHOICE) item1;
1094   BestChoice = (EXPANDED_CHOICE *) item2;
1095 
1096   Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor,
1097     Choice->AdjustFactor);
1098 
1099   for (i = 0, Chunk = 0; i < Choice->Length; i++)
1100     for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
1101       if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
1102     Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
1103       Threshold) {
1104         memfree(Choice);
1105     return (TRUE);
1106   }
1107 
1108   return (FALSE);
1109 
1110 }                                /* FreeBadChoice */
1111 
1112 
1113 /*---------------------------------------------------------------------------*/
1114 namespace tesseract {
LengthOfShortestAlphaRun(const WERD_CHOICE & WordChoice)1115 int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
1116 /*
1117  **	Parameters:
1118  **		Word            word to be tested
1119  **	Operation: Return the length of the shortest alpha run in Word.
1120  **	Return:  Return the length of the shortest alpha run in Word.
1121  **	Exceptions: none
1122  **	History: Tue May 14 07:50:45 1991, DSJ, Created.
1123  */
1124   register int Shortest = MAX_INT32;
1125   register int Length;
1126   int x;
1127   int y;
1128 
1129   for (x = 0; x < WordChoice.length(); ++x) {
1130     if (getUnicharset().get_isalpha(WordChoice.unichar_id(x))) {
1131       for (y = x + 1, Length = 1;
1132            y < WordChoice.length() &&
1133            getUnicharset().get_isalpha(WordChoice.unichar_id(y));
1134            ++y, ++Length);
1135       if (Length < Shortest) {
1136         Shortest = Length;
1137       }
1138       if (y == WordChoice.length()) {
1139         break;
1140       }
1141     }
1142   }
1143   if (Shortest == MAX_INT32)
1144     Shortest = 0;
1145 
1146   return (Shortest);
1147 
1148 }                                /* LengthOfShortestAlphaRun */
1149 
1150 
1151 /*---------------------------------------------------------------------------*/
NewViableChoice(const WERD_CHOICE & WordChoice,FLOAT32 AdjustFactor,const float Certainties[])1152 VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice,
1153                                     FLOAT32 AdjustFactor,
1154                                     const float Certainties[]) {
1155 /*
1156  **	Parameters:
1157  **		Choice		choice to be converted to a viable choice
1158  **		AdjustFactor	factor used to adjust ratings for Choice
1159  **		Certainties	certainty for each character in Choice
1160  **	Variables Used:
1161  **		current_segmentation	segmentation corresponding to Choice
1162  **	Operation: Allocate a new viable choice data structure, copy
1163  **		Choice, Certainties, and current_segmentation_ into it,
1164  **		and return a pointer to it.
1165  **	Return: Ptr to new viable choice.
1166  **	Exceptions: none
1167  **	History: Thu May 16 15:28:29 1991, DSJ, Created.
1168  */
1169   int Length = WordChoice.length();
1170   assert (Length <= MAX_NUM_CHUNKS && Length > 0);
1171   VIABLE_CHOICE NewChoice = (VIABLE_CHOICE) Emalloc (
1172       sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof (CHAR_CHOICE));
1173   FillViableChoice(WordChoice, AdjustFactor, Certainties, false, NewChoice);
1174   return (NewChoice);
1175 }                                /* NewViableChoice */
1176 
1177 
1178 /*---------------------------------------------------------------------------*/
PrintViableChoice(FILE * File,const char * Label,VIABLE_CHOICE Choice)1179 void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
1180 /*
1181  **	Parameters:
1182  **		File	open text file to print Choice to
1183  **		Label	text label to be printed with Choice
1184  **		Choice	choice to be printed
1185  **	Operation: This routine dumps a text representation of the
1186  **		specified Choice to File.
1187  **	Return: none
1188  **	Exceptions: none
1189  **	History: Mon May 20 11:16:44 1991, DSJ, Created.
1190  */
1191   int i, j;
1192 
1193   fprintf (File, "%s", Label);
1194 
1195   fprintf(File, "(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d)  ",
1196     Choice->Rating, Choice->Certainty,
1197     Choice->AdjustFactor, Choice->ComposedFromCharFragments);
1198 
1199   for (i = 0; i < Choice->Length; i++)
1200     fprintf(File, "%s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
1201   fprintf(File, "\n");
1202 
1203   for (i = 0; i < Choice->Length; i++) {
1204     fprintf(File, "  %s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
1205     for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
1206       fprintf(File, "    ");
1207   }
1208   fprintf(File, "\n");
1209 
1210   for (i = 0; i < Choice->Length; i++) {
1211     for (j = 0; j < Choice->Blob[i].NumChunks; j++)
1212       fprintf(File, "%3d ", (int) (Choice->Blob[i].Certainty * -10.0));
1213   }
1214   fprintf(File, "\n");
1215 
1216   for (i = 0; i < Choice->Length; i++) {
1217     for (j = 0; j < Choice->Blob[i].NumChunks; j++)
1218       fprintf(File, "%3d ", Choice->Blob[i].NumChunks);
1219   }
1220   fprintf(File, "\n");
1221 }                                /* PrintViableChoice */
1222 
1223 
1224 /*---------------------------------------------------------------------------*/
FillViableChoice(const WERD_CHOICE & WordChoice,FLOAT32 AdjustFactor,const float Certainties[],bool SameString,VIABLE_CHOICE ViableChoice)1225 void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
1226                             FLOAT32 AdjustFactor, const float Certainties[],
1227                             bool SameString, VIABLE_CHOICE ViableChoice) {
1228 /*
1229  **	Parameters:
1230  **		WordChoice 	a choice with info that will be copied
1231  **		AdjustFactor	factor used to adjust ratings for AChoice
1232  **		Certainties	certainty for each character in AChoice
1233  **             SameString      if true the string in the viable choice
1234  **                             will not be changed
1235  **		ViableChoice	existing viable choice to fill in
1236  **	Variables Used:
1237  **		current_segmentation_	segmentation for NewChoice
1238  **	Operation:
1239  **             Fill ViableChoice with information from AChoice,
1240  **             AdjustFactor, and Certainties.
1241  **	Return: none
1242  **	Exceptions: none
1243  **	History: Fri May 17 13:35:58 1991, DSJ, Created.
1244  */
1245   CHAR_CHOICE *NewChar;
1246   BLOB_WIDTH *BlobWidth;
1247   int x;
1248 
1249   ViableChoice->Rating = WordChoice.rating();
1250   ViableChoice->Certainty = WordChoice.certainty();
1251   ViableChoice->AdjustFactor = AdjustFactor;
1252   ViableChoice->ComposedFromCharFragments = false;
1253   if (!SameString) {
1254     ViableChoice->Length = WordChoice.length();
1255   }
1256   for (x = 0,
1257        NewChar = &(ViableChoice->Blob[0]),
1258        BlobWidth = current_segmentation_;
1259        x < WordChoice.length();
1260        x++, NewChar++, Certainties++, BlobWidth++) {
1261     if (!SameString) {
1262       NewChar->Class = WordChoice.unichar_id(x);
1263     }
1264     NewChar->NumChunks = *BlobWidth;
1265     NewChar->Certainty = *Certainties;
1266     for (int i = 1; i < WordChoice.fragment_length(x); ++i) {
1267       BlobWidth++;
1268       assert(*BlobWidth > 0);
1269       NewChar->NumChunks += *BlobWidth;
1270       ViableChoice->ComposedFromCharFragments = true;
1271     }
1272   }
1273 }                                /* FillViableChoice */
1274 
1275 
1276 // Compares unichar ids in word_choice to those in viable_choice,
1277 // returns true if they are the same, false otherwise.
StringSameAs(const WERD_CHOICE & WordChoice,VIABLE_CHOICE ViableChoice)1278 bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,
1279                         VIABLE_CHOICE ViableChoice) {
1280   if (WordChoice.length() != ViableChoice->Length) {
1281     return false;
1282   }
1283   int i;
1284   CHAR_CHOICE *CharChoice;
1285   for (i = 0, CharChoice = &(ViableChoice->Blob[0]);
1286        i < ViableChoice->Length; CharChoice++, i++) {
1287     if (CharChoice->Class != WordChoice.unichar_id(i)) {
1288       return false;
1289     }
1290   }
1291   return true;
1292 }
1293 
1294 /*---------------------------------------------------------------------------*/
StringSameAs(const char * String,const char * String_lengths,VIABLE_CHOICE ViableChoice)1295 int Dict::StringSameAs(const char *String,
1296                        const char *String_lengths,
1297                        VIABLE_CHOICE ViableChoice) {
1298 /*
1299  **	Parameters:
1300  **		String		string to compare to ViableChoice
1301  **		String_lengths	lengths of unichars in String
1302  **		ViableChoice	viable choice to compare to String
1303  **	Operation: This routine compares String to ViableChoice and
1304  **		returns TRUE if they are the same, FALSE otherwise.
1305  **	Return: TRUE or FALSE.
1306  **	Exceptions: none
1307  **	History: Fri May 17 08:48:04 1991, DSJ, Created.
1308  */
1309   CHAR_CHOICE *Char;
1310   int i;
1311   int current_unichar_length;
1312 
1313   for (Char = &(ViableChoice->Blob[0]), i = 0;
1314     i < ViableChoice->Length;
1315        String += *(String_lengths++), Char++, i++) {
1316     current_unichar_length = strlen(getUnicharset().id_to_unichar(Char->Class));
1317   if (current_unichar_length != *String_lengths ||
1318       strncmp(String, getUnicharset().id_to_unichar(Char->Class),
1319               current_unichar_length) != 0)
1320     return (FALSE);
1321   }
1322 
1323   if (*String == 0)
1324     return (TRUE);
1325   else
1326     return (FALSE);
1327 
1328 }                                /* StringSameAs */
1329 }  // namespace tesseract
1330 
1331 /*---------------------------------------------------------------------------*/
UniformCertainties(const BLOB_CHOICE_LIST_VECTOR & Choices,const WERD_CHOICE & BestChoice)1332 int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
1333                        const WERD_CHOICE &BestChoice) {
1334 /*
1335  **	Parameters:
1336  **		Choices		choices for current segmentation
1337  **		BestChoice	best choice for current segmentation
1338  **	Variables Used:
1339  **		stopper_allowable_character_badness
1340  **             max allowed certainty variation
1341  **	Operation: This routine returns TRUE if the certainty of the
1342  **		BestChoice word is within a reasonable range of the average
1343  **		certainties for the best choices for each character in
1344  **		the segmentation.  This test is used to catch words in which
1345  **		one character is much worse than the other characters in
1346  **		the word (i.e. FALSE will be returned in that case).
1347  **		The algorithm computes the mean and std deviation of the
1348  **		certainties in the word with the worst certainty thrown out.
1349  **	Return: TRUE or FALSE.
1350  **	Exceptions: none
1351  **	History: Tue May 14 08:23:21 1991, DSJ, Created.
1352  */
1353   float Certainty;
1354   float WorstCertainty = MAX_FLOAT32;
1355   float CertaintyThreshold;
1356   FLOAT64 TotalCertainty;
1357   FLOAT64 TotalCertaintySquared;
1358   FLOAT64 Variance;
1359   FLOAT32 Mean, StdDev;
1360   int WordLength;
1361 
1362   WordLength = Choices.length();
1363   if (WordLength < 3)
1364     return (TRUE);
1365 
1366   TotalCertainty = TotalCertaintySquared = 0.0;
1367   BLOB_CHOICE_IT BlobChoiceIt;
1368   for (int i = 0; i < Choices.length(); ++i) {
1369     BlobChoiceIt.set_to_list(Choices.get(i));
1370     Certainty = BlobChoiceIt.data()->certainty();
1371     TotalCertainty += Certainty;
1372     TotalCertaintySquared += Certainty * Certainty;
1373     if (Certainty < WorstCertainty)
1374       WorstCertainty = Certainty;
1375   }
1376 
1377   /* subtract off worst certainty from statistics */
1378   WordLength--;
1379   TotalCertainty -= WorstCertainty;
1380   TotalCertaintySquared -= WorstCertainty * WorstCertainty;
1381 
1382   Mean = TotalCertainty / WordLength;
1383   Variance = ((WordLength * TotalCertaintySquared -
1384     TotalCertainty * TotalCertainty) /
1385     (WordLength * (WordLength - 1)));
1386   if (Variance < 0.0)
1387     Variance = 0.0;
1388   StdDev = sqrt (Variance);
1389 
1390   CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
1391   if (CertaintyThreshold > stopper_nondict_certainty_base)
1392     CertaintyThreshold = stopper_nondict_certainty_base;
1393 
1394   if (BestChoice.certainty() < CertaintyThreshold) {
1395     if (stopper_debug_level >= 1)
1396       cprintf("Stopper: Non-uniform certainty = %4.1f"
1397               " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
1398               BestChoice.certainty(), Mean, StdDev, CertaintyThreshold);
1399     return (FALSE);
1400   } else {
1401     return (TRUE);
1402   }
1403 }                                /* UniformCertainties */
1404