1 /******************************************************************************
2  **                         Filename:    adaptmatch.c
3  **                         Purpose:     High level adaptive matcher.
4  **                         Author:      Dan Johnson
5  **                         History:     Mon Mar 11 10:00:10 1991, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 
19 /**----------------------------------------------------------------------------
20           Include Files and Type Defines
21 ----------------------------------------------------------------------------**/
22 #include <ctype.h>
23 #include "adaptmatch.h"
24 #include "normfeat.h"
25 #include "mfoutline.h"
26 #include "picofeat.h"
27 #include "float2int.h"
28 #include "outfeat.h"
29 #include "emalloc.h"
30 #include "intfx.h"
31 #include "speckle.h"
32 #include "efio.h"
33 #include "normmatch.h"
34 #include "permute.h"
35 #include "context.h"
36 #include "ndminx.h"
37 #include "intproto.h"
38 #include "const.h"
39 #include "globals.h"
40 #include "werd.h"
41 #include "callcpp.h"
42 #include "tordvars.h"
43 #include "varable.h"
44 #include "classify.h"
45 #include "unicharset.h"
46 
47 #include <stdio.h>
48 #include <string.h>
49 #include <ctype.h>
50 #include <stdlib.h>
51 #include <math.h>
52 #ifdef __UNIX__
53 #include <assert.h>
54 #endif
55 
56 #define ADAPT_TEMPLATE_SUFFIX ".a"
57 
58 #define MAX_MATCHES         10
59 #define UNLIKELY_NUM_FEAT 200
60 #define NO_DEBUG      0
61 #define MAX_ADAPTABLE_WERD_SIZE 40
62 
63 #define ADAPTABLE_WERD    (GOOD_WERD + 0.05)
64 
65 #define Y_DIM_OFFSET    (Y_SHIFT - BASELINE_Y_SHIFT)
66 
67 #define WORST_POSSIBLE_RATING (1.0)
68 
69 struct ADAPT_RESULTS
70 {
71   inT32 BlobLength;
72   int NumMatches;
73   bool HasNonfragment;
74   CLASS_ID Classes[MAX_NUM_CLASSES];
75   FLOAT32 Ratings[MAX_CLASS_ID + 1];
76   uinT8 Configs[MAX_CLASS_ID + 1];
77   FLOAT32 BestRating;
78   CLASS_ID BestClass;
79   uinT8 BestConfig;
80   CLASS_PRUNER_RESULTS CPResults;
81 
82   // Initializes data members to the default values. Sets the initial
83   // rating of each class to be the worst possible rating (1.0).
84   inline void Initialize() {
85      BlobLength = MAX_INT32;
86      NumMatches = 0;
87      HasNonfragment = false;
88      BestRating = WORST_POSSIBLE_RATING;
89      BestClass = NO_CLASS;
90      BestConfig = 0;
91      for (int i = 0; i <= MAX_CLASS_ID; ++i) {
92        Ratings[i] = WORST_POSSIBLE_RATING;
93      }
94   }
95 };
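/* Illustrative usage sketch: ADAPT_RESULTS is heap-allocated and re-initialized
   before each match, as the routines below do.  All names here appear elsewhere
   in this file; the flow is abridged.

     ADAPT_RESULTS *Results = new ADAPT_RESULTS();
     Results->Initialize();               // every rating starts at 1.0 (worst)
     // ... DoAdaptiveMatch() / RemoveBadMatches() fill Results ...
     // Results->BestClass, BestRating, BestConfig now describe the winner.
     delete Results;
*/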
96 
97 
98 
99 typedef struct
100 {
101   ADAPT_TEMPLATES Templates;
102   CLASS_ID ClassId;
103   int ConfigId;
104 }
105 
106 
107 PROTO_KEY;
108 
109 /**----------------------------------------------------------------------------
110           Private Macros
111 ----------------------------------------------------------------------------**/
112 #define MarginalMatch(Rating)       \
113 ((Rating) > matcher_great_threshold)
114 
115 #define TempConfigReliable(Config)  \
116 ((Config)->NumTimesSeen >= matcher_min_examples_for_prototyping)
117 
118 #define InitIntFX() (FeaturesHaveBeenExtracted = FALSE)
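/* Quick reference for the macros above (illustrative sketch); the knobs they
   test are the *_VAR declarations further down in this file.

     // With the default matcher_great_threshold of 0.0:
     MarginalMatch(0.05)          // -> (0.05 > matcher_great_threshold), i.e. TRUE
     // With the default matcher_min_examples_for_prototyping of 3:
     TempConfigReliable(Config)   // TRUE once Config->NumTimesSeen reaches 3
     InitIntFX()                  // clears FeaturesHaveBeenExtracted before a new blob
*/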
119 
120 /**----------------------------------------------------------------------------
121           Private Function Prototypes
122 ----------------------------------------------------------------------------**/
123 void AdaptToChar(TBLOB *Blob,
124                  LINE_STATS *LineStats,
125                  CLASS_ID ClassId,
126                  FLOAT32 Threshold);
127 
128 void AdaptToPunc(TBLOB *Blob,
129                  LINE_STATS *LineStats,
130                  CLASS_ID ClassId,
131                  FLOAT32 Threshold);
132 
133 void AmbigClassifier(TBLOB *Blob,
134                      LINE_STATS *LineStats,
135                      INT_TEMPLATES Templates,
136                      UNICHAR_ID *Ambiguities,
137                      ADAPT_RESULTS *Results);
138 
139 UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
140                                LINE_STATS *LineStats,
141                                ADAPT_TEMPLATES Templates,
142                                ADAPT_RESULTS *Results);
143 
144 void make_config_pruner(INT_TEMPLATES templates, CONFIG_PRUNER *config_pruner);
145 
146 void CharNormClassifier(TBLOB *Blob,
147                         LINE_STATS *LineStats,
148                         INT_TEMPLATES Templates,
149                         ADAPT_RESULTS *Results);
150 
151 void ClassifyAsNoise(ADAPT_RESULTS *Results);
152 
153 int CompareCurrentRatings(const void *arg1,
154                           const void *arg2);
155 
156 void ConvertMatchesToChoices(ADAPT_RESULTS *Results,
157                              BLOB_CHOICE_LIST *Choices);
158 
159 void DebugAdaptiveClassifier(TBLOB *Blob,
160                              LINE_STATS *LineStats,
161                              ADAPT_RESULTS *Results);
162 
163 void DoAdaptiveMatch(TBLOB *Blob,
164                      LINE_STATS *LineStats,
165                      ADAPT_RESULTS *Results);
166 
167 void GetAdaptThresholds(TWERD * Word,
168                         LINE_STATS * LineStats,
169                         const WERD_CHOICE& BestChoice,
170                         const WERD_CHOICE& BestRawChoice, FLOAT32 Thresholds[]);
171 
172 UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
173                            LINE_STATS *LineStats,
174                            CLASS_ID CorrectClass);
175 
176 namespace tesseract {
177 int GetBaselineFeatures(TBLOB *Blob,
178                         LINE_STATS *LineStats,
179                         INT_TEMPLATES Templates,
180                         INT_FEATURE_ARRAY IntFeatures,
181                         CLASS_NORMALIZATION_ARRAY CharNormArray,
182                         inT32 *BlobLength);
183 
184 
185 int GetIntBaselineFeatures(TBLOB *Blob,
186                            LINE_STATS *LineStats,
187                            INT_TEMPLATES Templates,
188                            INT_FEATURE_ARRAY IntFeatures,
189                            CLASS_NORMALIZATION_ARRAY CharNormArray,
190                            inT32 *BlobLength);
191 
192 }  // namespace tesseract.
193 
194 void InitMatcherRatings(register FLOAT32 *Rating);
195 
196 PROTO_ID MakeNewTempProtos(FEATURE_SET Features,
197                            int NumBadFeat,
198                            FEATURE_ID BadFeat[],
199                            INT_CLASS IClass,
200                            ADAPT_CLASS Class, BIT_VECTOR TempProtoMask);
201 
202 void MakePermanent(ADAPT_TEMPLATES Templates,
203                    CLASS_ID ClassId,
204                    int ConfigId,
205                    TBLOB *Blob,
206                    LINE_STATS *LineStats);
207 
208 int MakeTempProtoPerm(void *item1, void *item2);
209 
210 int NumBlobsIn(TWERD *Word);
211 
212 int NumOutlinesInBlob(TBLOB *Blob);
213 
214 void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
215 
216 void RemoveBadMatches(ADAPT_RESULTS *Results);
217 
218 void RemoveExtraPuncs(ADAPT_RESULTS *Results);
219 
220 void SetAdaptiveThreshold(FLOAT32 Threshold);
221 void ShowBestMatchFor(TBLOB *Blob,
222                       LINE_STATS *LineStats,
223                       CLASS_ID ClassId,
224                       BOOL8 AdaptiveOn,
225                       BOOL8 PreTrainedOn);
226 
227 
228 /**----------------------------------------------------------------------------
229         Global Data Definitions and Declarations
230 ----------------------------------------------------------------------------**/
231 /* variables used to hold performance statistics */
232 static int AdaptiveMatcherCalls = 0;
233 static int BaselineClassifierCalls = 0;
234 static int CharNormClassifierCalls = 0;
235 static int AmbigClassifierCalls = 0;
236 static int NumWordsAdaptedTo = 0;
237 static int NumCharsAdaptedTo = 0;
238 static int NumBaselineClassesTried = 0;
239 static int NumCharNormClassesTried = 0;
240 static int NumAmbigClassesTried = 0;
241 static int NumClassesOutput = 0;
242 static int NumAdaptationsFailed = 0;
243 
244 /* define globals used to hold onto extracted features.  This is used
245 to map from the old scheme in which baseline features and char norm
246 features are extracted separately, to the new scheme in which they
247 are extracted at the same time. */
248 static BOOL8 FeaturesHaveBeenExtracted = FALSE;
249 static BOOL8 FeaturesOK = TRUE;
250 static INT_FEATURE_ARRAY BaselineFeatures;
251 static INT_FEATURE_ARRAY CharNormFeatures;
252 static INT_FX_RESULT_STRUCT FXInfo;
253 
254 /* use a global variable to hold onto the current ratings so that the
255 comparison function passed to qsort can get at them */
256 static FLOAT32 *CurrentRatings;
257 
258 /* define globals to hold feature-count cutoffs read from the training data */
259 static CLASS_CUTOFF_ARRAY CharNormCutoffs;
260 static CLASS_CUTOFF_ARRAY BaselineCutoffs;
261 
262 /* define control knobs for adaptive matcher */
263 BOOL_VAR(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier");
264 
265 BOOL_VAR(classify_use_pre_adapted_templates, 0,
266          "Use pre-adapted classifier templates");
267 
268 BOOL_VAR(classify_save_adapted_templates, 0,
269          "Save adapted templates to a file");
270 
271 BOOL_VAR(classify_enable_adaptive_debugger, 0, "Enable match debugger");
272 
273 INT_VAR(matcher_debug_level, 0, "Matcher Debug Level");
274 INT_VAR(matcher_debug_flags, 0, "Matcher Debug Flags");
275 
276 INT_VAR(classify_learning_debug_level, 0, "Learning Debug Level: ");
277 
278 double_VAR(matcher_good_threshold, 0.125, "Good Match (0-1)");
279 double_VAR(matcher_great_threshold, 0.0, "Great Match (0-1)");
280 
281 double_VAR(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)");
282 double_VAR(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)");
283 double_VAR(matcher_rating_margin, 0.1, "New template margin (0-1)");
284 double_VAR(matcher_avg_noise_size, 12.0, "Avg. noise blob length: ");
285 
286 INT_VAR(matcher_permanent_classes_min, 1, "Min # of permanent classes");
287 
288 INT_VAR(matcher_min_examples_for_prototyping, 3, "Reliable Config Threshold");
289 
290 double_VAR(matcher_clustering_max_angle_delta, 0.015,
291            "Maximum angle delta for prototype clustering");
292 
293 BOOL_VAR(classify_enable_int_fx, 1, "Enable integer fx");
294 
295 BOOL_VAR(classify_enable_new_adapt_rules, 1, "Enable new adaptation rules");
296 
297 double_VAR(rating_scale, 1.5, "Rating scaling factor");
298 extern double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
299 
300 INT_VAR(matcher_failed_adaptations_before_reset, 150,
301         "Number of failed adaptions before adapted templates reset");
302 
303 double_VAR(tessedit_class_miss_scale, 0.00390625,
304            "Scale factor for features not used");
305 
306 BOOL_VAR(tess_cn_matching, 0, "Character Normalized Matching");
307 BOOL_VAR(tess_bn_matching, 0, "Baseline Normalized Matching");
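/* Worked example of how the knobs above interact (illustrative only, using the
   default values declared here): ratings run from 0.0 (perfect) to 1.0 (worst).
   With matcher_bad_match_pad = 0.15, a best rating of 0.10 lets AddNewResult()
   keep anything rated up to 0.10 + 0.15 = 0.25; and if the per-character
   adaptation threshold handed to AdaptToChar() were 0.125
   (matcher_good_threshold), a 0.10 match to an existing config would count as
   good and simply reinforce that config. */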
308 
309 /**----------------------------------------------------------------------------
310               Public Code
311 ----------------------------------------------------------------------------**/
312 /*---------------------------------------------------------------------------*/
313 namespace tesseract {
314 void Classify::AdaptiveClassifier(TBLOB *Blob,
315                                   TBLOB *DotBlob,
316                                   TEXTROW *Row,
317                                   BLOB_CHOICE_LIST *Choices,
318                                   CLASS_PRUNER_RESULTS CPResults) {
319 /*
320  **  Parameters: Blob    blob to be classified
321  **              DotBlob         (obsolete)
322  **              Row             row of text that word appears in
323  **  Globals: CurrentRatings  used by compare function for qsort
324  **                         Operation: This routine calls the adaptive matcher
325  **                         which returns (in an array) the class id of each
326  **                         class matched.
327  **                         It also returns the number of classes matched.
328  **                         For each class matched it places the best rating
329  **                         found for that class into the Ratings array.
330  **                         Bad matches are then removed so that they don't
331  **                         need to be sorted.  The remaining good matches are
332  **                         then sorted and converted to choices.
333  **                         This routine also performs some simple speckle
334  **                         filtering.
335  **  Return: Choices    List of choices found by adaptive matcher.
336  **          CPResults  Array of CPResultStruct of size MAX_NUM_CLASSES is
337  **                     filled on return with the choices found by the
338  **                     class pruner and the ratings therefrom. Also
339  **                     contains the detailed results of the integer matcher.
340  **                         Exceptions: none
341  **                         History: Mon Mar 11 10:00:58 1991, DSJ, Created.
342  */
343   assert(Choices != NULL);
344   ADAPT_RESULTS *Results = new ADAPT_RESULTS();
345   LINE_STATS LineStats;
346 
347   if (matcher_failed_adaptations_before_reset >= 0 &&
348       NumAdaptationsFailed >= matcher_failed_adaptations_before_reset) {
349     NumAdaptationsFailed = 0;
350     ResetAdaptiveClassifier();
351   }
352   if (AdaptedTemplates == NULL)
353     AdaptedTemplates = NewAdaptedTemplates (true);
354 
355   EnterClassifyMode;
356 
357   Results->Initialize();
358   GetLineStatsFromRow(Row, &LineStats);
359 
360   DoAdaptiveMatch(Blob, &LineStats, Results);
361   if (CPResults != NULL)
362     memcpy(CPResults, Results->CPResults,
363            sizeof(CPResults[0]) * Results->NumMatches);
364   RemoveBadMatches(Results);
365 
366   /* save ratings in a global so that CompareCurrentRatings() can see them */
367   CurrentRatings = Results->Ratings;
368   qsort ((void *) (Results->Classes), Results->NumMatches,
369     sizeof (CLASS_ID), CompareCurrentRatings);
370 
371   RemoveExtraPuncs(Results);
372   ConvertMatchesToChoices(Results, Choices);
373 
374   if (matcher_debug_level >= 1) {
375     cprintf ("AD Matches =  ");
376     PrintAdaptiveMatchResults(stdout, Results);
377   }
378 
379   if (LargeSpeckle (Blob, Row))
380     AddLargeSpeckleTo(Choices);
381 
382 #ifndef GRAPHICS_DISABLED
383   if (classify_enable_adaptive_debugger)
384     DebugAdaptiveClassifier(Blob, &LineStats, Results);
385 #endif
386 
387   NumClassesOutput += Choices->length();
388   if (Choices->length() == 0) {
389     if (!bln_numericmode)
390       tprintf ("Empty classification!\n");  // Should never normally happen.
391     // Append a fall-back choice directly to the caller's (empty) list.
392     BLOB_CHOICE_IT temp_it;
393     temp_it.set_to_list(Choices);
394     temp_it.add_to_end(new BLOB_CHOICE(0, 50.0f, -20.0f, -1, NULL));
395   }
396 
397   delete Results;
398 }                                /* AdaptiveClassifier */
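/* Hypothetical call sequence for the routine above (a sketch; the caller
   object and variable names are assumptions, DotBlob is documented as obsolete
   and CPResults is optional, so both are passed as NULL here):

     BLOB_CHOICE_LIST *choices = new BLOB_CHOICE_LIST();
     classify->AdaptiveClassifier(blob, NULL, row, choices, NULL);
     // choices now holds the sorted, speckle-filtered matches; pass a
     // CLASS_PRUNER_RESULTS array instead of the last NULL to also receive
     // the raw class pruner / integer matcher results.
*/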
399 
400 
401 /*---------------------------------------------------------------------------*/
402 void Classify::AdaptToWord(TWERD *Word,
403                            TEXTROW *Row,
404                            const WERD_CHOICE& BestChoice,
405                            const WERD_CHOICE& BestRawChoice,
406                            const char *rejmap) {
407 /*
408  **                         Parameters:
409  **                         Word
410  **                         word to be adapted to
411  **                         Row
412  **                         row of text that word is found in
413  **                         BestChoice
414  **                         best choice for word found by system
415  **                         BestRawChoice
416  **                         best choice for word found by classifier only
417  **                         Globals:
418  **                         EnableLearning
419  **                         TRUE if learning is enabled
420  **                         Operation: This routine implements a preliminary
421  **                         version of the rules which are used to decide
422  **                         which characters to adapt to.
423  **                         A word is adapted to if it is in the dictionary or
424  **                         if it is a "good" number (no trailing units, etc.).
425  **                         It cannot contain broken or merged characters.
426  **                         Within that word, only letters and digits are
427  **                         adapted to (no punctuation).
428  **                         Return: none
429  **                         Exceptions: none
430  **                         History: Thu Mar 14 07:40:36 1991, DSJ, Created.
431 */
432   TBLOB *Blob;
433   LINE_STATS LineStats;
434   FLOAT32 Thresholds[MAX_ADAPTABLE_WERD_SIZE];
435   FLOAT32 *Threshold;
436   const char *map = rejmap;
437   char map_char = '1';
438   const char* BestChoice_string = BestChoice.unichar_string().string();
439   const char* BestChoice_lengths = BestChoice.unichar_lengths().string();
440 
441   if (strlen(BestChoice_lengths) > MAX_ADAPTABLE_WERD_SIZE)
442     return;
443 
444   if (EnableLearning) {
445     NumWordsAdaptedTo++;
446 
447     #ifndef SECURE_NAMES
448     if (classify_learning_debug_level >= 1)
449       cprintf ("\n\nAdapting to word = %s\n",
450                BestChoice.debug_string(unicharset).string());
451     #endif
452     GetLineStatsFromRow(Row, &LineStats);
453 
454     GetAdaptThresholds(Word,
455                        &LineStats,
456                        BestChoice,
457                        BestRawChoice,
458                        Thresholds);
459 
460     for (Blob = Word->blobs, Threshold = Thresholds; Blob != NULL;
461          Blob = Blob->next, BestChoice_string += *(BestChoice_lengths++),
462              Threshold++) {
463       InitIntFX();
464 
465       if (rejmap != NULL)
466         map_char = *map++;
467 
468       assert (map_char == '1' || map_char == '0');
469 
470       if (map_char == '1') {
471 
472 //        if (unicharset.get_isalpha (BestChoice_string, *BestChoice_lengths) ||
473 //            unicharset.get_isdigit (BestChoice_string, *BestChoice_lengths)) {
474           /* SPECIAL RULE:  don't adapt to an 'i' which is the first char
475              in a word because they are too ambiguous with 'I'.
476              The new adaptation rules should account for this
477              automatically, since they exclude ambiguous words from
478              adaptation, but for safety's sake we'll leave the rule in.
479              Also, don't adapt to i's that have only 1 blob in them
480              because this creates too much ambiguity for broken
481              characters. */
482           if (*BestChoice_lengths == 1 &&
483               (*BestChoice_string == 'i'
484                || (il1_adaption_test && *BestChoice_string == 'I' &&
485                (Blob->next == NULL ||
486                unicharset.get_islower (BestChoice_string + *BestChoice_lengths,
487                                        *(BestChoice_lengths + 1)))))
488               && (Blob == Word->blobs
489                   || (!(unicharset.get_isalpha (BestChoice_string -
490                                                 *(BestChoice_lengths - 1),
491                                                 *(BestChoice_lengths - 1)) ||
492                         unicharset.get_isdigit (BestChoice_string -
493                                                 *(BestChoice_lengths - 1),
494                                                 *(BestChoice_lengths - 1))))
495 
496                   || (!il1_adaption_test && NumOutlinesInBlob(Blob) != 2))) {
497             if (classify_learning_debug_level >= 1)
498               cprintf ("Rejecting char = %s\n", unicharset.id_to_unichar(
499                            unicharset.unichar_to_id(BestChoice_string,
500                                                     *BestChoice_lengths)));
501           }
502           else {
503             #ifndef SECURE_NAMES
504             if (classify_learning_debug_level >= 1)
505               cprintf ("Adapting to char = %s, thr= %g\n",
506                        unicharset.id_to_unichar(
507                            unicharset.unichar_to_id(BestChoice_string,
508                                                     *BestChoice_lengths)),
509                            *Threshold);
510             #endif
511             AdaptToChar(Blob, &LineStats,
512                         unicharset.unichar_to_id(BestChoice_string,
513                                                  *BestChoice_lengths),
514                         *Threshold);
515           }
516 //        }
517 //        else
518 //          AdaptToPunc(Blob, &LineStats,
519 //                      unicharset.unichar_to_id(BestChoice_string,
520 //                                               *BestChoice_lengths),
521 //                      *Threshold);
522       }
523     }
524     if (classify_learning_debug_level >= 1)
525       cprintf ("\n");
526   }
527 }                                /* AdaptToWord */
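/* Worked example of the 'i'/'I' special rule above (illustrative): for the
   word "if", the single-blob leading 'i' is skipped ("Rejecting char") because
   it is the first blob in the word; for "fix", a normal two-outline 'i' that
   follows a letter falls through to AdaptToChar() as usual. */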
528 
529 
530 /*---------------------------------------------------------------------------*/
531 void Classify::EndAdaptiveClassifier() {
532 /*
533  **                         Parameters: none
534  **                         Globals:
535  **                         AdaptedTemplates
536  **                         current set of adapted templates
537  **                         classify_save_adapted_templates
538  **                         TRUE if templates should be saved
539  **                         classify_enable_adaptive_matcher
540  **                         TRUE if adaptive matcher is enabled
541  **                         Operation: This routine performs cleanup operations
542  **                         on the adaptive classifier.  It should be called
543  **                         before the program is terminated.  Its main function
544  **                         is to save the adapted templates to a file.
545  **                         Return: none
546  **                         Exceptions: none
547  **                         History: Tue Mar 19 14:37:06 1991, DSJ, Created.
548 */
549   STRING Filename;
550   FILE *File;
551 
552   #ifndef SECURE_NAMES
553   if (AdaptedTemplates != NULL &&
554       classify_enable_adaptive_matcher && classify_save_adapted_templates) {
555     Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
556     File = fopen (Filename.string(), "wb");
557     if (File == NULL)
558       cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
559     else {
560       cprintf ("\nSaving adapted templates to %s ...", Filename.string());
561       fflush(stdout);
562       WriteAdaptedTemplates(File, AdaptedTemplates);
563       cprintf ("\n");
564       fclose(File);
565     }
566   }
567   #endif
568 
569   if (AdaptedTemplates != NULL) {
570     free_adapted_templates(AdaptedTemplates);
571     AdaptedTemplates = NULL;
572   }
573 
574   if (PreTrainedTemplates != NULL) {
575     free_int_templates(PreTrainedTemplates);
576     PreTrainedTemplates = NULL;
577   }
578   getDict().EndDangerousAmbigs();
579   FreeNormProtos();
580   if (AllProtosOn != NULL) {
581     FreeBitVector(AllProtosOn);
582     FreeBitVector(PrunedProtos);
583     FreeBitVector(AllConfigsOn);
584     FreeBitVector(AllProtosOff);
585     FreeBitVector(AllConfigsOff);
586     FreeBitVector(TempProtoMask);
587     AllProtosOn = NULL;
588     PrunedProtos = NULL;
589     AllConfigsOn = NULL;
590     AllProtosOff = NULL;
591     AllConfigsOff = NULL;
592     TempProtoMask = NULL;
593   }
594 }                                /* EndAdaptiveClassifier */
595 
596 
597 /*---------------------------------------------------------------------------*/
598 void Classify::InitAdaptiveClassifier() {
599 /*
600  **                         Parameters: none
601  **                         Globals:
602  **                         BuiltInTemplatesFile
603  **                         file to get built-in temps from
604  **                         BuiltInCutoffsFile
605  **                         file to get avg. feat per class from
606  **                         PreTrainedTemplates
607  **                         pre-trained configs and protos
608  **                         AdaptedTemplates
609  **                         templates adapted to current page
610  **                         CharNormCutoffs
611  **                         avg # of features per class
612  **                         AllProtosOn
613  **                         dummy proto mask with all bits 1
614  **                         AllConfigsOn
615  **                         dummy config mask with all bits 1
616  **                         classify_use_pre_adapted_templates
617  **                         enables use of pre-adapted templates
618  **                         Operation: This routine reads in the training
619  **                         information needed by the adaptive classifier
620  **                         and saves it into global variables.
621  **                         Return: none
622  **                         Exceptions: none
623  **                         History: Mon Mar 11 12:49:34 1991, DSJ, Created.
624 */
625   if (!classify_enable_adaptive_matcher)
626     return;
627   if (AllProtosOn != NULL)
628     EndAdaptiveClassifier();  // Don't leak with multiple inits.
629 
630   // If there is no language_data_path_prefix, the classifier will be
631   // adaptive only.
632   if (language_data_path_prefix.length() > 0) {
633     if (!tessdata_manager.SeekToStart(TESSDATA_INTTEMP)) {
634       inttemp_loaded_ = false;
635     } else {
636       PreTrainedTemplates =
637         ReadIntTemplates(tessdata_manager.GetDataFilePtr());
638       if (global_tessdata_manager_debug_level) tprintf("Loaded inttemp\n");
639 
640       ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_PFFMTABLE));
641       ReadNewCutoffs(tessdata_manager.GetDataFilePtr(),
642                      tessdata_manager.GetEndOffset(TESSDATA_PFFMTABLE),
643                      CharNormCutoffs);
644       if (global_tessdata_manager_debug_level) tprintf("Loaded pffmtable\n");
645 
646       ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_NORMPROTO));
647       NormProtos =
648         ReadNormProtos(tessdata_manager.GetDataFilePtr(),
649                        tessdata_manager.GetEndOffset(TESSDATA_NORMPROTO));
650       if (global_tessdata_manager_debug_level) tprintf("Loaded normproto\n");
651 
652       inttemp_loaded_ = true;
653     }
654   }
655 
656   InitIntegerMatcher();
657   InitIntegerFX();
658 
659   AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
660   PrunedProtos = NewBitVector(MAX_NUM_PROTOS);
661   AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
662   AllProtosOff = NewBitVector(MAX_NUM_PROTOS);
663   AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
664   TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
665   set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
666   set_all_bits(PrunedProtos, WordsInVectorOfSize(MAX_NUM_PROTOS));
667   set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
668   zero_all_bits(AllProtosOff, WordsInVectorOfSize(MAX_NUM_PROTOS));
669   zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));
670 
671   if (classify_use_pre_adapted_templates) {
672     FILE *File;
673     STRING Filename;
674 
675     Filename = imagefile;
676     Filename += ADAPT_TEMPLATE_SUFFIX;
677     File = fopen(Filename.string(), "rb");
678     if (File == NULL) {
679       AdaptedTemplates = NewAdaptedTemplates(true);
680     } else {
681       #ifndef SECURE_NAMES
682       cprintf("\nReading pre-adapted templates from %s ...\n",
683               Filename.string());
684       fflush(stdout);
685       #endif
686       AdaptedTemplates = ReadAdaptedTemplates(File);
687       cprintf("\n");
688       fclose(File);
689       PrintAdaptedTemplates(stdout, AdaptedTemplates);
690 
691       for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
692         BaselineCutoffs[i] = CharNormCutoffs[i];
693       }
694     }
695   } else {
696     if (AdaptedTemplates != NULL)
697       free_adapted_templates(AdaptedTemplates);
698     AdaptedTemplates = NewAdaptedTemplates(true);
699   }
700 }                                /* InitAdaptiveClassifier */
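/* Typical lifecycle implied by the routines in this file (sketch only; these
   are all Classify methods and would normally be invoked on a Classify object):

     InitAdaptiveClassifier();    // load inttemp / pffmtable / normproto, build bit masks
     SettupPass1();               // learning enabled for pass 1
     //   ... AdaptiveClassifier() per blob, AdaptToWord() per accepted word ...
     SettupPass2();               // learning frozen for pass 2
     EndAdaptiveClassifier();     // optionally save adapted templates, free everything
*/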
701 
702 void Classify::ResetAdaptiveClassifier() {
703   free_adapted_templates(AdaptedTemplates);
704   AdaptedTemplates = NULL;
705 }
706 }  // namespace tesseract
707 
708 
709 /*---------------------------------------------------------------------------*/
710 namespace tesseract {
711 void Classify::PrintAdaptiveStatistics(FILE *File) {
712 /*
713  **                         Parameters:
714  **                         File
715  **                         open text file to print adaptive statistics to
716  **                         Globals: none
717  **                         Operation: Print to File the statistics which have
718  **                         been gathered for the adaptive matcher.
719  **                         Return: none
720  **                         Exceptions: none
721  **                         History: Thu Apr 18 14:37:37 1991, DSJ, Created.
722 */
723   #ifndef SECURE_NAMES
724 
725   fprintf (File, "\nADAPTIVE MATCHER STATISTICS:\n");
726   fprintf (File, "\tNum blobs classified = %d\n", AdaptiveMatcherCalls);
727   fprintf (File, "\tNum classes output   = %d (Avg = %4.2f)\n",
728     NumClassesOutput,
729     ((AdaptiveMatcherCalls == 0) ? (0.0) :
730   ((float) NumClassesOutput / AdaptiveMatcherCalls)));
731   fprintf (File, "\t\tBaseline Classifier: %4d calls (%4.2f classes/call)\n",
732     BaselineClassifierCalls,
733     ((BaselineClassifierCalls == 0) ? (0.0) :
734   ((float) NumBaselineClassesTried / BaselineClassifierCalls)));
735   fprintf (File, "\t\tCharNorm Classifier: %4d calls (%4.2f classes/call)\n",
736     CharNormClassifierCalls,
737     ((CharNormClassifierCalls == 0) ? (0.0) :
738   ((float) NumCharNormClassesTried / CharNormClassifierCalls)));
739   fprintf (File, "\t\tAmbig    Classifier: %4d calls (%4.2f classes/call)\n",
740     AmbigClassifierCalls,
741     ((AmbigClassifierCalls == 0) ? (0.0) :
742   ((float) NumAmbigClassesTried / AmbigClassifierCalls)));
743 
744   fprintf (File, "\nADAPTIVE LEARNER STATISTICS:\n");
745   fprintf (File, "\tNumber of words adapted to: %d\n", NumWordsAdaptedTo);
746   fprintf (File, "\tNumber of chars adapted to: %d\n", NumCharsAdaptedTo);
747 
748   PrintAdaptedTemplates(File, AdaptedTemplates);
749   #endif
750 }                                /* PrintAdaptiveStatistics */
751 
752 
753 /*---------------------------------------------------------------------------*/
754 void Classify::SettupPass1() {
755 /*
756  **                         Parameters: none
757  **                         Globals:
758  **                         EnableLearning
759  **                         set to TRUE by this routine
760  **                         Operation: This routine prepares the adaptive
761  **                         matcher for the start
762  **                         of the first pass.  Learning is enabled (unless it
763  **                         is disabled for the whole program).
764  **                         Return: none
765  **                         Exceptions: none
766  **                         History: Mon Apr 15 16:39:29 1991, DSJ, Created.
767 */
768   /* Note: this is somewhat redundant, it simply says that if learning is
769   enabled then it will remain enabled on the first pass.  If it is
770   disabled, then it will remain disabled.  This is only put here to
771   make it very clear that learning is controlled directly by the global
772     setting of EnableLearning. */
773   EnableLearning = classify_enable_learning;
774 
775   getDict().SettupStopperPass1();
776 
777 }                                /* SettupPass1 */
778 
779 
780 /*---------------------------------------------------------------------------*/
781 void Classify::SettupPass2() {
782 /*
783  **                         Parameters: none
784  **                         Globals:
785  **                         EnableLearning
786  **                         set to FALSE by this routine
787  **                         Operation: This routine prepares the adaptive
788  **                         matcher for the start of the second pass.  Further
789  **                         learning is disabled.
790  **                         Return: none
791  **                         Exceptions: none
792  **                         History: Mon Apr 15 16:39:29 1991, DSJ, Created.
793 */
794   EnableLearning = FALSE;
795   getDict().SettupStopperPass2();
796 
797 }                                /* SettupPass2 */
798 
799 
800 /*---------------------------------------------------------------------------*/
801 void Classify::InitAdaptedClass(TBLOB *Blob,
802                                 LINE_STATS *LineStats,
803                                 CLASS_ID ClassId,
804                                 ADAPT_CLASS Class,
805                                 ADAPT_TEMPLATES Templates) {
806 /*
807  **                          Parameters:
808  **                          Blob
809  **                          blob to model new class after
810  **                          LineStats
811  **                          statistics for text row blob is in
812  **                          ClassId
813  **                          id of the class to be initialized
814  **                          Class
815  **                          adapted class to be initialized
816  **                          Templates
817  **                          adapted templates to add new class to
818  **                          Globals:
819  **                          AllProtosOn
820  **                          dummy mask with all 1's
821  **                          BaselineCutoffs
822  **                          kludge needed to get cutoffs
823  **                          PreTrainedTemplates
824  **                          kludge needed to get cutoffs
825  **                          Operation: This routine creates a new adapted
826  **                          class and uses Blob as the model for the first
827  **                          config in that class.
828  **                          Return: none
829  **                          Exceptions: none
830  **                          History: Thu Mar 14 12:49:39 1991, DSJ, Created.
831 */
832   FEATURE_SET Features;
833   int Fid, Pid;
834   FEATURE Feature;
835   int NumFeatures;
836   TEMP_PROTO TempProto;
837   PROTO Proto;
838   INT_CLASS IClass;
839   TEMP_CONFIG Config;
840 
841   classify_norm_method.set_value(baseline);
842   Features = ExtractOutlineFeatures (Blob, LineStats);
843   NumFeatures = Features->NumFeatures;
844   if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
845     FreeFeatureSet(Features);
846     return;
847   }
848 
849   Config = NewTempConfig (NumFeatures - 1);
850   TempConfigFor (Class, 0) = Config;
851 
852   /* this is a kludge to construct cutoffs for adapted templates */
853   if (Templates == AdaptedTemplates)
854     BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
855 
856   IClass = ClassForClassId (Templates->Templates, ClassId);
857 
858   for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
859     Pid = AddIntProto (IClass);
860     assert (Pid != NO_PROTO);
861 
862     Feature = Features->Features[Fid];
863     TempProto = NewTempProto ();
864     Proto = &(TempProto->Proto);
865 
866     /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
867        ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
868        instead of the -0.25 to 0.75 used in baseline normalization */
869     Proto->Angle = Feature->Params[OutlineFeatDir];
870     Proto->X = Feature->Params[OutlineFeatX];
871     Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
872     Proto->Length = Feature->Params[OutlineFeatLength];
873     FillABC(Proto);
874 
875     TempProto->ProtoId = Pid;
876     SET_BIT (Config->Protos, Pid);
877 
878     ConvertProto(Proto, Pid, IClass);
879     AddProtoToProtoPruner(Proto, Pid, IClass);
880 
881     Class->TempProtos = push (Class->TempProtos, TempProto);
882   }
883   FreeFeatureSet(Features);
884 
885   AddIntConfig(IClass);
886   ConvertConfig (AllProtosOn, 0, IClass);
887 
888   if (classify_learning_debug_level >= 1) {
889     cprintf ("Added new class '%s' with class id %d and %d protos.\n",
890              unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
891   }
892 
893   if (IsEmptyAdaptedClass(Class))
894     (Templates->NumNonEmptyClasses)++;
895 }                                /* InitAdaptedClass */
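/* Note on Y_DIM_OFFSET above (an illustrative explanation with an assumed
   value): baseline normalization places feature Y values in roughly
   [-0.25, 0.75], while ConvertProto() expects [-0.5, 0.5], so Feature Y minus
   Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT) re-centers the range.  If that
   offset works out to 0.25, a feature at Y = 0.75 becomes a proto at Y = 0.5. */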
896 }  // namespace tesseract
897 
898 
899 /*---------------------------------------------------------------------------*/
900 int GetAdaptiveFeatures(TBLOB *Blob,
901                         LINE_STATS *LineStats,
902                         INT_FEATURE_ARRAY IntFeatures,
903                         FEATURE_SET *FloatFeatures) {
904 /*
905  **                         Parameters:
906  **                         Blob
907  **                         blob to extract features from
908  **                         LineStats
909  **                         statistics about text row blob is in
910  **                         IntFeatures
911  **                         array to fill with integer features
912  **                         FloatFeatures
913  **                         place to return actual floating-pt features
914  **                         Globals: none
915  **                         Operation: This routine sets up the feature
916  **                         extractor to extract baseline normalized
917  **                         pico-features.
918  **                         The extracted pico-features are converted
919  **                         to integer form and placed in IntFeatures. The
920  **                         original floating-pt. features are returned in
921  **                         FloatFeatures.
922  **                         Return: Number of pico-features returned (0 if
923  **                         an error occurred)
924  **                         Exceptions: none
925  **                         History: Tue Mar 12 17:55:18 1991, DSJ, Created.
926 */
927   FEATURE_SET Features;
928   int NumFeatures;
929 
930   classify_norm_method.set_value(baseline);
931   Features = ExtractPicoFeatures (Blob, LineStats);
932 
933   NumFeatures = Features->NumFeatures;
934   if (NumFeatures > UNLIKELY_NUM_FEAT) {
935     FreeFeatureSet(Features);
936     return (0);
937   }
938 
939   ComputeIntFeatures(Features, IntFeatures);
940   *FloatFeatures = Features;
941 
942   return (NumFeatures);
943 
944 }                                /* GetAdaptiveFeatures */
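/* Usage sketch for GetAdaptiveFeatures(), mirroring the call in AdaptToChar()
   below (variable names are local to this illustration):

     INT_FEATURE_ARRAY IntFeatures;
     FEATURE_SET FloatFeatures;
     int n = GetAdaptiveFeatures(Blob, LineStats, IntFeatures, &FloatFeatures);
     if (n > 0) {
       // ... feed IntFeatures to IntegerMatcher() ...
       FreeFeatureSet(FloatFeatures);   // caller owns the float features on success
     }
*/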
945 
946 
947 /**----------------------------------------------------------------------------
948               Private Code
949 ----------------------------------------------------------------------------**/
950 /*---------------------------------------------------------------------------*/
951 namespace tesseract {
952 int Classify::AdaptableWord(TWERD *Word,
953                             const WERD_CHOICE &BestChoiceWord,
954                             const WERD_CHOICE &RawChoiceWord) {
955 /*
956  **                         Parameters:
957  **                         Word
958  **                         current word
959  **                         BestChoice
960  **                         best overall choice for word with context
961  **                         BestRawChoice
962  **                         best choice for word without context
963  **                         Globals: none
964  **                         Operation: Return TRUE if the specified word is
965  **                         acceptable for adaptation.
966  **                         Return: TRUE or FALSE
967  **                         Exceptions: none
968  **                         History: Thu May 30 14:25:06 1991, DSJ, Created.
969 */
970   int BestChoiceLength = BestChoiceWord.length();
971   return (  // rules that apply in general - simplest to compute first
972     BestChoiceLength > 0 &&
973     BestChoiceLength == NumBlobsIn (Word) &&
974     BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE && (
975     (classify_enable_new_adapt_rules &&
976      getDict().CurrentBestChoiceAdjustFactor() <= ADAPTABLE_WERD &&
977      getDict().AlternativeChoicesWorseThan(ADAPTABLE_WERD) &&
978      getDict().CurrentBestChoiceIs(BestChoiceWord)) ||
979     (!classify_enable_new_adapt_rules &&  // old rules
980      BestChoiceLength == RawChoiceWord.length() &&
981      ((getDict().valid_word_or_number(BestChoiceWord) &&
982        Context::case_ok(BestChoiceWord, getDict().getUnicharset()))))));
983 }
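/* Worked example of the general gates above (illustrative): a 3-character best
   choice for a word with 3 blobs passes the length checks
   (0 < 3 == NumBlobsIn(Word) <= MAX_ADAPTABLE_WERD_SIZE), after which the
   dictionary / adjust-factor rules decide; a 45-character word already fails
   MAX_ADAPTABLE_WERD_SIZE (40) and is never adapted to. */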
984 
985 /*---------------------------------------------------------------------------*/
986   void Classify::AdaptToChar(TBLOB *Blob,
987                              LINE_STATS *LineStats,
988                              CLASS_ID ClassId,
989                              FLOAT32 Threshold) {
990 /*
991  **                         Parameters:
992  **                         Blob
993  **                         blob to add to templates for ClassId
994  **                         LineStats
995  **                         statistics about text line blob is in
996  **                         ClassId
997  **                         class to add blob to
998  **                         Threshold
999  **                         minimum match rating to existing template
1000  **                         Globals:
1001  **                         AdaptedTemplates
1002  **                         current set of adapted templates
1003  **                         AllProtosOn
1004  **                         dummy mask to match against all protos
1005  **                         AllConfigsOn
1006  **                         dummy mask to match against all configs
1007  **                         Operation: Add Blob as a new example of ClassId to the adapted templates.
1008  **                         Return: none
1009  **                         Exceptions: none
1010  **                         History: Thu Mar 14 09:36:03 1991, DSJ, Created.
1011 */
1012   int NumFeatures;
1013   INT_FEATURE_ARRAY IntFeatures;
1014   INT_RESULT_STRUCT IntResult;
1015   INT_CLASS IClass;
1016   ADAPT_CLASS Class;
1017   TEMP_CONFIG TempConfig;
1018   FEATURE_SET FloatFeatures;
1019   int NewTempConfigId;
1020 
1021   NumCharsAdaptedTo++;
1022   if (!LegalClassId (ClassId))
1023     return;
1024 
1025   Class = AdaptedTemplates->Class[ClassId];
1026   assert(Class != NULL);
1027   if (IsEmptyAdaptedClass(Class)) {
1028     InitAdaptedClass(Blob, LineStats, ClassId, Class, AdaptedTemplates);
1029   }
1030   else {
1031     IClass = ClassForClassId (AdaptedTemplates->Templates, ClassId);
1032 
1033     NumFeatures = GetAdaptiveFeatures (Blob, LineStats,
1034       IntFeatures, &FloatFeatures);
1035     if (NumFeatures <= 0)
1036       return;
1037 
1038     SetBaseLineMatch();
1039     IntegerMatcher (IClass, AllProtosOn, AllConfigsOn,
1040       NumFeatures, NumFeatures, IntFeatures, 0,
1041       &IntResult, NO_DEBUG);
1042 
1043     SetAdaptiveThreshold(Threshold);
1044 
1045     if (IntResult.Rating <= Threshold) {
1046       if (ConfigIsPermanent (Class, IntResult.Config)) {
1047         if (classify_learning_debug_level >= 1)
1048           cprintf ("Found good match to perm config %d = %4.1f%%.\n",
1049             IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
1050         FreeFeatureSet(FloatFeatures);
1051         return;
1052       }
1053 
1054       TempConfig = TempConfigFor (Class, IntResult.Config);
1055       IncreaseConfidence(TempConfig);
1056       if (classify_learning_debug_level >= 1)
1057         cprintf ("Increasing reliability of temp config %d to %d.\n",
1058           IntResult.Config, TempConfig->NumTimesSeen);
1059 
1060       if (TempConfigReliable (TempConfig))
1061         MakePermanent (AdaptedTemplates, ClassId, IntResult.Config,
1062           Blob, LineStats);
1063     }
1064     else {
1065       if (classify_learning_debug_level >= 1)
1066         cprintf ("Found poor match to temp config %d = %4.1f%%.\n",
1067           IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
1068       NewTempConfigId = MakeNewTemporaryConfig(AdaptedTemplates,
1069                                                ClassId,
1070                                                NumFeatures,
1071                                                IntFeatures,
1072                                                FloatFeatures);
1073 
1074       if (NewTempConfigId >= 0 &&
1075           TempConfigReliable (TempConfigFor (Class, NewTempConfigId)))
1076         MakePermanent (AdaptedTemplates, ClassId, NewTempConfigId,
1077                        Blob, LineStats);
1078 
1079 #ifndef GRAPHICS_DISABLED
1080       if (classify_learning_debug_level >= 1) {
1081         IntegerMatcher (IClass, AllProtosOn, AllConfigsOn,
1082           NumFeatures, NumFeatures, IntFeatures, 0,
1083           &IntResult, NO_DEBUG);
1084         cprintf ("Best match to temp config %d = %4.1f%%.\n",
1085           IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
1086         if (classify_learning_debug_level >= 2) {
1087           uinT32 ConfigMask;
1088           ConfigMask = 1 << IntResult.Config;
1089           ShowMatchDisplay();
1090           IntegerMatcher (IClass, AllProtosOn, (BIT_VECTOR)&ConfigMask,
1091             NumFeatures, NumFeatures, IntFeatures, 0,
1092             &IntResult, 6 | 0x19);
1093           UpdateMatchDisplay();
1094           GetClassToDebug ("Adapting");
1095         }
1096       }
1097 #endif
1098     }
1099     FreeFeatureSet(FloatFeatures);
1100   }
1101 }                                /* AdaptToChar */
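/* Decision flow of AdaptToChar() in brief (a sketch of the code above, with
   sample numbers for concreteness):

     Rating 0.10 <= Threshold 0.125  -> good match:
       permanent config : nothing more to do;
       temporary config : NumTimesSeen++, and once it reaches
                          matcher_min_examples_for_prototyping (3) -> MakePermanent().
     Rating 0.30 >  Threshold 0.125  -> poor match:
       MakeNewTemporaryConfig() builds a new temp config from the blob's features.
*/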
1102 
1103 
1104 /*---------------------------------------------------------------------------*/
1105 void Classify::AdaptToPunc(TBLOB *Blob,
1106                            LINE_STATS *LineStats,
1107                            CLASS_ID ClassId,
1108                            FLOAT32 Threshold) {
1109 /*
1110  **                         Parameters:
1111  **                         Blob
1112  **                         blob to add to templates for ClassId
1113  **                         LineStats
1114  **                         statistics about text line blob is in
1115  **                         ClassId
1116  **                         class to add blob to
1117  **                         Threshold
1118  **                         minimum match rating to existing template
1119  **                         Globals:
1120  **                         PreTrainedTemplates
1121  **                         current set of built-in templates
1122  **                         Operation: Adapt to Blob as punctuation, but only when the built-in classifier finds a single unambiguous match for it.
1123  **                         Return: none
1124  **                         Exceptions: none
1125  **                         History: Thu Mar 14 09:36:03 1991, DSJ, Created.
1126 */
1127   ADAPT_RESULTS *Results = new ADAPT_RESULTS();
1128   int i;
1129 
1130   Results->Initialize();
1131   CharNormClassifier(Blob, LineStats, PreTrainedTemplates, Results);
1132   RemoveBadMatches(Results);
1133 
1134   if (Results->NumMatches != 1) {
1135     if (classify_learning_debug_level >= 1) {
1136       cprintf ("Rejecting punc = %s (Alternatives = ",
1137                unicharset.id_to_unichar(ClassId));
1138 
1139       for (i = 0; i < Results->NumMatches; i++)
1140         cprintf ("%s", unicharset.id_to_unichar(Results->Classes[i]));
1141       cprintf (")\n");
1142     }
1143   } else {
1144 
1145   #ifndef SECURE_NAMES
1146     if (classify_learning_debug_level >= 1)
1147       cprintf ("Adapting to punc = %s, thr= %g\n",
1148                unicharset.id_to_unichar(ClassId), Threshold);
1149   #endif
1150     AdaptToChar(Blob, LineStats, ClassId, Threshold);
1151   }
1152   delete Results;
1153 }                                /* AdaptToPunc */
1154 
1155 
1156 /*---------------------------------------------------------------------------*/
1157 void Classify::AddNewResult(ADAPT_RESULTS *Results,
1158                             CLASS_ID ClassId,
1159                             FLOAT32 Rating,
1160                             int ConfigId) {
1161 /*
1162  **                         Parameters:
1163  **                         Results
1164  **                         results to add new result to
1165  **                         ClassId
1166  **                         class of new result
1167  **                         Rating
1168  **                         rating of new result
1169  **                         ConfigId
1170  **                         config id of new result
1171  **                         Globals:
1172  **                         matcher_bad_match_pad
1173  **                         defines limits of an acceptable match
1174  **                         Operation: This routine adds the result of a classification into
1175  **                         Results.  If the new rating is much worse than the current
1176  **                         best rating, it is not entered into results because it
1177  **                         would end up being stripped later anyway.  If the new rating
1178  **                         is better than the old rating for the class, it replaces the
1179  **                         old rating.  If this is the first rating for the class, the
1180  **                         class is added to the list of matched classes in Results.
1181  **                         If the new rating is better than the best so far, it
1182  **                         becomes the best so far.
1183  **                         Return: none
1184  **                         Exceptions: none
1185  **                         History: Tue Mar 12 18:19:29 1991, DSJ, Created.
1186 */
1187   FLOAT32 OldRating;
1188   INT_CLASS_STRUCT* CharClass = NULL;
1189 
1190   OldRating = Results->Ratings[ClassId];
1191   if (Rating <= Results->BestRating + matcher_bad_match_pad && Rating < OldRating) {
1192     if (!unicharset.get_fragment(ClassId)) {
1193       Results->HasNonfragment = true;
1194     }
1195     Results->Ratings[ClassId] = Rating;
1196     if (ClassId != NO_CLASS)
1197       CharClass = ClassForClassId(PreTrainedTemplates, ClassId);
1198     if (CharClass != NULL)
1199       Results->Configs[ClassId] = ConfigId;
1200     else
1201       Results->Configs[ClassId] = ~0;
1202 
1203     if (Rating < Results->BestRating &&
1204         // Ensure that fragments do not affect best rating, class and config.
1205         // This is needed so that at least one non-fragmented character is
1206         // always present in the Results.
1207         // TODO(daria): verify that this helps accuracy and does not
1208         // hurt performance.
1209         !unicharset.get_fragment(ClassId)) {
1210       Results->BestRating = Rating;
1211       Results->BestClass = ClassId;
1212       Results->BestConfig = ConfigId;
1213     }
1214 
1215     /* if this is first rating for class, add to list of classes matched */
1216     if (OldRating == WORST_POSSIBLE_RATING)
1217       Results->Classes[Results->NumMatches++] = ClassId;
1218   }
1219 }                                /* AddNewResult */
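/* Numeric illustration of the acceptance test above: with
   matcher_bad_match_pad = 0.15 and a current BestRating of 0.10, a new rating
   of 0.20 for a class still at WORST_POSSIBLE_RATING (1.0) is stored and the
   class is appended to Results->Classes; a new rating of 0.30 exceeds
   0.10 + 0.15 = 0.25 and is dropped. */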
1220 
1221 
1222 /*---------------------------------------------------------------------------*/
1223 void Classify::AmbigClassifier(TBLOB *Blob,
1224                                LINE_STATS *LineStats,
1225                                INT_TEMPLATES Templates,
1226                                UNICHAR_ID *Ambiguities,
1227                                ADAPT_RESULTS *Results) {
1228 /*
1229  **                         Parameters:
1230  **                         Blob
1231  **                         blob to be classified
1232  **                         LineStats
1233  **                         statistics for text line Blob is in
1234  **                         Templates
1235  **                         built-in templates to classify against
1236  **                         Ambiguities
1237  **                         array of class id's to match against
1238  **                         Results
1239  **                         place to put match results
1240  **                         Globals:
1241  **                         AllProtosOn
1242  **                         mask that enables all protos
1243  **                         AllConfigsOn
1244  **                         mask that enables all configs
1245  **                         Operation: This routine is identical to CharNormClassifier()
1246  **                         except that it does no class pruning.  It simply matches
1247  **                         the unknown blob against the classes listed in
1248  **                         Ambiguities.
1249  **                         Return: none
1250  **                         Exceptions: none
1251  **                         History: Tue Mar 12 19:40:36 1991, DSJ, Created.
1252 */
1253   int NumFeatures;
1254   INT_FEATURE_ARRAY IntFeatures;
1255   CLASS_NORMALIZATION_ARRAY CharNormArray;
1256   INT_RESULT_STRUCT IntResult;
1257   CLASS_ID ClassId;
1258 
1259   AmbigClassifierCalls++;
1260 
1261   NumFeatures = GetCharNormFeatures (Blob, LineStats,
1262     Templates,
1263     IntFeatures, CharNormArray,
1264     &(Results->BlobLength));
1265   if (NumFeatures <= 0)
1266     return;
1267 
1268   if (matcher_debug_level >= 2)
1269     cprintf ("AM Matches =  ");
1270 
1271   while (*Ambiguities >= 0) {
1272     ClassId = *Ambiguities;
1273 
1274     SetCharNormMatch();
1275     IntegerMatcher (ClassForClassId (Templates, ClassId),
1276       AllProtosOn, AllConfigsOn,
1277       Results->BlobLength, NumFeatures, IntFeatures,
1278       CharNormArray[ClassId], &IntResult, NO_DEBUG);
1279 
1280     if (matcher_debug_level >= 2)
1281       cprintf ("%s-%-2d %2.0f  ", unicharset.id_to_unichar(ClassId),
1282                IntResult.Config,
1283                IntResult.Rating * 100.0);
1284 
1285     AddNewResult (Results, ClassId, IntResult.Rating, IntResult.Config);
1286 
1287     Ambiguities++;
1288 
1289     NumAmbigClassesTried++;
1290   }
1291   if (matcher_debug_level >= 2)
1292     cprintf ("\n");
1293 
1294 }                                /* AmbigClassifier */
1295 
1296 /*---------------------------------------------------------------------------*/
1297 // Factored-out calls to IntegerMatcher based on class pruner results.
1298 // Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.
1299 void Classify::MasterMatcher(INT_TEMPLATES templates,
1300                              inT16 num_features,
1301                              INT_FEATURE_ARRAY features,
1302                              CLASS_NORMALIZATION_ARRAY norm_factors,
1303                              ADAPT_CLASS* classes,
1304                              int debug,
1305                              int num_classes,
1306                              CLASS_PRUNER_RESULTS results,
1307                              ADAPT_RESULTS* final_results) {
1308   for (int c = 0; c < num_classes; c++) {
1309     CLASS_ID class_id = results[c].Class;
1310     INT_RESULT_STRUCT& int_result = results[c].IMResult;
1311     BIT_VECTOR protos = classes != NULL ? classes[class_id]->PermProtos
1312                                         : AllProtosOn;
1313     BIT_VECTOR configs = classes != NULL ? classes[class_id]->PermConfigs
1314                                          : AllConfigsOn;
1315 
1316     IntegerMatcher(ClassForClassId(templates, class_id),
1317                    protos, configs, final_results->BlobLength,
1318                    num_features, features, norm_factors[class_id],
1319                    &int_result, debug);
1320     // Compute class feature corrections.
1321     double miss_penalty = tessedit_class_miss_scale *
1322                           int_result.FeatureMisses;
1323     if (matcher_debug_level >= 2 || tord_display_ratings > 1) {
1324       cprintf("%s-%-2d %2.1f(CP%2.1f, IM%2.1f + MP%2.1f)  ",
1325               unicharset.id_to_unichar(class_id), int_result.Config,
1326               (int_result.Rating + miss_penalty) * 100.0,
1327               results[c].Rating * 100.0,
1328               int_result.Rating * 100.0, miss_penalty * 100.0);
1329       if (c % 4 == 3)
1330         cprintf ("\n");
1331     }
1332     int_result.Rating += miss_penalty;
1333     if (int_result.Rating > WORST_POSSIBLE_RATING)
1334       int_result.Rating = WORST_POSSIBLE_RATING;
1335     AddNewResult(final_results, class_id, int_result.Rating, int_result.Config);
1336     // Add unichars ambiguous with class_id with the same rating as class_id.
1337     if (use_definite_ambigs_for_classifier) {
1338       const UnicharIdVector *definite_ambigs =
1339         getDict().getUnicharAmbigs().OneToOneDefiniteAmbigs(class_id);
1340       int ambigs_size = (definite_ambigs == NULL) ? 0 : definite_ambigs->size();
1341       for (int ambig = 0; ambig < ambigs_size; ++ambig) {
1342         UNICHAR_ID ambig_class_id = (*definite_ambigs)[ambig];
1343         if (matcher_debug_level >= 3) {
1344           tprintf("class: %d definite ambig: %d rating: old %.4f new %.4f\n",
1345                   class_id, ambig_class_id,
1346                   final_results->Ratings[ambig_class_id], int_result.Rating);
1347         }
1348         if (final_results->Ratings[ambig_class_id] < WORST_POSSIBLE_RATING) {
1349           // ambig_class_id was already added to final_results,
1350           // so just need to modify the rating.
1351           if (int_result.Rating < final_results->Ratings[ambig_class_id]) {
1352             final_results->Ratings[ambig_class_id] = int_result.Rating;
1353           }
1354         } else {
1355           AddNewResult(final_results, ambig_class_id,
1356                        int_result.Rating, int_result.Config);
1357         }
1358       }
1359     }
1360   }
1361   if (matcher_debug_level >= 2 || tord_display_ratings > 1)
1362     cprintf("\n");
1363 }
1364 }  // namespace tesseract
1365 
1366 /*---------------------------------------------------------------------------*/
1367 namespace tesseract {
1368 UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob,
1369                                          LINE_STATS *LineStats,
1370                                          ADAPT_TEMPLATES Templates,
1371                                          ADAPT_RESULTS *Results) {
/*
 **  Parameters:
 **    Blob        blob to be classified
 **    LineStats   statistics for text line Blob is in
 **    Templates   current set of adapted templates
 **    Results     place to put match results
 **  Globals:
 **    BaselineCutoffs   expected num features for each class
 **  Operation: This routine extracts baseline normalized features
 **    from the unknown character and matches them against the
 **    specified set of templates.  The classes which match
 **    are added to Results.
 **  Return: Array of possible ambiguous chars that should be checked.
 **  Exceptions: none
 **  History: Tue Mar 12 19:38:03 1991, DSJ, Created.
 */
1393   int NumFeatures;
1394   int NumClasses;
1395   INT_FEATURE_ARRAY IntFeatures;
1396   CLASS_NORMALIZATION_ARRAY CharNormArray;
1397   CLASS_ID ClassId;
1398 
1399   BaselineClassifierCalls++;
1400 
1401   NumFeatures = GetBaselineFeatures (Blob, LineStats,
1402     Templates->Templates,
1403     IntFeatures, CharNormArray,
1404     &(Results->BlobLength));
1405   if (NumFeatures <= 0)
1406     return NULL;
1407 
1408   NumClasses = ClassPruner (Templates->Templates, NumFeatures,
1409     IntFeatures, CharNormArray,
1410     BaselineCutoffs, Results->CPResults,
1411     matcher_debug_flags);
1412 
1413   NumBaselineClassesTried += NumClasses;
1414 
1415   if (matcher_debug_level >= 2 || tord_display_ratings > 1)
1416     cprintf ("BL Matches =  ");
1417 
1418   SetBaseLineMatch();
1419   MasterMatcher(Templates->Templates, NumFeatures, IntFeatures, CharNormArray,
1420                 Templates->Class, matcher_debug_flags, NumClasses,
1421                 Results->CPResults, Results);
1422 
1423   ClassId = Results->BestClass;
1424   if (ClassId == NO_CLASS)
1425     return (NULL);
1426   /* this is a bug - maybe should return "" */
1427 
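  /* For a permanent config this is the ambiguity list (a negative-terminated
     array of UNICHAR_IDs) that MakePermanent() attached to the config;
     DoAdaptiveMatch() passes it on to AmbigClassifier(). */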
1428   return (Templates->Class[ClassId]->Config[Results->BestConfig].Perm);
1429 }                                /* BaselineClassifier */
1430 
1431 
1432 /*---------------------------------------------------------------------------*/
1433 int Classify::CharNormClassifier(TBLOB *Blob,
1434                                  LINE_STATS *LineStats,
1435                                  INT_TEMPLATES Templates,
1436                                  ADAPT_RESULTS *Results) {
/*
 **  Parameters:
 **    Blob        blob to be classified
 **    LineStats   statistics for text line Blob is in
 **    Templates   templates to classify unknown against
 **    Results     place to put match results
 **  Globals:
 **    CharNormCutoffs   expected num features for each class
 **    AllProtosOn       mask that enables all protos
 **    AllConfigsOn      mask that enables all configs
 **  Operation: This routine extracts character normalized features
 **    from the unknown character and matches them against the
 **    specified set of templates.  The classes which match
 **    are added to Results.
 **  Return: Number of features extracted from the blob (0 if feature
 **    extraction failed).
 **  Exceptions: none
 **  History: Tue Mar 12 16:02:52 1991, DSJ, Created.
 */
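  /* Note: unlike BaselineClassifier(), this routine is given plain
     INT_TEMPLATES (in practice the pre-trained set), so MasterMatcher()
     is called with a NULL class array and therefore falls back to the
     AllProtosOn/AllConfigsOn masks. */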
1462   int NumFeatures;
1463   int NumClasses;
1464   INT_FEATURE_ARRAY IntFeatures;
1465   CLASS_NORMALIZATION_ARRAY CharNormArray;
1466 
1467   CharNormClassifierCalls++;
1468 
1469   NumFeatures = GetCharNormFeatures(Blob, LineStats,
1470     Templates,
1471     IntFeatures, CharNormArray,
1472     &(Results->BlobLength));
1473   if (NumFeatures <= 0)
1474     return 0;
1475 
1476   NumClasses = ClassPruner(Templates, NumFeatures,
1477                            IntFeatures, CharNormArray,
1478                            CharNormCutoffs, Results->CPResults,
1479                            matcher_debug_flags);
1480 
1481   if (tessedit_single_match && NumClasses > 1)
1482     NumClasses = 1;
1483   NumCharNormClassesTried += NumClasses;
1484 
1485   SetCharNormMatch();
1486   MasterMatcher(Templates, NumFeatures, IntFeatures, CharNormArray,
1487                 NULL, matcher_debug_flags, NumClasses,
1488                 Results->CPResults, Results);
1489   return NumFeatures;
1490 }                                /* CharNormClassifier */
1491 
1492 
1493 /*---------------------------------------------------------------------------*/
1494 void Classify::ClassifyAsNoise(ADAPT_RESULTS *Results) {
/*
 **  Parameters:
 **    Results   results to add noise classification to
 **  Globals:
 **    matcher_avg_noise_size   avg. length of a noise blob
 **  Operation: This routine computes a rating which reflects the
 **    likelihood that the blob being classified is a noise
 **    blob.  NOTE: assumes that the blob length has already been
 **    computed and placed into Results.
 **  Return: none
 **  Exceptions: none
 **  History: Tue Mar 12 18:36:52 1991, DSJ, Created.
 */
1510   register FLOAT32 Rating;
1511 
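  /* Rating = L*L / (1 + L*L), where L = BlobLength / matcher_avg_noise_size.
     This rises monotonically from 0 toward 1: blobs much shorter than the
     average noise blob get a good (low) noise rating, long blobs a poor one. */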
1512   Rating = Results->BlobLength / matcher_avg_noise_size;
1513   Rating *= Rating;
1514   Rating /= 1.0 + Rating;
1515 
1516   AddNewResult (Results, NO_CLASS, Rating, 0);
1517 }                                /* ClassifyAsNoise */
1518 }  // namespace tesseract
1519 
1520 
1521 /*---------------------------------------------------------------------------*/
1522 int CompareCurrentRatings(                     //CLASS_ID              *Class1,
1523                           const void *arg1,
1524                           const void *arg2) {  //CLASS_ID              *Class2)
/*
 **  Parameters:
 **    Class1, Class2   classes whose ratings are to be compared
 **  Globals:
 **    CurrentRatings   contains actual ratings for each class
 **  Operation: This routine gets the ratings for the 2 specified classes
 **    from a global variable (CurrentRatings) and returns:
 **      -1 if Rating1 < Rating2
 **       0 if Rating1 = Rating2
 **       1 if Rating1 > Rating2
 **  Return: Order of classes based on their ratings (see above).
 **  Exceptions: none
 **  History: Tue Mar 12 14:18:31 1991, DSJ, Created.
 */
1541   FLOAT32 Rating1, Rating2;
1542   CLASS_ID *Class1 = (CLASS_ID *) arg1;
1543   CLASS_ID *Class2 = (CLASS_ID *) arg2;
1544 
1545   Rating1 = CurrentRatings[*Class1];
1546   Rating2 = CurrentRatings[*Class2];
1547 
1548   if (Rating1 < Rating2)
1549     return (-1);
1550   else if (Rating1 > Rating2)
1551     return (1);
1552   else
1553     return (0);
1554 
1555 }                                /* CompareCurrentRatings */
1556 
1557 
1558 /*---------------------------------------------------------------------------*/
1559 // The function converts the given match ratings to the list of blob
1560 // choices with ratings and certainties (used by the context checkers).
1561 // If character fragments are present in the results, this function also makes
1562 // sure that there is at least one non-fragmented classification included.
1563 // For each classification result check the unicharset for "definite"
1564 // ambiguities and modify the resulting Choices accordingly.
1565 namespace tesseract {
1566 void Classify::ConvertMatchesToChoices(ADAPT_RESULTS *Results,
1567                                        BLOB_CHOICE_LIST *Choices) {
1568   assert(Choices != NULL);
1569   int i;
1570   CLASS_ID NextMatch;
1571   FLOAT32 Rating;
1572   FLOAT32 Certainty;
1573   BLOB_CHOICE_IT temp_it;
1574   bool contains_nonfrag = false;
1575   temp_it.set_to_list(Choices);
1576   int choices_length = 0;
1577   for (i = 0; i < Results->NumMatches; i++) {
1578     NextMatch = Results->Classes[i];
1579     bool current_is_frag = (unicharset.get_fragment(NextMatch) != NULL);
1580     if (temp_it.length()+1 == MAX_MATCHES &&
1581         !contains_nonfrag && current_is_frag) {
1582       continue;  // look for a non-fragmented character to fill the
1583                  // last spot in Choices if only fragments are present
1584     }
1585     // BlobLength can never be legally 0, this means recognition failed.
1586     // But we must return a classification result because some invoking
1587     // functions (chopper/permuter) do not anticipate a null blob choice.
1588     // So we need to assign a poor, but not infinitely bad score.
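    // Illustrative numbers for the normal branch below (the parameter values
    // are made up, not the configured defaults): a match rating of 0.1 with
    // BlobLength = 20, rating_scale = 1.5 and certainty_scale = 20 yields
    // Rating = 0.1 * 1.5 * 20 = 3.0 and Certainty = -20 * 0.1 = -2.0.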
1589     if (Results->BlobLength == 0) {
1590       Certainty = -20;
1591       Rating = 100;    // should be -certainty * real_blob_length
1592     } else {
1593       Rating = Certainty = Results->Ratings[NextMatch];
1594       Rating *= rating_scale * Results->BlobLength;
1595       Certainty *= -certainty_scale;
1596     }
1597     temp_it.add_to_end(new BLOB_CHOICE(NextMatch, Rating, Certainty,
1598                                        Results->Configs[NextMatch],
1599                                        unicharset.get_script(NextMatch)));
1600     contains_nonfrag |= !current_is_frag;  // update contains_nonfrag
1601     choices_length++;
1602     if (choices_length >= MAX_MATCHES) break;
1603   }
1604   Results->NumMatches = choices_length;
1605 }  // ConvertMatchesToChoices
1606 
1607 
1608 /*---------------------------------------------------------------------------*/
1609 #ifndef GRAPHICS_DISABLED
1610 void Classify::DebugAdaptiveClassifier(TBLOB *Blob,
1611                                        LINE_STATS *LineStats,
1612                                        ADAPT_RESULTS *Results) {
/*
 **  Parameters:
 **    Blob        blob whose classification is being debugged
 **    LineStats   statistics for text line blob is in
 **    Results     results of match being debugged
 **  Globals: none
 **  Operation: Displays the match between Blob and the class being
 **    debugged in the IntegerMatch window, then lets the user pick
 **    further classes to compare against (via GetClassToDebug).
 **  Return: none
 **  Exceptions: none
 **  History: Wed Mar 13 16:44:41 1991, DSJ, Created.
 */
1627   const char *Prompt =
1628     "Left-click in IntegerMatch Window to continue or right click to debug...";
1629   const char *DebugMode = "All Templates";
1630   CLASS_ID LastClass = Results->BestClass;
1631   CLASS_ID ClassId;
1632   BOOL8 AdaptiveOn = TRUE;
1633   BOOL8 PreTrainedOn = TRUE;
1634 
1635   ShowMatchDisplay();
1636   cprintf ("\nDebugging class = %s  (%s) ...\n",
1637            unicharset.id_to_unichar(LastClass), DebugMode);
1638   ShowBestMatchFor(Blob, LineStats, LastClass, AdaptiveOn, PreTrainedOn);
1639   UpdateMatchDisplay();
1640 
1641   while ((ClassId = GetClassToDebug (Prompt)) != 0) {
1642 #if 0
1643     switch (ClassId) {
1644       case 'b':
1645         AdaptiveOn = TRUE;
1646         PreTrainedOn = FALSE;
1647         DebugMode = "Adaptive Templates Only";
1648         break;
1649 
1650       case 'c':
1651         AdaptiveOn = FALSE;
1652         PreTrainedOn = TRUE;
1653         DebugMode = "PreTrained Templates Only";
1654         break;
1655 
1656       case 'a':
1657         AdaptiveOn = TRUE;
1658         PreTrainedOn = TRUE;
1659         DebugMode = "All Templates";
1660         break;
1661 
1662       default:
1663         LastClass = ClassId;
1664         break;
1665     }
1666 #endif
1667     LastClass = ClassId;
1668 
1669     ShowMatchDisplay();
1670     cprintf ("\nDebugging class = %d = %s  (%s) ...\n",
1671              LastClass, unicharset.id_to_unichar(LastClass), DebugMode);
1672     ShowBestMatchFor(Blob, LineStats, LastClass, AdaptiveOn, PreTrainedOn);
1673     UpdateMatchDisplay();
1674   }
1675 }                                /* DebugAdaptiveClassifier */
1676 #endif
1677 
1678 /*---------------------------------------------------------------------------*/
1679 void Classify::DoAdaptiveMatch(TBLOB *Blob,
1680                      LINE_STATS *LineStats,
1681                      ADAPT_RESULTS *Results) {
  /*
   **  Parameters:
   **    Blob        blob to be classified
   **    LineStats   statistics for text line Blob is in
   **    Results     place to put match results
   **  Globals:
   **    PreTrainedTemplates      built-in training templates
   **    AdaptedTemplates         templates adapted for this page
   **    matcher_great_threshold  rating limit for a great match
   **  Operation: This routine performs an adaptive classification.
   **    If we have not yet adapted to enough classes, a simple
   **    classification to the pre-trained templates is performed.
   **    Otherwise, we match the blob against the adapted templates.
   **    If the adapted templates do not match well, we try a
   **    match against the pre-trained templates.  If an adapted
   **    template match is found, we do a match to any pre-trained
   **    templates which could be ambiguous.  The results from all
   **    of these classifications are merged together into Results.
   **  Return: none
   **  Exceptions: none
   **  History: Tue Mar 12 08:50:11 1991, DSJ, Created.
   */
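  /* Sketch of the decision flow implemented below:
   *   - too few permanent adapted classes, or tess_cn_matching set:
   *       CharNormClassifier() against the pre-trained templates;
   *   - otherwise BaselineClassifier() against the adapted templates, then
   *       - no match, or only a marginal one (and !tess_bn_matching):
   *           fall back to CharNormClassifier();
   *       - good match that has known ambiguities:
   *           AmbigClassifier() against just those classes. */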
1710   UNICHAR_ID *Ambiguities;
1711 
1712   AdaptiveMatcherCalls++;
1713   InitIntFX();
1714 
1715   if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min
1716       || tess_cn_matching) {
1717     CharNormClassifier(Blob, LineStats, PreTrainedTemplates, Results);
1718   }
1719   else {
1720     Ambiguities = BaselineClassifier (Blob, LineStats,
1721                                       AdaptedTemplates, Results);
1722     if ((Results->NumMatches > 0 && MarginalMatch (Results->BestRating)
1723          && !tess_bn_matching) || Results->NumMatches == 0) {
1724       CharNormClassifier(Blob, LineStats, PreTrainedTemplates, Results);
1725     } else if (Ambiguities && *Ambiguities >= 0) {
1726       AmbigClassifier(Blob,
1727                       LineStats,
1728                       PreTrainedTemplates,
1729                       Ambiguities,
1730                       Results);
1731     }
1732   }
1733 
1734   // Force the blob to be classified as noise
1735   // if the results contain only fragments.
1736   // TODO(daria): verify that this is better than
1737   // just adding a NULL classification.
1738   if (!Results->HasNonfragment) {
1739     Results->NumMatches = 0;
1740   }
1741   if (Results->NumMatches == 0)
1742     ClassifyAsNoise(Results);
1743 }   /* DoAdaptiveMatch */
1744 
1745 /*---------------------------------------------------------------------------*/
1746 void
1747 Classify::GetAdaptThresholds (TWERD * Word,
1748                               LINE_STATS * LineStats,
1749                               const WERD_CHOICE& BestChoice,
1750                               const WERD_CHOICE& BestRawChoice,
1751                               FLOAT32 Thresholds[]) {
  /*
   **  Parameters:
   **    Word            current word
   **    LineStats       line stats for row word is in
   **    BestChoice      best choice for current word with context
   **    BestRawChoice   best choice for current word without context
   **    Thresholds      array of thresholds to be filled in
   **  Globals:
   **    classify_enable_new_adapt_rules
   **    matcher_good_threshold
   **    matcher_perfect_threshold
   **    matcher_rating_margin
   **  Operation: This routine tries to estimate how tight the adaptation
   **    threshold should be set for each character in the current
   **    word.  In general, the routine tries to set tighter
   **    thresholds for a character when the current set of templates
   **    would have made an error on that character.  It tries
   **    to set a threshold tight enough to eliminate the error.
   **    Two different sets of rules can be used to determine the
   **    desired thresholds.
   **  Return: none (results are returned in Thresholds)
   **  Exceptions: none
   **  History: Fri May 31 09:22:08 1991, DSJ, Created.
   */
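  /* For the old rules below: a correctly classified blob simply gets
     matcher_good_threshold; a misclassified one gets the best current rating
     for the correct class, tightened by matcher_rating_margin and clamped to
     [matcher_perfect_threshold, matcher_good_threshold].  Illustrative
     numbers (not the configured defaults): a best rating of 0.2 with a margin
     of 0.1 gives 0.2 * (1.0 - 0.1) = 0.18 before clamping. */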
1781   TBLOB *Blob;
1782   const char* BestChoice_string = BestChoice.unichar_string().string();
1783   const char* BestChoice_lengths = BestChoice.unichar_lengths().string();
1784   const char* BestRawChoice_string = BestRawChoice.unichar_string().string();
1785   const char* BestRawChoice_lengths = BestRawChoice.unichar_lengths().string();
1786 
1787   if (classify_enable_new_adapt_rules &&   /* new rules */
1788       getDict().CurrentBestChoiceIs(BestChoice)) {
1789     getDict().FindClassifierErrors(matcher_perfect_threshold,
1790                          matcher_good_threshold,
1791                          matcher_rating_margin,
1792                          Thresholds);
1793   }
1794   else {                       /* old rules */
1795     for (Blob = Word->blobs;
1796          Blob != NULL;
1797          Blob = Blob->next, BestChoice_string += *(BestChoice_lengths++),
1798          BestRawChoice_string += *(BestRawChoice_lengths++), Thresholds++)
1799       if (*(BestChoice_lengths) == *(BestRawChoice_lengths) &&
1800           strncmp(BestChoice_string, BestRawChoice_string,
1801                   *(BestChoice_lengths)) == 0)
1802         *Thresholds = matcher_good_threshold;
1803       else {
1804         /* the blob was incorrectly classified - find the rating threshold
1805            needed to create a template which will correct the error with
1806            some margin.  However, don't waste time trying to make
1807            templates which are too tight. */
1808         *Thresholds = GetBestRatingFor (Blob, LineStats,
1809                                         unicharset.unichar_to_id(
1810                                             BestChoice_string,
1811                                             *BestChoice_lengths));
1812         *Thresholds *= (1.0 - matcher_rating_margin);
1813         if (*Thresholds > matcher_good_threshold)
1814           *Thresholds = matcher_good_threshold;
1815         if (*Thresholds < matcher_perfect_threshold)
1816           *Thresholds = matcher_perfect_threshold;
1817       }
1818   }
1819 }                              /* GetAdaptThresholds */
1820 
1821 /*---------------------------------------------------------------------------*/
1822 UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob,
1823                                      LINE_STATS *LineStats,
1824                                      CLASS_ID CorrectClass) {
  /*
   **  Parameters:
   **    Blob           blob to get classification ambiguities for
   **    LineStats      statistics for text line blob is in
   **    CorrectClass   correct class for Blob
   **  Globals:
   **    CurrentRatings        used by qsort compare routine
   **    PreTrainedTemplates   built-in templates
   **  Operation: This routine matches blob to the built-in templates
   **    to find out if there are any classes other than the correct
   **    class which are potential ambiguities.
   **  Return: String containing all possible ambiguous classes.
   **  Exceptions: none
   **  History: Fri Mar 15 08:08:22 1991, DSJ, Created.
   */
1845   ADAPT_RESULTS *Results = new ADAPT_RESULTS();
1846   UNICHAR_ID *Ambiguities;
1847   int i;
1848 
1849   EnterClassifyMode;
1850 
1851   Results->Initialize();
1852 
1853   CharNormClassifier(Blob, LineStats, PreTrainedTemplates, Results);
1854   RemoveBadMatches(Results);
1855 
1856   /* save ratings in a global so that CompareCurrentRatings() can see them */
1857   CurrentRatings = Results->Ratings;
1858   qsort ((void *) (Results->Classes), Results->NumMatches,
1859          sizeof (CLASS_ID), CompareCurrentRatings);
1860 
1861   /* copy the class id's into an string of ambiguities - don't copy if
1862      the correct class is the only class id matched */
1863   Ambiguities = (UNICHAR_ID *) Emalloc (sizeof (UNICHAR_ID) *
1864                                         (Results->NumMatches + 1));
1865   if (Results->NumMatches > 1 ||
1866       (Results->NumMatches == 1 && Results->Classes[0] != CorrectClass)) {
1867     for (i = 0; i < Results->NumMatches; i++)
1868       Ambiguities[i] = Results->Classes[i];
1869     Ambiguities[i] = -1;
1870   }
1871   else
1872     Ambiguities[0] = -1;
1873 
1874   delete Results;
1875   return (Ambiguities);
1876 }                              /* GetAmbiguities */
1877 
1878 /*---------------------------------------------------------------------------*/
1879 int GetBaselineFeatures(TBLOB *Blob,
1880                         LINE_STATS *LineStats,
1881                         INT_TEMPLATES Templates,
1882                         INT_FEATURE_ARRAY IntFeatures,
1883                         CLASS_NORMALIZATION_ARRAY CharNormArray,
1884                         inT32 *BlobLength) {
  /*
   **  Parameters:
   **    Blob            blob to extract features from
   **    LineStats       statistics about text row blob is in
   **    Templates       used to compute char norm adjustments
   **    IntFeatures     array to fill with integer features
   **    CharNormArray   array to fill with dummy char norm adjustments
   **    BlobLength      length of blob in baseline-normalized units
   **  Globals: none
   **  Operation: This routine sets up the feature extractor to extract
   **    baseline normalized pico-features.  The extracted pico-features
   **    are converted to integer form and placed in IntFeatures.
   **    CharNormArray is filled with 0's to indicate to the matcher
   **    that no character normalization adjustment needs to be done.
   **    The total length of all blob outlines in baseline normalized
   **    units is also returned.
   **  Return: Number of pico-features returned (0 if an error occurred)
   **  Exceptions: none
   **  History: Tue Mar 12 17:55:18 1991, DSJ, Created.
   */
1912   FEATURE_SET Features;
1913   int NumFeatures;
1914 
1915   if (classify_enable_int_fx)
1916     return (GetIntBaselineFeatures (Blob, LineStats, Templates,
1917                                     IntFeatures, CharNormArray, BlobLength));
1918 
1919   classify_norm_method.set_value(baseline);
1920   Features = ExtractPicoFeatures (Blob, LineStats);
1921 
1922   NumFeatures = Features->NumFeatures;
1923   *BlobLength = NumFeatures;
1924   if (NumFeatures > UNLIKELY_NUM_FEAT) {
1925     FreeFeatureSet(Features);
1926     return (0);
1927   }
1928 
1929   ComputeIntFeatures(Features, IntFeatures);
1930   ClearCharNormArray(Templates, CharNormArray);
1931 
1932   FreeFeatureSet(Features);
1933   return NumFeatures;
1934 }                              /* GetBaselineFeatures */
1935 
1936 FLOAT32 Classify::GetBestRatingFor(TBLOB *Blob,
1937                                    LINE_STATS *LineStats,
1938                                    CLASS_ID ClassId) {
  /*
   **  Parameters:
   **    Blob        blob to get best rating for
   **    LineStats   statistics about text line blob is in
   **    ClassId     class blob is to be compared to
   **  Globals:
   **    PreTrainedTemplates   built-in templates
   **    AdaptedTemplates      current set of adapted templates
   **    AllProtosOn           dummy mask to enable all protos
   **    AllConfigsOn          dummy mask to enable all configs
   **  Operation: This routine classifies Blob against both sets of
   **    templates for the specified class and returns the best
   **    rating found.
   **  Return: Best rating for match of Blob to ClassId.
   **  Exceptions: none
   **  History: Tue Apr  9 09:01:24 1991, DSJ, Created.
   */
1963   int NumCNFeatures, NumBLFeatures;
1964   INT_FEATURE_ARRAY CNFeatures, BLFeatures;
1965   INT_RESULT_STRUCT CNResult, BLResult;
1966   inT32 BlobLength;
1967 
1968   CNResult.Rating = BLResult.Rating = 1.0;
1969 
1970   if (!LegalClassId (ClassId))
1971     return (1.0);
1972 
1973   uinT8 *CNAdjust = new uinT8[MAX_NUM_CLASSES];
1974   uinT8 *BLAdjust = new uinT8[MAX_NUM_CLASSES];
1975 
1976   if (!UnusedClassIdIn (PreTrainedTemplates, ClassId)) {
1977     NumCNFeatures = GetCharNormFeatures (Blob, LineStats,
1978                                          PreTrainedTemplates,
1979                                          CNFeatures, CNAdjust, &BlobLength);
1980     if (NumCNFeatures > 0) {
1981       SetCharNormMatch();
1982       IntegerMatcher (ClassForClassId (PreTrainedTemplates, ClassId),
1983                       AllProtosOn, AllConfigsOn,
1984                       BlobLength, NumCNFeatures, CNFeatures,
1985                       CNAdjust[ClassId], &CNResult, NO_DEBUG);
1986     }
1987   }
1988 
1989   if (!UnusedClassIdIn (AdaptedTemplates->Templates, ClassId)) {
1990     NumBLFeatures = GetBaselineFeatures (Blob, LineStats,
1991                                          AdaptedTemplates->Templates,
1992                                          BLFeatures, BLAdjust, &BlobLength);
1993     if (NumBLFeatures > 0) {
1994       SetBaseLineMatch();
1995       IntegerMatcher(ClassForClassId(AdaptedTemplates->Templates, ClassId),
1996                       AdaptedTemplates->Class[ClassId]->PermProtos,
1997                       AdaptedTemplates->Class[ClassId]->PermConfigs,
1998                       BlobLength, NumBLFeatures, BLFeatures,
1999                       BLAdjust[ClassId], &BLResult, NO_DEBUG);
2000     }
2001   }
2002 
2003   // Clean up.
2004   delete[] CNAdjust;
2005   delete[] BLAdjust;
2006 
2007   return (MIN (BLResult.Rating, CNResult.Rating));
2008 }                              /* GetBestRatingFor */
2009 
2010 /*---------------------------------------------------------------------------*/
2011 int Classify::GetCharNormFeatures(TBLOB *Blob,
2012                                   LINE_STATS *LineStats,
2013                                   INT_TEMPLATES Templates,
2014                                   INT_FEATURE_ARRAY IntFeatures,
2015                                   CLASS_NORMALIZATION_ARRAY CharNormArray,
2016                                   inT32 *BlobLength) {
  /*
   **  Parameters:
   **    Blob            blob to extract features from
   **    LineStats       statistics about text row blob is in
   **    Templates       used to compute char norm adjustments
   **    IntFeatures     array to fill with integer features
   **    CharNormArray   array to fill with char norm adjustments
   **    BlobLength      length of blob in baseline-normalized units
   **  Globals: none
   **  Operation: This routine sets up the feature extractor to extract
   **    character normalization features and character normalized
   **    pico-features.  The extracted pico-features are converted
   **    to integer form and placed in IntFeatures.  The character
   **    normalization features are matched to each class in
   **    templates and the resulting adjustment factors are returned
   **    in CharNormArray.  The total length of all blob outlines
   **    in baseline normalized units is also returned.
   **  Return: Number of pico-features returned (0 if an error occurred)
   **  Exceptions: none
   **  History: Tue Mar 12 17:55:18 1991, DSJ, Created.
   */
2044   return (GetIntCharNormFeatures (Blob, LineStats, Templates,
2045                                   IntFeatures, CharNormArray, BlobLength));
2046 }                              /* GetCharNormFeatures */
2047 
2048 /*---------------------------------------------------------------------------*/
2049 int GetIntBaselineFeatures(TBLOB *Blob,
2050                            LINE_STATS *LineStats,
2051                            INT_TEMPLATES Templates,
2052                            INT_FEATURE_ARRAY IntFeatures,
2053                            CLASS_NORMALIZATION_ARRAY CharNormArray,
2054                            inT32 *BlobLength) {
  /*
   **  Parameters:
   **    Blob            blob to extract features from
   **    LineStats       statistics about text row blob is in
   **    Templates       used to compute char norm adjustments
   **    IntFeatures     array to fill with integer features
   **    CharNormArray   array to fill with dummy char norm adjustments
   **    BlobLength      length of blob in baseline-normalized units
   **  Globals:
   **    FeaturesHaveBeenExtracted   TRUE if fx has been done
   **    BaselineFeatures            holds extracted baseline feat
   **    CharNormFeatures            holds extracted char norm feat
   **    FXInfo                      holds misc. FX info
   **  Operation: This routine calls the integer (Hardware) feature
   **    extractor if it has not been called before for this blob.
   **    The results from the feature extractor are placed into
   **    globals so that they can be used in other routines without
   **    re-extracting the features.  It then copies the baseline
   **    features into the IntFeatures array provided by the caller.
   **  Return: Number of features extracted or 0 if an error occurred.
   **  Exceptions: none
   **  History: Tue May 28 10:40:52 1991, DSJ, Created.
   */
2089   register INT_FEATURE Src, Dest, End;
2090 
2091   if (!FeaturesHaveBeenExtracted) {
2092     FeaturesOK = ExtractIntFeat (Blob, BaselineFeatures,
2093                                  CharNormFeatures, &FXInfo);
2094     FeaturesHaveBeenExtracted = TRUE;
2095   }
2096 
2097   if (!FeaturesOK) {
2098     *BlobLength = FXInfo.NumBL;
2099     return (0);
2100   }
2101 
2102   for (Src = BaselineFeatures, End = Src + FXInfo.NumBL, Dest = IntFeatures;
2103        Src < End;
2104        *Dest++ = *Src++);
2105 
2106   ClearCharNormArray(Templates, CharNormArray);
2107   *BlobLength = FXInfo.NumBL;
2108   return (FXInfo.NumBL);
2109 }                              /* GetIntBaselineFeatures */
2110 
2111 /*---------------------------------------------------------------------------*/
2112 int Classify::GetIntCharNormFeatures(TBLOB *Blob,
2113                                      LINE_STATS *LineStats,
2114                                      INT_TEMPLATES Templates,
2115                                      INT_FEATURE_ARRAY IntFeatures,
2116                                      CLASS_NORMALIZATION_ARRAY CharNormArray,
2117                                      inT32 *BlobLength) {
  /*
   **  Parameters:
   **    Blob            blob to extract features from
   **    LineStats       statistics about text row blob is in
   **    Templates       used to compute char norm adjustments
   **    IntFeatures     array to fill with integer features
   **    CharNormArray   array to fill with char norm adjustments
   **    BlobLength      length of blob in baseline-normalized units
   **  Globals:
   **    FeaturesHaveBeenExtracted   TRUE if fx has been done
   **    BaselineFeatures            holds extracted baseline feat
   **    CharNormFeatures            holds extracted char norm feat
   **    FXInfo                      holds misc. FX info
   **  Operation: This routine calls the integer (Hardware) feature
   **    extractor if it has not been called before for this blob.
   **    The results from the feature extractor are placed into
   **    globals so that they can be used in other routines without
   **    re-extracting the features.  It then copies the char norm
   **    features into the IntFeatures array provided by the caller.
   **  Return: Number of features extracted or 0 if an error occurred.
   **  Exceptions: none
   **  History: Tue May 28 10:40:52 1991, DSJ, Created.
   */
2152   register INT_FEATURE Src, Dest, End;
2153   FEATURE NormFeature;
2154   FLOAT32 Baseline, Scale;
2155 
2156   if (!FeaturesHaveBeenExtracted) {
2157     FeaturesOK = ExtractIntFeat (Blob, BaselineFeatures,
2158                                  CharNormFeatures, &FXInfo);
2159     FeaturesHaveBeenExtracted = TRUE;
2160   }
2161 
2162   if (!FeaturesOK) {
2163     *BlobLength = FXInfo.NumBL;
2164     return (0);
2165   }
2166 
2167   for (Src = CharNormFeatures, End = Src + FXInfo.NumCN, Dest = IntFeatures;
2168        Src < End;
2169        *Dest++ = *Src++);
2170 
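  /* Build the single character-normalization feature from the feature
     extractor's summary values: the centroid height above the baseline,
     the total outline length, and the Rx/Ry spread measures, all scaled
     into baseline-normalized units.  ComputeIntCharNormArray() then turns
     this feature into a per-class adjustment factor. */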
2171   NormFeature = NewFeature (&CharNormDesc);
2172   Baseline = BaselineAt (LineStats, FXInfo.Xmean);
2173   Scale = ComputeScaleFactor (LineStats);
2174   NormFeature->Params[CharNormY] = (FXInfo.Ymean - Baseline) * Scale;
2175   NormFeature->Params[CharNormLength] =
2176       FXInfo.Length * Scale / LENGTH_COMPRESSION;
2177   NormFeature->Params[CharNormRx] = FXInfo.Rx * Scale;
2178   NormFeature->Params[CharNormRy] = FXInfo.Ry * Scale;
2179   ComputeIntCharNormArray(NormFeature, Templates, CharNormArray);
2180   FreeFeature(NormFeature);
2181 
2182   *BlobLength = FXInfo.NumBL;
2183   return (FXInfo.NumCN);
2184 }                              /* GetIntCharNormFeatures */
2185 
2186 /*---------------------------------------------------------------------------*/
2187 int Classify::MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
2188                            CLASS_ID ClassId,
2189                            int NumFeatures,
2190                            INT_FEATURE_ARRAY Features,
2191                            FEATURE_SET FloatFeatures) {
  /*
   **  Parameters:
   **    Templates       adapted templates to add new config to
   **    ClassId         class id to associate with new config
   **    NumFeatures     number of features in IntFeatures
   **    Features        features describing model for new config
   **    FloatFeatures   floating-pt representation of features
   **  Globals:
   **    AllProtosOn     mask to enable all protos
   **    AllConfigsOff   mask to disable all configs
   **    TempProtoMask   defines old protos matched in new config
   **  Operation: Creates a new temporary config for ClassId from the
   **    given features, reusing existing protos that match well and
   **    adding new temporary protos for the features they do not cover.
   **  Return: The id of the new config created, a negative integer in
   **    case of error.
   **  Exceptions: none
   **  History: Fri Mar 15 08:49:46 1991, DSJ, Created.
   */
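  /* In outline: (1) find the existing protos that already match the new
     character well (FindGoodProtos with all protos enabled and all configs
     off); (2) find the features those protos fail to explain
     (FindBadFeatures); (3) build new temporary protos covering the bad
     features (MakeNewTempProtos); (4) allocate a new config whose proto
     mask contains both the old and the new protos. */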
2217   INT_CLASS IClass;
2218   ADAPT_CLASS Class;
2219   PROTO_ID OldProtos[MAX_NUM_PROTOS];
2220   FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
2221   int NumOldProtos;
2222   int NumBadFeatures;
2223   int MaxProtoId, OldMaxProtoId;
2224   int BlobLength = 0;
2225   int MaskSize;
2226   int ConfigId;
2227   TEMP_CONFIG Config;
2228   int i;
2229   int debug_level = NO_DEBUG;
2230 
2231   if (classify_learning_debug_level >= 3)
2232     debug_level =
2233         PRINT_MATCH_SUMMARY | PRINT_FEATURE_MATCHES | PRINT_PROTO_MATCHES;
2234 
2235   IClass = ClassForClassId (Templates->Templates, ClassId);
2236   Class = Templates->Class[ClassId];
2237 
2238   if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
2239     ++NumAdaptationsFailed;
2240     if (classify_learning_debug_level >= 1)
2241       cprintf ("Cannot make new temporary config: maximum number exceeded.\n");
2242     return -1;
2243   }
2244 
2245   OldMaxProtoId = IClass->NumProtos - 1;
2246 
2247   NumOldProtos = FindGoodProtos (IClass, AllProtosOn, AllConfigsOff,
2248                                  BlobLength, NumFeatures, Features,
2249                                  OldProtos, debug_level);
2250 
2251   MaskSize = WordsInVectorOfSize (MAX_NUM_PROTOS);
2252   zero_all_bits(TempProtoMask, MaskSize);
2253   for (i = 0; i < NumOldProtos; i++)
2254     SET_BIT (TempProtoMask, OldProtos[i]);
2255 
2256   NumBadFeatures = FindBadFeatures (IClass, TempProtoMask, AllConfigsOn,
2257                                     BlobLength, NumFeatures, Features,
2258                                     BadFeatures, debug_level);
2259 
2260   MaxProtoId = MakeNewTempProtos (FloatFeatures, NumBadFeatures, BadFeatures,
2261                                   IClass, Class, TempProtoMask);
2262   if (MaxProtoId == NO_PROTO) {
2263     ++NumAdaptationsFailed;
2264     if (classify_learning_debug_level >= 1)
2265       cprintf ("Cannot make new temp protos: maximum number exceeded.\n");
2266     return -1;
2267   }
2268 
2269   ConfigId = AddIntConfig (IClass);
2270   ConvertConfig(TempProtoMask, ConfigId, IClass);
2271   Config = NewTempConfig (MaxProtoId);
2272   TempConfigFor (Class, ConfigId) = Config;
2273   copy_all_bits (TempProtoMask, Config->Protos, Config->ProtoVectorSize);
2274 
2275   if (classify_learning_debug_level >= 1)
2276     cprintf ("Making new temp config %d using %d old and %d new protos.\n",
2277              ConfigId, NumOldProtos, MaxProtoId - OldMaxProtoId);
2278 
2279   return ConfigId;
2280 }                              /* MakeNewTemporaryConfig */
2281 }  // namespace tesseract
2282 
2283 /*---------------------------------------------------------------------------*/
2284 PROTO_ID
2285 MakeNewTempProtos(FEATURE_SET Features,
2286                   int NumBadFeat,
2287                   FEATURE_ID BadFeat[],
2288                   INT_CLASS IClass,
2289                   ADAPT_CLASS Class, BIT_VECTOR TempProtoMask) {
  /*
   **  Parameters:
   **    Features        floating-pt features describing new character
   **    NumBadFeat      number of bad features to turn into protos
   **    BadFeat         feature id's of bad features
   **    IClass          integer class templates to add new protos to
   **    Class           adapted class templates to add new protos to
   **    TempProtoMask   proto mask to add new protos to
   **  Globals: none
   **  Operation: This routine finds sets of sequential bad features
   **    that all have the same angle and converts each set into
   **    a new temporary proto.  The temp proto is added to the
   **    proto pruner for IClass, pushed onto the list of temp
   **    protos in Class, and added to TempProtoMask.
   **  Return: Max proto id in class after all protos have been added.
   **  Exceptions: none
   **  History: Fri Mar 15 11:39:38 1991, DSJ, Created.
   */
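  /* Grouping criterion used below: a run of bad features is merged into one
     proto for as long as each new feature stays within
     matcher_clustering_max_angle_delta of the first feature's direction
     (angles wrap at 1.0) and within the accumulated segment length of the
     first feature's position in both x and y. */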
2314   FEATURE_ID *ProtoStart;
2315   FEATURE_ID *ProtoEnd;
2316   FEATURE_ID *LastBad;
2317   TEMP_PROTO TempProto;
2318   PROTO Proto;
2319   FEATURE F1, F2;
2320   FLOAT32 X1, X2, Y1, Y2;
2321   FLOAT32 A1, A2, AngleDelta;
2322   FLOAT32 SegmentLength;
2323   PROTO_ID Pid;
2324 
2325   for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat;
2326        ProtoStart < LastBad; ProtoStart = ProtoEnd) {
2327     F1 = Features->Features[*ProtoStart];
2328     X1 = F1->Params[PicoFeatX];
2329     Y1 = F1->Params[PicoFeatY];
2330     A1 = F1->Params[PicoFeatDir];
2331 
2332     for (ProtoEnd = ProtoStart + 1,
2333          SegmentLength = GetPicoFeatureLength();
2334          ProtoEnd < LastBad;
2335          ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
2336       F2 = Features->Features[*ProtoEnd];
2337       X2 = F2->Params[PicoFeatX];
2338       Y2 = F2->Params[PicoFeatY];
2339       A2 = F2->Params[PicoFeatDir];
2340 
2341       AngleDelta = fabs(A1 - A2);
2342       if (AngleDelta > 0.5)
2343         AngleDelta = 1.0 - AngleDelta;
2344 
2345       if (AngleDelta > matcher_clustering_max_angle_delta ||
2346           fabs(X1 - X2) > SegmentLength ||
2347           fabs(Y1 - Y2) > SegmentLength)
2348         break;
2349     }
2350 
2351     F2 = Features->Features[*(ProtoEnd - 1)];
2352     X2 = F2->Params[PicoFeatX];
2353     Y2 = F2->Params[PicoFeatY];
2354     A2 = F2->Params[PicoFeatDir];
2355 
2356     Pid = AddIntProto(IClass);
2357     if (Pid == NO_PROTO)
2358       return (NO_PROTO);
2359 
2360     TempProto = NewTempProto();
2361     Proto = &(TempProto->Proto);
2362 
2363     /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
2364        ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
2365        instead of the -0.25 to 0.75 used in baseline normalization */
2366     Proto->Length = SegmentLength;
2367     Proto->Angle = A1;
2368     Proto->X = (X1 + X2) / 2.0;
2369     Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
2370     FillABC(Proto);
2371 
2372     TempProto->ProtoId = Pid;
2373     SET_BIT(TempProtoMask, Pid);
2374 
2375     ConvertProto(Proto, Pid, IClass);
2376     AddProtoToProtoPruner(Proto, Pid, IClass);
2377 
2378     Class->TempProtos = push(Class->TempProtos, TempProto);
2379   }
2380   return IClass->NumProtos - 1;
2381 }                              /* MakeNewTempProtos */
2382 
2383 /*---------------------------------------------------------------------------*/
2384 namespace tesseract {
2385 void Classify::MakePermanent(ADAPT_TEMPLATES Templates,
2386                              CLASS_ID ClassId,
2387                              int ConfigId,
2388                              TBLOB *Blob,
2389                              LINE_STATS *LineStats) {
  /*
   **  Parameters:
   **    Templates   current set of adaptive templates
   **    ClassId     class containing config to be made permanent
   **    ConfigId    config to be made permanent
   **    Blob        current blob being adapted to
   **    LineStats   statistics about text line Blob is in
   **  Globals: none
   **  Operation: Marks the specified temporary config as permanent,
   **    promotes the temporary protos it uses, and attaches the list
   **    of classes that are ambiguous with it (see GetAmbiguities).
   **  Return: none
   **  Exceptions: none
   **  History: Thu Mar 14 15:54:08 1991, DSJ, Created.
   */
2408   UNICHAR_ID *Ambigs;
2409   TEMP_CONFIG Config;
2410   ADAPT_CLASS Class;
2411   PROTO_KEY ProtoKey;
2412 
2413   Class = Templates->Class[ClassId];
2414   Config = TempConfigFor(Class, ConfigId);
2415 
2416   MakeConfigPermanent(Class, ConfigId);
2417   if (Class->NumPermConfigs == 0)
2418     Templates->NumPermClasses++;
2419   Class->NumPermConfigs++;
2420 
2421   ProtoKey.Templates = Templates;
2422   ProtoKey.ClassId = ClassId;
2423   ProtoKey.ConfigId = ConfigId;
2424   Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey,
2425                                MakeTempProtoPerm);
2426   FreeTempConfig(Config);
2427 
2428   Ambigs = GetAmbiguities(Blob, LineStats, ClassId);
2429   PermConfigFor(Class, ConfigId) = Ambigs;
2430 
2431   if (classify_learning_debug_level >= 1) {
2432     cprintf("Making config %d permanent with ambiguities '",
2433             ConfigId);
2434     for (UNICHAR_ID *AmbigsPointer = Ambigs;
2435          *AmbigsPointer >= 0; ++AmbigsPointer)
2436       cprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
2437     cprintf("'.\n");
2438   }
2439 }                              /* MakePermanent */
2440 }  // namespace tesseract
2441 
2442 /*---------------------------------------------------------------------------*/
2443 int MakeTempProtoPerm(void *item1, void *item2) {
  /*
   **  Parameters:
   **    TempProto   temporary proto to compare to key
   **    ProtoKey    defines which protos to make permanent
   **  Globals: none
   **  Operation: This routine converts TempProto to be permanent if
   **    its proto id is used by the configuration specified in ProtoKey.
   **  Return: TRUE if TempProto is converted, FALSE otherwise
   **  Exceptions: none
   **  History: Thu Mar 14 18:49:54 1991, DSJ, Created.
   */
  ADAPT_CLASS Class;
  TEMP_CONFIG Config;
  TEMP_PROTO TempProto;
  PROTO_KEY *ProtoKey;

  TempProto = (TEMP_PROTO) item1;
  ProtoKey = (PROTO_KEY *) item2;

  Class = ProtoKey->Templates->Class[ProtoKey->ClassId];
  Config = TempConfigFor (Class, ProtoKey->ConfigId);

  if (TempProto->ProtoId > Config->MaxProtoId ||
      !test_bit (Config->Protos, TempProto->ProtoId))
    return FALSE;

  MakeProtoPermanent (Class, TempProto->ProtoId);
  AddProtoToClassPruner (&(TempProto->Proto), ProtoKey->ClassId,
                         ProtoKey->Templates->Templates);
  FreeTempProto(TempProto);

  return TRUE;
}                              /* MakeTempProtoPerm */

/*---------------------------------------------------------------------------*/
int NumBlobsIn(TWERD *Word) {
  /*
   **  Parameters:
   **    Word  word to count blobs in
   **  Globals: none
   **  Operation: This routine returns the number of blobs in Word.
   **  Return: Number of blobs in Word.
   **  Exceptions: none
   **  History: Thu Mar 14 08:30:27 1991, DSJ, Created.
   */
  register TBLOB *Blob;
  register int NumBlobs;

  if (Word == NULL)
    return (0);

  for (Blob = Word->blobs, NumBlobs = 0;
       Blob != NULL; Blob = Blob->next, NumBlobs++);

  return (NumBlobs);
}                              /* NumBlobsIn */

/*---------------------------------------------------------------------------*/
int NumOutlinesInBlob(TBLOB *Blob) {
  /*
   **  Parameters:
   **    Blob  blob to count outlines in
   **  Globals: none
   **  Operation: This routine returns the number of OUTER outlines
   **    in Blob.
   **  Return: Number of outer outlines in Blob.
   **  Exceptions: none
   **  History: Mon Jun 10 15:46:20 1991, DSJ, Created.
   */
  register TESSLINE *Outline;
  register int NumOutlines;

  if (Blob == NULL)
    return (0);

  for (Outline = Blob->outlines, NumOutlines = 0;
       Outline != NULL; Outline = Outline->next, NumOutlines++);

  return (NumOutlines);
}                              /* NumOutlinesInBlob */
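
/*
 * Illustrative note (not part of the original source): both helpers above
 * simply walk a NULL-terminated singly linked list. For a hypothetical word
 * whose blob chain is "c" -> "a" -> "t", NumBlobsIn(Word) evaluates to 3,
 * and a blob for "i" whose dot and stem are stored as two outer outlines
 * gives NumOutlinesInBlob(Blob) == 2. A NULL argument yields 0 in either
 * case.
 */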

/*---------------------------------------------------------------------------*/
namespace tesseract {
void Classify::PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results) {
  /*
   **  Parameters:
   **    File     open text file to write Results to
   **    Results  match results to write to File
   **  Globals: none
   **  Operation: This routine writes the matches in Results to File.
   **  Return: none
   **  Exceptions: none
   **  History: Mon Mar 18 09:24:53 1991, DSJ, Created.
   */
  // Note: the current implementation prints through cprintf/printf and does
  // not use the File parameter.
  for (int i = 0; i < Results->NumMatches; ++i) {
    cprintf("%s(%d) %.2f  ",
            unicharset.debug_str(Results->Classes[i]).string(),
            Results->Classes[i],
            Results->Ratings[Results->Classes[i]] * 100.0);
  }
  printf("\n");
}                              /* PrintAdaptiveMatchResults */
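
/*
 * Example output (illustrative, unichar ids hypothetical): with two
 * surviving matches this prints a single line of the form
 *
 *   a(62) 4.31  o(70) 7.95
 *
 * i.e. the unichar debug string, its class id, and its rating scaled by
 * 100, followed by a newline.
 */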

/*---------------------------------------------------------------------------*/
void Classify::RemoveBadMatches(ADAPT_RESULTS *Results) {
  /*
   **  Parameters:
   **    Results  contains matches to be filtered
   **  Globals:
   **    matcher_bad_match_pad  defines a "bad match"
   **    bln_numericmode        set when running in numeric mode
   **  Operation: This routine steps through each matching class in Results
   **    and removes it from the match list if its rating
   **    is worse than the BestRating plus a pad.  In other words,
   **    all good matches get moved to the front of the classes
   **    array.
   **  Return: none
   **  Exceptions: none
   **  History: Tue Mar 12 13:51:03 1991, DSJ, Created.
   */
  int Next, NextGood;
  FLOAT32 *Rating = Results->Ratings;
  CLASS_ID *Match = Results->Classes;
  FLOAT32 BadMatchThreshold;
  static const char* romans = "i v x I V X";
  BadMatchThreshold = Results->BestRating + matcher_bad_match_pad;

  if (bln_numericmode) {
    UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
        unicharset.unichar_to_id("1") : -1;
    UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
        unicharset.unichar_to_id("0") : -1;
    for (Next = NextGood = 0; Next < Results->NumMatches; Next++) {
      if (Rating[Match[Next]] <= BadMatchThreshold) {
        if (!unicharset.get_isalpha(Match[Next]) ||
            strstr(romans, unicharset.id_to_unichar(Match[Next])) != NULL) {
          Match[NextGood++] = Match[Next];
        } else if (unichar_id_one >= 0 && unicharset.eq(Match[Next], "l") &&
                   Rating[unichar_id_one] >= BadMatchThreshold) {
          // In numeric mode a good "l" match stands in for a rejected "1".
          Match[NextGood++] = unichar_id_one;
          Rating[unichar_id_one] = Rating[unicharset.unichar_to_id("l")];
        } else if (unichar_id_zero >= 0 && unicharset.eq(Match[Next], "O") &&
                   Rating[unichar_id_zero] >= BadMatchThreshold) {
          // Likewise a good "O" match stands in for a rejected "0".
          Match[NextGood++] = unichar_id_zero;
          Rating[unichar_id_zero] = Rating[unicharset.unichar_to_id("O")];
        }
      }
    }
  } else {
    for (Next = NextGood = 0; Next < Results->NumMatches; Next++) {
      if (Rating[Match[Next]] <= BadMatchThreshold)
        Match[NextGood++] = Match[Next];
    }
  }

  Results->NumMatches = NextGood;
}                              /* RemoveBadMatches */
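
/*
 * Worked example (illustrative, values hypothetical): if the best match has
 * rating 0.10 and matcher_bad_match_pad is 0.15, then BadMatchThreshold is
 * 0.25, so every class rated above 0.25 (ratings are penalties; lower is
 * better) is dropped and the survivors are compacted to the front of
 * Results->Classes.
 */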

/*---------------------------------------------------------------------------*/
void Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results) {
  /*
   **  Parameters:
   **    Results  contains matches to be filtered
   **  Globals: none
   **  Operation: This routine steps through each matching class in Results
   **    and keeps at most the first two punctuation matches and the
   **    first digit match, discarding the rest.  The surviving
   **    matches get moved to the front of the classes array.
   **  Return: none
   **  Exceptions: none
   **  History: Tue Mar 12 13:51:03 1991, DSJ, Created.
   */
  int Next, NextGood;
  int punc_count;              /* number of punctuation matches seen so far */
  int digit_count;             /* number of digit matches seen so far */
  CLASS_ID *Match = Results->Classes;
  /* punctuation and digit characters */
  static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
  static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";

  punc_count = 0;
  digit_count = 0;
  for (Next = NextGood = 0; Next < Results->NumMatches; Next++) {
    if (strstr(punc_chars, unicharset.id_to_unichar(Match[Next])) == NULL) {
      if (strstr(digit_chars, unicharset.id_to_unichar(Match[Next])) == NULL) {
        Match[NextGood++] = Match[Next];
      } else {
        if (digit_count < 1)
          Match[NextGood++] = Match[Next];
        digit_count++;
      }
    } else {
      if (punc_count < 2)
        Match[NextGood++] = Match[Next];
      punc_count++;
    }
  }
  Results->NumMatches = NextGood;
}                              /* RemoveExtraPuncs */
}  // namespace tesseract
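
/*
 * Illustrative example (hypothetical match list): given surviving matches
 * {'.', ',', ';', '3', '7', 'a'}, RemoveExtraPuncs keeps '.' and ',' (the
 * first two punctuation matches), '3' (the first digit match) and 'a', and
 * drops ';' and '7', leaving Results->NumMatches == 4.
 */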

/*---------------------------------------------------------------------------*/
void SetAdaptiveThreshold(FLOAT32 Threshold) {
  /*
   **  Parameters:
   **    Threshold  threshold for creating new templates
   **  Globals:
   **    matcher_good_threshold  default good match rating
   **  Operation: This routine resets the internal thresholds inside
   **    the integer matcher to correspond to the specified
   **    threshold.
   **  Return: none
   **  Exceptions: none
   **  History: Tue Apr  9 08:33:13 1991, DSJ, Created.
   */
  if (Threshold == matcher_good_threshold) {
    /* the blob was probably classified correctly - use the default rating
       threshold */
    SetProtoThresh(0.9);
    SetFeatureThresh(0.9);
  } else {
    /* the blob was probably incorrectly classified */
    SetProtoThresh(1.0 - Threshold);
    SetFeatureThresh(1.0 - Threshold);
  }
}                              /* SetAdaptiveThreshold */
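
/*
 * Illustrative example (values hypothetical): calling SetAdaptiveThreshold
 * with Threshold equal to matcher_good_threshold installs the default proto
 * and feature thresholds of 0.9, whereas SetAdaptiveThreshold(0.25) installs
 * 1.0 - 0.25 = 0.75 for both.
 */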

/*---------------------------------------------------------------------------*/
namespace tesseract {
void Classify::ShowBestMatchFor(TBLOB *Blob,
                                LINE_STATS *LineStats,
                                CLASS_ID ClassId,
                                BOOL8 AdaptiveOn,
                                BOOL8 PreTrainedOn) {
  /*
   **  Parameters:
   **    Blob          blob to show best matching config for
   **    LineStats     statistics for text line Blob is in
   **    ClassId       class whose configs are to be searched
   **    AdaptiveOn    TRUE if adaptive configs are enabled
   **    PreTrainedOn  TRUE if pretrained configs are enabled
   **  Globals:
   **    PreTrainedTemplates  built-in training
   **    AdaptedTemplates     adaptive templates
   **    AllProtosOn          dummy proto mask
   **    AllConfigsOn         dummy config mask
   **  Operation: This routine compares Blob to both sets of templates
   **    (adaptive and pre-trained) and then displays debug
   **    information for the config which matched best.
   **  Return: none
   **  Exceptions: none
   **  History: Fri Mar 22 08:43:52 1991, DSJ, Created.
   */
  int NumCNFeatures = 0, NumBLFeatures = 0;
  INT_FEATURE_ARRAY CNFeatures, BLFeatures;
  INT_RESULT_STRUCT CNResult, BLResult;
  inT32 BlobLength;
  uinT32 ConfigMask;
  static int next_config = -1;

  if (PreTrainedOn) next_config = -1;

  CNResult.Rating = BLResult.Rating = 2.0;

  if (!LegalClassId(ClassId)) {
    cprintf("%d is not a legal class id!!\n", ClassId);
    return;
  }

  uinT8 *CNAdjust = new uinT8[MAX_NUM_CLASSES];
  uinT8 *BLAdjust = new uinT8[MAX_NUM_CLASSES];

  if (PreTrainedOn) {
    if (UnusedClassIdIn(PreTrainedTemplates, ClassId))
      cprintf("No built-in templates for class %d = %s\n",
              ClassId, unicharset.id_to_unichar(ClassId));
    else {
      NumCNFeatures = GetCharNormFeatures(Blob, LineStats,
                                          PreTrainedTemplates,
                                          CNFeatures, CNAdjust,
                                          &BlobLength);
      if (NumCNFeatures <= 0)
        cprintf("Illegal blob (char norm features)!\n");
      else {
        SetCharNormMatch();
        IntegerMatcher(ClassForClassId(PreTrainedTemplates, ClassId),
                       AllProtosOn, AllConfigsOn,
                       BlobLength, NumCNFeatures, CNFeatures,
                       CNAdjust[ClassId], &CNResult, NO_DEBUG);

        cprintf("Best built-in template match is config %2d (%4.1f) (cn=%d)\n",
                CNResult.Config, CNResult.Rating * 100.0, CNAdjust[ClassId]);
      }
    }
  }

  if (AdaptiveOn) {
    if (UnusedClassIdIn(AdaptedTemplates->Templates, ClassId))
      cprintf("No AD templates for class %d = %s\n",
              ClassId, unicharset.id_to_unichar(ClassId));
    else {
      NumBLFeatures = GetBaselineFeatures(Blob, LineStats,
                                          AdaptedTemplates->Templates,
                                          BLFeatures, BLAdjust,
                                          &BlobLength);
      if (NumBLFeatures <= 0)
        cprintf("Illegal blob (baseline features)!\n");
      else {
        SetBaseLineMatch();
        IntegerMatcher(ClassForClassId(AdaptedTemplates->Templates, ClassId),
                       AllProtosOn, AllConfigsOn,
                       // AdaptedTemplates->Class[ClassId]->PermProtos,
                       // AdaptedTemplates->Class[ClassId]->PermConfigs,
                       BlobLength, NumBLFeatures, BLFeatures,
                       BLAdjust[ClassId], &BLResult, NO_DEBUG);

#ifndef SECURE_NAMES
        ADAPT_CLASS Class = AdaptedTemplates->Class[ClassId];
        cprintf("Best adaptive template match is config %2d (%4.1f) %s\n",
                BLResult.Config, BLResult.Rating * 100.0,
                ConfigIsPermanent(Class, BLResult.Config) ? "Perm" : "Temp");
#endif
      }
    }
  }

  cprintf("\n");
  if (BLResult.Rating < CNResult.Rating) {
    // The adaptive templates matched best: rerun that match with debugging
    // enabled.  When PreTrainedOn is FALSE, successive calls step through
    // the adapted configs one at a time via next_config.
    if (next_config < 0) {
      ConfigMask = 1 << BLResult.Config;
      next_config = 0;
    } else {
      ConfigMask = 1 << next_config;
      ++next_config;
    }
    classify_norm_method.set_value(baseline);

    SetBaseLineMatch();
    IntegerMatcher(ClassForClassId(AdaptedTemplates->Templates, ClassId),
                   AllProtosOn,
                   // AdaptedTemplates->Class[ClassId]->PermProtos,
                   (BIT_VECTOR) &ConfigMask,
                   BlobLength, NumBLFeatures, BLFeatures,
                   BLAdjust[ClassId], &BLResult, matcher_debug_flags);
    cprintf("Adaptive template match for config %2d is %4.1f\n",
            BLResult.Config, BLResult.Rating * 100.0);
  } else {
    // The built-in templates matched best: rerun with debugging enabled,
    // restricted to the best pre-trained config.
    ConfigMask = 1 << CNResult.Config;
    classify_norm_method.set_value(character);

    SetCharNormMatch();
    IntegerMatcher(ClassForClassId(PreTrainedTemplates, ClassId),
                   AllProtosOn, (BIT_VECTOR) &ConfigMask,
                   BlobLength, NumCNFeatures, CNFeatures,
                   CNAdjust[ClassId], &CNResult, matcher_debug_flags);
  }

  // Clean up.
  delete[] CNAdjust;
  delete[] BLAdjust;
}                              /* ShowBestMatchFor */
}  // namespace tesseract
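
/*
 * Usage sketch (illustrative, not part of the original source): this routine
 * is a debugging aid, typically called with a blob that has already been
 * classified to see which config of a given class fits it best, e.g.
 *
 *   classify->ShowBestMatchFor(Blob, LineStats,
 *                              unicharset.unichar_to_id("a"),
 *                              TRUE,    // search adaptive templates
 *                              TRUE);   // search built-in templates
 *
 * where `classify` stands for the active tesseract::Classify instance; the
 * variable names in this sketch are hypothetical.
 */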