///////////////////////////////////////////////////////////////////////
// File:        classify.h
// Description: classify class.
// Author:      Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

19 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
20 #define TESSERACT_CLASSIFY_CLASSIFY_H__
21 
22 #include "adaptive.h"
23 #include "ccstruct.h"
24 #include "classify.h"
25 #include "dict.h"
26 #include "fxdefs.h"
27 #include "intmatcher.h"
28 #include "ratngs.h"
29 #include "ocrfeatures.h"
30 #include "unicity_table.h"
31 
32 class WERD_CHOICE;
33 struct ADAPT_RESULTS;
34 struct NORM_PROTOS;
35 
36 namespace tesseract {
37 class Classify : public CCStruct {
38  public:
39   Classify();
40   ~Classify();
getDict()41   Dict& getDict() {
42     return dict_;
43   }
44   /* adaptive.cpp ************************************************************/
45   ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
46   int ClassPruner(INT_TEMPLATES IntTemplates,
47                             inT16 NumFeatures,
48                             INT_FEATURE_ARRAY Features,
49                             CLASS_NORMALIZATION_ARRAY NormalizationFactors,
50                             CLASS_CUTOFF_ARRAY ExpectedNumFeatures,
51                             CLASS_PRUNER_RESULTS Results,
52                             int Debug);
53   void ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
54                                 CLASS_CUTOFF_ARRAY Cutoffs);
55   void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
56   void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
57   ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
58   /* normmatch.cpp ************************************************************/
59   FLOAT32 ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature, BOOL8 DebugMatch);
60   void FreeNormProtos();
61   NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
62   /* protos.cpp ***************************************************************/
63   void ReadClassFile();
64   INT_TEMPLATES
65       CreateIntTemplates(CLASSES FloatProtos,
66                          const UNICHARSET& target_unicharset);
67   /* adaptmatch.cpp ***********************************************************/
68   void AdaptToWord(TWERD *Word,
69                    TEXTROW *Row,
70                    const WERD_CHOICE& BestChoice,
71                    const WERD_CHOICE& BestRawChoice,
72                    const char *rejmap);
73   void InitAdaptiveClassifier();
74   void InitAdaptedClass(TBLOB *Blob,
75                         LINE_STATS *LineStats,
76                         CLASS_ID ClassId,
77                         ADAPT_CLASS Class,
78                         ADAPT_TEMPLATES Templates);
79   void AdaptToPunc(TBLOB *Blob,
80                    LINE_STATS *LineStats,
81                    CLASS_ID ClassId,
82                    FLOAT32 Threshold);
83   void AmbigClassifier(TBLOB *Blob,
84                        LINE_STATS *LineStats,
85                        INT_TEMPLATES Templates,
86                        UNICHAR_ID *Ambiguities,
87                        ADAPT_RESULTS *Results);
88   void MasterMatcher(INT_TEMPLATES templates,
89                      inT16 num_features,
90                      INT_FEATURE_ARRAY features,
91                      CLASS_NORMALIZATION_ARRAY norm_factors,
92                      ADAPT_CLASS* classes,
93                      int debug,
94                      int num_classes,
95                      CLASS_PRUNER_RESULTS results,
96                      ADAPT_RESULTS* final_results);
97   void ConvertMatchesToChoices(ADAPT_RESULTS *Results,
98                                BLOB_CHOICE_LIST *Choices);
99   void AddNewResult(ADAPT_RESULTS *Results,
100                     CLASS_ID ClassId,
101                     FLOAT32 Rating,
102                     int ConfigId);
103 #ifndef GRAPHICS_DISABLED
104   void DebugAdaptiveClassifier(TBLOB *Blob,
105                                LINE_STATS *LineStats,
106                                ADAPT_RESULTS *Results);
107 #endif
108   void GetAdaptThresholds (TWERD * Word,
109                            LINE_STATS * LineStats,
110                            const WERD_CHOICE& BestChoice,
111                            const WERD_CHOICE& BestRawChoice,
112                            FLOAT32 Thresholds[]);
113 
114   int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
115                              CLASS_ID ClassId,
116                              int NumFeatures,
117                              INT_FEATURE_ARRAY Features,
118                              FEATURE_SET FloatFeatures);
119   void MakePermanent(ADAPT_TEMPLATES Templates,
120                      CLASS_ID ClassId,
121                      int ConfigId,
122                      TBLOB *Blob,
123                      LINE_STATS *LineStats);
124   void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
125   void RemoveExtraPuncs(ADAPT_RESULTS *Results);
126   void RemoveBadMatches(ADAPT_RESULTS *Results);
127   void ShowBestMatchFor(TBLOB *Blob,
128                         LINE_STATS *LineStats,
129                         CLASS_ID ClassId,
130                         BOOL8 AdaptiveOn,
131                         BOOL8 PreTrainedOn);
132   UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
133                                  LINE_STATS *LineStats,
134                                  ADAPT_TEMPLATES Templates,
135                                  ADAPT_RESULTS *Results);
136   int CharNormClassifier(TBLOB *Blob,
137                          LINE_STATS *LineStats,
138                          INT_TEMPLATES Templates,
139                          ADAPT_RESULTS *Results);
140   UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
141                              LINE_STATS *LineStats,
142                              CLASS_ID CorrectClass);
143   void DoAdaptiveMatch(TBLOB *Blob,
144                        LINE_STATS *LineStats,
145                        ADAPT_RESULTS *Results);
146   void AdaptToChar(TBLOB *Blob,
147                    LINE_STATS *LineStats,
148                    CLASS_ID ClassId,
149                    FLOAT32 Threshold);
150   int AdaptableWord(TWERD *Word,
151                   const WERD_CHOICE &BestChoiceWord,
152                   const WERD_CHOICE &RawChoiceWord);
153   void EndAdaptiveClassifier();
154   void PrintAdaptiveStatistics(FILE *File);
155   void SettupPass1();
156   void SettupPass2();
157   void AdaptiveClassifier(TBLOB *Blob,
158                           TBLOB *DotBlob,
159                           TEXTROW *Row,
160                           BLOB_CHOICE_LIST *Choices,
161                           CLASS_PRUNER_RESULTS cp_results);
162   void ClassifyAsNoise(ADAPT_RESULTS *Results);
163   void ResetAdaptiveClassifier();
164 
165   FLOAT32 GetBestRatingFor(TBLOB *Blob,
166                            LINE_STATS *LineStats,
167                            CLASS_ID ClassId);
168   int GetCharNormFeatures(TBLOB *Blob,
169                           LINE_STATS *LineStats,
170                           INT_TEMPLATES Templates,
171                           INT_FEATURE_ARRAY IntFeatures,
172                           CLASS_NORMALIZATION_ARRAY CharNormArray,
173                           inT32 *BlobLength);
174   int GetIntCharNormFeatures(TBLOB *Blob,
175                              LINE_STATS *LineStats,
176                              INT_TEMPLATES Templates,
177                              INT_FEATURE_ARRAY IntFeatures,
178                              CLASS_NORMALIZATION_ARRAY CharNormArray,
179                              inT32 *BlobLength);
180 
181   /* float2int.cpp ************************************************************/
182   void ComputeIntCharNormArray(FEATURE NormFeature,
183                                INT_TEMPLATES Templates,
184                                CLASS_NORMALIZATION_ARRAY CharNormArray);
185   /* intproto.cpp *************************************************************/
186   INT_TEMPLATES ReadIntTemplates(FILE *File);
187   void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
188                          const UNICHARSET& target_unicharset);
189   CLASS_ID GetClassToDebug(const char *Prompt);
190   /* font detection ***********************************************************/
get_fontinfo_table()191   UnicityTable<FontInfo>& get_fontinfo_table() {
192     return fontinfo_table_;
193   }
get_fontset_table()194   UnicityTable<FontSet>& get_fontset_table() {
195     return fontset_table_;
196   }
197   /* adaptmatch.cpp ***********************************************************/
198   /* name of current image file being processed */
199   INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
200   /* use class variables to hold onto built-in templates and adapted
201      templates */
202   INT_TEMPLATES PreTrainedTemplates;
203   ADAPT_TEMPLATES AdaptedTemplates;
204   // Successful load of inttemp allows base tesseract classfier to be used.
205   bool inttemp_loaded_;
206 
207   /* create dummy proto and config masks for use with the built-in templates */
208   BIT_VECTOR AllProtosOn;
209   BIT_VECTOR PrunedProtos;
210   BIT_VECTOR AllConfigsOn;
211   BIT_VECTOR AllProtosOff;
212   BIT_VECTOR AllConfigsOff;
213   BIT_VECTOR TempProtoMask;
214   // External control of adaption.
215   BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
216   // Internal control of Adaption so it doesn't work on pass2.
217   BOOL_VAR_H(classify_recog_devanagari, false,
218              "Whether recognizing a language with devanagari script.");
219   bool EnableLearning;
220   /* normmatch.cpp */
221   NORM_PROTOS *NormProtos;
222   /* font detection ***********************************************************/
223   UnicityTable<FontInfo> fontinfo_table_;
224   UnicityTable<FontSet> fontset_table_;
225  private:
226   Dict dict_;
227 };
228 }  // namespace tesseract
229 
230 #endif  // TESSERACT_CLASSIFY_CLASSIFY_H__
231