1 /////////////////////////////////////////////////////////////////////// 2 // File: classify.h 3 // Description: classify class. 4 // Author: Samuel Charron 5 // 6 // (C) Copyright 2006, Google Inc. 7 // Licensed under the Apache License, Version 2.0 (the "License"); 8 // you may not use this file except in compliance with the License. 9 // You may obtain a copy of the License at 10 // http://www.apache.org/licenses/LICENSE-2.0 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 // 17 /////////////////////////////////////////////////////////////////////// 18 19 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H__ 20 #define TESSERACT_CLASSIFY_CLASSIFY_H__ 21 22 #include "adaptive.h" 23 #include "ccstruct.h" 24 #include "classify.h" 25 #include "dict.h" 26 #include "fxdefs.h" 27 #include "intmatcher.h" 28 #include "ratngs.h" 29 #include "ocrfeatures.h" 30 #include "unicity_table.h" 31 32 class WERD_CHOICE; 33 struct ADAPT_RESULTS; 34 struct NORM_PROTOS; 35 36 namespace tesseract { 37 class Classify : public CCStruct { 38 public: 39 Classify(); 40 ~Classify(); getDict()41 Dict& getDict() { 42 return dict_; 43 } 44 /* adaptive.cpp ************************************************************/ 45 ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset); 46 int ClassPruner(INT_TEMPLATES IntTemplates, 47 inT16 NumFeatures, 48 INT_FEATURE_ARRAY Features, 49 CLASS_NORMALIZATION_ARRAY NormalizationFactors, 50 CLASS_CUTOFF_ARRAY ExpectedNumFeatures, 51 CLASS_PRUNER_RESULTS Results, 52 int Debug); 53 void ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset, 54 CLASS_CUTOFF_ARRAY Cutoffs); 55 void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates); 56 void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates); 57 ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File); 58 /* normmatch.cpp ************************************************************/ 59 FLOAT32 ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature, BOOL8 DebugMatch); 60 void FreeNormProtos(); 61 NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset); 62 /* protos.cpp ***************************************************************/ 63 void ReadClassFile(); 64 INT_TEMPLATES 65 CreateIntTemplates(CLASSES FloatProtos, 66 const UNICHARSET& target_unicharset); 67 /* adaptmatch.cpp ***********************************************************/ 68 void AdaptToWord(TWERD *Word, 69 TEXTROW *Row, 70 const WERD_CHOICE& BestChoice, 71 const WERD_CHOICE& BestRawChoice, 72 const char *rejmap); 73 void InitAdaptiveClassifier(); 74 void InitAdaptedClass(TBLOB *Blob, 75 LINE_STATS *LineStats, 76 CLASS_ID ClassId, 77 ADAPT_CLASS Class, 78 ADAPT_TEMPLATES Templates); 79 void AdaptToPunc(TBLOB *Blob, 80 LINE_STATS *LineStats, 81 CLASS_ID ClassId, 82 FLOAT32 Threshold); 83 void AmbigClassifier(TBLOB *Blob, 84 LINE_STATS *LineStats, 85 INT_TEMPLATES Templates, 86 UNICHAR_ID *Ambiguities, 87 ADAPT_RESULTS *Results); 88 void MasterMatcher(INT_TEMPLATES templates, 89 inT16 num_features, 90 INT_FEATURE_ARRAY features, 91 CLASS_NORMALIZATION_ARRAY norm_factors, 92 ADAPT_CLASS* classes, 93 int debug, 94 int num_classes, 95 CLASS_PRUNER_RESULTS results, 96 ADAPT_RESULTS* final_results); 97 void ConvertMatchesToChoices(ADAPT_RESULTS *Results, 98 BLOB_CHOICE_LIST *Choices); 99 void AddNewResult(ADAPT_RESULTS *Results, 100 CLASS_ID ClassId, 101 FLOAT32 Rating, 102 int ConfigId); 103 #ifndef GRAPHICS_DISABLED 104 void DebugAdaptiveClassifier(TBLOB *Blob, 105 LINE_STATS *LineStats, 106 ADAPT_RESULTS *Results); 107 #endif 108 void GetAdaptThresholds (TWERD * Word, 109 LINE_STATS * LineStats, 110 const WERD_CHOICE& BestChoice, 111 const WERD_CHOICE& BestRawChoice, 112 FLOAT32 Thresholds[]); 113 114 int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, 115 CLASS_ID ClassId, 116 int NumFeatures, 117 INT_FEATURE_ARRAY Features, 118 FEATURE_SET FloatFeatures); 119 void MakePermanent(ADAPT_TEMPLATES Templates, 120 CLASS_ID ClassId, 121 int ConfigId, 122 TBLOB *Blob, 123 LINE_STATS *LineStats); 124 void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results); 125 void RemoveExtraPuncs(ADAPT_RESULTS *Results); 126 void RemoveBadMatches(ADAPT_RESULTS *Results); 127 void ShowBestMatchFor(TBLOB *Blob, 128 LINE_STATS *LineStats, 129 CLASS_ID ClassId, 130 BOOL8 AdaptiveOn, 131 BOOL8 PreTrainedOn); 132 UNICHAR_ID *BaselineClassifier(TBLOB *Blob, 133 LINE_STATS *LineStats, 134 ADAPT_TEMPLATES Templates, 135 ADAPT_RESULTS *Results); 136 int CharNormClassifier(TBLOB *Blob, 137 LINE_STATS *LineStats, 138 INT_TEMPLATES Templates, 139 ADAPT_RESULTS *Results); 140 UNICHAR_ID *GetAmbiguities(TBLOB *Blob, 141 LINE_STATS *LineStats, 142 CLASS_ID CorrectClass); 143 void DoAdaptiveMatch(TBLOB *Blob, 144 LINE_STATS *LineStats, 145 ADAPT_RESULTS *Results); 146 void AdaptToChar(TBLOB *Blob, 147 LINE_STATS *LineStats, 148 CLASS_ID ClassId, 149 FLOAT32 Threshold); 150 int AdaptableWord(TWERD *Word, 151 const WERD_CHOICE &BestChoiceWord, 152 const WERD_CHOICE &RawChoiceWord); 153 void EndAdaptiveClassifier(); 154 void PrintAdaptiveStatistics(FILE *File); 155 void SettupPass1(); 156 void SettupPass2(); 157 void AdaptiveClassifier(TBLOB *Blob, 158 TBLOB *DotBlob, 159 TEXTROW *Row, 160 BLOB_CHOICE_LIST *Choices, 161 CLASS_PRUNER_RESULTS cp_results); 162 void ClassifyAsNoise(ADAPT_RESULTS *Results); 163 void ResetAdaptiveClassifier(); 164 165 FLOAT32 GetBestRatingFor(TBLOB *Blob, 166 LINE_STATS *LineStats, 167 CLASS_ID ClassId); 168 int GetCharNormFeatures(TBLOB *Blob, 169 LINE_STATS *LineStats, 170 INT_TEMPLATES Templates, 171 INT_FEATURE_ARRAY IntFeatures, 172 CLASS_NORMALIZATION_ARRAY CharNormArray, 173 inT32 *BlobLength); 174 int GetIntCharNormFeatures(TBLOB *Blob, 175 LINE_STATS *LineStats, 176 INT_TEMPLATES Templates, 177 INT_FEATURE_ARRAY IntFeatures, 178 CLASS_NORMALIZATION_ARRAY CharNormArray, 179 inT32 *BlobLength); 180 181 /* float2int.cpp ************************************************************/ 182 void ComputeIntCharNormArray(FEATURE NormFeature, 183 INT_TEMPLATES Templates, 184 CLASS_NORMALIZATION_ARRAY CharNormArray); 185 /* intproto.cpp *************************************************************/ 186 INT_TEMPLATES ReadIntTemplates(FILE *File); 187 void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, 188 const UNICHARSET& target_unicharset); 189 CLASS_ID GetClassToDebug(const char *Prompt); 190 /* font detection ***********************************************************/ get_fontinfo_table()191 UnicityTable<FontInfo>& get_fontinfo_table() { 192 return fontinfo_table_; 193 } get_fontset_table()194 UnicityTable<FontSet>& get_fontset_table() { 195 return fontset_table_; 196 } 197 /* adaptmatch.cpp ***********************************************************/ 198 /* name of current image file being processed */ 199 INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP"); 200 /* use class variables to hold onto built-in templates and adapted 201 templates */ 202 INT_TEMPLATES PreTrainedTemplates; 203 ADAPT_TEMPLATES AdaptedTemplates; 204 // Successful load of inttemp allows base tesseract classfier to be used. 205 bool inttemp_loaded_; 206 207 /* create dummy proto and config masks for use with the built-in templates */ 208 BIT_VECTOR AllProtosOn; 209 BIT_VECTOR PrunedProtos; 210 BIT_VECTOR AllConfigsOn; 211 BIT_VECTOR AllProtosOff; 212 BIT_VECTOR AllConfigsOff; 213 BIT_VECTOR TempProtoMask; 214 // External control of adaption. 215 BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier"); 216 // Internal control of Adaption so it doesn't work on pass2. 217 BOOL_VAR_H(classify_recog_devanagari, false, 218 "Whether recognizing a language with devanagari script."); 219 bool EnableLearning; 220 /* normmatch.cpp */ 221 NORM_PROTOS *NormProtos; 222 /* font detection ***********************************************************/ 223 UnicityTable<FontInfo> fontinfo_table_; 224 UnicityTable<FontSet> fontset_table_; 225 private: 226 Dict dict_; 227 }; 228 } // namespace tesseract 229 230 #endif // TESSERACT_CLASSIFY_CLASSIFY_H__ 231