1 /////////////////////////////////////////////////////////////////////// 2 // File: ambigs.h 3 // Description: Constants, flags, functions for dealing with 4 // ambiguities (training and recognition). 5 // Author: Daria Antonova 6 // Created: Mon Aug 23 11:26:43 PDT 2008 7 // 8 // (C) Copyright 2008, Google Inc. 9 // Licensed under the Apache License, Version 2.0 (the "License"); 10 // you may not use this file except in compliance with the License. 11 // You may obtain a copy of the License at 12 // http://www.apache.org/licenses/LICENSE-2.0 13 // Unless required by applicable law or agreed to in writing, software 14 // distributed under the License is distributed on an "AS IS" BASIS, 15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 // See the License for the specific language governing permissions and 17 // limitations under the License. 18 // 19 /////////////////////////////////////////////////////////////////////// 20 21 #ifndef TESSERACT_CCUTIL_AMBIGS_H_ 22 #define TESSERACT_CCUTIL_AMBIGS_H_ 23 24 #include "elst.h" 25 #include "tprintf.h" 26 #include "unichar.h" 27 #include "unicharset.h" 28 #include "genericvector.h" 29 30 #define MAX_AMBIG_SIZE 10 31 32 extern INT_VAR_H(global_ambigs_debug_level, 0, 33 "Debug level for unichar ambiguities"); 34 extern BOOL_VAR_H(use_definite_ambigs_for_classifier, 0, 35 "Use definite ambiguities when running character classifier"); 36 37 namespace tesseract { 38 39 static const int kUnigramAmbigsBufferSize = 1000; 40 static const char kAmbigNgramSeparator[] = { ' ', '\0' }; 41 static const char kAmbigDelimiters[] = "\t "; 42 static const char kIllegalMsg[] = 43 "Illegal ambiguity specification on line %d\n"; 44 static const char kIllegalUnicharMsg[] = 45 "Illegal unichar %s in ambiguity specification\n"; 46 47 enum AmbigType { 48 NOT_AMBIG, // the ngram pair is not ambiguous 49 REPLACE_AMBIG, // ocred ngram should always be substituted with correct 50 DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1) 51 SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1) 52 CASE_AMBIG, // this is a case ambiguity (1-1) 53 54 AMBIG_TYPE_COUNT // number of enum entries 55 }; 56 57 // A collection of utility functions for arrays of UNICHAR_IDs that are 58 // terminated by INVALID_UNICHAR_ID. 59 class UnicharIdArrayUtils { 60 public: 61 // Compares two arrays of unichar ids. Returns -1 if the length of array1 is 62 // less than length of array2, if any array1[i] is less than array2[i]. 63 // Returns 0 if the arrays are equal, 1 otherwise. 64 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID. compare(const UNICHAR_ID array1[],const UNICHAR_ID array2[])65 static inline int compare(const UNICHAR_ID array1[], 66 const UNICHAR_ID array2[]) { 67 const UNICHAR_ID *ptr1 = array1; 68 const UNICHAR_ID *ptr2 = array2; 69 while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) { 70 if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1; 71 ++ptr1; 72 ++ptr2; 73 } 74 if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0; 75 return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1; 76 } 77 78 // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied. 79 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID 80 // and that dst has enough space for all the elements from src. copy(const UNICHAR_ID src[],UNICHAR_ID dst[])81 static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) { 82 int i = 0; 83 do { 84 dst[i] = src[i]; 85 } while (dst[i++] != INVALID_UNICHAR_ID); 86 return i - 1; 87 } 88 89 // Prints unichars corresponding to the unichar_ids in the given array. 90 // The function assumes that array is terminated by INVALID_UNICHAR_ID. print(const UNICHAR_ID array[],const UNICHARSET & unicharset)91 static inline void print(const UNICHAR_ID array[], 92 const UNICHARSET &unicharset) { 93 const UNICHAR_ID *ptr = array; 94 if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]"); 95 while (*ptr != INVALID_UNICHAR_ID) { 96 tprintf("%s ", unicharset.id_to_unichar(*ptr++)); 97 } 98 tprintf("( "); 99 ptr = array; 100 while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++); 101 tprintf(")\n"); 102 } 103 }; 104 105 // AMBIG_SPEC_LIST stores a list of dangerous ambigs that 106 // start with the same unichar (e.g. r->t rn->m rr1->m). 107 class AmbigSpec : public ELIST_LINK { 108 public: 109 AmbigSpec(); ~AmbigSpec()110 ~AmbigSpec() {} 111 112 // Comparator function for sorting AmbigSpec_LISTs. The lists will 113 // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors 114 // in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1]. compare_ambig_specs(const void * spec1,const void * spec2)115 static int compare_ambig_specs(const void *spec1, const void *spec2) { 116 const AmbigSpec *s1 = 117 *reinterpret_cast<const AmbigSpec * const *>(spec1); 118 const AmbigSpec *s2 = 119 *reinterpret_cast<const AmbigSpec * const *>(spec2); 120 return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram); 121 } 122 123 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; 124 UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1]; 125 UNICHAR_ID correct_ngram_id; 126 AmbigType type; 127 int wrong_ngram_size; 128 }; 129 ELISTIZEH(AmbigSpec); 130 131 // AMBIG_TABLE[i] stores a set of ambiguities whose 132 // wrong ngram starts with unichar id i. 133 typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector; 134 typedef GenericVector<UNICHAR_ID> UnicharIdVector; 135 136 class UnicharAmbigs { 137 public: UnicharAmbigs()138 UnicharAmbigs() {} ~UnicharAmbigs()139 ~UnicharAmbigs() { 140 replace_ambigs_.delete_data_pointers(); 141 dang_ambigs_.delete_data_pointers(); 142 one_to_one_definite_ambigs_.delete_data_pointers(); 143 } 144 dang_ambigs()145 const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; } replace_ambigs()146 const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; } 147 148 // Fills in two ambiguity tables (replaceable and dangerous) with information 149 // read from the ambigs file. An ambiguity table is an array of lists. 150 // The array is indexed by a class id. Each entry in the table provides 151 // a list of potential ambiguities which can start with the corresponding 152 // character. For example the ambiguity "rn -> m", would be located in the 153 // table at index of unicharset.unichar_to_id('r'). 154 // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in 155 // one_to_one_definite_ambigs_. This vector is also indexed by the class id 156 // of the wrong part of the ambiguity and each entry contains a vector of 157 // unichar ids that are ambiguous to it. 158 void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset, 159 UNICHARSET *unicharset); 160 161 // Return definite 1-1 ambigs. OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id)162 const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const { 163 if (one_to_one_definite_ambigs_.empty()) return NULL; 164 return one_to_one_definite_ambigs_[unichar_id]; 165 } 166 167 private: 168 169 bool ParseAmbiguityLine(int line_num, int version, 170 const UNICHARSET &unicharset, char *buffer, 171 int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds, 172 int *ReplacementAmbigPartSize, 173 char *ReplacementString, int *type); 174 void InsertIntoTable(UnicharAmbigsVector &table, 175 int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds, 176 int ReplacementAmbigPartSize, 177 const char *ReplacementString, int type, 178 AmbigSpec *ambig_spec, UNICHARSET *unicharset); 179 UnicharAmbigsVector dang_ambigs_; 180 UnicharAmbigsVector replace_ambigs_; 181 GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_; 182 }; 183 184 } // namespace tesseract 185 186 #endif // TESSERACT_CCUTIL_AMBIGS_H_ 187