• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
///////////////////////////////////////////////////////////////////////
// File:        ambigs.h
// Description: Constants, flags, functions for dealing with
//              ambiguities (training and recognition).
// Author:      Daria Antonova
// Created:     Mon Aug 23 11:26:43 PDT 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
20 
21 #ifndef TESSERACT_CCUTIL_AMBIGS_H_
22 #define TESSERACT_CCUTIL_AMBIGS_H_
23 
24 #include "elst.h"
25 #include "tprintf.h"
26 #include "unichar.h"
27 #include "unicharset.h"
28 #include "genericvector.h"
29 
30 #define MAX_AMBIG_SIZE    10
31 
32 extern INT_VAR_H(global_ambigs_debug_level, 0,
33                  "Debug level for unichar ambiguities");
34 extern BOOL_VAR_H(use_definite_ambigs_for_classifier, 0,
35                   "Use definite ambiguities when running character classifier");
36 
37 namespace tesseract {
38 
// Buffer size constant; the code that uses it is not in this header.
static const int kUnigramAmbigsBufferSize = 1000;
// A single space followed by the terminating NUL (equivalent to " ").
static const char kAmbigNgramSeparator[] = { ' ', '\0' };
// Delimiter characters: tab and space.
static const char kAmbigDelimiters[] = "\t ";
// printf-style message; takes the offending line number (%d).
static const char kIllegalMsg[] =
  "Illegal ambiguity specification on line %d\n";
// printf-style message; takes the offending unichar string (%s).
static const char kIllegalUnicharMsg[] =
  "Illegal unichar %s in ambiguity specification\n";
46 
// Kinds of ambiguity between an ocred ngram and the correct ngram.
// The enumerators take the default consecutive values starting at 0, so
// AMBIG_TYPE_COUNT equals the number of real entries preceding it.
enum AmbigType {
  NOT_AMBIG,        // the ngram pair is not ambiguous
  REPLACE_AMBIG,    // ocred ngram should always be substituted with correct
  DEFINITE_AMBIG,   // add correct ngram to the classifier results (1-1)
  SIMILAR_AMBIG,    // use pairwise classifier for ocred/correct pair (1-1)
  CASE_AMBIG,       // this is a case ambiguity (1-1)

  AMBIG_TYPE_COUNT  // number of enum entries
};
56 
57 // A collection of utility functions for arrays of UNICHAR_IDs that are
58 // terminated by INVALID_UNICHAR_ID.
59 class UnicharIdArrayUtils {
60  public:
61   // Compares two arrays of unichar ids. Returns -1 if the length of array1 is
62   // less than length of array2, if any array1[i] is less than array2[i].
63   // Returns 0 if the arrays are equal, 1 otherwise.
64   // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
compare(const UNICHAR_ID array1[],const UNICHAR_ID array2[])65   static inline int compare(const UNICHAR_ID array1[],
66                             const UNICHAR_ID array2[]) {
67     const UNICHAR_ID *ptr1 = array1;
68     const UNICHAR_ID *ptr2 = array2;
69     while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
70       if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
71       ++ptr1;
72       ++ptr2;
73     }
74     if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
75     return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
76   }
77 
78   // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
79   // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
80   // and that dst has enough space for all the elements from src.
copy(const UNICHAR_ID src[],UNICHAR_ID dst[])81   static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
82     int i = 0;
83     do {
84       dst[i] = src[i];
85     } while (dst[i++] != INVALID_UNICHAR_ID);
86     return i - 1;
87   }
88 
89   // Prints unichars corresponding to the unichar_ids in the given array.
90   // The function assumes that array is terminated by INVALID_UNICHAR_ID.
print(const UNICHAR_ID array[],const UNICHARSET & unicharset)91   static inline void print(const UNICHAR_ID array[],
92                            const UNICHARSET &unicharset) {
93     const UNICHAR_ID *ptr = array;
94     if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
95     while (*ptr != INVALID_UNICHAR_ID) {
96       tprintf("%s ", unicharset.id_to_unichar(*ptr++));
97     }
98     tprintf("( ");
99     ptr = array;
100     while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
101     tprintf(")\n");
102   }
103 };
104 
105 // AMBIG_SPEC_LIST stores a list of dangerous ambigs that
106 // start with the same unichar (e.g. r->t rn->m rr1->m).
107 class AmbigSpec : public ELIST_LINK {
108  public:
109   AmbigSpec();
~AmbigSpec()110   ~AmbigSpec() {}
111 
112   // Comparator function for sorting AmbigSpec_LISTs. The lists will
113   // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
114   // in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
compare_ambig_specs(const void * spec1,const void * spec2)115   static int compare_ambig_specs(const void *spec1, const void *spec2) {
116     const AmbigSpec *s1 =
117       *reinterpret_cast<const AmbigSpec * const *>(spec1);
118     const AmbigSpec *s2 =
119       *reinterpret_cast<const AmbigSpec * const *>(spec2);
120     return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
121   }
122 
123   UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
124   UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
125   UNICHAR_ID correct_ngram_id;
126   AmbigType type;
127   int wrong_ngram_size;
128 };
129 ELISTIZEH(AmbigSpec);
130 
131 // AMBIG_TABLE[i] stores a set of ambiguities whose
132 // wrong ngram starts with unichar id i.
133 typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
134 typedef GenericVector<UNICHAR_ID> UnicharIdVector;
135 
136 class UnicharAmbigs {
137  public:
UnicharAmbigs()138   UnicharAmbigs() {}
~UnicharAmbigs()139   ~UnicharAmbigs() {
140     replace_ambigs_.delete_data_pointers();
141     dang_ambigs_.delete_data_pointers();
142     one_to_one_definite_ambigs_.delete_data_pointers();
143   }
144 
dang_ambigs()145   const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
replace_ambigs()146   const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
147 
148   // Fills in two ambiguity tables (replaceable and dangerous) with information
149   // read from the ambigs file. An ambiguity table is an array of lists.
150   // The array is indexed by a class id. Each entry in the table provides
151   // a list of potential ambiguities which can start with the corresponding
152   // character. For example the ambiguity "rn -> m", would be located in the
153   // table at index of unicharset.unichar_to_id('r').
154   // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
155   // one_to_one_definite_ambigs_. This vector is also indexed by the class id
156   // of the wrong part of the ambiguity and each entry contains a vector of
157   // unichar ids that are ambiguous to it.
158   void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset,
159                          UNICHARSET *unicharset);
160 
161   // Return definite 1-1 ambigs.
OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id)162   const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
163     if (one_to_one_definite_ambigs_.empty()) return NULL;
164     return one_to_one_definite_ambigs_[unichar_id];
165   }
166 
167  private:
168 
169   bool ParseAmbiguityLine(int line_num, int version,
170                           const UNICHARSET &unicharset, char *buffer,
171                           int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
172                           int *ReplacementAmbigPartSize,
173                           char *ReplacementString, int *type);
174   void InsertIntoTable(UnicharAmbigsVector &table,
175                        int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
176                        int ReplacementAmbigPartSize,
177                        const char *ReplacementString, int type,
178                        AmbigSpec *ambig_spec, UNICHARSET *unicharset);
179   UnicharAmbigsVector dang_ambigs_;
180   UnicharAmbigsVector replace_ambigs_;
181   GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
182 };
183 
184 }  // namespace tesseract
185 
186 #endif  // TESSERACT_CCUTIL_AMBIGS_H_
187