• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        dict.h
3 // Description: dict class.
4 // Author:      Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
21 
22 #include "ambigs.h"
23 #include "choices.h"
24 #include "choicearr.h"
25 #include "dawg.h"
26 #include "image.h"
27 #include "ratngs.h"
28 #include "stopper.h"
29 #include "trie.h"
30 #include "unicharset.h"
31 
32 extern STRING_VAR_H(global_user_words_suffix, "user-words",
33                     "A list of user-provided words.");
34 extern INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
35 
36 #define MAX_WERD_LENGTH        (inT64) 40
37 #define NO_RATING               -1
38 #define FREQ_WERD               1.0
39 #define GOOD_WERD               1.1
40 #define OK_WERD                 1.3125
41 
42 // Struct used to hold temporary information about fragments.
43 struct CHAR_FRAGMENT_INFO {
44   UNICHAR_ID unichar_id;
45   const CHAR_FRAGMENT *fragment;
46   int num_fragments;
47   float rating;
48   float certainty;
49 };
50 
51 namespace tesseract {
52 
53 typedef GenericVector<Dawg *> DawgVector;
54 
55 struct DawgArgs {
DawgArgsDawgArgs56   DawgArgs(DawgInfoVector *d, DawgInfoVector *c,
57            DawgInfoVector *ud, DawgInfoVector *uc, float r) :
58     active_dawgs(d), constraints(c), updated_active_dawgs(ud),
59     updated_constraints(uc), rating_margin(r) {
60     for (int i = 0; i < MAX_WERD_LENGTH; ++i) {
61       rating_array[i] = NO_RATING;
62     }
63     permuter = NO_PERM;
64   }
65   DawgInfoVector *active_dawgs;
66   DawgInfoVector *constraints;
67   DawgInfoVector *updated_active_dawgs;
68   DawgInfoVector *updated_constraints;
69   PermuterType permuter;
70   float rating_margin;  // prunning margin ratio
71   float rating_array[MAX_WERD_LENGTH];
72 };
73 
74 class Dict {
75  public:
76   Dict(Image* image_ptr);
77   ~Dict();
getImage()78   Image* getImage() {
79     return image_ptr_;
80   }
getUnicharset()81   UNICHARSET& getUnicharset() {
82     return getImage()->getCCUtil()->unicharset;
83   }
getUnicharAmbigs()84   const UnicharAmbigs &getUnicharAmbigs() {
85     return getImage()->getCCUtil()->unichar_ambigs;
86   }
87 
88   /* hyphen.cpp ************************************************************/
89 
90   // Returns true if we've recorded the beginning of a hyphenated word.
hyphenated()91   inline bool hyphenated() { return !last_word_on_line_ && hyphen_word_; }
92   // Size of the base word (the part on the line before) of a hyphenated word.
hyphen_base_size()93   inline int hyphen_base_size() {
94     return this->hyphenated() ? hyphen_word_->length() : 0;
95   }
96   // If this word is hyphenated copy the base word (the part on
97   // the line before) of a hyphenated word into the given word.
98   // This function assumes that word is not NULL.
copy_hyphen_info(WERD_CHOICE * word)99   inline void copy_hyphen_info(WERD_CHOICE *word) {
100     if (this->hyphenated()) {
101       *word = *hyphen_word_;
102       if (hyphen_debug_level) word->print("copy_hyphen_info: ");
103     }
104   }
105   // Erase the unichar ids corresponding to the portion of the word
106   // from the previous line. The word is not changed if it is not
107   // split between lines and hyphenated.
remove_hyphen_head(WERD_CHOICE * word)108   inline void remove_hyphen_head(WERD_CHOICE *word) {
109     if (this->hyphenated()) {
110       word->remove_unichar_ids(0, hyphen_word_->length());
111       if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
112     }
113   }
114   // Check whether the word has a hyphen at the end.
has_hyphen_end(const WERD_CHOICE & word)115   inline bool has_hyphen_end(const WERD_CHOICE &word) {
116     int word_index = word.length() - 1;
117     return (last_word_on_line_ && word_index > 0 &&
118             word.unichar_id(word_index) == hyphen_unichar_id_);
119   }
120   // Unless the previous word was the last one on the line, and the current
121   // one is not (thus it is the first one on the line), erase hyphen_word_,
122   // clear hyphen_active_dawgs_, hyphen_constraints_ update last_word_on_line_.
123   void reset_hyphen_vars(bool last_word_on_line);
124   // Update hyphen_word_, and copy the given DawgInfoVectors into
125   // hyphen_active_dawgs_ and hyphen_constraints_.
126   void set_hyphen_word(const WERD_CHOICE &word,
127                        const DawgInfoVector &active_dawgs,
128                        const DawgInfoVector &constraints);
129 
130   /* permdawg.cpp ************************************************************/
131   // If new_rating < best_choice->rating(), copy word int best_choice
132   // and update rating and permuter of best_choice to the new given values.
update_best_choice(const WERD_CHOICE & word,WERD_CHOICE * best_choice)133   inline void update_best_choice(
134       const WERD_CHOICE &word, WERD_CHOICE *best_choice) {
135     if (word.rating() < best_choice->rating()) {
136       *best_choice = word;
137     }
138   }
139   // Fill the given active_dawgs vector with dawgs that could contain the
140   // beginning of the word. If hyphenated() returns true, copy the entries
141   // from hyphen_active_dawgs_ instead.
142   void init_active_dawgs(DawgInfoVector *active_dawgs);
143   // If hyphenated() returns true, copy the entries from hyphen_constraints_
144   // into the given constraints vector.
145   void init_constraints(DawgInfoVector *constraints);
146   // Recursively explore all the possible character combinations in
147   // the given char_choices. Use go_deeper_dawg_fxn() to explore all the
148   // dawgs in the dawgs_ vector in parallel and discard invalid words.
149   //
150   // Allocate and return a WERD_CHOICE with the best valid word found.
151   WERD_CHOICE *dawg_permute_and_select(
152       const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
153   void adjust_word(WERD_CHOICE *best_choice,
154                    float *certainty_array);
155   // If the choice being composed so far could be a dictionary word
156   // and we have not reached the end of the word keep exploring the
157   // char_choices further.
158   // Also:
159   // -- set hyphen word if needed
160   // -- if word_ending is true and word is better than best_choice
161   //    copy word to best_choice log new word choice
162   void go_deeper_dawg_fxn(
163       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
164       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
165       bool word_ending, WERD_CHOICE *word, float certainties[],
166       float *limit, WERD_CHOICE *best_choice, void *void_more_args);
167 
168   /* permute.cpp *************************************************************/
169   void add_document_word(const WERD_CHOICE &best_choice);
170   void init_permute();
171   WERD_CHOICE *permute_top_choice(
172     const BLOB_CHOICE_LIST_VECTOR &char_choices,
173     float* rating_limit,
174     WERD_CHOICE *raw_choice,
175     BOOL8 *any_alpha);
176   const char* choose_il1(const char *first_char,       //first choice
177                          const char *second_char,      //second choice
178                          const char *third_char,       //third choice
179                          const char *prev_char,        //prev in word
180                          const char *next_char,        //next in word
181                          const char *next_next_char);  //after next next in word
valid_word(const WERD_CHOICE & word)182   int valid_word(const WERD_CHOICE &word) {
183     return valid_word(word, false);  // return NO_PERM for words with digits
184   }
valid_word_or_number(const WERD_CHOICE & word)185   int valid_word_or_number(const WERD_CHOICE &word) {
186     return valid_word(word, true);  // return NUMBER_PERM for valid numbers
187   }
188   int valid_word(const WERD_CHOICE &word, bool numbers_ok);
189   bool valid_punctuation(const WERD_CHOICE &word);
190   WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
191                            float rating_limit,
192                            WERD_CHOICE *raw_choice);
193   void end_permute();
194   void adjust_non_word(WERD_CHOICE *word, float *adjust_factor);
195   void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
196                        float rating_limit,
197                        int start,
198                        int end,
199                        WERD_CHOICE *current_word);
200   void permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
201                           float limit,
202                           WERD_CHOICE *best_choice,
203                           WERD_CHOICE *raw_choice);
204   WERD_CHOICE *permute_compound_words(
205       const BLOB_CHOICE_LIST_VECTOR &char_choices,
206       float rating_limit);
207   // checks if the dominant word script, if there is one, is same as target.
208   bool word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices,
209                       int target_script_id);
210   // Incoporate segmentation cost into word rating
211   void incorporate_segcost(WERD_CHOICE* word);
212   // checks for script-consistent permutations
213   WERD_CHOICE *permute_script_words(
214       const BLOB_CHOICE_LIST_VECTOR &char_choices);
215 
216   WERD_CHOICE *top_fragments_permute_and_select(
217       const BLOB_CHOICE_LIST_VECTOR &char_choices,
218       float rating_limit);
219   // If the choice being composed so far could be better
220   // than best_choice keep exploring char_choices.
221   // If we have reached the end of the word and word is better than
222   // best_choice, copy word to best_choice and log a new word choice.
223   void go_deeper_top_fragments_fxn(
224       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
225       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
226       bool word_ending, WERD_CHOICE *word, float certainties[],
227       float *limit, WERD_CHOICE *best_choice, void *more_args);
228 
229   // Semi-generic functions used by multiple permuters.
230   bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
231                            float curr_rating, float curr_certainty,
232                            const CHAR_FRAGMENT_INFO *prev_char_frag_info,
233                            const char *debug, int word_ending,
234                            CHAR_FRAGMENT_INFO *char_frag_info);
235   void permute_choices(
236       const char *debug,
237       const BLOB_CHOICE_LIST_VECTOR &char_choices,
238       int char_choice_index,
239       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
240       WERD_CHOICE *word,
241       float certainties[],
242       float *limit,
243       WERD_CHOICE *best_choice,
244       void *more_args);
245 
246   void append_choices(
247       const char *debug,
248       const BLOB_CHOICE_LIST_VECTOR &char_choices,
249       const BLOB_CHOICE &blob_choice,
250       int char_choice_index,
251       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
252       WERD_CHOICE *word,
253       float certainties[],
254       float *limit,
255       WERD_CHOICE *best_choice,
256       void *more_args);
257   // Pointer to go_deeper function that will be modified by various permuters.
258   void (Dict::*go_deeper_fxn_)(const char *debug,
259                                const BLOB_CHOICE_LIST_VECTOR &char_choices,
260                              int char_choice_index,
261                              const CHAR_FRAGMENT_INFO *prev_char_frag_info,
262                                bool word_ending, WERD_CHOICE *word,
263                                float certainties[], float *limit,
264                                WERD_CHOICE *best_choice, void *void_more_args);
265   /* stopper.cpp *************************************************************/
266   int NoDangerousAmbig(WERD_CHOICE *BestChoice,
267                        DANGERR *fixpt,
268                        bool fix_replaceable,
269                        BLOB_CHOICE_LIST_VECTOR *Choices,
270                        bool *modified_blobs);
271   void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
272                     UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
273                     BLOB_CHOICE_LIST_VECTOR *blob_choices,
274                     bool *modified_blobs);
275 
DisableChoiceAccum()276   inline void DisableChoiceAccum() { keep_word_choices_ = FALSE; }
EnableChoiceAccum()277   inline void EnableChoiceAccum() { keep_word_choices_ = TRUE; }
278 
279   int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
280   VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice,
281                                 FLOAT32 AdjustFactor,
282                                 const float Certainties[]);
283   void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
284   int StringSameAs(const char *String,
285                    const char *String_lengths,
286                    VIABLE_CHOICE ViableChoice);
287   bool StringSameAs(const WERD_CHOICE &WordChoice,
288                     VIABLE_CHOICE ViableChoice);
289   int AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
290                        WERD_CHOICE *BestChoice,
291                        const WERD_CHOICE &RawChoice,
292                        DANGERR *fixpt,
293                        ACCEPTABLE_CHOICE_CALLER caller,
294                        bool *modified_blobs);
295   int AcceptableResult(const WERD_CHOICE &BestChoice,
296                        const WERD_CHOICE &RawChoice);
297   int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice);
298   void LogNewChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor,
299                     const float Certainties[], bool raw_choice);
300   void EndDangerousAmbigs();
301   int CurrentBestChoiceIs(const WERD_CHOICE &WordChoice);
302   FLOAT32 CurrentBestChoiceAdjustFactor();
303   int CurrentWordAmbig();
304   void DebugWordChoices();
305   void PrintAmbigAlternatives(FILE *file, const char *label,
306                               int label_num_unichars);
307   void FillViableChoice(const WERD_CHOICE &WordChoice,
308                         FLOAT32 AdjustFactor, const float Certainties[],
309                         bool SameString, VIABLE_CHOICE ViableChoice);
310   int AlternativeChoicesWorseThan(FLOAT32 Threshold);
311   void FilterWordChoices();
312   void FindClassifierErrors(FLOAT32 MinRating,
313                           FLOAT32 MaxRating,
314                           FLOAT32 RatingMargin,
315                           FLOAT32 Thresholds[]);
316   void InitChoiceAccum();
317   void LogNewSegmentation(PIECES_STATE BlobWidth);
318   void LogNewSplit(int Blob);
319   void SettupStopperPass1();
320   void SettupStopperPass2();
321   /* choices.cpp *************************************************************/
322   void print_word_string(const char* str);
323   void print_word_choice(const char *label, A_CHOICE* choice);
324   void print_choices(const char *label,
325                      CHOICES rating);   // List of (A_CHOICE*).
326   /* permngram.cpp ***********************************************************/
327   A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
328                                      float rating_limit,
329                                      const Dawg *dawg);
330   /* dawg.cpp ****************************************************************/
331 
332   // Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
333   // of the current state the letter at word_index in the given word
334   // is allowed according to at least one of the dawgs in dawgs_,
335   // otherwise returns NO_PERM.
336   //
337   // The state is described by void_dawg_args, which are interpreted as
338   // DawgArgs and contain two relevant input vectors: active_dawgs and
339   // constraints. Each entry in the active_dawgs vector contains an index
340   // into the dawgs_ vector and an EDGE_REF that indicates the last edge
341   // followed in the dawg. Each entry in the constraints vector contains
342   // an index into the dawgs_ vector and an EDGE_REF that indicates an edge
343   // in a pattern dawg followed to match a pattern. Currently constraints
344   // are used to save the state of punctuation dawgs after leading
345   // punctuation was found.
346   //
347   // Input:
348   // At word_index 0 dawg_args->active_dawgs should contain an entry for each
349   // dawg whose type has a bit set in kBeginningDawgsType,
350   // dawg_args->constraints should be empty. EDGE_REFs in active_dawgs and
351   // constraints vectors should be initialized to NO_EDGE. If hyphen state
352   // needs to be applied, initial dawg_args->active_dawgs and
353   // dawg_args->constrains can be copied from the saved hyphen state
354   // (maintained by Dict).
355   // For word_index > 0 the corresponding state (active_dawgs and constraints)
356   // can be obtained from dawg_args->updated_* passed to def_letter_is_okay
357   // for word_index-1.
358   // Note: the function assumes that active_dags, constraints and updated_*
359   // member variables of dawg_args are not NULL.
360   //
361   // Output:
362   // The function fills in dawg_args->updated_active_dawgs vector with the
363   // entries for dawgs that contain the word up to the letter at word_index.
364   // The new constraints (if any) are added to dawg_args->updated_constraints,
365   // the constraints from dawg_args->constraints are also copied into it.
366   //
367   // Detailed description:
368   // In order to determine whether the word is still valid after considering
369   // all the letters up to the one at word_index the following is done for
370   // each entry in dawg_args->active_dawgs:
371   //
372   // -- next starting node is obtained from entry.ref and edge_char_of() is
373   //    called to obtain the next edge
374   // -- if a valid edge is found, the function returns the updated permuter
375   //    code true and an entry [entry.dawg_index, edge] is inserted in
376   //    dawg_args->updated_active_dawgs
377   //    otherwise:
378   //    -- if we are dealing with dawg of type DAWG_TYPE_PUNCTUATION,
379   //       edge_char_of() is called again, but now with kPatternUnicharID
380   //       as unichar_id; if a valid edge is found it is recorded in
381   //       dawg_args->updated_constraints
382   //    -- the function checks whether the word can end with the previous
383   //       letter
384   //    -- each successor of the dawg (e.g. dawgs with type DAWG_TYPE_WORD
385   //       could be successors to dawgs with type DAWG_TYPE_PUNCTUATION; the
386   //       successors are defined by successors_ vector) is explored and if
387   //       a letter is found in the successor dawg, a new entry is inserted
388   //       into dawg_args->updated_active_dawgs with EDGE_REF being either
389   //       NO_EDGE or an EDGE_REF recorded in constraints vector for the
390   //       corresponding dawg index
391 
392   //
393   int def_letter_is_okay(void* void_dawg_args, int word_index,
394                          const void* word, bool word_end);
395 
396   int new_letter_is_okay(void* void_dawg_args, int word_index,
397                          const void* word, bool word_end);
398   int (Dict::*letter_is_okay_)(void* void_dawg_args, int word_index,
399                                const void *word, bool word_end);
400   // Return the number of dawgs in the dawgs_ vector.
NumDawgs()401   inline const int NumDawgs() const { return dawgs_.size(); }
402   // Return i-th dawg pointer recorded in the dawgs_ vector.
GetDawg(int index)403   inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
404   // At word ending make sure all the recorded constraints are satisfied.
405   // Each constraint signifies that we found a beginning pattern in a
406   // pattern dawg. Check that this pattern can end here (e.g. if some
407   // leading punctuation is found this would ensure that we are not
408   // expecting any particular trailing punctuation after the word).
ConstraintsOk(const DawgInfoVector & constraints,int word_end,DawgType current_dawg_type)409   inline bool ConstraintsOk(const DawgInfoVector &constraints,
410                             int word_end, DawgType current_dawg_type) {
411     if (!word_end) return true;
412     if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
413     for (int c = 0; c < constraints.length(); ++c) {
414       const DawgInfo &cinfo = constraints[c];
415       Dawg *cdawg = dawgs_[cinfo.dawg_index];
416       if (!cdawg->end_of_word(cinfo.ref)) {
417         if (dawg_debug_level >= 3) {
418           tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n",
419                   cinfo.dawg_index, cinfo.ref);
420         }
421         return false;
422       }
423     }
424     return true;
425   }
426   // Record the maximum of the two permuters in permuter.
UpdatePermuter(PermuterType new_permuter,PermuterType * permuter)427   static inline void UpdatePermuter(PermuterType new_permuter,
428                                     PermuterType *permuter) {
429     if (dawg_debug_level >= 3) tprintf("Letter found\n");
430     if (new_permuter > *permuter) *permuter = new_permuter;
431   }
432 
433   /* conversion.cpp **********************************************************/
434   // TODO(daria): remove these function when conversion.cpp is deprecated
435   // and all the code is converted to work with unichar ids.
436   void LogNewWordChoice(A_CHOICE *a_choice,
437                         FLOAT32 adjust_factor,
438                         const float certainties[],
439                         const UNICHARSET &unicharset);
440   int valid_word(const char *string);
441 
442  private:
443   // Private member variables.
444   Image* image_ptr_;
445   // Table that stores ambiguities computed during training
446   // (loaded when NoDangerousAmbigs() is called for the first time).
447   // Each entry i in the table stores a set of amibiguities whose
448   // wrong ngram starts with unichar id i.
449   UnicharAmbigs *dang_ambigs_table_;
450   // Same as above, but for ambiguities with replace flag set.
451   UnicharAmbigs *replace_ambigs_table_;
452   // Flag used to disable accumulation of word choices
453   // during compound word permutation.
454   BOOL8 keep_word_choices_;
455   // Additional certainty padding allowed before a word is rejected.
456   FLOAT32 reject_offset_;
457   // Current word segmentation.
458   PIECES_STATE current_segmentation_;
459   // Variables to keep track of best/raw word choices.
460   VIABLE_CHOICE best_raw_choice_;
461   LIST raw_choices_;
462   LIST best_choices_;
463   // Hyphen-related variables.
464   UNICHAR_ID hyphen_unichar_id_;
465   WERD_CHOICE *hyphen_word_;
466   DawgInfoVector hyphen_active_dawgs_;
467   DawgInfoVector hyphen_constraints_;
468   bool last_word_on_line_;
469   // Dawgs.
470   DawgVector dawgs_;
471   SuccessorListsVector successors_;
472   Dawg *freq_dawg_;
473   Trie *pending_words_;
474   // The following pointers are only cached for convenience.
475   // The dawgs will be deleted when dawgs_ vector is destroyed.
476   // TODO(daria): need to support multiple languages in the future,
477   // so maybe will need to maintain a list of dawgs of each kind.
478   Trie *document_words_;
479 };
480 }  // namespace tesseract
481 
482 #endif  // THIRD_PARTY_TESSERACT_DICT_DICT_H_
483