1 /////////////////////////////////////////////////////////////////////// 2 // File: dict.h 3 // Description: dict class. 4 // Author: Samuel Charron 5 // 6 // (C) Copyright 2006, Google Inc. 7 // Licensed under the Apache License, Version 2.0 (the "License"); 8 // you may not use this file except in compliance with the License. 9 // You may obtain a copy of the License at 10 // http://www.apache.org/licenses/LICENSE-2.0 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 // 17 /////////////////////////////////////////////////////////////////////// 18 19 #ifndef TESSERACT_DICT_DICT_H_ 20 #define TESSERACT_DICT_DICT_H_ 21 22 #include "ambigs.h" 23 #include "choices.h" 24 #include "choicearr.h" 25 #include "dawg.h" 26 #include "image.h" 27 #include "ratngs.h" 28 #include "stopper.h" 29 #include "trie.h" 30 #include "unicharset.h" 31 32 extern STRING_VAR_H(global_user_words_suffix, "user-words", 33 "A list of user-provided words."); 34 extern INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words."); 35 36 #define MAX_WERD_LENGTH (inT64) 40 37 #define NO_RATING -1 38 #define FREQ_WERD 1.0 39 #define GOOD_WERD 1.1 40 #define OK_WERD 1.3125 41 42 // Struct used to hold temporary information about fragments. 43 struct CHAR_FRAGMENT_INFO { 44 UNICHAR_ID unichar_id; 45 const CHAR_FRAGMENT *fragment; 46 int num_fragments; 47 float rating; 48 float certainty; 49 }; 50 51 namespace tesseract { 52 53 typedef GenericVector<Dawg *> DawgVector; 54 55 struct DawgArgs { DawgArgsDawgArgs56 DawgArgs(DawgInfoVector *d, DawgInfoVector *c, 57 DawgInfoVector *ud, DawgInfoVector *uc, float r) : 58 active_dawgs(d), constraints(c), updated_active_dawgs(ud), 59 updated_constraints(uc), rating_margin(r) { 60 for (int i = 0; i < MAX_WERD_LENGTH; ++i) { 61 rating_array[i] = NO_RATING; 62 } 63 permuter = NO_PERM; 64 } 65 DawgInfoVector *active_dawgs; 66 DawgInfoVector *constraints; 67 DawgInfoVector *updated_active_dawgs; 68 DawgInfoVector *updated_constraints; 69 PermuterType permuter; 70 float rating_margin; // prunning margin ratio 71 float rating_array[MAX_WERD_LENGTH]; 72 }; 73 74 class Dict { 75 public: 76 Dict(Image* image_ptr); 77 ~Dict(); getImage()78 Image* getImage() { 79 return image_ptr_; 80 } getUnicharset()81 UNICHARSET& getUnicharset() { 82 return getImage()->getCCUtil()->unicharset; 83 } getUnicharAmbigs()84 const UnicharAmbigs &getUnicharAmbigs() { 85 return getImage()->getCCUtil()->unichar_ambigs; 86 } 87 88 /* hyphen.cpp ************************************************************/ 89 90 // Returns true if we've recorded the beginning of a hyphenated word. hyphenated()91 inline bool hyphenated() { return !last_word_on_line_ && hyphen_word_; } 92 // Size of the base word (the part on the line before) of a hyphenated word. hyphen_base_size()93 inline int hyphen_base_size() { 94 return this->hyphenated() ? hyphen_word_->length() : 0; 95 } 96 // If this word is hyphenated copy the base word (the part on 97 // the line before) of a hyphenated word into the given word. 98 // This function assumes that word is not NULL. copy_hyphen_info(WERD_CHOICE * word)99 inline void copy_hyphen_info(WERD_CHOICE *word) { 100 if (this->hyphenated()) { 101 *word = *hyphen_word_; 102 if (hyphen_debug_level) word->print("copy_hyphen_info: "); 103 } 104 } 105 // Erase the unichar ids corresponding to the portion of the word 106 // from the previous line. The word is not changed if it is not 107 // split between lines and hyphenated. remove_hyphen_head(WERD_CHOICE * word)108 inline void remove_hyphen_head(WERD_CHOICE *word) { 109 if (this->hyphenated()) { 110 word->remove_unichar_ids(0, hyphen_word_->length()); 111 if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: "); 112 } 113 } 114 // Check whether the word has a hyphen at the end. has_hyphen_end(const WERD_CHOICE & word)115 inline bool has_hyphen_end(const WERD_CHOICE &word) { 116 int word_index = word.length() - 1; 117 return (last_word_on_line_ && word_index > 0 && 118 word.unichar_id(word_index) == hyphen_unichar_id_); 119 } 120 // Unless the previous word was the last one on the line, and the current 121 // one is not (thus it is the first one on the line), erase hyphen_word_, 122 // clear hyphen_active_dawgs_, hyphen_constraints_ update last_word_on_line_. 123 void reset_hyphen_vars(bool last_word_on_line); 124 // Update hyphen_word_, and copy the given DawgInfoVectors into 125 // hyphen_active_dawgs_ and hyphen_constraints_. 126 void set_hyphen_word(const WERD_CHOICE &word, 127 const DawgInfoVector &active_dawgs, 128 const DawgInfoVector &constraints); 129 130 /* permdawg.cpp ************************************************************/ 131 // If new_rating < best_choice->rating(), copy word int best_choice 132 // and update rating and permuter of best_choice to the new given values. update_best_choice(const WERD_CHOICE & word,WERD_CHOICE * best_choice)133 inline void update_best_choice( 134 const WERD_CHOICE &word, WERD_CHOICE *best_choice) { 135 if (word.rating() < best_choice->rating()) { 136 *best_choice = word; 137 } 138 } 139 // Fill the given active_dawgs vector with dawgs that could contain the 140 // beginning of the word. If hyphenated() returns true, copy the entries 141 // from hyphen_active_dawgs_ instead. 142 void init_active_dawgs(DawgInfoVector *active_dawgs); 143 // If hyphenated() returns true, copy the entries from hyphen_constraints_ 144 // into the given constraints vector. 145 void init_constraints(DawgInfoVector *constraints); 146 // Recursively explore all the possible character combinations in 147 // the given char_choices. Use go_deeper_dawg_fxn() to explore all the 148 // dawgs in the dawgs_ vector in parallel and discard invalid words. 149 // 150 // Allocate and return a WERD_CHOICE with the best valid word found. 151 WERD_CHOICE *dawg_permute_and_select( 152 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit); 153 void adjust_word(WERD_CHOICE *best_choice, 154 float *certainty_array); 155 // If the choice being composed so far could be a dictionary word 156 // and we have not reached the end of the word keep exploring the 157 // char_choices further. 158 // Also: 159 // -- set hyphen word if needed 160 // -- if word_ending is true and word is better than best_choice 161 // copy word to best_choice log new word choice 162 void go_deeper_dawg_fxn( 163 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, 164 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, 165 bool word_ending, WERD_CHOICE *word, float certainties[], 166 float *limit, WERD_CHOICE *best_choice, void *void_more_args); 167 168 /* permute.cpp *************************************************************/ 169 void add_document_word(const WERD_CHOICE &best_choice); 170 void init_permute(); 171 WERD_CHOICE *permute_top_choice( 172 const BLOB_CHOICE_LIST_VECTOR &char_choices, 173 float* rating_limit, 174 WERD_CHOICE *raw_choice, 175 BOOL8 *any_alpha); 176 const char* choose_il1(const char *first_char, //first choice 177 const char *second_char, //second choice 178 const char *third_char, //third choice 179 const char *prev_char, //prev in word 180 const char *next_char, //next in word 181 const char *next_next_char); //after next next in word valid_word(const WERD_CHOICE & word)182 int valid_word(const WERD_CHOICE &word) { 183 return valid_word(word, false); // return NO_PERM for words with digits 184 } valid_word_or_number(const WERD_CHOICE & word)185 int valid_word_or_number(const WERD_CHOICE &word) { 186 return valid_word(word, true); // return NUMBER_PERM for valid numbers 187 } 188 int valid_word(const WERD_CHOICE &word, bool numbers_ok); 189 bool valid_punctuation(const WERD_CHOICE &word); 190 WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices, 191 float rating_limit, 192 WERD_CHOICE *raw_choice); 193 void end_permute(); 194 void adjust_non_word(WERD_CHOICE *word, float *adjust_factor); 195 void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, 196 float rating_limit, 197 int start, 198 int end, 199 WERD_CHOICE *current_word); 200 void permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, 201 float limit, 202 WERD_CHOICE *best_choice, 203 WERD_CHOICE *raw_choice); 204 WERD_CHOICE *permute_compound_words( 205 const BLOB_CHOICE_LIST_VECTOR &char_choices, 206 float rating_limit); 207 // checks if the dominant word script, if there is one, is same as target. 208 bool word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices, 209 int target_script_id); 210 // Incoporate segmentation cost into word rating 211 void incorporate_segcost(WERD_CHOICE* word); 212 // checks for script-consistent permutations 213 WERD_CHOICE *permute_script_words( 214 const BLOB_CHOICE_LIST_VECTOR &char_choices); 215 216 WERD_CHOICE *top_fragments_permute_and_select( 217 const BLOB_CHOICE_LIST_VECTOR &char_choices, 218 float rating_limit); 219 // If the choice being composed so far could be better 220 // than best_choice keep exploring char_choices. 221 // If we have reached the end of the word and word is better than 222 // best_choice, copy word to best_choice and log a new word choice. 223 void go_deeper_top_fragments_fxn( 224 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, 225 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, 226 bool word_ending, WERD_CHOICE *word, float certainties[], 227 float *limit, WERD_CHOICE *best_choice, void *more_args); 228 229 // Semi-generic functions used by multiple permuters. 230 bool fragment_state_okay(UNICHAR_ID curr_unichar_id, 231 float curr_rating, float curr_certainty, 232 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 233 const char *debug, int word_ending, 234 CHAR_FRAGMENT_INFO *char_frag_info); 235 void permute_choices( 236 const char *debug, 237 const BLOB_CHOICE_LIST_VECTOR &char_choices, 238 int char_choice_index, 239 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 240 WERD_CHOICE *word, 241 float certainties[], 242 float *limit, 243 WERD_CHOICE *best_choice, 244 void *more_args); 245 246 void append_choices( 247 const char *debug, 248 const BLOB_CHOICE_LIST_VECTOR &char_choices, 249 const BLOB_CHOICE &blob_choice, 250 int char_choice_index, 251 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 252 WERD_CHOICE *word, 253 float certainties[], 254 float *limit, 255 WERD_CHOICE *best_choice, 256 void *more_args); 257 // Pointer to go_deeper function that will be modified by various permuters. 258 void (Dict::*go_deeper_fxn_)(const char *debug, 259 const BLOB_CHOICE_LIST_VECTOR &char_choices, 260 int char_choice_index, 261 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 262 bool word_ending, WERD_CHOICE *word, 263 float certainties[], float *limit, 264 WERD_CHOICE *best_choice, void *void_more_args); 265 /* stopper.cpp *************************************************************/ 266 int NoDangerousAmbig(WERD_CHOICE *BestChoice, 267 DANGERR *fixpt, 268 bool fix_replaceable, 269 BLOB_CHOICE_LIST_VECTOR *Choices, 270 bool *modified_blobs); 271 void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, 272 UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, 273 BLOB_CHOICE_LIST_VECTOR *blob_choices, 274 bool *modified_blobs); 275 DisableChoiceAccum()276 inline void DisableChoiceAccum() { keep_word_choices_ = FALSE; } EnableChoiceAccum()277 inline void EnableChoiceAccum() { keep_word_choices_ = TRUE; } 278 279 int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice); 280 VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice, 281 FLOAT32 AdjustFactor, 282 const float Certainties[]); 283 void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice); 284 int StringSameAs(const char *String, 285 const char *String_lengths, 286 VIABLE_CHOICE ViableChoice); 287 bool StringSameAs(const WERD_CHOICE &WordChoice, 288 VIABLE_CHOICE ViableChoice); 289 int AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, 290 WERD_CHOICE *BestChoice, 291 const WERD_CHOICE &RawChoice, 292 DANGERR *fixpt, 293 ACCEPTABLE_CHOICE_CALLER caller, 294 bool *modified_blobs); 295 int AcceptableResult(const WERD_CHOICE &BestChoice, 296 const WERD_CHOICE &RawChoice); 297 int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice); 298 void LogNewChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, 299 const float Certainties[], bool raw_choice); 300 void EndDangerousAmbigs(); 301 int CurrentBestChoiceIs(const WERD_CHOICE &WordChoice); 302 FLOAT32 CurrentBestChoiceAdjustFactor(); 303 int CurrentWordAmbig(); 304 void DebugWordChoices(); 305 void PrintAmbigAlternatives(FILE *file, const char *label, 306 int label_num_unichars); 307 void FillViableChoice(const WERD_CHOICE &WordChoice, 308 FLOAT32 AdjustFactor, const float Certainties[], 309 bool SameString, VIABLE_CHOICE ViableChoice); 310 int AlternativeChoicesWorseThan(FLOAT32 Threshold); 311 void FilterWordChoices(); 312 void FindClassifierErrors(FLOAT32 MinRating, 313 FLOAT32 MaxRating, 314 FLOAT32 RatingMargin, 315 FLOAT32 Thresholds[]); 316 void InitChoiceAccum(); 317 void LogNewSegmentation(PIECES_STATE BlobWidth); 318 void LogNewSplit(int Blob); 319 void SettupStopperPass1(); 320 void SettupStopperPass2(); 321 /* choices.cpp *************************************************************/ 322 void print_word_string(const char* str); 323 void print_word_choice(const char *label, A_CHOICE* choice); 324 void print_choices(const char *label, 325 CHOICES rating); // List of (A_CHOICE*). 326 /* permngram.cpp ***********************************************************/ 327 A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices, 328 float rating_limit, 329 const Dawg *dawg); 330 /* dawg.cpp ****************************************************************/ 331 332 // Returns the maximal permuter code (from ccstruct/ratngs.h) if in light 333 // of the current state the letter at word_index in the given word 334 // is allowed according to at least one of the dawgs in dawgs_, 335 // otherwise returns NO_PERM. 336 // 337 // The state is described by void_dawg_args, which are interpreted as 338 // DawgArgs and contain two relevant input vectors: active_dawgs and 339 // constraints. Each entry in the active_dawgs vector contains an index 340 // into the dawgs_ vector and an EDGE_REF that indicates the last edge 341 // followed in the dawg. Each entry in the constraints vector contains 342 // an index into the dawgs_ vector and an EDGE_REF that indicates an edge 343 // in a pattern dawg followed to match a pattern. Currently constraints 344 // are used to save the state of punctuation dawgs after leading 345 // punctuation was found. 346 // 347 // Input: 348 // At word_index 0 dawg_args->active_dawgs should contain an entry for each 349 // dawg whose type has a bit set in kBeginningDawgsType, 350 // dawg_args->constraints should be empty. EDGE_REFs in active_dawgs and 351 // constraints vectors should be initialized to NO_EDGE. If hyphen state 352 // needs to be applied, initial dawg_args->active_dawgs and 353 // dawg_args->constrains can be copied from the saved hyphen state 354 // (maintained by Dict). 355 // For word_index > 0 the corresponding state (active_dawgs and constraints) 356 // can be obtained from dawg_args->updated_* passed to def_letter_is_okay 357 // for word_index-1. 358 // Note: the function assumes that active_dags, constraints and updated_* 359 // member variables of dawg_args are not NULL. 360 // 361 // Output: 362 // The function fills in dawg_args->updated_active_dawgs vector with the 363 // entries for dawgs that contain the word up to the letter at word_index. 364 // The new constraints (if any) are added to dawg_args->updated_constraints, 365 // the constraints from dawg_args->constraints are also copied into it. 366 // 367 // Detailed description: 368 // In order to determine whether the word is still valid after considering 369 // all the letters up to the one at word_index the following is done for 370 // each entry in dawg_args->active_dawgs: 371 // 372 // -- next starting node is obtained from entry.ref and edge_char_of() is 373 // called to obtain the next edge 374 // -- if a valid edge is found, the function returns the updated permuter 375 // code true and an entry [entry.dawg_index, edge] is inserted in 376 // dawg_args->updated_active_dawgs 377 // otherwise: 378 // -- if we are dealing with dawg of type DAWG_TYPE_PUNCTUATION, 379 // edge_char_of() is called again, but now with kPatternUnicharID 380 // as unichar_id; if a valid edge is found it is recorded in 381 // dawg_args->updated_constraints 382 // -- the function checks whether the word can end with the previous 383 // letter 384 // -- each successor of the dawg (e.g. dawgs with type DAWG_TYPE_WORD 385 // could be successors to dawgs with type DAWG_TYPE_PUNCTUATION; the 386 // successors are defined by successors_ vector) is explored and if 387 // a letter is found in the successor dawg, a new entry is inserted 388 // into dawg_args->updated_active_dawgs with EDGE_REF being either 389 // NO_EDGE or an EDGE_REF recorded in constraints vector for the 390 // corresponding dawg index 391 392 // 393 int def_letter_is_okay(void* void_dawg_args, int word_index, 394 const void* word, bool word_end); 395 396 int new_letter_is_okay(void* void_dawg_args, int word_index, 397 const void* word, bool word_end); 398 int (Dict::*letter_is_okay_)(void* void_dawg_args, int word_index, 399 const void *word, bool word_end); 400 // Return the number of dawgs in the dawgs_ vector. NumDawgs()401 inline const int NumDawgs() const { return dawgs_.size(); } 402 // Return i-th dawg pointer recorded in the dawgs_ vector. GetDawg(int index)403 inline const Dawg *GetDawg(int index) const { return dawgs_[index]; } 404 // At word ending make sure all the recorded constraints are satisfied. 405 // Each constraint signifies that we found a beginning pattern in a 406 // pattern dawg. Check that this pattern can end here (e.g. if some 407 // leading punctuation is found this would ensure that we are not 408 // expecting any particular trailing punctuation after the word). ConstraintsOk(const DawgInfoVector & constraints,int word_end,DawgType current_dawg_type)409 inline bool ConstraintsOk(const DawgInfoVector &constraints, 410 int word_end, DawgType current_dawg_type) { 411 if (!word_end) return true; 412 if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true; 413 for (int c = 0; c < constraints.length(); ++c) { 414 const DawgInfo &cinfo = constraints[c]; 415 Dawg *cdawg = dawgs_[cinfo.dawg_index]; 416 if (!cdawg->end_of_word(cinfo.ref)) { 417 if (dawg_debug_level >= 3) { 418 tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n", 419 cinfo.dawg_index, cinfo.ref); 420 } 421 return false; 422 } 423 } 424 return true; 425 } 426 // Record the maximum of the two permuters in permuter. UpdatePermuter(PermuterType new_permuter,PermuterType * permuter)427 static inline void UpdatePermuter(PermuterType new_permuter, 428 PermuterType *permuter) { 429 if (dawg_debug_level >= 3) tprintf("Letter found\n"); 430 if (new_permuter > *permuter) *permuter = new_permuter; 431 } 432 433 /* conversion.cpp **********************************************************/ 434 // TODO(daria): remove these function when conversion.cpp is deprecated 435 // and all the code is converted to work with unichar ids. 436 void LogNewWordChoice(A_CHOICE *a_choice, 437 FLOAT32 adjust_factor, 438 const float certainties[], 439 const UNICHARSET &unicharset); 440 int valid_word(const char *string); 441 442 private: 443 // Private member variables. 444 Image* image_ptr_; 445 // Table that stores ambiguities computed during training 446 // (loaded when NoDangerousAmbigs() is called for the first time). 447 // Each entry i in the table stores a set of amibiguities whose 448 // wrong ngram starts with unichar id i. 449 UnicharAmbigs *dang_ambigs_table_; 450 // Same as above, but for ambiguities with replace flag set. 451 UnicharAmbigs *replace_ambigs_table_; 452 // Flag used to disable accumulation of word choices 453 // during compound word permutation. 454 BOOL8 keep_word_choices_; 455 // Additional certainty padding allowed before a word is rejected. 456 FLOAT32 reject_offset_; 457 // Current word segmentation. 458 PIECES_STATE current_segmentation_; 459 // Variables to keep track of best/raw word choices. 460 VIABLE_CHOICE best_raw_choice_; 461 LIST raw_choices_; 462 LIST best_choices_; 463 // Hyphen-related variables. 464 UNICHAR_ID hyphen_unichar_id_; 465 WERD_CHOICE *hyphen_word_; 466 DawgInfoVector hyphen_active_dawgs_; 467 DawgInfoVector hyphen_constraints_; 468 bool last_word_on_line_; 469 // Dawgs. 470 DawgVector dawgs_; 471 SuccessorListsVector successors_; 472 Dawg *freq_dawg_; 473 Trie *pending_words_; 474 // The following pointers are only cached for convenience. 475 // The dawgs will be deleted when dawgs_ vector is destroyed. 476 // TODO(daria): need to support multiple languages in the future, 477 // so maybe will need to maintain a list of dawgs of each kind. 478 Trie *document_words_; 479 }; 480 } // namespace tesseract 481 482 #endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_ 483