1 /* -*-C-*- 2 ******************************************************************************** 3 * 4 * File: dawg.h (Formerly dawg.h) 5 * Description: Definition of a class that represents Directed Accyclic Word 6 * Graph (DAWG), functions to build and manipulate the DAWG. 7 * Author: Mark Seaman, SW Productivity 8 * Created: Fri Oct 16 14:37:00 1987 9 * Modified: Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt 10 * Language: C 11 * Package: N/A 12 * Status: Reusable Software Component 13 * 14 * (c) Copyright 1987, Hewlett-Packard Company. 15 ** Licensed under the Apache License, Version 2.0 (the "License"); 16 ** you may not use this file except in compliance with the License. 17 ** You may obtain a copy of the License at 18 ** http://www.apache.org/licenses/LICENSE-2.0 19 ** Unless required by applicable law or agreed to in writing, software 20 ** distributed under the License is distributed on an "AS IS" BASIS, 21 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 ** See the License for the specific language governing permissions and 23 ** limitations under the License. 24 * 25 *********************************************************************************/ 26 27 #ifndef DAWG_H 28 #define DAWG_H 29 30 /*---------------------------------------------------------------------- 31 I n c l u d e s 32 ----------------------------------------------------------------------*/ 33 34 #include "elst.h" 35 #include "general.h" 36 #include "ratngs.h" 37 #include "varable.h" 38 39 /*---------------------------------------------------------------------- 40 V a r i a b l e s 41 ----------------------------------------------------------------------*/ 42 43 extern INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info, to" 44 " 2 for more details, to 3 to see all the debug messages"); 45 46 #ifdef __MSW32__ 47 #define NO_EDGE (inT64) 0xffffffffffffffffi64 48 #else 49 #define NO_EDGE (inT64) 0xffffffffffffffffll 50 #endif 51 52 /*---------------------------------------------------------------------- 53 T y p e s 54 ----------------------------------------------------------------------*/ 55 class UNICHARSET; 56 57 typedef uinT64 EDGE_RECORD; 58 typedef EDGE_RECORD *EDGE_ARRAY; 59 typedef inT64 EDGE_REF; 60 typedef inT64 NODE_REF; 61 typedef EDGE_REF *NODE_MAP; 62 63 namespace tesseract { 64 65 struct NodeChild { 66 UNICHAR_ID unichar_id; 67 EDGE_REF edge_ref; NodeChildNodeChild68 NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {} NodeChildNodeChild69 NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {} 70 }; 71 72 typedef GenericVector<NodeChild> NodeChildVector; 73 typedef GenericVector<int> SuccessorList; 74 typedef GenericVector<SuccessorList *> SuccessorListsVector; 75 76 enum DawgType { 77 DAWG_TYPE_PUNCTUATION, 78 DAWG_TYPE_PREFIX, 79 DAWG_TYPE_ROOT, 80 DAWG_TYPE_WORD, 81 DAWG_TYPE_SUFFIX, 82 DAWG_TYPE_NUMBER, 83 84 DAWG_TYPE_COUNT // number of enum entries 85 }; 86 87 /*---------------------------------------------------------------------- 88 C o n s t a n t s 89 ----------------------------------------------------------------------*/ 90 #define FORWARD_EDGE (inT32) 0 91 #define BACKWARD_EDGE (inT32) 1 92 #define MAX_NODE_EDGES_DISPLAY (inT64) 100 93 #define LAST_FLAG (inT64) 1 94 #define DIRECTION_FLAG (inT64) 2 95 #define WERD_END_FLAG (inT64) 4 96 #define LETTER_START_BIT 0 97 #define NUM_FLAG_BITS 3 98 #define REFFORMAT "%lld" 99 100 // Set kBeginningDawgsType[i] to true if a Dawg of 101 // DawgType i can contain the beginning of a word. 102 static const bool kBeginningDawgsType[] = {1, 1, 0, 1, 0, 1 }; 103 104 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = { 105 { 0, 1, 0, 1, 0, 0 }, // for DAWG_TYPE_PUNCTUATION 106 { 0, 0, 1, 1, 0, 0 }, // for DAWG_TYPE_PREFIX 107 { 0, 0, 0, 0, 1, 0 }, // for DAWG_TYPE_ROOT 108 { 1, 0, 0, 0, 0, 0 }, // for DAWG_TYPE_WORD 109 { 1, 0, 0, 0, 0, 0 }, // for DAWG_TYPE_SUFFIX 110 { 0, 0, 0, 0, 0, 0 } // for DAWG_TYPE_NUMBER 111 }; 112 113 static const char kWildcard[] = "*"; 114 115 116 /*---------------------------------------------------------------------- 117 C l a s s e s a n d S t r u c t s 118 ----------------------------------------------------------------------*/ 119 // 120 // Abstract class (an interface) that declares methods needed by the 121 // various tesseract classes to operate on SquishedDawg and Trie objects. 122 // 123 // This class initializes all the edge masks (since their usage by 124 // SquishedDawg and Trie is identical) and implements simple accessors 125 // for each of the fields encoded in an EDGE_RECORD. 126 // This class also implements word_in_dawg() and check_for_words() 127 // (since they use only the public methods of SquishedDawg and Trie 128 // classes that are inherited from the Dawg base class). 129 // 130 class Dawg { 131 public: 132 // Magic number to determine endianness when reading the Dawg from file. 133 static const inT16 kDawgMagicNumber = 42; 134 // A special unichar id that indicates that any appropriate pattern 135 // (e.g.dicitonary word, 0-9 digit, etc) can be inserted instead 136 // Used for expressing patterns in punctuation and number Dawgs. 137 static const UNICHAR_ID kPatternUnicharID = 0; 138 type()139 inline DawgType type() const { return type_; } lang()140 inline const STRING &lang() const { return lang_; } permuter()141 inline PermuterType permuter() const { return perm_; } 142 ~Dawg()143 virtual ~Dawg() {}; 144 145 // Returns true if the given word is in the Dawg. 146 bool word_in_dawg(const WERD_CHOICE &word) const; 147 148 // Checks the Dawg for the words that are listed in the requested file. 149 // Returns the number of words in the given file missing from the Dawg. 150 int check_for_words(const char *filename, 151 const UNICHARSET &unicharset, 152 bool enable_wildcard) const; 153 154 // Pure virtual function that should be implemented by the derived classes. 155 156 // Returns the edge that corresponds to the letter out of this node. 157 virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, 158 bool word_end) const = 0; 159 160 // Fills the given NodeChildVector with all the unichar ids (and the 161 // corresponding EDGE_REFs) for which there is an edge out of this node. 162 virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0; 163 164 // Returns the next node visited by following the edge 165 // indicated by the given EDGE_REF. 166 virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0; 167 168 // Returns true if the edge indicated by the given EDGE_REF 169 // marks the end of a word. 170 virtual bool end_of_word(EDGE_REF edge_ref) const = 0; 171 172 // Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. 173 virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0; 174 175 // Prints the contents of the node indicated by the given NODE_REF. 176 // At most max_num_edges will be printed. 177 virtual void print_node(NODE_REF node, int max_num_edges) const = 0; 178 179 protected: Dawg()180 Dawg() {} 181 182 // Returns the next node visited by following this edge. next_node_from_edge_rec(const EDGE_RECORD & edge_rec)183 inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const { 184 return ((edge_rec & next_node_mask_) >> next_node_start_bit_); 185 } 186 // Returns the direction flag of this edge. direction_from_edge_rec(const EDGE_RECORD & edge_rec)187 inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const { 188 return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? 189 BACKWARD_EDGE : FORWARD_EDGE; 190 } 191 // Returns true if this edge marks the end of a word. end_of_word_from_edge_rec(const EDGE_RECORD & edge_rec)192 inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const { 193 return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0; 194 } 195 // Returns UNICHAR_ID recorded in this edge. unichar_id_from_edge_rec(const EDGE_RECORD & edge_rec)196 inline UNICHAR_ID unichar_id_from_edge_rec( 197 const EDGE_RECORD &edge_rec) const { 198 return ((edge_rec & letter_mask_) >> LETTER_START_BIT); 199 } 200 // Sets the next node link for this edge in the Dawg. set_next_node_in_edge_rec(EDGE_RECORD * edge_rec,EDGE_REF value)201 inline void set_next_node_in_edge_rec( 202 EDGE_RECORD *edge_rec, EDGE_REF value) { 203 *edge_rec &= (~next_node_mask_); 204 *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_); 205 } 206 // Sets this edge record to be the last one in a sequence of edges. set_last_flag_in_edge_rec(EDGE_RECORD * edge_rec)207 inline void set_last_flag_in_edge_rec(EDGE_RECORD *edge_rec) { 208 *edge_rec |= (LAST_FLAG << flag_start_bit_); 209 } 210 // Sequentially compares the given values of unichar ID, next node 211 // and word end marker with the values in the given EDGE_RECORD. 212 // Returns: 1 if at any step the given input value exceeds 213 // that of edge_rec (and all the values already 214 // checked are the same) 215 // 0 if edge_rec_match() returns true 216 // -1 otherwise given_greater_than_edge_rec(NODE_REF next_node,bool word_end,UNICHAR_ID unichar_id,const EDGE_RECORD & edge_rec)217 inline int given_greater_than_edge_rec(NODE_REF next_node, 218 bool word_end, 219 UNICHAR_ID unichar_id, 220 const EDGE_RECORD &edge_rec) const { 221 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec); 222 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec); 223 bool curr_word_end = end_of_word_from_edge_rec(edge_rec); 224 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node, 225 curr_word_end, curr_unichar_id)) return 0; 226 if (unichar_id > curr_unichar_id) return 1; 227 if (unichar_id == curr_unichar_id) { 228 if (next_node > curr_next_node) return 1; 229 if (next_node == curr_next_node) { 230 if (word_end > curr_word_end) return 1; 231 } 232 } 233 return -1; 234 } 235 // Returns true if all the values are equal (any value matches 236 // next_node if next_node == NO_EDGE, any value matches word_end 237 // if word_end is false). edge_rec_match(NODE_REF next_node,bool word_end,UNICHAR_ID unichar_id,NODE_REF other_next_node,bool other_word_end,UNICHAR_ID other_unichar_id)238 inline bool edge_rec_match(NODE_REF next_node, 239 bool word_end, 240 UNICHAR_ID unichar_id, 241 NODE_REF other_next_node, 242 bool other_word_end, 243 UNICHAR_ID other_unichar_id) const { 244 return ((unichar_id == other_unichar_id) && 245 (next_node == NO_EDGE || next_node == other_next_node) && 246 (!word_end || (word_end == other_word_end))); 247 } 248 249 // Sets type_, lang_, perm_, unicharset_size_. 250 // Initializes the values of various masks from unicharset_size_. 251 void init(DawgType type, const STRING &lang, 252 PermuterType perm, int unicharset_size); 253 254 // Matches all of the words that are represented by this string. 255 // If wilcard is set to something other than INVALID_UNICHAR_ID, 256 // the *'s in this string are interpreted as wildcards. 257 // WERD_CHOICE param is not passed by const so that wildcard searches 258 // can modify it and work without having to copy WERD_CHOICEs. 259 bool match_words(WERD_CHOICE *word, inT32 index, 260 NODE_REF node, UNICHAR_ID wildcard) const; 261 262 // Member Variables. 263 DawgType type_; 264 STRING lang_; 265 // Permuter code that should be used if the word is found in this Dawg. 266 PermuterType perm_; 267 // Variables to construct various edge masks. Formerly: 268 // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64 269 // #define FLAGS_MASK (inT64) 0x0000000700000000i64 270 // #define LETTER_MASK (inT64) 0x00000000ffffffffi64 271 int unicharset_size_; 272 int flag_start_bit_; 273 int next_node_start_bit_; 274 uinT64 next_node_mask_; 275 uinT64 flags_mask_; 276 uinT64 letter_mask_; 277 }; 278 279 // 280 // DawgInfo struct and DawgInfoVector class are used for 281 // storing information about the current Dawg search state. 282 // 283 struct DawgInfo { DawgInfoDawgInfo284 DawgInfo() : dawg_index(-1), ref(NO_EDGE) {} DawgInfoDawgInfo285 DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {} 286 bool operator==(const DawgInfo &other) { 287 return (this->dawg_index == other.dawg_index && 288 this->ref == other.ref); 289 } 290 int dawg_index; 291 EDGE_REF ref; 292 }; 293 class DawgInfoVector : public GenericVector<DawgInfo> { 294 public: 295 // Overload destructor, since clear() does not delete data_[] any more. ~DawgInfoVector()296 ~DawgInfoVector() { 297 if (size_reserved_ > 0) { 298 delete[] data_; 299 size_used_ = 0; 300 size_reserved_ = 0; 301 } 302 } 303 // Overload clear() in order to avoid allocating/deallocating memory 304 // when clearing the vector and re-inserting entries into it later. clear()305 void clear() { size_used_ = 0; } 306 // Adds an entry for the given dawg_index with the given node to the vec. 307 // Returns false if the same entry already exists in the vector, 308 // true otherwise. add_unique(const DawgInfo & new_info,const char * debug_msg)309 inline bool add_unique(const DawgInfo &new_info, const char *debug_msg) { 310 for (int i = 0; i < size_used_; ++i) { 311 if (data_[i] == new_info) return false; 312 } 313 push_back(new_info); 314 if (dawg_debug_level) { 315 tprintf("%s[%d, " REFFORMAT "]\n", debug_msg, 316 new_info.dawg_index, new_info.ref); 317 } 318 return true; 319 } 320 // Removes an entry that equals to the given DawgInfo. 321 // This function assumes that the entries in the vector are unique. 322 // Returns true if an entry was found and removed. remove(const DawgInfo & info)323 inline bool remove(const DawgInfo &info) { 324 for (int i = 0; i < size_used_; ++i) { 325 if (data_[i] == info) { 326 for (int j = i + 1; j < size_used_; ++j) { 327 data_[j-1] = data_[j]; 328 } 329 size_used_--; 330 return true; 331 } 332 } 333 return false; 334 } 335 }; 336 337 // 338 // Concrete class that can operate on a compacted (squished) Dawg (read, 339 // search and write to file). This class is read-only in the sense that 340 // new words can not be added to an instance of SquishedDawg. 341 // The underlying representation of the nodes and edges in SquishedDawg 342 // is stored as a contiguous EDGE_ARRAY (read from file or given as an 343 // argument to the constructor). 344 // 345 class SquishedDawg : public Dawg { 346 public: SquishedDawg(FILE * file,DawgType type,const STRING & lang,PermuterType perm)347 SquishedDawg(FILE *file, DawgType type, 348 const STRING &lang, PermuterType perm) { 349 read_squished_dawg(file, type, lang, perm); 350 num_forward_edges_in_node0 = num_forward_edges(0); 351 } SquishedDawg(const char * filename,DawgType type,const STRING & lang,PermuterType perm)352 SquishedDawg(const char* filename, DawgType type, 353 const STRING &lang, PermuterType perm) { 354 FILE *file = fopen(filename, "rb"); 355 if (file == NULL) { 356 tprintf("Failed to open dawg file %s\n", filename); 357 exit(1); 358 } 359 read_squished_dawg(file, type, lang, perm); 360 num_forward_edges_in_node0 = num_forward_edges(0); 361 fclose(file); 362 } SquishedDawg(EDGE_ARRAY edges,int num_edges,DawgType type,const STRING & lang,PermuterType perm,int unicharset_size)363 SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, 364 const STRING &lang, PermuterType perm, int unicharset_size) : 365 edges_(edges), num_edges_(num_edges) { 366 init(type, lang, perm, unicharset_size); 367 num_forward_edges_in_node0 = num_forward_edges(0); 368 if (dawg_debug_level > 3) print_all("SquishedDawg:"); 369 } 370 ~SquishedDawg(); 371 372 // Returns the edge that corresponds to the letter out of this node. 373 EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, 374 bool word_end) const; 375 376 // Fills the given NodeChildVector with all the unichar ids (and the 377 // corresponding EDGE_REFs) for which there is an edge out of this node. unichar_ids_of(NODE_REF node,NodeChildVector * vec)378 void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const { 379 EDGE_REF edge = node; 380 if (!edge_occupied(edge) || edge == NO_EDGE) return; 381 assert(forward_edge(edge)); // we don't expect any backward edges to 382 do { // be present when this funciton is called 383 vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge)); 384 } while (!last_edge(edge++)); 385 } 386 387 // Returns the next node visited by following the edge 388 // indicated by the given EDGE_REF. next_node(EDGE_REF edge)389 NODE_REF next_node(EDGE_REF edge) const { 390 return next_node_from_edge_rec((edges_[edge])); 391 } 392 393 // Returns true if the edge indicated by the given EDGE_REF 394 // marks the end of a word. end_of_word(EDGE_REF edge_ref)395 bool end_of_word(EDGE_REF edge_ref) const { 396 return end_of_word_from_edge_rec((edges_[edge_ref])); 397 } 398 399 // Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. edge_letter(EDGE_REF edge_ref)400 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const { 401 return unichar_id_from_edge_rec((edges_[edge_ref])); 402 } 403 404 // Prints the contents of the node indicated by the given NODE_REF. 405 // At most max_num_edges will be printed. 406 void print_node(NODE_REF node, int max_num_edges) const; 407 408 // Writes the squished/reduced Dawg to a file. 409 void write_squished_dawg(const char *filename); 410 411 private: 412 // Sets the next node link for this edge. set_next_node(EDGE_REF edge_ref,EDGE_REF value)413 inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) { 414 set_next_node_in_edge_rec(&(edges_[edge_ref]), value); 415 } 416 // Sets the edge to be empty. set_empty_edge(EDGE_REF edge_ref)417 inline void set_empty_edge(EDGE_REF edge_ref) { 418 (edges_[edge_ref] = next_node_mask_); 419 } 420 // Goes through all the edges and clears each one out. clear_all_edges()421 inline void clear_all_edges() { 422 for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge); 423 } 424 // Clears the last flag of this edge. clear_last_flag(EDGE_REF edge_ref)425 inline void clear_last_flag(EDGE_REF edge_ref) { 426 (edges_[edge_ref] &= ~(LAST_FLAG << flag_start_bit_)); 427 } 428 // Returns true if this edge is in the forward direction. forward_edge(EDGE_REF edge_ref)429 inline bool forward_edge(EDGE_REF edge_ref) const { 430 return (edge_occupied(edge_ref) && 431 (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); 432 } 433 // Returns true if this edge is in the backward direction. backward_edge(EDGE_REF edge_ref)434 inline bool backward_edge(EDGE_REF edge_ref) const { 435 return (edge_occupied(edge_ref) && 436 (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); 437 } 438 // Returns true if the edge spot in this location is occupied. edge_occupied(EDGE_REF edge_ref)439 inline bool edge_occupied(EDGE_REF edge_ref) const { 440 return (edges_[edge_ref] != next_node_mask_); 441 } 442 // Returns true if this edge is the last edge in a sequence. last_edge(EDGE_REF edge_ref)443 inline bool last_edge(EDGE_REF edge_ref) const { 444 return (edges_[edge_ref] & (LAST_FLAG << flag_start_bit_)) != 0; 445 } 446 447 // Counts and returns the number of forward edges in this node. 448 inT32 num_forward_edges(NODE_REF node) const; 449 450 // Reads SquishedDawg from a file. 451 void read_squished_dawg(FILE *file, DawgType type, 452 const STRING &lang, PermuterType perm); 453 454 // Prints the contents of an edge indicated by the given EDGE_REF. 455 void print_edge(EDGE_REF edge) const; 456 457 // Prints the contents of the SquishedDawg. print_all(const char * msg)458 void print_all(const char* msg) { 459 tprintf("\n__________________________\n%s\n", msg); 460 for (int i = 0; i < num_edges_; ++i) print_edge(i); 461 tprintf("__________________________\n"); 462 } 463 // Constructs a mapping from the memory node indices to disk node indices. 464 NODE_MAP build_node_map(inT32 *num_nodes) const; 465 466 467 // Member variables. 468 EDGE_ARRAY edges_; 469 int num_edges_; 470 int num_forward_edges_in_node0; 471 }; 472 } // namespace tesseract 473 474 #endif 475