• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File:        dawg.h  (Formerly dawg.h)
5  * Description:  Definition of a class that represents Directed Accyclic Word
6  *               Graph (DAWG), functions to build and manipulate the DAWG.
7  * Author:       Mark Seaman, SW Productivity
8  * Created:      Fri Oct 16 14:37:00 1987
9  * Modified:     Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt
10  * Language:     C
11  * Package:      N/A
12  * Status:       Reusable Software Component
13  *
14  * (c) Copyright 1987, Hewlett-Packard Company.
15  ** Licensed under the Apache License, Version 2.0 (the "License");
16  ** you may not use this file except in compliance with the License.
17  ** You may obtain a copy of the License at
18  ** http://www.apache.org/licenses/LICENSE-2.0
19  ** Unless required by applicable law or agreed to in writing, software
20  ** distributed under the License is distributed on an "AS IS" BASIS,
21  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  ** See the License for the specific language governing permissions and
23  ** limitations under the License.
24  *
25  *********************************************************************************/
26 
27 #ifndef DAWG_H
28 #define DAWG_H
29 
30 /*----------------------------------------------------------------------
31               I n c l u d e s
32 ----------------------------------------------------------------------*/
33 
34 #include "elst.h"
35 #include "general.h"
36 #include "ratngs.h"
37 #include "varable.h"
38 
39 /*----------------------------------------------------------------------
40               V a r i a b l e s
41 ----------------------------------------------------------------------*/
42 
43 extern INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info, to"
44                  " 2 for more details, to 3 to see all the debug messages");
45 
46 #ifdef __MSW32__
47 #define NO_EDGE                (inT64) 0xffffffffffffffffi64
48 #else
49 #define NO_EDGE                (inT64) 0xffffffffffffffffll
50 #endif
51 
52 /*----------------------------------------------------------------------
53               T y p e s
54 ----------------------------------------------------------------------*/
55 class UNICHARSET;
56 
57 typedef uinT64 EDGE_RECORD;
58 typedef EDGE_RECORD *EDGE_ARRAY;
59 typedef inT64 EDGE_REF;
60 typedef inT64 NODE_REF;
61 typedef EDGE_REF *NODE_MAP;
62 
63 namespace tesseract {
64 
65 struct NodeChild {
66   UNICHAR_ID unichar_id;
67   EDGE_REF edge_ref;
NodeChildNodeChild68   NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
NodeChildNodeChild69   NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
70 };
71 
72 typedef GenericVector<NodeChild> NodeChildVector;
73 typedef GenericVector<int> SuccessorList;
74 typedef GenericVector<SuccessorList *> SuccessorListsVector;
75 
76 enum DawgType {
77   DAWG_TYPE_PUNCTUATION,
78   DAWG_TYPE_PREFIX,
79   DAWG_TYPE_ROOT,
80   DAWG_TYPE_WORD,
81   DAWG_TYPE_SUFFIX,
82   DAWG_TYPE_NUMBER,
83 
84   DAWG_TYPE_COUNT  // number of enum entries
85 };
86 
87 /*----------------------------------------------------------------------
88               C o n s t a n t s
89 ----------------------------------------------------------------------*/
90 #define FORWARD_EDGE           (inT32) 0
91 #define BACKWARD_EDGE          (inT32) 1
92 #define MAX_NODE_EDGES_DISPLAY (inT64) 100
93 #define LAST_FLAG              (inT64) 1
94 #define DIRECTION_FLAG         (inT64) 2
95 #define WERD_END_FLAG          (inT64) 4
96 #define LETTER_START_BIT       0
97 #define NUM_FLAG_BITS          3
98 #define REFFORMAT "%lld"
99 
100 // Set kBeginningDawgsType[i] to true if a Dawg of
101 // DawgType i can contain the beginning of a word.
102 static const bool kBeginningDawgsType[] = {1, 1, 0, 1, 0, 1 };
103 
104 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
105   { 0, 1, 0, 1, 0, 0 },  // for DAWG_TYPE_PUNCTUATION
106   { 0, 0, 1, 1, 0, 0 },  // for DAWG_TYPE_PREFIX
107   { 0, 0, 0, 0, 1, 0 },  // for DAWG_TYPE_ROOT
108   { 1, 0, 0, 0, 0, 0 },  // for DAWG_TYPE_WORD
109   { 1, 0, 0, 0, 0, 0 },  // for DAWG_TYPE_SUFFIX
110   { 0, 0, 0, 0, 0, 0 }   // for DAWG_TYPE_NUMBER
111 };
112 
113 static const char kWildcard[] = "*";
114 
115 
116 /*----------------------------------------------------------------------
117               C l a s s e s   a n d   S t r u c t s
118 ----------------------------------------------------------------------*/
119 //
120 // Abstract class (an interface) that declares methods needed by the
121 // various tesseract classes to operate on SquishedDawg and Trie objects.
122 //
123 // This class initializes all the edge masks (since their usage by
124 // SquishedDawg and Trie is identical) and implements simple accessors
125 // for each of the fields encoded in an EDGE_RECORD.
126 // This class also implements word_in_dawg() and check_for_words()
127 // (since they use only the public methods of SquishedDawg and Trie
128 // classes that are inherited from the Dawg base class).
129 //
130 class Dawg {
131  public:
132   // Magic number to determine endianness when reading the Dawg from file.
133   static const inT16 kDawgMagicNumber = 42;
134   // A special unichar id that indicates that any appropriate pattern
135   // (e.g.dicitonary word, 0-9 digit, etc) can be inserted instead
136   // Used for expressing patterns in punctuation and number Dawgs.
137   static const UNICHAR_ID kPatternUnicharID = 0;
138 
type()139   inline DawgType type() const { return type_; }
lang()140   inline const STRING &lang() const { return lang_; }
permuter()141   inline PermuterType permuter() const { return perm_; }
142 
~Dawg()143   virtual ~Dawg() {};
144 
145   // Returns true if the given word is in the Dawg.
146   bool word_in_dawg(const WERD_CHOICE &word) const;
147 
148   // Checks the Dawg for the words that are listed in the requested file.
149   // Returns the number of words in the given file missing from the Dawg.
150   int check_for_words(const char *filename,
151                       const UNICHARSET &unicharset,
152                       bool enable_wildcard) const;
153 
154   // Pure virtual function that should be implemented by the derived classes.
155 
156   // Returns the edge that corresponds to the letter out of this node.
157   virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
158                                 bool word_end) const = 0;
159 
160   // Fills the given NodeChildVector with all the unichar ids (and the
161   // corresponding EDGE_REFs) for which there is an edge out of this node.
162   virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0;
163 
164   // Returns the next node visited by following the edge
165   // indicated by the given EDGE_REF.
166   virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
167 
168   // Returns true if the edge indicated by the given EDGE_REF
169   // marks the end of a word.
170   virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
171 
172   // Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
173   virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
174 
175   // Prints the contents of the node indicated by the given NODE_REF.
176   // At most max_num_edges will be printed.
177   virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
178 
179  protected:
Dawg()180   Dawg() {}
181 
182   // Returns the next node visited by following this edge.
next_node_from_edge_rec(const EDGE_RECORD & edge_rec)183   inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
184     return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
185   }
186   // Returns the direction flag of this edge.
direction_from_edge_rec(const EDGE_RECORD & edge_rec)187   inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
188     return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
189       BACKWARD_EDGE : FORWARD_EDGE;
190   }
191   // Returns true if this edge marks the end of a word.
end_of_word_from_edge_rec(const EDGE_RECORD & edge_rec)192   inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
193     return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
194   }
195   // Returns UNICHAR_ID recorded in this edge.
unichar_id_from_edge_rec(const EDGE_RECORD & edge_rec)196   inline UNICHAR_ID unichar_id_from_edge_rec(
197       const EDGE_RECORD &edge_rec) const {
198     return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
199   }
200   // Sets the next node link for this edge in the Dawg.
set_next_node_in_edge_rec(EDGE_RECORD * edge_rec,EDGE_REF value)201   inline void set_next_node_in_edge_rec(
202       EDGE_RECORD *edge_rec, EDGE_REF value) {
203     *edge_rec &= (~next_node_mask_);
204     *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
205   }
206   // Sets this edge record to be the last one in a sequence of edges.
set_last_flag_in_edge_rec(EDGE_RECORD * edge_rec)207   inline void set_last_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
208     *edge_rec |= (LAST_FLAG << flag_start_bit_);
209   }
210   // Sequentially compares the given values of unichar ID, next node
211   // and word end marker with the values in the given EDGE_RECORD.
212   // Returns: 1 if at any step the given input value exceeds
213   //            that of edge_rec (and all the values already
214   //            checked are the same)
215   //          0 if edge_rec_match() returns true
216   //         -1 otherwise
given_greater_than_edge_rec(NODE_REF next_node,bool word_end,UNICHAR_ID unichar_id,const EDGE_RECORD & edge_rec)217   inline int given_greater_than_edge_rec(NODE_REF next_node,
218                                          bool word_end,
219                                          UNICHAR_ID unichar_id,
220                                          const EDGE_RECORD &edge_rec) const {
221     UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
222     NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
223     bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
224     if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
225                        curr_word_end, curr_unichar_id)) return 0;
226     if (unichar_id > curr_unichar_id) return 1;
227     if (unichar_id == curr_unichar_id) {
228       if (next_node > curr_next_node) return 1;
229       if (next_node == curr_next_node) {
230         if (word_end > curr_word_end) return 1;
231       }
232     }
233     return -1;
234   }
235   // Returns true if all the values are equal (any value matches
236   // next_node if next_node == NO_EDGE, any value matches word_end
237   // if word_end is false).
edge_rec_match(NODE_REF next_node,bool word_end,UNICHAR_ID unichar_id,NODE_REF other_next_node,bool other_word_end,UNICHAR_ID other_unichar_id)238   inline bool edge_rec_match(NODE_REF next_node,
239                              bool word_end,
240                              UNICHAR_ID unichar_id,
241                              NODE_REF other_next_node,
242                              bool other_word_end,
243                              UNICHAR_ID other_unichar_id) const {
244     return ((unichar_id == other_unichar_id) &&
245             (next_node == NO_EDGE || next_node == other_next_node) &&
246             (!word_end || (word_end == other_word_end)));
247   }
248 
249   // Sets type_, lang_, perm_, unicharset_size_.
250   // Initializes the values of various masks from unicharset_size_.
251   void init(DawgType type, const STRING &lang,
252             PermuterType perm, int unicharset_size);
253 
254   // Matches all of the words that are represented by this string.
255   // If wilcard is set to something other than INVALID_UNICHAR_ID,
256   // the *'s in this string are interpreted as wildcards.
257   // WERD_CHOICE param is not passed by const so that wildcard searches
258   // can modify it and work without having to copy WERD_CHOICEs.
259   bool match_words(WERD_CHOICE *word, inT32 index,
260                    NODE_REF node, UNICHAR_ID wildcard) const;
261 
262   // Member Variables.
263   DawgType type_;
264   STRING lang_;
265   // Permuter code that should be used if the word is found in this Dawg.
266   PermuterType perm_;
267   // Variables to construct various edge masks. Formerly:
268   // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64
269   // #define FLAGS_MASK     (inT64) 0x0000000700000000i64
270   // #define LETTER_MASK    (inT64) 0x00000000ffffffffi64
271   int unicharset_size_;
272   int flag_start_bit_;
273   int next_node_start_bit_;
274   uinT64 next_node_mask_;
275   uinT64 flags_mask_;
276   uinT64 letter_mask_;
277 };
278 
279 //
280 // DawgInfo struct and DawgInfoVector class are used for
281 // storing information about the current Dawg search state.
282 //
283 struct DawgInfo {
DawgInfoDawgInfo284   DawgInfo() : dawg_index(-1), ref(NO_EDGE) {}
DawgInfoDawgInfo285   DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {}
286   bool operator==(const DawgInfo &other) {
287     return (this->dawg_index == other.dawg_index &&
288             this->ref == other.ref);
289   }
290   int dawg_index;
291   EDGE_REF ref;
292 };
293 class DawgInfoVector : public GenericVector<DawgInfo> {
294  public:
295   // Overload destructor, since clear() does not delete data_[] any more.
~DawgInfoVector()296   ~DawgInfoVector() {
297     if (size_reserved_ > 0) {
298       delete[] data_;
299       size_used_ = 0;
300       size_reserved_ = 0;
301     }
302   }
303   // Overload clear() in order to avoid allocating/deallocating memory
304   // when clearing the vector and re-inserting entries into it later.
clear()305   void clear() { size_used_ = 0; }
306   // Adds an entry for the given dawg_index with the given node to the vec.
307   // Returns false if the same entry already exists in the vector,
308   // true otherwise.
add_unique(const DawgInfo & new_info,const char * debug_msg)309   inline bool add_unique(const DawgInfo &new_info, const char *debug_msg) {
310     for (int i = 0; i < size_used_; ++i) {
311       if (data_[i] == new_info) return false;
312     }
313     push_back(new_info);
314     if (dawg_debug_level) {
315       tprintf("%s[%d, " REFFORMAT "]\n", debug_msg,
316               new_info.dawg_index, new_info.ref);
317     }
318     return true;
319   }
320   // Removes an entry that equals to the given DawgInfo.
321   // This function assumes that the entries in the vector are unique.
322   // Returns true if an entry was found and removed.
remove(const DawgInfo & info)323   inline bool remove(const DawgInfo &info) {
324     for (int i = 0; i < size_used_; ++i) {
325       if (data_[i] == info) {
326         for (int j = i + 1; j < size_used_; ++j) {
327           data_[j-1] = data_[j];
328         }
329         size_used_--;
330         return true;
331       }
332     }
333     return false;
334   }
335 };
336 
337 //
338 // Concrete class that can operate on a compacted (squished) Dawg (read,
339 // search and write to file). This class is read-only in the sense that
340 // new words can not be added to an instance of SquishedDawg.
341 // The underlying representation of the nodes and edges in SquishedDawg
342 // is stored as a contiguous EDGE_ARRAY (read from file or given as an
343 // argument to the constructor).
344 //
345 class SquishedDawg : public Dawg {
346  public:
SquishedDawg(FILE * file,DawgType type,const STRING & lang,PermuterType perm)347   SquishedDawg(FILE *file, DawgType type,
348                const STRING &lang, PermuterType perm) {
349     read_squished_dawg(file, type, lang, perm);
350     num_forward_edges_in_node0 = num_forward_edges(0);
351   }
SquishedDawg(const char * filename,DawgType type,const STRING & lang,PermuterType perm)352   SquishedDawg(const char* filename, DawgType type,
353                const STRING &lang, PermuterType perm) {
354     FILE *file = fopen(filename, "rb");
355     if (file == NULL) {
356       tprintf("Failed to open dawg file %s\n", filename);
357       exit(1);
358     }
359     read_squished_dawg(file, type, lang, perm);
360     num_forward_edges_in_node0 = num_forward_edges(0);
361     fclose(file);
362   }
SquishedDawg(EDGE_ARRAY edges,int num_edges,DawgType type,const STRING & lang,PermuterType perm,int unicharset_size)363   SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
364                const STRING &lang, PermuterType perm, int unicharset_size) :
365     edges_(edges), num_edges_(num_edges) {
366     init(type, lang, perm, unicharset_size);
367     num_forward_edges_in_node0 = num_forward_edges(0);
368     if (dawg_debug_level > 3) print_all("SquishedDawg:");
369   }
370   ~SquishedDawg();
371 
372   // Returns the edge that corresponds to the letter out of this node.
373   EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
374                         bool word_end) const;
375 
376   // Fills the given NodeChildVector with all the unichar ids (and the
377   // corresponding EDGE_REFs) for which there is an edge out of this node.
unichar_ids_of(NODE_REF node,NodeChildVector * vec)378   void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
379     EDGE_REF edge = node;
380     if (!edge_occupied(edge) || edge == NO_EDGE) return;
381     assert(forward_edge(edge));  // we don't expect any backward edges to
382     do {                         // be present when this funciton is called
383       vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
384     } while (!last_edge(edge++));
385   }
386 
387   // Returns the next node visited by following the edge
388   // indicated by the given EDGE_REF.
next_node(EDGE_REF edge)389   NODE_REF next_node(EDGE_REF edge) const {
390     return next_node_from_edge_rec((edges_[edge]));
391   }
392 
393   // Returns true if the edge indicated by the given EDGE_REF
394   // marks the end of a word.
end_of_word(EDGE_REF edge_ref)395   bool end_of_word(EDGE_REF edge_ref) const {
396     return end_of_word_from_edge_rec((edges_[edge_ref]));
397   }
398 
399   // Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
edge_letter(EDGE_REF edge_ref)400   UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
401     return unichar_id_from_edge_rec((edges_[edge_ref]));
402   }
403 
404   // Prints the contents of the node indicated by the given NODE_REF.
405   // At most max_num_edges will be printed.
406   void print_node(NODE_REF node, int max_num_edges) const;
407 
408   // Writes the squished/reduced Dawg to a file.
409   void write_squished_dawg(const char *filename);
410 
411  private:
412   // Sets the next node link for this edge.
set_next_node(EDGE_REF edge_ref,EDGE_REF value)413   inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
414     set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
415   }
416   // Sets the edge to be empty.
set_empty_edge(EDGE_REF edge_ref)417   inline void set_empty_edge(EDGE_REF edge_ref) {
418     (edges_[edge_ref] = next_node_mask_);
419   }
420   // Goes through all the edges and clears each one out.
clear_all_edges()421   inline void clear_all_edges() {
422     for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
423   }
424   // Clears the last flag of this edge.
clear_last_flag(EDGE_REF edge_ref)425   inline void clear_last_flag(EDGE_REF edge_ref) {
426      (edges_[edge_ref] &= ~(LAST_FLAG << flag_start_bit_));
427   }
428   // Returns true if this edge is in the forward direction.
forward_edge(EDGE_REF edge_ref)429   inline bool forward_edge(EDGE_REF edge_ref) const {
430     return (edge_occupied(edge_ref) &&
431             (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
432   }
433   // Returns true if this edge is in the backward direction.
backward_edge(EDGE_REF edge_ref)434   inline bool backward_edge(EDGE_REF edge_ref) const {
435     return (edge_occupied(edge_ref) &&
436             (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
437   }
438   // Returns true if the edge spot in this location is occupied.
edge_occupied(EDGE_REF edge_ref)439   inline bool edge_occupied(EDGE_REF edge_ref) const {
440     return (edges_[edge_ref] != next_node_mask_);
441   }
442   // Returns true if this edge is the last edge in a sequence.
last_edge(EDGE_REF edge_ref)443   inline bool last_edge(EDGE_REF edge_ref) const {
444     return (edges_[edge_ref] & (LAST_FLAG << flag_start_bit_)) != 0;
445   }
446 
447   // Counts and returns the number of forward edges in this node.
448   inT32 num_forward_edges(NODE_REF node) const;
449 
450   // Reads SquishedDawg from a file.
451   void read_squished_dawg(FILE *file, DawgType type,
452                           const STRING &lang, PermuterType perm);
453 
454   // Prints the contents of an edge indicated by the given EDGE_REF.
455   void print_edge(EDGE_REF edge) const;
456 
457   // Prints the contents of the SquishedDawg.
print_all(const char * msg)458   void print_all(const char* msg) {
459     tprintf("\n__________________________\n%s\n", msg);
460     for (int i = 0; i < num_edges_; ++i) print_edge(i);
461     tprintf("__________________________\n");
462   }
463   // Constructs a mapping from the memory node indices to disk node indices.
464   NODE_MAP build_node_map(inT32 *num_nodes) const;
465 
466 
467   // Member variables.
468   EDGE_ARRAY edges_;
469   int num_edges_;
470   int num_forward_edges_in_node0;
471 };
472 }  // namespace tesseract
473 
474 #endif
475