• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        permngram.cpp
3 // Description: Character n-gram permuter
4 // Author:      Thomas Kielbus
5 // Created:     Wed Sep 12 11:26:43 PDT 2007
6 //
7 // (C) Copyright 2007, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #include "const.h"
21 #include "permngram.h"
22 #include "permute.h"
23 #include "dawg.h"
24 #include "tordvars.h"
25 #include "stopper.h"
26 #include "globals.h"
27 #include "context.h"
28 #include "ndminx.h"
29 #include "dict.h"
30 #include "conversion.h"
31 
32 #include <math.h>
33 #include <ctype.h>
34 
// Ratio to control the relative importance of the classifier and the ngram
// in the final score of a classification unit. Must be >= 0 and <= 1.
// A value of 1.0 uses only the shape classifier score.
// A value of 0.0 uses only the ngram score.
// Punctuation like ',' and '.' overrides this to 1.0 (see
// get_classifier_score_ngram_score_ratio below).
double_VAR(classifier_score_ngram_score_ratio,
           0.7,
           "");

// Rating adjustment multiplier for words not in the DAWG. Must be >= 1.
// Applied to the accumulated rating once when a prefix first stops being a
// valid DAWG prefix, and to each character rating appended after that.
double_VAR(non_dawg_prefix_rating_adjustment,
           1.5,
           "");
47 
// HypothesisPrefix represents a word prefix during the search of the
// character-level n-gram model based permuter.
// It holds the data needed to create the corresponding A_CHOICE.
// Note that the string stored in the word_ data member always begins with a
// space character. This is used by the n-gram model to score the word.
// HypothesisPrefix also contains the node in the DAWG that is reached when
// searching for the corresponding prefix.
class HypothesisPrefix {
 public:
  // Builds the initial (empty-word) hypothesis used to seed the search.
  HypothesisPrefix();
  // Builds a new hypothesis by appending one character choice to prefix,
  // updating the rating with the n-gram model and advancing the DAWG state.
  HypothesisPrefix(const HypothesisPrefix& prefix,
                   A_CHOICE* choice,
                   bool end_of_word,
                   const tesseract::Dawg *dawg,
                   tesseract::Dict* dict);

  // Accumulated mixed classifier/n-gram rating (lower is better).
  double rating() const {return rating_;}
  // Worst (minimum) certainty of any character appended so far.
  double certainty() const {return certainty_;}
  // Word text; byte 0 is always the leading space for n-gram scoring.
  const char* word() const {return word_;}
  // Byte length of each appended character, parallel to certainty_array().
  const char* unichar_lengths() const {return unichar_lengths_;}
  // Per-character certainties for the appended characters.
  const float* certainty_array() const {return certainty_array_;}
  // True while the word built so far is still a valid DAWG prefix.
  bool is_dawg_prefix() const {return is_dawg_prefix_;}
  // DAWG node reached after matching the current prefix.
  NODE_REF dawg_node() const {return dawg_node_;}

 private:
  double rating_;
  double certainty_;
  // +2 bytes: one for the leading space, one for the terminating NUL.
  char word_[UNICHAR_LEN * MAX_WERD_LENGTH + 2];
  char unichar_lengths_[MAX_WERD_LENGTH + 1];
  float certainty_array_[MAX_WERD_LENGTH + 1];
  NODE_REF dawg_node_;
  bool is_dawg_prefix_;
};
81 
// HypothesisPrefix is the class used as nodes in HypothesisPrefixLists
typedef HypothesisPrefix HypothesisPrefixListNode;

// HypothesisPrefixList maintains a sorted list of HypothesisPrefixes. The size
// is bounded by the argument given to the constructor.
// For the sake of simplicity, current implementation is not as efficient as it
// could be. The list is represented by a static array of pointers to its
// elements. All nodes are stored in positions from 0 to (size() - 1).
// The list OWNS its nodes: add_node takes heap-allocated nodes and the list
// deletes them on eviction, clear() or destruction.
class HypothesisPrefixList {
 public:
  // Creates an empty list holding at most size_bound nodes.
  HypothesisPrefixList(int size_bound);
  // Deletes every contained node along with the list itself.
  ~HypothesisPrefixList();

  // Inserts node keeping ratings ascending; takes ownership and may delete
  // node (or an evicted worse node) when the list is full.
  void add_node(HypothesisPrefix* node);
  int size() const {return _size;}
  // Deletes all contained nodes and resets size to 0.
  void clear();
  // Returns the node at index; index 0 is the best (lowest) rating.
  const HypothesisPrefix& node(int index) {return *_list_nodes[index];}

 private:
  HypothesisPrefix** _list_nodes;
  int _size_bound;
  int _size;
};
105 
// Return the classifier_score_ngram_score_ratio for a given choice string.
// The classification decision for characters like comma and period should
// be based only on shape rather than on shape and n-gram score.
// Return 1.0 for them, the default classifier_score_ngram_score_ratio
// otherwise. (Defined below, after the permuter entry point.)
static double get_classifier_score_ngram_score_ratio(const char* choice);
112 
113 // Permute the given char_choices using a character level n-gram model and
114 // return the best word choice found.
115 // This is performed by maintaining a HypothesisPrefixList of HypothesisPrefixes.
116 // For each character position, each possible character choice is appended to
117 // the best current prefixes to create the list of best prefixes at the next
118 // character position.
119 namespace tesseract {
ngram_permute_and_select(CHOICES_LIST char_choices,float rating_limit,const Dawg * dawg)120 A_CHOICE *Dict::ngram_permute_and_select(CHOICES_LIST char_choices,
121                                          float rating_limit,
122                                          const Dawg *dawg) {
123   if (array_count (char_choices) <= MAX_WERD_LENGTH) {
124     CHOICES choices;
125     int char_index_max = array_count(char_choices);
126     HypothesisPrefixList list_1(20);
127     HypothesisPrefixList list_2(20);
128     HypothesisPrefixList* current_list = &list_1;
129     HypothesisPrefixList* next_list = &list_2;
130     HypothesisPrefix* initial_node = new HypothesisPrefix();
131     current_list->add_node(initial_node);
132     for (int char_index = 0; char_index < char_index_max; ++char_index) {
133       iterate_list(choices, (CHOICES) array_index(char_choices, char_index)) {
134         A_CHOICE* choice = (A_CHOICE *) first_node(choices);
135         for (int node_index = 0;
136              node_index < current_list->size();
137              ++node_index) {
138           // Append this choice to the current node
139           HypothesisPrefix* new_node = new HypothesisPrefix(
140               current_list->node(node_index),
141               choice,
142               char_index == char_index_max - 1,
143               dawg, this);
144           next_list->add_node(new_node);
145         }
146       }
147       // Clear current list and switch lists
148       current_list->clear();
149       HypothesisPrefixList* temp_list = current_list;
150       current_list = next_list;
151       next_list = temp_list;
152 
153       // Give up if the current best rating is worse than rating_limit
154       if (current_list->node(0).rating() > rating_limit)
155         return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
156     }
157     const HypothesisPrefix& best_word = current_list->node(0);
158     A_CHOICE* best_choice = new_choice (best_word.word() + 1,
159                                         best_word.unichar_lengths(),
160                                         best_word.rating(),
161                                         best_word.certainty(), -1,
162                                         valid_word(best_word.word() + 1) ?
163                                         SYSTEM_DAWG_PERM : TOP_CHOICE_PERM);
164     LogNewWordChoice(best_choice, best_word.is_dawg_prefix() ?
165                      1.0 : non_dawg_prefix_rating_adjustment,
166                      const_cast<float*>(best_word.certainty_array()),
167                      getUnicharset());
168     return best_choice;
169   } else {
170     return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
171   }
172 }
173 }  // namespace tesseract
174 
get_classifier_score_ngram_score_ratio(const char * choice)175 double get_classifier_score_ngram_score_ratio(const char* choice) {
176   if (!strcmp(",", choice) ||
177       !strcmp(".", choice))
178     return 1.0;
179   else
180     return classifier_score_ngram_score_ratio;
181 }
182 
183 // Initial HypothesisPrefix constructor used to create the first state of the
184 // search.
HypothesisPrefix()185 HypothesisPrefix::HypothesisPrefix() {
186   rating_ = 0;
187   certainty_ = MAXFLOAT;
188   strcpy(word_, " ");
189   unichar_lengths_[0] = '\0';
190   dawg_node_ = 0;
191   is_dawg_prefix_ = true;
192 }
193 
// Main constructor to create a new HypothesisPrefix by appending a character
// choice (A_CHOICE) to an existing HypothesisPrefix. This constructor takes
// care of copying the original prefix's data members, appends the character
// choice to the word and updates its rating using a character-level n-gram
// model. The state in the DAWG is also updated.
// NOTE: the DAWG verification loop below is currently a stub that aborts the
// process (see the TODO); this code path must not be reached in production.
HypothesisPrefix::HypothesisPrefix(const HypothesisPrefix& prefix,
                                   A_CHOICE* choice,
                                   bool end_of_word,
                                   const tesseract::Dawg *dawg,
                                   tesseract::Dict* dict) {
  char* word_ptr = word_;
  const char* prefix_word_ptr = prefix.word_;

  // Copy first space character (byte 0 of every word_ is a space).
  *(word_ptr++) = *(prefix_word_ptr++);

  // Copy existing word, unichar_lengths, certainty_array. unichar_lengths_
  // holds the byte length of each character, so the inner loop copies one
  // (possibly multi-byte) character per outer iteration.
  int char_index;
  for (char_index = 0;
       prefix.unichar_lengths_[char_index] != '\0';
       ++char_index) {
    for (int char_subindex = 0;
         char_subindex < prefix.unichar_lengths_[char_index];
         ++char_subindex) {
      *(word_ptr++) = *(prefix_word_ptr++);
    }
    unichar_lengths_[char_index] = prefix.unichar_lengths_[char_index];
    certainty_array_[char_index] = prefix.certainty_array_[char_index];
  }

  // If choice is empty, use a space character instead.
  const char* class_string_choice = *class_string(choice) == '\0' ?
      " " : class_string(choice);

  // Update certainty: the word's certainty is the worst (minimum) certainty
  // over all its characters.
  certainty_ = MIN(prefix.certainty_, class_certainty(choice));

  // Append choice to the word (word_ptr now points just past the copied
  // prefix bytes).
  strcpy(word_ptr, class_string_choice);
  unichar_lengths_[char_index] = strlen(class_string_choice);
  unichar_lengths_[char_index + 1] = '\0';

  // Append choice certainty to the certainty array.
  certainty_array_[char_index] = class_certainty(choice);

  // Copy DAWG node state.
  dawg_node_ = prefix.dawg_node_;
  is_dawg_prefix_ = prefix.is_dawg_prefix_;

  // Verify DAWG and update dawg_node_ if the current prefix is already valid.
  if (is_dawg_prefix_) {
    for (int char_subindex = 0;
         class_string_choice[char_subindex] != '\0';
         ++char_subindex) {

      // TODO(daria): update this code (and the rest of ngram permuter code
      // to deal with unichar ids, make use of the new parallel dawg search
      // and use WERD_CHOICE, BLOB_CHOICE_LIST_VECTOR instead of the deprecated
      // A_CHOICE.
      tprintf("Error: ngram permuter functionality is not available\n");
      exit(1);

      // Verify each byte of the appended character. Note that word_ptr points
      // to the first byte so (word_ptr - (word_ + 1)) is the index of the first
      // new byte in the string that starts at (word_ + 1).
      /*
      int current_byte_index = word_ptr - (word_ + 1) + char_subindex;
      if (!(dict->*dict->letter_is_okay_)(
         dawg, &dawg_node_, current_byte_index, word_ + 1,
         end_of_word && class_string_choice[char_subindex + 1] == '\0')) {
        dawg_node_ = NO_EDGE;
        is_dawg_prefix_ = false;
        break;
      }
      */
    }
  }

  // Copy the prefix rating.
  rating_ = prefix.rating_;

  // Compute rating of current character given the preceding text.
  double probability = probability_in_context(prefix.word_, -1,
                                              class_string_choice, -1);

  // If last character of the word, take the following space into account.
  if (end_of_word)
    probability *= probability_in_context(word_, -1, " ", -1);

  double local_classifier_score_ngram_score_ratio =
      get_classifier_score_ngram_score_ratio(class_string_choice);

  // Blend the shape classifier rating with the n-gram surprisal (in bits).
  double classifier_rating = class_rating(choice);
  double ngram_rating = -log(probability) / log(2.0);
  double mixed_rating =
      local_classifier_score_ngram_score_ratio * classifier_rating +
      (1 - local_classifier_score_ngram_score_ratio) * ngram_rating;

  // If the current word is not a valid prefix, adjust the rating of the
  // character being appended. If it used to be a valid prefix, compensate for
  // previous adjustments.
  if (!is_dawg_prefix_) {
    if (prefix.is_dawg_prefix_)
      rating_ *= non_dawg_prefix_rating_adjustment;
    mixed_rating *= non_dawg_prefix_rating_adjustment;
  }

  // Update rating by adding the rating of the character being appended.
  rating_ += mixed_rating;
}
304 
305 // Create an empty HypothesisPrefixList. Its maximum size is set to the given
306 // bound.
HypothesisPrefixList(int size_bound)307 HypothesisPrefixList::HypothesisPrefixList(int size_bound):
308     _size_bound(size_bound),
309     _size(0) {
310   _list_nodes = new HypothesisPrefix*[_size_bound];
311   for (int i = 0; i < _size_bound; ++i)
312     _list_nodes[i] = NULL;
313 }
314 
315 // Destroy a HypothesisPrefixList all contained nodes are deleted as well.
~HypothesisPrefixList()316 HypothesisPrefixList::~HypothesisPrefixList() {
317   this->clear();
318   delete[] _list_nodes;
319 }
320 
// Add a node to the HypothesisPrefixList. Maintains the sorted list property.
// Note that the HypothesisPrefixList takes ownership of the given node and
// might delete it if needed. It must therefore have been allocated on the heap.
void HypothesisPrefixList::add_node(HypothesisPrefix* node) {
  // Fast path: the node rates worse than the current worst element, so it
  // either goes at the very end or is dropped when the list is full.
  if (_size > 0 && _list_nodes[_size - 1]->rating() < node->rating()) {
    if (_size == _size_bound) {
      // The list is already full. This node will not be added.
      delete node;
    } else {
      // The list is not full. Add the node at the last position.
      _list_nodes[_size] = node;
      ++_size;
    }
    return;
  }
  // Find the insertion position: first slot that is empty or holds a node
  // rating no better than the new one (ratings are kept ascending).
  int node_index_target = 0;
  while (node_index_target < _size_bound &&
         _list_nodes[node_index_target] != NULL &&
         _list_nodes[node_index_target]->rating() < node->rating()) {
    ++node_index_target;
  }
  if (node_index_target >= _size_bound) {
    // Every slot of a full list rates better: discard the new node.
    delete node;
  } else {
    // Move next states by 1. Starting from the last one. A node shifted out
    // of the final slot of a full list is evicted (deleted).
    int node_index_move = _size - 1;
    while (node_index_move >= node_index_target) {
      if (node_index_move == _size_bound - 1)
        delete _list_nodes[node_index_move];
      else
        _list_nodes[node_index_move + 1] = _list_nodes[node_index_move];
      _list_nodes[node_index_move] = NULL;
      --node_index_move;
    }
    // Insert new node
    _list_nodes[node_index_target] = node;
    // Increment size if it has changed
    if (_size < _size_bound)
      ++_size;
  }
}
365 
366 // Delete all contained nodes and set the size of the HypothesisPrefixList to 0
clear()367 void HypothesisPrefixList::clear() {
368   for (int i = 0; i < _size_bound && _list_nodes[i] != NULL; ++i) {
369     delete _list_nodes[i];
370     _list_nodes[i] = NULL;
371   }
372   _size = 0;
373 }
374