• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/**********************************************************************
 * File:        tfacepp.cpp  (Formerly tface++.c)
 * Description: C++ side of the C/C++ Tess/Editor interface.
 * Author:                  Ray Smith
 * Created:                 Thu Apr 23 15:39:23 BST 1992
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
19 
20 #include "mfcpch.h"
21 #ifdef __UNIX__
22 #include          <assert.h>
23 #endif
24 #include          "errcode.h"
25 #include          "ratngs.h"
26 #include          "reject.h"
27 #include          "werd.h"
28 #include          "tfacep.h"
29 #include          "tstruct.h"
30 #include          "tfacepp.h"
31 #include          "tessvars.h"
32 #include          "globals.h"
33 #include          "reject.h"
34 #include          "tesseractclass.h"
35 
36 #define EXTERN
37 
// Flag read by Tesseract::recog_word.  When it is set, a word choice whose
// permuter is not one of the dawg permuters may have its permuter replaced
// by the type that dict_word() reports for the same word.  Declared here
// with the value TRUE.
38 EXTERN BOOL_VAR (tessedit_override_permuter, TRUE, "According to dict_word");
39 
40 
// Blob-count threshold used by Tesseract::recog_word_recursive: a word with
// more blobs than this is passed to split_and_recog_word instead of being
// recognised in one piece.
41 #define MAX_UNDIVIDED_LENGTH 24
42 
43 
44 
45 /**********************************************************************
46  * recog_word
47  *
48  * Convert the word to tess form and pass it to the tess segmenter.
49  * Convert the output back to editor form.
50  **********************************************************************/
51 namespace tesseract {
recog_word(WERD * word,DENORM * denorm,POLY_MATCHER matcher,POLY_TESTER tester,POLY_TESTER trainer,BOOL8 testing,WERD_CHOICE * & raw_choice,BLOB_CHOICE_LIST_CLIST * blob_choices,WERD * & outword)52 WERD_CHOICE *Tesseract::recog_word(                     //recog one owrd
53                                    WERD *word,          //word to do
54                                    DENORM *denorm,      //de-normaliser
55                                                         //matcher function
56                                    POLY_MATCHER matcher,
57                                    POLY_TESTER tester,  //tester function
58                                    POLY_TESTER trainer, //trainer function
59                                    BOOL8 testing,       //true if answer driven
60                                                         //raw result
61                                    WERD_CHOICE *&raw_choice,
62                                                         //list of blob lists
63                                    BLOB_CHOICE_LIST_CLIST *blob_choices,
64                                    WERD *&outword       //bln word output
65                                   ) {
66   WERD_CHOICE *word_choice;
67   uinT8 perm_type;
68   uinT8 real_dict_perm_type;
69 
70   if (word->blob_list ()->empty ()) {
71     word_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
72                                   TOP_CHOICE_PERM, unicharset);
73     raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
74                                  TOP_CHOICE_PERM, unicharset);
75     outword = word->poly_copy (denorm->row ()->x_height ());
76   }
77   else
78     word_choice = recog_word_recursive (word, denorm, matcher, tester,
79       trainer, testing, raw_choice,
80       blob_choices, outword);
81   if ((word_choice->length() != outword->blob_list()->length()) ||
82       (word_choice->length() != blob_choices->length())) {
83     tprintf
84       ("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
85       word_choice->debug_string(unicharset).string(),
86       word_choice->length(), outword->blob_list()->length(),
87       blob_choices->length());
88   }
89   ASSERT_HOST(word_choice->length() == outword->blob_list()->length());
90   ASSERT_HOST(word_choice->length() == blob_choices->length());
91 
92   /* Copy any reject blobs into the outword */
93   outword->rej_blob_list()->deep_copy(word->rej_blob_list(), &PBLOB::deep_copy);
94 
95   if (tessedit_override_permuter) {
96     /* Override the permuter type if a straight dictionary check disagrees. */
97     perm_type = word_choice->permuter();
98     if ((perm_type != SYSTEM_DAWG_PERM) &&
99         (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
100       real_dict_perm_type = dict_word(*word_choice);
101       if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
102            (real_dict_perm_type == FREQ_DAWG_PERM) ||
103            (real_dict_perm_type == USER_DAWG_PERM)) &&
104           (alpha_count(word_choice->unichar_string().string(),
105                       word_choice->unichar_lengths().string()) > 0)) {
106         word_choice->set_permuter (real_dict_perm_type);  // use dict perm
107       }
108     }
109     if (tessedit_rejection_debug && perm_type != word_choice->permuter ()) {
110       tprintf ("Permuter Type Flipped from %d to %d\n",
111         perm_type, word_choice->permuter ());
112     }
113   }
114   assert ((word_choice == NULL) == (raw_choice == NULL));
115   return word_choice;
116 }
117 
118 
119 /**********************************************************************
120  * recog_word_recursive
121  *
122  * Convert the word to tess form and pass it to the tess segmenter.
123  * Convert the output back to editor form.
124  **********************************************************************/
125 WERD_CHOICE *
recog_word_recursive(WERD * word,DENORM * denorm,POLY_MATCHER matcher,POLY_TESTER tester,POLY_TESTER trainer,BOOL8 testing,WERD_CHOICE * & raw_choice,BLOB_CHOICE_LIST_CLIST * blob_choices,WERD * & outword)126 Tesseract::recog_word_recursive(
127     WERD *word,                            // word to do
128     DENORM *denorm,                        // de-normaliser
129     POLY_MATCHER matcher,                  // matcher function
130     POLY_TESTER tester,                    // tester function
131     POLY_TESTER trainer,                   // trainer function
132     BOOL8 testing,                         // true if answer driven
133     WERD_CHOICE *&raw_choice,              // raw result
134     BLOB_CHOICE_LIST_CLIST *blob_choices,  // list of blob lists
135     WERD *&outword                         // bln word output
136     ) {
137   inT32 initial_blob_choice_len;
138   inT32 word_length;                      // no of blobs
139   STRING word_string;                     // converted from tess
140   STRING word_string_lengths;
141   BLOB_CHOICE_LIST_VECTOR *tess_ratings;  // tess results
142   TWERD *tessword;                        // tess format
143   BLOB_CHOICE_LIST_C_IT blob_choices_it;  // iterator
144 
145   tess_matcher = matcher;           // install matcher
146   tess_tester = testing ? tester : NULL;
147   tess_trainer = testing ? trainer : NULL;
148   tess_denorm = denorm;
149   tess_word = word;
150   //      blob_matchers[1]=call_matcher;
151   if (word->blob_list ()->length () > MAX_UNDIVIDED_LENGTH) {
152     return split_and_recog_word (word, denorm, matcher, tester, trainer,
153       testing, raw_choice, blob_choices,
154       outword);
155   } else {
156     UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
157     WERD_CHOICE *best_choice = new WERD_CHOICE();
158     raw_choice = new WERD_CHOICE();
159     initial_blob_choice_len = blob_choices->length();
160     tessword = make_tess_word (word, NULL);
161     tess_ratings = cc_recog(tessword, best_choice, raw_choice,
162                             testing && tester != NULL,
163                             testing && trainer != NULL,
164                             word->flag(W_EOL));
165 
166     outword = make_ed_word (tessword, word);  // convert word
167     if (outword == NULL) {
168       outword = word->poly_copy (denorm->row ()->x_height ());
169     }
170     delete_word(tessword);  // get rid of it
171     word_length = outword->blob_list()->length();  // no of blobs
172 
173     // Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
174     blob_choices_it.set_to_list(blob_choices);
175     for (int i = 0; i < tess_ratings->length(); ++i) {
176       blob_choices_it.add_to_end(tess_ratings->get(i));
177     }
178     delete tess_ratings;
179 
180     // Pad raw_choice with spaces if needed.
181     if (raw_choice->length() < word_length) {
182       while (raw_choice->length() < word_length) {
183         raw_choice->append_unichar_id(space_id, 1, 0.0,
184                                       raw_choice->certainty());
185       }
186       raw_choice->populate_unichars(unicharset);
187     }
188 
189     // Do sanity checks and minor fixes on best_choice.
190     if (best_choice->length() > word_length) {
191       tprintf("recog_word: Discarded long string \"%s\""
192               " (%d characters vs %d blobs)\n",
193               best_choice->unichar_string().string (),
194               best_choice->length(), word_length);
195       best_choice->make_bad();  // should never happen
196       tprintf("Word is at (%g,%g)\n",
197               denorm->origin(),
198               denorm->y(word->bounding_box().bottom(), 0.0));
199     }
200     if (blob_choices->length() - initial_blob_choice_len != word_length) {
201       best_choice->make_bad();  // force rejection
202       tprintf ("recog_word: Choices list len:%d; blob lists len:%d\n",
203         blob_choices->length(), word_length);
204       blob_choices_it.set_to_list(blob_choices);  // list of lists
205       while (blob_choices->length() - initial_blob_choice_len < word_length) {
206         blob_choices_it.add_to_end(new BLOB_CHOICE_LIST());  // add a fake one
207         tprintf("recog_word: Added dummy choice list\n");
208       }
209       while (blob_choices->length() - initial_blob_choice_len > word_length) {
210         blob_choices_it.move_to_last(); // should never happen
211         delete blob_choices_it.extract();
212         tprintf("recog_word: Deleted choice list\n");
213       }
214     }
215     if (best_choice->length() < word_length) {
216       while (best_choice->length() < word_length) {
217         best_choice->append_unichar_id(space_id, 1, 0.0,
218                                        best_choice->certainty());
219       }
220       best_choice->populate_unichars(unicharset);
221     }
222 
223     return best_choice;
224   }
225 }
226 
227 
228 /**********************************************************************
229  * split_and_recog_word
230  *
231  * Convert the word to tess form and pass it to the tess segmenter.
232  * Convert the output back to editor form.
233  **********************************************************************/
234 
235 WERD_CHOICE *
split_and_recog_word(WERD * word,DENORM * denorm,POLY_MATCHER matcher,POLY_TESTER tester,POLY_TESTER trainer,BOOL8 testing,WERD_CHOICE * & raw_choice,BLOB_CHOICE_LIST_CLIST * blob_choices,WERD * & outword)236 Tesseract::split_and_recog_word(                        //recog one owrd
237                                 WERD *word,             //word to do
238                                 DENORM *denorm,         //de-normaliser
239                                 POLY_MATCHER matcher,   //matcher function
240                                 POLY_TESTER tester,     //tester function
241                                 POLY_TESTER trainer,    //trainer function
242                                 BOOL8 testing,          //true if answer driven
243                                                         //raw result
244                                 WERD_CHOICE *&raw_choice,
245                                                         //list of blob lists
246                                 BLOB_CHOICE_LIST_CLIST *blob_choices,
247                                 WERD *&outword          //bln word output
248                                ) {
249   //   inT32                                                      outword1_len;
250   //   inT32                                                      outword2_len;
251   WERD *first_word;              //poly copy of word
252   WERD *second_word;             //fabricated word
253   WERD *outword2;                //2nd output word
254   PBLOB *blob;
255   WERD_CHOICE *result;           //return value
256   WERD_CHOICE *result2;          //output of 2nd word
257   WERD_CHOICE *raw_choice2;      //raw version of 2nd
258   float gap;                     //blob gap
259   float bestgap;                 //biggest gap
260   PBLOB_LIST new_blobs;          //list of gathered blobs
261   PBLOB_IT blob_it;
262                                  //iterator
263   PBLOB_IT new_blob_it = &new_blobs;
264 
265   first_word = word->poly_copy (denorm->row ()->x_height ());
266   blob_it.set_to_list (first_word->blob_list ());
267   bestgap = -MAX_INT32;
268   while (!blob_it.at_last ()) {
269     blob = blob_it.data ();
270                                  //gap to next
271     gap = blob_it.data_relative(1)->bounding_box().left() -
272         blob->bounding_box().right();
273     blob_it.forward ();
274     if (gap > bestgap) {
275       bestgap = gap;             //find biggest
276       new_blob_it = blob_it;     //save position
277     }
278   }
279                                  //take 2nd half
280   new_blobs.assign_to_sublist (&new_blob_it, &blob_it);
281                                  //make it a word
282   second_word = new WERD (&new_blobs, 1, NULL);
283   ASSERT_HOST (word->blob_list ()->length () ==
284     first_word->blob_list ()->length () +
285     second_word->blob_list ()->length ());
286 
287   result = recog_word_recursive (first_word, denorm, matcher,
288     tester, trainer, testing, raw_choice,
289     blob_choices, outword);
290   delete first_word;             //done that one
291   result2 = recog_word_recursive (second_word, denorm, matcher,
292     tester, trainer, testing, raw_choice2,
293     blob_choices, outword2);
294   delete second_word;            //done that too
295   *result += *result2;           //combine ratings
296   delete result2;
297   *raw_choice += *raw_choice2;
298   delete raw_choice2;            //finished with it
299   //   outword1_len= outword->blob_list()->length();
300   //   outword2_len= outword2->blob_list()->length();
301   outword->join_on (outword2);   //join words
302   delete outword2;
303   //   if ( outword->blob_list()->length() != outword1_len + outword2_len )
304   //      tprintf( "Split&Recog: part1len=%d; part2len=%d; combinedlen=%d\n",
305   //                                outword1_len, outword2_len, outword->blob_list()->length() );
306   //   ASSERT_HOST( outword->blob_list()->length() == outword1_len + outword2_len );
307   return result;
308 }
309 
310 }  // namespace tesseract
311 
/**********************************************************************
 * call_tester
 *
 * Disabled (compiled out with #if 0).
 * Takes a blob in tess form, converts it to editor form and converts
 * the matcher output to a BLOB_CHOICE_LIST.  If tess_tester is set,
 * it is called with the converted blob and ratings.  Does nothing when
 * the blob conversion returns NULL.
 **********************************************************************/
#if 0  // dead code
void call_tester(const STRING& filename,  //passed through to the tester
                 TBLOB *tessblob,         //blob to test
                 BOOL8 correct_blob,      //true if good
                 char *text,              //source text
                 inT32 count,             //chars in text
                 LIST result) {           //output of matcher
  PBLOB *ed_blob = make_ed_blob(tessblob);  //convert blob
  if (ed_blob != NULL) {
    BLOB_CHOICE_LIST ratings;               //matcher result
    convert_choice_list(result, ratings);   //make it right type
    if (tess_tester != NULL) {
      (*tess_tester)(filename, ed_blob, tess_denorm, correct_blob, text,
                     count, &ratings);
    }
    delete ed_blob;                         //don't need that now
  }
}
#endif
341 
/**********************************************************************
 * call_train_tester
 *
 * Disabled (compiled out with #if 0).
 * Takes a blob in tess form, converts it to editor form and converts
 * the matcher output to a BLOB_CHOICE_LIST.  If tess_trainer is set,
 * it is called with the converted blob and ratings.  Does nothing when
 * the blob conversion returns NULL.
 **********************************************************************/
#if 0  // dead code
void call_train_tester(const STRING& filename,  //passed through to the trainer
                       TBLOB *tessblob,         //blob to test
                       BOOL8 correct_blob,      //true if good
                       char *text,              //source text
                       inT32 count,             //chars in text
                       LIST result) {           //output of matcher
  PBLOB *ed_blob = make_ed_blob(tessblob);  //convert blob
  if (ed_blob != NULL) {
    BLOB_CHOICE_LIST ratings;               //matcher result
    convert_choice_list(result, ratings);   //make it right type
    if (tess_trainer != NULL) {
      (*tess_trainer)(filename, ed_blob, tess_denorm, correct_blob, text,
                      count, &ratings);
    }
    delete ed_blob;                         //don't need that now
  }
}
#endif
371