1 /**********************************************************************
2 * File: tfacepp.cpp (Formerly tface++.c)
3 * Description: C++ side of the C/C++ Tess/Editor interface.
4 * Author: Ray Smith
5 * Created: Thu Apr 23 15:39:23 BST 1992
6 *
7 * (C) Copyright 1992, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #include "mfcpch.h"
21 #ifdef __UNIX__
22 #include <assert.h>
23 #endif
24 #include "errcode.h"
25 #include "ratngs.h"
26 #include "reject.h"
27 #include "werd.h"
28 #include "tfacep.h"
29 #include "tstruct.h"
30 #include "tfacepp.h"
31 #include "tessvars.h"
32 #include "globals.h"
33 #include "reject.h"
34 #include "tesseractclass.h"
35
36 #define EXTERN
37
38 EXTERN BOOL_VAR (tessedit_override_permuter, TRUE, "According to dict_word");
39
40
41 #define MAX_UNDIVIDED_LENGTH 24
42
43
44
45 /**********************************************************************
46 * recog_word
47 *
48 * Convert the word to tess form and pass it to the tess segmenter.
49 * Convert the output back to editor form.
50 **********************************************************************/
51 namespace tesseract {
recog_word(WERD * word,DENORM * denorm,POLY_MATCHER matcher,POLY_TESTER tester,POLY_TESTER trainer,BOOL8 testing,WERD_CHOICE * & raw_choice,BLOB_CHOICE_LIST_CLIST * blob_choices,WERD * & outword)52 WERD_CHOICE *Tesseract::recog_word( //recog one owrd
53 WERD *word, //word to do
54 DENORM *denorm, //de-normaliser
55 //matcher function
56 POLY_MATCHER matcher,
57 POLY_TESTER tester, //tester function
58 POLY_TESTER trainer, //trainer function
59 BOOL8 testing, //true if answer driven
60 //raw result
61 WERD_CHOICE *&raw_choice,
62 //list of blob lists
63 BLOB_CHOICE_LIST_CLIST *blob_choices,
64 WERD *&outword //bln word output
65 ) {
66 WERD_CHOICE *word_choice;
67 uinT8 perm_type;
68 uinT8 real_dict_perm_type;
69
70 if (word->blob_list ()->empty ()) {
71 word_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
72 TOP_CHOICE_PERM, unicharset);
73 raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
74 TOP_CHOICE_PERM, unicharset);
75 outword = word->poly_copy (denorm->row ()->x_height ());
76 }
77 else
78 word_choice = recog_word_recursive (word, denorm, matcher, tester,
79 trainer, testing, raw_choice,
80 blob_choices, outword);
81 if ((word_choice->length() != outword->blob_list()->length()) ||
82 (word_choice->length() != blob_choices->length())) {
83 tprintf
84 ("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
85 word_choice->debug_string(unicharset).string(),
86 word_choice->length(), outword->blob_list()->length(),
87 blob_choices->length());
88 }
89 ASSERT_HOST(word_choice->length() == outword->blob_list()->length());
90 ASSERT_HOST(word_choice->length() == blob_choices->length());
91
92 /* Copy any reject blobs into the outword */
93 outword->rej_blob_list()->deep_copy(word->rej_blob_list(), &PBLOB::deep_copy);
94
95 if (tessedit_override_permuter) {
96 /* Override the permuter type if a straight dictionary check disagrees. */
97 perm_type = word_choice->permuter();
98 if ((perm_type != SYSTEM_DAWG_PERM) &&
99 (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
100 real_dict_perm_type = dict_word(*word_choice);
101 if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
102 (real_dict_perm_type == FREQ_DAWG_PERM) ||
103 (real_dict_perm_type == USER_DAWG_PERM)) &&
104 (alpha_count(word_choice->unichar_string().string(),
105 word_choice->unichar_lengths().string()) > 0)) {
106 word_choice->set_permuter (real_dict_perm_type); // use dict perm
107 }
108 }
109 if (tessedit_rejection_debug && perm_type != word_choice->permuter ()) {
110 tprintf ("Permuter Type Flipped from %d to %d\n",
111 perm_type, word_choice->permuter ());
112 }
113 }
114 assert ((word_choice == NULL) == (raw_choice == NULL));
115 return word_choice;
116 }
117
118
119 /**********************************************************************
120 * recog_word_recursive
121 *
122 * Convert the word to tess form and pass it to the tess segmenter.
123 * Convert the output back to editor form.
124 **********************************************************************/
125 WERD_CHOICE *
recog_word_recursive(WERD * word,DENORM * denorm,POLY_MATCHER matcher,POLY_TESTER tester,POLY_TESTER trainer,BOOL8 testing,WERD_CHOICE * & raw_choice,BLOB_CHOICE_LIST_CLIST * blob_choices,WERD * & outword)126 Tesseract::recog_word_recursive(
127 WERD *word, // word to do
128 DENORM *denorm, // de-normaliser
129 POLY_MATCHER matcher, // matcher function
130 POLY_TESTER tester, // tester function
131 POLY_TESTER trainer, // trainer function
132 BOOL8 testing, // true if answer driven
133 WERD_CHOICE *&raw_choice, // raw result
134 BLOB_CHOICE_LIST_CLIST *blob_choices, // list of blob lists
135 WERD *&outword // bln word output
136 ) {
137 inT32 initial_blob_choice_len;
138 inT32 word_length; // no of blobs
139 STRING word_string; // converted from tess
140 STRING word_string_lengths;
141 BLOB_CHOICE_LIST_VECTOR *tess_ratings; // tess results
142 TWERD *tessword; // tess format
143 BLOB_CHOICE_LIST_C_IT blob_choices_it; // iterator
144
145 tess_matcher = matcher; // install matcher
146 tess_tester = testing ? tester : NULL;
147 tess_trainer = testing ? trainer : NULL;
148 tess_denorm = denorm;
149 tess_word = word;
150 // blob_matchers[1]=call_matcher;
151 if (word->blob_list ()->length () > MAX_UNDIVIDED_LENGTH) {
152 return split_and_recog_word (word, denorm, matcher, tester, trainer,
153 testing, raw_choice, blob_choices,
154 outword);
155 } else {
156 UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
157 WERD_CHOICE *best_choice = new WERD_CHOICE();
158 raw_choice = new WERD_CHOICE();
159 initial_blob_choice_len = blob_choices->length();
160 tessword = make_tess_word (word, NULL);
161 tess_ratings = cc_recog(tessword, best_choice, raw_choice,
162 testing && tester != NULL,
163 testing && trainer != NULL,
164 word->flag(W_EOL));
165
166 outword = make_ed_word (tessword, word); // convert word
167 if (outword == NULL) {
168 outword = word->poly_copy (denorm->row ()->x_height ());
169 }
170 delete_word(tessword); // get rid of it
171 word_length = outword->blob_list()->length(); // no of blobs
172
173 // Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
174 blob_choices_it.set_to_list(blob_choices);
175 for (int i = 0; i < tess_ratings->length(); ++i) {
176 blob_choices_it.add_to_end(tess_ratings->get(i));
177 }
178 delete tess_ratings;
179
180 // Pad raw_choice with spaces if needed.
181 if (raw_choice->length() < word_length) {
182 while (raw_choice->length() < word_length) {
183 raw_choice->append_unichar_id(space_id, 1, 0.0,
184 raw_choice->certainty());
185 }
186 raw_choice->populate_unichars(unicharset);
187 }
188
189 // Do sanity checks and minor fixes on best_choice.
190 if (best_choice->length() > word_length) {
191 tprintf("recog_word: Discarded long string \"%s\""
192 " (%d characters vs %d blobs)\n",
193 best_choice->unichar_string().string (),
194 best_choice->length(), word_length);
195 best_choice->make_bad(); // should never happen
196 tprintf("Word is at (%g,%g)\n",
197 denorm->origin(),
198 denorm->y(word->bounding_box().bottom(), 0.0));
199 }
200 if (blob_choices->length() - initial_blob_choice_len != word_length) {
201 best_choice->make_bad(); // force rejection
202 tprintf ("recog_word: Choices list len:%d; blob lists len:%d\n",
203 blob_choices->length(), word_length);
204 blob_choices_it.set_to_list(blob_choices); // list of lists
205 while (blob_choices->length() - initial_blob_choice_len < word_length) {
206 blob_choices_it.add_to_end(new BLOB_CHOICE_LIST()); // add a fake one
207 tprintf("recog_word: Added dummy choice list\n");
208 }
209 while (blob_choices->length() - initial_blob_choice_len > word_length) {
210 blob_choices_it.move_to_last(); // should never happen
211 delete blob_choices_it.extract();
212 tprintf("recog_word: Deleted choice list\n");
213 }
214 }
215 if (best_choice->length() < word_length) {
216 while (best_choice->length() < word_length) {
217 best_choice->append_unichar_id(space_id, 1, 0.0,
218 best_choice->certainty());
219 }
220 best_choice->populate_unichars(unicharset);
221 }
222
223 return best_choice;
224 }
225 }
226
227
228 /**********************************************************************
229 * split_and_recog_word
230 *
231 * Split the word in two at its widest gap between adjacent blobs, recognize
232 * each part with recog_word_recursive and join the two sets of results.
233 **********************************************************************/
234
235 WERD_CHOICE *
split_and_recog_word(WERD * word,DENORM * denorm,POLY_MATCHER matcher,POLY_TESTER tester,POLY_TESTER trainer,BOOL8 testing,WERD_CHOICE * & raw_choice,BLOB_CHOICE_LIST_CLIST * blob_choices,WERD * & outword)236 Tesseract::split_and_recog_word( //recog one owrd
237 WERD *word, //word to do
238 DENORM *denorm, //de-normaliser
239 POLY_MATCHER matcher, //matcher function
240 POLY_TESTER tester, //tester function
241 POLY_TESTER trainer, //trainer function
242 BOOL8 testing, //true if answer driven
243 //raw result
244 WERD_CHOICE *&raw_choice,
245 //list of blob lists
246 BLOB_CHOICE_LIST_CLIST *blob_choices,
247 WERD *&outword //bln word output
248 ) {
249 // inT32 outword1_len;
250 // inT32 outword2_len;
251 WERD *first_word; //poly copy of word
252 WERD *second_word; //fabricated word
253 WERD *outword2; //2nd output word
254 PBLOB *blob;
255 WERD_CHOICE *result; //return value
256 WERD_CHOICE *result2; //output of 2nd word
257 WERD_CHOICE *raw_choice2; //raw version of 2nd
258 float gap; //blob gap
259 float bestgap; //biggest gap
260 PBLOB_LIST new_blobs; //list of gathered blobs
261 PBLOB_IT blob_it;
262 //iterator
263 PBLOB_IT new_blob_it = &new_blobs;
264
265 first_word = word->poly_copy (denorm->row ()->x_height ());
266 blob_it.set_to_list (first_word->blob_list ());
267 bestgap = -MAX_INT32;
268 while (!blob_it.at_last ()) {
269 blob = blob_it.data ();
270 //gap to next
271 gap = blob_it.data_relative(1)->bounding_box().left() -
272 blob->bounding_box().right();
273 blob_it.forward ();
274 if (gap > bestgap) {
275 bestgap = gap; //find biggest
276 new_blob_it = blob_it; //save position
277 }
278 }
279 //take 2nd half
280 new_blobs.assign_to_sublist (&new_blob_it, &blob_it);
281 //make it a word
282 second_word = new WERD (&new_blobs, 1, NULL);
283 ASSERT_HOST (word->blob_list ()->length () ==
284 first_word->blob_list ()->length () +
285 second_word->blob_list ()->length ());
286
287 result = recog_word_recursive (first_word, denorm, matcher,
288 tester, trainer, testing, raw_choice,
289 blob_choices, outword);
290 delete first_word; //done that one
291 result2 = recog_word_recursive (second_word, denorm, matcher,
292 tester, trainer, testing, raw_choice2,
293 blob_choices, outword2);
294 delete second_word; //done that too
295 *result += *result2; //combine ratings
296 delete result2;
297 *raw_choice += *raw_choice2;
298 delete raw_choice2; //finished with it
299 // outword1_len= outword->blob_list()->length();
300 // outword2_len= outword2->blob_list()->length();
301 outword->join_on (outword2); //join words
302 delete outword2;
303 // if ( outword->blob_list()->length() != outword1_len + outword2_len )
304 // tprintf( "Split&Recog: part1len=%d; part2len=%d; combinedlen=%d\n",
305 // outword1_len, outword2_len, outword->blob_list()->length() );
306 // ASSERT_HOST( outword->blob_list()->length() == outword1_len + outword2_len );
307 return result;
308 }
309
310 } // namespace tesseract
311
312 /**********************************************************************
313 * call_tester
314 *
315 * Called from Tess with a blob in tess form.
316 * Convert the blob to editor form.
317 * Call the tester setup by the segmenter in tess_tester.
318 **********************************************************************/
319 #if 0 // dead code
320 void call_tester( //call a tester
321 const STRING& filename,
322 TBLOB *tessblob, //blob to test
323 BOOL8 correct_blob, //true if good
324 char *text, //source text
325 inT32 count, //chars in text
326 LIST result //output of matcher
327 ) {
328 PBLOB *blob; //converted blob
329 BLOB_CHOICE_LIST ratings; //matcher result
330
331 blob = make_ed_blob (tessblob);//convert blob
332 if (blob == NULL)
333 return;
334 //make it right type
335 convert_choice_list(result, ratings);
336 if (tess_tester != NULL)
337 (*tess_tester) (filename, blob, tess_denorm, correct_blob, text, count, &ratings);
338 delete blob; //don't need that now
339 }
340 #endif
341
342 /**********************************************************************
343 * call_train_tester
344 *
345 * Called from Tess with a blob in tess form.
346 * Convert the blob to editor form.
347 * Call the trainer setup by the segmenter in tess_trainer.
348 **********************************************************************/
349 #if 0 // dead code
350 void call_train_tester( //call a tester
351 const STRING& filename,
352 TBLOB *tessblob, //blob to test
353 BOOL8 correct_blob, //true if good
354 char *text, //source text
355 inT32 count, //chars in text
356 LIST result //output of matcher
357 ) {
358 PBLOB *blob; //converted blob
359 BLOB_CHOICE_LIST ratings; //matcher result
360
361 blob = make_ed_blob (tessblob);//convert blob
362 if (blob == NULL)
363 return;
364 //make it right type
365 convert_choice_list(result, ratings);
366 if (tess_trainer != NULL)
367 (*tess_trainer) (filename, blob, tess_denorm, correct_blob, text, count, &ratings);
368 delete blob; //don't need that now
369 }
370 #endif
371