1 /******************************************************************
2 * File: control.cpp (Formerly control.c)
3 * Description: Module-independent matcher controller.
4 * Author: Ray Smith
5 * Created: Thu Apr 23 11:09:58 BST 1992
6 * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
7 *
8 * (C) Copyright 1992, Hewlett-Packard Ltd.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 **********************************************************************/
20
21 #include "mfcpch.h"
22 #include "mainblk.h"
23 #include <string.h>
24 #include <math.h>
25 #ifdef __UNIX__
26 #include <assert.h>
27 #include <unistd.h>
28 #include <errno.h>
29 #endif
30 #include <ctype.h>
31 #include "ocrclass.h"
32 #include "werdit.h"
33 #include "drawfx.h"
34 #include "tfacep.h"
35 #include "tessbox.h"
36 #include "tessvars.h"
37 //#include "fxtop.h"
38 #include "pgedit.h"
39 #include "reject.h"
40 #include "adaptions.h"
41 #include "charcut.h"
42 #include "fixxht.h"
43 #include "fixspace.h"
44 #include "genblob.h"
45 #include "docqual.h"
46 #include "control.h"
47 #include "secname.h"
48 #include "output.h"
49 #include "callcpp.h"
50 #include "notdll.h"
51 #include "tordvars.h"
52 #include "adaptmatch.h"
53 #include "globals.h"
54 #include "tesseractclass.h"
55
56 #define MIN_FONT_ROW_COUNT 8
57 #define MAX_XHEIGHT_DIFF 3
58
59 #define EXTERN
60 //extern "C" {
61 //EXTERN BOOL_VAR(tessedit_small_match,FALSE,"Use small matrix matcher");
62
63 //extern FILE* matcher_fp;
64 //extern FILE* correct_fp;
65 //};
66 BOOL_VAR (tessedit_small_match, FALSE, "Use small matrix matcher");
67 EXTERN BOOL_VAR (tessedit_print_text, FALSE, "Write text to stdout");
68 EXTERN BOOL_VAR (tessedit_draw_words, FALSE, "Draw source words");
69 EXTERN BOOL_VAR (tessedit_draw_outwords, FALSE, "Draw output words");
70 EXTERN BOOL_VAR (tessedit_training_wiseowl, FALSE, "Call WO to learn blobs");
71 EXTERN BOOL_VAR (tessedit_training_tess, FALSE, "Call Tess to learn blobs");
72 EXTERN BOOL_VAR (tessedit_matcher_is_wiseowl, FALSE, "Call WO to classify");
73 EXTERN BOOL_VAR (tessedit_dump_choices, FALSE, "Dump char choices");
74 EXTERN BOOL_VAR (tessedit_fix_fuzzy_spaces, TRUE,
75 "Try to improve fuzzy spaces");
76 EXTERN BOOL_VAR (tessedit_unrej_any_wd, FALSE,
77 "Dont bother with word plausibility");
78 EXTERN BOOL_VAR (tessedit_fix_hyphens, TRUE, "Crunch double hyphens?");
79
80 EXTERN BOOL_VAR (tessedit_reject_fullstops, FALSE, "Reject all fullstops");
81 EXTERN BOOL_VAR (tessedit_reject_suspect_fullstops, FALSE,
82 "Reject suspect fullstops");
83 EXTERN BOOL_VAR (tessedit_redo_xheight, TRUE, "Check/Correct x-height");
84 EXTERN BOOL_VAR (tessedit_cluster_adaption_on, TRUE,
85 "Do our own adaption - ems only");
86 EXTERN BOOL_VAR (tessedit_enable_doc_dict, TRUE,
87 "Add words to the document dictionary");
88 EXTERN BOOL_VAR (word_occ_first, FALSE, "Do word occ before re-est xht");
89 EXTERN BOOL_VAR (tessedit_debug_fonts, FALSE, "Output font info per char");
90 EXTERN BOOL_VAR (tessedit_xht_fiddles_on_done_wds, TRUE,
91 "Apply xht fix up even if done");
92 EXTERN BOOL_VAR (tessedit_xht_fiddles_on_no_rej_wds, TRUE,
93 "Apply xht fix up even in no rejects");
94 EXTERN INT_VAR (x_ht_check_word_occ, 2, "Check Char Block occupancy");
95 EXTERN INT_VAR (x_ht_stringency, 1, "How many confirmed a/n to accept?");
96 EXTERN BOOL_VAR (x_ht_quality_check, TRUE, "Dont allow worse quality");
97 EXTERN BOOL_VAR (tessedit_debug_block_rejection, FALSE,
98 "Block and Row stats");
99 EXTERN INT_VAR (debug_x_ht_level, 0, "Reestimate debug");
100 EXTERN BOOL_VAR (rej_use_xht, TRUE, "Individual rejection control");
101 EXTERN BOOL_VAR (debug_acceptable_wds, FALSE, "Dump word pass/fail chk");
102
103 EXTERN STRING_VAR (chs_leading_punct, "('`\"", "Leading punctuation");
104 EXTERN
105 STRING_VAR (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
106 EXTERN STRING_VAR (chs_trailing_punct2, ")'`\"",
107 "2nd Trailing punctuation");
108
109 EXTERN double_VAR (quality_rej_pc, 0.08,
110 "good_quality_doc lte rejection limit");
111 EXTERN double_VAR (quality_blob_pc, 0.0,
112 "good_quality_doc gte good blobs limit");
113 EXTERN double_VAR (quality_outline_pc, 1.0,
114 "good_quality_doc lte outline error limit");
115 EXTERN double_VAR (quality_char_pc, 0.95,
116 "good_quality_doc gte good char limit");
117 EXTERN INT_VAR (quality_min_initial_alphas_reqd, 2,
118 "alphas in a good word");
119
120 EXTERN BOOL_VAR (tessedit_tess_adapt_to_rejmap, FALSE,
121 "Use reject map to control Tesseract adaption");
122 EXTERN INT_VAR (tessedit_tess_adaption_mode, 0x27,
123 "Adaptation decision algorithm for tess");
124 EXTERN INT_VAR (tessedit_em_adaption_mode, 0,
125 "Adaptation decision algorithm for ems matrix matcher");
126 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass1, FALSE,
127 "Adapt using clusterer after pass 1");
128 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass2, FALSE,
129 "Adapt using clusterer after pass 1");
130 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass3, FALSE,
131 "Adapt using clusterer after pass 1");
132 EXTERN BOOL_VAR (tessedit_cluster_adapt_before_pass1, FALSE,
133 "Adapt using clusterer before Tess adaping during pass 1");
134 EXTERN INT_VAR (tessedit_cluster_adaption_mode, 0,
135 "Adaptation decision algorithm for matrix matcher");
136 EXTERN BOOL_VAR (tessedit_adaption_debug, FALSE,
137 "Generate and print debug information for adaption");
138 EXTERN BOOL_VAR (tessedit_minimal_rej_pass1, FALSE,
139 "Do minimal rejection on pass 1 output");
140 EXTERN BOOL_VAR (tessedit_test_adaption, FALSE,
141 "Test adaption criteria");
142 EXTERN BOOL_VAR (tessedit_global_adaption, FALSE,
143 "Adapt to all docs over time");
144 EXTERN BOOL_VAR (tessedit_matcher_log, FALSE, "Log matcher activity");
145 EXTERN INT_VAR (tessedit_test_adaption_mode, 3,
146 "Adaptation decision algorithm for tess");
147 EXTERN BOOL_VAR(save_best_choices, FALSE,
148 "Save the results of the recognition step"
149 " (blob_choices) within the corresponding WERD_CHOICE");
150
151 EXTERN BOOL_VAR (test_pt, FALSE, "Test for point");
152 EXTERN double_VAR (test_pt_x, 99999.99, "xcoord");
153 EXTERN double_VAR (test_pt_y, 99999.99, "ycoord");
154
155 extern int display_ratings;
156 extern int number_debug;
157 FILE *choice_file = NULL; //Choice file ptr
158
CLISTIZE(PBLOB)159 CLISTIZEH (PBLOB) CLISTIZE (PBLOB)
160 /* DEBUGGING */
161 inT16 blob_count(WERD *w) {
162 return w->blob_list ()->length ();
163 }
164
165
166 /**********************************************************************
167 * recog_pseudo_word
168 *
169 * Make a word from the selected blobs and run Tess on them.
170 **********************************************************************/
171 namespace tesseract {
recog_pseudo_word(BLOCK_LIST * block_list,TBOX & selection_box)172 void Tesseract::recog_pseudo_word( //recognize blobs
173 BLOCK_LIST *block_list, //blocks to check
174 TBOX &selection_box) {
175 WERD *word;
176 ROW *pseudo_row; //row of word
177 BLOCK *pseudo_block; //block of word
178
179 word = make_pseudo_word (block_list, selection_box,
180 pseudo_block, pseudo_row);
181 if (word != NULL) {
182 recog_interactive(pseudo_block, pseudo_row, word);
183 delete word;
184 }
185 }
186
187
188 /**********************************************************************
189 * recog_interactive
190 *
191 * Recognize a single word in interactive mode.
192 **********************************************************************/
recog_interactive(BLOCK * block,ROW * row,WERD * word)193 BOOL8 Tesseract::recog_interactive( //recognize blobs
194 BLOCK *block, //block
195 ROW *row, //row of word
196 WERD *word //word to recognize
197 ) {
198 WERD_RES word_res(word);
199 inT16 char_qual;
200 inT16 good_char_qual;
201
202 classify_word_pass2(&word_res, block, row);
203 #ifndef SECURE_NAMES
204 if (tessedit_debug_quality_metrics) {
205 word_char_quality(&word_res, row, &char_qual, &good_char_qual);
206 tprintf
207 ("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
208 word_res.reject_map.length (), word_blob_quality (&word_res, row),
209 word_outline_errs (&word_res), char_qual, good_char_qual);
210 }
211 #endif
212 return TRUE;
213 }
214
215
216 /**********************************************************************
217 * recog_all_words()
218 *
219 * Walk the current block list applying the specified word processor function
220 * to all words
221 **********************************************************************/
222
recog_all_words(PAGE_RES * page_res,volatile ETEXT_DESC * monitor,TBOX * target_word_box,inT16 dopasses)223 void Tesseract::recog_all_words( //process words
224 PAGE_RES *page_res, //page structure
225 //progress monitor
226 volatile ETEXT_DESC *monitor,
227 // specifies just to extract a rectangle
228 TBOX *target_word_box,
229 //0 - all, 1 just pass 1, 2 passes 2 and higher
230 inT16 dopasses
231 ) {
232 //reset page iterator
233 static PAGE_RES_IT page_res_it;
234 inT16 chars_in_word;
235 inT16 rejects_in_word;
236 static CHAR_SAMPLES_LIST em_clusters;
237 static CHAR_SAMPLE_LIST ems_waiting;
238 static CHAR_SAMPLES_LIST char_clusters;
239 static CHAR_SAMPLE_LIST chars_waiting;
240 inT16 blob_quality = 0;
241 inT16 outline_errs = 0;
242 static inT16 doc_blob_quality = 0;
243 static inT16 doc_outline_errs = 0;
244 static inT16 doc_char_quality = 0;
245 inT16 all_char_quality;
246 inT16 accepted_all_char_quality;
247 static inT16 good_char_count = 0;
248 static inT16 doc_good_char_quality = 0;
249 int i;
250
251
252 inT32 tess_adapt_mode = 0;
253 static inT32 word_count; //count of words in doc
254 inT32 word_index; //current word
255 static int dict_words;
256
257 if (tessedit_minimal_rej_pass1) {
258 tessedit_test_adaption.set_value (TRUE);
259 tessedit_minimal_rejection.set_value (TRUE);
260 }
261
262 if (tessedit_cluster_adapt_before_pass1) {
263 tess_adapt_mode = tessedit_tess_adaption_mode;
264 tessedit_tess_adaption_mode.set_value (0);
265 tessedit_tess_adapt_to_rejmap.set_value (TRUE);
266 }
267
268
269 if (dopasses==0 || dopasses==1)
270 {
271 page_res_it.page_res=page_res;
272 page_res_it.restart_page();
273
274 /* Pass 1 */
275 word_count = 0;
276 if (monitor != NULL) {
277 monitor->ocr_alive = TRUE;
278 while (page_res_it.word () != NULL) {
279 word_count++;
280 page_res_it.forward ();
281 }
282 page_res_it.restart_page ();
283 }
284 else
285 word_count = 1;
286
287 word_index = 0;
288
289 em_clusters.clear();
290 ems_waiting.clear();
291 char_clusters.clear();
292 chars_waiting.clear();
293 dict_words = 0;
294 doc_blob_quality = 0;
295 doc_outline_errs = 0;
296 doc_char_quality = 0;
297 good_char_count = 0;
298 doc_good_char_quality = 0;
299
300 while (page_res_it.word () != NULL) {
301 set_global_loc_code(LOC_PASS1);
302 word_index++;
303 if (monitor != NULL) {
304 monitor->ocr_alive = TRUE;
305 monitor->progress = 30 + 50 * word_index / word_count;
306 if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
307 (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
308 dict_words)))
309 return;
310 }
311 classify_word_pass1(page_res_it.word(), page_res_it.row()->row,
312 page_res_it.block()->block, FALSE, NULL, NULL);
313 if (tessedit_dump_choices) {
314 #ifndef GRAPHICS_DISABLED
315 word_dumper(NULL, page_res_it.row()->row, page_res_it.word()->word);
316 #endif
317 tprintf("Pass1: %s [%s]\n",
318 page_res_it.word()->best_choice->unichar_string().string(),
319 page_res_it.word()->best_choice->
320 debug_string(unicharset).string());
321 }
322
323 if (tessedit_test_adaption && !tessedit_minimal_rejection) {
324 if (!word_adaptable (page_res_it.word (),
325 tessedit_test_adaption_mode)) {
326 page_res_it.word ()->reject_map.rej_word_tess_failure ();
327 //FAKE PERM REJ
328 } else {
329 // Override rejection mechanisms for this word.
330 UNICHAR_ID space = unicharset.unichar_to_id(" ");
331 for (i = 0; i < page_res_it.word()->best_choice->length(); i++) {
332 if ((page_res_it.word()->best_choice->unichar_id(i) != space) &&
333 page_res_it.word()->reject_map[i].rejected())
334 page_res_it.word ()->reject_map[i].setrej_minimal_rej_accept();
335 }
336 }
337 }
338
339 if ((tessedit_cluster_adapt_after_pass1
340 || tessedit_cluster_adapt_after_pass3
341 || tessedit_cluster_adapt_before_pass1)
342 && tessedit_cluster_adaption_mode != 0) {
343 collect_characters_for_adaption (page_res_it.word (),
344 &char_clusters, &chars_waiting);
345 }
346 // Count dict words.
347 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
348 ++dict_words;
349 page_res_it.forward ();
350 }
351
352 if (tessedit_cluster_adapt_before_pass1)
353 tessedit_tess_adaption_mode.set_value (tess_adapt_mode);
354
355 page_res_it.restart_page ();
356 while ((tessedit_cluster_adapt_after_pass1
357 || tessedit_cluster_adapt_before_pass1)
358 && page_res_it.word () != NULL) {
359 if (monitor != NULL)
360 monitor->ocr_alive = TRUE;
361 if (tessedit_cluster_adapt_after_pass1)
362 adapt_to_good_samples (page_res_it.word (),
363 &char_clusters, &chars_waiting);
364 else
365 classify_word_pass1 (page_res_it.word (),
366 page_res_it.row ()->row,
367 page_res_it.block()->block,
368 TRUE, &char_clusters, &chars_waiting);
369
370 page_res_it.forward ();
371 }
372
373 //
374
375
376 }
377
378 if (dopasses==1) return;
379
380 /* Pass 2 */
381 page_res_it.restart_page ();
382 word_index = 0;
383 while (!tessedit_test_adaption && page_res_it.word () != NULL) {
384 set_global_loc_code(LOC_PASS2);
385 word_index++;
386 if (monitor != NULL) {
387 monitor->ocr_alive = TRUE;
388 monitor->progress = 80 + 10 * word_index / word_count;
389 if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
390 (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
391 dict_words)))
392 return;
393 }
394 //changed by jetsoft
395 //specific to its needs to extract one word when need
396
397 if (target_word_box)
398 {
399
400 TBOX current_word_box=page_res_it.word ()->word->bounding_box();
401 FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
402 if (!target_word_box->contains(center_pt))
403 {
404 page_res_it.forward ();
405 continue;
406 }
407
408 }
409 //end jetsoft
410
411 classify_word_pass2(page_res_it.word(), page_res_it.block()->block,
412 page_res_it.row()->row);
413 if (tessedit_dump_choices) {
414 #ifndef GRAPHICS_DISABLED
415 word_dumper(NULL, page_res_it.row()->row, page_res_it.word()->word);
416 #endif
417 tprintf("Pass2: %s [%s]\n",
418 page_res_it.word()->best_choice->unichar_string().string(),
419 page_res_it.word()->best_choice->
420 debug_string(unicharset).string());
421 }
422
423 if (tessedit_em_adaption_mode > 0)
424 collect_ems_for_adaption (page_res_it.word (),
425 &em_clusters, &ems_waiting);
426
427 if (tessedit_cluster_adapt_after_pass2
428 && tessedit_cluster_adaption_mode != 0)
429 collect_characters_for_adaption (page_res_it.word (),
430 &char_clusters, &chars_waiting);
431 page_res_it.forward ();
432 }
433
434 /* Another pass */
435 set_global_loc_code(LOC_FUZZY_SPACE);
436
437 if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
438 && !tessedit_word_for_word)
439 fix_fuzzy_spaces(monitor, word_count, page_res);
440
441 if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0)
442 // Initially ems only
443 print_em_stats(&em_clusters, &ems_waiting);
444
445 /* Pass 3 - used for checking confusion sets */
446 page_res_it.restart_page ();
447 word_index = 0;
448 while (!tessedit_test_adaption && page_res_it.word () != NULL) {
449 set_global_loc_code(LOC_MM_ADAPT);
450 word_index++;
451 if (monitor != NULL) {
452 monitor->ocr_alive = TRUE;
453 monitor->progress = 95 + 5 * word_index / word_count;
454 }
455 check_debug_pt (page_res_it.word (), 70);
456 /* Use good matches to sort out confusions */
457
458
459 //changed by jetsoft
460 //specific to its needs to extract one word when need
461
462 if (target_word_box)
463 {
464
465 TBOX current_word_box=page_res_it.word ()->word->bounding_box();
466 FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
467 if (!target_word_box->contains(center_pt))
468 {
469 page_res_it.forward ();
470 continue;
471 }
472
473 }
474 // end jetsoft
475
476 if (tessedit_em_adaption_mode != 0)
477 adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting);
478
479 if (tessedit_cluster_adapt_after_pass2
480 && tessedit_cluster_adaption_mode != 0)
481 adapt_to_good_samples (page_res_it.word (),
482 &char_clusters, &chars_waiting);
483
484 UNICHAR_ID dot = unicharset.unichar_to_id(".");
485 if (tessedit_reject_fullstops &&
486 page_res_it.word()->best_choice->contains_unichar_id(dot)) {
487 reject_all_fullstops (page_res_it.word ());
488 } else if (tessedit_reject_suspect_fullstops &&
489 page_res_it.word()->best_choice->contains_unichar_id(dot)) {
490 reject_suspect_fullstops (page_res_it.word ());
491 }
492
493 page_res_it.rej_stat_word ();
494 chars_in_word = page_res_it.word ()->reject_map.length ();
495 rejects_in_word = page_res_it.word ()->reject_map.reject_count ();
496
497 blob_quality = word_blob_quality (page_res_it.word (),
498 page_res_it.row ()->row);
499 doc_blob_quality += blob_quality;
500 outline_errs = word_outline_errs (page_res_it.word ());
501 doc_outline_errs += outline_errs;
502 word_char_quality (page_res_it.word (),
503 page_res_it.row ()->row,
504 &all_char_quality, &accepted_all_char_quality);
505 doc_char_quality += all_char_quality;
506 uinT8 permuter_type = page_res_it.word ()->best_choice->permuter ();
507 if ((permuter_type == SYSTEM_DAWG_PERM) ||
508 (permuter_type == FREQ_DAWG_PERM) ||
509 (permuter_type == USER_DAWG_PERM)) {
510 good_char_count += chars_in_word - rejects_in_word;
511 doc_good_char_quality += accepted_all_char_quality;
512 }
513 check_debug_pt (page_res_it.word (), 80);
514 if (tessedit_reject_bad_qual_wds &&
515 (blob_quality == 0) && (outline_errs >= chars_in_word))
516 page_res_it.word ()->reject_map.rej_word_bad_quality ();
517 check_debug_pt (page_res_it.word (), 90);
518 page_res_it.forward ();
519 }
520
521 page_res_it.restart_page ();
522 while (!tessedit_test_adaption
523 && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) {
524 if (monitor != NULL)
525 monitor->ocr_alive = TRUE;
526
527 //changed by jetsoft
528 //specific to its needs to extract one word when need
529
530 if (target_word_box)
531 {
532
533 TBOX current_word_box=page_res_it.word ()->word->bounding_box();
534 FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
535 if (!target_word_box->contains(center_pt))
536 {
537 page_res_it.forward ();
538 continue;
539 }
540
541 }
542
543 //end jetsoft
544 if (tessedit_cluster_adaption_mode != 0)
545 adapt_to_good_samples (page_res_it.word (),
546 &char_clusters, &chars_waiting);
547 page_res_it.forward ();
548 }
549
550 #ifndef SECURE_NAMES
551 if (tessedit_debug_quality_metrics) {
552 tprintf
553 ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
554 page_res->char_count, page_res->rej_count,
555 page_res->rej_count / (float) page_res->char_count, doc_blob_quality,
556 doc_blob_quality / (float) page_res->char_count, doc_outline_errs,
557 doc_outline_errs / (float) page_res->char_count, doc_char_quality,
558 doc_char_quality / (float) page_res->char_count,
559 doc_good_char_quality,
560 good_char_count >
561 0 ? doc_good_char_quality / (float) good_char_count : 0.0);
562 }
563 #endif
564 BOOL8 good_quality_doc =
565 (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc)
566 &&
567 (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) &&
568 (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) &&
569 (doc_char_quality / (float) page_res->char_count >= quality_char_pc);
570
571 /* Do whole document or whole block rejection pass*/
572
573 if (!tessedit_test_adaption) {
574 set_global_loc_code(LOC_DOC_BLK_REJ);
575 quality_based_rejection(page_res_it, good_quality_doc);
576 }
577 font_recognition_pass(page_res_it);
578
579 /* Write results pass */
580 set_global_loc_code(LOC_WRITE_RESULTS);
581 // This is now redundant, but retained commented so show how to obtain
582 // bounding boxes and style information.
583
584 // changed by jetsoft
585 // needed for dll to output memory structure
586 if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
587 output_pass(page_res_it, ocr_char_space() > 0, target_word_box);
588 // end jetsoft
589 }
590
591
592 /**********************************************************************
593 * classify_word_pass1
594 *
595 * Baseline normalize the word and pass it to Tess.
596 **********************************************************************/
597
classify_word_pass1(WERD_RES * word,ROW * row,BLOCK * block,BOOL8 cluster_adapt,CHAR_SAMPLES_LIST * char_clusters,CHAR_SAMPLE_LIST * chars_waiting)598 void Tesseract::classify_word_pass1( //recog one word
599 WERD_RES *word, //word to do
600 ROW *row,
601 BLOCK* block,
602 BOOL8 cluster_adapt,
603 CHAR_SAMPLES_LIST *char_clusters,
604 CHAR_SAMPLE_LIST *chars_waiting) {
605 WERD *bln_word; //baseline norm copy
606 //detailed results
607 BLOB_CHOICE_LIST_CLIST local_blob_choices;
608 BLOB_CHOICE_LIST_CLIST *blob_choices;
609 BOOL8 adapt_ok;
610 const char *rejmap;
611 inT16 index;
612 STRING mapstr = "";
613 char *match_string;
614 char word_string[1024];
615
616 if (save_best_choices)
617 blob_choices = new BLOB_CHOICE_LIST_CLIST();
618 else
619 blob_choices = &local_blob_choices;
620
621 if (matcher_fp != NULL) {
622 fgets (word_string, 1023, correct_fp);
623 if ((match_string = strchr (word_string, '\r')) != NULL)
624 *match_string = '\0';
625 if ((match_string = strchr (word_string, '\n')) != NULL)
626 *match_string = '\0';
627 if (word_string[0] != '\0') {
628 word->word->set_text (word_string);
629 word_answer = (char *) word->word->text ();
630 }
631 else
632 word_answer = NULL;
633 }
634
635 check_debug_pt (word, 0);
636 bln_word = make_bln_copy(word->word, row, block, word->x_height,
637 &word->denorm);
638
639 word->best_choice = tess_segment_pass1 (bln_word, &word->denorm,
640 &Tesseract::tess_default_matcher,
641 word->raw_choice, blob_choices,
642 word->outword);
643 /*
644 Test for TESS screw up on word. Recog_word has already ensured that the
645 choice list, outword blob lists and best_choice string are the same
646 length. A TESS screw up is indicated by a blank filled or 0 length string.
647 */
648 if ((word->best_choice->length() == 0) ||
649 (strspn (word->best_choice->unichar_string().string(), " ") ==
650 word->best_choice->length())) {
651 word->done = FALSE; // Try again on pass2 - adaption may help.
652 word->tess_failed = TRUE;
653 word->reject_map.initialise(word->best_choice->length());
654 word->reject_map.rej_word_tess_failure ();
655 } else {
656 word->tess_failed = FALSE;
657 if ((word->best_choice->length() !=
658 word->outword->blob_list()->length()) ||
659 (word->best_choice->length() != blob_choices->length())) {
660 tprintf
661 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
662 word->best_choice->debug_string(unicharset).string(),
663 word->best_choice->length(),
664 word->outword->blob_list()->length(),
665 blob_choices->length());
666 }
667 ASSERT_HOST (word->best_choice->length() ==
668 word->outword->blob_list()->length());
669 ASSERT_HOST (word->best_choice->length() == blob_choices->length ());
670
671 /*
672 The adaption step used to be here. It has been moved to after
673 make_reject_map so that we know whether the word will be accepted in the
674 first pass or not. This move will PREVENT adaption to words containing
675 double quotes because the word will not be identical to what tess thinks
676 its best choice is. (See CurrentBestChoiceIs in
677 danj/microfeatures/stopper.c which is used by AdaptableWord in
678 danj/microfeatures/adaptmatch.c)
679 */
680
681 if (word->word->flag (W_REP_CHAR)) {
682 fix_rep_char(word);
683 } else {
684 // TODO(daria) delete these hacks when replaced by more generic code.
685 // Convert '' (double single) to " (single double).
686 fix_quotes(word->best_choice, word->outword, blob_choices);
687 if (tessedit_fix_hyphens) // turn -- to -
688 fix_hyphens (word->best_choice, word->outword, blob_choices);
689 record_certainty (word->best_choice->certainty (), 1);
690 // accounting.
691
692 word->tess_accepted = tess_acceptable_word (word->best_choice,
693 word->raw_choice);
694
695 word->tess_would_adapt = tess_adaptable_word (word->outword,
696 word->best_choice,
697 word->raw_choice);
698 // Also sets word->done flag
699 make_reject_map (word, blob_choices, row, 1);
700
701 adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode);
702
703 if (cluster_adapt)
704 adapt_to_good_samples(word, char_clusters, chars_waiting);
705
706 if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
707 if (!tessedit_tess_adapt_to_rejmap) {
708 rejmap = NULL;
709 } else {
710 ASSERT_HOST(word->reject_map.length() ==
711 word->best_choice->length());
712
713 for (index = 0; index < word->reject_map.length (); index++) {
714 if (adapt_ok || word->reject_map[index].accepted ())
715 mapstr += '1';
716 else
717 mapstr += '0';
718 }
719 rejmap = mapstr.string ();
720 }
721
722 // adapt to it.
723 tess_adapter (word->outword, &word->denorm,
724 *word->best_choice,
725 *word->raw_choice, rejmap);
726 }
727
728 if (tessedit_enable_doc_dict)
729 tess_add_doc_word (word->best_choice);
730 set_word_fonts(word, blob_choices);
731 }
732 }
733 #if 0
734 if (tessedit_print_text) {
735 write_cooked_text (bln_word, word->best_choice->string (),
736 word->done, FALSE, stdout);
737 }
738 #endif
739 delete bln_word;
740
741 // Save best choices in the WERD_CHOICE if needed
742 if (blob_choices != &local_blob_choices) {
743 word->best_choice->set_blob_choices(blob_choices);
744 } else {
745 blob_choices->deep_clear();
746 }
747 }
748
749 /**********************************************************************
750 * classify_word_pass2
751 *
752 * Control what to do with the word in pass 2
753 **********************************************************************/
754
classify_word_pass2(WERD_RES * word,BLOCK * block,ROW * row)755 void Tesseract::classify_word_pass2(WERD_RES *word, BLOCK* block, ROW *row) {
756 BOOL8 done_this_pass = FALSE;
757 WERD_RES new_x_ht_word (word->word);
758 float new_x_ht = 0.0;
759 inT16 old_xht_reject_count;
760 inT16 new_xht_reject_count;
761 inT16 old_xht_accept_count;
762 inT16 new_xht_accept_count;
763 BOOL8 accept_new_x_ht = FALSE;
764 inT16 old_chs_in_wd;
765 inT16 new_chs_in_wd;
766 inT16 old_word_quality;
767 inT16 new_word_quality;
768 inT16 dummy;
769
770 set_global_subloc_code(SUBLOC_NORM);
771 check_debug_pt (word, 30);
772 if (!word->done ||
773 tessedit_training_tess ||
774 tessedit_training_wiseowl) {
775 word->caps_height = 0.0;
776 if (word->x_height == 0.0f)
777 word->x_height = row->x_height();
778 if (word->outword != NULL) {
779 delete word->outword; //get rid of junk
780 delete word->best_choice;
781 delete word->raw_choice;
782 }
783 match_word_pass2 (word, row, block, word->x_height);
784 done_this_pass = TRUE;
785 check_debug_pt (word, 40);
786 }
787
788 if (!word->tess_failed && !word->word->flag (W_REP_CHAR)) {
789 set_global_subloc_code(SUBLOC_FIX_XHT);
790 if ((tessedit_xht_fiddles_on_done_wds || !word->done) &&
791 (tessedit_xht_fiddles_on_no_rej_wds ||
792 (word->reject_map.reject_count () > 0))) {
793 if ((x_ht_check_word_occ >= 2) && word_occ_first)
794 check_block_occ(word);
795
796 if (tessedit_redo_xheight)
797 re_estimate_x_ht(word, &new_x_ht);
798
799 if (((x_ht_check_word_occ >= 2) && !word_occ_first) ||
800 ((x_ht_check_word_occ >= 1) && (new_x_ht > 0)))
801 check_block_occ(word);
802 }
803 if (new_x_ht > 0) {
804 old_chs_in_wd = word->reject_map.length ();
805
806 /* Re-estimated x_ht error suggests a rematch is worthwhile. */
807 new_x_ht_word.x_height = new_x_ht;
808 new_x_ht_word.caps_height = 0.0;
809 match_word_pass2(&new_x_ht_word, row, block, new_x_ht_word.x_height);
810 if (!new_x_ht_word.tess_failed) {
811 if ((x_ht_check_word_occ >= 1) && word_occ_first)
812 check_block_occ(&new_x_ht_word);
813
814 re_estimate_x_ht(&new_x_ht_word, &new_x_ht);
815
816 if ((x_ht_check_word_occ >= 1) && !word_occ_first)
817 check_block_occ(&new_x_ht_word);
818
819 old_xht_reject_count = word->reject_map.reject_count ();
820 old_xht_accept_count = old_chs_in_wd - old_xht_reject_count;
821 new_xht_reject_count = new_x_ht_word.reject_map.reject_count ();
822 new_chs_in_wd = new_x_ht_word.reject_map.length ();
823 new_xht_accept_count = new_chs_in_wd - new_xht_reject_count;
824 accept_new_x_ht =
825 ((new_xht_accept_count > old_xht_accept_count) ||
826 ((new_xht_accept_count == old_xht_accept_count) &&
827 (new_xht_accept_count > 0))) &&
828 (!new_x_ht_word.guessed_x_ht ||
829 !new_x_ht_word.guessed_caps_ht);
830
831 if (accept_new_x_ht && x_ht_quality_check) {
832 word_char_quality(word, row, &old_word_quality, &dummy);
833 word_char_quality(&new_x_ht_word, row, &new_word_quality, &dummy);
834 if (old_word_quality > new_word_quality)
835 accept_new_x_ht = FALSE;
836 }
837
838 if (accept_new_x_ht && (x_ht_stringency > 0)) {
839 accept_new_x_ht =
840 (count_alphanums (&new_x_ht_word) > x_ht_stringency);
841 if (!accept_new_x_ht && rej_use_xht) {
842 if (debug_x_ht_level >= 1)
843 tprintf
844 ("Failed stringency test so reject original word\n");
845 word->reject_map.rej_word_xht_fixup ();
846 }
847 }
848
849 #ifndef SECURE_NAMES
850 if (debug_x_ht_level >= 1) {
851 tprintf ("New XHT Match:: %s ",
852 word->best_choice->debug_string(unicharset).string());
853 word->reject_map.print (debug_fp);
854 tprintf (" -> %s ",
855 new_x_ht_word.best_choice->debug_string(
856 unicharset).string());
857 new_x_ht_word.reject_map.print (debug_fp);
858 tprintf (" %s->%s %s %s\n",
859 word->guessed_x_ht ? "GUESS" : "CERT",
860 new_x_ht_word.guessed_x_ht ? "GUESS" : "CERT",
861 new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
862 accept_new_x_ht ? "ACCEPTED" : "");
863 }
864 #endif
865 }
866 if (accept_new_x_ht) {
867 /*
868 The new x_ht is deemed superior so put the final results in the real
869 word and destroy the old results
870 */
871 delete word->outword; //get rid of junk
872 word->outword = new_x_ht_word.outword;
873 word->denorm = new_x_ht_word.denorm;
874 delete word->best_choice;
875 word->best_choice = new_x_ht_word.best_choice;
876 delete word->raw_choice;
877 word->raw_choice = new_x_ht_word.raw_choice;
878 word->reject_map = new_x_ht_word.reject_map;
879 word->done = new_x_ht_word.done;
880 done_this_pass = TRUE;
881 }
882 else {
883 /*
884 The new x_ht is no better, so destroy the copy word and put any
885 uncertain x or cap ht estimate back to default. (I.e. dont blame
886 me if its bad!) Conditionally, use any ammended block occ chars.
887 */
888 //get rid of junk
889 delete new_x_ht_word.outword;
890 delete new_x_ht_word.best_choice;
891 delete new_x_ht_word.raw_choice;
892 }
893 //to keep new destructor happy
894 new_x_ht_word.outword = NULL;
895 //to keep new destructor happy
896 new_x_ht_word.best_choice = NULL;
897 //to keep new destructor happy
898 new_x_ht_word.raw_choice = NULL;
899
900 if (rej_mostly_reject_mode == 2) {
901 reject_mostly_rejects(word);
902 tprintf("Rejecting mostly rejects on %s ",
903 word->best_choice->debug_string(unicharset).string());
904 }
905 }
906
907 set_global_subloc_code(SUBLOC_NORM);
908
909 if (done_this_pass && !word->done && tessedit_save_stats) {
910 STRING word_str;
911 word->best_choice->string_and_lengths(unicharset, &word_str, NULL);
912 SaveBadWord(word_str.string(), word->best_choice->certainty());
913 }
914 record_certainty (word->best_choice->certainty(), 2);
915 //accounting
916 }
917 #ifndef GRAPHICS_DISABLED
918 if (tessedit_draw_outwords) {
919 if (fx_win == NULL)
920 create_fx_win();
921 clear_fx_win();
922 word->outword->plot (fx_win);
923 TBOX wbox = word->outword->bounding_box();
924 fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
925 wbox.right(), wbox.bottom());
926 //make_picture_current(fx_win);
927 ScrollView::Update();
928 }
929 #endif
930
931 set_global_subloc_code(SUBLOC_NORM);
932 #if 0
933 if (tessedit_print_text) {
934 write_cooked_text (word->outword, word->best_choice->string (),
935 word->done, done_this_pass, stdout);
936 }
937 #endif
938 check_debug_pt (word, 50);
939 }
940
941
942 /**********************************************************************
943 * match_word_pass2
944 *
945 * Baseline normalize the word and pass it to Tess.
946 **********************************************************************/
947
match_word_pass2(WERD_RES * word,ROW * row,BLOCK * block,float x_height)948 void Tesseract::match_word_pass2( //recog one word
949 WERD_RES *word, //word to do
950 ROW *row,
951 BLOCK* block,
952 float x_height) {
953 WERD *bln_word; //baseline norm copy
954 //detailed results
955 BLOB_CHOICE_LIST_CLIST local_blob_choices;
956 BLOB_CHOICE_LIST_CLIST *blob_choices;
957
958 if (save_best_choices)
959 blob_choices = new BLOB_CHOICE_LIST_CLIST();
960 else
961 blob_choices = &local_blob_choices;
962
963 set_global_subsubloc_code(SUBSUBLOC_OTHER);
964 if (matcher_fp != NULL) {
965 word_answer = (char *) word->word->text ();
966 if (word_answer != NULL && word_answer[0] == '\0')
967 word_answer = NULL;
968 }
969 bln_word = make_bln_copy (word->word, row, block, x_height, &word->denorm);
970 set_global_subsubloc_code(SUBSUBLOC_TESS);
971 if (tessedit_training_tess)
972 word->best_choice = correct_segment_pass2 (bln_word,
973 &word->denorm,
974 &Tesseract::tess_default_matcher,
975 tess_training_tester,
976 word->raw_choice,
977 blob_choices, word->outword);
978 else {
979 word->best_choice = tess_segment_pass2 (bln_word, &word->denorm,
980 &Tesseract::tess_default_matcher,
981 word->raw_choice, blob_choices,
982 word->outword);
983 }
984 set_global_subsubloc_code(SUBSUBLOC_OTHER);
985 /*
986 Test for TESS screw up on word. Recog_word has already ensured that the
987 choice list, outword blob lists and best_choice string are the same
988 length. A TESS screw up is indicated by a blank filled or 0 length string.
989 */
990 if ((word->best_choice->length() == 0) ||
991 (strspn (word->best_choice->unichar_string().string (), " ") ==
992 word->best_choice->length())) {
993 word->tess_failed = TRUE;
994 word->reject_map.initialise (word->best_choice->length());
995 word->reject_map.rej_word_tess_failure ();
996 // tprintf("Empty word produced\n");
997 }
998 else {
999 if ((word->best_choice->length() !=
1000 word->outword->blob_list()->length ()) ||
1001 (word->best_choice->length() != blob_choices->length())) {
1002 tprintf
1003 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
1004 word->best_choice->debug_string(unicharset).string(),
1005 word->best_choice->length(),
1006 word->outword->blob_list()->length(), blob_choices->length());
1007 }
1008 ASSERT_HOST (word->best_choice->length() ==
1009 word->outword->blob_list()->length());
1010 ASSERT_HOST (word->best_choice->length() == blob_choices->length());
1011
1012 word->tess_failed = FALSE;
1013 if (word->word->flag (W_REP_CHAR)) {
1014 fix_rep_char(word);
1015 }
1016 else {
1017 fix_quotes (word->best_choice,
1018 word->outword, blob_choices);
1019 if (tessedit_fix_hyphens)
1020 fix_hyphens (word->best_choice,
1021 word->outword, blob_choices);
1022 /* Dont trust fix_quotes! - though I think I've fixed the bug */
1023 if ((word->best_choice->length() !=
1024 word->outword->blob_list()->length()) ||
1025 (word->best_choice->length() != blob_choices->length())) {
1026 #ifndef SECURE_NAMES
1027 tprintf
1028 ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
1029 word->best_choice->debug_string(unicharset).string(),
1030 word->best_choice->length(),
1031 word->outword->blob_list()->length(), blob_choices->length());
1032 #endif
1033
1034 }
1035 ASSERT_HOST (word->best_choice->length() ==
1036 word->outword->blob_list()->length());
1037 ASSERT_HOST (word->best_choice->length() == blob_choices->length());
1038
1039 word->tess_accepted = tess_acceptable_word(word->best_choice,
1040 word->raw_choice);
1041
1042 make_reject_map (word, blob_choices, row, 2);
1043 }
1044 }
1045
1046 // Save best choices in the WERD_CHOICE if needed
1047 if (blob_choices != &local_blob_choices)
1048 word->best_choice->set_blob_choices(blob_choices);
1049 else
1050 blob_choices->deep_clear();
1051
1052 delete bln_word;
1053 assert (word->raw_choice != NULL);
1054 }
1055 } // namespace tesseract
1056
1057
/*************************************************************************
 * fix_rep_char()
 * The word is a repeated char. Find the most frequent non-space character
 * in the word. Make a reject map which rejects any char other than the voted
 * char. Set the word to done to stop it being rematched.
 *
 *************************************************************************/
1065 namespace tesseract {
fix_rep_char(WERD_RES * word_res)1066 void Tesseract::fix_rep_char(WERD_RES *word_res) {
1067 struct REP_CH {
1068 UNICHAR_ID unichar_id;
1069 int count;
1070 };
1071 const WERD_CHOICE &word = *(word_res->best_choice);
1072 REP_CH *rep_ch; // array of char counts
1073 int rep_ch_count = 0; // how many unique chs
1074 int i, j;
1075 int total = 0;
1076 int max = 0;
1077 UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1078 UNICHAR_ID space = unicharset.unichar_to_id(" ");
1079
1080 rep_ch = new REP_CH[word.length()];
1081 for (i = 0; i < word.length(); ++i) {
1082 for (j = 0; j < rep_ch_count &&
1083 rep_ch[j].unichar_id != word.unichar_id(i); ++j);
1084 if (j < rep_ch_count) {
1085 rep_ch[j].count++;
1086 } else {
1087 rep_ch[rep_ch_count].unichar_id = word.unichar_id(i);
1088 rep_ch[rep_ch_count].count = 1;
1089 rep_ch_count++;
1090 }
1091 }
1092
1093 for (j = 0; j < rep_ch_count; j++) {
1094 total += rep_ch[j].count;
1095 if ((rep_ch[j].count > max) && (rep_ch[j].unichar_id != space)) {
1096 max = rep_ch[j].count;
1097 maxch_id = rep_ch[j].unichar_id;
1098 }
1099 }
1100 // tprintf( "REPEATED CHAR %s len=%d total=%d choice=%c\n",
1101 // word_str, word_len, total, maxch );
1102 delete[] rep_ch;
1103
1104 word_res->reject_map.initialise(word.length());
1105 for (i = 0; i < word.length(); ++i) {
1106 if (word.unichar_id(i) != maxch_id)
1107 word_res->reject_map[i].setrej_bad_repetition(); // rej unrecognised blobs
1108 }
1109 word_res->done = TRUE;
1110 }
1111
1112 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
1113 // training data.
1114
// Utility function for fix_quotes.
// Returns non-zero if the character starting at signed_str, whose UTF-8
// encoding is length bytes long, is a single quote character: the 1 byte
// ' or `, or one of the 3 byte curved quotes e2 80 98 / e2 80 99.
static int is_simple_quote(const char* signed_str, int length) {
  // Compare as unsigned bytes so that values above 0x7f match the constants.
  const unsigned char *bytes = (const unsigned char *) signed_str;

  if (length == 1)
    return bytes[0] == '\'' || bytes[0] == '`';
  if (length == 3)
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  return 0;
}
1131
1132 /**********************************************************************
1133 * fix_quotes
1134 *
1135 * Change pairs of quotes to double quotes.
1136 **********************************************************************/
fix_quotes(WERD_CHOICE * choice,WERD * word,BLOB_CHOICE_LIST_CLIST * blob_choices)1137 void Tesseract::fix_quotes(WERD_CHOICE *choice, //choice to fix
1138 WERD *word, //word to do //char choices
1139 BLOB_CHOICE_LIST_CLIST *blob_choices) {
1140 if (!unicharset.contains_unichar("\"") ||
1141 !unicharset.get_enabled(unicharset.unichar_to_id("\"")))
1142 return; // Don't create it if it is disallowed.
1143
1144 PBLOB_IT blob_it = word->blob_list(); // blobs
1145 BLOB_CHOICE_LIST_C_IT blob_choices_it = blob_choices; // choices
1146 BLOB_CHOICE_IT it1; // first choices
1147 BLOB_CHOICE_IT it2; // second choices
1148
1149 int i;
1150 int modified = false;
1151 for (i = 0; i < choice->length()-1;
1152 ++i, blob_it.forward(), blob_choices_it.forward()) {
1153 const char *ch = unicharset.id_to_unichar(choice->unichar_id(i));
1154 const char *next_ch = unicharset.id_to_unichar(choice->unichar_id(i+1));
1155 if (is_simple_quote(ch, strlen(ch)) &&
1156 is_simple_quote(next_ch, strlen(next_ch))) {
1157 choice->set_unichar_id(unicharset.unichar_to_id("\""), i);
1158 choice->remove_unichar_id(i+1);
1159 modified = true;
1160 merge_blobs(blob_it.data(), blob_it.data_relative(1));
1161 blob_it.forward();
1162 delete blob_it.extract(); // get rid of spare
1163
1164 it1.set_to_list(blob_choices_it.data());
1165 it2.set_to_list(blob_choices_it.data_relative(1));
1166 if (it1.data()->certainty() < it2.data()->certainty()) {
1167 blob_choices_it.forward();
1168 delete blob_choices_it.extract(); // get rid of spare
1169 } else {
1170 delete blob_choices_it.extract(); // get rid of spare
1171 blob_choices_it.forward();
1172 }
1173 }
1174 }
1175 if (modified) {
1176 choice->populate_unichars(unicharset);
1177 }
1178 }
1179
1180
1181 /**********************************************************************
1182 * fix_hyphens
1183 *
1184 * Change pairs of hyphens to a single hyphen if the bounding boxes touch
1185 * Typically a long dash which has been segmented.
1186 **********************************************************************/
fix_hyphens(WERD_CHOICE * choice,WERD * word,BLOB_CHOICE_LIST_CLIST * blob_choices)1187 void Tesseract::fix_hyphens( //crunch double hyphens
1188 WERD_CHOICE *choice, //choice to fix
1189 WERD *word, //word to do //char choices
1190 BLOB_CHOICE_LIST_CLIST *blob_choices) {
1191 if (!unicharset.contains_unichar("-") ||
1192 !unicharset.get_enabled(unicharset.unichar_to_id("-")))
1193 return; // Don't create it if it is disallowed.
1194
1195 PBLOB_IT blob_it = word->blob_list();
1196 BLOB_CHOICE_LIST_C_IT blob_choices_it = blob_choices;
1197 BLOB_CHOICE_IT it1; // first choices
1198 BLOB_CHOICE_IT it2; // second choices
1199
1200 bool modified = false;
1201 for (int i = 0; i+1 < choice->length();
1202 ++i, blob_it.forward (), blob_choices_it.forward ()) {
1203 const char *ch = unicharset.id_to_unichar(choice->unichar_id(i));
1204 const char *next_ch = unicharset.id_to_unichar(choice->unichar_id(i+1));
1205 if (strlen(ch) != 1 || strlen(next_ch) != 1) continue;
1206 if ((*ch == '-' || *ch == '~') &&
1207 (*next_ch == '-' || *next_ch == '~') &&
1208 (blob_it.data()->bounding_box().right() >=
1209 blob_it.data_relative(1)->bounding_box().left ())) {
1210 choice->set_unichar_id(unicharset.unichar_to_id("-"), i);
1211 choice->remove_unichar_id(i+1);
1212 modified = true;
1213 merge_blobs(blob_it.data(), blob_it.data_relative(1));
1214 blob_it.forward();
1215 delete blob_it.extract(); // get rid of spare
1216
1217 it1.set_to_list(blob_choices_it.data());
1218 it2.set_to_list(blob_choices_it.data_relative(1));
1219 if (it1.data()->certainty() < it2.data()->certainty()) {
1220 blob_choices_it.forward();
1221 delete blob_choices_it.extract(); // get rid of spare
1222 } else {
1223 delete blob_choices_it.extract(); // get rid of spare
1224 blob_choices_it.forward();
1225 }
1226 }
1227 }
1228 if (modified) {
1229 choice->populate_unichars(unicharset);
1230 }
1231 }
1232 } // namespace tesseract
1233
1234
1235 /**********************************************************************
1236 * merge_blobs
1237 *
1238 * Add the outlines from blob2 to blob1. Blob2 is emptied but not deleted.
1239 **********************************************************************/
1240
merge_blobs(PBLOB * blob1,PBLOB * blob2)1241 void merge_blobs( //combine 2 blobs
1242 PBLOB *blob1, //dest blob
1243 PBLOB *blob2 //source blob
1244 ) {
1245 OUTLINE_IT outline_it = blob1->out_list ();
1246 //iterator
1247
1248 outline_it.move_to_last (); //go to end
1249 //do it
1250 outline_it.add_list_after (blob2->out_list ());
1251 }
1252
1253
1254 /**********************************************************************
1255 * choice_dump_tester
1256 *
1257 * Matcher tester function which generates .chc file entries.
1258 * Called via test_segment_pass2 for every blob tested by tess in a word.
1259 * (But only for words for which a correct segmentation could be found.)
1260 **********************************************************************/
1261 /* DEADCODE
1262 void choice_dump_tester( //dump chars in word
1263 PBLOB *, //blob
1264 DENORM *, //de-normaliser
1265 BOOL8 correct, //ly segmented
1266 char *text, //correct text
1267 inT32 count, //chars in text
1268 BLOB_CHOICE_LIST *ratings //list of results
1269 ) {
1270 STRING choice_file_name;
1271 BLOB_CHOICE *blob_choice;
1272 BLOB_CHOICE_IT it;
1273 char source_chars[20];
1274 char correct_char[3];
1275
1276 if (choice_file == NULL) {
1277 choice_file_name = imagebasename + ".chc";
1278 if (!(choice_file = fopen (choice_file_name.string (), "w"))) {
1279 CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d",
1280 choice_file_name.string (), errno);
1281 }
1282 }
1283
1284 if ((count == 0) || (text == NULL) || (text[0] == '\0')) {
1285 strcpy (source_chars, "$$");
1286 strcpy (correct_char, "$$");
1287 }
1288 else {
1289 strncpy(source_chars, text, count);
1290 source_chars[count] = '\0';
1291 if (correct) {
1292 correct_char[0] = text[0];
1293 correct_char[1] = '\0';
1294 }
1295 else {
1296 strcpy (correct_char, "$$");
1297 }
1298 }
1299 fprintf (choice_file, "%s\t%s", source_chars, correct_char);
1300
1301 it.set_to_list (ratings);
1302 for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
1303 blob_choice = it.data ();
1304 fprintf (choice_file, "\t%s\t%f\t%f",
1305 blob_choice->unichar (),
1306 blob_choice->rating (), blob_choice->certainty ());
1307 }
1308 fprintf (choice_file, "\n");
1309 }
1310 */
1311
1312 /*************************************************************************
1313 * make_bln_copy()
1314 *
1315 * Generate a baseline normalised copy of the source word. The copy is done so
1316 * that whatever format the original word is in, a polygonal bln version is
1317 * generated as output.
1318 *************************************************************************/
1319
make_bln_copy(WERD * src_word,ROW * row,BLOCK * block,float x_height,DENORM * denorm)1320 WERD *make_bln_copy(WERD *src_word, ROW *row, BLOCK* block,
1321 float x_height, DENORM *denorm) {
1322 WERD *result = src_word->poly_copy(row->x_height());
1323
1324 result->baseline_normalise_x (row, x_height, denorm);
1325 if (block != NULL)
1326 denorm->set_block(block);
1327 return result;
1328 }
1329
1330
1331 namespace tesseract {
// Classifies the string s as a kind of acceptable word, or as unacceptable.
// lengths runs in parallel with s: lengths[i] is used as the number of bytes
// that the i-th character occupies in s, and the array is NUL terminated.
// Returns AC_UPPER_CASE, AC_INITIAL_CAP or AC_LOWER_CASE for a word (with an
// optional leading and up to two trailing punctuation chars), AC_UC_ABBREV or
// AC_LC_ABBREV for a string of letter-plus-'.' pairs, else AC_UNACCEPTABLE.
// Strings of more than 20 characters are always AC_UNACCEPTABLE.
ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s,
                                                       const char *lengths) {
  int i = 0;                 // index of the current character in lengths
  int offset = 0;            // byte offset of the current character in s
  int leading_punct_count;   // 0 or 1: leading punctuation chars skipped
  int upper_count = 0;       // upper case chars at the start of the word
  int hyphen_pos = -1;       // character index of the hyphen, if any
  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;

  // Too many characters to be a word.
  if (strlen (lengths) > 20)
    return word_type;

  /* Single Leading punctuation char*/

  if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset])))
    offset += lengths[i++];
  leading_punct_count = i;

  /* Initial cap */
  // This consumes the whole leading run of upper case chars, not just one.
  while ((s[offset] != '\0') &&
         unicharset.get_isupper(s + offset, lengths[i])) {
    offset += lengths[i++];
    upper_count++;
  }
  if (upper_count > 1)
    word_type = AC_UPPER_CASE;
  else {
    /* Lower case word, possibly with an initial cap */
    while ((s[offset] != '\0') &&
           unicharset.get_islower (s + offset, lengths[i])) {
      offset += lengths[i++];
    }
    // Not enough letters before anything else turned up. word_type is still
    // AC_UNACCEPTABLE here, so the abbreviation test below gets its chance.
    if (i - leading_punct_count < quality_min_initial_alphas_reqd)
      goto not_a_word;
    /*
    Allow a single hyphen in a lower case word
    - dont trust upper case - I've seen several cases of "H" -> "I-I"
    */
    if (lengths[i] == 1 && s[offset] == '-') {
      hyphen_pos = i;
      offset += lengths[i++];
      // A hyphen at the very end is accepted as it is. Otherwise at least
      // two lower case chars must follow the hyphen.
      if (s[offset] != '\0') {
        while ((s[offset] != '\0') &&
               unicharset.get_islower(s + offset, lengths[i])) {
          offset += lengths[i++];
        }
        if (i < hyphen_pos + 3)
          goto not_a_word;
      }
    }
    else {
      /* Allow "'s" in NON hyphenated lower case words */
      // lengths[i + 1] is only read when lengths[i] is 1, so the read stays
      // within the NUL terminated lengths array.
      if (lengths[i] == 1 && (s[offset] == '\'') &&
          lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    if (upper_count > 0)
      word_type = AC_INITIAL_CAP;
    else
      word_type = AC_LOWER_CASE;
  }

  /* Up to two different, constrained trailing punctuation chars */
  if (lengths[i] == 1 && (s[offset] != '\0') &&
      (STRING (chs_trailing_punct1).contains (s[offset])))
    offset += lengths[i++];
  // The second one must differ from the character just before it.
  if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 &&
      (s[offset - lengths[i - 1]] != s[offset]) &&
      (STRING (chs_trailing_punct2).contains (s[offset])))
    offset += lengths[i++];

  // Anything left over means that the string is not a plain word.
  if (s[offset] != '\0')
    word_type = AC_UNACCEPTABLE;

  not_a_word:

  if (word_type == AC_UNACCEPTABLE) {
    /* Look for abbreviation string */
    // Start again from the beginning and consume pairs made of one letter
    // followed by a single byte '.'. The case of the first letter decides
    // which case every letter has to have.
    i = 0;
    offset = 0;
    if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) {
      word_type = AC_UC_ABBREV;
      while ((s[offset] != '\0') &&
             unicharset.get_isupper(s + offset, lengths[i]) &&
             (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) {
      word_type = AC_LC_ABBREV;
      while ((s[offset] != '\0') &&
             unicharset.get_islower(s + offset, lengths[i]) &&
             (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    // The pairs have to account for the whole string.
    if (s[offset] != '\0')
      word_type = AC_UNACCEPTABLE;
  }

  return word_type;
}
1438
1439 } // namespace tesseract
1440
1441 /* DEBUGGING ROUTINE */
1442
check_debug_pt(WERD_RES * word,int location)1443 BOOL8 check_debug_pt(WERD_RES *word, int location) {
1444 BOOL8 show_map_detail = FALSE;
1445 inT16 i;
1446
1447 #ifndef SECURE_NAMES
1448 if (!test_pt)
1449 return FALSE;
1450
1451 tessedit_rejection_debug.set_value (FALSE);
1452 debug_x_ht_level.set_value (0);
1453 tessedit_cluster_debug.set_value (FALSE);
1454 nn_debug.set_value (FALSE);
1455 nn_reject_debug.set_value (FALSE);
1456
1457 if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1458 if (location < 0)
1459 return TRUE; //For breakpoint use
1460 tessedit_rejection_debug.set_value (TRUE);
1461 debug_x_ht_level.set_value (20);
1462 tessedit_cluster_debug.set_value (TRUE);
1463 nn_debug.set_value (TRUE);
1464 nn_reject_debug.set_value (TRUE);
1465 tprintf ("\n\nTESTWD::");
1466 switch (location) {
1467 case 0:
1468 tprintf ("classify_word_pass1 start\n");
1469 word->word->print (debug_fp);
1470 break;
1471 case 10:
1472 tprintf ("make_reject_map: initial map");
1473 break;
1474 case 20:
1475 tprintf ("make_reject_map: after NN");
1476 break;
1477 case 30:
1478 tprintf ("classify_word_pass2 - START");
1479 break;
1480 case 40:
1481 tprintf ("classify_word_pass2 - Pre Xht");
1482 break;
1483 case 50:
1484 tprintf ("classify_word_pass2 - END");
1485 show_map_detail = TRUE;
1486 break;
1487 case 60:
1488 tprintf ("fixspace");
1489 break;
1490 case 70:
1491 tprintf ("MM pass START");
1492 break;
1493 case 80:
1494 tprintf ("MM pass END");
1495 break;
1496 case 90:
1497 tprintf ("After Poor quality rejection");
1498 break;
1499 case 100:
1500 tprintf ("unrej_good_quality_words - START");
1501 break;
1502 case 110:
1503 tprintf ("unrej_good_quality_words - END");
1504 break;
1505 case 120:
1506 tprintf ("Write results pass");
1507 show_map_detail = TRUE;
1508 break;
1509 }
1510 tprintf(" \"%s\" ",
1511 word->best_choice->unichar_string().string());
1512 word->reject_map.print (debug_fp);
1513 tprintf ("\n");
1514 if (show_map_detail) {
1515 tprintf ("\"%s\"\n", word->best_choice->unichar_string().string());
1516 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1517 tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1518 word->reject_map[i].full_print(debug_fp);
1519 }
1520 }
1521
1522 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1523 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1524 return TRUE;
1525 }
1526 else
1527 #endif
1528 return FALSE;
1529 }
1530
1531
1532 /**********************************************************************
1533 * set_word_fonts
1534 *
1535 * Get the fonts for the word.
1536 **********************************************************************/
1537 namespace tesseract {
set_word_fonts(WERD_RES * word,BLOB_CHOICE_LIST_CLIST * blob_choices)1538 void Tesseract::set_word_fonts(
1539 WERD_RES *word, // word to adapt to
1540 BLOB_CHOICE_LIST_CLIST *blob_choices // detailed results
1541 ) {
1542 inT32 index; // char id index
1543 UNICHAR_ID choice_char_id; // char id from word
1544 inT8 config; // font of char
1545 // character iterator
1546 BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
1547 BLOB_CHOICE_IT choice_it; // choice iterator
1548 int fontinfo_size = get_fontinfo_table().size();
1549 int fontset_size = get_fontset_table().size();
1550 if (fontinfo_size == 0 || fontset_size == 0)
1551 return;
1552 STATS fonts(0, fontinfo_size); // font counters
1553
1554 word->italic = 0;
1555 word->bold = 0;
1556 for (char_it.mark_cycle_pt(), index = 0;
1557 !char_it.cycled_list(); ++index, char_it.forward()) {
1558 choice_char_id = word->best_choice->unichar_id(index);
1559 choice_it.set_to_list(char_it.data());
1560 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1561 choice_it.forward()) {
1562 if (choice_it.data()->unichar_id() == choice_char_id) {
1563 config = choice_it.data()->config();
1564 int class_id = choice_it.data()->unichar_id();
1565 int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
1566 if (font_set_id >= 0 && config >= 0 && font_set_id < fontset_size) {
1567 FontSet font_set = get_fontset_table().get(font_set_id);
1568 if (tessedit_debug_fonts) {
1569 tprintf("%s(%d=%d%c%c)", unicharset.id_to_unichar(choice_char_id),
1570 config, (config & 31) >> 2,
1571 config & 2 ? 'N' : 'B', config & 1 ? 'N' : 'I');
1572 const char* fontname;
1573 if (config >= font_set.size) {
1574 fontname = "Unknown";
1575 } else {
1576 fontname = get_fontinfo_table().get(
1577 font_set.configs[config]).name;
1578 }
1579 tprintf("%s(%d,%d=%s)\n",
1580 unicharset.id_to_unichar(choice_it.data()->unichar_id()),
1581 font_set_id, config, fontname);
1582 }
1583 if (config < font_set.size) {
1584 int fontinfo_id = font_set.configs[config];
1585 if (fontinfo_id < fontinfo_size) {
1586 FontInfo fi = get_fontinfo_table().get(fontinfo_id);
1587 word->italic += fi.is_italic();
1588 word->bold += fi.is_bold();
1589 fonts.add(fontinfo_id, 1);
1590 }
1591 }
1592 }
1593 break;
1594 }
1595 }
1596 }
1597 find_modal_font(&fonts, &word->font1, &word->font1_count);
1598 find_modal_font(&fonts, &word->font2, &word->font2_count);
1599 if (tessedit_debug_fonts)
1600 tprintf("\n");
1601 if (word->font1_count > 0) {
1602 word->italic = word->bold = 0;
1603 for (char_it.mark_cycle_pt(), index = 0;
1604 !char_it.cycled_list(); char_it.forward(), ++index) {
1605 choice_char_id = word->best_choice->unichar_id(index);
1606 choice_it.set_to_list(char_it.data());
1607 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1608 choice_it.forward()) {
1609 if (choice_it.data()->unichar_id() == choice_char_id) {
1610 config = choice_it.data()->config();
1611 int class_id = choice_it.data()->unichar_id();
1612 int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
1613 if (font_set_id >= 0 && config >= 0 && font_set_id < fontset_size) {
1614 int fontinfo_id = get_fontset_table().get(font_set_id).
1615 configs[config];
1616 if (fontinfo_id == word->font1 && fontinfo_id < fontinfo_size) {
1617 FontInfo fi = fontinfo_table_.get(fontinfo_id);
1618 word->italic += fi.is_italic();
1619 word->bold += fi.is_bold();
1620 }
1621 }
1622 break;
1623 }
1624 }
1625 }
1626 }
1627 }
1628
1629
1630 /**********************************************************************
1631 * font_recognition_pass
1632 *
1633 * Smooth the fonts for the document.
1634 **********************************************************************/
1635
font_recognition_pass(PAGE_RES_IT & page_res_it)1636 void Tesseract::font_recognition_pass( //good chars in word
1637 PAGE_RES_IT &page_res_it) {
1638 inT32 length; //of word
1639 inT32 count; //of a feature
1640 inT8 doc_font; //modal font
1641 inT8 doc_font_count; //modal font
1642 inT32 doc_italic; //total italics
1643 inT32 doc_bold; //total bolds
1644 ROW_RES *row = NULL; //current row
1645 WERD_RES *word; //current word
1646 STATS fonts (0, get_fontinfo_table().size() ?
1647 get_fontinfo_table().size() : 32); // font counters
1648 STATS doc_fonts (0, get_fontinfo_table().size() ?
1649 get_fontinfo_table().size() : 32); // font counters
1650
1651 doc_italic = 0;
1652 doc_bold = 0;
1653 page_res_it.restart_page ();
1654 while (page_res_it.word () != NULL) {
1655 if (row != page_res_it.row ()) {
1656 if (row != NULL) {
1657 find_modal_font (&fonts, &row->font1, &row->font1_count);
1658 find_modal_font (&fonts, &row->font2, &row->font2_count);
1659 }
1660 row = page_res_it.row (); //current row
1661 fonts.clear (); //clear counters
1662 row->italic = 0;
1663 row->bold = 0;
1664 }
1665 word = page_res_it.word ();
1666 row->italic += word->italic;
1667 row->bold += word->bold;
1668 fonts.add (word->font1, word->font1_count);
1669 fonts.add (word->font2, word->font2_count);
1670 doc_italic += word->italic;
1671 doc_bold += word->bold;
1672 doc_fonts.add (word->font1, word->font1_count);
1673 doc_fonts.add (word->font2, word->font2_count);
1674 page_res_it.forward ();
1675 }
1676 if (row != NULL) {
1677 find_modal_font (&fonts, &row->font1, &row->font1_count);
1678 find_modal_font (&fonts, &row->font2, &row->font2_count);
1679 }
1680 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
1681 /*
1682 row=NULL;
1683 page_res_it.restart_page();
1684 while (page_res_it.word() != NULL)
1685 {
1686 if (row!=page_res_it.row())
1687 {
1688 row2=row;
1689 row=page_res_it.row();
1690 if (row->font1_count<MIN_FONT_ROW_COUNT)
1691 {
1692 fonts.clear();
1693 italic=0;
1694 bold=0;
1695 add_in_one_row(row,&fonts,&italic,&bold);
1696 if (row2!=NULL)
1697 {
1698 hdiff=row->row->x_height()-row2->row->x_height();
1699 if (hdiff<0)
1700 hdiff=-hdiff;
1701 if (hdiff<MAX_XHEIGHT_DIFF)
1702 add_in_one_row(row2,&fonts,&italic,&bold);
1703 }
1704 do
1705 page_res_it.forward();
1706 while (page_res_it.row()==row);
1707 row2=page_res_it.row();
1708 if (row2!=NULL)
1709 {
1710 hdiff=row->row->x_height()-row2->row->x_height();
1711 if (hdiff<0)
1712 hdiff=-hdiff;
1713 if (hdiff<MAX_XHEIGHT_DIFF)
1714 add_in_one_row(row2,&fonts,&italic,&bold);
1715 }
1716 row->italic=italic;
1717 row->bold=bold;
1718 find_modal_font(&fonts,&row->font1,&row->font1_count);
1719 find_modal_font(&fonts,&row->font2,&row->font2_count);
1720 }
1721 else
1722 page_res_it.forward();
1723 }
1724 else
1725 page_res_it.forward();
1726 }*/
1727
1728 page_res_it.restart_page ();
1729 while (page_res_it.word () != NULL) {
1730 row = page_res_it.row (); //current row
1731 word = page_res_it.word ();
1732 length = word->best_choice->length();
1733
1734 count = word->italic;
1735 if (count < 0)
1736 count = -count;
1737 if (!(count == length || (length > 3 && count >= length * 3 / 4)))
1738 word->italic = doc_italic > 0 ? 1 : -1;
1739
1740 count = word->bold;
1741 if (count < 0)
1742 count = -count;
1743 if (!(count == length || (length > 3 && count >= length * 3 / 4)))
1744 word->bold = doc_bold > 0 ? 1 : -1;
1745
1746 count = word->font1_count;
1747 if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
1748 word->font1 = doc_font;
1749 word->font1_count = doc_font_count;
1750 }
1751
1752 page_res_it.forward ();
1753 }
1754 }
1755 } // namespace tesseract
1756
1757
1758 /**********************************************************************
1759 * add_in_one_row
1760 *
1761 * Add into the stats for one row.
1762 **********************************************************************/
1763
add_in_one_row(ROW_RES * row,STATS * fonts,inT8 * italic,inT8 * bold)1764 void add_in_one_row( //good chars in word
1765 ROW_RES *row, //current row
1766 STATS *fonts, //font stats
1767 inT8 *italic, //output count
1768 inT8 *bold //output count
1769 ) {
1770 WERD_RES *word; //current word
1771 WERD_RES_IT word_it = &row->word_res_list;
1772
1773 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
1774 word = word_it.data ();
1775 *italic += word->italic;
1776 *bold += word->bold;
1777 if (word->font1_count > 0)
1778 fonts->add (word->font1, word->font1_count);
1779 if (word->font2_count > 0)
1780 fonts->add (word->font2, word->font2_count);
1781
1782 }
1783 }
1784
1785
1786 /**********************************************************************
1787 * find_modal_font
1788 *
1789 * Find the modal font and remove from the stats.
1790 **********************************************************************/
1791
find_modal_font(STATS * fonts,inT8 * font_out,inT8 * font_count)1792 void find_modal_font( //good chars in word
1793 STATS *fonts, //font stats
1794 inT8 *font_out, //output font
1795 inT8 *font_count //output count
1796 ) {
1797 inT8 font; //font index
1798 inT32 count; //pile couat
1799
1800 if (fonts->get_total () > 0) {
1801 font = (inT8) fonts->mode ();
1802 *font_out = font;
1803 count = fonts->pile_count (font);
1804 *font_count = count < MAX_INT8 ? count : MAX_INT8;
1805 fonts->add (font, -*font_count);
1806 }
1807 else {
1808 *font_out = -1;
1809 *font_count = 0;
1810 }
1811 }
1812