1 /**********************************************************************
2 * File: applybox.cpp (Formerly applybox.c)
3 * Description: Re segment rows according to box file data
4 * Author: Phil Cheatle
5 * Created: Wed Nov 24 09:11:23 GMT 1993
6 *
7 * (C) Copyright 1993, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19 #include "mfcpch.h"
20
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25
26 #ifdef HAVE_LIBLEPT
27 // Include leptonica library only if autoconf (or makefile etc) tell us to.
28 #include "allheaders.h"
29 #endif
30
31 #include "applybox.h"
32 #include <ctype.h>
33 #include <string.h>
34 #ifdef __UNIX__
35 #include <assert.h>
36 #include <errno.h>
37 #endif
38 #include "boxread.h"
39 #include "control.h"
40 #include "genblob.h"
41 #include "globals.h"
42 #include "fixxht.h"
43 #include "mainblk.h"
44 #include "matchdefs.h"
45 #include "secname.h"
46 #include "tessbox.h"
47 #include "unichar.h"
48 #include "unicharset.h"
49 #include "matchdefs.h"
50 #include "tesseractclass.h"
51
52 #define SECURE_NAMES
53 #ifndef SECURE_NAMES
54 #include "wordstats.h"
55 #endif
56
#define EXTERN
// Legacy switch: when TRUE, tidy_up() duplicates believable samples of
// under-represented characters to rebalance the training set.
EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
// Higher values produce progressively more diagnostic output (checked at
// thresholds 0, 4, 5, 6 and 9 throughout this file).
EXTERN INT_VAR (applybox_debug, 5, "Debug level");
// Which page of a multi-page image the box file coordinates refer to.
EXTERN INT_VAR (applybox_page, 0, "Page number to apply boxes from");
// Characters to skip when scoring recognition in apply_box_testing().
EXTERN STRING_VAR (applybox_test_exclusions, "",
"Chars ignored for testing");
// Tolerance band, as a fraction of x-height, used by the (currently
// disabled) baseline/x-height sanity checks in resegment_box().
EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");

// Pattern searched for in the image filename to extract the exposure value;
// a negative exposure switches apply_boxes() into fragment-learning mode.
EXTERN STRING_VAR(exposure_pattern, ".exp",
"Exposure value follows this pattern in the image"
" filename. The name of the image files are expected"
" to be in the form [lang].[fontname].exp[num].tif");

EXTERN BOOL_VAR(learn_chars_and_char_frags_mode, FALSE,
"Learn both character fragments (as is done in the"
" special low exposure mode) as well as unfragmented"
" characters.");

// The page image, defined elsewhere (see globals.h).
extern IMAGE page_image;

// The unicharset used during box training
static UNICHARSET unicharset_boxes;
79
80 /*************************************************************************
81 * The code re-assigns outlines to form words each with ONE labelled blob.
82 * Noise is left in UNLABELLED words. The chars on the page are checked crudely
83 * for sensible position relative to baseline and xht. Failed boxes are
84 * compensated for by duplicating other believable instances of the character.
85 *
86 * The box file is assumed to contain box definitions, one per line, of the
87 * following format:
88 * <Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused
89 *
90 * The approach taken is to search the WHOLE page for stuff overlapping each box.
91 * - This is not too inefficient and is SAFE.
92 * - We can detect overlapping blobs as we will be attempting to put a blob
93 * from a LABELLED word into the current word.
94 * - When all the boxes have been processed we can detect any stuff which is
95 * being ignored - it is the unlabelled words left on the page.
96 *
97 * A box should only overlap one row.
98 *
99 * A warning is given if the box is on the same row as the previous box, but NOT
100 * on the same row as the previous blob.
101 *
102 * Any OUTLINE which overlaps the box is put into the new word.
103 *
104 * ascender chars must ascend above xht significantly
105 * xht chars must not rise above row xht significantly
106 * bl chars must not descend below baseline significantly
107 * descender chars must descend below baseline significantly
108 *
109 * ?? Certain chars are DROPPED - to limit the training data.
110 *
111 *************************************************************************/
112 namespace tesseract {
apply_boxes(const STRING & fname,BLOCK_LIST * block_list)113 void Tesseract::apply_boxes(const STRING& fname,
114 BLOCK_LIST *block_list //real blocks
115 ) {
116 inT16 boxfile_lineno = 0;
117 inT16 boxfile_charno = 0;
118 TBOX box; //boxfile box
119 UNICHAR_ID uch_id; //correct ch from boxfile
120 ROW *row;
121 ROW *prev_row = NULL;
122 inT16 prev_box_right = MAX_INT16;
123 inT16 block_id;
124 inT16 row_id;
125 inT16 box_count = 0;
126 inT16 box_failures = 0;
127 inT16 labels_ok;
128 inT16 rows_ok;
129 inT16 bad_blobs;
130 inT16 *tgt_char_counts = NULL; // No. of box samples
131 inT16 i;
132 inT16 rebalance_count = 0;
133 UNICHAR_ID min_uch_id = INVALID_UNICHAR_ID;
134 inT16 min_samples;
135 inT16 final_labelled_blob_count;
136 bool low_exposure = false;
137
138 // Clean the unichar set
139 unicharset_boxes.clear();
140 // Space character needed to represent NIL classification
141 unicharset_boxes.unichar_insert(" ");
142
143 // Figure out whether this image file's exposure is less than 1, in which
144 // case when learning we will only pay attention to character fragments.
145 const char *ptr = strstr(imagefile.string(), exposure_pattern.string());
146 if (ptr != NULL &&
147 strtol(ptr += strlen(exposure_pattern.string()), NULL, 10) < 0) {
148 low_exposure = true;
149 }
150
151 FILE* box_file;
152 STRING filename = fname;
153 const char *lastdot; //of name
154
155 lastdot = strrchr (filename.string (), '.');
156 if (lastdot != NULL)
157 filename[lastdot - filename.string()] = '\0';
158
159 filename += ".box";
160 if (!(box_file = fopen (filename.string(), "r"))) {
161 CANTOPENFILE.error ("read_next_box", EXIT,
162 "Cant open box file %s %d",
163 filename.string(), errno);
164 }
165
166 tgt_char_counts = new inT16[MAX_NUM_CLASSES];
167 for (i = 0; i < MAX_NUM_CLASSES; i++)
168 tgt_char_counts[i] = 0;
169
170 clear_any_old_text(block_list);
171 while (read_next_box(applybox_page, box_file, &box, &uch_id)) {
172 box_count++;
173 if (!low_exposure || learn_chars_and_char_frags_mode) {
174 tgt_char_counts[uch_id]++;
175 }
176 row = find_row_of_box (block_list, box, block_id, row_id);
177 if (box.left () < prev_box_right) {
178 boxfile_lineno++;
179 boxfile_charno = 1;
180 }
181 else
182 boxfile_charno++;
183
184 if (row == NULL) {
185 box_failures++;
186 report_failed_box (boxfile_lineno, boxfile_charno, box,
187 unicharset_boxes.id_to_unichar(uch_id),
188 "FAILURE! box overlaps no blobs or blobs in multiple rows");
189 }
190 else {
191 if ((box.left () >= prev_box_right) && (row != prev_row))
192 report_failed_box (boxfile_lineno, boxfile_charno, box,
193 unicharset_boxes.id_to_unichar(uch_id),
194 "WARNING! false row break");
195 box_failures += resegment_box (row, box, uch_id, block_id, row_id,
196 boxfile_lineno, boxfile_charno, tgt_char_counts, low_exposure, true);
197 prev_row = row;
198 }
199 prev_box_right = box.right ();
200 }
201 tidy_up(block_list,
202 labels_ok,
203 rows_ok,
204 bad_blobs,
205 tgt_char_counts,
206 rebalance_count,
207 &min_uch_id,
208 min_samples,
209 final_labelled_blob_count,
210 low_exposure,
211 true);
212 tprintf ("APPLY_BOXES:\n");
213 tprintf (" Boxes read from boxfile: %6d\n", box_count);
214 tprintf (" Initially labelled blobs: %6d in %d rows\n",
215 labels_ok, rows_ok);
216 tprintf (" Box failures detected: %6d\n", box_failures);
217 tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count);
218 tprintf (" \"%s\" has fewest samples:%6d\n",
219 unicharset_boxes.id_to_unichar(min_uch_id), min_samples);
220 tprintf (" Total unlabelled words: %6d\n",
221 bad_blobs);
222 tprintf (" Final labelled words: %6d\n",
223 final_labelled_blob_count);
224
225 // Clean up.
226 delete[] tgt_char_counts;
227 }
228
Boxes2BlockList(int box_cnt,TBOX * boxes,BLOCK_LIST * block_list,bool right2left)229 int Tesseract::Boxes2BlockList(int box_cnt, TBOX *boxes,
230 BLOCK_LIST *block_list,
231 bool right2left) {
232 inT16 boxfile_lineno = 0;
233 inT16 boxfile_charno = 0;
234 TBOX box;
235 ROW *row;
236 ROW *prev_row = NULL;
237 inT16 prev_box_right = MAX_INT16;
238 inT16 prev_box_left = 0;
239 inT16 block_id;
240 inT16 row_id;
241 inT16 box_failures = 0;
242 inT16 labels_ok;
243 inT16 rows_ok;
244 inT16 bad_blobs;
245 inT16 rebalance_count = 0;
246 UNICHAR_ID min_uch_id;
247 inT16 min_samples;
248 inT16 final_labelled_blob_count;
249
250 clear_any_old_text(block_list);
251 for (int box_idx = 0; box_idx < box_cnt; box_idx++) {
252 box = boxes[box_idx];
253
254 row = find_row_of_box(block_list, box, block_id, row_id);
255 // check for a new row
256 if ((right2left && box.right () > prev_box_left) ||
257 (!right2left && box.left () < prev_box_right)) {
258 boxfile_lineno++;
259 boxfile_charno = 1;
260 }
261 else {
262 boxfile_charno++;
263 }
264
265 if (row == NULL) {
266 box_failures++;
267 }
268 else {
269 box_failures += resegment_box(row, box, 0, block_id, row_id,
270 boxfile_lineno, boxfile_charno,
271 NULL, false, false);
272 prev_row = row;
273 }
274 prev_box_right = box.right ();
275 prev_box_left = box.left ();
276 }
277
278 tidy_up(block_list, labels_ok, rows_ok, bad_blobs, NULL,
279 rebalance_count, &min_uch_id, min_samples, final_labelled_blob_count,
280 false, false);
281
282 return box_failures;
283 }
284
285 } // namespace tesseract
286
287
clear_any_old_text(BLOCK_LIST * block_list)288 void clear_any_old_text( //remove correct text
289 BLOCK_LIST *block_list //real blocks
290 ) {
291 BLOCK_IT block_it(block_list);
292 ROW_IT row_it;
293 WERD_IT word_it;
294
295 for (block_it.mark_cycle_pt ();
296 !block_it.cycled_list (); block_it.forward ()) {
297 row_it.set_to_list (block_it.data ()->row_list ());
298 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
299 word_it.set_to_list (row_it.data ()->word_list ());
300 for (word_it.mark_cycle_pt ();
301 !word_it.cycled_list (); word_it.forward ()) {
302 word_it.data ()->set_text ("");
303 }
304 }
305 }
306 }
307
register_char(const char * uch)308 UNICHAR_ID register_char(const char *uch) {
309 if (!unicharset_boxes.contains_unichar(uch)) {
310 unicharset_boxes.unichar_insert(uch);
311 if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
312 tprintf("Error: Size of unicharset of boxes is "
313 "greater than MAX_NUM_CLASSES (%d)\n", MAX_NUM_CLASSES);
314 exit(1);
315 }
316 }
317 return unicharset_boxes.unichar_to_id(uch);
318 }
319
read_next_box(int page,FILE * box_file,TBOX * box,UNICHAR_ID * uch_id)320 BOOL8 read_next_box(int page,
321 FILE* box_file,
322 TBOX *box,
323 UNICHAR_ID *uch_id) {
324 int x_min;
325 int y_min;
326 int x_max;
327 int y_max;
328 char uch[kBoxReadBufSize];
329
330 if (read_next_box(page, box_file, uch, &x_min, &y_min, &x_max, &y_max)) {
331 *uch_id = register_char(uch);
332 *box = TBOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
333 return TRUE; // read a box ok
334 } else {
335 return FALSE; // EOF
336 }
337 }
338
339
/*************************************************************************
 * find_row_of_box()
 *
 * Returns the unique ROW containing outlines that major-overlap the box,
 * setting block_id / row_id_to_process to its 1-based indices.  Returns
 * NULL either when nothing on the page overlaps the box, or when outlines
 * in MORE THAN ONE row major-overlap it (ambiguous box).
 *************************************************************************/
ROW *find_row_of_box(                         //
                     BLOCK_LIST *block_list,  //real blocks
                     const TBOX &box,         //from boxfile
                     inT16 &block_id,
                     inT16 &row_id_to_process) {
  BLOCK_IT block_it(block_list);
  BLOCK *block;
  ROW_IT row_it;
  ROW *row;
  ROW *row_to_process = NULL;
  inT16 row_id;
  WERD_IT word_it;
  WERD *word;
  BOOL8 polyg;                   //word is polygonal, selects blob accessors
  PBLOB_IT blob_it;
  PBLOB *blob;
  OUTLINE_IT outline_it;
  OUTLINE *outline;

  /*
    Find row to process - error if box REALLY overlaps more than one row. (I.e
    it overlaps blobs in the row - not just overlaps the bounding box of the
    whole row.)
  */

  // Drill down block -> row -> word -> blob -> outline, pruning with cheap
  // bounding-box overlap tests; only the innermost (outline) test uses the
  // stricter major_overlap.
  block_id = 0;
  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    block_id++;
    row_id = 0;
    block = block_it.data ();
    if (block->bounding_box ().overlap (box)) {
      row_it.set_to_list (block->row_list ());
      for (row_it.mark_cycle_pt ();
           !row_it.cycled_list (); row_it.forward ()) {
        row_id++;
        row = row_it.data ();
        if (row->bounding_box ().overlap (box)) {
          word_it.set_to_list (row->word_list ());
          for (word_it.mark_cycle_pt ();
               !word_it.cycled_list (); word_it.forward ()) {
            word = word_it.data ();
            polyg = word->flag (W_POLYGON);
            if (word->bounding_box ().overlap (box)) {
              blob_it.set_to_list (word->gblob_list ());
              for (blob_it.mark_cycle_pt ();
                   !blob_it.cycled_list (); blob_it.forward ()) {
                blob = blob_it.data ();
                if (gblob_bounding_box (blob, polyg).
                    overlap (box)) {
                  outline_it.
                    set_to_list (gblob_out_list
                    (blob, polyg));
                  for (outline_it.mark_cycle_pt ();
                       !outline_it.cycled_list ();
                       outline_it.forward ()) {
                    outline = outline_it.data ();
                    if (goutline_bounding_box
                        (outline, polyg).major_overlap (box)) {
                      // Accept the first row seen; any DIFFERENT row is
                      // an ambiguity and aborts the search.
                      if ((row_to_process == NULL) ||
                          (row_to_process == row)) {
                        row_to_process = row;
                        row_id_to_process = row_id;
                      }
                      else
                        /* RETURN ERROR Box overlaps blobs in more than one row */
                        return NULL;
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  return row_to_process;
}
419
420
/*************************************************************************
 * resegment_box()
 *
 * Moves every outline that major-overlaps the box out of its current word
 * and into one or more NEW single-blob WERDs on the same row, labelled with
 * uch_id's character.  In fragment-learning mode, outlines that do not
 * x-overlap each other are split into separate WERDs labelled as numbered
 * CHAR_FRAGMENTs.  Words/blobs emptied of all outlines are deleted, and
 * BOL/EOL flags on the row are re-established at the end.
 *
 * Returns 0 on success, otherwise the number of characters lost.
 * NOTE(review): tgt_char_counts is only dereferenced when learning is true
 * and fragments are produced — callers passing NULL must pass learning=false.
 *************************************************************************/
inT16 resegment_box(  //
                    ROW *row,
                    TBOX &box,                 //box from boxfile
                    UNICHAR_ID uch_id,         //correct ch from boxfile
                    inT16 block_id,
                    inT16 row_id,
                    inT16 boxfile_lineno,
                    inT16 boxfile_charno,
                    inT16 *tgt_char_counts,
                    bool learn_char_fragments,
                    bool learning) {
  WERD_LIST new_word_list;
  WERD_IT word_it;
  WERD_IT new_word_it(&new_word_list);
  WERD *word = NULL;
  WERD *new_word = NULL;
  BOOL8 polyg = false;
  PBLOB_IT blob_it;
  PBLOB_IT new_blob_it;
  PBLOB *blob;
  PBLOB *new_blob;
  OUTLINE_IT outline_it;
  OUTLINE_LIST dummy;  // Just to initialize new_outline_it.
  OUTLINE_IT new_outline_it = &dummy;
  OUTLINE *outline;
  TBOX new_word_box;
  TBOX curr_outline_box;
  TBOX prev_outline_box;
  float word_x_centre;
  float baseline;
  inT16 error_count = 0;  //number of chars lost
  STRING label;
  UNICHAR_ID fragment_uch_id;
  int fragment_index;
  int new_word_it_len;

  if (learning && applybox_debug > 6) {
    tprintf("\nAPPLY_BOX: in resegment_box() for %s(%d)\n",
            unicharset_boxes.id_to_unichar(uch_id), uch_id);
  }
  // Scan every word on the row for outlines that major-overlap the box.
  word_it.set_to_list (row->word_list ());
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    polyg = word->flag (W_POLYGON);
    if (word->bounding_box ().overlap (box)) {
      blob_it.set_to_list (word->gblob_list ());
      prev_outline_box = TBOX();  // clear prev_outline_box
      curr_outline_box = TBOX();  // clear curr_outline_box
      for (blob_it.mark_cycle_pt ();
           !blob_it.cycled_list (); blob_it.forward ()) {
        blob = blob_it.data ();
        if (gblob_bounding_box (blob, polyg).overlap (box)) {
          outline_it.set_to_list (gblob_out_list (blob, polyg));
          for (outline_it.mark_cycle_pt ();
               !outline_it.cycled_list (); outline_it.forward ()) {
            outline = outline_it.data ();
            prev_outline_box += curr_outline_box;
            curr_outline_box = goutline_bounding_box(outline, polyg);
            if (curr_outline_box.major_overlap (box)) {
              // Overlapping a word that ALREADY has a label means two boxes
              // claim the same blob: unlabel the word and count the loss.
              if (strlen (word->text ()) > 0) {
                if (error_count == 0) {
                  error_count = 1;
                  if (learning && applybox_debug > 4)
                    report_failed_box (boxfile_lineno,
                                       boxfile_charno,
                                       box, unicharset_boxes.id_to_unichar(uch_id),
                                       "FAILURE! box overlaps blob in labelled word");
                }
                if (learning && applybox_debug > 4)
                  tprintf ("APPLY_BOXES: ALSO ignoring corrupted char"
                           " blk:%d row:%d \"%s\"\n",
                           block_id, row_id, word_it.data()->text());
                word_it.data ()->set_text ("");  // UN label it
                error_count++;
              }
              // Do not learn from fragments of characters that are broken
              // into very small pieces to avoid picking up noise.
              if ((learn_char_fragments || learn_chars_and_char_frags_mode) &&
                  ((C_OUTLINE *)outline)->area() < kMinFragmentOutlineArea) {
                if (applybox_debug > 6) {
                  tprintf("APPLY_BOX: fragment outline area %d is too small"
                          " - not recording fragments of this character\n",
                          ((C_OUTLINE *)outline)->area());
                }
                error_count++;
              }

              if (error_count == 0) {
                if (applybox_debug > 6 ) {
                  tprintf("APPLY_BOX: Previous ");
                  prev_outline_box.print();
                  tprintf("APPLY_BOX: Current area: %d ",
                          ((C_OUTLINE *)outline)->area());
                  curr_outline_box.print();
                }
                // When learning character fragments is enabled, we put
                // outlines that do not overlap on x axis in separate WERDs.
                bool start_new_word =
                  (learn_char_fragments || learn_chars_and_char_frags_mode) &&
                  !curr_outline_box.major_x_overlap(prev_outline_box);
                if (new_word == NULL || start_new_word) {
                  if (new_word != NULL) {  // add prev new_word to new_word_list
                    new_word_it.add_to_end(new_word);
                  }
                  // Make a new word with a single blob.
                  new_word = word->shallow_copy();
                  new_word->set_flag(W_FUZZY_NON, false);
                  new_word->set_flag(W_FUZZY_SP, false);
                  if (polyg){
                    new_blob = new PBLOB;
                  } else {
                    new_blob = (PBLOB *) new C_BLOB;
                  }
                  new_blob_it.set_to_list(new_word->gblob_list());
                  new_blob_it.add_to_end(new_blob);
                  new_outline_it.set_to_list(
                      gblob_out_list(new_blob, polyg));
                }
                new_outline_it.add_to_end(outline_it.extract());  // move outline
              }
            }
          }
          if (outline_it.empty())      // no outlines in blob
            delete blob_it.extract();  // so delete blob
        }
      }
      if (blob_it.empty())         // no blobs in word
        delete word_it.extract();  // so delete word
    }
  }
  if (new_word != NULL) {  // add prev new_word to new_word_list
    new_word_it.add_to_end(new_word);
  }
  new_word_it_len = new_word_it.length();

  // Check for failures.
  if (error_count > 0)
    return error_count;
  if (learning && new_word_it_len <= 0) {
    report_failed_box(boxfile_lineno, boxfile_charno, box,
                      unicharset_boxes.id_to_unichar(uch_id),
                      "FAILURE! Couldn't find any blobs");
    return 1;  // failure
  }

  if (learning && new_word_it_len > CHAR_FRAGMENT::kMaxChunks) {
    tprintf("APPLY_BOXES: too many fragments (%d) for char %s\n",
            new_word_it_len, unicharset_boxes.id_to_unichar(uch_id));
    return 1;  // failure
  }

  // Add labelled character or character fragments to the word list.
  fragment_index = 0;
  new_word_it.move_to_first();
  for (new_word_it.mark_cycle_pt(); !new_word_it.cycled_list();
       new_word_it.forward()) {
    new_word = new_word_it.extract();
    if (new_word_it_len > 1) {  // deal with a fragment
      if (learning) {
        label = CHAR_FRAGMENT::to_string(unicharset_boxes.id_to_unichar(uch_id),
                                         fragment_index, new_word_it_len);
        fragment_uch_id = register_char(label.string());
        new_word->set_text(label.string());
        ++fragment_index;
        // For now we cheat by setting the expected number of char fragments
        // to the number of char fragments actually parsed and labelled.
        // TODO(daria): find out whether this can be improved.
        tgt_char_counts[fragment_uch_id]++;
      } else {
        // No learning involved, Just stick a place-holder string
        new_word->set_text("*");
      }
      if (applybox_debug > 5) {
        // NOTE(review): when !learning, label is empty here.
        tprintf("APPLY_BOX: adding char fragment %s\n", label.string());
      }
    } else {  // deal with a regular character
      if (learning) {
        if (!learn_char_fragments || learn_chars_and_char_frags_mode) {
          new_word->set_text(unicharset_boxes.id_to_unichar(uch_id));
        } else {
          // not interested in non-fragmented chars if learning fragments, so
          // unlabel it.
          new_word->set_text("");
        }
      } else {
        // No learning involved here. Just stick a place holder string
        new_word->set_text("*");
      }
    }
    gblob_sort_list(new_word->gblob_list(), polyg);
    word_it.add_to_end(new_word);
    new_word_box = new_word->bounding_box();
    word_x_centre = (new_word_box.left() + new_word_box.right()) / 2.0f;
    baseline = row->base_line(word_x_centre);
  }

  // All done. Now check if the EOL, BOL flags are set correctly.
  word_it.move_to_first();
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    word->set_flag(W_BOL, false);
    word->set_flag(W_EOL, false);
  }
  word->set_flag(W_EOL, true);
  word_it.move_to_first();
  word_it.data()->set_flag(W_BOL, true);
  return 0;  //success

  // Dead code: baseline / x-height sanity checks on the labelled box,
  // disabled but kept for reference.
#if 0
  if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) {
    if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
        (new_word_box.top () <
         baseline + (1 + applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box,
                         unicharset_boxes.id_to_unichar(uch_id),
                         "FAILURE! caps-ht char didn't ascend");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
        (new_word_box.top () <
         baseline + (1 - applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box,
                         unicharset_boxes.id_to_unichar(uch_id),
                         "FAILURE! Odd top char below xht");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
        ((new_word_box.top () >
          baseline + (1 + applybox_error_band) * row->x_height ()) ||
         (new_word_box.top () <
          baseline + (1 - applybox_error_band) * row->x_height ()))) {
      report_failed_box (boxfile_lineno, boxfile_charno, box,
                         unicharset_boxes.id_to_unichar(uch_id),
                         "FAILURE! x-ht char didn't have top near xht");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_non_ambig_bl).contains
        (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
        ((new_word_box.bottom () <
          baseline - applybox_error_band * row->x_height ()) ||
         (new_word_box.bottom () >
          baseline + applybox_error_band * row->x_height ()))) {
      report_failed_box (boxfile_lineno, boxfile_charno, box,
                         unicharset_boxes.id_to_unichar(uch_id),
                         "FAILURE! non ambig BL char didnt have bottom near baseline");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
        (new_word_box.bottom () >
         baseline + applybox_error_band * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box,
                         unicharset_boxes.id_to_unichar(uch_id),
                         "FAILURE! Odd bottom char above baseline");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
        (new_word_box.bottom () >
         baseline - applybox_error_band * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box,
                         unicharset_boxes.id_to_unichar(uch_id),
                         "FAILURE! Descender doesn't descend");
      new_word->set_text ("");
      return 1;
    }
  }
#endif
}
693
694
695 /*************************************************************************
696 * tidy_up()
697 * - report >1 block
698 * - sort the words in each row.
699 * - report any rows with no labelled words.
700 * - report any remaining unlabelled words
701 * - report total labelled words
702 *
703 *************************************************************************/
tidy_up(BLOCK_LIST * block_list,inT16 & ok_char_count,inT16 & ok_row_count,inT16 & unlabelled_words,inT16 * tgt_char_counts,inT16 & rebalance_count,UNICHAR_ID * min_uch_id,inT16 & min_samples,inT16 & final_labelled_blob_count,bool learn_character_fragments,bool learning)704 void tidy_up( //
705 BLOCK_LIST *block_list, //real blocks
706 inT16 &ok_char_count,
707 inT16 &ok_row_count,
708 inT16 &unlabelled_words,
709 inT16 *tgt_char_counts,
710 inT16 &rebalance_count,
711 UNICHAR_ID *min_uch_id,
712 inT16 &min_samples,
713 inT16 &final_labelled_blob_count,
714 bool learn_character_fragments,
715 bool learning) {
716 BLOCK_IT block_it(block_list);
717 ROW_IT row_it;
718 ROW *row;
719 WERD_IT word_it;
720 WERD *word;
721 WERD *duplicate_word;
722 inT16 block_idx = 0;
723 inT16 row_idx;
724 inT16 all_row_idx = 0;
725 BOOL8 row_ok;
726 BOOL8 rebalance_needed = FALSE;
727 inT16 *labelled_char_counts = NULL; // num unique labelled samples
728 inT16 i;
729 UNICHAR_ID uch_id;
730 UNICHAR_ID prev_uch_id = -1;
731 BOOL8 at_dupe_of_prev_word;
732 ROW *prev_row = NULL;
733 inT16 left;
734 inT16 prev_left = -1;
735
736 labelled_char_counts = new inT16[MAX_NUM_CLASSES];
737 for (i = 0; i < MAX_NUM_CLASSES; i++)
738 labelled_char_counts[i] = 0;
739
740 ok_char_count = 0;
741 ok_row_count = 0;
742 unlabelled_words = 0;
743 if (learning && (applybox_debug > 4) && (block_it.length () != 1)) {
744 if (block_it.length() > 1) {
745 tprintf ("APPLY_BOXES: More than one block??\n");
746 } else {
747 tprintf("APPLY_BOXES: No blocks identified.\n");
748 }
749 }
750
751 for (block_it.mark_cycle_pt ();
752 !block_it.cycled_list (); block_it.forward ()) {
753 block_idx++;
754 row_idx = 0;
755 row_ok = FALSE;
756 row_it.set_to_list (block_it.data ()->row_list ());
757 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
758 row_idx++;
759 all_row_idx++;
760 row = row_it.data ();
761 word_it.set_to_list (row->word_list ());
762 word_it.sort (word_comparator);
763 for (word_it.mark_cycle_pt ();
764 !word_it.cycled_list (); word_it.forward ()) {
765 word = word_it.data ();
766 if (strlen (word->text ()) == 0 ||
767 unicharset_boxes.unichar_to_id(word->text()) < 0) {
768 unlabelled_words++;
769 if (learning && applybox_debug > 4 && !learn_character_fragments) {
770 tprintf ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n",
771 block_idx, row_idx, all_row_idx);
772 }
773 } else {
774 if (word->gblob_list ()->length () != 1)
775 tprintf ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d"
776 " row:%d allrows:%d\n", block_idx, row_idx, all_row_idx);
777
778 ok_char_count++;
779 ++labelled_char_counts[unicharset_boxes.unichar_to_id(word->text())];
780 row_ok = TRUE;
781 }
782 }
783 if ((applybox_debug > 6) && (!row_ok)) {
784 tprintf("APPLY_BOXES: Row with no labelled words blk:%d row:%d"
785 " allrows:%d\n", block_idx, row_idx, all_row_idx);
786 }
787 else
788 ok_row_count++;
789 }
790 }
791
792 min_samples = 9999;
793 for (i = 0; i < unicharset_boxes.size(); i++) {
794 if (tgt_char_counts[i] > labelled_char_counts[i]) {
795 if (labelled_char_counts[i] <= 1) {
796 tprintf("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" -"
797 " target is %d:\n",
798 labelled_char_counts[i], unicharset_boxes.debug_str(i).string(),
799 tgt_char_counts[i]);
800 }
801 else {
802 rebalance_needed = TRUE;
803 if (applybox_debug > 0)
804 tprintf("APPLY_BOXES: REBALANCE REQD \"%s\" - target of"
805 " %d from %d labelled samples\n",
806 unicharset_boxes.debug_str(i).string(), tgt_char_counts[i],
807 labelled_char_counts[i]);
808 }
809 }
810 if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
811 min_samples = labelled_char_counts[i];
812 *min_uch_id = i;
813 }
814 }
815
816 while (applybox_rebalance && rebalance_needed) {
817 block_it.set_to_list (block_list);
818 for (block_it.mark_cycle_pt ();
819 !block_it.cycled_list (); block_it.forward ()) {
820 row_it.set_to_list (block_it.data ()->row_list ());
821 for (row_it.mark_cycle_pt ();
822 !row_it.cycled_list (); row_it.forward ()) {
823 row = row_it.data ();
824 word_it.set_to_list (row->word_list ());
825 for (word_it.mark_cycle_pt ();
826 !word_it.cycled_list (); word_it.forward ()) {
827 word = word_it.data ();
828 left = word->bounding_box ().left ();
829 if (*word->text () != '\0')
830 uch_id = unicharset_boxes.unichar_to_id(word->text ());
831 else
832 uch_id = -1;
833 at_dupe_of_prev_word = ((row == prev_row) &&
834 (left = prev_left) &&
835 (uch_id == prev_uch_id));
836 if ((uch_id != -1) &&
837 (labelled_char_counts[uch_id] > 1) &&
838 (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&
839 (!at_dupe_of_prev_word)) {
840 /* Duplicate the word to rebalance the labelled samples */
841 if (applybox_debug > 9) {
842 tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));
843 word->bounding_box ().print ();
844 }
845 duplicate_word = new WERD;
846 *duplicate_word = *word;
847 word_it.add_after_then_move (duplicate_word);
848 rebalance_count++;
849 labelled_char_counts[uch_id]++;
850 }
851 prev_row = row;
852 prev_left = left;
853 prev_uch_id = uch_id;
854 }
855 }
856 }
857 rebalance_needed = FALSE;
858 for (i = 0; i < unicharset_boxes.size(); i++) {
859 if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
860 (labelled_char_counts[i] > 1)) {
861 rebalance_needed = TRUE;
862 break;
863 }
864 }
865 }
866
867 /* Now final check - count labeled blobs */
868 final_labelled_blob_count = 0;
869 block_it.set_to_list (block_list);
870 for (block_it.mark_cycle_pt ();
871 !block_it.cycled_list (); block_it.forward ()) {
872 row_it.set_to_list (block_it.data ()->row_list ());
873 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
874 row = row_it.data ();
875 word_it.set_to_list (row->word_list ());
876 word_it.sort (word_comparator);
877 for (word_it.mark_cycle_pt ();
878 !word_it.cycled_list (); word_it.forward ()) {
879 word = word_it.data ();
880 if ((strlen (word->text ()) > 0) &&
881 (word->gblob_list()->length() == 1)) {
882 final_labelled_blob_count++;
883 } else {
884 delete word_it.extract();
885 }
886 }
887 // delete the row if empty
888 if (row->word_list()->empty()) {
889 delete row_it.extract();
890 }
891 }
892 }
893
894 // Clean up.
895 delete[] labelled_char_counts;
896 }
897
898
report_failed_box(inT16 boxfile_lineno,inT16 boxfile_charno,TBOX box,const char * box_ch,const char * err_msg)899 void report_failed_box(inT16 boxfile_lineno,
900 inT16 boxfile_charno,
901 TBOX box,
902 const char *box_ch,
903 const char *err_msg) {
904 if (applybox_debug > 4)
905 tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
906 boxfile_lineno,
907 boxfile_charno,
908 box_ch,
909 box.left (), box.bottom (), box.right (), box.top (), err_msg);
910 }
911
912
/*************************************************************************
 * apply_box_training()
 *
 * For every word carrying a non-empty label and exactly one blob, makes a
 * normalised copy (via make_bln_copy — presumably baseline-normalised;
 * confirm against its definition) and passes the blob to
 * tess_training_tester() as a correct sample, then reports the total.
 *************************************************************************/
void apply_box_training(const STRING& filename, BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  ROW *row;
  WERD_IT word_it;
  WERD *word;
  WERD *bln_word;               //normalised copy of word
  WERD copy_outword;            // copy to denorm
  PBLOB_IT blob_it;
  DENORM denorm;
  inT16 count = 0;              //words trained on
  char unichar[UNICHAR_LEN + 1];

  // Pre-terminate: strncpy below may not write a NUL for max-length text.
  unichar[UNICHAR_LEN] = '\0';
  tprintf ("Generating training data\n");
  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      word_it.set_to_list (row->word_list ());
      for (word_it.mark_cycle_pt ();
           !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        if ((strlen (word->text ()) > 0) &&
            (word->gblob_list ()->length () == 1)) {
          // Here is a word with a single unichar label and a single blob so train on it.
          bln_word = make_bln_copy(word, row, NULL, row->x_height (), &denorm);
          blob_it.set_to_list (bln_word->blob_list ());
          strncpy(unichar, word->text (), UNICHAR_LEN);
          tess_training_tester (filename,
                                blob_it.data (),  //single blob
                                &denorm, TRUE,    //correct
                                unichar,          //correct character
                                strlen(unichar),  //character length
                                NULL);
          // NOTE(review): copy_outword is denormalised and then discarded;
          // purpose unclear from this file — possibly a leftover.
          copy_outword = *(bln_word);
          copy_outword.baseline_denormalise (&denorm);
          blob_it.set_to_list (copy_outword.blob_list ());
          delete bln_word;
          count++;
        }
      }
    }
  }
  tprintf ("Generated training data for %d blobs\n", count);
}
960
961 namespace tesseract {
/**
 * apply_box_testing
 *
 * Evaluate recognition against box-file ground truth: for every word with
 * a single-character label (not in applybox_test_exclusions) and exactly
 * one blob, run the pass-1 segmenter/classifier and tally the result as
 * correct, rejected, or wrong.  Per-character confusion statistics are
 * collected and printed unless SECURE_NAMES is defined.
 *
 * @param block_list page structure (blocks -> rows -> labelled words).
 */
void Tesseract::apply_box_testing(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  ROW *row;
  inT16 row_count = 0;
  WERD_IT word_it;
  WERD *word;
  WERD *bln_word;                  // baseline-normalised copy of the word
  inT16 word_count = 0;
  PBLOB_IT blob_it;
  DENORM denorm;
  inT16 count = 0;                 // total words actually tested
  char ch[2];                      // ground-truth character as a C string
  WERD *outword;                   //bln best choice
  //segmentation
  WERD_CHOICE *best_choice;        //tess output
  WERD_CHOICE *raw_choice;         //top choice permuter
  //detailed results
  BLOB_CHOICE_LIST_CLIST blob_choices;
  inT16 char_count = 0;            // characters tested
  inT16 correct_count = 0;         // exact matches
  inT16 err_count = 0;             // wrong answers
  inT16 rej_count = 0;             // TESS failures (blank/empty output)
#ifndef SECURE_NAMES
  WERDSTATS wordstats;             //As from newdiff
#endif
  char tess_rej_str[3];
  char tess_long_str[3];

  ch[1] = '\0';
  // Sentinel labels used to bucket rejections and over-long answers in the
  // confusion statistics.
  strcpy (tess_rej_str, "|A");
  strcpy (tess_long_str, "|B");

  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      row_count++;
      word_count = 0;
      word_it.set_to_list (row->word_list ());
      for (word_it.mark_cycle_pt ();
           !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        word_count++;
        if ((strlen (word->text ()) == 1) &&
            !STRING (applybox_test_exclusions).contains (*word->text ())
            && (word->gblob_list ()->length () == 1)) {
          // Here is a word with a single char label and a single blob so test it.
          bln_word = make_bln_copy(word, row, NULL, row->x_height (), &denorm);
          blob_it.set_to_list (bln_word->blob_list ());
          ch[0] = *word->text ();
          char_count++;
          // NOTE(review): raw_choice and outword appear to be output
          // parameters populated by tess_segment_pass1 — confirm against
          // its declaration; they are deleted below on every path.
          best_choice = tess_segment_pass1 (bln_word,
                                            &denorm,
                                            &Tesseract::tess_default_matcher,
                                            raw_choice,
                                            &blob_choices, outword);

          /*
            Test for TESS screw up on word. Recog_word has already ensured that the
            choice list, outword blob lists and best_choice string are the same
            length. A TESS screw up is indicated by a blank filled or 0 length string.
          */
          if ((best_choice->length() == 0) ||
              (strspn(best_choice->unichar_string().string(), " ") ==
               best_choice->unichar_string().length())) {
            // Blank or empty answer: count as a rejection.
            rej_count++;
            tprintf ("%d:%d: \"%s\" -> TESS FAILED\n",
                     row_count, word_count, ch);
#ifndef SECURE_NAMES
            wordstats.word (tess_rej_str, 2, ch, 1);
#endif
          }
          else {
            // Sanity check: choices, blobs and string must stay in step.
            if ((best_choice->length() != outword->blob_list()->length()) ||
                (best_choice->length() != blob_choices.length())) {
              tprintf
                ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
                 best_choice->unichar_string().string(),
                 best_choice->length(),
                 outword->blob_list ()->length(),
                 blob_choices.length());
            }
            ASSERT_HOST(best_choice->length() ==
                        outword->blob_list()->length());
            ASSERT_HOST(best_choice->length() == blob_choices.length());
            fix_quotes (best_choice,
                        //turn to double
                        outword, &blob_choices);
            if (strcmp (best_choice->unichar_string().string(), ch) != 0) {
              err_count++;
              tprintf ("%d:%d: \"%s\" -> \"%s\"\n",
                       row_count, word_count, ch,
                       best_choice->unichar_string().string());
            }
            else
              correct_count++;
#ifndef SECURE_NAMES
            // Answers longer than 2 bytes are bucketed under the "|B"
            // sentinel rather than polluting the confusion matrix.
            if (best_choice->unichar_string().length() > 2)
              wordstats.word(tess_long_str, 2, ch, 1);
            else
              wordstats.word(best_choice->unichar_string().string(),
                             best_choice->unichar_string().length(),
                             ch, 1);
#endif
          }
          // Free per-word results before moving on.
          delete bln_word;
          delete outword;
          delete best_choice;
          delete raw_choice;
          blob_choices.deep_clear ();
          count++;
        }
      }
    }
  }
#ifndef SECURE_NAMES
  wordstats.print (1, 100.0);
  wordstats.conf_matrix ();
  tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n",
           char_count, correct_count, rej_count, err_count);
#endif
}
1086
1087 } // namespace tesseract
1088