1 /**********************************************************************
2  * File:        applybox.cpp  (Formerly applybox.c)
3  * Description: Re-segment rows according to box file data
4  * Author:      Phil Cheatle
5  * Created:     Wed Nov 24 09:11:23 GMT 1993
6  *
7  * (C) Copyright 1993, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 #include "mfcpch.h"
20 
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 
26 #ifdef HAVE_LIBLEPT
27 // Include leptonica library only if autoconf (or makefile etc) tell us to.
28 #include "allheaders.h"
29 #endif
30 
31 #include "applybox.h"
32 #include <ctype.h>
33 #include <string.h>
34 #ifdef __UNIX__
35 #include <assert.h>
36 #include <errno.h>
37 #endif
38 #include "boxread.h"
39 #include "control.h"
40 #include "genblob.h"
41 #include "globals.h"
42 #include "fixxht.h"
43 #include "mainblk.h"
44 #include "matchdefs.h"
45 #include "secname.h"
46 #include "tessbox.h"
47 #include "unichar.h"
48 #include "unicharset.h"
49 #include "matchdefs.h"
50 #include "tesseractclass.h"
51 
52 #define SECURE_NAMES
53 #ifndef SECURE_NAMES
54 #include          "wordstats.h"
55 #endif
56 
57 #define EXTERN
58 EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
59 EXTERN INT_VAR (applybox_debug, 5, "Debug level");
60 EXTERN INT_VAR (applybox_page, 0, "Page number to apply boxes from");
61 EXTERN STRING_VAR (applybox_test_exclusions, "",
62                    "Chars ignored for testing");
63 EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");
64 
65 EXTERN STRING_VAR(exposure_pattern, ".exp",
66                   "Exposure value follows this pattern in the image"
67                   " filename. The names of the image files are expected"
68                   " to be in the form [lang].[fontname].exp[num].tif");
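// Illustrative note (the filename below is hypothetical, not taken from the
// sources): with the default exposure_pattern ".exp", an image named
// "eng.arial.exp-2.tif" leaves "-2.tif" after the pattern, which strtol()
// parses as -2; a negative exposure switches apply_boxes() into its
// low-exposure, fragment-learning mode.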
69 
70 EXTERN BOOL_VAR(learn_chars_and_char_frags_mode, FALSE,
71                 "Learn both character fragments (as is done in the"
72                 " special low exposure mode) as well as unfragmented"
73                 " characters.");
74 
75 extern IMAGE page_image;
76 
77 // The unicharset used during box training
78 static UNICHARSET unicharset_boxes;
79 
80 /*************************************************************************
81  * The code re-assigns outlines to form words each with ONE labelled blob.
82  * Noise is left in UNLABELLED words. The chars on the page are checked crudely
83  * for sensible position relative to baseline and xht. Failed boxes are
84  * compensated for by duplicating other believable instances of the character.
85  *
86  * The box file is assumed to contain box definitions, one per line, of the
87  * following format:
88  *   <Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused
89  *
90  * The approach taken is to search the WHOLE page for stuff overlapping each box.
91  *  - This is not too inefficient and is SAFE.
92  *    - We can detect overlapping blobs as we will be attempting to put a blob
93  *      from a LABELLED word into the current word.
94  *    - When all the boxes have been processed we can detect any stuff which is
95  *      being ignored - it is the unlabelled words left on the page.
96  *
97  * A box should only overlap one row.
98  *
99  * A warning is given if the box is on the same row as the previous box, but NOT
100  * on the same row as the previous blob.
101  *
102  * Any OUTLINE which overlaps the box is put into the new word.
103  *
104  * ascender chars must ascend above xht significantly
105  * xht chars must not rise above row xht significantly
106  * bl chars must not descend below baseline significantly
107  * descender chars must descend below baseline significantly
108  *
109  * ?? Certain chars are DROPPED - to limit the training data.
110  *
111  *************************************************************************/
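/*
 * Illustrative box file line (the coordinates are made up):
 *
 *   T 36 92 53 113
 *
 * i.e. the glyph "T" with left=36, bottom=92, right=53, top=113, measured in
 * pixels from the bottom-left corner of the page image. Anything after the
 * <top> field is ignored by this reader.
 */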
112 namespace tesseract {
113 void Tesseract::apply_boxes(const STRING& fname,
114                             BLOCK_LIST *block_list    //real blocks
115                            ) {
116   inT16 boxfile_lineno = 0;
117   inT16 boxfile_charno = 0;
118   TBOX box;                       //boxfile box
119   UNICHAR_ID uch_id;             //correct ch from boxfile
120   ROW *row;
121   ROW *prev_row = NULL;
122   inT16 prev_box_right = MAX_INT16;
123   inT16 block_id;
124   inT16 row_id;
125   inT16 box_count = 0;
126   inT16 box_failures = 0;
127   inT16 labels_ok;
128   inT16 rows_ok;
129   inT16 bad_blobs;
130   inT16 *tgt_char_counts = NULL; // No. of box samples
131   inT16 i;
132   inT16 rebalance_count = 0;
133   UNICHAR_ID min_uch_id = INVALID_UNICHAR_ID;
134   inT16 min_samples;
135   inT16 final_labelled_blob_count;
136   bool low_exposure = false;
137 
138   // Clean the unichar set
139   unicharset_boxes.clear();
140   // Space character needed to represent NIL classification
141   unicharset_boxes.unichar_insert(" ");
142 
143   // Figure out whether this image file's exposure is less than 1, in which
144   // case when learning we will only pay attention to character fragments.
145   const char *ptr = strstr(imagefile.string(), exposure_pattern.string());
146   if (ptr != NULL &&
147       strtol(ptr += strlen(exposure_pattern.string()), NULL, 10) < 0) {
148     low_exposure = true;
149   }
150 
151   FILE* box_file;
152   STRING filename = fname;
153   const char *lastdot;           //of name
154 
155   lastdot = strrchr (filename.string (), '.');
156   if (lastdot != NULL)
157     filename[lastdot - filename.string()] = '\0';
158 
159   filename += ".box";
160   if (!(box_file = fopen (filename.string(), "r"))) {
161     CANTOPENFILE.error ("read_next_box", EXIT,
162       "Can't open box file %s %d",
163       filename.string(), errno);
164   }
165 
166   tgt_char_counts = new inT16[MAX_NUM_CLASSES];
167   for (i = 0; i < MAX_NUM_CLASSES; i++)
168     tgt_char_counts[i] = 0;
169 
170   clear_any_old_text(block_list);
171   while (read_next_box(applybox_page, box_file, &box, &uch_id)) {
172     box_count++;
173     if (!low_exposure || learn_chars_and_char_frags_mode) {
174       tgt_char_counts[uch_id]++;
175     }
176     row = find_row_of_box (block_list, box, block_id, row_id);
177     if (box.left () < prev_box_right) {
178       boxfile_lineno++;
179       boxfile_charno = 1;
180     }
181     else
182       boxfile_charno++;
183 
184     if (row == NULL) {
185       box_failures++;
186       report_failed_box (boxfile_lineno, boxfile_charno, box,
187                          unicharset_boxes.id_to_unichar(uch_id),
188         "FAILURE! box overlaps no blobs or blobs in multiple rows");
189     }
190     else {
191       if ((box.left () >= prev_box_right) && (row != prev_row))
192         report_failed_box (boxfile_lineno, boxfile_charno, box,
193                            unicharset_boxes.id_to_unichar(uch_id),
194           "WARNING! false row break");
195       box_failures += resegment_box (row, box, uch_id, block_id, row_id,
196         boxfile_lineno, boxfile_charno, tgt_char_counts, low_exposure, true);
197       prev_row = row;
198     }
199     prev_box_right = box.right ();
200   }
201   tidy_up(block_list,
202           labels_ok,
203           rows_ok,
204           bad_blobs,
205           tgt_char_counts,
206           rebalance_count,
207           &min_uch_id,
208           min_samples,
209           final_labelled_blob_count,
210           low_exposure,
211           true);
212   tprintf ("APPLY_BOXES:\n");
213   tprintf ("   Boxes read from boxfile:  %6d\n", box_count);
214   tprintf ("   Initially labelled blobs: %6d in %d rows\n",
215     labels_ok, rows_ok);
216   tprintf ("   Box failures detected:       %6d\n", box_failures);
217   tprintf ("   Duped blobs for rebalance:%6d\n", rebalance_count);
218   tprintf ("   \"%s\" has fewest samples:%6d\n",
219            unicharset_boxes.id_to_unichar(min_uch_id), min_samples);
220   tprintf ("                Total unlabelled words:   %6d\n",
221     bad_blobs);
222   tprintf ("                Final labelled words:     %6d\n",
223     final_labelled_blob_count);
224 
225   // Clean up.
226   delete[] tgt_char_counts;
227 }
228 
229 int Tesseract::Boxes2BlockList(int box_cnt, TBOX *boxes,
230                                BLOCK_LIST *block_list,
231                                bool right2left) {
232   inT16 boxfile_lineno = 0;
233   inT16 boxfile_charno = 0;
234   TBOX box;
235   ROW *row;
236   ROW *prev_row = NULL;
237   inT16 prev_box_right = MAX_INT16;
238   inT16 prev_box_left = 0;
239   inT16 block_id;
240   inT16 row_id;
241   inT16 box_failures = 0;
242   inT16 labels_ok;
243   inT16 rows_ok;
244   inT16 bad_blobs;
245   inT16 rebalance_count = 0;
246   UNICHAR_ID min_uch_id;
247   inT16 min_samples;
248   inT16 final_labelled_blob_count;
249 
250   clear_any_old_text(block_list);
251   for (int box_idx = 0; box_idx < box_cnt; box_idx++) {
252     box = boxes[box_idx];
253 
254     row = find_row_of_box(block_list, box, block_id, row_id);
255     // check for a new row
256     if ((right2left && box.right () > prev_box_left) ||
257         (!right2left && box.left () < prev_box_right)) {
258       boxfile_lineno++;
259       boxfile_charno = 1;
260     }
261     else {
262       boxfile_charno++;
263     }
264 
265     if (row == NULL) {
266       box_failures++;
267     }
268     else {
269       box_failures += resegment_box(row, box, 0, block_id, row_id,
270                                     boxfile_lineno, boxfile_charno,
271                                     NULL, false, false);
272       prev_row = row;
273     }
274     prev_box_right = box.right ();
275     prev_box_left = box.left ();
276   }
277 
278   tidy_up(block_list, labels_ok, rows_ok, bad_blobs, NULL,
279           rebalance_count, &min_uch_id, min_samples, final_labelled_blob_count,
280           false, false);
281 
282   return box_failures;
283 }
284 
285 }  // namespace tesseract
286 
287 
288 void clear_any_old_text(                        //remove correct text
289                         BLOCK_LIST *block_list  //real blocks
290                        ) {
291   BLOCK_IT block_it(block_list);
292   ROW_IT row_it;
293   WERD_IT word_it;
294 
295   for (block_it.mark_cycle_pt ();
296   !block_it.cycled_list (); block_it.forward ()) {
297     row_it.set_to_list (block_it.data ()->row_list ());
298     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
299       word_it.set_to_list (row_it.data ()->word_list ());
300       for (word_it.mark_cycle_pt ();
301       !word_it.cycled_list (); word_it.forward ()) {
302         word_it.data ()->set_text ("");
303       }
304     }
305   }
306 }
307 
308 UNICHAR_ID register_char(const char *uch) {
309   if (!unicharset_boxes.contains_unichar(uch)) {
310     unicharset_boxes.unichar_insert(uch);
311     if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
312       tprintf("Error: Size of unicharset of boxes is "
313               "greater than MAX_NUM_CLASSES (%d)\n", MAX_NUM_CLASSES);
314       exit(1);
315     }
316   }
317   return unicharset_boxes.unichar_to_id(uch);
318 }
319 
320 BOOL8 read_next_box(int page,
321                     FILE* box_file,
322                     TBOX *box,
323                     UNICHAR_ID *uch_id) {
324   int x_min;
325   int y_min;
326   int x_max;
327   int y_max;
328   char uch[kBoxReadBufSize];
329 
330   if (read_next_box(page, box_file, uch, &x_min, &y_min, &x_max, &y_max)) {
331     *uch_id = register_char(uch);
332     *box = TBOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
333     return TRUE;  // read a box ok
334   } else {
335     return FALSE;  // EOF
336   }
337 }
338 
339 
340 ROW *find_row_of_box(                         //
341                      BLOCK_LIST *block_list,  //real blocks
342                      const TBOX &box,                 //from boxfile
343                      inT16 &block_id,
344                      inT16 &row_id_to_process) {
345   BLOCK_IT block_it(block_list);
346   BLOCK *block;
347   ROW_IT row_it;
348   ROW *row;
349   ROW *row_to_process = NULL;
350   inT16 row_id;
351   WERD_IT word_it;
352   WERD *word;
353   BOOL8 polyg;
354   PBLOB_IT blob_it;
355   PBLOB *blob;
356   OUTLINE_IT outline_it;
357   OUTLINE *outline;
358 
359   /*
360     Find row to process - error if box REALLY overlaps more than one row. (I.e.
361     it overlaps blobs in the row - not just overlaps the bounding box of the
362     whole row.)
363   */
364 
365   block_id = 0;
366   for (block_it.mark_cycle_pt ();
367   !block_it.cycled_list (); block_it.forward ()) {
368     block_id++;
369     row_id = 0;
370     block = block_it.data ();
371     if (block->bounding_box ().overlap (box)) {
372       row_it.set_to_list (block->row_list ());
373       for (row_it.mark_cycle_pt ();
374       !row_it.cycled_list (); row_it.forward ()) {
375         row_id++;
376         row = row_it.data ();
377         if (row->bounding_box ().overlap (box)) {
378           word_it.set_to_list (row->word_list ());
379           for (word_it.mark_cycle_pt ();
380           !word_it.cycled_list (); word_it.forward ()) {
381             word = word_it.data ();
382             polyg = word->flag (W_POLYGON);
383             if (word->bounding_box ().overlap (box)) {
384               blob_it.set_to_list (word->gblob_list ());
385               for (blob_it.mark_cycle_pt ();
386               !blob_it.cycled_list (); blob_it.forward ()) {
387                 blob = blob_it.data ();
388                 if (gblob_bounding_box (blob, polyg).
389                 overlap (box)) {
390                   outline_it.
391                     set_to_list (gblob_out_list
392                     (blob, polyg));
393                   for (outline_it.mark_cycle_pt ();
394                     !outline_it.cycled_list ();
395                   outline_it.forward ()) {
396                     outline = outline_it.data ();
397                     if (goutline_bounding_box
398                     (outline, polyg).major_overlap (box)) {
399                       if ((row_to_process == NULL) ||
400                       (row_to_process == row)) {
401                         row_to_process = row;
402                         row_id_to_process = row_id;
403                       }
404                       else
405                         /* RETURN ERROR Box overlaps blobs in more than one row  */
406                         return NULL;
407                     }
408                   }
409                 }
410               }
411             }
412           }
413         }
414       }
415     }
416   }
417   return row_to_process;
418 }
419 
420 
421 inT16 resegment_box(  //
422                     ROW *row,
423                     TBOX &box,
424                     UNICHAR_ID uch_id,
425                     inT16 block_id,
426                     inT16 row_id,
427                     inT16 boxfile_lineno,
428                     inT16 boxfile_charno,
429                     inT16 *tgt_char_counts,
430                     bool learn_char_fragments,
431                     bool learning) {
432   WERD_LIST new_word_list;
433   WERD_IT word_it;
434   WERD_IT new_word_it(&new_word_list);
435   WERD *word = NULL;
436   WERD *new_word = NULL;
437   BOOL8 polyg = false;
438   PBLOB_IT blob_it;
439   PBLOB_IT new_blob_it;
440   PBLOB *blob;
441   PBLOB *new_blob;
442   OUTLINE_IT outline_it;
443   OUTLINE_LIST dummy;  // Just to initialize new_outline_it.
444   OUTLINE_IT new_outline_it = &dummy;
445   OUTLINE *outline;
446   TBOX new_word_box;
447   TBOX curr_outline_box;
448   TBOX prev_outline_box;
449   float word_x_centre;
450   float baseline;
451   inT16 error_count = 0;         //number of chars lost
452   STRING label;
453   UNICHAR_ID fragment_uch_id;
454   int fragment_index;
455   int new_word_it_len;
456 
457   if (learning && applybox_debug > 6) {
458     tprintf("\nAPPLY_BOX: in resegment_box() for %s(%d)\n",
459             unicharset_boxes.id_to_unichar(uch_id), uch_id);
460   }
461   word_it.set_to_list (row->word_list ());
462   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
463     word = word_it.data ();
464     polyg = word->flag (W_POLYGON);
465     if (word->bounding_box ().overlap (box)) {
466       blob_it.set_to_list (word->gblob_list ());
467       prev_outline_box = TBOX();  // clear prev_outline_box
468       curr_outline_box = TBOX();  // clear curr_outline_box
469       for (blob_it.mark_cycle_pt ();
470       !blob_it.cycled_list (); blob_it.forward ()) {
471         blob = blob_it.data ();
472         if (gblob_bounding_box (blob, polyg).overlap (box)) {
473           outline_it.set_to_list (gblob_out_list (blob, polyg));
474           for (outline_it.mark_cycle_pt ();
475           !outline_it.cycled_list (); outline_it.forward ()) {
476             outline = outline_it.data ();
477             prev_outline_box += curr_outline_box;
478             curr_outline_box = goutline_bounding_box(outline, polyg);
479             if (curr_outline_box.major_overlap (box)) {
480               if (strlen (word->text ()) > 0) {
481                 if (error_count == 0) {
482                   error_count = 1;
483                   if (learning && applybox_debug > 4)
484                     report_failed_box (boxfile_lineno,
485                       boxfile_charno,
486                       box, unicharset_boxes.id_to_unichar(uch_id),
487                       "FAILURE! box overlaps blob in labelled word");
488                 }
489                 if (learning && applybox_debug > 4)
490                   tprintf ("APPLY_BOXES: ALSO ignoring corrupted char"
491                            " blk:%d row:%d \"%s\"\n",
492                            block_id, row_id, word_it.data()->text());
493                 word_it.data ()->set_text ("");  // UN label it
494                 error_count++;
495               }
496               // Do not learn from fragments of characters that are broken
497               // into very small pieces to avoid picking up noise.
498               if ((learn_char_fragments || learn_chars_and_char_frags_mode) &&
499                   ((C_OUTLINE *)outline)->area() < kMinFragmentOutlineArea) {
500                 if (applybox_debug > 6) {
501                   tprintf("APPLY_BOX: fragment outline area %d is too small"
502                           " - not recording fragments of this character\n",
503                           ((C_OUTLINE *)outline)->area());
504                 }
505                 error_count++;
506               }
507 
508               if (error_count == 0) {
509                 if (applybox_debug > 6 ) {
510                   tprintf("APPLY_BOX: Previous ");
511                   prev_outline_box.print();
512                   tprintf("APPLY_BOX: Current area: %d ",
513                           ((C_OUTLINE *)outline)->area());
514                   curr_outline_box.print();
515                 }
516                 // When learning character fragments is enabled, we put
517                 // outlines that do not overlap on x axis in separate WERDs.
518                 bool start_new_word =
519                     (learn_char_fragments || learn_chars_and_char_frags_mode) &&
520                   !curr_outline_box.major_x_overlap(prev_outline_box);
521                 if (new_word == NULL || start_new_word) {
522                   if (new_word != NULL) {  // add prev new_word to new_word_list
523                     new_word_it.add_to_end(new_word);
524                   }
525                   // Make a new word with a single blob.
526                   new_word = word->shallow_copy();
527                   new_word->set_flag(W_FUZZY_NON, false);
528                   new_word->set_flag(W_FUZZY_SP, false);
529                   if (polyg){
530                     new_blob = new PBLOB;
531                   } else {
532                     new_blob = (PBLOB *) new C_BLOB;
533                   }
534                   new_blob_it.set_to_list(new_word->gblob_list());
535                   new_blob_it.add_to_end(new_blob);
536                   new_outline_it.set_to_list(
537                       gblob_out_list(new_blob, polyg));
538                 }
539                 new_outline_it.add_to_end(outline_it.extract());  // move blob
540               }
541             }
542           }
543           if (outline_it.empty())      // no outlines in blob
544             delete blob_it.extract();  // so delete blob
545         }
546       }
547       if (blob_it.empty())         // no blobs in word
548         delete word_it.extract();  // so delete word
549     }
550   }
551   if (new_word != NULL) {  // add prev new_word to new_word_list
552     new_word_it.add_to_end(new_word);
553   }
554   new_word_it_len = new_word_it.length();
555 
556   // Check for failures.
557   if (error_count > 0)
558     return error_count;
559   if (learning && new_word_it_len <= 0) {
560     report_failed_box(boxfile_lineno, boxfile_charno, box,
561                       unicharset_boxes.id_to_unichar(uch_id),
562                       "FAILURE! Couldn't find any blobs");
563     return 1;  // failure
564   }
565 
566   if (learning && new_word_it_len > CHAR_FRAGMENT::kMaxChunks) {
567     tprintf("APPLY_BOXES: too many fragments (%d) for char %s\n",
568             new_word_it_len, unicharset_boxes.id_to_unichar(uch_id));
569     return 1;  // failure
570   }
571 
572   // Add labelled character or character fragments to the word list.
573   fragment_index = 0;
574   new_word_it.move_to_first();
575   for (new_word_it.mark_cycle_pt(); !new_word_it.cycled_list();
576        new_word_it.forward()) {
577     new_word = new_word_it.extract();
578     if (new_word_it_len > 1) {  // deal with a fragment
579       if (learning) {
580         label = CHAR_FRAGMENT::to_string(unicharset_boxes.id_to_unichar(uch_id),
581                                          fragment_index, new_word_it_len);
582         fragment_uch_id = register_char(label.string());
583         new_word->set_text(label.string());
584         ++fragment_index;
585         // For now we cheat by setting the expected number of char fragments
586         // to the number of char fragments actually parsed and labelled.
587         // TODO(daria): find out whether this can be improved.
588         tgt_char_counts[fragment_uch_id]++;
589       } else {
590         // No learning involved. Just stick a place-holder string.
591         new_word->set_text("*");
592       }
593       if (applybox_debug > 5) {
594         tprintf("APPLY_BOX: adding char fragment %s\n", label.string());
595       }
596     } else {  // deal with a regular character
597       if (learning) {
598         if (!learn_char_fragments || learn_chars_and_char_frags_mode) {
599           new_word->set_text(unicharset_boxes.id_to_unichar(uch_id));
600         } else {
601           // not interested in non-fragmented chars if learning fragments, so
602           // unlabel it.
603           new_word->set_text("");
604         }
605       } else {
606         // No learning involved here. Just stick a place holder string
607         new_word->set_text("*");
608       }
609     }
610     gblob_sort_list(new_word->gblob_list(), polyg);
611     word_it.add_to_end(new_word);
612     new_word_box = new_word->bounding_box();
613     word_x_centre = (new_word_box.left() + new_word_box.right()) / 2.0f;
614     baseline = row->base_line(word_x_centre);
615   }
616 
617   // All done. Now check if the EOL, BOL flags are set correctly.
618   word_it.move_to_first();
619   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
620     word = word_it.data();
621     word->set_flag(W_BOL, false);
622     word->set_flag(W_EOL, false);
623   }
624   word->set_flag(W_EOL, true);
625   word_it.move_to_first();
626   word_it.data()->set_flag(W_BOL, true);
627   return 0;  //success
628 
629 #if 0
630     if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) {
631       if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
632           (new_word_box.top () <
633            baseline + (1 + applybox_error_band) * row->x_height ())) {
634         report_failed_box (boxfile_lineno, boxfile_charno, box,
635                            unicharset_boxes.id_to_unichar(uch_id),
636                            "FAILURE! caps-ht char didn't ascend");
637         new_word->set_text ("");
638         return 1;
639       }
640       if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
641           (new_word_box.top () <
642            baseline + (1 - applybox_error_band) * row->x_height ())) {
643         report_failed_box (boxfile_lineno, boxfile_charno, box,
644                            unicharset_boxes.id_to_unichar(uch_id),
645                            "FAILURE! Odd top char below xht");
646         new_word->set_text ("");
647         return 1;
648       }
649       if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
650           ((new_word_box.top () >
651             baseline + (1 + applybox_error_band) * row->x_height ()) ||
652            (new_word_box.top () <
653             baseline + (1 - applybox_error_band) * row->x_height ()))) {
654         report_failed_box (boxfile_lineno, boxfile_charno, box,
655                            unicharset_boxes.id_to_unichar(uch_id),
656                            "FAILURE! x-ht char didn't have top near xht");
657         new_word->set_text ("");
658         return 1;
659       }
660       if (STRING (chs_non_ambig_bl).contains
661           (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
662           ((new_word_box.bottom () <
663             baseline - applybox_error_band * row->x_height ()) ||
664            (new_word_box.bottom () >
665             baseline + applybox_error_band * row->x_height ()))) {
666         report_failed_box (boxfile_lineno, boxfile_charno, box,
667                            unicharset_boxes.id_to_unichar(uch_id),
668                            "FAILURE! non ambig BL char didnt have bottom near baseline");
669         new_word->set_text ("");
670         return 1;
671       }
672       if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
673           (new_word_box.bottom () >
674            baseline + applybox_error_band * row->x_height ())) {
675         report_failed_box (boxfile_lineno, boxfile_charno, box,
676                            unicharset_boxes.id_to_unichar(uch_id),
677                            "FAILURE! Odd bottom char above baseline");
678         new_word->set_text ("");
679         return 1;
680       }
681       if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
682           (new_word_box.bottom () >
683            baseline - applybox_error_band * row->x_height ())) {
684         report_failed_box (boxfile_lineno, boxfile_charno, box,
685                            unicharset_boxes.id_to_unichar(uch_id),
686         "FAILURE! Descender doesn't descend");
687         new_word->set_text ("");
688         return 1;
689       }
690     }
691 #endif
692 }
693 
694 
695 /*************************************************************************
696  * tidy_up()
697  *   - report >1 block
698  *   - sort the words in each row.
699  *   - report any rows with no labelled words.
700  *   - report any remaining unlabelled words
701  *   - report total labelled words
702  *
703  *************************************************************************/
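/*
 * Worked example of the rebalancing step (the counts are hypothetical): if
 * the box file supplied 5 boxes for "Q" (tgt_char_counts) but only 3 blobs
 * end up labelled "Q" (labelled_char_counts), and applybox_rebalance is TRUE,
 * believable labelled "Q" words are duplicated until the labelled count
 * reaches the target. Characters left with 0 or 1 labelled samples are never
 * duplicated; a FATALITY message is reported for them instead.
 */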
704 void tidy_up(                         //
705              BLOCK_LIST *block_list,  //real blocks
706              inT16 &ok_char_count,
707              inT16 &ok_row_count,
708              inT16 &unlabelled_words,
709              inT16 *tgt_char_counts,
710              inT16 &rebalance_count,
711              UNICHAR_ID *min_uch_id,
712              inT16 &min_samples,
713              inT16 &final_labelled_blob_count,
714              bool learn_character_fragments,
715              bool learning) {
716   BLOCK_IT block_it(block_list);
717   ROW_IT row_it;
718   ROW *row;
719   WERD_IT word_it;
720   WERD *word;
721   WERD *duplicate_word;
722   inT16 block_idx = 0;
723   inT16 row_idx;
724   inT16 all_row_idx = 0;
725   BOOL8 row_ok;
726   BOOL8 rebalance_needed = FALSE;
727   inT16 *labelled_char_counts = NULL;  // num unique labelled samples
728   inT16 i;
729   UNICHAR_ID uch_id;
730   UNICHAR_ID prev_uch_id = -1;
731   BOOL8 at_dupe_of_prev_word;
732   ROW *prev_row = NULL;
733   inT16 left;
734   inT16 prev_left = -1;
735 
736   labelled_char_counts = new inT16[MAX_NUM_CLASSES];
737   for (i = 0; i < MAX_NUM_CLASSES; i++)
738     labelled_char_counts[i] = 0;
739 
740   ok_char_count = 0;
741   ok_row_count = 0;
742   unlabelled_words = 0;
743   if (learning && (applybox_debug > 4) && (block_it.length () != 1)) {
744     if (block_it.length() > 1) {
745       tprintf ("APPLY_BOXES: More than one block??\n");
746     } else {
747       tprintf("APPLY_BOXES: No blocks identified.\n");
748     }
749   }
750 
751   for (block_it.mark_cycle_pt ();
752   !block_it.cycled_list (); block_it.forward ()) {
753     block_idx++;
754     row_idx = 0;
755     row_ok = FALSE;
756     row_it.set_to_list (block_it.data ()->row_list ());
757     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
758       row_idx++;
759       all_row_idx++;
760       row = row_it.data ();
761       word_it.set_to_list (row->word_list ());
762       word_it.sort (word_comparator);
763       for (word_it.mark_cycle_pt ();
764       !word_it.cycled_list (); word_it.forward ()) {
765         word = word_it.data ();
766         if (strlen (word->text ()) == 0 ||
767             unicharset_boxes.unichar_to_id(word->text()) < 0) {
768           unlabelled_words++;
769           if (learning && applybox_debug > 4 && !learn_character_fragments) {
770             tprintf ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n",
771                      block_idx, row_idx, all_row_idx);
772           }
773         } else {
774           if (word->gblob_list ()->length () != 1)
775             tprintf ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d"
776                      " row:%d allrows:%d\n", block_idx, row_idx, all_row_idx);
777 
778           ok_char_count++;
779           ++labelled_char_counts[unicharset_boxes.unichar_to_id(word->text())];
780           row_ok = TRUE;
781         }
782       }
783       if ((applybox_debug > 6) && (!row_ok)) {
784         tprintf("APPLY_BOXES: Row with no labelled words blk:%d row:%d"
785                 " allrows:%d\n", block_idx, row_idx, all_row_idx);
786       }
787       else
788         ok_row_count++;
789     }
790   }
791 
792   min_samples = 9999;
793   for (i = 0; i < unicharset_boxes.size(); i++) {
794     if (tgt_char_counts[i] > labelled_char_counts[i]) {
795       if (labelled_char_counts[i] <= 1) {
796         tprintf("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" -"
797                 " target is %d:\n",
798                 labelled_char_counts[i], unicharset_boxes.debug_str(i).string(),
799                 tgt_char_counts[i]);
800       }
801       else {
802         rebalance_needed = TRUE;
803         if (applybox_debug > 0)
804           tprintf("APPLY_BOXES: REBALANCE REQD \"%s\" - target of"
805                   " %d from %d labelled samples\n",
806                   unicharset_boxes.debug_str(i).string(), tgt_char_counts[i],
807                   labelled_char_counts[i]);
808       }
809     }
810     if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
811       min_samples = labelled_char_counts[i];
812       *min_uch_id = i;
813     }
814   }
815 
816   while (applybox_rebalance && rebalance_needed) {
817     block_it.set_to_list (block_list);
818     for (block_it.mark_cycle_pt ();
819     !block_it.cycled_list (); block_it.forward ()) {
820       row_it.set_to_list (block_it.data ()->row_list ());
821       for (row_it.mark_cycle_pt ();
822       !row_it.cycled_list (); row_it.forward ()) {
823         row = row_it.data ();
824         word_it.set_to_list (row->word_list ());
825         for (word_it.mark_cycle_pt ();
826         !word_it.cycled_list (); word_it.forward ()) {
827           word = word_it.data ();
828           left = word->bounding_box ().left ();
829           if (*word->text () != '\0')
830             uch_id = unicharset_boxes.unichar_to_id(word->text ());
831           else
832             uch_id = -1;
833           at_dupe_of_prev_word = ((row == prev_row) &&
834             (left == prev_left) &&
835             (uch_id == prev_uch_id));
836           if ((uch_id != -1) &&
837             (labelled_char_counts[uch_id] > 1) &&
838             (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&
839           (!at_dupe_of_prev_word)) {
840             /* Duplicate the word to rebalance the labelled samples */
841             if (applybox_debug > 9) {
842               tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));
843               word->bounding_box ().print ();
844             }
845             duplicate_word = new WERD;
846             *duplicate_word = *word;
847             word_it.add_after_then_move (duplicate_word);
848             rebalance_count++;
849             labelled_char_counts[uch_id]++;
850           }
851           prev_row = row;
852           prev_left = left;
853           prev_uch_id = uch_id;
854         }
855       }
856     }
857     rebalance_needed = FALSE;
858     for (i = 0; i < unicharset_boxes.size(); i++) {
859       if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
860       (labelled_char_counts[i] > 1)) {
861         rebalance_needed = TRUE;
862         break;
863       }
864     }
865   }
866 
867   /* Now final check - count labeled blobs */
868   final_labelled_blob_count = 0;
869   block_it.set_to_list (block_list);
870   for (block_it.mark_cycle_pt ();
871   !block_it.cycled_list (); block_it.forward ()) {
872     row_it.set_to_list (block_it.data ()->row_list ());
873     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
874       row = row_it.data ();
875       word_it.set_to_list (row->word_list ());
876       word_it.sort (word_comparator);
877       for (word_it.mark_cycle_pt ();
878       !word_it.cycled_list (); word_it.forward ()) {
879         word = word_it.data ();
880         if ((strlen (word->text ()) > 0) &&
881             (word->gblob_list()->length() == 1)) {
882           final_labelled_blob_count++;
883         } else {
884           delete word_it.extract();
885         }
886       }
887       // delete the row if empty
888       if (row->word_list()->empty()) {
889         delete row_it.extract();
890       }
891     }
892   }
893 
894   // Clean up.
895   delete[] labelled_char_counts;
896 }
897 
898 
899 void report_failed_box(inT16 boxfile_lineno,
900                        inT16 boxfile_charno,
901                        TBOX box,
902                        const char *box_ch,
903                        const char *err_msg) {
904   if (applybox_debug > 4)
905     tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
906       boxfile_lineno,
907       boxfile_charno,
908       box_ch,
909       box.left (), box.bottom (), box.right (), box.top (), err_msg);
910 }
911 
912 
913 void apply_box_training(const STRING& filename, BLOCK_LIST *block_list) {
914   BLOCK_IT block_it(block_list);
915   ROW_IT row_it;
916   ROW *row;
917   WERD_IT word_it;
918   WERD *word;
919   WERD *bln_word;
920   WERD copy_outword;             // copy to denorm
921   PBLOB_IT blob_it;
922   DENORM denorm;
923   inT16 count = 0;
924   char unichar[UNICHAR_LEN + 1];
925 
926   unichar[UNICHAR_LEN] = '\0';
927   tprintf ("Generating training data\n");
928   for (block_it.mark_cycle_pt ();
929   !block_it.cycled_list (); block_it.forward ()) {
930     row_it.set_to_list (block_it.data ()->row_list ());
931     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
932       row = row_it.data ();
933       word_it.set_to_list (row->word_list ());
934       for (word_it.mark_cycle_pt ();
935       !word_it.cycled_list (); word_it.forward ()) {
936         word = word_it.data ();
937         if ((strlen (word->text ()) > 0) &&
938         (word->gblob_list ()->length () == 1)) {
939           // Here is a word with a single unichar label and a single blob so train on it.
940           bln_word = make_bln_copy(word, row, NULL, row->x_height (), &denorm);
941           blob_it.set_to_list (bln_word->blob_list ());
942           strncpy(unichar, word->text (), UNICHAR_LEN);
943           tess_training_tester (filename,
944             blob_it.data (),     //single blob
945             &denorm, TRUE,       //correct
946             unichar,             //correct character
947             strlen(unichar),     //character length
948             NULL);
949           copy_outword = *(bln_word);
950           copy_outword.baseline_denormalise (&denorm);
951           blob_it.set_to_list (copy_outword.blob_list ());
952           delete bln_word;
953           count++;
954         }
955       }
956     }
957   }
958   tprintf ("Generated training data for %d blobs\n", count);
959 }
960 
961 namespace tesseract {
962 void Tesseract::apply_box_testing(BLOCK_LIST *block_list) {
963   BLOCK_IT block_it(block_list);
964   ROW_IT row_it;
965   ROW *row;
966   inT16 row_count = 0;
967   WERD_IT word_it;
968   WERD *word;
969   WERD *bln_word;
970   inT16 word_count = 0;
971   PBLOB_IT blob_it;
972   DENORM denorm;
973   inT16 count = 0;
974   char ch[2];
975   WERD *outword;                 //bln best choice
976   //segmentation
977   WERD_CHOICE *best_choice;      //tess output
978   WERD_CHOICE *raw_choice;       //top choice permuter
979                                  //detailed results
980   BLOB_CHOICE_LIST_CLIST blob_choices;
981   inT16 char_count = 0;
982   inT16 correct_count = 0;
983   inT16 err_count = 0;
984   inT16 rej_count = 0;
985   #ifndef SECURE_NAMES
986   WERDSTATS wordstats;           //As from newdiff
987   #endif
988   char tess_rej_str[3];
989   char tess_long_str[3];
990 
991   ch[1] = '\0';
992   strcpy (tess_rej_str, "|A");
993   strcpy (tess_long_str, "|B");
994 
995   for (block_it.mark_cycle_pt ();
996   !block_it.cycled_list (); block_it.forward ()) {
997     row_it.set_to_list (block_it.data ()->row_list ());
998     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
999       row = row_it.data ();
1000       row_count++;
1001       word_count = 0;
1002       word_it.set_to_list (row->word_list ());
1003       for (word_it.mark_cycle_pt ();
1004       !word_it.cycled_list (); word_it.forward ()) {
1005         word = word_it.data ();
1006         word_count++;
1007         if ((strlen (word->text ()) == 1) &&
1008           !STRING (applybox_test_exclusions).contains (*word->text ())
1009         && (word->gblob_list ()->length () == 1)) {
1010           // Here is a word with a single char label and a single blob so test it.
1011           bln_word = make_bln_copy(word, row, NULL, row->x_height (), &denorm);
1012           blob_it.set_to_list (bln_word->blob_list ());
1013           ch[0] = *word->text ();
1014           char_count++;
1015           best_choice = tess_segment_pass1 (bln_word,
1016             &denorm,
1017             &Tesseract::tess_default_matcher,
1018             raw_choice,
1019             &blob_choices, outword);
1020 
1021           /*
1022             Test for TESS screw up on word. Recog_word has already ensured that the
1023             choice list, outword blob lists and best_choice string are the same
1024             length. A TESS screw up is indicated by a blank filled or 0 length string.
1025           */
1026           if ((best_choice->length() == 0) ||
1027             (strspn(best_choice->unichar_string().string(), " ") ==
1028              best_choice->unichar_string().length())) {
1029             rej_count++;
1030             tprintf ("%d:%d: \"%s\" -> TESS FAILED\n",
1031               row_count, word_count, ch);
1032             #ifndef SECURE_NAMES
1033             wordstats.word (tess_rej_str, 2, ch, 1);
1034             #endif
1035           }
1036           else {
1037             if ((best_choice->length() != outword->blob_list()->length()) ||
1038                 (best_choice->length() != blob_choices.length())) {
1039               tprintf
1040                 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
1041                 best_choice->unichar_string().string(),
1042                 best_choice->length(),
1043                 outword->blob_list ()->length(),
1044                 blob_choices.length());
1045             }
1046             ASSERT_HOST(best_choice->length() ==
1047                         outword->blob_list()->length());
1048             ASSERT_HOST(best_choice->length() == blob_choices.length());
1049             fix_quotes (best_choice,
1050                                  //turn to double
1051               outword, &blob_choices);
1052             if (strcmp (best_choice->unichar_string().string(), ch) != 0) {
1053               err_count++;
1054               tprintf ("%d:%d: \"%s\" -> \"%s\"\n",
1055                 row_count, word_count, ch,
1056                 best_choice->unichar_string().string());
1057             }
1058             else
1059               correct_count++;
1060             #ifndef SECURE_NAMES
1061             if (best_choice->unichar_string().length() > 2)
1062               wordstats.word(tess_long_str, 2, ch, 1);
1063             else
1064               wordstats.word(best_choice->unichar_string().string(),
1065                              best_choice->unichar_string().length(),
1066                              ch, 1);
1067             #endif
1068           }
1069           delete bln_word;
1070           delete outword;
1071           delete best_choice;
1072           delete raw_choice;
1073           blob_choices.deep_clear ();
1074           count++;
1075         }
1076       }
1077     }
1078   }
1079   #ifndef SECURE_NAMES
1080   wordstats.print (1, 100.0);
1081   wordstats.conf_matrix ();
1082   tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n",
1083     char_count, correct_count, rej_count, err_count);
1084   #endif
1085 }
1086 
1087 }  // namespace tesseract
1088