• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**********************************************************************
2  * File:        tordmain.cpp  (Formerly textordp.c)
3  * Description: C++ top level textord code.
4  * Author:                  Ray Smith
5  * Created:                 Tue Jul 28 17:12:33 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 #include "mfcpch.h"
20 #ifdef __UNIX__
21 #include          <assert.h>
22 #endif
23 #include          "stderr.h"
24 #include          "globaloc.h"
25 #include          "tessout.h"
26 #include          "blread.h"
27 #include          "blobbox.h"
28 #include          "edgblob.h"
29 #include          "drawtord.h"
30 #include          "makerow.h"
31 #include          "wordseg.h"
32 #include          "ocrclass.h"
33 #include          "genblob.h"
34 #include          "imgs.h"
35 #include          "tordmain.h"
36 #include          "secname.h"
37 #include "tesseractclass.h"
38 
39 // Some of the code in this file is dependent upon leptonica. If you don't
40 // have it, you don't get this functionality.
41 #ifdef HAVE_CONFIG_H
42 #include "config_auto.h"
43 #endif
44 #ifdef HAVE_LIBLEPT
45 #include "allheaders.h"
46 #endif
47 
48 const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block";
49 
50 #undef EXTERN
51 #define EXTERN
52 
53 EXTERN BOOL_VAR (textord_no_rejects, FALSE, "Don't remove noise blobs");
54 EXTERN BOOL_VAR (textord_show_blobs, FALSE, "Display unsorted blobs");
55 EXTERN BOOL_VAR (textord_show_boxes, FALSE, "Display unsorted blobs");
56 EXTERN BOOL_VAR (textord_new_initial_xheight, TRUE,
57 "Use test xheight mechanism");
58 EXTERN BOOL_VAR (textord_exit_after, FALSE, "Exit after completing textord");
59 EXTERN INT_VAR (textord_max_noise_size, 7, "Pixel size of noise");
60 EXTERN double_VAR (textord_blob_size_bigile, 95,
61 "Percentile for large blobs");
62 EXTERN double_VAR (textord_noise_area_ratio, 0.7,
63 "Fraction of bounding box for noise");
64 EXTERN double_VAR (textord_blob_size_smallile, 20,
65 "Percentile for small blobs");
66 EXTERN double_VAR (textord_initialx_ile, 0.75,
67 "Ile of sizes for xheight guess");
68 EXTERN double_VAR (textord_initialasc_ile, 0.90,
69 "Ile of sizes for xheight guess");
70 EXTERN INT_VAR (textord_noise_sizefraction, 10,
71 "Fraction of size for maxima");
72 EXTERN double_VAR (textord_noise_sizelimit, 0.5,
73 "Fraction of x for big t count");
74 EXTERN INT_VAR (textord_noise_translimit, 16, "Transitions for normal blob");
75 EXTERN double_VAR (textord_noise_normratio, 2.0,
76 "Dot to norm ratio for deletion");
77 EXTERN BOOL_VAR (textord_noise_rejwords, TRUE, "Reject noise-like words");
78 EXTERN BOOL_VAR (textord_noise_rejrows, TRUE, "Reject noise-like rows");
79 EXTERN double_VAR (textord_noise_syfract, 0.2,
80 "xh fract error for norm blobs");
81 EXTERN double_VAR (textord_noise_sxfract, 0.4,
82 "xh fract width error for norm blobs");
83 EXTERN double_VAR(textord_noise_hfract, 1.0/64,
84 "Height fraction to discard outlines as speckle noise");
85 EXTERN INT_VAR (textord_noise_sncount, 1, "super norm blobs to save row");
86 EXTERN double_VAR (textord_noise_rowratio, 6.0,
87 "Dot to norm ratio for deletion");
88 
89 EXTERN BOOL_VAR (textord_noise_debug, FALSE, "Debug row garbage detector");
90 EXTERN double_VAR (textord_blshift_maxshift, 0.00, "Max baseline shift");
91 EXTERN double_VAR (textord_blshift_xfraction, 9.99,
92 "Min size of baseline shift");
93 EXTERN STRING_EVAR (tessedit_image_ext, ".tif", "Externsion for image file");
94 
95 #ifndef EMBEDDED
96 EXTERN clock_t previous_cpu;
97 #endif
98 
99 extern BOOL_VAR_H (polygon_tess_approximation, TRUE,
100 "Do tess poly instead of grey scale");
101 
102 #define MAX_NEAREST_DIST  600    //for block skew stats
103 #define MAX_BLOB_TRANSITIONS100  //for nois stats
104 
105 extern IMAGE page_image;         //must be defined somewhere
106 extern BOOL_VAR_H (interactive_mode, TRUE, "Run interactively?");
107 extern /*"C" */ ETEXT_DESC *global_monitor;     //progress monitor
108 
109 /**********************************************************************
110  * find_components
111  *
112  * Find the C_OUTLINEs of the connected components in each block, put them
113  * in C_BLOBs, and filter them by size, putting the different size
114  * grades on different lists in the matching TO_BLOCK in port_blocks.
115  **********************************************************************/
116 
find_components(BLOCK_LIST * blocks,TO_BLOCK_LIST * land_blocks,TO_BLOCK_LIST * port_blocks,TBOX * page_box)117 void find_components(
118                        BLOCK_LIST *blocks,
119                        TO_BLOCK_LIST *land_blocks,
120                        TO_BLOCK_LIST *port_blocks,
121                        TBOX *page_box) {
122   BLOCK *block;                  //current block
123   PDBLK_CLIST pd_blocks;         //copy of list
124   BLOCK_IT block_it = blocks;    //iterator
125   PDBLK_C_IT pd_it = &pd_blocks; //iterator
126   IMAGE thresh_image;            //thresholded
127 
128   int width = page_image.get_xsize();
129   int height = page_image.get_ysize();
130   if (width > MAX_INT16 || height > MAX_INT16) {
131     tprintf("Input image too large! (%d, %d)\n", width, height);
132     return;  // Can't handle it.
133   }
134 
135   ICOORD page_tr(width, height);
136   block_it.set_to_list (blocks);
137   if (global_monitor != NULL)
138     global_monitor->ocr_alive = TRUE;
139 
140     set_global_loc_code(LOC_EDGE_PROG);
141     if (!page_image.white_high ())
142       invert_image(&page_image);
143 
144 #ifndef EMBEDDED
145     previous_cpu = clock ();
146 #endif
147 
148     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
149     block_it.forward ()) {
150       block = block_it.data ();
151     if (block->poly_block() == NULL ||
152         block->poly_block()->IsText()) {
153 #ifndef GRAPHICS_DISABLED
154       extract_edges(NULL, &page_image, &page_image, page_tr, block);
155 #else
156       extract_edges(&page_image, &page_image, page_tr, block);
157 #endif
158       *page_box += block->bounding_box ();
159     }
160   }
161   if (global_monitor != NULL) {
162     global_monitor->ocr_alive = TRUE;
163     global_monitor->progress = 10;
164   }
165 
166   assign_blobs_to_blocks2(blocks, land_blocks, port_blocks);
167   if (global_monitor != NULL)
168     global_monitor->ocr_alive = TRUE;
169   filter_blobs (page_box->topright (), land_blocks, textord_test_landscape);
170 #ifndef EMBEDDED
171   previous_cpu = clock ();
172 #endif
173   filter_blobs (page_box->topright (), port_blocks, !textord_test_landscape);
174   if (global_monitor != NULL)
175     global_monitor->ocr_alive = TRUE;
176 }
177 
178 /**********************************************************************
179  * SetBlobStrokeWidth
180  *
181  * Set the horizontal and vertical stroke widths in the blob.
182  **********************************************************************/
SetBlobStrokeWidth(bool debug,BLOBNBOX * blob)183 void SetBlobStrokeWidth(bool debug, BLOBNBOX* blob) {
184 #ifdef HAVE_LIBLEPT
185   // Cut the blob rectangle into a Pix.
186   // TODO(rays) make the page_image a Pix so this is more direct.
187   const TBOX& box = blob->bounding_box();
188   IMAGE blob_im;
189   int width = box.width();
190   int height = box.height();
191   blob_im.create(width, height, 1);
192   copy_sub_image(&page_image, box.left(), box.bottom(), width, height,
193                  &blob_im, 0, 0, false);
194   Pix* pix = blob_im.ToPix();
195   Pix* dist_pix = pixDistanceFunction(pix, 4, 8, L_BOUNDARY_BG);
196   if (debug) {
197     pixWrite("cutpix.png", pix, IFF_PNG);
198     pixWrite("distpix.png", dist_pix, IFF_PNG);
199   }
200   pixDestroy(&pix);
201   // Compute the stroke widths.
202   uinT32* data = pixGetData(dist_pix);
203   int wpl = pixGetWpl(dist_pix);
204   // Horizontal width of stroke.
205   STATS h_stats(0, width + 1);
206   for (int y = 0; y < height; ++y) {
207     uinT32* pixels = data + y*wpl;
208     int prev_pixel = 0;
209     int pixel = GET_DATA_BYTE(pixels, 0);
210     for (int x = 1; x < width; ++x) {
211       int next_pixel = GET_DATA_BYTE(pixels, x);
212       // We are looking for a pixel that is equal to its vertical neighbours,
213       // yet greater than its left neighbour.
214       if (prev_pixel < pixel &&
215           (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
216           (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
217         if (pixel > next_pixel) {
218           // Single local max, so an odd width.
219           h_stats.add(pixel * 2 - 1, 1);
220         } else if (pixel == next_pixel && x + 1 < width &&
221                  pixel > GET_DATA_BYTE(pixels, x + 1)) {
222           // Double local max, so an even width.
223           h_stats.add(pixel * 2, 1);
224         }
225       }
226       prev_pixel = pixel;
227       pixel = next_pixel;
228     }
229   }
230   if (debug) {
231     h_stats.print(stderr, true);
232   }
233   // Vertical width of stroke.
234   STATS v_stats(0, height + 1);
235   for (int x = 0; x < width; ++x) {
236     int prev_pixel = 0;
237     int pixel = GET_DATA_BYTE(data, x);
238     for (int y = 1; y < height; ++y) {
239       uinT32* pixels = data + y*wpl;
240       int next_pixel = GET_DATA_BYTE(pixels, x);
241       // We are looking for a pixel that is equal to its horizontal neighbours,
242       // yet greater than its upper neighbour.
243       if (prev_pixel < pixel &&
244           (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
245           (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
246         if (pixel > next_pixel) {
247           // Single local max, so an odd width.
248           v_stats.add(pixel * 2 - 1, 1);
249         } else if (pixel == next_pixel && y + 1 < height &&
250                  pixel > GET_DATA_BYTE(pixels + wpl, x)) {
251           // Double local max, so an even width.
252           v_stats.add(pixel * 2, 1);
253         }
254       }
255       prev_pixel = pixel;
256       pixel = next_pixel;
257     }
258   }
259   if (debug) {
260     v_stats.print(stderr, true);
261   }
262   pixDestroy(&dist_pix);
263   // Store the horizontal and vertical width in the blob, keeping both
264   // widths if there is enough information, otherwse only the one with
265   // the most samples.
266   // If there are insufficent samples, store zero, rather than using
267   // 2*area/perimeter, as the numbers that gives do not match the numbers
268   // from the distance method.
269   if (debug) {
270     tprintf("box=%d,%d->%d,%d, hcount=%d, vcount=%d, target=%d\n",
271             box.left(), box.bottom(), box.right(), box.top(),
272             h_stats.get_total(), v_stats.get_total(), (width+height) /4);
273     tprintf("hstats median=%f, lq=%f, uq=%f, sd=%f\n",
274             h_stats.median(), h_stats.ile(0.25f), h_stats.ile(0.75f),
275             h_stats.sd());
276     tprintf("vstats median=%f, lq=%f, uq=%f, sd=%f\n",
277             v_stats.median(), v_stats.ile(0.25f), v_stats.ile(0.75f),
278             v_stats.sd());
279 
280   }
281   if (h_stats.get_total() >= (width + height) / 4) {
282     blob->set_horz_stroke_width(h_stats.ile(0.5f));
283     if (v_stats.get_total() >= (width + height) / 4)
284       blob->set_vert_stroke_width(v_stats.ile(0.5f));
285     else
286       blob->set_vert_stroke_width(0.0f);
287   } else {
288     if (v_stats.get_total() >= (width + height) / 4 ||
289         v_stats.get_total() > h_stats.get_total()) {
290       blob->set_horz_stroke_width(0.0f);
291       blob->set_vert_stroke_width(v_stats.ile(0.5f));
292     } else {
293       blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
294                                                           : 0.0f);
295       blob->set_vert_stroke_width(0.0f);
296     }
297   }
298 #else
299   // Without leptonica present, use the 2*area/perimeter as an approximation.
300   float width = 2.0f * blob->cblob()->area();
301   width /= blob->cblob()->perimeter();
302   blob->set_horz_stroke_width(width);
303   blob->set_vert_stroke_width(width);
304 #endif
305 }
306 
307 
308 /**********************************************************************
309  * assign_blobs_to_blocks2
310  *
311  * Make a list of TO_BLOCKs for portrait and landscape orientation.
312  **********************************************************************/
313 
assign_blobs_to_blocks2(BLOCK_LIST * blocks,TO_BLOCK_LIST * land_blocks,TO_BLOCK_LIST * port_blocks)314 void assign_blobs_to_blocks2(                             //split into groups
315                              BLOCK_LIST *blocks,          //blocks to process
316                              TO_BLOCK_LIST *land_blocks,  // ** unused **
317                              TO_BLOCK_LIST *port_blocks   //output list
318                             ) {
319   BLOCK *block;                  //current block
320   BLOBNBOX *newblob;             //created blob
321   C_BLOB *blob;                  //current blob
322   BLOCK_IT block_it = blocks;
323   C_BLOB_IT blob_it;             //iterator
324   BLOBNBOX_IT port_box_it;       //iterator
325                                  //destination iterator
326   TO_BLOCK_IT port_block_it = port_blocks;
327   TO_BLOCK *port_block;          //created block
328 
329   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
330     block = block_it.data ();
331     port_block = new TO_BLOCK (block);
332 
333     // Convert the good outlines to block->blob_list
334     port_box_it.set_to_list (&port_block->blobs);
335     blob_it.set_to_list(block->blob_list());
336     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
337       blob = blob_it.extract ();
338       newblob = new BLOBNBOX(blob);  // Convert blob to BLOBNBOX.
339       SetBlobStrokeWidth(false, newblob);
340       port_box_it.add_after_then_move (newblob);
341     }
342 
343     // Put the rejected outlines in block->noise_blobs, which allows them to
344     // be reconsidered and sorted back into rows and recover outlines mistakenly
345     // rejected.
346     port_box_it.set_to_list(&port_block->noise_blobs);
347     blob_it.set_to_list(block->reject_blobs());
348     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
349       blob = blob_it.extract();
350       newblob = new BLOBNBOX(blob);  // Convert blob to BLOBNBOX.
351       SetBlobStrokeWidth(false, newblob);
352       port_box_it.add_after_then_move(newblob);
353     }
354 
355     port_block_it.add_after_then_move (port_block);
356   }
357 }
358 
359 
360 /**********************************************************************
361  * filter_blobs
362  *
363  * Sort the blobs into sizes in all the blocks for later work.
364  **********************************************************************/
365 
filter_blobs(ICOORD page_tr,TO_BLOCK_LIST * blocks,BOOL8 testing_on)366 void filter_blobs(                        //split into groups
367                   ICOORD page_tr,         //top right
368                   TO_BLOCK_LIST *blocks,  //output list
369                   BOOL8 testing_on        //for plotting
370                  ) {
371   TO_BLOCK_IT block_it = blocks; //destination iterator
372   TO_BLOCK *block;               //created block
373 
374 #ifndef GRAPHICS_DISABLED
375   if (to_win != NULL)
376     to_win->Clear();
377 #endif
378   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
379   block_it.forward ()) {
380     block = block_it.data ();
381     block->line_size = filter_noise_blobs (&block->blobs,
382       &block->noise_blobs,
383       &block->small_blobs,
384       &block->large_blobs);
385     block->line_spacing =
386       block->line_size * (textord_merge_desc + textord_merge_x +
387       textord_merge_asc +
388       textord_merge_asc) / textord_merge_x;
389     block->line_size *= textord_min_linesize;
390     block->max_blob_size = block->line_size * textord_excess_blobsize;
391 #ifndef GRAPHICS_DISABLED
392     if (textord_show_blobs && testing_on) {
393       if (to_win == NULL)
394         create_to_win(page_tr);
395       block->plot_graded_blobs(to_win);
396     }
397     if (textord_show_boxes && testing_on) {
398       if (to_win == NULL)
399         create_to_win(page_tr);
400       plot_box_list (to_win, &block->noise_blobs, ScrollView::WHITE);
401       plot_box_list (to_win, &block->small_blobs, ScrollView::WHITE);
402       plot_box_list (to_win, &block->large_blobs, ScrollView::WHITE);
403       plot_box_list (to_win, &block->blobs, ScrollView::WHITE);
404     }
405 #endif
406   }
407 }
408 
409 
410 /**********************************************************************
411  * filter_noise_blobs
412  *
413  * Move small blobs to a separate list.
414  **********************************************************************/
415 
filter_noise_blobs(BLOBNBOX_LIST * src_list,BLOBNBOX_LIST * noise_list,BLOBNBOX_LIST * small_list,BLOBNBOX_LIST * large_list)416 float filter_noise_blobs(                            //separate noise
417                          BLOBNBOX_LIST *src_list,    //origonal list
418                          BLOBNBOX_LIST *noise_list,  //noise list
419                          BLOBNBOX_LIST *small_list,  //small blobs
420                          BLOBNBOX_LIST *large_list   //large blobs
421                         ) {
422   inT16 height;                  //height of blob
423   inT16 width;                   //of blob
424   BLOBNBOX_IT src_it = src_list; //iterators
425   BLOBNBOX_IT noise_it = noise_list;
426   BLOBNBOX_IT small_it = small_list;
427   BLOBNBOX_IT large_it = large_list;
428   STATS size_stats (0, MAX_NEAREST_DIST);
429   //blob heights
430   if (textord_new_initial_xheight)
431     return filter_noise_blobs2 (src_list, noise_list, small_list, large_list);
432   float min_y;                   //size limits
433   float max_y;
434   float max_x;
435 
436   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
437     if (src_it.data ()->bounding_box ().height () < textord_max_noise_size)
438       noise_it.add_after_then_move (src_it.extract ());
439   }
440   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
441     size_stats.add (src_it.data ()->bounding_box ().height (), 1);
442   }
443   min_y = floor (size_stats.ile (textord_blob_size_smallile / 100.0));
444   max_y = ceil (size_stats.ile (textord_blob_size_bigile / 100.0));
445   max_x = ceil (size_stats.ile (0.5) * textord_width_limit);
446   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
447     height = src_it.data ()->bounding_box ().height ();
448     width = src_it.data ()->bounding_box ().width ();
449     if (height < min_y)
450       small_it.add_after_then_move (src_it.extract ());
451     else if (height > max_y || width > max_x)
452       large_it.add_after_then_move (src_it.extract ());
453   }
454   return size_stats.ile (textord_initialx_ile);
455 }
456 
457 
458 /**********************************************************************
459  * filter_noise_blobs2
460  *
461  * Move small blobs to a separate list.
462  **********************************************************************/
463 
filter_noise_blobs2(BLOBNBOX_LIST * src_list,BLOBNBOX_LIST * noise_list,BLOBNBOX_LIST * small_list,BLOBNBOX_LIST * large_list)464 float filter_noise_blobs2(                            //separate noise
465                           BLOBNBOX_LIST *src_list,    //origonal list
466                           BLOBNBOX_LIST *noise_list,  //noise list
467                           BLOBNBOX_LIST *small_list,  //small blobs
468                           BLOBNBOX_LIST *large_list   //large blobs
469                          ) {
470   inT16 height;                  //height of blob
471   inT16 width;                   //of blob
472   BLOBNBOX *blob;                //current blob
473   float initial_x;               //first guess
474   BLOBNBOX_IT src_it = src_list; //iterators
475   BLOBNBOX_IT noise_it = noise_list;
476   BLOBNBOX_IT small_it = small_list;
477   BLOBNBOX_IT large_it = large_list;
478   STATS size_stats (0, MAX_NEAREST_DIST);
479   //blob heights
480   float min_y;                   //size limits
481   float max_y;
482   float max_x;
483   float max_height;              //of good blobs
484 
485   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
486     blob = src_it.data ();
487     if (blob->bounding_box ().height () < textord_max_noise_size)
488       noise_it.add_after_then_move (src_it.extract ());
489     else if (blob->enclosed_area () >= blob->bounding_box ().height ()
490       * blob->bounding_box ().width () * textord_noise_area_ratio)
491       small_it.add_after_then_move (src_it.extract ());
492   }
493   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
494     size_stats.add (src_it.data ()->bounding_box ().height (), 1);
495   }
496   initial_x = size_stats.ile (textord_initialx_ile);
497   max_y =
498     ceil (initial_x *
499     (textord_merge_desc + textord_merge_x +
500     2 * textord_merge_asc) / textord_merge_x);
501   min_y = floor (initial_x / 2);
502   max_x = ceil (initial_x * textord_width_limit);
503   small_it.move_to_first ();
504   for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
505   small_it.forward ()) {
506     height = small_it.data()->bounding_box().height();
507     if (height > max_y)
508       large_it.add_after_then_move(small_it.extract ());
509     else if (height >= min_y)
510       src_it.add_after_then_move(small_it.extract ());
511   }
512   size_stats.clear ();
513   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
514     height = src_it.data ()->bounding_box ().height ();
515     width = src_it.data ()->bounding_box ().width ();
516     if (height < min_y)
517       small_it.add_after_then_move (src_it.extract ());
518     else if (height > max_y || width > max_x)
519       large_it.add_after_then_move (src_it.extract ());
520     else
521       size_stats.add (height, 1);
522   }
523   max_height = size_stats.ile (textord_initialasc_ile);
524   //      printf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
525   //              max_y,min_y,initial_x,max_height);
526   max_height *= textord_merge_x / (textord_merge_x + textord_merge_asc);
527   if (max_height > initial_x)
528     initial_x = max_height;
529   //      printf(" ret=%g\n",initial_x);
530   return initial_x;
531 }
532 
533 
534 /**********************************************************************
535  * textord_page
536  *
537  * Textord the list of blobs and return a list of proper blocks.
538  **********************************************************************/
539 
textord_page(ICOORD page_tr,BLOCK_LIST * blocks,TO_BLOCK_LIST * land_blocks,TO_BLOCK_LIST * port_blocks,tesseract::Tesseract * tess)540 void textord_page(                             //make rows & words
541                   ICOORD page_tr,              //top right
542                   BLOCK_LIST *blocks,          //block list
543                   TO_BLOCK_LIST *land_blocks,  //rotated for landscape
544                   TO_BLOCK_LIST *port_blocks,  //output list
545                   tesseract::Tesseract* tess
546                  ) {
547   float gradient;                //global skew
548 
549   set_global_loc_code(LOC_TEXT_ORD_ROWS);
550   gradient = make_rows (page_tr, blocks, land_blocks, port_blocks, tess);
551   if (global_monitor != NULL) {
552     global_monitor->ocr_alive = TRUE;
553     global_monitor->progress = 20;
554   }
555   set_global_loc_code(LOC_TEXT_ORD_WORDS);
556   make_words(page_tr, gradient, blocks, land_blocks, port_blocks, tess);
557   if (global_monitor != NULL) {
558     global_monitor->ocr_alive = TRUE;
559     global_monitor->progress = 30;
560   }
561   cleanup_blocks(blocks);  //remove empties
562 #ifndef GRAPHICS_DISABLED
563   close_to_win();
564 #endif
565   if (textord_exit_after && !interactive_mode)
566     exit (0);
567 }
568 
569 /**********************************************************************
570  * cleanup_blocks
571  *
572  * Delete empty blocks, rows from the page.
573  **********************************************************************/
574 
cleanup_blocks(BLOCK_LIST * blocks)575 void cleanup_blocks(                    //remove empties
576                     BLOCK_LIST *blocks  //list
577                    ) {
578   BLOCK_IT block_it = blocks;    //iterator
579   ROW_IT row_it;                 //row iterator
580 
581   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
582   block_it.forward ()) {
583     row_it.set_to_list (block_it.data ()->row_list ());
584     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
585       clean_small_noise_from_words(row_it.data());
586       if ((textord_noise_rejrows
587         && !row_it.data ()->word_list ()->empty ()
588         && clean_noise_from_row (row_it.data ()))
589         || row_it.data ()->word_list ()->empty ())
590         delete row_it.extract ();//lose empty row
591       else {
592         if (textord_noise_rejwords)
593           clean_noise_from_words (row_it.data ());
594         if (textord_blshift_maxshift >= 0)
595           tweak_row_baseline (row_it.data ());
596       }
597     }
598     if (block_it.data ()->row_list ()->empty ()) {
599       delete block_it.extract ();//lose empty block
600     }
601   }
602 }
603 
604 
605 /**********************************************************************
606  * clean_noise_from_row
607  *
608  * Move blobs of words from rows of garbage into the reject blobs list.
609  **********************************************************************/
610 
clean_noise_from_row(ROW * row)611 BOOL8 clean_noise_from_row(          //remove empties
612                            ROW *row  //row to clean
613                           ) {
614   BOOL8 testing_on;
615   TBOX blob_box;                  //bounding box
616   C_BLOB *blob;                  //current blob
617   C_OUTLINE *outline;            //current outline
618   WERD *word;                    //current word
619   inT32 blob_size;               //biggest size
620   inT32 trans_count = 0;         //no of transitions
621   inT32 trans_threshold;         //noise tolerance
622   inT32 dot_count;               //small objects
623   inT32 norm_count;              //normal objects
624   inT32 super_norm_count;        //real char-like
625                                  //words of row
626   WERD_IT word_it = row->word_list ();
627   C_BLOB_IT blob_it;             //blob iterator
628   C_OUTLINE_IT out_it;           //outline iterator
629 
630   if (textord_test_y > row->base_line (textord_test_x)
631     && textord_show_blobs
632     && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
633     testing_on = TRUE;
634   else
635     testing_on = FALSE;
636   dot_count = 0;
637   norm_count = 0;
638   super_norm_count = 0;
639   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
640     word = word_it.data ();      //current word
641                                  //blobs in word
642     blob_it.set_to_list (word->cblob_list ());
643     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
644     blob_it.forward ()) {
645       blob = blob_it.data ();
646       if (!word->flag (W_DONT_CHOP)) {
647                                  //get outlines
648         out_it.set_to_list (blob->out_list ());
649         for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
650         out_it.forward ()) {
651           outline = out_it.data ();
652           blob_box = outline->bounding_box ();
653           blob_size =
654             blob_box.width () >
655             blob_box.height ()? blob_box.width () : blob_box.
656             height();
657           if (blob_size < textord_noise_sizelimit * row->x_height ())
658             dot_count++;         //count smal outlines
659           if (!outline->child ()->empty ()
660             && blob_box.height () <
661             (1 + textord_noise_syfract) * row->x_height ()
662             && blob_box.height () >
663             (1 - textord_noise_syfract) * row->x_height ()
664             && blob_box.width () <
665             (1 + textord_noise_sxfract) * row->x_height ()
666             && blob_box.width () >
667             (1 - textord_noise_sxfract) * row->x_height ())
668             super_norm_count++;  //count smal outlines
669         }
670       }
671       else
672         super_norm_count++;
673       blob_box = blob->bounding_box ();
674       blob_size =
675         blob_box.width () >
676         blob_box.height ()? blob_box.width () : blob_box.height ();
677       if (blob_size >= textord_noise_sizelimit * row->x_height ()
678       && blob_size < row->x_height () * 2) {
679         trans_threshold = blob_size / textord_noise_sizefraction;
680         trans_count = blob->count_transitions (trans_threshold);
681         if (trans_count < textord_noise_translimit)
682           norm_count++;
683       }
684       else if (blob_box.height () > row->x_height () * 2
685         && (!word_it.at_first () || !blob_it.at_first ()))
686         dot_count += 2;
687       #ifndef SECURE_NAMES
688       if (testing_on) {
689         tprintf
690           ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
691           blob_box.left (), blob_box.bottom (), blob_box.right (),
692           blob_box.top (), blob->out_list ()->length (), trans_count,
693           blob_box.bottom () - row->base_line (blob_box.left ()));
694       }
695       #endif
696     }
697   }
698   #ifndef SECURE_NAMES
699   if (textord_noise_debug) {
700     tprintf ("Row ending at (%d,%g):",
701       blob_box.right (), row->base_line (blob_box.right ()));
702     tprintf (" R=%g, dc=%d, nc=%d, %s\n",
703       norm_count > 0 ? (float) dot_count / norm_count : 9999,
704       dot_count, norm_count,
705       dot_count > norm_count * textord_noise_normratio
706       && dot_count > 2 ? "REJECTED" : "ACCEPTED");
707   }
708   #endif
709   return super_norm_count < textord_noise_sncount
710     && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
711 }
712 
713 
714 /**********************************************************************
715  * clean_noise_from_words
716  *
717  * Move blobs of words from rows of garbage into the reject blobs list.
718  **********************************************************************/
719 
clean_noise_from_words(ROW * row)720 void clean_noise_from_words(          //remove empties
721                             ROW *row  //row to clean
722                            ) {
723   TBOX blob_box;                  //bounding box
724   inT8 *word_dud;                //was it chucked
725   C_BLOB *blob;                  //current blob
726   C_OUTLINE *outline;            //current outline
727   WERD *word;                    //current word
728   inT32 blob_size;               //biggest size
729   inT32 trans_count;             //no of transitions
730   inT32 trans_threshold;         //noise tolerance
731   inT32 dot_count;               //small objects
732   inT32 norm_count;              //normal objects
733   inT32 dud_words;               //number discarded
734   inT32 ok_words;                //number remaining
735   inT32 word_index;              //current word
736                                  //words of row
737   WERD_IT word_it = row->word_list ();
738   C_BLOB_IT blob_it;             //blob iterator
739   C_OUTLINE_IT out_it;           //outline iterator
740 
741   ok_words = word_it.length ();
742   if (ok_words == 0 || textord_no_rejects)
743     return;
744   word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8));
745   dud_words = 0;
746   ok_words = 0;
747   word_index = 0;
748   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
749     word = word_it.data ();      //current word
750     dot_count = 0;
751     norm_count = 0;
752                                  //blobs in word
753     blob_it.set_to_list (word->cblob_list ());
754     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
755     blob_it.forward ()) {
756       blob = blob_it.data ();
757       if (!word->flag (W_DONT_CHOP)) {
758                                  //get outlines
759         out_it.set_to_list (blob->out_list ());
760         for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
761         out_it.forward ()) {
762           outline = out_it.data ();
763           blob_box = outline->bounding_box ();
764           blob_size =
765             blob_box.width () >
766             blob_box.height ()? blob_box.width () : blob_box.
767             height();
768           if (blob_size < textord_noise_sizelimit * row->x_height ())
769             dot_count++;         //count smal outlines
770           if (!outline->child ()->empty ()
771             && blob_box.height () <
772             (1 + textord_noise_syfract) * row->x_height ()
773             && blob_box.height () >
774             (1 - textord_noise_syfract) * row->x_height ()
775             && blob_box.width () <
776             (1 + textord_noise_sxfract) * row->x_height ()
777             && blob_box.width () >
778             (1 - textord_noise_sxfract) * row->x_height ())
779             norm_count++;        //count smal outlines
780         }
781       }
782       else
783         norm_count++;
784       blob_box = blob->bounding_box ();
785       blob_size =
786         blob_box.width () >
787         blob_box.height ()? blob_box.width () : blob_box.height ();
788       if (blob_size >= textord_noise_sizelimit * row->x_height ()
789       && blob_size < row->x_height () * 2) {
790         trans_threshold = blob_size / textord_noise_sizefraction;
791         trans_count = blob->count_transitions (trans_threshold);
792         if (trans_count < textord_noise_translimit)
793           norm_count++;
794       }
795       else if (blob_box.height () > row->x_height () * 2
796         && (!word_it.at_first () || !blob_it.at_first ()))
797         dot_count += 2;
798     }
799     if (dot_count > 2) {
800       if (dot_count > norm_count * textord_noise_normratio * 2)
801         word_dud[word_index] = 2;
802       else if (dot_count > norm_count * textord_noise_normratio)
803         word_dud[word_index] = 1;
804       else
805         word_dud[word_index] = 0;
806     }
807     else
808       word_dud[word_index] = 0;
809     if (word_dud[word_index] == 2)
810       dud_words++;
811     else
812       ok_words++;
813     word_index++;
814   }
815 
816   word_index = 0;
817   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
818     if (word_dud[word_index] == 2
819     || (word_dud[word_index] == 1 && dud_words > ok_words)) {
820       word = word_it.data ();    //current word
821                                  //rejected blobs
822       blob_it.set_to_list (word->rej_cblob_list ());
823                                  //move from blobs
824       blob_it.add_list_after (word->cblob_list ());
825     }
826     word_index++;
827   }
828   free_mem(word_dud);
829 }
830 
831 // Remove outlines that are a tiny fraction in either width or height
832 // of the word height.
clean_small_noise_from_words(ROW * row)833 void clean_small_noise_from_words(ROW *row) {
834   WERD_IT word_it(row->word_list());
835   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
836     WERD* word = word_it.data();
837     int min_size = static_cast<int>(
838       textord_noise_hfract * word->bounding_box().height() + 0.5);
839     C_BLOB_IT blob_it(word->cblob_list());
840     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
841       C_BLOB* blob = blob_it.data();
842       C_OUTLINE_IT out_it(blob->out_list());
843       for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
844         C_OUTLINE* outline = out_it.data();
845         outline->RemoveSmallRecursive(min_size, &out_it);
846       }
847       if (blob->out_list()->empty()) {
848         delete blob_it.extract();
849       }
850     }
851     if (word->cblob_list()->empty()) {
852       if (!word_it.at_last()) {
853         // The next word is no longer a fuzzy non space if it was before,
854         // since the word before is about to be deleted.
855         WERD* next_word = word_it.data_relative(1);
856         if (next_word->flag(W_FUZZY_NON)) {
857           next_word->set_flag(W_FUZZY_NON, false);
858         }
859       }
860       delete word_it.extract();
861     }
862   }
863 }
864 
865 
866 /**********************************************************************
867  * tweak_row_baseline
868  *
869  * Shift baseline to fit the blobs more accurately where they are
870  * close enough.
871  **********************************************************************/
872 
tweak_row_baseline(ROW * row)873 void tweak_row_baseline(          //remove empties
874                         ROW *row  //row to clean
875                        ) {
876   TBOX blob_box;                  //bounding box
877   C_BLOB *blob;                  //current blob
878   WERD *word;                    //current word
879   inT32 blob_count;              //no of blobs
880   inT32 src_index;               //source segment
881   inT32 dest_index;              //destination segment
882   inT32 *xstarts;                //spline segments
883   double *coeffs;                //spline coeffs
884   float ydiff;                   //baseline error
885   float x_centre;                //centre of blob
886                                  //words of row
887   WERD_IT word_it = row->word_list ();
888   C_BLOB_IT blob_it;             //blob iterator
889 
890   blob_count = 0;
891   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
892     word = word_it.data ();      //current word
893                                  //get total blobs
894     blob_count += word->cblob_list ()->length ();
895   }
896   if (blob_count == 0)
897     return;
898   xstarts =
899     (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
900     sizeof (inT32));
901   coeffs =
902     (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
903     sizeof (double));
904 
905   src_index = 0;
906   dest_index = 0;
907   xstarts[0] = row->baseline.xcoords[0];
908   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
909     word = word_it.data ();      //current word
910                                  //blobs in word
911     blob_it.set_to_list (word->cblob_list ());
912     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
913     blob_it.forward ()) {
914       blob = blob_it.data ();
915       blob_box = blob->bounding_box ();
916       x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
917       ydiff = blob_box.bottom () - row->base_line (x_centre);
918       if (ydiff < 0)
919         ydiff = -ydiff / row->x_height ();
920       else
921         ydiff = ydiff / row->x_height ();
922       if (ydiff < textord_blshift_maxshift
923         && blob_box.height () / row->x_height () >
924       textord_blshift_xfraction) {
925         if (xstarts[dest_index] >= x_centre)
926           xstarts[dest_index] = blob_box.left ();
927         coeffs[dest_index * 3] = 0;
928         coeffs[dest_index * 3 + 1] = 0;
929         coeffs[dest_index * 3 + 2] = blob_box.bottom ();
930         //shift it
931         dest_index++;
932         xstarts[dest_index] = blob_box.right () + 1;
933       }
934       else {
935         if (xstarts[dest_index] <= x_centre) {
936           while (row->baseline.xcoords[src_index + 1] <= x_centre
937           && src_index < row->baseline.segments - 1) {
938             if (row->baseline.xcoords[src_index + 1] >
939             xstarts[dest_index]) {
940               coeffs[dest_index * 3] =
941                 row->baseline.quadratics[src_index].a;
942               coeffs[dest_index * 3 + 1] =
943                 row->baseline.quadratics[src_index].b;
944               coeffs[dest_index * 3 + 2] =
945                 row->baseline.quadratics[src_index].c;
946               dest_index++;
947               xstarts[dest_index] =
948                 row->baseline.xcoords[src_index + 1];
949             }
950             src_index++;
951           }
952           coeffs[dest_index * 3] =
953             row->baseline.quadratics[src_index].a;
954           coeffs[dest_index * 3 + 1] =
955             row->baseline.quadratics[src_index].b;
956           coeffs[dest_index * 3 + 2] =
957             row->baseline.quadratics[src_index].c;
958           dest_index++;
959           xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
960         }
961       }
962     }
963   }
964   while (src_index < row->baseline.segments
965     && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
966     src_index++;
967   while (src_index < row->baseline.segments) {
968     coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
969     coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
970     coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
971     dest_index++;
972     src_index++;
973     xstarts[dest_index] = row->baseline.xcoords[src_index];
974   }
975                                  //turn to spline
976   row->baseline = QSPLINE (dest_index, xstarts, coeffs);
977   free_mem(xstarts);
978   free_mem(coeffs);
979 }
980 
981 
982 /**********************************************************************
983  * blob_y_order
984  *
985  * Sort function to sort blobs in y from page top.
986  **********************************************************************/
987 
blob_y_order(void * item1,void * item2)988 inT32 blob_y_order(              //sort function
989                    void *item1,  //items to compare
990                    void *item2) {
991                                  //converted ptr
992   BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
993                                  //converted ptr
994   BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
995 
996   if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ())
997     return -1;
998   else if (blob1->bounding_box ().bottom () <
999     blob2->bounding_box ().bottom ())
1000     return 1;
1001   else {
1002     if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
1003       return -1;
1004     else if (blob1->bounding_box ().left () >
1005       blob2->bounding_box ().left ())
1006       return 1;
1007     else
1008       return 0;
1009   }
1010 }
1011 
1012