1 /**********************************************************************
2 * File: tordmain.cpp (Formerly textordp.c)
3 * Description: C++ top level textord code.
4 * Author: Ray Smith
5 * Created: Tue Jul 28 17:12:33 BST 1992
6 *
7 * (C) Copyright 1992, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19 #include "mfcpch.h"
20 #ifdef __UNIX__
21 #include <assert.h>
22 #endif
23 #include "stderr.h"
24 #include "globaloc.h"
25 #include "tessout.h"
26 #include "blread.h"
27 #include "blobbox.h"
28 #include "edgblob.h"
29 #include "drawtord.h"
30 #include "makerow.h"
31 #include "wordseg.h"
32 #include "ocrclass.h"
33 #include "genblob.h"
34 #include "imgs.h"
35 #include "tordmain.h"
36 #include "secname.h"
37 #include "tesseractclass.h"
38
39 // Some of the code in this file is dependent upon leptonica. If you don't
40 // have it, you don't get this functionality.
41 #ifdef HAVE_CONFIG_H
42 #include "config_auto.h"
43 #endif
44 #ifdef HAVE_LIBLEPT
45 #include "allheaders.h"
46 #endif
47
48 const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block";
49
50 #undef EXTERN
51 #define EXTERN
52
// Tunable parameters defined by this file. EXTERN is #defined to nothing
// just before this list, so the EXTERN prefix expands to nothing on every
// line below. Each *_VAR macro is given an identifier, a value and a
// description string, in that order (presumably the default value and the
// help text -- the macros themselves are defined elsewhere).
EXTERN BOOL_VAR (textord_no_rejects, FALSE, "Don't remove noise blobs");
EXTERN BOOL_VAR (textord_show_blobs, FALSE, "Display unsorted blobs");
// NOTE(review): same description string as textord_show_blobs above; looks
// like a copy-paste -- confirm the intended wording.
EXTERN BOOL_VAR (textord_show_boxes, FALSE, "Display unsorted blobs");
// When set, filter_noise_blobs() hands all of its work to
// filter_noise_blobs2().
EXTERN BOOL_VAR (textord_new_initial_xheight, TRUE,
                 "Use test xheight mechanism");
// textord_page() calls exit(0) when this is set and interactive_mode is not.
EXTERN BOOL_VAR (textord_exit_after, FALSE, "Exit after completing textord");
// Blobs whose bounding-box height is below this go onto the noise list.
EXTERN INT_VAR (textord_max_noise_size, 7, "Pixel size of noise");
// The two *_bigile / *_smallile percentiles are divided by 100 before use.
EXTERN double_VAR (textord_blob_size_bigile, 95,
                   "Percentile for large blobs");
EXTERN double_VAR (textord_noise_area_ratio, 0.7,
                   "Fraction of bounding box for noise");
EXTERN double_VAR (textord_blob_size_smallile, 20,
                   "Percentile for small blobs");
EXTERN double_VAR (textord_initialx_ile, 0.75,
                   "Ile of sizes for xheight guess");
// NOTE(review): same description string as textord_initialx_ile; in this file
// the value is used as the ile of the good-blob heights that feeds the
// ascender-based xheight estimate -- confirm the intended wording.
EXTERN double_VAR (textord_initialasc_ile, 0.90,
                   "Ile of sizes for xheight guess");
EXTERN INT_VAR (textord_noise_sizefraction, 10,
                "Fraction of size for maxima");
EXTERN double_VAR (textord_noise_sizelimit, 0.5,
                   "Fraction of x for big t count");
EXTERN INT_VAR (textord_noise_translimit, 16, "Transitions for normal blob");
// Dot-to-normal ratio used when judging individual words.
EXTERN double_VAR (textord_noise_normratio, 2.0,
                   "Dot to norm ratio for deletion");
EXTERN BOOL_VAR (textord_noise_rejwords, TRUE, "Reject noise-like words");
EXTERN BOOL_VAR (textord_noise_rejrows, TRUE, "Reject noise-like rows");
EXTERN double_VAR (textord_noise_syfract, 0.2,
                   "xh fract error for norm blobs");
EXTERN double_VAR (textord_noise_sxfract, 0.4,
                   "xh fract width error for norm blobs");
EXTERN double_VAR(textord_noise_hfract, 1.0/64,
                  "Height fraction to discard outlines as speckle noise");
EXTERN INT_VAR (textord_noise_sncount, 1, "super norm blobs to save row");
// Dot-to-normal ratio used when judging whole rows.
// NOTE(review): same description string as textord_noise_normratio -- confirm
// the intended wording.
EXTERN double_VAR (textord_noise_rowratio, 6.0,
                   "Dot to norm ratio for deletion");

EXTERN BOOL_VAR (textord_noise_debug, FALSE, "Debug row garbage detector");
// Row baselines are only tweaked when this value is >= 0.
EXTERN double_VAR (textord_blshift_maxshift, 0.00, "Max baseline shift");
EXTERN double_VAR (textord_blshift_xfraction, 9.99,
                   "Min size of baseline shift");
EXTERN STRING_EVAR (tessedit_image_ext, ".tif", "Externsion for image file");
94
95 #ifndef EMBEDDED
96 EXTERN clock_t previous_cpu;
97 #endif
98
99 extern BOOL_VAR_H (polygon_tess_approximation, TRUE,
100 "Do tess poly instead of grey scale");
101
// Second argument of the STATS histograms of blob heights built in this file
// (originally noted as "for block skew stats").
#define MAX_NEAREST_DIST 600
// For noise stats. The value was previously fused into the macro name
// (MAX_BLOB_TRANSITIONS100), which left MAX_BLOB_TRANSITIONS undefined.
#define MAX_BLOB_TRANSITIONS 100
104
105 extern IMAGE page_image; //must be defined somewhere
106 extern BOOL_VAR_H (interactive_mode, TRUE, "Run interactively?");
107 extern /*"C" */ ETEXT_DESC *global_monitor; //progress monitor
108
109 /**********************************************************************
110 * find_components
111 *
112 * Find the C_OUTLINEs of the connected components in each block, put them
113 * in C_BLOBs, and filter them by size, putting the different size
114 * grades on different lists in the matching TO_BLOCK in port_blocks.
115 **********************************************************************/
116
find_components(BLOCK_LIST * blocks,TO_BLOCK_LIST * land_blocks,TO_BLOCK_LIST * port_blocks,TBOX * page_box)117 void find_components(
118 BLOCK_LIST *blocks,
119 TO_BLOCK_LIST *land_blocks,
120 TO_BLOCK_LIST *port_blocks,
121 TBOX *page_box) {
122 BLOCK *block; //current block
123 PDBLK_CLIST pd_blocks; //copy of list
124 BLOCK_IT block_it = blocks; //iterator
125 PDBLK_C_IT pd_it = &pd_blocks; //iterator
126 IMAGE thresh_image; //thresholded
127
128 int width = page_image.get_xsize();
129 int height = page_image.get_ysize();
130 if (width > MAX_INT16 || height > MAX_INT16) {
131 tprintf("Input image too large! (%d, %d)\n", width, height);
132 return; // Can't handle it.
133 }
134
135 ICOORD page_tr(width, height);
136 block_it.set_to_list (blocks);
137 if (global_monitor != NULL)
138 global_monitor->ocr_alive = TRUE;
139
140 set_global_loc_code(LOC_EDGE_PROG);
141 if (!page_image.white_high ())
142 invert_image(&page_image);
143
144 #ifndef EMBEDDED
145 previous_cpu = clock ();
146 #endif
147
148 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
149 block_it.forward ()) {
150 block = block_it.data ();
151 if (block->poly_block() == NULL ||
152 block->poly_block()->IsText()) {
153 #ifndef GRAPHICS_DISABLED
154 extract_edges(NULL, &page_image, &page_image, page_tr, block);
155 #else
156 extract_edges(&page_image, &page_image, page_tr, block);
157 #endif
158 *page_box += block->bounding_box ();
159 }
160 }
161 if (global_monitor != NULL) {
162 global_monitor->ocr_alive = TRUE;
163 global_monitor->progress = 10;
164 }
165
166 assign_blobs_to_blocks2(blocks, land_blocks, port_blocks);
167 if (global_monitor != NULL)
168 global_monitor->ocr_alive = TRUE;
169 filter_blobs (page_box->topright (), land_blocks, textord_test_landscape);
170 #ifndef EMBEDDED
171 previous_cpu = clock ();
172 #endif
173 filter_blobs (page_box->topright (), port_blocks, !textord_test_landscape);
174 if (global_monitor != NULL)
175 global_monitor->ocr_alive = TRUE;
176 }
177
178 /**********************************************************************
179 * SetBlobStrokeWidth
180 *
181 * Set the horizontal and vertical stroke widths in the blob.
182 **********************************************************************/
SetBlobStrokeWidth(bool debug,BLOBNBOX * blob)183 void SetBlobStrokeWidth(bool debug, BLOBNBOX* blob) {
184 #ifdef HAVE_LIBLEPT
185 // Cut the blob rectangle into a Pix.
186 // TODO(rays) make the page_image a Pix so this is more direct.
187 const TBOX& box = blob->bounding_box();
188 IMAGE blob_im;
189 int width = box.width();
190 int height = box.height();
191 blob_im.create(width, height, 1);
192 copy_sub_image(&page_image, box.left(), box.bottom(), width, height,
193 &blob_im, 0, 0, false);
194 Pix* pix = blob_im.ToPix();
195 Pix* dist_pix = pixDistanceFunction(pix, 4, 8, L_BOUNDARY_BG);
196 if (debug) {
197 pixWrite("cutpix.png", pix, IFF_PNG);
198 pixWrite("distpix.png", dist_pix, IFF_PNG);
199 }
200 pixDestroy(&pix);
201 // Compute the stroke widths.
202 uinT32* data = pixGetData(dist_pix);
203 int wpl = pixGetWpl(dist_pix);
204 // Horizontal width of stroke.
205 STATS h_stats(0, width + 1);
206 for (int y = 0; y < height; ++y) {
207 uinT32* pixels = data + y*wpl;
208 int prev_pixel = 0;
209 int pixel = GET_DATA_BYTE(pixels, 0);
210 for (int x = 1; x < width; ++x) {
211 int next_pixel = GET_DATA_BYTE(pixels, x);
212 // We are looking for a pixel that is equal to its vertical neighbours,
213 // yet greater than its left neighbour.
214 if (prev_pixel < pixel &&
215 (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
216 (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
217 if (pixel > next_pixel) {
218 // Single local max, so an odd width.
219 h_stats.add(pixel * 2 - 1, 1);
220 } else if (pixel == next_pixel && x + 1 < width &&
221 pixel > GET_DATA_BYTE(pixels, x + 1)) {
222 // Double local max, so an even width.
223 h_stats.add(pixel * 2, 1);
224 }
225 }
226 prev_pixel = pixel;
227 pixel = next_pixel;
228 }
229 }
230 if (debug) {
231 h_stats.print(stderr, true);
232 }
233 // Vertical width of stroke.
234 STATS v_stats(0, height + 1);
235 for (int x = 0; x < width; ++x) {
236 int prev_pixel = 0;
237 int pixel = GET_DATA_BYTE(data, x);
238 for (int y = 1; y < height; ++y) {
239 uinT32* pixels = data + y*wpl;
240 int next_pixel = GET_DATA_BYTE(pixels, x);
241 // We are looking for a pixel that is equal to its horizontal neighbours,
242 // yet greater than its upper neighbour.
243 if (prev_pixel < pixel &&
244 (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
245 (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
246 if (pixel > next_pixel) {
247 // Single local max, so an odd width.
248 v_stats.add(pixel * 2 - 1, 1);
249 } else if (pixel == next_pixel && y + 1 < height &&
250 pixel > GET_DATA_BYTE(pixels + wpl, x)) {
251 // Double local max, so an even width.
252 v_stats.add(pixel * 2, 1);
253 }
254 }
255 prev_pixel = pixel;
256 pixel = next_pixel;
257 }
258 }
259 if (debug) {
260 v_stats.print(stderr, true);
261 }
262 pixDestroy(&dist_pix);
263 // Store the horizontal and vertical width in the blob, keeping both
264 // widths if there is enough information, otherwse only the one with
265 // the most samples.
266 // If there are insufficent samples, store zero, rather than using
267 // 2*area/perimeter, as the numbers that gives do not match the numbers
268 // from the distance method.
269 if (debug) {
270 tprintf("box=%d,%d->%d,%d, hcount=%d, vcount=%d, target=%d\n",
271 box.left(), box.bottom(), box.right(), box.top(),
272 h_stats.get_total(), v_stats.get_total(), (width+height) /4);
273 tprintf("hstats median=%f, lq=%f, uq=%f, sd=%f\n",
274 h_stats.median(), h_stats.ile(0.25f), h_stats.ile(0.75f),
275 h_stats.sd());
276 tprintf("vstats median=%f, lq=%f, uq=%f, sd=%f\n",
277 v_stats.median(), v_stats.ile(0.25f), v_stats.ile(0.75f),
278 v_stats.sd());
279
280 }
281 if (h_stats.get_total() >= (width + height) / 4) {
282 blob->set_horz_stroke_width(h_stats.ile(0.5f));
283 if (v_stats.get_total() >= (width + height) / 4)
284 blob->set_vert_stroke_width(v_stats.ile(0.5f));
285 else
286 blob->set_vert_stroke_width(0.0f);
287 } else {
288 if (v_stats.get_total() >= (width + height) / 4 ||
289 v_stats.get_total() > h_stats.get_total()) {
290 blob->set_horz_stroke_width(0.0f);
291 blob->set_vert_stroke_width(v_stats.ile(0.5f));
292 } else {
293 blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
294 : 0.0f);
295 blob->set_vert_stroke_width(0.0f);
296 }
297 }
298 #else
299 // Without leptonica present, use the 2*area/perimeter as an approximation.
300 float width = 2.0f * blob->cblob()->area();
301 width /= blob->cblob()->perimeter();
302 blob->set_horz_stroke_width(width);
303 blob->set_vert_stroke_width(width);
304 #endif
305 }
306
307
308 /**********************************************************************
309 * assign_blobs_to_blocks2
310 *
311 * Make a list of TO_BLOCKs for portrait and landscape orientation.
312 **********************************************************************/
313
assign_blobs_to_blocks2(BLOCK_LIST * blocks,TO_BLOCK_LIST * land_blocks,TO_BLOCK_LIST * port_blocks)314 void assign_blobs_to_blocks2( //split into groups
315 BLOCK_LIST *blocks, //blocks to process
316 TO_BLOCK_LIST *land_blocks, // ** unused **
317 TO_BLOCK_LIST *port_blocks //output list
318 ) {
319 BLOCK *block; //current block
320 BLOBNBOX *newblob; //created blob
321 C_BLOB *blob; //current blob
322 BLOCK_IT block_it = blocks;
323 C_BLOB_IT blob_it; //iterator
324 BLOBNBOX_IT port_box_it; //iterator
325 //destination iterator
326 TO_BLOCK_IT port_block_it = port_blocks;
327 TO_BLOCK *port_block; //created block
328
329 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
330 block = block_it.data ();
331 port_block = new TO_BLOCK (block);
332
333 // Convert the good outlines to block->blob_list
334 port_box_it.set_to_list (&port_block->blobs);
335 blob_it.set_to_list(block->blob_list());
336 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
337 blob = blob_it.extract ();
338 newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
339 SetBlobStrokeWidth(false, newblob);
340 port_box_it.add_after_then_move (newblob);
341 }
342
343 // Put the rejected outlines in block->noise_blobs, which allows them to
344 // be reconsidered and sorted back into rows and recover outlines mistakenly
345 // rejected.
346 port_box_it.set_to_list(&port_block->noise_blobs);
347 blob_it.set_to_list(block->reject_blobs());
348 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
349 blob = blob_it.extract();
350 newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
351 SetBlobStrokeWidth(false, newblob);
352 port_box_it.add_after_then_move(newblob);
353 }
354
355 port_block_it.add_after_then_move (port_block);
356 }
357 }
358
359
360 /**********************************************************************
361 * filter_blobs
362 *
363 * Sort the blobs into sizes in all the blocks for later work.
364 **********************************************************************/
365
filter_blobs(ICOORD page_tr,TO_BLOCK_LIST * blocks,BOOL8 testing_on)366 void filter_blobs( //split into groups
367 ICOORD page_tr, //top right
368 TO_BLOCK_LIST *blocks, //output list
369 BOOL8 testing_on //for plotting
370 ) {
371 TO_BLOCK_IT block_it = blocks; //destination iterator
372 TO_BLOCK *block; //created block
373
374 #ifndef GRAPHICS_DISABLED
375 if (to_win != NULL)
376 to_win->Clear();
377 #endif
378 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
379 block_it.forward ()) {
380 block = block_it.data ();
381 block->line_size = filter_noise_blobs (&block->blobs,
382 &block->noise_blobs,
383 &block->small_blobs,
384 &block->large_blobs);
385 block->line_spacing =
386 block->line_size * (textord_merge_desc + textord_merge_x +
387 textord_merge_asc +
388 textord_merge_asc) / textord_merge_x;
389 block->line_size *= textord_min_linesize;
390 block->max_blob_size = block->line_size * textord_excess_blobsize;
391 #ifndef GRAPHICS_DISABLED
392 if (textord_show_blobs && testing_on) {
393 if (to_win == NULL)
394 create_to_win(page_tr);
395 block->plot_graded_blobs(to_win);
396 }
397 if (textord_show_boxes && testing_on) {
398 if (to_win == NULL)
399 create_to_win(page_tr);
400 plot_box_list (to_win, &block->noise_blobs, ScrollView::WHITE);
401 plot_box_list (to_win, &block->small_blobs, ScrollView::WHITE);
402 plot_box_list (to_win, &block->large_blobs, ScrollView::WHITE);
403 plot_box_list (to_win, &block->blobs, ScrollView::WHITE);
404 }
405 #endif
406 }
407 }
408
409
410 /**********************************************************************
411 * filter_noise_blobs
412 *
413 * Move small blobs to a separate list.
414 **********************************************************************/
415
filter_noise_blobs(BLOBNBOX_LIST * src_list,BLOBNBOX_LIST * noise_list,BLOBNBOX_LIST * small_list,BLOBNBOX_LIST * large_list)416 float filter_noise_blobs( //separate noise
417 BLOBNBOX_LIST *src_list, //origonal list
418 BLOBNBOX_LIST *noise_list, //noise list
419 BLOBNBOX_LIST *small_list, //small blobs
420 BLOBNBOX_LIST *large_list //large blobs
421 ) {
422 inT16 height; //height of blob
423 inT16 width; //of blob
424 BLOBNBOX_IT src_it = src_list; //iterators
425 BLOBNBOX_IT noise_it = noise_list;
426 BLOBNBOX_IT small_it = small_list;
427 BLOBNBOX_IT large_it = large_list;
428 STATS size_stats (0, MAX_NEAREST_DIST);
429 //blob heights
430 if (textord_new_initial_xheight)
431 return filter_noise_blobs2 (src_list, noise_list, small_list, large_list);
432 float min_y; //size limits
433 float max_y;
434 float max_x;
435
436 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
437 if (src_it.data ()->bounding_box ().height () < textord_max_noise_size)
438 noise_it.add_after_then_move (src_it.extract ());
439 }
440 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
441 size_stats.add (src_it.data ()->bounding_box ().height (), 1);
442 }
443 min_y = floor (size_stats.ile (textord_blob_size_smallile / 100.0));
444 max_y = ceil (size_stats.ile (textord_blob_size_bigile / 100.0));
445 max_x = ceil (size_stats.ile (0.5) * textord_width_limit);
446 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
447 height = src_it.data ()->bounding_box ().height ();
448 width = src_it.data ()->bounding_box ().width ();
449 if (height < min_y)
450 small_it.add_after_then_move (src_it.extract ());
451 else if (height > max_y || width > max_x)
452 large_it.add_after_then_move (src_it.extract ());
453 }
454 return size_stats.ile (textord_initialx_ile);
455 }
456
457
458 /**********************************************************************
459 * filter_noise_blobs2
460 *
461 * Move small blobs to a separate list.
462 **********************************************************************/
463
filter_noise_blobs2(BLOBNBOX_LIST * src_list,BLOBNBOX_LIST * noise_list,BLOBNBOX_LIST * small_list,BLOBNBOX_LIST * large_list)464 float filter_noise_blobs2( //separate noise
465 BLOBNBOX_LIST *src_list, //origonal list
466 BLOBNBOX_LIST *noise_list, //noise list
467 BLOBNBOX_LIST *small_list, //small blobs
468 BLOBNBOX_LIST *large_list //large blobs
469 ) {
470 inT16 height; //height of blob
471 inT16 width; //of blob
472 BLOBNBOX *blob; //current blob
473 float initial_x; //first guess
474 BLOBNBOX_IT src_it = src_list; //iterators
475 BLOBNBOX_IT noise_it = noise_list;
476 BLOBNBOX_IT small_it = small_list;
477 BLOBNBOX_IT large_it = large_list;
478 STATS size_stats (0, MAX_NEAREST_DIST);
479 //blob heights
480 float min_y; //size limits
481 float max_y;
482 float max_x;
483 float max_height; //of good blobs
484
485 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
486 blob = src_it.data ();
487 if (blob->bounding_box ().height () < textord_max_noise_size)
488 noise_it.add_after_then_move (src_it.extract ());
489 else if (blob->enclosed_area () >= blob->bounding_box ().height ()
490 * blob->bounding_box ().width () * textord_noise_area_ratio)
491 small_it.add_after_then_move (src_it.extract ());
492 }
493 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
494 size_stats.add (src_it.data ()->bounding_box ().height (), 1);
495 }
496 initial_x = size_stats.ile (textord_initialx_ile);
497 max_y =
498 ceil (initial_x *
499 (textord_merge_desc + textord_merge_x +
500 2 * textord_merge_asc) / textord_merge_x);
501 min_y = floor (initial_x / 2);
502 max_x = ceil (initial_x * textord_width_limit);
503 small_it.move_to_first ();
504 for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
505 small_it.forward ()) {
506 height = small_it.data()->bounding_box().height();
507 if (height > max_y)
508 large_it.add_after_then_move(small_it.extract ());
509 else if (height >= min_y)
510 src_it.add_after_then_move(small_it.extract ());
511 }
512 size_stats.clear ();
513 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
514 height = src_it.data ()->bounding_box ().height ();
515 width = src_it.data ()->bounding_box ().width ();
516 if (height < min_y)
517 small_it.add_after_then_move (src_it.extract ());
518 else if (height > max_y || width > max_x)
519 large_it.add_after_then_move (src_it.extract ());
520 else
521 size_stats.add (height, 1);
522 }
523 max_height = size_stats.ile (textord_initialasc_ile);
524 // printf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
525 // max_y,min_y,initial_x,max_height);
526 max_height *= textord_merge_x / (textord_merge_x + textord_merge_asc);
527 if (max_height > initial_x)
528 initial_x = max_height;
529 // printf(" ret=%g\n",initial_x);
530 return initial_x;
531 }
532
533
534 /**********************************************************************
535 * textord_page
536 *
537 * Textord the list of blobs and return a list of proper blocks.
538 **********************************************************************/
539
textord_page(ICOORD page_tr,BLOCK_LIST * blocks,TO_BLOCK_LIST * land_blocks,TO_BLOCK_LIST * port_blocks,tesseract::Tesseract * tess)540 void textord_page( //make rows & words
541 ICOORD page_tr, //top right
542 BLOCK_LIST *blocks, //block list
543 TO_BLOCK_LIST *land_blocks, //rotated for landscape
544 TO_BLOCK_LIST *port_blocks, //output list
545 tesseract::Tesseract* tess
546 ) {
547 float gradient; //global skew
548
549 set_global_loc_code(LOC_TEXT_ORD_ROWS);
550 gradient = make_rows (page_tr, blocks, land_blocks, port_blocks, tess);
551 if (global_monitor != NULL) {
552 global_monitor->ocr_alive = TRUE;
553 global_monitor->progress = 20;
554 }
555 set_global_loc_code(LOC_TEXT_ORD_WORDS);
556 make_words(page_tr, gradient, blocks, land_blocks, port_blocks, tess);
557 if (global_monitor != NULL) {
558 global_monitor->ocr_alive = TRUE;
559 global_monitor->progress = 30;
560 }
561 cleanup_blocks(blocks); //remove empties
562 #ifndef GRAPHICS_DISABLED
563 close_to_win();
564 #endif
565 if (textord_exit_after && !interactive_mode)
566 exit (0);
567 }
568
569 /**********************************************************************
570 * cleanup_blocks
571 *
572 * Delete empty blocks, rows from the page.
573 **********************************************************************/
574
cleanup_blocks(BLOCK_LIST * blocks)575 void cleanup_blocks( //remove empties
576 BLOCK_LIST *blocks //list
577 ) {
578 BLOCK_IT block_it = blocks; //iterator
579 ROW_IT row_it; //row iterator
580
581 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
582 block_it.forward ()) {
583 row_it.set_to_list (block_it.data ()->row_list ());
584 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
585 clean_small_noise_from_words(row_it.data());
586 if ((textord_noise_rejrows
587 && !row_it.data ()->word_list ()->empty ()
588 && clean_noise_from_row (row_it.data ()))
589 || row_it.data ()->word_list ()->empty ())
590 delete row_it.extract ();//lose empty row
591 else {
592 if (textord_noise_rejwords)
593 clean_noise_from_words (row_it.data ());
594 if (textord_blshift_maxshift >= 0)
595 tweak_row_baseline (row_it.data ());
596 }
597 }
598 if (block_it.data ()->row_list ()->empty ()) {
599 delete block_it.extract ();//lose empty block
600 }
601 }
602 }
603
604
605 /**********************************************************************
606 * clean_noise_from_row
607 *
608 * Move blobs of words from rows of garbage into the reject blobs list.
609 **********************************************************************/
610
clean_noise_from_row(ROW * row)611 BOOL8 clean_noise_from_row( //remove empties
612 ROW *row //row to clean
613 ) {
614 BOOL8 testing_on;
615 TBOX blob_box; //bounding box
616 C_BLOB *blob; //current blob
617 C_OUTLINE *outline; //current outline
618 WERD *word; //current word
619 inT32 blob_size; //biggest size
620 inT32 trans_count = 0; //no of transitions
621 inT32 trans_threshold; //noise tolerance
622 inT32 dot_count; //small objects
623 inT32 norm_count; //normal objects
624 inT32 super_norm_count; //real char-like
625 //words of row
626 WERD_IT word_it = row->word_list ();
627 C_BLOB_IT blob_it; //blob iterator
628 C_OUTLINE_IT out_it; //outline iterator
629
630 if (textord_test_y > row->base_line (textord_test_x)
631 && textord_show_blobs
632 && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
633 testing_on = TRUE;
634 else
635 testing_on = FALSE;
636 dot_count = 0;
637 norm_count = 0;
638 super_norm_count = 0;
639 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
640 word = word_it.data (); //current word
641 //blobs in word
642 blob_it.set_to_list (word->cblob_list ());
643 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
644 blob_it.forward ()) {
645 blob = blob_it.data ();
646 if (!word->flag (W_DONT_CHOP)) {
647 //get outlines
648 out_it.set_to_list (blob->out_list ());
649 for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
650 out_it.forward ()) {
651 outline = out_it.data ();
652 blob_box = outline->bounding_box ();
653 blob_size =
654 blob_box.width () >
655 blob_box.height ()? blob_box.width () : blob_box.
656 height();
657 if (blob_size < textord_noise_sizelimit * row->x_height ())
658 dot_count++; //count smal outlines
659 if (!outline->child ()->empty ()
660 && blob_box.height () <
661 (1 + textord_noise_syfract) * row->x_height ()
662 && blob_box.height () >
663 (1 - textord_noise_syfract) * row->x_height ()
664 && blob_box.width () <
665 (1 + textord_noise_sxfract) * row->x_height ()
666 && blob_box.width () >
667 (1 - textord_noise_sxfract) * row->x_height ())
668 super_norm_count++; //count smal outlines
669 }
670 }
671 else
672 super_norm_count++;
673 blob_box = blob->bounding_box ();
674 blob_size =
675 blob_box.width () >
676 blob_box.height ()? blob_box.width () : blob_box.height ();
677 if (blob_size >= textord_noise_sizelimit * row->x_height ()
678 && blob_size < row->x_height () * 2) {
679 trans_threshold = blob_size / textord_noise_sizefraction;
680 trans_count = blob->count_transitions (trans_threshold);
681 if (trans_count < textord_noise_translimit)
682 norm_count++;
683 }
684 else if (blob_box.height () > row->x_height () * 2
685 && (!word_it.at_first () || !blob_it.at_first ()))
686 dot_count += 2;
687 #ifndef SECURE_NAMES
688 if (testing_on) {
689 tprintf
690 ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
691 blob_box.left (), blob_box.bottom (), blob_box.right (),
692 blob_box.top (), blob->out_list ()->length (), trans_count,
693 blob_box.bottom () - row->base_line (blob_box.left ()));
694 }
695 #endif
696 }
697 }
698 #ifndef SECURE_NAMES
699 if (textord_noise_debug) {
700 tprintf ("Row ending at (%d,%g):",
701 blob_box.right (), row->base_line (blob_box.right ()));
702 tprintf (" R=%g, dc=%d, nc=%d, %s\n",
703 norm_count > 0 ? (float) dot_count / norm_count : 9999,
704 dot_count, norm_count,
705 dot_count > norm_count * textord_noise_normratio
706 && dot_count > 2 ? "REJECTED" : "ACCEPTED");
707 }
708 #endif
709 return super_norm_count < textord_noise_sncount
710 && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
711 }
712
713
714 /**********************************************************************
715 * clean_noise_from_words
716 *
717 * Move blobs of words from rows of garbage into the reject blobs list.
718 **********************************************************************/
719
clean_noise_from_words(ROW * row)720 void clean_noise_from_words( //remove empties
721 ROW *row //row to clean
722 ) {
723 TBOX blob_box; //bounding box
724 inT8 *word_dud; //was it chucked
725 C_BLOB *blob; //current blob
726 C_OUTLINE *outline; //current outline
727 WERD *word; //current word
728 inT32 blob_size; //biggest size
729 inT32 trans_count; //no of transitions
730 inT32 trans_threshold; //noise tolerance
731 inT32 dot_count; //small objects
732 inT32 norm_count; //normal objects
733 inT32 dud_words; //number discarded
734 inT32 ok_words; //number remaining
735 inT32 word_index; //current word
736 //words of row
737 WERD_IT word_it = row->word_list ();
738 C_BLOB_IT blob_it; //blob iterator
739 C_OUTLINE_IT out_it; //outline iterator
740
741 ok_words = word_it.length ();
742 if (ok_words == 0 || textord_no_rejects)
743 return;
744 word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8));
745 dud_words = 0;
746 ok_words = 0;
747 word_index = 0;
748 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
749 word = word_it.data (); //current word
750 dot_count = 0;
751 norm_count = 0;
752 //blobs in word
753 blob_it.set_to_list (word->cblob_list ());
754 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
755 blob_it.forward ()) {
756 blob = blob_it.data ();
757 if (!word->flag (W_DONT_CHOP)) {
758 //get outlines
759 out_it.set_to_list (blob->out_list ());
760 for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
761 out_it.forward ()) {
762 outline = out_it.data ();
763 blob_box = outline->bounding_box ();
764 blob_size =
765 blob_box.width () >
766 blob_box.height ()? blob_box.width () : blob_box.
767 height();
768 if (blob_size < textord_noise_sizelimit * row->x_height ())
769 dot_count++; //count smal outlines
770 if (!outline->child ()->empty ()
771 && blob_box.height () <
772 (1 + textord_noise_syfract) * row->x_height ()
773 && blob_box.height () >
774 (1 - textord_noise_syfract) * row->x_height ()
775 && blob_box.width () <
776 (1 + textord_noise_sxfract) * row->x_height ()
777 && blob_box.width () >
778 (1 - textord_noise_sxfract) * row->x_height ())
779 norm_count++; //count smal outlines
780 }
781 }
782 else
783 norm_count++;
784 blob_box = blob->bounding_box ();
785 blob_size =
786 blob_box.width () >
787 blob_box.height ()? blob_box.width () : blob_box.height ();
788 if (blob_size >= textord_noise_sizelimit * row->x_height ()
789 && blob_size < row->x_height () * 2) {
790 trans_threshold = blob_size / textord_noise_sizefraction;
791 trans_count = blob->count_transitions (trans_threshold);
792 if (trans_count < textord_noise_translimit)
793 norm_count++;
794 }
795 else if (blob_box.height () > row->x_height () * 2
796 && (!word_it.at_first () || !blob_it.at_first ()))
797 dot_count += 2;
798 }
799 if (dot_count > 2) {
800 if (dot_count > norm_count * textord_noise_normratio * 2)
801 word_dud[word_index] = 2;
802 else if (dot_count > norm_count * textord_noise_normratio)
803 word_dud[word_index] = 1;
804 else
805 word_dud[word_index] = 0;
806 }
807 else
808 word_dud[word_index] = 0;
809 if (word_dud[word_index] == 2)
810 dud_words++;
811 else
812 ok_words++;
813 word_index++;
814 }
815
816 word_index = 0;
817 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
818 if (word_dud[word_index] == 2
819 || (word_dud[word_index] == 1 && dud_words > ok_words)) {
820 word = word_it.data (); //current word
821 //rejected blobs
822 blob_it.set_to_list (word->rej_cblob_list ());
823 //move from blobs
824 blob_it.add_list_after (word->cblob_list ());
825 }
826 word_index++;
827 }
828 free_mem(word_dud);
829 }
830
831 // Remove outlines that are a tiny fraction in either width or height
832 // of the word height.
clean_small_noise_from_words(ROW * row)833 void clean_small_noise_from_words(ROW *row) {
834 WERD_IT word_it(row->word_list());
835 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
836 WERD* word = word_it.data();
837 int min_size = static_cast<int>(
838 textord_noise_hfract * word->bounding_box().height() + 0.5);
839 C_BLOB_IT blob_it(word->cblob_list());
840 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
841 C_BLOB* blob = blob_it.data();
842 C_OUTLINE_IT out_it(blob->out_list());
843 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
844 C_OUTLINE* outline = out_it.data();
845 outline->RemoveSmallRecursive(min_size, &out_it);
846 }
847 if (blob->out_list()->empty()) {
848 delete blob_it.extract();
849 }
850 }
851 if (word->cblob_list()->empty()) {
852 if (!word_it.at_last()) {
853 // The next word is no longer a fuzzy non space if it was before,
854 // since the word before is about to be deleted.
855 WERD* next_word = word_it.data_relative(1);
856 if (next_word->flag(W_FUZZY_NON)) {
857 next_word->set_flag(W_FUZZY_NON, false);
858 }
859 }
860 delete word_it.extract();
861 }
862 }
863 }
864
865
/**********************************************************************
 * tweak_row_baseline
 *
 * Shift baseline to fit the blobs more accurately where they are
 * close enough.
 *
 * The row's baseline spline is rebuilt segment by segment. A blob whose
 * bottom lies within textord_blshift_maxshift (as a fraction of the
 * x-height) of the current baseline, and whose height exceeds
 * textord_blshift_xfraction of the x-height, contributes a flat segment
 * at its own bottom. Elsewhere the segments of the existing spline are
 * copied across. The result replaces row->baseline.
 * Rows containing no blobs are left unchanged.
 **********************************************************************/

void tweak_row_baseline(         //refit baseline
                        ROW *row //row to adjust
                       ) {
  TBOX blob_box;                 //bounding box
  C_BLOB *blob;                  //current blob
  WERD *word;                    //current word
  inT32 blob_count;              //no of blobs
  inT32 src_index;               //source segment
  inT32 dest_index;              //destination segment
  inT32 *xstarts;                //spline segments
  double *coeffs;                //spline coeffs
  float ydiff;                   //baseline error
  float x_centre;                //centre of blob
                                 //words of row
  WERD_IT word_it = row->word_list ();
  C_BLOB_IT blob_it;             //blob iterator

  // First pass: count the blobs so the output arrays can be sized.
  blob_count = 0;
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();      //current word
                                 //get total blobs
    blob_count += word->cblob_list ()->length ();
  }
  if (blob_count == 0)
    return;                      //nothing to fit to
  // Buffers are sized for blob_count + baseline.segments output segments:
  // one x boundary more than that, and three coefficients per segment.
  // NOTE(review): the alloc_mem results are used without a check here;
  // whether alloc_mem can return NULL is not visible from this function.
  xstarts =
    (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
    sizeof (inT32));
  coeffs =
    (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
    sizeof (double));

  // src_index walks the segments of the existing spline, dest_index the
  // segments being written. xstarts[dest_index] always holds the start x
  // of the segment that will be written next.
  src_index = 0;
  dest_index = 0;
  xstarts[0] = row->baseline.xcoords[0];
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();      //current word
                                 //blobs in word
    blob_it.set_to_list (word->cblob_list ());
    for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
    blob_it.forward ()) {
      blob = blob_it.data ();
      blob_box = blob->bounding_box ();
      x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
      // Distance from the blob bottom to the existing baseline at the blob
      // centre, made positive and expressed as a fraction of the x-height.
      // NOTE(review): assumes row->x_height () is non-zero - TODO confirm.
      ydiff = blob_box.bottom () - row->base_line (x_centre);
      if (ydiff < 0)
        ydiff = -ydiff / row->x_height ();
      else
        ydiff = ydiff / row->x_height ();
      if (ydiff < textord_blshift_maxshift
        && blob_box.height () / row->x_height () >
      textord_blshift_xfraction) {
        // Blob is close to the baseline and tall enough: emit a constant
        // segment (a = b = 0) at the blob's bottom, ending just past its
        // right edge.
        // If the pending start is at or beyond the blob centre, pull it
        // back to the blob's left edge.
        if (xstarts[dest_index] >= x_centre)
          xstarts[dest_index] = blob_box.left ();
        coeffs[dest_index * 3] = 0;
        coeffs[dest_index * 3 + 1] = 0;
        coeffs[dest_index * 3 + 2] = blob_box.bottom ();
        //shift it
        dest_index++;
        xstarts[dest_index] = blob_box.right () + 1;
      }
      else {
        // Blob does not qualify: keep the existing spline here. Nothing is
        // done when the pending start is already past the blob centre.
        if (xstarts[dest_index] <= x_centre) {
          // Copy each source segment that ends at or before the blob
          // centre, skipping those that end at or before the pending
          // start. The loop never advances past the last source segment.
          while (row->baseline.xcoords[src_index + 1] <= x_centre
          && src_index < row->baseline.segments - 1) {
            if (row->baseline.xcoords[src_index + 1] >
            xstarts[dest_index]) {
              coeffs[dest_index * 3] =
                row->baseline.quadratics[src_index].a;
              coeffs[dest_index * 3 + 1] =
                row->baseline.quadratics[src_index].b;
              coeffs[dest_index * 3 + 2] =
                row->baseline.quadratics[src_index].c;
              dest_index++;
              xstarts[dest_index] =
                row->baseline.xcoords[src_index + 1];
            }
            src_index++;
          }
          // Copy the source segment the walk stopped on, ending the new
          // segment where that source segment ends.
          coeffs[dest_index * 3] =
            row->baseline.quadratics[src_index].a;
          coeffs[dest_index * 3 + 1] =
            row->baseline.quadratics[src_index].b;
          coeffs[dest_index * 3 + 2] =
            row->baseline.quadratics[src_index].c;
          dest_index++;
          xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
        }
      }
    }
  }
  // All blobs handled. Skip source segments that end at or before the
  // pending start, then copy whatever source segments remain.
  while (src_index < row->baseline.segments
    && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
  src_index++;
  while (src_index < row->baseline.segments) {
    coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
    coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
    coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
    dest_index++;
    src_index++;
    xstarts[dest_index] = row->baseline.xcoords[src_index];
  }
                                 //turn to spline
  // dest_index is now the number of segments written.
  row->baseline = QSPLINE (dest_index, xstarts, coeffs);
  // The temporary arrays are released once the spline has been built from
  // them.
  free_mem(xstarts);
  free_mem(coeffs);
}
980
981
982 /**********************************************************************
983 * blob_y_order
984 *
985 * Sort function to sort blobs in y from page top.
986 **********************************************************************/
987
blob_y_order(void * item1,void * item2)988 inT32 blob_y_order( //sort function
989 void *item1, //items to compare
990 void *item2) {
991 //converted ptr
992 BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
993 //converted ptr
994 BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
995
996 if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ())
997 return -1;
998 else if (blob1->bounding_box ().bottom () <
999 blob2->bounding_box ().bottom ())
1000 return 1;
1001 else {
1002 if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
1003 return -1;
1004 else if (blob1->bounding_box ().left () >
1005 blob2->bounding_box ().left ())
1006 return 1;
1007 else
1008 return 0;
1009 }
1010 }
1011
1012