• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**********************************************************************
2  * File:        makerow.cpp  (Formerly makerows.c)
3  * Description: Code to arrange blobs into rows of text.
4  * Author:		Ray Smith
5  * Created:		Mon Sep 21 14:34:48 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #ifdef __UNIX__
22 #include          <assert.h>
23 #endif
24 #include          "stderr.h"
25 #include          "blobbox.h"
26 #include          "lmedsq.h"
27 #include          "statistc.h"
28 #include          "drawtord.h"
29 #include          "blkocc.h"
30 #include          "sortflts.h"
31 #include          "oldbasel.h"
32 #include          "tordmain.h"
33 #include          "underlin.h"
34 #include          "makerow.h"
35 #include          "tprintf.h"
36 #include          "tesseractclass.h"
37 #include          "tovars.h"
38 
39 BOOL_VAR(textord_heavy_nr, FALSE, "Vigorously remove noise");
40 BOOL_VAR(textord_show_initial_rows, FALSE, "Display row accumulation");
41 BOOL_VAR(textord_show_parallel_rows, FALSE, "Display page correlated rows");
42 BOOL_VAR(textord_show_expanded_rows, FALSE, "Display rows after expanding");
43 BOOL_VAR(textord_show_final_rows, FALSE, "Display rows after final fitting");
44 BOOL_VAR(textord_show_final_blobs, FALSE, "Display blob bounds after pre-ass");
45 BOOL_VAR(textord_test_landscape, FALSE, "Tests refer to land/port");
46 BOOL_VAR(textord_parallel_baselines, TRUE, "Force parallel baselines");
47 BOOL_VAR(textord_straight_baselines, FALSE, "Force straight baselines");
48 BOOL_VAR(textord_quadratic_baselines, FALSE, "Use quadratic splines");
49 BOOL_VAR(textord_old_baselines, TRUE, "Use old baseline algorithm");
50 BOOL_VAR(textord_old_xheight, FALSE, "Use old xheight algorithm");
51 BOOL_VAR(textord_fix_xheight_bug, TRUE, "Use spline baseline");
52 BOOL_VAR(textord_fix_makerow_bug, TRUE, "Prevent multiple baselines");
53 BOOL_VAR(textord_cblob_blockocc, TRUE, "Use new projection for underlines");
54 BOOL_VAR(textord_debug_xheights, FALSE, "Test xheight algorithms");
55 BOOL_VAR(textord_biased_skewcalc, TRUE, "Bias skew estimates with line length");
56 BOOL_VAR(textord_interpolating_skew, TRUE, "Interpolate across gaps");
57 INT_VAR(textord_skewsmooth_offset, 2, "For smooth factor");
58 INT_VAR(textord_skewsmooth_offset2, 1, "For smooth factor");
59 INT_VAR(textord_test_x, -1, "coord of test pt");
60 INT_VAR(textord_test_y, -1, "coord of test pt");
61 INT_VAR(textord_min_blobs_in_row, 4, "Min blobs before gradient counted");
62 INT_VAR(textord_spline_minblobs, 8, "Min blobs in each spline segment");
63 INT_VAR(textord_spline_medianwin, 6, "Size of window for spline segmentation");
64 INT_VAR(textord_max_blob_overlaps, 4,
65         "Max number of blobs a big blob can overlap");
66 INT_VAR(textord_min_xheight, 10, "Min credible pixel xheight");
67 double_VAR(textord_spline_shift_fraction, 0.02,
68 "Fraction of line spacing for quad");
69 double_VAR(textord_spline_outlier_fraction, 0.1,
70 "Fraction of line spacing for outlier");
71 double_VAR(textord_skew_ile, 0.5, "Ile of gradients for page skew");
72 double_VAR(textord_skew_lag, 0.01, "Lag for skew on row accumulation");
73 double_VAR(textord_linespace_iqrlimit, 0.2, "Max iqr/median for linespace");
74 double_VAR(textord_width_limit, 8, "Max width of blobs to make rows");
75 double_VAR(textord_chop_width, 1.5, "Max width before chopping");
76 double_VAR(textord_expansion_factor, 1.0,
77 "Factor to expand rows by in expand_rows");
78 double_VAR(textord_overlap_x, 0.5, "Fraction of linespace for good overlap");
79 double_VAR(textord_merge_desc, 0.25, "Fraction of linespace for desc drop");
80 double_VAR(textord_merge_x, 0.5, "Fraction of linespace for x height");
81 double_VAR(textord_merge_asc, 0.25, "Fraction of linespace for asc height");
82 double_VAR(textord_minxh, 0.25, "fraction of linesize for min xheight");
83 double_VAR(textord_min_linesize, 1.25, "* blob height for initial linesize");
84 double_VAR(textord_excess_blobsize, 1.3,
85 "New row made if blob makes row this big");
86 double_VAR(textord_occupancy_threshold, 0.4, "Fraction of neighbourhood");
87 double_VAR(textord_underline_width, 2.0, "Multiple of line_size for underline");
88 double_VAR(textord_min_blob_height_fraction, 0.75,
89            "Min blob height/top to include blob top into xheight stats");
90 double_VAR(textord_xheight_mode_fraction, 0.4,
91 "Min pile height to make xheight");
92 double_VAR(textord_ascheight_mode_fraction, 0.08,
93 "Min pile height to make ascheight");
94 double_VAR(textord_descheight_mode_fraction, 0.08,
95            "Min pile height to make descheight");
96 double_VAR(textord_ascx_ratio_min, 1.3, "Min cap/xheight");
97 double_VAR(textord_ascx_ratio_max, 1.8, "Max cap/xheight");
98 double_VAR(textord_descx_ratio_min, 0.25, "Min desc/xheight");
99 double_VAR(textord_descx_ratio_max, 0.6, "Max desc/xheight");
100 double_VAR(textord_xheight_error_margin, 0.1, "Accepted variation");
101 
102 #define MAX_HEIGHT_MODES  12
103 
104 /**********************************************************************
105  * make_single_row
106  *
107  * Arrange the blobs into a single row.
108  **********************************************************************/
make_single_row(ICOORD page_tr,TO_BLOCK * block,TO_BLOCK_LIST * blocks,tesseract::Tesseract * tess)109 float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks,
110                       tesseract::Tesseract* tess) {
111   BLOBNBOX_IT blob_it = &block->blobs;
112   TO_ROW_IT row_it = block->get_rows();
113 
114   // Include all the small blobs and large blobs.
115   blob_it.add_list_after(&block->small_blobs);
116   blob_it.add_list_after(&block->noise_blobs);
117   blob_it.add_list_after(&block->large_blobs);
118   blob_it.sort(blob_x_order);
119   blob_it.move_to_first();
120   TO_ROW* row = NULL;
121   // Add all the blobs to a single TO_ROW.
122   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
123     BLOBNBOX* blob = blob_it.extract();
124     int top = blob->bounding_box().top();
125     int bottom = blob->bounding_box().bottom();
126     if (row == NULL) {
127       row = new TO_ROW(blob, top, bottom, block->line_size);
128       row_it.add_before_then_move(row);
129     } else {
130       row->add_blob(blob, top, bottom, block->line_size);
131     }
132   }
133   // Fit an LMS line to the row.
134   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward())
135     fit_lms_line(row_it.data());
136   float gradient;
137   float fit_error;
138   // Compute the skew based on the fitted line.
139   compute_page_skew(blocks, gradient, fit_error);
140   FCOORD rotation(1.0f, 0.0f);
141   // Associate i dots and other diacriticals with the appropriate blobs.
142   pre_associate_blobs(page_tr, block, rotation, false);
143   int block_edge = block->block->bounding_box().left();
144   fit_parallel_rows(block, gradient, rotation, block_edge, false);
145   // Make the curved baselines and setup some key block members.
146   make_spline_rows(block, gradient, rotation, block_edge, false, tess);
147   return gradient;
148 }
149 
150 /**********************************************************************
151  * make_rows
152  *
153  * Arrange the blobs into rows.
154  **********************************************************************/
make_rows(ICOORD page_tr,BLOCK_LIST * blocks,TO_BLOCK_LIST * land_blocks,TO_BLOCK_LIST * port_blocks,tesseract::Tesseract * tess)155 float make_rows(                             //make rows
156                 ICOORD page_tr,              //top right
157                 BLOCK_LIST *blocks,          //block list
158                 TO_BLOCK_LIST *land_blocks,  //rotated for landscape
159                 TO_BLOCK_LIST *port_blocks,  //output list
160                 tesseract::Tesseract* tess
161                ) {
162   float port_m;                  //global skew
163   float port_err;                //global noise
164   //   float                                     land_m;                                         //global skew
165   //      float                                   land_err;                                       //global noise
166   TO_BLOCK_IT block_it;          //iterator
167 
168   //don't do landscape for now
169   //      block_it.set_to_list(land_blocks);
170   //      for (block_it.mark_cycle_pt();!block_it.cycled_list();block_it.forward())
171   //              make_initial_textrows(page_tr,block_it.data(),FCOORD(0,-1),
172   //                      (BOOL8)textord_test_landscape);
173   block_it.set_to_list (port_blocks);
174   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
175     block_it.forward ())
176   make_initial_textrows (page_tr, block_it.data (), FCOORD (1.0f, 0.0f),
177       !(BOOL8) textord_test_landscape);
178                                  //compute globally
179   compute_page_skew(port_blocks, port_m, port_err);
180   //      compute_page_skew(land_blocks,land_m,land_err);                 //compute globally
181   //      tprintf("Portrait skew gradient=%g, error=%g.\n",
182   //              port_m,port_err);
183   //      tprintf("Landscape skew gradient=%g, error=%g.\n",
184   //              land_m,land_err);
185   block_it.set_to_list (port_blocks);
186   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
187     cleanup_rows (page_tr, block_it.data (), port_m, FCOORD (1.0f, 0.0f),
188       block_it.data ()->block->bounding_box ().left (),
189                  !(BOOL8)textord_test_landscape, tess);
190   }
191   block_it.set_to_list (land_blocks);
192   //      for (block_it.mark_cycle_pt();!block_it.cycled_list();block_it.forward())
193   //      {
194   //              cleanup_rows(page_tr,block_it.data(),land_m,FCOORD(0,-1),
195   //                      -block_it.data()->block->bounding_box().top(),
196   //                      (BOOL8)textord_test_landscape);
197   //      }
198   return port_m;                 //global skew
199 }
200 
201 
202 /**********************************************************************
203  * make_initial_textrows
204  *
205  * Arrange the good blobs into rows of text.
206  **********************************************************************/
make_initial_textrows(ICOORD page_tr,TO_BLOCK * block,FCOORD rotation,BOOL8 testing_on)207 void make_initial_textrows(                  //find lines
208                            ICOORD page_tr,
209                            TO_BLOCK *block,  //block to do
210                            FCOORD rotation,  //for drawing
211                            BOOL8 testing_on  //correct orientation
212                           ) {
213   TO_ROW_IT row_it = block->get_rows ();
214 
215 #ifndef GRAPHICS_DISABLED
216   ScrollView::Color colour;                 //of row
217 
218   if (textord_show_initial_rows && testing_on) {
219     if (to_win == NULL)
220       create_to_win(page_tr);
221   }
222 #endif
223                                  //guess skew
224   assign_blobs_to_rows (block, NULL, 0, TRUE, TRUE, textord_show_initial_rows && testing_on);
225   row_it.move_to_first ();
226   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
227     fit_lms_line (row_it.data ());
228 #ifndef GRAPHICS_DISABLED
229   if (textord_show_initial_rows && testing_on) {
230     colour = ScrollView::RED;
231     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
232       plot_to_row (row_it.data (), colour, rotation);
233       colour = (ScrollView::Color) (colour + 1);
234       if (colour > ScrollView::MAGENTA)
235         colour = ScrollView::RED;
236     }
237   }
238 #endif
239 }
240 
241 
242 /**********************************************************************
243  * fit_lms_line
244  *
245  * Fit an LMS line to a row.
246  **********************************************************************/
fit_lms_line(TO_ROW * row)247 void fit_lms_line(             //sort function
248                   TO_ROW *row  //row to fit
249                  ) {
250   float m, c;                    //fitted line
251   TBOX box;                       //blob box
252   LMS lms (row->blob_list ()->length ());
253                                  //blobs
254   BLOBNBOX_IT blob_it = row->blob_list ();
255 
256   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
257     box = blob_it.data ()->bounding_box ();
258     lms.add (FCOORD ((box.left () + box.right ()) / 2.0, box.bottom ()));
259   }
260   lms.fit (m, c);
261   row->set_line (m, c, lms.error ());
262 }
263 
264 
265 /**********************************************************************
266  * compute_page_skew
267  *
268  * Compute the skew over a full page by averaging the gradients over
269  * all the lines. Get the error of the same row.
270  **********************************************************************/
compute_page_skew(TO_BLOCK_LIST * blocks,float & page_m,float & page_err)271 void compute_page_skew(                        //get average gradient
272                        TO_BLOCK_LIST *blocks,  //list of blocks
273                        float &page_m,          //average gradient
274                        float &page_err         //average error
275                       ) {
276   inT32 row_count;               //total rows
277   inT32 blob_count;              //total_blobs
278   inT32 row_err;                 //integer error
279   float *gradients;              //of rows
280   float *errors;                 //of rows
281   inT32 row_index;               //of total
282   TO_ROW *row;                   //current row
283   TO_BLOCK_IT block_it = blocks; //iterator
284   TO_ROW_IT row_it;
285 
286   row_count = 0;
287   blob_count = 0;
288   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
289   block_it.forward ()) {
290     row_count += block_it.data ()->get_rows ()->length ();
291     //count up rows
292     row_it.set_to_list (block_it.data ()->get_rows ());
293     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
294       blob_count += row_it.data ()->blob_list ()->length ();
295   }
296   if (row_count == 0) {
297     page_m = 0.0f;
298     page_err = 0.0f;
299     return;
300   }
301   gradients = (float *) alloc_mem (blob_count * sizeof (float));
302   //get mem
303   errors = (float *) alloc_mem (blob_count * sizeof (float));
304   if (gradients == NULL || errors == NULL)
305     MEMORY_OUT.error ("compute_page_skew", ABORT, NULL);
306 
307   row_index = 0;
308   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
309   block_it.forward ()) {
310     row_it.set_to_list (block_it.data ()->get_rows ());
311     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
312       row = row_it.data ();
313       blob_count = row->blob_list ()->length ();
314       row_err = (inT32) ceil (row->line_error ());
315       if (row_err <= 0)
316         row_err = 1;
317       if (textord_biased_skewcalc) {
318         blob_count /= row_err;
319         for (blob_count /= row_err; blob_count > 0; blob_count--) {
320           gradients[row_index] = row->line_m ();
321           errors[row_index] = row->line_error ();
322           row_index++;
323         }
324       }
325       else if (blob_count >= textord_min_blobs_in_row) {
326                                  //get gradient
327         gradients[row_index] = row->line_m ();
328         errors[row_index] = row->line_error ();
329         row_index++;
330       }
331     }
332   }
333   if (row_index == 0) {
334                                  //desperate
335     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
336     block_it.forward ()) {
337       row_it.set_to_list (block_it.data ()->get_rows ());
338       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
339       row_it.forward ()) {
340         row = row_it.data ();
341         gradients[row_index] = row->line_m ();
342         errors[row_index] = row->line_error ();
343         row_index++;
344       }
345     }
346   }
347   row_count = row_index;
348   row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
349     gradients, row_count);
350   page_m = gradients[row_index];
351   row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
352     errors, row_count);
353   page_err = errors[row_index];
354   free_mem(gradients);
355   free_mem(errors);
356 }
357 
358 const double kNoiseSize = 0.5;  // Fraction of xheight.
359 const int kMinSize = 8;  // Min pixels to be xheight.
360 
361 // Return true if the dot looks like it is part of the i.
362 // Doesn't work for any other diacritical.
dot_of_i(BLOBNBOX * dot,BLOBNBOX * i,TO_ROW * row)363 static bool dot_of_i(BLOBNBOX* dot, BLOBNBOX* i, TO_ROW* row) {
364   const TBOX& ibox = i->bounding_box();
365   const TBOX& dotbox = dot->bounding_box();
366 
367   // Must overlap horizontally by enough and be high enough.
368   int overlap = MIN(dotbox.right(), ibox.right()) -
369                 MAX(dotbox.left(), ibox.left());
370   if (ibox.height() <= 2 * dotbox.height() ||
371       (overlap * 2 < ibox.width() && overlap < dotbox.width()))
372     return false;
373 
374   // If the i is tall and thin then it is good.
375   if (ibox.height() > ibox.width() * 2)
376     return true;  // The i or ! must be tall and thin.
377 
378   // It might still be tall and thin, but it might be joined to something.
379   // So search the outline for a piece of large height close to the edges
380   // of the dot.
381   const double kHeightFraction = 0.6;
382   double target_height = MIN(dotbox.bottom(), ibox.top());
383   target_height -= row->line_m()*dotbox.left() + row->line_c();
384   target_height *= kHeightFraction;
385   int left_min = dotbox.left() - dotbox.width();
386   int middle = (dotbox.left() + dotbox.right())/2;
387   int right_max = dotbox.right() + dotbox.width();
388   int left_miny = 0;
389   int left_maxy = 0;
390   int right_miny = 0;
391   int right_maxy = 0;
392   bool found_left = false;
393   bool found_right = false;
394   bool in_left = false;
395   bool in_right = false;
396   C_BLOB* blob = i->cblob();
397   C_OUTLINE_IT o_it = blob->out_list();
398   for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
399     C_OUTLINE* outline = o_it.data();
400     int length = outline->pathlength();
401     ICOORD pos = outline->start_pos();
402     for (int step = 0; step < length; pos += outline->step(step++)) {
403       int x = pos.x();
404       int y = pos.y();
405       if (x >= left_min && x < middle && !found_left) {
406         // We are in the left part so find min and max y.
407         if (in_left) {
408           if (y > left_maxy) left_maxy = y;
409           if (y < left_miny) left_miny = y;
410         } else {
411           left_maxy = left_miny = y;
412           in_left = true;
413         }
414       } else if (in_left) {
415         // We just left the left so look for size.
416         if (left_maxy - left_miny > target_height) {
417           if (found_right)
418             return true;
419           found_left = true;
420         }
421         in_left = false;
422       }
423       if (x <= right_max && x > middle && !found_right) {
424         // We are in the right part so find min and max y.
425         if (in_right) {
426           if (y > right_maxy) right_maxy = y;
427           if (y < right_miny) right_miny = y;
428         } else {
429           right_maxy = right_miny = y;
430           in_right = true;
431         }
432       } else if (in_right) {
433         // We just left the right so look for size.
434         if (right_maxy - right_miny > target_height) {
435           if (found_left)
436             return true;
437           found_right = true;
438         }
439         in_right = false;
440       }
441     }
442   }
443   return false;
444 }
445 
vigorous_noise_removal(TO_BLOCK * block)446 static void vigorous_noise_removal(TO_BLOCK* block) {
447   TO_ROW_IT row_it = block->get_rows ();
448   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
449     TO_ROW* row = row_it.data();
450     BLOBNBOX_IT b_it = row->blob_list();
451     // Estimate the xheight on the row.
452     int max_height = 0;
453     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
454       BLOBNBOX* blob = b_it.data();
455       if (blob->bounding_box().height() > max_height)
456         max_height = blob->bounding_box().height();
457     }
458     STATS hstats(0, max_height + 1);
459     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
460       BLOBNBOX* blob = b_it.data();
461       int height = blob->bounding_box().height();
462       if (height >= kMinSize)
463         hstats.add(blob->bounding_box().height(), 1);
464     }
465     float xheight = hstats.median();
466     // Delete small objects.
467     BLOBNBOX* prev = NULL;
468     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
469       BLOBNBOX* blob = b_it.data();
470       const TBOX& box = blob->bounding_box();
471       if (box.height() < kNoiseSize * xheight) {
472         // Small so delete unless it looks like an i dot.
473         if (prev != NULL) {
474           if (dot_of_i(blob, prev, row))
475             continue;  // Looks OK.
476         }
477         if (!b_it.at_last()) {
478           BLOBNBOX* next = b_it.data_relative(1);
479           if (dot_of_i(blob, next, row))
480             continue;  // Looks OK.
481         }
482         // It might be noise so get rid of it.
483         if (blob->blob() != NULL)
484           delete blob->blob();
485         if (blob->cblob() != NULL)
486           delete blob->cblob();
487         delete b_it.extract();
488       } else {
489         prev = blob;
490       }
491     }
492   }
493 }
494 
495 /**********************************************************************
496  * cleanup_rows
497  *
498  * Remove overlapping rows and fit all the blobs to what's left.
499  **********************************************************************/
cleanup_rows(ICOORD page_tr,TO_BLOCK * block,float gradient,FCOORD rotation,inT32 block_edge,BOOL8 testing_on,tesseract::Tesseract * tess)500 void cleanup_rows(                   //find lines
501                   ICOORD page_tr,    //top right
502                   TO_BLOCK *block,   //block to do
503                   float gradient,    //gradient to fit
504                   FCOORD rotation,   //for drawing
505                   inT32 block_edge,  //edge of block
506                   BOOL8 testing_on,  //correct orientation
507                   tesseract::Tesseract* tess
508                  ) {
509                                  //iterators
510   BLOBNBOX_IT blob_it = &block->blobs;
511   TO_ROW_IT row_it = block->get_rows ();
512 
513 #ifndef GRAPHICS_DISABLED
514   if (textord_show_parallel_rows && testing_on) {
515     if (to_win == NULL)
516       create_to_win(page_tr);
517   }
518 #endif
519                                  //get row coords
520   fit_parallel_rows(block,
521                     gradient,
522                     rotation,
523                     block_edge,
524                     textord_show_parallel_rows &&testing_on);
525   delete_non_dropout_rows(block,
526                           gradient,
527                           rotation,
528                           block_edge,
529                           textord_show_parallel_rows &&testing_on);
530   expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
531   blob_it.set_to_list (&block->blobs);
532   row_it.set_to_list (block->get_rows ());
533   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
534     blob_it.add_list_after (row_it.data ()->blob_list ());
535   //give blobs back
536   assign_blobs_to_rows (block, &gradient, 1, FALSE, FALSE, FALSE);
537   //now new rows must be genuine
538   blob_it.set_to_list (&block->blobs);
539   blob_it.add_list_after (&block->large_blobs);
540   assign_blobs_to_rows (block, &gradient, 2, TRUE, TRUE, FALSE);
541   //safe to use big ones now
542   blob_it.set_to_list (&block->blobs);
543                                  //throw all blobs in
544   blob_it.add_list_after (&block->noise_blobs);
545   blob_it.add_list_after (&block->small_blobs);
546   assign_blobs_to_rows (block, &gradient, 3, FALSE, FALSE, FALSE);
547   //no rows for noise
548   row_it.set_to_list (block->get_rows ());
549   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
550     row_it.data ()->blob_list ()->sort (blob_x_order);
551   fit_parallel_rows(block, gradient, rotation, block_edge, FALSE);
552   if (textord_heavy_nr) {
553     vigorous_noise_removal(block);
554   }
555   separate_underlines(block, gradient, rotation, testing_on);
556   pre_associate_blobs(page_tr, block, rotation, testing_on);
557 
558 #ifndef GRAPHICS_DISABLED
559   if (textord_show_final_rows && testing_on) {
560     if (to_win == NULL)
561       create_to_win(page_tr);
562   }
563 #endif
564 
565   fit_parallel_rows(block, gradient, rotation, block_edge, FALSE);
566   //              textord_show_final_rows && testing_on);
567   make_spline_rows(block,
568                    gradient,
569                    rotation,
570                    block_edge,
571                    textord_show_final_rows && testing_on,
572                    tess);
573   // We only want to call compute_block_xheight() if
574   // both textord_old_xheight and textord_old_baselines are false.
575   // No need to call compute_block_xheight() if textord_old_baselines
576   // is true, since all appropriate xheight computation functions
577   // would be called from make_old_baselines().
578   // Note: it can not be the case that textord_old_baselines is
579   // false, and textord_old_xheight is true.
580   if (!textord_old_xheight && !textord_old_baselines)
581     compute_block_xheight(block, gradient, tess);
582   if (textord_restore_underlines)  // fix underlines
583     restore_underlined_blobs(block);
584 #ifndef GRAPHICS_DISABLED
585   if (textord_show_final_rows && testing_on) {
586     plot_blob_list (to_win, &block->blobs,
587                     ScrollView::MAGENTA, ScrollView::WHITE);
588     //show discarded blobs
589     plot_blob_list (to_win, &block->underlines,
590                     ScrollView::YELLOW, ScrollView::CORAL);
591   }
592   if (textord_show_final_rows && testing_on && block->blobs.length () > 0)
593     tprintf ("%d blobs discarded as noise\n", block->blobs.length ());
594   if (textord_show_final_rows && testing_on) {
595     draw_meanlines(block, gradient, block_edge, ScrollView::WHITE, rotation);
596   }
597 #endif
598 }
599 
600 
601 /**********************************************************************
602  * delete_non_dropout_rows
603  *
604  * Compute the linespacing and offset.
605  **********************************************************************/
delete_non_dropout_rows(TO_BLOCK * block,float gradient,FCOORD rotation,inT32 block_edge,BOOL8 testing_on)606 void delete_non_dropout_rows(                   //find lines
607                              TO_BLOCK *block,   //block to do
608                              float gradient,    //global skew
609                              FCOORD rotation,   //deskew vector
610                              inT32 block_edge,  //left edge
611                              BOOL8 testing_on   //correct orientation
612                             ) {
613   TBOX block_box;                 //deskewed block
614   inT32 *deltas;                 //change in occupation
615   inT32 *occupation;             //of pixel coords
616   inT32 max_y;                   //in block
617   inT32 min_y;
618   inT32 line_index;              //of scan line
619   inT32 line_count;              //no of scan lines
620   inT32 distance;                //to drop-out
621   inT32 xleft;                   //of block
622   inT32 ybottom;                 //of block
623   TO_ROW *row;                   //current row
624   TO_ROW_IT row_it = block->get_rows ();
625   BLOBNBOX_IT blob_it = &block->blobs;
626 
627   if (row_it.length () == 0)
628     return;                      //empty block
629   block_box = deskew_block_coords (block, gradient);
630   xleft = block->block->bounding_box ().left ();
631   ybottom = block->block->bounding_box ().bottom ();
632   min_y = block_box.bottom () - 1;
633   max_y = block_box.top () + 1;
634   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
635     line_index = (inT32) floor (row_it.data ()->intercept ());
636     if (line_index <= min_y)
637       min_y = line_index - 1;
638     if (line_index >= max_y)
639       max_y = line_index + 1;
640   }
641   line_count = max_y - min_y + 1;
642   if (line_count <= 0)
643     return;                      //empty block
644   deltas = (inT32 *) alloc_mem (line_count * sizeof (inT32));
645   occupation = (inT32 *) alloc_mem (line_count * sizeof (inT32));
646   if (deltas == NULL || occupation == NULL)
647     MEMORY_OUT.error ("compute_line_spacing", ABORT, NULL);
648 
649   compute_line_occupation(block, gradient, min_y, max_y, occupation, deltas);
650   compute_occupation_threshold ((inT32)
651     ceil (block->line_spacing *
652     (textord_merge_desc +
653     textord_merge_asc)),
654     (inT32) ceil (block->line_spacing *
655     (textord_merge_x +
656     textord_merge_asc)),
657     max_y - min_y + 1, occupation, deltas);
658 #ifndef GRAPHICS_DISABLED
659   if (testing_on) {
660     draw_occupation(xleft, ybottom, min_y, max_y, occupation, deltas);
661   }
662 #endif
663   compute_dropout_distances(occupation, deltas, line_count);
664   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
665     row = row_it.data ();
666     line_index = (inT32) floor (row->intercept ());
667     distance = deltas[line_index - min_y];
668     if (find_best_dropout_row (row, distance, block->line_spacing / 2,
669     line_index, &row_it, testing_on)) {
670 #ifndef GRAPHICS_DISABLED
671       if (testing_on)
672         plot_parallel_row(row, gradient, block_edge,
673                           ScrollView::WHITE, rotation);
674 #endif
675       blob_it.add_list_after (row_it.data ()->blob_list ());
676       delete row_it.extract ();  //too far away
677     }
678   }
679   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
680     blob_it.add_list_after (row_it.data ()->blob_list ());
681   }
682 
683   free_mem(deltas);
684   free_mem(occupation);
685 }
686 
687 
688 /**********************************************************************
689  * find_best_dropout_row
690  *
691  * Delete this row if it has a neighbour with better dropout characteristics.
692  * TRUE is returned if the row should be deleted.
693  **********************************************************************/
find_best_dropout_row(TO_ROW * row,inT32 distance,float dist_limit,inT32 line_index,TO_ROW_IT * row_it,BOOL8 testing_on)694 BOOL8 find_best_dropout_row(                    //find neighbours
695                             TO_ROW *row,        //row to test
696                             inT32 distance,     //dropout dist
697                             float dist_limit,   //threshold distance
698                             inT32 line_index,   //index of row
699                             TO_ROW_IT *row_it,  //current position
700                             BOOL8 testing_on    //correct orientation
701                            ) {
702   inT32 next_index;              //of neigbouring row
703   inT32 row_offset;              //from current row
704   inT32 abs_dist;                //absolute distance
705   inT8 row_inc;                  //increment to row_index
706   TO_ROW *next_row;              //nextious row
707 
708   if (testing_on)
709     tprintf ("Row at %g(%g), dropout dist=%d,",
710       row->intercept (), row->parallel_c (), distance);
711   if (distance < 0) {
712     row_inc = 1;
713     abs_dist = -distance;
714   }
715   else {
716     row_inc = -1;
717     abs_dist = distance;
718   }
719   if (abs_dist > dist_limit) {
720     if (testing_on) {
721       tprintf (" too far - deleting\n");
722     }
723     return TRUE;
724   }
725   if ((distance < 0 && !row_it->at_last ())
726   || (distance >= 0 && !row_it->at_first ())) {
727     row_offset = row_inc;
728     do {
729       next_row = row_it->data_relative (row_offset);
730       next_index = (inT32) floor (next_row->intercept ());
731       if ((distance < 0
732         && next_index < line_index
733         && next_index > line_index + distance + distance)
734         || (distance >= 0
735         && next_index > line_index
736       && next_index < line_index + distance + distance)) {
737         if (testing_on) {
738           tprintf (" nearer neighbour (%d) at %g\n",
739             line_index + distance - next_index,
740             next_row->intercept ());
741         }
742         return TRUE;             //other is nearer
743       }
744       else if (next_index == line_index
745       || next_index == line_index + distance + distance) {
746         if (row->believability () <= next_row->believability ()) {
747           if (testing_on) {
748             tprintf (" equal but more believable at %g (%g/%g)\n",
749               next_row->intercept (),
750               row->believability (),
751               next_row->believability ());
752           }
753           return TRUE;           //other is more believable
754         }
755       }
756       row_offset += row_inc;
757     }
758     while ((next_index == line_index
759       || next_index == line_index + distance + distance)
760       && row_offset < row_it->length ());
761     if (testing_on)
762       tprintf (" keeping\n");
763   }
764   return FALSE;
765 }
766 
767 
768 /**********************************************************************
769  * deskew_block_coords
770  *
771  * Compute the bounding box of all the blobs in the block
772  * if they were deskewed without actually doing it.
773  **********************************************************************/
deskew_block_coords(TO_BLOCK * block,float gradient)774 TBOX deskew_block_coords(                  //block box
775                         TO_BLOCK *block,  //block to do
776                         float gradient    //global skew
777                        ) {
778   TBOX result;                    //block bounds
779   TBOX blob_box;                  //of block
780   FCOORD rotation;               //deskew vector
781   float length;                  //of gradient vector
782   TO_ROW_IT row_it = block->get_rows ();
783   TO_ROW *row;                   //current row
784   BLOBNBOX *blob;                //current blob
785   BLOBNBOX_IT blob_it;           //iterator
786 
787   length = sqrt (gradient * gradient + 1);
788   rotation = FCOORD (1 / length, -gradient / length);
789   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
790     row = row_it.data ();
791     blob_it.set_to_list (row->blob_list ());
792     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
793     blob_it.forward ()) {
794       blob = blob_it.data ();
795       blob_box = blob->bounding_box ();
796       blob_box.rotate (rotation);//de-skew it
797       result += blob_box;
798     }
799   }
800   return result;
801 }
802 
803 
804 /**********************************************************************
805  * compute_line_occupation
806  *
807  * Compute the pixel projection back on the y axis given the global
808  * skew. Also compute the 1st derivative.
809  **********************************************************************/
compute_line_occupation(TO_BLOCK * block,float gradient,inT32 min_y,inT32 max_y,inT32 * occupation,inT32 * deltas)810 void compute_line_occupation(                    //project blobs
811                              TO_BLOCK *block,    //block to do
812                              float gradient,     //global skew
813                              inT32 min_y,        //min coord in block
814                              inT32 max_y,        //in block
815                              inT32 *occupation,  //output projection
816                              inT32 *deltas       //derivative
817                             ) {
818   inT32 line_count;              //maxy-miny+1
819   inT32 line_index;              //of scan line
820   int index;                     //array index for daft compilers
821   float top, bottom;             //coords of blob
822   inT32 width;                   //of blob
823   TO_ROW *row;                   //current row
824   TO_ROW_IT row_it = block->get_rows ();
825   BLOBNBOX *blob;                //current blob
826   BLOBNBOX_IT blob_it;           //iterator
827   float length;                  //of skew vector
828   TBOX blob_box;                  //bounding box
829   FCOORD rotation;               //inverse of skew
830 
831   line_count = max_y - min_y + 1;
832   length = sqrt (gradient * gradient + 1);
833   rotation = FCOORD (1 / length, -gradient / length);
834   for (line_index = 0; line_index < line_count; line_index++)
835     deltas[line_index] = 0;
836   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
837     row = row_it.data ();
838     blob_it.set_to_list (row->blob_list ());
839     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
840     blob_it.forward ()) {
841       blob = blob_it.data ();
842       blob_box = blob->bounding_box ();
843       blob_box.rotate (rotation);//de-skew it
844       top = blob_box.top ();
845       bottom = blob_box.bottom ();
846       width =
847         (inT32) floor ((FLOAT32) (blob_box.right () - blob_box.left ()));
848       if ((inT32) floor (bottom) < min_y
849         || (inT32) floor (bottom) - min_y >= line_count)
850         fprintf (stderr,
851           "Bad y coord of bottom, " INT32FORMAT "(" INT32FORMAT ","
852           INT32FORMAT ")\n", (inT32) floor (bottom), min_y, max_y);
853                                  //count transitions
854       index = (inT32) floor (bottom) - min_y;
855       deltas[index] += width;
856       if ((inT32) floor (top) < min_y
857         || (inT32) floor (top) - min_y >= line_count)
858         fprintf (stderr,
859           "Bad y coord of top, " INT32FORMAT "(" INT32FORMAT ","
860           INT32FORMAT ")\n", (inT32) floor (top), min_y, max_y);
861       index = (inT32) floor (top) - min_y;
862       deltas[index] -= width;
863     }
864   }
865   occupation[0] = deltas[0];
866   for (line_index = 1; line_index < line_count; line_index++)
867     occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
868 }
869 
870 
871 /**********************************************************************
872  * compute_occupation_threshold
873  *
874  * Compute thresholds for textline or not for the occupation array.
875  **********************************************************************/
compute_occupation_threshold(inT32 low_window,inT32 high_window,inT32 line_count,inT32 * occupation,inT32 * thresholds)876 void compute_occupation_threshold(                    //project blobs
877                                   inT32 low_window,   //below result point
878                                   inT32 high_window,  //above result point
879                                   inT32 line_count,   //array sizes
880                                   inT32 *occupation,  //input projection
881                                   inT32 *thresholds   //output thresholds
882                                  ) {
883   inT32 line_index;              //of thresholds line
884   inT32 low_index;               //in occupation
885   inT32 high_index;              //in occupation
886   inT32 sum;                     //current average
887   inT32 divisor;                 //to get thresholds
888   inT32 min_index;               //of min occ
889   inT32 min_occ;                 //min in locality
890   inT32 test_index;              //for finding min
891 
892   divisor =
893     (inT32) ceil ((low_window + high_window) / textord_occupancy_threshold);
894   if (low_window + high_window < line_count) {
895     for (sum = 0, high_index = 0; high_index < low_window; high_index++)
896       sum += occupation[high_index];
897     for (low_index = 0; low_index < high_window; low_index++, high_index++)
898       sum += occupation[high_index];
899     min_occ = occupation[0];
900     min_index = 0;
901     for (test_index = 1; test_index < high_index; test_index++) {
902       if (occupation[test_index] <= min_occ) {
903         min_occ = occupation[test_index];
904         min_index = test_index;  //find min in region
905       }
906     }
907     for (line_index = 0; line_index < low_window; line_index++)
908       thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
909     //same out to end
910     for (low_index = 0; high_index < line_count; low_index++, high_index++) {
911       sum -= occupation[low_index];
912       sum += occupation[high_index];
913       if (occupation[high_index] <= min_occ) {
914                                  //find min in region
915         min_occ = occupation[high_index];
916         min_index = high_index;
917       }
918                                  //lost min from region
919       if (min_index <= low_index) {
920         min_occ = occupation[low_index + 1];
921         min_index = low_index + 1;
922         for (test_index = low_index + 2; test_index <= high_index;
923         test_index++) {
924           if (occupation[test_index] <= min_occ) {
925             min_occ = occupation[test_index];
926                                  //find min in region
927             min_index = test_index;
928           }
929         }
930       }
931       thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
932     }
933   }
934   else {
935     min_occ = occupation[0];
936     min_index = 0;
937     for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
938       if (occupation[low_index] < min_occ) {
939         min_occ = occupation[low_index];
940         min_index = low_index;
941       }
942       sum += occupation[low_index];
943     }
944     line_index = 0;
945   }
946   for (; line_index < line_count; line_index++)
947     thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
948   //same out to end
949 }
950 
951 
952 /**********************************************************************
953  * compute_dropout_distances
954  *
955  * Compute the distance from each coordinate to the nearest dropout.
956  **********************************************************************/
compute_dropout_distances(inT32 * occupation,inT32 * thresholds,inT32 line_count)957 void compute_dropout_distances(                    //project blobs
958                                inT32 *occupation,  //input projection
959                                inT32 *thresholds,  //output thresholds
960                                inT32 line_count    //array sizes
961                               ) {
962   inT32 line_index;              //of thresholds line
963   inT32 distance;                //from prev dropout
964   inT32 next_dist;               //to next dropout
965   inT32 back_index;              //for back filling
966   inT32 prev_threshold;          //before overwrite
967 
968   distance = -line_count;
969   line_index = 0;
970   do {
971     do {
972       distance--;
973       prev_threshold = thresholds[line_index];
974                                  //distance from prev
975       thresholds[line_index] = distance;
976       line_index++;
977     }
978     while (line_index < line_count
979       && (occupation[line_index] < thresholds[line_index]
980       || occupation[line_index - 1] >= prev_threshold));
981     if (line_index < line_count) {
982       back_index = line_index - 1;
983       next_dist = 1;
984       while (next_dist < -distance && back_index >= 0) {
985         thresholds[back_index] = next_dist;
986         back_index--;
987         next_dist++;
988         distance++;
989       }
990       distance = 1;
991     }
992   }
993   while (line_index < line_count);
994 }
995 
996 
997 /**********************************************************************
998  * expand_rows
999  *
1000  * Expand each row to the least of its allowed size and touching its
1001  * neighbours. If the expansion would entirely swallow a neighbouring row
1002  * then do so.
1003  **********************************************************************/
expand_rows(ICOORD page_tr,TO_BLOCK * block,float gradient,FCOORD rotation,inT32 block_edge,BOOL8 testing_on)1004 void expand_rows(                   //find lines
1005                  ICOORD page_tr,    //top right
1006                  TO_BLOCK *block,   //block to do
1007                  float gradient,    //gradient to fit
1008                  FCOORD rotation,   //for drawing
1009                  inT32 block_edge,  //edge of block
1010                  BOOL8 testing_on   //correct orientation
1011                 ) {
1012   BOOL8 swallowed_row;           //eaten a neighbour
1013   float y_max, y_min;            //new row limits
1014   float y_bottom, y_top;         //allowed limits
1015   TO_ROW *test_row;              //next row
1016   TO_ROW *row;                   //current row
1017                                  //iterators
1018   BLOBNBOX_IT blob_it = &block->blobs;
1019   TO_ROW_IT row_it = block->get_rows ();
1020 
1021 #ifndef GRAPHICS_DISABLED
1022   if (textord_show_expanded_rows && testing_on) {
1023     if (to_win == NULL)
1024       create_to_win(page_tr);
1025   }
1026 #endif
1027 
1028   adjust_row_limits(block);  //shift min,max.
1029   if (textord_new_initial_xheight) {
1030     if (block->get_rows ()->length () == 0)
1031       return;
1032     compute_row_stats(block, textord_show_expanded_rows &&testing_on);
1033   }
1034   assign_blobs_to_rows (block, &gradient, 4, TRUE, FALSE, FALSE);
1035   //get real membership
1036   if (block->get_rows ()->length () == 0)
1037     return;
1038   fit_parallel_rows(block,
1039                     gradient,
1040                     rotation,
1041                     block_edge,
1042                     textord_show_expanded_rows &&testing_on);
1043   if (!textord_new_initial_xheight)
1044     compute_row_stats(block, textord_show_expanded_rows &&testing_on);
1045   row_it.move_to_last ();
1046   do {
1047     row = row_it.data ();
1048     y_max = row->max_y ();       //get current limits
1049     y_min = row->min_y ();
1050     y_bottom = row->intercept () - block->line_size * textord_expansion_factor *
1051       textord_merge_desc;
1052     y_top = row->intercept () + block->line_size * textord_expansion_factor *
1053       (textord_merge_x + textord_merge_asc);
1054     if (y_min > y_bottom) {      //expansion allowed
1055       if (textord_show_expanded_rows && testing_on)
1056         tprintf("Expanding bottom of row at %f from %f to %f\n",
1057                 row->intercept(), y_min, y_bottom);
1058                                  //expandable
1059       swallowed_row = TRUE;
1060       while (swallowed_row && !row_it.at_last ()) {
1061         swallowed_row = FALSE;
1062                                  //get next one
1063         test_row = row_it.data_relative (1);
1064                                  //overlaps space
1065         if (test_row->max_y () > y_bottom) {
1066           if (test_row->min_y () > y_bottom) {
1067             if (textord_show_expanded_rows && testing_on)
1068               tprintf("Eating row below at %f\n", test_row->intercept());
1069             row_it.forward ();
1070 #ifndef GRAPHICS_DISABLED
1071             if (textord_show_expanded_rows && testing_on)
1072               plot_parallel_row(test_row,
1073                                 gradient,
1074                                 block_edge,
1075                                 ScrollView::WHITE,
1076                                 rotation);
1077 #endif
1078             blob_it.set_to_list (row->blob_list ());
1079             blob_it.add_list_after (test_row->blob_list ());
1080                                  //swallow complete row
1081             delete row_it.extract ();
1082             row_it.backward ();
1083             swallowed_row = TRUE;
1084           }
1085           else if (test_row->max_y () < y_min) {
1086                                  //shorter limit
1087             y_bottom = test_row->max_y ();
1088             if (textord_show_expanded_rows && testing_on)
1089               tprintf("Truncating limit to %f due to touching row at %f\n",
1090                       y_bottom, test_row->intercept());
1091           }
1092           else {
1093             y_bottom = y_min;    //can't expand it
1094             if (textord_show_expanded_rows && testing_on)
1095               tprintf("Not expanding limit beyond %f due to touching row at %f\n",
1096                       y_bottom, test_row->intercept());
1097           }
1098         }
1099       }
1100       y_min = y_bottom;          //expand it
1101     }
1102     if (y_max < y_top) {         //expansion allowed
1103       if (textord_show_expanded_rows && testing_on)
1104         tprintf("Expanding top of row at %f from %f to %f\n",
1105                 row->intercept(), y_max, y_top);
1106       swallowed_row = TRUE;
1107       while (swallowed_row && !row_it.at_first ()) {
1108         swallowed_row = FALSE;
1109                                  //get one above
1110         test_row = row_it.data_relative (-1);
1111         if (test_row->min_y () < y_top) {
1112           if (test_row->max_y () < y_top) {
1113             if (textord_show_expanded_rows && testing_on)
1114               tprintf("Eating row above at %f\n", test_row->intercept());
1115             row_it.backward ();
1116             blob_it.set_to_list (row->blob_list ());
1117 #ifndef GRAPHICS_DISABLED
1118             if (textord_show_expanded_rows && testing_on)
1119               plot_parallel_row(test_row,
1120                                 gradient,
1121                                 block_edge,
1122                                 ScrollView::WHITE,
1123                                 rotation);
1124 #endif
1125             blob_it.add_list_after (test_row->blob_list ());
1126                                  //swallow complete row
1127             delete row_it.extract ();
1128             row_it.forward ();
1129             swallowed_row = TRUE;
1130           }
1131           else if (test_row->min_y () < y_max) {
1132                                  //shorter limit
1133             y_top = test_row->min_y ();
1134             if (textord_show_expanded_rows && testing_on)
1135               tprintf("Truncating limit to %f due to touching row at %f\n",
1136                       y_top, test_row->intercept());
1137           }
1138           else {
1139             y_top = y_max;       //can't expand it
1140             if (textord_show_expanded_rows && testing_on)
1141               tprintf("Not expanding limit beyond %f due to touching row at %f\n",
1142                       y_top, test_row->intercept());
1143           }
1144         }
1145       }
1146       y_max = y_top;
1147     }
1148                                  //new limits
1149     row->set_limits (y_min, y_max);
1150     row_it.backward ();
1151   }
1152   while (!row_it.at_last ());
1153 }
1154 
1155 
1156 /**********************************************************************
1157  * adjust_row_limits
1158  *
1159  * Change the limits of rows to suit the default fractions.
1160  **********************************************************************/
adjust_row_limits(TO_BLOCK * block)1161 void adjust_row_limits(                 //tidy limits
1162                        TO_BLOCK *block  //block to do
1163                       ) {
1164   TO_ROW *row;                   //current row
1165   float size;                    //size of row
1166   float ymax;                    //top of row
1167   float ymin;                    //bottom of row
1168   TO_ROW_IT row_it = block->get_rows ();
1169 
1170   if (textord_show_expanded_rows)
1171     tprintf("Adjusting row limits for block(%d,%d)\n",
1172             block->block->bounding_box().left(),
1173             block->block->bounding_box().top());
1174   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1175     row = row_it.data ();
1176     size = row->max_y () - row->min_y ();
1177     if (textord_show_expanded_rows)
1178       tprintf("Row at %f has min %f, max %f, size %f\n",
1179               row->intercept(), row->min_y(), row->max_y(), size);
1180     size /= textord_merge_x + textord_merge_asc + textord_merge_desc;
1181     ymax = size * (textord_merge_x + textord_merge_asc);
1182     ymin = -size * textord_merge_desc;
1183     row->set_limits (row->intercept () + ymin, row->intercept () + ymax);
1184     row->merged = FALSE;
1185   }
1186 }
1187 
1188 
1189 /**********************************************************************
1190  * compute_row_stats
1191  *
1192  * Compute the linespacing and offset.
1193  **********************************************************************/
compute_row_stats(TO_BLOCK * block,BOOL8 testing_on)1194 void compute_row_stats(                  //find lines
1195                        TO_BLOCK *block,  //block to do
1196                        BOOL8 testing_on  //correct orientation
1197                       ) {
1198   inT32 row_index;               //of median
1199   TO_ROW *row;                   //current row
1200   TO_ROW *prev_row;              //previous row
1201   float iqr;                     //inter quartile range
1202   TO_ROW_IT row_it = block->get_rows ();
1203                                  //number of rows
1204   inT16 rowcount = row_it.length ();
1205   TO_ROW **rows;                 //for choose nth
1206 
1207   rows = (TO_ROW **) alloc_mem (rowcount * sizeof (TO_ROW *));
1208   if (rows == NULL)
1209     MEMORY_OUT.error ("compute_row_stats", ABORT, NULL);
1210   rowcount = 0;
1211   prev_row = NULL;
1212   row_it.move_to_last ();        //start at bottom
1213   do {
1214     row = row_it.data ();
1215     if (prev_row != NULL) {
1216       rows[rowcount++] = prev_row;
1217       prev_row->spacing = row->intercept () - prev_row->intercept ();
1218       if (testing_on)
1219         tprintf ("Row at %g yields spacing of %g\n",
1220           row->intercept (), prev_row->spacing);
1221     }
1222     prev_row = row;
1223     row_it.backward ();
1224   }
1225   while (!row_it.at_last ());
1226   block->key_row = prev_row;
1227   block->baseline_offset =
1228     fmod (prev_row->parallel_c (), block->line_spacing);
1229   if (testing_on)
1230     tprintf ("Blob based spacing=(%g,%g), offset=%g",
1231       block->line_size, block->line_spacing, block->baseline_offset);
1232   if (rowcount > 0) {
1233     row_index = choose_nth_item (rowcount * 3 / 4, rows, rowcount,
1234       sizeof (TO_ROW *), row_spacing_order);
1235     iqr = rows[row_index]->spacing;
1236     row_index = choose_nth_item (rowcount / 4, rows, rowcount,
1237       sizeof (TO_ROW *), row_spacing_order);
1238     iqr -= rows[row_index]->spacing;
1239     row_index = choose_nth_item (rowcount / 2, rows, rowcount,
1240       sizeof (TO_ROW *), row_spacing_order);
1241     block->key_row = rows[row_index];
1242     if (testing_on)
1243       tprintf (" row based=%g(%g)", rows[row_index]->spacing, iqr);
1244     if (rowcount > 2
1245     && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
1246       if (!textord_new_initial_xheight) {
1247         if (rows[row_index]->spacing < block->line_spacing
1248           && rows[row_index]->spacing > block->line_size)
1249           //within range
1250           block->line_size = rows[row_index]->spacing;
1251         //spacing=size
1252         else if (rows[row_index]->spacing > block->line_spacing)
1253           block->line_size = block->line_spacing;
1254         //too big so use max
1255       }
1256       else {
1257         if (rows[row_index]->spacing < block->line_spacing)
1258           block->line_size = rows[row_index]->spacing;
1259         else
1260           block->line_size = block->line_spacing;
1261         //too big so use max
1262       }
1263       if (block->line_size < textord_min_xheight)
1264         block->line_size = (float) textord_min_xheight;
1265       block->line_spacing = rows[row_index]->spacing;
1266       block->max_blob_size =
1267         block->line_spacing * textord_excess_blobsize;
1268     }
1269     block->baseline_offset = fmod (rows[row_index]->intercept (),
1270       block->line_spacing);
1271   }
1272   if (testing_on)
1273     tprintf ("\nEstimate line size=%g, spacing=%g, offset=%g\n",
1274       block->line_size, block->line_spacing, block->baseline_offset);
1275   free_mem(rows);
1276 }
1277 
1278 
1279 /**********************************************************************
1280  * compute_block_xheight
1281  *
1282  * Compute the xheight of the individual rows, then correlate them
1283  * and interpret ascenderless lines, correcting xheights.
1284  *
1285  * First we compute our best guess of the x-height of each row independently
1286  * with compute_row_xheight(), which looks for a pair of commonly occurring
1287  * heights that could be x-height and ascender height. This function also
1288  * attempts to find descenders of lowercase letters (i.e. not the small
1289  * descenders that could appear in upper case letters as Q,J).
1290  *
1291  * After this computation each row falls into one of the following categories:
1292  * ROW_ASCENDERS_FOUND: we found xheight and ascender modes, so this must be
1293  *                      a regular row; we'll use its xheight to compute
1294  *                      xheight and ascrise estimates for the block
1295  * ROW_DESCENDERS_FOUND: no ascenders, so we do not have a high confidence in
1296  *                       the xheight of this row (don't use it for estimating
1297  *                       block xheight), but this row can't contain all caps
1298  * ROW_UNKNOWN: a row with no ascenders/descenders, could be all lowercase
1299  *              (or mostly lowercase for fonts with very few ascenders),
1300  *              all upper case or small caps
1301  * ROW_INVALID: no meaningful xheight could be found for this row
1302  *
1303  * We then run correct_row_xheight() and use the computed xheight and ascrise
1304  * averages to correct xheight values of the rows in ROW_DESCENDERS_FOUND,
1305  * ROW_UNKNOWN and ROW_INVALID categories.
1306  *
1307  * **********************************************************************/
compute_block_xheight(TO_BLOCK * block,float gradient,tesseract::Tesseract * tess)1308 void compute_block_xheight(TO_BLOCK *block, float gradient,
1309                            tesseract::Tesseract *tess) {
1310   TO_ROW *row;                   //current row
1311   float asc_frac_xheight = textord_merge_asc / textord_merge_x;
1312   float desc_frac_xheight = textord_merge_desc / textord_merge_x;
1313   inT32 min_height, max_height;         // limits on xheight
1314   TO_ROW_IT row_it = block->get_rows ();
1315   if (row_it.empty()) return;  // no rows
1316 
1317   // Compute the best guess of xheight of each row individually.
1318   // Use xheight and ascrise values of the rows where ascenders were found.
1319   get_min_max_xheight(block->line_size, &min_height, &max_height);
1320   STATS row_asc_xheights(min_height, max_height + 1);
1321   STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight),
1322                         static_cast<int>(max_height * asc_frac_xheight) + 1);
1323   int min_desc_height = static_cast<int>(min_height * desc_frac_xheight);
1324   int max_desc_height = static_cast<int>(max_height * desc_frac_xheight);
1325   STATS row_asc_descdrop(min_desc_height, max_desc_height + 1);
1326   STATS row_desc_xheights(min_height, max_height + 1);
1327   STATS row_desc_descdrop(min_desc_height, max_desc_height + 1);
1328   STATS row_cap_xheights(min_height, max_height + 1);
1329   STATS row_cap_floating_xheights(min_height, max_height + 1);
1330   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1331     row = row_it.data ();
1332     // Compute the xheight of this row if it has not been computed before.
1333     if (row->xheight <= 0.0) {
1334       compute_row_xheight(row, gradient, block->line_size, tess);
1335     }
1336     ROW_CATEGORY row_category = get_row_category(row);
1337     if (row_category == ROW_ASCENDERS_FOUND) {
1338       row_asc_xheights.add(static_cast<inT32>(row->xheight),
1339                            row->xheight_evidence);
1340       row_asc_ascrise.add(static_cast<inT32>(row->ascrise),
1341                           row->xheight_evidence);
1342       row_asc_descdrop.add(static_cast<inT32>(-row->descdrop),
1343                            row->xheight_evidence);
1344     } else if (row_category == ROW_DESCENDERS_FOUND) {
1345       row_desc_xheights.add(static_cast<inT32>(row->xheight),
1346                             row->xheight_evidence);
1347       row_desc_descdrop.add(static_cast<inT32>(-row->descdrop),
1348                             row->xheight_evidence);
1349     } else if (row_category == ROW_UNKNOWN) {
1350       fill_heights(row, gradient, min_height, max_height,
1351                    &row_cap_xheights, &row_cap_floating_xheights);
1352     }
1353   }
1354 
1355   float xheight = 0.0;
1356   float ascrise = 0.0;
1357   float descdrop = 0.0;
1358   // Compute our best guess of xheight of this block.
1359   if (row_asc_xheights.get_total() > 0) {
1360     // Determine xheight from rows where ascenders were found.
1361     xheight = row_asc_xheights.median();
1362     ascrise = row_asc_ascrise.median();
1363     descdrop = -row_asc_descdrop.median();
1364   } else if (row_desc_xheights.get_total() > 0) {
1365     // Determine xheight from rows where descenders were found.
1366     xheight = row_desc_xheights.median();
1367     descdrop = -row_desc_descdrop.median();
1368   } else if (row_cap_xheights.get_total() > 0) {
1369     // All the rows in the block were (a/de)scenderless.
1370     // Try to search for two modes in row_cap_heights that could
1371     // be the xheight and the capheight (e.g. some of the rows
1372     // were lowercase, but did not have enough (a/de)scenders.
1373     // If such two modes can not be found, this block is most
1374     // likely all caps (or all small caps, in which case the code
1375     // still works as intended).
1376     compute_xheight_from_modes(&row_cap_xheights, &row_cap_floating_xheights,
1377                                min_height, max_height, &(xheight), &(ascrise));
1378     if (ascrise == 0) {  // assume only caps in the whole block
1379       xheight = row_cap_xheights.median() * textord_merge_x /
1380         (textord_merge_x + textord_merge_asc);
1381   }
1382   } else {  // default block sizes
1383     xheight = block->line_size * textord_merge_x;
1384   }
1385   // Correct xheight, ascrise and descdrop if necessary.
1386   bool corrected_xheight = false;
1387   if (xheight < textord_min_xheight) {
1388     xheight = static_cast<float>(textord_min_xheight);
1389     corrected_xheight = true;
1390   }
1391   if (corrected_xheight || ascrise <= 0.0) {
1392     ascrise = xheight * asc_frac_xheight;
1393   }
1394   if (corrected_xheight || descdrop >= 0.0) {
1395     descdrop = -(xheight * desc_frac_xheight);
1396 }
1397   block->xheight = xheight;
1398 
1399   if (textord_debug_xheights) {
1400     tprintf("Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n",
1401             xheight, ascrise, descdrop);
1402     }
1403   // Correct xheight, ascrise, descdrop of rows based on block averages.
1404   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1405     correct_row_xheight(row_it.data(), xheight, ascrise, descdrop);
1406   }
1407 }
1408 
1409 /**********************************************************************
1410  * compute_row_xheight
1411  *
1412  * Estimate the xheight of this row.
1413  * Compute the ascender rise and descender drop at the same time.
1414  * Set xheigh_evidence to the number of blobs with the chosen xheight
1415  * that appear in this row.
1416  **********************************************************************/
compute_row_xheight(TO_ROW * row,float gradient,int block_line_size,tesseract::Tesseract * tess)1417 void compute_row_xheight(TO_ROW *row,          // row to do
1418                          float gradient,       // global skew
1419                          int block_line_size,
1420                          tesseract::Tesseract *tess) {
1421   // Find blobs representing repeated characters in rows and mark them.
1422   // This information is used for computing row xheight and at a later
1423   // stage when words are formed by make_words.
1424   if (!row->rep_chars_marked()) {
1425     mark_repeated_chars(row, block_line_size * textord_merge_x, tess);
1426   }
1427 
1428   int min_height, max_height;
1429   get_min_max_xheight(block_line_size, &min_height, &max_height);
1430   STATS heights(min_height, max_height + 1);
1431   STATS floating_heights(min_height, max_height + 1);
1432   fill_heights(row, gradient, min_height, max_height,
1433                &heights, &floating_heights);
1434   row->ascrise = 0.0f;
1435   row->xheight = 0.0f;
1436   row->xheight_evidence =
1437     compute_xheight_from_modes(&heights, &floating_heights, min_height,
1438                                max_height, &(row->xheight), &(row->ascrise));
1439   row->descdrop = 0.0f;
1440   if (row->xheight > 0.0) {
1441     row->descdrop = static_cast<float>(
1442         compute_row_descdrop(row, gradient, row->xheight_evidence, &heights));
1443   } else {
1444     // Since we could not find a meaningful xheight, the results
1445     // of mark_repeated_chars() should be recomputed at a later stage.
1446     row->clear_rep_chars_marked();
1447   }
1448 }
1449 
1450 /**********************************************************************
1451  * fill_heights
1452  *
1453  * Fill the given heights with heights of the blobs that are legal
1454  * candidates for estimating xheight.
1455  **********************************************************************/
fill_heights(TO_ROW * row,float gradient,int min_height,int max_height,STATS * heights,STATS * floating_heights)1456 void fill_heights(TO_ROW *row, float gradient, int min_height,
1457                   int max_height, STATS *heights, STATS *floating_heights) {
1458   float xcentre;                 //centre of blob
1459   float top;                     // top y coord of blob
1460   float height;                  //height of blob
1461   BLOBNBOX *blob;                //current blob
1462   int repeated_set;
1463   BLOBNBOX_IT blob_it = row->blob_list();
1464   if (blob_it.empty()) return;  // no blobs in this row
1465   bool has_rep_chars =
1466     row->rep_chars_marked() && row->num_repeated_sets() > 0;
1467   do {
1468     blob = blob_it.data ();
1469     if (!blob->joined_to_prev ()) {
1470       xcentre = (blob->bounding_box().left() +
1471         blob->bounding_box ().right ()) / 2.0f;
1472       top = blob->bounding_box().top();
1473       height = blob->bounding_box().height();
1474       if (textord_fix_xheight_bug)
1475         top -= row->baseline.y(xcentre);
1476       else
1477         top -= gradient * xcentre + row->parallel_c();
1478       if (top >= min_height && top <= max_height) {
1479         heights->add(static_cast<inT32>(floor(top + 0.5)), 1);
1480         if (height / top < textord_min_blob_height_fraction) {
1481           floating_heights->add(static_cast<inT32>(floor(top + 0.5)), 1);
1482     }
1483   }
1484     }
1485     // Skip repeated chars, since they are likely to skew the height stats.
1486     if (has_rep_chars && blob->repeated_set() != 0) {
1487       repeated_set = blob->repeated_set();
1488       blob_it.forward();
1489       while (!blob_it.at_first() &&
1490              blob_it.data()->repeated_set() == repeated_set) {
1491         blob_it.forward();
1492   if (textord_debug_xheights)
1493           tprintf("Skipping repeated char when computing xheight\n");
1494       }
1495     } else {
1496       blob_it.forward();
1497     }
1498   } while (!blob_it.at_first());
1499 }
1500 
1501 /**********************************************************************
1502  * compute_xheight_from_modes
1503  *
1504  * Given a STATS object heights, looks for two most frequently occurring
1505  * heights that look like xheight and xheight + ascrise. If found, sets
1506  * the values of *xheight and *ascrise accordingly, otherwise sets xheight
1507  * to any most frequently occurring height and sets *ascrise to 0.
1508  * Returns the number of times xheight occurred in heights.
1509  * For each mode that is considered for being an xheight the count of
1510  * floating blobs (stored in floating_heights) is subtracted from the
1511  * total count of the blobs of this height. This is done because blobs
1512  * that sit far above the baseline could represent valid ascenders, but
1513  * it is highly unlikely that such a character's height will be an xheight
1514  * (e.g.  -, ', =, ^, `, ", ', etc)
1515  **********************************************************************/
compute_xheight_from_modes(STATS * heights,STATS * floating_heights,int min_height,int max_height,float * xheight,float * ascrise)1516 int compute_xheight_from_modes(
1517     STATS *heights, STATS *floating_heights, int min_height,
1518     int max_height, float *xheight, float *ascrise) {
1519   int blob_index = heights->mode();  // find mode
1520   int blob_count = heights->pile_count(blob_index);  // get count of mode
1521   if (textord_debug_xheights) {
1522     tprintf ("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n",
1523       min_height, max_height, blob_index, blob_count, heights->get_total());
1524     heights->print(NULL, true);
1525     floating_heights->print(NULL, true);
1526   }
1527   if (blob_count == 0) return 0;
1528   int modes[MAX_HEIGHT_MODES]; // biggest piles
1529   bool in_best_pile = FALSE;
1530   int prev_size = -MAX_INT32;
1531   int best_count = 0;
1532   int mode_count = compute_height_modes(heights, min_height, max_height,
1533                                         modes, MAX_HEIGHT_MODES);
1534   int x;
1535   if (textord_debug_xheights) {
1536     tprintf("found %d modes: ", mode_count);
1537     for (x = 0; x < mode_count; x++) tprintf("%d ", modes[x]);
1538     tprintf("\n");
1539   }
1540 
1541     for (x = 0; x < mode_count - 1; x++) {
1542       if (modes[x] != prev_size + 1)
1543         in_best_pile = FALSE;    //had empty height
1544     int modes_x_count = heights->pile_count(modes[x]) -
1545       floating_heights->pile_count(modes[x]);
1546     if ((modes_x_count >= blob_count * textord_xheight_mode_fraction) &&
1547         (in_best_pile || modes_x_count > best_count)) {
1548       for (int asc = x + 1; asc < mode_count; asc++) {
1549         float ratio =
1550           static_cast<float>(modes[asc]) / static_cast<float>(modes[x]);
1551         if (textord_ascx_ratio_min < ratio &&
1552             ratio < textord_ascx_ratio_max &&
1553             (heights->pile_count(modes[asc]) >=
1554              blob_count * textord_ascheight_mode_fraction)) {
1555           if (modes_x_count > best_count) {
1556             in_best_pile = true;
1557             best_count = modes_x_count;
1558           }
1559           if (textord_debug_xheights) {
1560             tprintf("X=%d, asc=%d, count=%d, ratio=%g\n",
1561                     modes[x], modes[asc]-modes[x], modes_x_count, ratio);
1562           }
1563             prev_size = modes[x];
1564           *xheight = static_cast<float>(modes[x]);
1565           *ascrise = static_cast<float>(modes[asc] - modes[x]);
1566           }
1567         }
1568       }
1569     }
1570   if (*xheight == 0) {  // single mode
1571     // Remove counts of the "floating" blobs (the one whose height is too
1572     // small in relation to it's top end of the bounding box) from heights
1573     // before computing the single-mode xheight.
1574     // Restore the counts in heights after the mode is found, since
1575     // floating blobs might be useful for determining potential ascenders
1576     // in compute_row_descdrop().
1577     if (floating_heights->get_total() > 0) {
1578       for (x = min_height; x < max_height; ++x) {
1579         heights->add(x, -(floating_heights->pile_count(x)));
1580     }
1581       blob_index = heights->mode();  // find the modified mode
1582       for (x = min_height; x < max_height; ++x) {
1583         heights->add(x, floating_heights->pile_count(x));
1584       }
1585     }
1586     *xheight = static_cast<float>(blob_index);
1587     *ascrise = 0.0f;
1588     best_count = heights->pile_count(blob_index);
1589     if (textord_debug_xheights)
1590       tprintf("Single mode xheight set to %g\n", *xheight);
1591   } else if (textord_debug_xheights) {
1592     tprintf("Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise);
1593   }
1594   return best_count;
1595 }
1596 
1597 /**********************************************************************
1598  * compute_row_descdrop
1599  *
1600  * Estimates the descdrop of this row. This function looks for
1601  * "significant" descenders of lowercase letters (those that could
1602  * not just be the small descenders of upper case letters like Q,J).
1603  * The function also takes into account how many potential ascenders
1604  * this row might contain. If the number of potential ascenders along
1605  * with descenders is close to the expected fraction of the total
1606  * number of blobs in the row, the function returns the descender
1607  * height, returns 0 otherwise.
1608  **********************************************************************/
compute_row_descdrop(TO_ROW * row,float gradient,int xheight_blob_count,STATS * asc_heights)1609 inT32 compute_row_descdrop(TO_ROW *row, float gradient,
1610                            int xheight_blob_count, STATS *asc_heights) {
1611   // Count how many potential ascenders are in this row.
1612   int i_min = asc_heights->min_bucket();
1613   if ((i_min / row->xheight) < textord_ascx_ratio_min) {
1614     i_min = static_cast<int>(
1615         floor(row->xheight * textord_ascx_ratio_min + 0.5));
1616   }
1617   int i_max = asc_heights->max_bucket();
1618   if ((i_max / row->xheight) > textord_ascx_ratio_max) {
1619     i_max = static_cast<int>(floor(row->xheight * textord_ascx_ratio_max));
1620   }
1621   int num_potential_asc = 0;
1622   for (int i = i_min; i <= i_max; ++i) {
1623     num_potential_asc += asc_heights->pile_count(i);
1624   }
1625   inT32 min_height =
1626     static_cast<inT32>(floor(row->xheight * textord_descx_ratio_min + 0.5));
1627   inT32 max_height =
1628     static_cast<inT32>(floor(row->xheight * textord_descx_ratio_max));
1629   float xcentre;                 //centre of blob
1630   float height;                  //height of blob
1631   BLOBNBOX_IT blob_it = row->blob_list ();
1632   BLOBNBOX *blob;                //current blob
1633   STATS heights (min_height, max_height + 1);
1634   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1635     blob = blob_it.data ();
1636     if (!blob->joined_to_prev ()) {
1637       xcentre = (blob->bounding_box().left() +
1638         blob->bounding_box ().right ()) / 2.0f;
1639       height = (gradient * xcentre + row->parallel_c() -
1640                 blob->bounding_box().bottom());
1641       if (height >= min_height && height <= max_height)
1642         heights.add(static_cast<int>(floor(height + 0.5)), 1);
1643     }
1644   }
1645   int blob_index = heights.mode();  // find mode
1646   int blob_count = heights.pile_count(blob_index);  // get count of mode
1647   float total_fraction =
1648     (textord_descheight_mode_fraction + textord_ascheight_mode_fraction);
1649   if (static_cast<float>(blob_count + num_potential_asc) <
1650       xheight_blob_count * total_fraction) {
1651     blob_count = 0;
1652     }
1653   int descdrop = blob_count > 0 ? -blob_index : 0;
1654   if (textord_debug_xheights) {
1655     tprintf("Descdrop: %d (potential ascenders %d, descenders %d)\n",
1656             descdrop, num_potential_asc, blob_count);
1657     heights.print(NULL, true);
1658   }
1659   return descdrop;
1660 }
1661 
1662 
1663 /**********************************************************************
1664  * compute_height_modes
1665  *
1666  * Find the top maxmodes values in the input array and put their
1667  * indices in the output in the order in which they occurred.
1668  **********************************************************************/
compute_height_modes(STATS * heights,inT32 min_height,inT32 max_height,inT32 * modes,inT32 maxmodes)1669 inT32 compute_height_modes(                   //find lines
1670                            STATS *heights,    //stats to search
1671                            inT32 min_height,  //bottom of range
1672                            inT32 max_height,  //top of range
1673                            inT32 *modes,      //output array
1674                            inT32 maxmodes     //size of modes
1675                           ) {
1676   inT32 pile_count;              //no in source pile
1677   inT32 src_count;               //no of source entries
1678   inT32 src_index;               //current entry
1679   inT32 least_count;             //height of smalllest
1680   inT32 least_index;             //index of least
1681   inT32 dest_count;              //index in modes
1682 
1683   src_count = max_height + 1 - min_height;
1684   dest_count = 0;
1685   least_count = MAX_INT32;
1686   least_index = -1;
1687   for (src_index = 0; src_index < src_count; src_index++) {
1688     pile_count = heights->pile_count (min_height + src_index);
1689     if (pile_count > 0) {
1690       if (dest_count < maxmodes) {
1691         if (pile_count < least_count) {
1692                                  //find smallest in array
1693           least_count = pile_count;
1694           least_index = dest_count;
1695         }
1696         modes[dest_count++] = min_height + src_index;
1697       }
1698       else if (pile_count >= least_count) {
1699         while (least_index < maxmodes - 1) {
1700           modes[least_index] = modes[least_index + 1];
1701           //shuffle up
1702           least_index++;
1703         }
1704                                  //new one on end
1705         modes[maxmodes - 1] = min_height + src_index;
1706         if (pile_count == least_count) {
1707                                  //new smallest
1708           least_index = maxmodes - 1;
1709         }
1710         else {
1711           least_count = heights->pile_count (modes[0]);
1712           least_index = 0;
1713           for (dest_count = 1; dest_count < maxmodes; dest_count++) {
1714             pile_count = heights->pile_count (modes[dest_count]);
1715             if (pile_count < least_count) {
1716                                  //find smallest
1717               least_count = pile_count;
1718               least_index = dest_count;
1719             }
1720           }
1721         }
1722       }
1723     }
1724   }
1725   return dest_count;
1726 }
1727 
1728 
1729 /**********************************************************************
1730  * correct_row_xheight
1731  *
1732  * Adjust the xheight etc of this row if not within reasonable limits
1733  * of the average for the block.
1734  **********************************************************************/
correct_row_xheight(TO_ROW * row,float xheight,float ascrise,float descdrop)1735 void correct_row_xheight(TO_ROW *row, float xheight,
1736                          float ascrise, float descdrop) {
1737   ROW_CATEGORY row_category = get_row_category(row);
1738   if (textord_debug_xheights) {
1739     tprintf("correcting row xheight: row->xheight %.4f"
1740             ", row->acrise %.4f row->descdrop %.4f\n",
1741             row->xheight, row->ascrise, row->descdrop);
1742   }
1743   bool normal_xheight =
1744     within_error_margin(row->xheight, xheight, textord_xheight_error_margin);
1745   bool cap_xheight =
1746     within_error_margin(row->xheight, xheight + ascrise,
1747                         textord_xheight_error_margin);
1748   // Use the average xheight/ascrise for the following cases:
1749   // -- the xheight of the row could not be determined at all
1750   // -- the row has descenders (e.g. "many groups", "ISBN 12345 p.3")
1751   //    and its xheight is close to either cap height or average xheight
1752   // -- the row does not have ascenders or descenders, but its xheight
1753   //    is close to the average block xheight (e.g. row with "www.mmm.com")
1754   if (row_category == ROW_ASCENDERS_FOUND) {
1755     if (row->descdrop >= 0.0)  {
1756       row->descdrop = row->xheight * (descdrop / xheight);
1757     }
1758   } else  if (row_category == ROW_INVALID ||
1759               (row_category == ROW_DESCENDERS_FOUND &&
1760                (normal_xheight || cap_xheight)) ||
1761               (row_category == ROW_UNKNOWN && normal_xheight)) {
1762     if (textord_debug_xheights) tprintf("using average xheight\n");
1763       row->xheight = xheight;
1764         row->ascrise = ascrise;
1765     row->descdrop = descdrop;
1766       }
1767   // Assume this is a row with mostly lowercase letters and it's xheight
1768   // is computed correctly (unfortunately there is no way to distinguish
1769   // this from the case when descenders are found, but the most common
1770   // height is capheight).
1771   else if (row_category == ROW_DESCENDERS_FOUND) {
1772     if (textord_debug_xheights) tprintf("lowercase, corrected ascrise\n");
1773     row->ascrise = row->xheight * (ascrise / xheight);
1774   }
1775   // Otherwise assume this row is an all-caps or small-caps row
1776   // and adjust xheight and ascrise of the row.
1777   else if (row_category == ROW_UNKNOWN) {
1778     row->all_caps = true;
1779     if (cap_xheight) { // regular all caps
1780       if (textord_debug_xheights) tprintf("all caps\n");
1781         row->xheight = xheight;
1782       row->ascrise = ascrise;
1783       row->descdrop = descdrop;
1784     } else {  // small caps or caps with an odd xheight
1785       if (textord_debug_xheights) {
1786         if (row->xheight < xheight + ascrise && row->xheight > xheight) {
1787           tprintf("small caps\n");
1788         } else {
1789           tprintf("all caps with irregular xheight\n");
1790       }
1791       }
1792       row->ascrise = row->xheight * (ascrise / (xheight + ascrise));
1793       row->xheight -= row->ascrise;
1794       row->descdrop = row->xheight * (descdrop / xheight);
1795     }
1796   }
1797   if (textord_debug_xheights) {
1798     tprintf("corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop"
1799             " = %.4f\n", row->xheight, row->ascrise, row->descdrop);
1800   }
1801 }
1802 
CountOverlaps(const TBOX & box,int min_height,BLOBNBOX_LIST * blobs)1803 static int CountOverlaps(const TBOX& box, int min_height,
1804                          BLOBNBOX_LIST* blobs) {
1805   int overlaps = 0;
1806   BLOBNBOX_IT blob_it(blobs);
1807   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1808     BLOBNBOX* blob = blob_it.data();
1809     TBOX blob_box = blob->bounding_box();
1810     if (blob_box.height() >= min_height && box.major_overlap(blob_box)) {
1811       ++overlaps;
1812     }
1813   }
1814   return overlaps;
1815 }
1816 
1817 /**********************************************************************
1818  * separate_underlines
1819  *
1820  * Test wide objects for being potential underlines. If they are then
1821  * put them in a separate list in the block.
1822  **********************************************************************/
separate_underlines(TO_BLOCK * block,float gradient,FCOORD rotation,BOOL8 testing_on)1823 void separate_underlines(                  //make rough chars
1824                          TO_BLOCK *block,  //block to do
1825                          float gradient,   //skew angle
1826                          FCOORD rotation,  //inverse landscape
1827                          BOOL8 testing_on  //correct orientation
1828                         ) {
1829   BLOBNBOX *blob;                //current blob
1830   PBLOB *poly_blob;              //rotated blob
1831   C_BLOB *rotated_blob;          //rotated blob
1832   TO_ROW *row;                   //current row
1833   float length;                  //of g_vec
1834   TBOX blob_box;
1835   FCOORD blob_rotation;          //inverse of rotation
1836   FCOORD g_vec;                  //skew rotation
1837   BLOBNBOX_IT blob_it;           //iterator
1838                                  //iterator
1839   BLOBNBOX_IT under_it = &block->underlines;
1840   BLOBNBOX_IT large_it = &block->large_blobs;
1841   TO_ROW_IT row_it = block->get_rows ();
1842   int min_blob_height = static_cast<int>(textord_min_blob_height_fraction *
1843                                          block->line_size + 0.5);
1844 
1845                                  //length of vector
1846   length = sqrt (1 + gradient * gradient);
1847   g_vec = FCOORD (1 / length, -gradient / length);
1848   blob_rotation = FCOORD (rotation.x (), -rotation.y ());
1849   blob_rotation.rotate (g_vec);  //unoding everything
1850   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1851     row = row_it.data ();
1852                                  //get blobs
1853     blob_it.set_to_list (row->blob_list ());
1854     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1855     blob_it.forward ()) {
1856       blob = blob_it.data ();
1857       blob_box = blob->bounding_box ();
1858       if (blob_box.width () > block->line_size * textord_underline_width) {
1859         if (textord_cblob_blockocc && blob->cblob () != NULL) {
1860           rotated_blob = crotate_cblob (blob->cblob (),
1861             blob_rotation);
1862           if (test_underline (testing_on && textord_show_final_rows,
1863                              rotated_blob, static_cast<inT16>(row->intercept()),
1864                              static_cast<inT16>(block->line_size *
1865             (textord_merge_x +
1866           textord_merge_asc / 2.0f)))) {
1867             under_it.add_after_then_move (blob_it.extract ());
1868             if (testing_on && textord_show_final_rows) {
1869               tprintf("Underlined blob at:");
1870                 rotated_blob->bounding_box().print();
1871               tprintf("Was:");
1872                 blob_box.print();
1873             }
1874           } else if (CountOverlaps(blob->bounding_box(), min_blob_height,
1875                                    row->blob_list()) >
1876                      textord_max_blob_overlaps) {
1877             large_it.add_after_then_move(blob_it.extract());
1878             if (testing_on && textord_show_final_rows) {
1879               tprintf("Large blob overlaps %d blobs at:",
1880                       CountOverlaps(blob_box, min_blob_height,
1881                                     row->blob_list()));
1882               blob_box.print();
1883             }
1884           }
1885           delete rotated_blob;
1886         }
1887         else {
1888           if (blob->blob () != NULL) {
1889             //                                      if (testing_on && textord_show_final_rows)
1890             //                                              tprintf("Rotating by (%g,%g)\n",
1891             //                                                      blob_rotation.x(),blob_rotation.y());
1892             poly_blob = rotate_blob (blob->blob (), blob_rotation);
1893           }
1894           else
1895             poly_blob = rotate_cblob (blob->cblob (),
1896               block->line_size,
1897               blob_rotation);
1898           if (test_underline
1899             (testing_on
1900             && textord_show_final_rows, poly_blob,
1901             row->intercept (),
1902             block->line_size * (textord_merge_x +
1903           textord_merge_asc / 2))) {
1904             if (testing_on && textord_show_final_rows) {
1905               tprintf ("Underlined blob at (%d,%d)->(%d,%d) ",
1906                 poly_blob->bounding_box ().left (),
1907                 poly_blob->bounding_box ().bottom (),
1908                 poly_blob->bounding_box ().right (),
1909                 poly_blob->bounding_box ().top ());
1910               tprintf ("(Was (%d,%d)->(%d,%d))\n",
1911                 blob_box.left (), blob_box.bottom (),
1912                 blob_box.right (), blob_box.top ());
1913             }
1914             under_it.add_after_then_move (blob_it.extract ());
1915           }
1916           delete poly_blob;
1917         }
1918       }
1919     }
1920   }
1921 }
1922 
1923 
1924 /**********************************************************************
1925  * pre_associate_blobs
1926  *
1927  * Associate overlapping blobs and fake chop wide blobs.
1928  **********************************************************************/
pre_associate_blobs(ICOORD page_tr,TO_BLOCK * block,FCOORD rotation,BOOL8 testing_on)1929 void pre_associate_blobs(                  //make rough chars
1930                          ICOORD page_tr,   //top right
1931                          TO_BLOCK *block,  //block to do
1932                          FCOORD rotation,  //inverse landscape
1933                          BOOL8 testing_on  //correct orientation
1934                         ) {
1935 #ifndef GRAPHICS_DISABLED
1936   ScrollView::Color colour;                 //of boxes
1937 #endif
1938   BLOBNBOX *blob;                //current blob
1939   BLOBNBOX *nextblob;            //next in list
1940   TBOX blob_box;
1941   FCOORD blob_rotation;          //inverse of rotation
1942   BLOBNBOX_IT blob_it;           //iterator
1943   BLOBNBOX_IT start_it;          //iterator
1944   TO_ROW_IT row_it = block->get_rows ();
1945 
1946 #ifndef GRAPHICS_DISABLED
1947   colour = ScrollView::RED;
1948 #endif
1949 
1950   blob_rotation = FCOORD (rotation.x (), -rotation.y ());
1951   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1952                                  //get blobs
1953     blob_it.set_to_list (row_it.data ()->blob_list ());
1954     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1955     blob_it.forward ()) {
1956       blob = blob_it.data ();
1957       blob_box = blob->bounding_box ();
1958       start_it = blob_it;        //save start point
1959       //                      if (testing_on && textord_show_final_blobs)
1960       //                      {
1961       //                              tprintf("Blob at (%d,%d)->(%d,%d), addr=%x, count=%d\n",
1962       //                                      blob_box.left(),blob_box.bottom(),
1963       //                                      blob_box.right(),blob_box.top(),
1964       //                                      (void*)blob,blob_it.length());
1965       //                      }
1966       bool overlap;
1967       do {
1968         overlap = false;
1969         if (!blob_it.at_last ()) {
1970           nextblob = blob_it.data_relative(1);
1971           overlap = blob_box.major_x_overlap(nextblob->bounding_box());
1972           if (overlap) {
1973             blob->merge(nextblob); // merge new blob
1974             blob_box = blob->bounding_box(); // get bigger box
1975             blob_it.forward();
1976           }
1977         }
1978       }
1979       while (overlap);
1980       blob->chop (&start_it, &blob_it,
1981         blob_rotation,
1982         block->line_size * textord_merge_x *
1983         textord_chop_width);
1984       //attempt chop
1985     }
1986 #ifndef GRAPHICS_DISABLED
1987     if (testing_on && textord_show_final_blobs) {
1988       if (to_win == NULL)
1989         create_to_win(page_tr);
1990       to_win->Pen(colour);
1991       for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1992       blob_it.forward ()) {
1993         blob = blob_it.data ();
1994         blob_box = blob->bounding_box ();
1995         blob_box.rotate (rotation);
1996         if (!blob->joined_to_prev ()) {
1997           to_win->Rectangle (blob_box.left (), blob_box.bottom (),
1998             blob_box.right (), blob_box.top ());
1999         }
2000       }
2001       colour = (ScrollView::Color) (colour + 1);
2002       if (colour > ScrollView::MAGENTA)
2003         colour = ScrollView::RED;
2004     }
2005 #endif
2006   }
2007 }
2008 
2009 
2010 /**********************************************************************
2011  * fit_parallel_rows
2012  *
2013  * Re-fit the rows in the block to the given gradient.
2014  **********************************************************************/
fit_parallel_rows(TO_BLOCK * block,float gradient,FCOORD rotation,inT32 block_edge,BOOL8 testing_on)2015 void fit_parallel_rows(                   //find lines
2016                        TO_BLOCK *block,   //block to do
2017                        float gradient,    //gradient to fit
2018                        FCOORD rotation,   //for drawing
2019                        inT32 block_edge,  //edge of block
2020                        BOOL8 testing_on   //correct orientation
2021                       ) {
2022 #ifndef GRAPHICS_DISABLED
2023   ScrollView::Color colour;                 //of row
2024 #endif
2025   TO_ROW_IT row_it = block->get_rows ();
2026 
2027   row_it.move_to_first ();
2028   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2029     if (row_it.data ()->blob_list ()->empty ())
2030       delete row_it.extract ();  //nothing in it
2031     else
2032       fit_parallel_lms (gradient, row_it.data ());
2033   }
2034 #ifndef GRAPHICS_DISABLED
2035   if (testing_on) {
2036     colour = ScrollView::RED;
2037     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2038       plot_parallel_row (row_it.data (), gradient,
2039         block_edge, colour, rotation);
2040       colour = (ScrollView::Color) (colour + 1);
2041       if (colour > ScrollView::MAGENTA)
2042         colour = ScrollView::RED;
2043     }
2044   }
2045 #endif
2046   row_it.sort (row_y_order);     //may have gone out of order
2047 }
2048 
2049 
2050 /**********************************************************************
2051  * fit_parallel_lms
2052  *
2053  * Fit an LMS line to a row.
2054  * Make the fit parallel to the given gradient and set the
2055  * row accordingly.
2056  **********************************************************************/
fit_parallel_lms(float gradient,TO_ROW * row)2057 void fit_parallel_lms(                 //sort function
2058                       float gradient,  //forced gradient
2059                       TO_ROW *row      //row to fit
2060                      ) {
2061   float c;                       //fitted line
2062   int blobcount;                 //no of blobs
2063   TBOX box;                       //blob box
2064   LMS lms (row->blob_list ()->length ());
2065                                  //blobs
2066   BLOBNBOX_IT blob_it = row->blob_list ();
2067 
2068   blobcount = 0;
2069   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
2070     if (!blob_it.data ()->joined_to_prev ()) {
2071       box = blob_it.data ()->bounding_box ();
2072       lms.
2073         add (FCOORD ((box.left () + box.right ()) / 2.0, box.bottom ()));
2074       blobcount++;
2075     }
2076   }
2077   lms.constrained_fit (gradient, c);
2078   row->set_parallel_line (gradient, c, lms.error ());
2079   if (textord_straight_baselines && blobcount > lms_line_trials) {
2080     lms.fit (gradient, c);
2081   }
2082                                  //set the other too
2083   row->set_line (gradient, c, lms.error ());
2084 }
2085 
2086 
2087 /**********************************************************************
2088  * make_spline_rows
2089  *
2090  * Re-fit the rows in the block to the given gradient.
2091  **********************************************************************/
make_spline_rows(TO_BLOCK * block,float gradient,FCOORD rotation,inT32 block_edge,BOOL8 testing_on,tesseract::Tesseract * tess)2092 void make_spline_rows(                   //find lines
2093                       TO_BLOCK *block,   //block to do
2094                       float gradient,    //gradient to fit
2095                       FCOORD rotation,   //for drawing
2096                       inT32 block_edge,  //edge of block
2097                       BOOL8 testing_on,  //correct orientation
2098                       tesseract::Tesseract* tess
2099                      ) {
2100 #ifndef GRAPHICS_DISABLED
2101   ScrollView::Color colour;       //of row
2102 #endif
2103   TO_ROW_IT row_it = block->get_rows ();
2104 
2105   row_it.move_to_first ();
2106   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2107     if (row_it.data ()->blob_list ()->empty ())
2108       delete row_it.extract ();  //nothing in it
2109     else
2110       make_baseline_spline (row_it.data (), block);
2111   }
2112   if (textord_old_baselines) {
2113 #ifndef GRAPHICS_DISABLED
2114     if (testing_on) {
2115       colour = ScrollView::RED;
2116       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
2117       row_it.forward ()) {
2118         row_it.data ()->baseline.plot (to_win, colour);
2119         colour = (ScrollView::Color) (colour + 1);
2120         if (colour > ScrollView::MAGENTA)
2121           colour = ScrollView::RED;
2122       }
2123     }
2124 #endif
2125     make_old_baselines(block, testing_on, gradient, tess);
2126   }
2127 #ifndef GRAPHICS_DISABLED
2128   if (testing_on) {
2129     colour = ScrollView::RED;
2130     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2131       row_it.data ()->baseline.plot (to_win, colour);
2132       colour = (ScrollView::Color) (colour + 1);
2133       if (colour > ScrollView::MAGENTA)
2134         colour = ScrollView::RED;
2135     }
2136   }
2137 #endif
2138 }
2139 
2140 
2141 /**********************************************************************
2142  * make_baseline_spline
2143  *
2144  * Fit an LMS line to a row.
2145  * Make the fit parallel to the given gradient and set the
2146  * row accordingly.
2147  **********************************************************************/
make_baseline_spline(TO_ROW * row,TO_BLOCK * block)2148 void make_baseline_spline(                 //sort function
2149                           TO_ROW *row,     //row to fit
2150                           TO_BLOCK *block  //block it came from
2151                          ) {
2152   float b, c;                    //fitted curve
2153   float middle;                  //x middle of blob
2154   TBOX box;                       //blob box
2155   LMS lms (row->blob_list ()->length ());
2156                                  //blobs
2157   BLOBNBOX_IT blob_it = row->blob_list ();
2158   inT32 *xstarts;                //spline boundaries
2159   double *coeffs;                //quadratic coeffs
2160   inT32 segments;                //no of segments
2161   inT32 segment;                 //current segment
2162 
2163   xstarts =
2164     (inT32 *) alloc_mem ((row->blob_list ()->length () + 1) * sizeof (inT32));
2165   if (segment_baseline (row, block, segments, xstarts)
2166   && !textord_straight_baselines && !textord_parallel_baselines) {
2167     if (textord_quadratic_baselines) {
2168       coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
2169       for (segment = 0; segment < segments; segment++) {
2170         lms.clear ();
2171         for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
2172         blob_it.forward ()) {
2173           if (!blob_it.data ()->joined_to_prev ()) {
2174             box = blob_it.data ()->bounding_box ();
2175             middle = (box.left () + box.right ()) / 2.0;
2176             if (middle >= xstarts[segment]
2177             && middle < xstarts[segment + 1]) {
2178               lms.add (FCOORD (middle, box.bottom ()));
2179             }
2180           }
2181         }
2182         if (textord_quadratic_baselines)
2183           lms.fit_quadratic (block->line_size *
2184             textord_spline_outlier_fraction,
2185             coeffs[segment * 3], b, c);
2186         else {
2187           lms.fit (b, c);
2188           coeffs[segment * 3] = 0;
2189         }
2190         coeffs[segment * 3 + 1] = b;
2191         coeffs[segment * 3 + 2] = c;
2192       }
2193     }
2194     else
2195       coeffs = linear_spline_baseline (row, block, segments, xstarts);
2196   }
2197   else {
2198     xstarts[1] = xstarts[segments];
2199     segments = 1;
2200     coeffs = (double *) alloc_mem (3 * sizeof (double));
2201     coeffs[0] = 0;
2202     coeffs[1] = row->line_m ();
2203     coeffs[2] = row->line_c ();
2204   }
2205   row->baseline = QSPLINE (segments, xstarts, coeffs);
2206   free_mem(coeffs);
2207   free_mem(xstarts);
2208 }
2209 
2210 
2211 /**********************************************************************
2212  * segment_baseline
2213  *
2214  * Divide the baseline up into segments which require a different
2215  * quadratic fitted to them.
2216  * Return TRUE if enough blobs were far enough away to need a quadratic.
2217  **********************************************************************/
2218 BOOL8
segment_baseline(TO_ROW * row,TO_BLOCK * block,inT32 & segments,inT32 xstarts[])2219 segment_baseline (               //split baseline
2220 TO_ROW * row,                    //row to fit
2221 TO_BLOCK * block,                //block it came from
2222 inT32 & segments,                //no fo segments
2223 inT32 xstarts[]                  //coords of segments
2224 ) {
2225   BOOL8 needs_curve;             //needs curved line
2226   int blobcount;                 //no of blobs
2227   int blobindex;                 //current blob
2228   int last_state;                //above, on , below
2229   int state;                     //of current blob
2230   float yshift;                  //from baseline
2231   TBOX box;                       //blob box
2232   TBOX new_box;                   //new_it box
2233   float middle;                  //xcentre of blob
2234                                  //blobs
2235   BLOBNBOX_IT blob_it = row->blob_list ();
2236   BLOBNBOX_IT new_it = blob_it;  //front end
2237   SORTED_FLOATS yshifts;         //shifts from baseline
2238 
2239   needs_curve = FALSE;
2240   box = box_next_pre_chopped (&blob_it);
2241   xstarts[0] = box.left ();
2242   segments = 1;
2243   blobcount = row->blob_list ()->length ();
2244   if (textord_oldbl_debug)
2245     tprintf ("Segmenting baseline of %d blobs at (%d,%d)\n",
2246       blobcount, box.left (), box.bottom ());
2247   if (blobcount <= textord_spline_medianwin
2248   || blobcount < textord_spline_minblobs) {
2249     blob_it.move_to_last ();
2250     box = blob_it.data ()->bounding_box ();
2251     xstarts[1] = box.right ();
2252     return FALSE;
2253   }
2254   last_state = 0;
2255   new_it.mark_cycle_pt ();
2256   for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
2257     new_box = box_next_pre_chopped (&new_it);
2258     middle = (new_box.left () + new_box.right ()) / 2.0;
2259     yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
2260                                  //record shift
2261     yshifts.add (yshift, blobindex);
2262     if (new_it.cycled_list ()) {
2263       xstarts[1] = new_box.right ();
2264       return FALSE;
2265     }
2266   }
2267   for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++)
2268     box = box_next_pre_chopped (&blob_it);
2269   do {
2270     new_box = box_next_pre_chopped (&new_it);
2271                                  //get middle one
2272     yshift = yshifts[textord_spline_medianwin / 2];
2273     if (yshift > textord_spline_shift_fraction * block->line_size)
2274       state = 1;
2275     else if (-yshift > textord_spline_shift_fraction * block->line_size)
2276       state = -1;
2277     else
2278       state = 0;
2279     if (state != 0)
2280       needs_curve = TRUE;
2281     //              tprintf("State=%d, prev=%d, shift=%g\n",
2282     //                      state,last_state,yshift);
2283     if (state != last_state && blobcount > textord_spline_minblobs) {
2284       xstarts[segments++] = box.left ();
2285       blobcount = 0;
2286     }
2287     last_state = state;
2288     yshifts.remove (blobindex - textord_spline_medianwin);
2289     box = box_next_pre_chopped (&blob_it);
2290     middle = (new_box.left () + new_box.right ()) / 2.0;
2291     yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
2292     yshifts.add (yshift, blobindex);
2293     blobindex++;
2294     blobcount++;
2295   }
2296   while (!new_it.cycled_list ());
2297   if (blobcount > textord_spline_minblobs || segments == 1) {
2298     xstarts[segments] = new_box.right ();
2299   }
2300   else {
2301     xstarts[--segments] = new_box.right ();
2302   }
2303   if (textord_oldbl_debug)
2304     tprintf ("Made %d segments on row at (%d,%d)\n",
2305       segments, box.right (), box.bottom ());
2306   return needs_curve;
2307 }
2308 
2309 
2310 /**********************************************************************
2311  * linear_spline_baseline
2312  *
2313  * Divide the baseline up into segments which require a different
2314  * quadratic fitted to them.
2315  * Return TRUE if enough blobs were far enough away to need a quadratic.
2316  **********************************************************************/
2317 double *
linear_spline_baseline(TO_ROW * row,TO_BLOCK * block,inT32 & segments,inT32 xstarts[])2318 linear_spline_baseline (         //split baseline
2319 TO_ROW * row,                    //row to fit
2320 TO_BLOCK * block,                //block it came from
2321 inT32 & segments,                //no fo segments
2322 inT32 xstarts[]                  //coords of segments
2323 ) {
2324   int blobcount;                 //no of blobs
2325   int blobindex;                 //current blob
2326   int index1, index2;            //blob numbers
2327   int blobs_per_segment;         //blobs in each
2328   TBOX box;                       //blob box
2329   TBOX new_box;                   //new_it box
2330   float middle;                  //xcentre of blob
2331                                  //blobs
2332   BLOBNBOX_IT blob_it = row->blob_list ();
2333   BLOBNBOX_IT new_it = blob_it;  //front end
2334   float b, c;                    //fitted curve
2335   LMS lms (row->blob_list ()->length ());
2336   double *coeffs;                //quadratic coeffs
2337   inT32 segment;                 //current segment
2338 
2339   box = box_next_pre_chopped (&blob_it);
2340   xstarts[0] = box.left ();
2341   blobcount = 1;
2342   while (!blob_it.at_first ()) {
2343     blobcount++;
2344     box = box_next_pre_chopped (&blob_it);
2345   }
2346   segments = blobcount / textord_spline_medianwin;
2347   if (segments < 1)
2348     segments = 1;
2349   blobs_per_segment = blobcount / segments;
2350   coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
2351   if (textord_oldbl_debug)
2352     tprintf
2353       ("Linear splining baseline of %d blobs at (%d,%d), into %d segments of %d blobs\n",
2354       blobcount, box.left (), box.bottom (), segments, blobs_per_segment);
2355   segment = 1;
2356   for (index2 = 0; index2 < blobs_per_segment / 2; index2++)
2357     box_next_pre_chopped(&new_it);
2358   index1 = 0;
2359   blobindex = index2;
2360   do {
2361     blobindex += blobs_per_segment;
2362     lms.clear ();
2363     while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
2364       box = box_next_pre_chopped (&blob_it);
2365       middle = (box.left () + box.right ()) / 2.0;
2366       lms.add (FCOORD (middle, box.bottom ()));
2367       index1++;
2368       if (index1 == blobindex - blobs_per_segment / 2
2369       || index1 == blobcount - 1) {
2370         xstarts[segment] = box.left ();
2371       }
2372     }
2373     lms.fit (b, c);
2374     coeffs[segment * 3 - 3] = 0;
2375     coeffs[segment * 3 - 2] = b;
2376     coeffs[segment * 3 - 1] = c;
2377     segment++;
2378     if (segment > segments)
2379       break;
2380 
2381     blobindex += blobs_per_segment;
2382     lms.clear ();
2383     while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
2384       new_box = box_next_pre_chopped (&new_it);
2385       middle = (new_box.left () + new_box.right ()) / 2.0;
2386       lms.add (FCOORD (middle, new_box.bottom ()));
2387       index2++;
2388       if (index2 == blobindex - blobs_per_segment / 2
2389       || index2 == blobcount - 1) {
2390         xstarts[segment] = new_box.left ();
2391       }
2392     }
2393     lms.fit (b, c);
2394     coeffs[segment * 3 - 3] = 0;
2395     coeffs[segment * 3 - 2] = b;
2396     coeffs[segment * 3 - 1] = c;
2397     segment++;
2398   }
2399   while (segment <= segments);
2400   return coeffs;
2401 }
2402 
2403 
2404 /**********************************************************************
2405  * assign_blobs_to_rows
2406  *
2407  * Make enough rows to allocate all the given blobs to one.
2408  * If a block skew is given, use that, else attempt to track it.
2409  **********************************************************************/
assign_blobs_to_rows(TO_BLOCK * block,float * gradient,int pass,BOOL8 reject_misses,BOOL8 make_new_rows,BOOL8 drawing_skew)2410 void assign_blobs_to_rows(                      //find lines
2411                           TO_BLOCK *block,      //block to do
2412                           float *gradient,      //block skew
2413                           int pass,             //identification
2414                           BOOL8 reject_misses,  //chuck big ones out
2415                           BOOL8 make_new_rows,  //add rows for unmatched
2416                           BOOL8 drawing_skew    //draw smoothed skew
2417                          ) {
2418   OVERLAP_STATE overlap_result;  //what to do with it
2419   float ycoord;                  //current y
2420   float top, bottom;             //of blob
2421   float g_length = 1.0f;         //from gradient
2422   inT16 row_count;               //no of rows
2423   inT16 left_x;                  //left edge
2424   inT16 last_x;                  //previous edge
2425   float block_skew;              //y delta
2426   float smooth_factor;           //for new coords
2427   float near_dist;               //dist to nearest row
2428   ICOORD testpt;                 //testing only
2429   BLOBNBOX *blob;                //current blob
2430   TO_ROW *row;                   //current row
2431   TO_ROW *dest_row;              //row to put blob in
2432                                  //iterators
2433   BLOBNBOX_IT blob_it = &block->blobs;
2434   TO_ROW_IT row_it = block->get_rows ();
2435 
2436   ycoord =
2437     (block->block->bounding_box ().bottom () +
2438     block->block->bounding_box ().top ()) / 2.0f;
2439   if (gradient != NULL)
2440     g_length = sqrt (1 + *gradient * *gradient);
2441 #ifndef GRAPHICS_DISABLED
2442   if (drawing_skew)
2443     to_win->SetCursor(block->block->bounding_box ().left (), ycoord);
2444 #endif
2445   testpt = ICOORD (textord_test_x, textord_test_y);
2446   blob_it.sort (blob_x_order);
2447   smooth_factor = 1.0;
2448   block_skew = 0.0f;
2449   row_count = row_it.length ();  //might have rows
2450   if (!blob_it.empty ()) {
2451     left_x = blob_it.data ()->bounding_box ().left ();
2452   }
2453   else {
2454     left_x = block->block->bounding_box ().left ();
2455   }
2456   last_x = left_x;
2457   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
2458     blob = blob_it.data ();
2459     if (gradient != NULL) {
2460       block_skew = (1 - 1 / g_length) * blob->bounding_box ().bottom ()
2461         + *gradient / g_length * blob->bounding_box ().left ();
2462     }
2463     else if (blob->bounding_box ().left () - last_x > block->line_size / 2
2464       && last_x - left_x > block->line_size * 2
2465     && textord_interpolating_skew) {
2466       //                      tprintf("Interpolating skew from %g",block_skew);
2467       block_skew *= (float) (blob->bounding_box ().left () - left_x)
2468         / (last_x - left_x);
2469       //                      tprintf("to %g\n",block_skew);
2470     }
2471     last_x = blob->bounding_box ().left ();
2472     top = blob->bounding_box ().top () - block_skew;
2473     bottom = blob->bounding_box ().bottom () - block_skew;
2474 #ifndef GRAPHICS_DISABLED
2475     if (drawing_skew)
2476       to_win->DrawTo(blob->bounding_box ().left (), ycoord + block_skew);
2477 #endif
2478     if (!row_it.empty ()) {
2479       for (row_it.move_to_first ();
2480         !row_it.at_last () && row_it.data ()->min_y () > top;
2481         row_it.forward ());
2482       row = row_it.data ();
2483       if (row->min_y () <= top && row->max_y () >= bottom) {
2484       //any overlap
2485         dest_row = row;
2486         overlap_result = most_overlapping_row (&row_it, dest_row,
2487           top, bottom,
2488           block->line_size,
2489           blob->bounding_box ().
2490           contains (testpt));
2491         if (overlap_result == NEW_ROW && !reject_misses)
2492           overlap_result = ASSIGN;
2493       }
2494       else {
2495         overlap_result = NEW_ROW;
2496         if (!make_new_rows) {
2497           near_dist = row_it.data_relative (-1)->min_y () - top;
2498                                  //below bottom
2499           if (bottom < row->min_y ()) {
2500             if (row->min_y () - bottom <=
2501               (block->line_spacing -
2502             block->line_size) * textord_merge_desc) {
2503                                  //done it
2504               overlap_result = ASSIGN;
2505               dest_row = row;
2506             }
2507           }
2508           else if (near_dist > 0
2509           && near_dist < bottom - row->max_y ()) {
2510             row_it.backward ();
2511             dest_row = row_it.data ();
2512             if (dest_row->min_y () - bottom <=
2513               (block->line_spacing -
2514             block->line_size) * textord_merge_desc) {
2515                                  //done it
2516               overlap_result = ASSIGN;
2517             }
2518           }
2519           else {
2520             if (top - row->max_y () <=
2521               (block->line_spacing -
2522               block->line_size) * (textord_overlap_x +
2523             textord_merge_asc)) {
2524                                  //done it
2525               overlap_result = ASSIGN;
2526               dest_row = row;
2527             }
2528           }
2529         }
2530       }
2531       if (overlap_result == ASSIGN)
2532         dest_row->add_blob (blob_it.extract (), top, bottom,
2533           block->line_size);
2534       if (overlap_result == NEW_ROW) {
2535         if (make_new_rows && top - bottom < block->max_blob_size) {
2536           dest_row =
2537             new TO_ROW (blob_it.extract (), top, bottom,
2538             block->line_size);
2539           row_count++;
2540           if (bottom > row_it.data ()->min_y ())
2541             row_it.add_before_then_move (dest_row);
2542           //insert in right place
2543           else
2544             row_it.add_after_then_move (dest_row);
2545           smooth_factor =
2546             1.0 / (row_count * textord_skew_lag +
2547             textord_skewsmooth_offset);
2548         }
2549         else
2550           overlap_result = REJECT;
2551       }
2552     }
2553     else if (make_new_rows && top - bottom < block->max_blob_size) {
2554       overlap_result = NEW_ROW;
2555       dest_row =
2556         new TO_ROW (blob_it.extract (), top, bottom, block->line_size);
2557       row_count++;
2558       row_it.add_after_then_move (dest_row);
2559       smooth_factor = 1.0 / (row_count * textord_skew_lag +
2560                              textord_skewsmooth_offset2);
2561     }
2562     else
2563       overlap_result = REJECT;
2564     if (blob->bounding_box ().contains (testpt)) {
2565       if (overlap_result != REJECT) {
2566         tprintf ("Test blob assigned to row at (%g,%g) on pass %d\n",
2567           dest_row->min_y (), dest_row->max_y (), pass);
2568       }
2569       else {
2570         tprintf ("Test blob assigned to no row on pass %d\n", pass);
2571       }
2572     }
2573     if (overlap_result != REJECT) {
2574       while (!row_it.at_first ()
2575         && row_it.data ()->min_y () >
2576       row_it.data_relative (-1)->min_y ()) {
2577         row = row_it.extract ();
2578         row_it.backward ();
2579         row_it.add_before_then_move (row);
2580       }
2581       while (!row_it.at_last ()
2582         && row_it.data ()->min_y () <
2583       row_it.data_relative (1)->min_y ()) {
2584         row = row_it.extract ();
2585         row_it.forward ();
2586                                  //keep rows in order
2587         row_it.add_after_then_move (row);
2588       }
2589       block_skew = (1 - smooth_factor) * block_skew
2590         + smooth_factor * (blob->bounding_box ().bottom () -
2591         dest_row->initial_min_y ());
2592     }
2593   }
2594   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2595     if (row_it.data ()->blob_list ()->empty ())
2596       delete row_it.extract ();  //discard empty rows
2597   }
2598 }
2599 
2600 
2601 /**********************************************************************
2602  * most_overlapping_row
2603  *
2604  * Return the row which most overlaps the blob.
2605  **********************************************************************/
most_overlapping_row(TO_ROW_IT * row_it,TO_ROW * & best_row,float top,float bottom,float rowsize,BOOL8 testing_blob)2606 OVERLAP_STATE most_overlapping_row(                    //find best row
2607                                    TO_ROW_IT *row_it,  //iterator
2608                                    TO_ROW *&best_row,  //output row
2609                                    float top,          //top of blob
2610                                    float bottom,       //bottom of blob
2611                                    float rowsize,      //max row size
2612                                    BOOL8 testing_blob  //test stuff
2613                                   ) {
2614   OVERLAP_STATE result;          //result of tests
2615   float overlap;                 //of blob & row
2616   float bestover;                //nearest row
2617   float merge_top, merge_bottom; //size of merged row
2618   ICOORD testpt;                 //testing only
2619   TO_ROW *row;                   //current row
2620   TO_ROW *test_row;              //for multiple overlaps
2621   BLOBNBOX_IT blob_it;           //for merging rows
2622 
2623   result = ASSIGN;
2624   row = row_it->data ();
2625   bestover = top - bottom;
2626   if (top > row->max_y ())
2627     bestover -= top - row->max_y ();
2628   if (bottom < row->min_y ())
2629                                  //compute overlap
2630     bestover -= row->min_y () - bottom;
2631   if (testing_blob) {
2632     tprintf ("Test blob y=(%g,%g), row=(%f,%f), overlap=%f\n",
2633       bottom, top, row->min_y (), row->max_y (), bestover);
2634   }
2635   test_row = row;
2636   do {
2637     if (!row_it->at_last ()) {
2638       row_it->forward ();
2639       test_row = row_it->data ();
2640       if (test_row->min_y () <= top && test_row->max_y () >= bottom) {
2641         merge_top =
2642           test_row->max_y () >
2643           row->max_y ()? test_row->max_y () : row->max_y ();
2644         merge_bottom =
2645           test_row->min_y () <
2646           row->min_y ()? test_row->min_y () : row->min_y ();
2647         if (merge_top - merge_bottom <= rowsize) {
2648           if (testing_blob) {
2649             tprintf ("Merging rows at (%g,%g), (%g,%g)\n",
2650               row->min_y (), row->max_y (),
2651               test_row->min_y (), test_row->max_y ());
2652           }
2653           test_row->set_limits (merge_bottom, merge_top);
2654           blob_it.set_to_list (test_row->blob_list ());
2655           blob_it.add_list_after (row->blob_list ());
2656           blob_it.sort (blob_x_order);
2657           row_it->backward ();
2658           delete row_it->extract ();
2659           row_it->forward ();
2660           bestover = -1.0f;      //force replacement
2661         }
2662         overlap = top - bottom;
2663         if (top > test_row->max_y ())
2664           overlap -= top - test_row->max_y ();
2665         if (bottom < test_row->min_y ())
2666           overlap -= test_row->min_y () - bottom;
2667         if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
2668           result = REJECT;
2669         }
2670         if (overlap > bestover) {
2671           bestover = overlap;    //find biggest overlap
2672           row = test_row;
2673         }
2674         if (testing_blob) {
2675           tprintf
2676             ("Test blob y=(%g,%g), row=(%f,%f), overlap=%f->%f\n",
2677             bottom, top, test_row->min_y (), test_row->max_y (),
2678             overlap, bestover);
2679         }
2680       }
2681     }
2682   }
2683   while (!row_it->at_last ()
2684     && test_row->min_y () <= top && test_row->max_y () >= bottom);
2685   while (row_it->data () != row)
2686     row_it->backward ();         //make it point to row
2687                                  //doesn't overlap much
2688   if (top - bottom - bestover > rowsize * textord_overlap_x &&
2689       (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x)
2690     && result == ASSIGN)
2691     result = NEW_ROW;            //doesn't overlap enough
2692   best_row = row;
2693   return result;
2694 }
2695 
2696 
2697 /**********************************************************************
2698  * blob_x_order
2699  *
2700  * Sort function to sort blobs in x from page left.
2701  **********************************************************************/
blob_x_order(const void * item1,const void * item2)2702 int blob_x_order(                    //sort function
2703                  const void *item1,  //items to compare
2704                  const void *item2) {
2705                                  //converted ptr
2706   BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
2707                                  //converted ptr
2708   BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
2709 
2710   if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
2711     return -1;
2712   else if (blob1->bounding_box ().left () > blob2->bounding_box ().left ())
2713     return 1;
2714   else
2715     return 0;
2716 }
2717 
2718 
2719 /**********************************************************************
2720  * row_y_order
2721  *
2722  * Sort function to sort rows in y from page top.
2723  **********************************************************************/
row_y_order(const void * item1,const void * item2)2724 int row_y_order(                    //sort function
2725                 const void *item1,  //items to compare
2726                 const void *item2) {
2727                                  //converted ptr
2728   TO_ROW *row1 = *(TO_ROW **) item1;
2729                                  //converted ptr
2730   TO_ROW *row2 = *(TO_ROW **) item2;
2731 
2732   if (row1->parallel_c () > row2->parallel_c ())
2733     return -1;
2734   else if (row1->parallel_c () < row2->parallel_c ())
2735     return 1;
2736   else
2737     return 0;
2738 }
2739 
2740 
2741 /**********************************************************************
2742  * row_spacing_order
2743  *
2744  * Qsort style function to compare 2 TO_ROWS based on their spacing value.
2745  **********************************************************************/
row_spacing_order(const void * item1,const void * item2)2746 int row_spacing_order(                    //sort function
2747                       const void *item1,  //items to compare
2748                       const void *item2) {
2749                                  //converted ptr
2750   TO_ROW *row1 = *(TO_ROW **) item1;
2751                                  //converted ptr
2752   TO_ROW *row2 = *(TO_ROW **) item2;
2753 
2754   if (row1->spacing < row2->spacing)
2755     return -1;
2756   else if (row1->spacing > row2->spacing)
2757     return 1;
2758   else
2759     return 0;
2760 }
2761 
2762 /**********************************************************************
2763  * make_repeated_chars
2764  *
2765  * Mark textord_repeat_threshold or more adjacent chars which are the
2766  * same as repeated chars.
2767  **********************************************************************/
mark_repeated_chars(TO_ROW * row,float block_xheight,tesseract::Tesseract * tess)2768 void mark_repeated_chars(TO_ROW *row, float block_xheight,
2769                          tesseract::Tesseract *tess) {
2770   ROW *real_row = NULL;          //output row
2771   BLOBNBOX *bblob;               //current blob
2772   BLOBNBOX *nextblob;            //neighbour to compare
2773   BLOBNBOX_IT box_it;            //iterator
2774   BLOBNBOX_IT search_it;         //forward search
2775   inT32 blobcount;               //no of neighbours
2776   inT32 matched_blobcount;       //no of matches
2777   inT32 blobindex;               //in row
2778   inT32 row_length;              //blobs in row
2779   inT32 width_change;            //max width change
2780   inT32 blob_width;              //required blob width
2781   inT32 space_width;             //required gap width
2782   inT32 prev_right;              //right edge of last blob
2783   float rating;                  //match rating
2784   PBLOB *pblob1;                 //polygonal blob
2785   PBLOB *pblob2;                 //second blob
2786 
2787   // kern_size and space_size are computed in the same way as in
2788   // compute_block_pitch().
2789   float kern_size = ceil(block_xheight * textord_words_default_nonspace);
2790   float space_size = floor(block_xheight * textord_words_default_minspace);
2791   int num_repeated_sets = 0;
2792   box_it.set_to_list(row->blob_list());
2793   row_length = row->blob_list()->length();
2794   blobindex = 0;
2795   if (!box_it.empty()) {
2796     if (textord_debug_xheights)
2797       tprintf("Running mark_repeated_chars(), row length %d\n", row_length);
2798     real_row = new ROW(row, static_cast<inT32>(kern_size),
2799                        static_cast<inT32>(space_size));
2800     // Use block_xheight, since xheight of the row (used in the ROW()
2801     // constructor) might not have been computed yet.
2802     real_row->set_x_height(block_xheight);
2803     do {
2804       bblob = box_it.data();
2805       blobcount = 1;
2806       search_it = box_it;
2807       search_it.forward();
2808       matched_blobcount = 1;
2809       width_change = MAX_INT16;
2810       blob_width = 0;
2811       space_width = 0;
2812       prev_right = bblob->bounding_box().right();
2813       if (bblob->bounding_box().height() * 2 < block_xheight &&
2814           !bblob->joined_to_prev() &&
2815           (bblob->blob() != NULL || bblob->cblob() != NULL)) {
2816         pblob1 = (bblob->cblob() != NULL) ?
2817           new PBLOB(bblob->cblob(), block_xheight) : bblob->blob();
2818         rating = 0.0f;
2819         while (rating < textord_repeat_rating &&
2820                blobindex + blobcount < row_length &&
2821                ((nextblob = search_it.data())->blob() != NULL ||
2822                 nextblob->cblob() != NULL) &&
2823                nextblob->bounding_box().height() * 2 < block_xheight) {
2824           if (blobcount == 1) {
2825             space_width = nextblob->bounding_box().left() -
2826               bblob->bounding_box().right();
2827             blob_width = bblob->bounding_box().width();
2828             width_change = blob_width > space_width ? blob_width : space_width;
2829             width_change =
2830               static_cast<inT32>(width_change * textord_repch_width_variance);
2831             if (width_change < 3) width_change = 3;
2832           }
2833           if (nextblob->bounding_box().width() > blob_width + width_change ||
2834               nextblob->bounding_box().width() < blob_width - width_change ||
2835               nextblob->bounding_box().left() - prev_right >
2836               space_width + width_change ||
2837               nextblob->bounding_box().left() - prev_right <
2838               space_width - width_change) {
2839             break;  // not good enough
2840           }
2841           if (nextblob->blob() != NULL)
2842             rating = tess->compare_blobs(pblob1, real_row,
2843                                          nextblob->blob(), real_row);
2844           else {
2845               pblob2 = new PBLOB(nextblob->cblob(), block_xheight);
2846               rating = tess->compare_blobs(pblob1, real_row, pblob2, real_row);
2847               delete pblob2;
2848           }
2849           if (rating < textord_repeat_rating) {
2850             blobcount++;
2851             search_it.forward();
2852             matched_blobcount++;
2853             while (blobindex + blobcount < row_length &&
2854                    (search_it.data()->joined_to_prev() ||
2855                     (search_it.data()->blob() == NULL &&
2856                      search_it.data()->cblob() == NULL))) {
2857               search_it.forward();
2858               blobcount++;     //suck in joined bits
2859             }
2860           }
2861           prev_right = nextblob->bounding_box().right();
2862         }
2863         if (bblob->cblob() != NULL) delete pblob1;
2864       }
2865 
2866       // Record position and length of this run of repeated chars.
2867       if (matched_blobcount >= textord_repeat_threshold) {
2868         if (textord_debug_xheights) {
2869           tprintf("Found %d repeated chars starting at blob index %d\n",
2870                   blobcount, blobindex);
2871         }
2872         blobindex += blobcount;
2873         num_repeated_sets++;
2874         while (blobcount-- > 0 && !box_it.at_first()) {
2875           box_it.data()->set_repeated_set(num_repeated_sets);
2876           box_it.forward();
2877         }
2878       } else {  // just forward box_it to the next blob
2879         blobindex += blobcount;
2880         box_it.forward();
2881       }
2882     } while (!box_it.at_first());  // until all done
2883 
2884     if (real_row != NULL) delete real_row;
2885   }
2886   row->set_num_repeated_sets(num_repeated_sets);
2887 }
2888