• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**********************************************************************
2  * File:        topitch.cpp  (Formerly to_pitch.c)
3  * Description: Code to determine fixed pitchness and the pitch if fixed.
4  * Author:		Ray Smith
5  * Created:		Tue Aug 24 16:57:29 BST 1993
6  *
7  * (C) Copyright 1993, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #ifdef __UNIX__
22 #include          <assert.h>
23 #endif
24 #include          "stderr.h"
25 #include          "blobbox.h"
26 #include          "lmedsq.h"
27 #include          "statistc.h"
28 #include          "drawtord.h"
29 #include          "makerow.h"
30 #include          "pitsync1.h"
31 #include          "pithsync.h"
32 #include          "blobcmpl.h"
33 #include          "tovars.h"
34 #include          "wordseg.h"
35 #include          "topitch.h"
36 #include          "secname.h"
37 #include          "tesseractclass.h"
38 
39 #define EXTERN
40 
41 EXTERN BOOL_VAR (textord_all_prop, FALSE, "All doc is proportial text");
42 EXTERN BOOL_VAR (textord_debug_pitch_test, FALSE,
43 "Debug on fixed pitch test");
44 EXTERN BOOL_VAR (textord_disable_pitch_test, FALSE,
45 "Turn off dp fixed pitch algorithm");
46 EXTERN BOOL_VAR (textord_fast_pitch_test, FALSE,
47 "Do even faster pitch algorithm");
48 EXTERN BOOL_VAR (textord_debug_pitch_metric, FALSE,
49 "Write full metric stuff");
50 EXTERN BOOL_VAR (textord_show_row_cuts, FALSE, "Draw row-level cuts");
51 EXTERN BOOL_VAR (textord_show_page_cuts, FALSE, "Draw page-level cuts");
52 EXTERN BOOL_VAR (textord_pitch_cheat, FALSE,
53 "Use correct answer for fixed/prop");
54 EXTERN BOOL_VAR (textord_blockndoc_fixed, FALSE,
55 "Attempt whole doc/block fixed pitch");
56 EXTERN double_VAR (textord_projection_scale, 0.200, "Ding rate for mid-cuts");
57 EXTERN double_VAR (textord_balance_factor, 1.0,
58 "Ding rate for unbalanced char cells");
59 
60 #define FIXED_WIDTH_MULTIPLE  5
61 #define BLOCK_STATS_CLUSTERS  10
62 #define MAX_ALLOWED_PITCH 100    //max pixel pitch.
63 
64 /**********************************************************************
65  * compute_fixed_pitch
66  *
67  * Decide whether each row is fixed pitch individually.
68  * Correlate definite and uncertain results to obtain an individual
69  * result for each row in the TO_ROW class.
70  **********************************************************************/
71 
compute_fixed_pitch(ICOORD page_tr,TO_BLOCK_LIST * port_blocks,float gradient,FCOORD rotation,BOOL8 testing_on,tesseract::Tesseract * tess)72 void compute_fixed_pitch(                             //determine pitch
73                          ICOORD page_tr,              //top right
74                          TO_BLOCK_LIST *port_blocks,  //input list
75                          float gradient,              //page skew
76                          FCOORD rotation,             //for drawing
77                          BOOL8 testing_on,            //correct orientation
78                          tesseract::Tesseract* tess
79                         ) {
80   TO_BLOCK_IT block_it;          //iterator
81   TO_BLOCK *block;               //current block;
82   TO_ROW_IT row_it;              //row iterator
83   TO_ROW *row;                   //current row
84   int block_index;               //block number
85   int row_index;                 //row number
86 
87 #ifndef GRAPHICS_DISABLED
88   if (textord_show_initial_words && testing_on) {
89     if (to_win == NULL)
90       create_to_win(page_tr);
91   }
92 #endif
93 
94   block_it.set_to_list (port_blocks);
95   block_index = 1;
96   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
97   block_it.forward ()) {
98     block = block_it.data ();
99     compute_block_pitch(block, rotation, block_index, testing_on, tess);
100     block_index++;
101   }
102 
103   if (!try_doc_fixed (page_tr, port_blocks, gradient)) {
104     block_index = 1;
105     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
106     block_it.forward ()) {
107       block = block_it.data ();
108       if (!try_block_fixed (block, block_index))
109         try_rows_fixed(block, block_index, testing_on);
110       block_index++;
111     }
112   }
113 
114   block_index = 1;
115   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
116   block_it.forward ()) {
117     block = block_it.data ();
118     row_it.set_to_list (block->get_rows ());
119     row_index = 1;
120     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
121       row = row_it.data ();
122       fix_row_pitch(row, block, port_blocks, row_index, block_index);
123       row_index++;
124     }
125     block_index++;
126   }
127 #ifndef GRAPHICS_DISABLED
128   if (textord_show_initial_words && testing_on) {
129     ScrollView::Update();
130   }
131 #endif
132 }
133 
134 
135 /**********************************************************************
136  * fix_row_pitch
137  *
138  * Get a pitch_decision for this row by voting among similar rows in the
139  * block, then similar rows over all the page, or any other rows at all.
140  **********************************************************************/
141 
fix_row_pitch(TO_ROW * bad_row,TO_BLOCK * bad_block,TO_BLOCK_LIST * blocks,inT32 row_target,inT32 block_target)142 void fix_row_pitch(TO_ROW *bad_row,        // row to fix
143                    TO_BLOCK *bad_block,    //block of bad_row
144                    TO_BLOCK_LIST *blocks,  //blocks to scan
145                    inT32 row_target,       //number of row
146                    inT32 block_target) {   // number of block
147   inT16 mid_cuts;
148   int block_votes;               //votes in block
149   int like_votes;                //votes over page
150   int other_votes;               //votes of unlike blocks
151   int block_index;               //number of block
152   int row_index;                 //number of row
153   int maxwidth;                  //max pitch
154   TO_BLOCK_IT block_it = blocks; //block iterator
155   TO_ROW_IT row_it;
156   TO_BLOCK *block;               //current block
157   TO_ROW *row;                   //current row
158   float sp_sd;                   //space deviation
159   STATS block_stats;             //pitches in block
160   STATS like_stats;              //pitches in page
161 
162   block_votes = like_votes = other_votes = 0;
163   maxwidth = (inT32) ceil (bad_row->xheight * textord_words_maxspace);
164   if (bad_row->pitch_decision != PITCH_DEF_FIXED
165   && bad_row->pitch_decision != PITCH_DEF_PROP) {
166     block_stats.set_range (0, maxwidth);
167     like_stats.set_range (0, maxwidth);
168     block_index = 1;
169     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
170     block_it.forward ()) {
171       block = block_it.data ();
172       row_index = 1;
173       row_it.set_to_list (block->get_rows ());
174       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
175       row_it.forward ()) {
176         row = row_it.data ();
177         if ((bad_row->all_caps
178           && row->xheight + row->ascrise
179           <
180           (bad_row->xheight + bad_row->ascrise) * (1 +
181           textord_pitch_rowsimilarity)
182           && row->xheight + row->ascrise >
183           (bad_row->xheight + bad_row->ascrise) * (1 -
184           textord_pitch_rowsimilarity))
185           || (!bad_row->all_caps
186           && row->xheight <
187           bad_row->xheight * (1 + textord_pitch_rowsimilarity)
188           && row->xheight >
189         bad_row->xheight * (1 - textord_pitch_rowsimilarity))) {
190           if (block_index == block_target) {
191             if (row->pitch_decision == PITCH_DEF_FIXED) {
192               block_votes += textord_words_veto_power;
193               block_stats.add ((inT32) row->fixed_pitch,
194                 textord_words_veto_power);
195             }
196             else if (row->pitch_decision == PITCH_MAYBE_FIXED
197             || row->pitch_decision == PITCH_CORR_FIXED) {
198               block_votes++;
199               block_stats.add ((inT32) row->fixed_pitch, 1);
200             }
201             else if (row->pitch_decision == PITCH_DEF_PROP)
202               block_votes -= textord_words_veto_power;
203             else if (row->pitch_decision == PITCH_MAYBE_PROP
204               || row->pitch_decision == PITCH_CORR_PROP)
205               block_votes--;
206           }
207           else {
208             if (row->pitch_decision == PITCH_DEF_FIXED) {
209               like_votes += textord_words_veto_power;
210               like_stats.add ((inT32) row->fixed_pitch,
211                 textord_words_veto_power);
212             }
213             else if (row->pitch_decision == PITCH_MAYBE_FIXED
214             || row->pitch_decision == PITCH_CORR_FIXED) {
215               like_votes++;
216               like_stats.add ((inT32) row->fixed_pitch, 1);
217             }
218             else if (row->pitch_decision == PITCH_DEF_PROP)
219               like_votes -= textord_words_veto_power;
220             else if (row->pitch_decision == PITCH_MAYBE_PROP
221               || row->pitch_decision == PITCH_CORR_PROP)
222               like_votes--;
223           }
224         }
225         else {
226           if (row->pitch_decision == PITCH_DEF_FIXED)
227             other_votes += textord_words_veto_power;
228           else if (row->pitch_decision == PITCH_MAYBE_FIXED
229             || row->pitch_decision == PITCH_CORR_FIXED)
230             other_votes++;
231           else if (row->pitch_decision == PITCH_DEF_PROP)
232             other_votes -= textord_words_veto_power;
233           else if (row->pitch_decision == PITCH_MAYBE_PROP
234             || row->pitch_decision == PITCH_CORR_PROP)
235             other_votes--;
236         }
237         row_index++;
238       }
239       block_index++;
240     }
241     if (block_votes > textord_words_veto_power) {
242       bad_row->fixed_pitch = block_stats.ile (0.5);
243       bad_row->pitch_decision = PITCH_CORR_FIXED;
244     }
245     else if (block_votes <= textord_words_veto_power && like_votes > 0) {
246       bad_row->fixed_pitch = like_stats.ile (0.5);
247       bad_row->pitch_decision = PITCH_CORR_FIXED;
248     }
249     else {
250       bad_row->pitch_decision = PITCH_CORR_PROP;
251       #ifndef SECURE_NAMES
252       if (block_votes == 0 && like_votes == 0 && other_votes > 0
253         && (textord_debug_pitch_test || textord_debug_pitch_metric))
254         tprintf
255           ("Warning:row %d of block %d set prop with no like rows against trend\n",
256           row_target, block_target);
257       #endif
258     }
259   }
260   if (textord_debug_pitch_metric) {
261     tprintf (":b_votes=%d:l_votes=%d:o_votes=%d",
262       block_votes, like_votes, other_votes);
263     tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise);
264     }
265   if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
266     if (bad_row->fixed_pitch < textord_min_xheight) {
267       if (block_votes > 0)
268         bad_row->fixed_pitch = block_stats.ile (0.5);
269       else if (block_votes == 0 && like_votes > 0)
270         bad_row->fixed_pitch = like_stats.ile (0.5);
271       else {
272         tprintf
273           ("Warning:guessing pitch as xheight on row %d, block %d\n",
274           row_target, block_target);
275         bad_row->fixed_pitch = bad_row->xheight;
276       }
277     }
278     if (bad_row->fixed_pitch < textord_min_xheight)
279       bad_row->fixed_pitch = (float) textord_min_xheight;
280     bad_row->kern_size = bad_row->fixed_pitch / 4;
281     bad_row->min_space = (inT32) (bad_row->fixed_pitch * 0.6);
282     bad_row->max_nonspace = (inT32) (bad_row->fixed_pitch * 0.4);
283     bad_row->space_threshold =
284       (bad_row->min_space + bad_row->max_nonspace) / 2;
285     bad_row->space_size = bad_row->fixed_pitch;
286     if (bad_row->char_cells.empty ())
287       tune_row_pitch (bad_row, &bad_row->projection,
288         bad_row->projection_left, bad_row->projection_right,
289         (bad_row->fixed_pitch +
290         bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
291         sp_sd, mid_cuts, &bad_row->char_cells, FALSE);
292   }
293   else if (bad_row->pitch_decision == PITCH_CORR_PROP
294   || bad_row->pitch_decision == PITCH_DEF_PROP) {
295     bad_row->fixed_pitch = 0.0f;
296     bad_row->char_cells.clear ();
297   }
298 }
299 
300 
301 /**********************************************************************
302  * compute_block_pitch
303  *
304  * Decide whether each block is fixed pitch individually.
305  **********************************************************************/
306 
compute_block_pitch(TO_BLOCK * block,FCOORD rotation,inT32 block_index,BOOL8 testing_on,tesseract::Tesseract * tess)307 void compute_block_pitch(                    //process each block
308                          TO_BLOCK *block,    //input list
309                          FCOORD rotation,    //for drawing
310                          inT32 block_index,  //block number
311                          BOOL8 testing_on,   //correct orientation
312                          tesseract::Tesseract* tess
313                         ) {
314   TBOX block_box;                 //bounding box
315 
316   block_box = block->block->bounding_box ();
317   if (testing_on && textord_debug_pitch_test) {
318     tprintf ("Block %d at (%d,%d)->(%d,%d)\n",
319       block_index,
320       block_box.left (), block_box.bottom (),
321       block_box.right (), block_box.top ());
322   }
323   block->min_space = (inT32) floor (block->xheight
324     * textord_words_default_minspace);
325   block->max_nonspace = (inT32) ceil (block->xheight
326     * textord_words_default_nonspace);
327   block->fixed_pitch = 0.0f;
328   block->space_size = (float) block->min_space;
329   block->kern_size = (float) block->max_nonspace;
330   block->pr_nonsp = block->xheight * words_default_prop_nonspace;
331   block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;
332   if (!block->get_rows ()->empty ()) {
333     ASSERT_HOST (block->xheight > 0);
334     if (textord_repeat_extraction)
335       find_repeated_chars(block, textord_show_initial_words &&testing_on, tess);
336 #ifndef GRAPHICS_DISABLED
337     if (textord_show_initial_words && testing_on)
338       //overlap_picture_ops(TRUE);
339       ScrollView::Update();
340 #endif
341     compute_rows_pitch(block,
342                        block_index,
343                        textord_debug_pitch_test &&testing_on);
344   }
345 }
346 
347 
348 /**********************************************************************
349  * compute_rows_pitch
350  *
351  * Decide whether each row is fixed pitch individually.
352  **********************************************************************/
353 
compute_rows_pitch(TO_BLOCK * block,inT32 block_index,BOOL8 testing_on)354 BOOL8 compute_rows_pitch(                    //find line stats
355                          TO_BLOCK *block,    //block to do
356                          inT32 block_index,  //block number
357                          BOOL8 testing_on    //correct orientation
358                         ) {
359   inT32 maxwidth;                //of spaces
360   TO_ROW *row;                   //current row
361   inT32 row_index;               //row number.
362   float lower, upper;            //cluster thresholds
363   TO_ROW_IT row_it = block->get_rows ();
364 
365   row_index = 1;
366   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
367     row = row_it.data ();
368     ASSERT_HOST (row->xheight > 0);
369     row->compute_vertical_projection ();
370     maxwidth = (inT32) ceil (row->xheight * textord_words_maxspace);
371     if (row_pitch_stats (row, maxwidth, testing_on)
372       && find_row_pitch (row, maxwidth,
373       textord_dotmatrix_gap + 1, block, block_index,
374     row_index, testing_on)) {
375       if (row->fixed_pitch == 0) {
376         lower = row->pr_nonsp;
377         upper = row->pr_space;
378         row->space_size = upper;
379         row->kern_size = lower;
380       }
381     }
382     else {
383       row->fixed_pitch = 0.0f;   //insufficient data
384       row->pitch_decision = PITCH_DUNNO;
385     }
386     row_index++;
387   }
388   return FALSE;
389 }
390 
391 
392 /**********************************************************************
393  * try_doc_fixed
394  *
395  * Attempt to call the entire document fixed pitch.
396  **********************************************************************/
397 
try_doc_fixed(ICOORD page_tr,TO_BLOCK_LIST * port_blocks,float gradient)398 BOOL8 try_doc_fixed(                             //determine pitch
399                     ICOORD page_tr,              //top right
400                     TO_BLOCK_LIST *port_blocks,  //input list
401                     float gradient               //page skew
402                    ) {
403   inT16 master_x;                //uniform shifts
404   inT16 pitch;                   //median pitch.
405   int x;                         //profile coord
406   int prop_blocks;               //correct counts
407   int fixed_blocks;
408   int total_row_count;           //total in page
409                                  //iterator
410   TO_BLOCK_IT block_it = port_blocks;
411   TO_BLOCK *block;               //current block;
412   TO_ROW_IT row_it;              //row iterator
413   TO_ROW *row;                   //current row
414   inT16 projection_left;         //edges
415   inT16 projection_right;
416   inT16 row_left;                //edges of row
417   inT16 row_right;
418   ICOORDELT_LIST *master_cells;  //cells for page
419   float master_y;                //uniform shifts
420   float shift_factor;            //page skew correction
421   float row_shift;               //shift for row
422   float final_pitch;             //output pitch
423   float row_y;                   //baseline
424   STATS projection;              //entire page
425   STATS pitches (0, MAX_ALLOWED_PITCH);
426   //for median
427   float sp_sd;                   //space sd
428   inT16 mid_cuts;                //no of cheap cuts
429   float pitch_sd;                //sync rating
430 
431   if (block_it.empty ()
432     //      || block_it.data()==block_it.data_relative(1)
433     || !textord_blockndoc_fixed)
434     return FALSE;
435   shift_factor = gradient / (gradient * gradient + 1);
436   row_it.set_to_list (block_it.data ()->get_rows ());
437   master_x = row_it.data ()->projection_left;
438   master_y = row_it.data ()->baseline.y (master_x);
439   projection_left = MAX_INT16;
440   projection_right = -MAX_INT16;
441   prop_blocks = 0;
442   fixed_blocks = 0;
443   total_row_count = 0;
444 
445   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
446   block_it.forward ()) {
447     block = block_it.data ();
448     row_it.set_to_list (block->get_rows ());
449     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
450       row = row_it.data ();
451       total_row_count++;
452       if (row->fixed_pitch > 0)
453         pitches.add ((inT32) (row->fixed_pitch), 1);
454       //find median
455       row_y = row->baseline.y (master_x);
456       row_left =
457         (inT16) (row->projection_left -
458         shift_factor * (master_y - row_y));
459       row_right =
460         (inT16) (row->projection_right -
461         shift_factor * (master_y - row_y));
462       if (row_left < projection_left)
463         projection_left = row_left;
464       if (row_right > projection_right)
465         projection_right = row_right;
466     }
467   }
468   if (pitches.get_total () == 0)
469     return FALSE;
470   projection.set_range (projection_left, projection_right);
471 
472   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
473   block_it.forward ()) {
474     block = block_it.data ();
475     row_it.set_to_list (block->get_rows ());
476     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
477       row = row_it.data ();
478       row_y = row->baseline.y (master_x);
479       row_left =
480         (inT16) (row->projection_left -
481         shift_factor * (master_y - row_y));
482       for (x = row->projection_left; x < row->projection_right;
483       x++, row_left++) {
484         projection.add (row_left, row->projection.pile_count (x));
485       }
486     }
487   }
488 
489   row_it.set_to_list (block_it.data ()->get_rows ());
490   row = row_it.data ();
491 #ifndef GRAPHICS_DISABLED
492   if (textord_show_page_cuts && to_win != NULL)
493     projection.plot (to_win, projection_left,
494       row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
495 #endif
496   final_pitch = pitches.ile (0.5);
497   pitch = (inT16) final_pitch;
498   pitch_sd =
499     tune_row_pitch (row, &projection, projection_left, projection_right,
500     pitch * 0.75, final_pitch, sp_sd, mid_cuts,
501     &row->char_cells, FALSE);
502 
503   if (textord_debug_pitch_metric)
504     tprintf
505       ("try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
506       prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd,
507       pitch_sd / total_row_count, pitch_sd / pitch,
508       pitch_sd / total_row_count / pitch);
509 
510 #ifndef GRAPHICS_DISABLED
511   if (textord_show_page_cuts && to_win != NULL) {
512     master_cells = &row->char_cells;
513     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
514     block_it.forward ()) {
515       block = block_it.data ();
516       row_it.set_to_list (block->get_rows ());
517       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
518       row_it.forward ()) {
519         row = row_it.data ();
520         row_y = row->baseline.y (master_x);
521         row_shift = shift_factor * (master_y - row_y);
522         plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);
523       }
524     }
525   }
526 #endif
527   row->char_cells.clear ();
528   return FALSE;
529 }
530 
531 
532 /**********************************************************************
533  * try_block_fixed
534  *
535  * Try to call the entire block fixed.
536  **********************************************************************/
537 
try_block_fixed(TO_BLOCK * block,inT32 block_index)538 BOOL8 try_block_fixed(                   //find line stats
539                       TO_BLOCK *block,   //block to do
540                       inT32 block_index  //block number
541                      ) {
542   return FALSE;
543 }
544 
545 
546 /**********************************************************************
547  * try_rows_fixed
548  *
549  * Decide whether each row is fixed pitch individually.
550  **********************************************************************/
551 
try_rows_fixed(TO_BLOCK * block,inT32 block_index,BOOL8 testing_on)552 BOOL8 try_rows_fixed(                    //find line stats
553                      TO_BLOCK *block,    //block to do
554                      inT32 block_index,  //block number
555                      BOOL8 testing_on    //correct orientation
556                     ) {
557   inT32 maxwidth;                //of spaces
558   TO_ROW *row;                   //current row
559   inT32 row_index;               //row number.
560   inT32 def_fixed = 0;           //counters
561   inT32 def_prop = 0;
562   inT32 maybe_fixed = 0;
563   inT32 maybe_prop = 0;
564   inT32 dunno = 0;
565   inT32 corr_fixed = 0;
566   inT32 corr_prop = 0;
567   float lower, upper;            //cluster thresholds
568   TO_ROW_IT row_it = block->get_rows ();
569 
570   row_index = 1;
571   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
572     row = row_it.data ();
573     ASSERT_HOST (row->xheight > 0);
574     maxwidth = (inT32) ceil (row->xheight * textord_words_maxspace);
575     if (row->fixed_pitch > 0 && fixed_pitch_row (row, block_index)) {
576       if (row->fixed_pitch == 0) {
577         lower = row->pr_nonsp;
578         upper = row->pr_space;
579         row->space_size = upper;
580         row->kern_size = lower;
581       }
582     }
583     row_index++;
584   }
585   count_block_votes(block,
586                     def_fixed,
587                     def_prop,
588                     maybe_fixed,
589                     maybe_prop,
590                     corr_fixed,
591                     corr_prop,
592                     dunno);
593   if (testing_on
594     && (textord_debug_pitch_test
595   || textord_blocksall_prop || textord_blocksall_fixed)) {
596     tprintf ("Initially:");
597     print_block_counts(block, block_index);
598   }
599   if (def_fixed > def_prop * textord_words_veto_power)
600     block->pitch_decision = PITCH_DEF_FIXED;
601   else if (def_prop > def_fixed * textord_words_veto_power)
602     block->pitch_decision = PITCH_DEF_PROP;
603   else if (def_fixed > 0 || def_prop > 0)
604     block->pitch_decision = PITCH_DUNNO;
605   else if (maybe_fixed > maybe_prop * textord_words_veto_power)
606     block->pitch_decision = PITCH_MAYBE_FIXED;
607   else if (maybe_prop > maybe_fixed * textord_words_veto_power)
608     block->pitch_decision = PITCH_MAYBE_PROP;
609   else
610     block->pitch_decision = PITCH_DUNNO;
611   return FALSE;
612 }
613 
614 
615 /**********************************************************************
616  * print_block_counts
617  *
618  * Count up how many rows have what decision and print the results.
619  **********************************************************************/
620 
print_block_counts(TO_BLOCK * block,inT32 block_index)621 void print_block_counts(                   //find line stats
622                         TO_BLOCK *block,   //block to do
623                         inT32 block_index  //block number
624                        ) {
625   inT32 def_fixed = 0;           //counters
626   inT32 def_prop = 0;
627   inT32 maybe_fixed = 0;
628   inT32 maybe_prop = 0;
629   inT32 dunno = 0;
630   inT32 corr_fixed = 0;
631   inT32 corr_prop = 0;
632 
633   count_block_votes(block,
634                     def_fixed,
635                     def_prop,
636                     maybe_fixed,
637                     maybe_prop,
638                     corr_fixed,
639                     corr_prop,
640                     dunno);
641   tprintf ("Block %d has (%d,%d,%d)",
642     block_index, def_fixed, maybe_fixed, corr_fixed);
643   if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed))
644     tprintf (" (Wrongly)");
645   tprintf (" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
646   if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop))
647     tprintf (" (Wrongly)");
648   tprintf (" prop, %d dunno\n", dunno);
649 }
650 
651 
652 /**********************************************************************
653  * count_block_votes
654  *
655  * Count the number of rows in the block with each kind of pitch_decision.
656  **********************************************************************/
657 
count_block_votes(TO_BLOCK * block,inT32 & def_fixed,inT32 & def_prop,inT32 & maybe_fixed,inT32 & maybe_prop,inT32 & corr_fixed,inT32 & corr_prop,inT32 & dunno)658 void count_block_votes(                   //find line stats
659                        TO_BLOCK *block,   //block to do
660                        inT32 &def_fixed,  //add to counts
661                        inT32 &def_prop,
662                        inT32 &maybe_fixed,
663                        inT32 &maybe_prop,
664                        inT32 &corr_fixed,
665                        inT32 &corr_prop,
666                        inT32 &dunno) {
667   TO_ROW *row;                   //current row
668   TO_ROW_IT row_it = block->get_rows ();
669 
670   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
671     row = row_it.data ();
672     switch (row->pitch_decision) {
673       case PITCH_DUNNO:
674         dunno++;
675         break;
676       case PITCH_DEF_PROP:
677         def_prop++;
678         break;
679       case PITCH_MAYBE_PROP:
680         maybe_prop++;
681         break;
682       case PITCH_DEF_FIXED:
683         def_fixed++;
684         break;
685       case PITCH_MAYBE_FIXED:
686         maybe_fixed++;
687         break;
688       case PITCH_CORR_PROP:
689         corr_prop++;
690         break;
691       case PITCH_CORR_FIXED:
692         corr_fixed++;
693         break;
694     }
695   }
696 }
697 
698 
699 /**********************************************************************
700  * row_pitch_stats
701  *
702  * Decide whether each row is fixed pitch individually.
703  **********************************************************************/
704 
row_pitch_stats(TO_ROW * row,inT32 maxwidth,BOOL8 testing_on)705 BOOL8 row_pitch_stats(                  //find line stats
706                       TO_ROW *row,      //current row
707                       inT32 maxwidth,   //of spaces
708                       BOOL8 testing_on  //correct orientation
709                      ) {
710   BLOBNBOX *blob;                //current blob
711   int gap_index;                 //current gap
712   inT32 prev_x;                  //end of prev blob
713   inT32 cluster_count;           //no of clusters
714   inT32 prev_count;              //of clusters
715   inT32 smooth_factor;           //for smoothing stats
716   TBOX blob_box;                  //bounding box
717   float lower, upper;            //cluster thresholds
718                                  //gap sizes
719   float gaps[BLOCK_STATS_CLUSTERS];
720                                  //blobs
721   BLOBNBOX_IT blob_it = row->blob_list ();
722   STATS gap_stats (0, maxwidth);
723   STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
724   //clusters
725 
726   smooth_factor =
727     (inT32) (row->xheight * textord_wordstats_smooth_factor + 1.5);
728   if (!blob_it.empty ()) {
729     prev_x = blob_it.data ()->bounding_box ().right ();
730     blob_it.forward ();
731     while (!blob_it.at_first ()) {
732       blob = blob_it.data ();
733       if (!blob->joined_to_prev ()) {
734         blob_box = blob->bounding_box ();
735         if (blob_box.left () - prev_x < maxwidth)
736           gap_stats.add (blob_box.left () - prev_x, 1);
737         prev_x = blob_box.right ();
738       }
739       blob_it.forward ();
740     }
741   }
742   if (gap_stats.get_total () == 0) {
743     return FALSE;
744   }
745   cluster_count = 0;
746   lower = row->xheight * words_initial_lower;
747   upper = row->xheight * words_initial_upper;
748   gap_stats.smooth (smooth_factor);
749   do {
750     prev_count = cluster_count;
751     cluster_count = gap_stats.cluster (lower, upper,
752       textord_spacesize_ratioprop,
753       BLOCK_STATS_CLUSTERS, cluster_stats);
754   }
755   while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
756   if (cluster_count < 1) {
757     return FALSE;
758   }
759   for (gap_index = 0; gap_index < cluster_count; gap_index++)
760     gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
761   //get medians
762   if (testing_on) {
763     tprintf ("cluster_count=%d:", cluster_count);
764     for (gap_index = 0; gap_index < cluster_count; gap_index++)
765       tprintf (" %g(%d)", gaps[gap_index],
766         cluster_stats[gap_index + 1].get_total ());
767     tprintf ("\n");
768   }
769   qsort (gaps, cluster_count, sizeof (float), sort_floats2);
770 
771   //Try to find proportional non-space and space for row.
772   lower = row->xheight * words_default_prop_nonspace;
773   upper = row->xheight * textord_words_min_minspace;
774   for (gap_index = 0; gap_index < cluster_count
775     && gaps[gap_index] < lower; gap_index++);
776   if (gap_index == 0) {
777     if (testing_on)
778       tprintf ("No clusters below nonspace threshold!!\n");
779     if (cluster_count > 1) {
780       row->pr_nonsp = gaps[0];
781       row->pr_space = gaps[1];
782     }
783     else {
784       row->pr_nonsp = lower;
785       row->pr_space = gaps[0];
786     }
787   }
788   else {
789     row->pr_nonsp = gaps[gap_index - 1];
790     while (gap_index < cluster_count && gaps[gap_index] < upper)
791       gap_index++;
792     if (gap_index == cluster_count) {
793       if (testing_on)
794         tprintf ("No clusters above nonspace threshold!!\n");
795       row->pr_space = lower * textord_spacesize_ratioprop;
796     }
797     else
798       row->pr_space = gaps[gap_index];
799   }
800 
801   //Now try to find the fixed pitch space and non-space.
802   upper = row->xheight * words_default_fixed_space;
803   for (gap_index = 0; gap_index < cluster_count
804     && gaps[gap_index] < upper; gap_index++);
805   if (gap_index == 0) {
806     if (testing_on)
807       tprintf ("No clusters below space threshold!!\n");
808     row->fp_nonsp = upper;
809     row->fp_space = gaps[0];
810   }
811   else {
812     row->fp_nonsp = gaps[gap_index - 1];
813     if (gap_index == cluster_count) {
814       if (testing_on)
815         tprintf ("No clusters above space threshold!!\n");
816       row->fp_space = row->xheight;
817     }
818     else
819       row->fp_space = gaps[gap_index];
820   }
821   if (testing_on) {
822     tprintf
823       ("Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, fp_space=%g\n",
824       row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
825   }
826   return TRUE;                   //computed some stats
827 }
828 
829 
830 /**********************************************************************
831  * find_row_pitch
832  *
833  * Check to see if this row could be fixed pitch using the given spacings.
834  * Blobs with gaps smaller than the lower threshold are assumed to be one.
835  * The larger threshold is the word gap threshold.
836  **********************************************************************/
837 
find_row_pitch(TO_ROW * row,inT32 maxwidth,inT32 dm_gap,TO_BLOCK * block,inT32 block_index,inT32 row_index,BOOL8 testing_on)838 BOOL8 find_row_pitch(                    //find lines
839                      TO_ROW *row,        //row to do
840                      inT32 maxwidth,     //max permitted space
841                      inT32 dm_gap,       //ignorable gaps
842                      TO_BLOCK *block,    //block of row
843                      inT32 block_index,  //block_number
844                      inT32 row_index,    //number of row
845                      BOOL8 testing_on    //correct orientation
846                     ) {
847   BOOL8 used_dm_model;           //looks lik dot matrix
848   float min_space;               //estimate threshold
849   float non_space;               //gap size
850   float gap_iqr;                 //interquartile range
851   float pitch_iqr;
852   float dm_gap_iqr;              //interquartile range
853   float dm_pitch_iqr;
854   float dm_pitch;                //pitch with dm on
855   float pitch;                   //revised estimate
856   float initial_pitch;           //guess at pitch
857   STATS gap_stats (0, maxwidth);
858                                  //centre-centre
859   STATS pitch_stats (0, maxwidth);
860 
861   row->fixed_pitch = 0.0f;
862   initial_pitch = row->fp_space;
863   if (initial_pitch > row->xheight * (1 + words_default_fixed_limit))
864     initial_pitch = row->xheight;//keep pitch decent
865   non_space = row->fp_nonsp;
866   if (non_space > initial_pitch)
867     non_space = initial_pitch;
868   min_space = (initial_pitch + non_space) / 2;
869 
870   if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
871   initial_pitch, min_space, TRUE, FALSE, dm_gap)) {
872     dm_gap_iqr = 0.0001;
873     dm_pitch_iqr = maxwidth * 2.0f;
874     dm_pitch = initial_pitch;
875   }
876   else {
877     dm_gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
878     dm_pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
879     dm_pitch = pitch_stats.ile (0.5);
880   }
881   gap_stats.clear ();
882   pitch_stats.clear ();
883   if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
884   initial_pitch, min_space, TRUE, FALSE, 0)) {
885     gap_iqr = 0.0001;
886     pitch_iqr = maxwidth * 3.0f;
887   }
888   else {
889     gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
890     pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
891     if (testing_on)
892       tprintf
893         ("First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
894         initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
895     initial_pitch = pitch_stats.ile (0.5);
896     if (min_space > initial_pitch
897       && count_pitch_stats (row, &gap_stats, &pitch_stats,
898     initial_pitch, initial_pitch, TRUE, FALSE, 0)) {
899       min_space = initial_pitch;
900       gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
901       pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
902       if (testing_on)
903         tprintf
904           ("Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
905           initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
906       initial_pitch = pitch_stats.ile (0.5);
907     }
908   }
909   if (textord_debug_pitch_metric)
910     tprintf ("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:",
911             block_index, row_index, 'X',
912     pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
913             pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth ? 'D' :
914               (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
915   if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
916     row->pitch_decision = PITCH_DUNNO;
917     if (textord_debug_pitch_metric)
918       tprintf ("\n");
919     return FALSE;                //insufficient data
920   }
921   if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
922     if (testing_on)
923       tprintf
924         ("Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
925         pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
926     gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
927     pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
928     pitch = pitch_stats.ile (0.5);
929     used_dm_model = FALSE;
930   }
931   else {
932     if (testing_on)
933       tprintf
934         ("Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
935         pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
936     gap_iqr = dm_gap_iqr;
937     pitch_iqr = dm_pitch_iqr;
938     pitch = dm_pitch;
939     used_dm_model = TRUE;
940   }
941   if (textord_debug_pitch_metric) {
942     tprintf ("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:",
943       pitch_iqr, gap_iqr, pitch);
944     tprintf ("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:",
945       pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
946       pitch_iqr < gap_iqr * textord_fpiqr_ratio
947       && pitch_iqr < block->xheight * textord_max_pitch_iqr
948       && pitch < block->xheight * textord_words_default_maxspace
949       ? 'F' : 'P');
950   }
951   if (pitch_iqr < gap_iqr * textord_fpiqr_ratio
952     && pitch_iqr < block->xheight * textord_max_pitch_iqr
953     && pitch < block->xheight * textord_words_default_maxspace)
954     row->pitch_decision = PITCH_MAYBE_FIXED;
955   else
956     row->pitch_decision = PITCH_MAYBE_PROP;
957   row->fixed_pitch = pitch;
958   row->kern_size = gap_stats.ile (0.5);
959   row->min_space = (inT32) (row->fixed_pitch + non_space) / 2;
960   if (row->min_space > row->fixed_pitch)
961     row->min_space = (inT32) row->fixed_pitch;
962   row->max_nonspace = row->min_space;
963   row->space_size = row->fixed_pitch;
964   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
965   row->used_dm_model = used_dm_model;
966   return TRUE;
967 }
968 
969 
970 /**********************************************************************
971  * fixed_pitch_row
972  *
973  * Check to see if this row could be fixed pitch using the given spacings.
974  * Blobs with gaps smaller than the lower threshold are assumed to be one.
975  * The larger threshold is the word gap threshold.
976  **********************************************************************/
977 
fixed_pitch_row(TO_ROW * row,inT32 block_index)978 BOOL8 fixed_pitch_row(                   //find lines
979                       TO_ROW *row,       //row to do
980                       inT32 block_index  //block_number
981                      ) {
982   const char *res_string;        //pitch result
983   inT16 mid_cuts;                //no of cheap cuts
984   float non_space;               //gap size
985   float pitch_sd;                //error on pitch
986   float sp_sd;                   //space sd
987 
988   non_space = row->fp_nonsp;
989   if (non_space > row->fixed_pitch)
990     non_space = row->fixed_pitch;
991   if (textord_all_prop) {
992     // Set the decision to definitely proportional.
993     pitch_sd = textord_words_def_prop * row->fixed_pitch;
994     row->pitch_decision = PITCH_DEF_PROP;
995   } else {
996     pitch_sd = tune_row_pitch (row, &row->projection, row->projection_left,
997                                row->projection_right,
998                                (row->fixed_pitch + non_space * 3) / 4,
999                                row->fixed_pitch, sp_sd, mid_cuts,
1000                                &row->char_cells,
1001                                block_index == textord_debug_block);
1002     if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch
1003       && ((pitsync_linear_version & 3) < 3
1004       || ((pitsync_linear_version & 3) >= 3 && (row->used_dm_model
1005       || sp_sd > 20
1006     || (pitch_sd == 0 && sp_sd > 10))))) {
1007       if (pitch_sd < textord_words_def_fixed * row->fixed_pitch
1008         && !row->all_caps
1009         && ((pitsync_linear_version & 3) < 3 || sp_sd > 20))
1010         row->pitch_decision = PITCH_DEF_FIXED;
1011       else
1012         row->pitch_decision = PITCH_MAYBE_FIXED;
1013     }
1014     else if ((pitsync_linear_version & 3) < 3
1015       || sp_sd > 20
1016       || mid_cuts > 0
1017       || pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
1018       if (pitch_sd < textord_words_def_prop * row->fixed_pitch)
1019         row->pitch_decision = PITCH_MAYBE_PROP;
1020       else
1021         row->pitch_decision = PITCH_DEF_PROP;
1022     }
1023     else
1024       row->pitch_decision = PITCH_DUNNO;
1025   }
1026 
1027   if (textord_debug_pitch_metric) {
1028     res_string = "??";
1029     switch (row->pitch_decision) {
1030       case PITCH_DEF_PROP:
1031         res_string = "DP";
1032         break;
1033       case PITCH_MAYBE_PROP:
1034         res_string = "MP";
1035         break;
1036       case PITCH_DEF_FIXED:
1037         res_string = "DF";
1038         break;
1039       case PITCH_MAYBE_FIXED:
1040         res_string = "MF";
1041       default:
1042         res_string = "??";
1043     }
1044     tprintf (":sd/p=%g:occ=%g:init_res=%s\n",
1045       pitch_sd / row->fixed_pitch, sp_sd, res_string);
1046   }
1047   return TRUE;
1048 }
1049 
1050 
1051 /**********************************************************************
1052  * count_pitch_stats
1053  *
1054  * Count up the gap and pitch stats on the block to see if it is fixed pitch.
1055  * Blobs with gaps smaller than the lower threshold are assumed to be one.
1056  * The larger threshold is the word gap threshold.
1057  * The return value indicates whether there were any decent values to use.
1058  **********************************************************************/
1059 
count_pitch_stats(TO_ROW * row,STATS * gap_stats,STATS * pitch_stats,float initial_pitch,float min_space,BOOL8 ignore_outsize,BOOL8 split_outsize,inT32 dm_gap)1060 BOOL8 count_pitch_stats(                       //find lines
1061                         TO_ROW *row,           //row to do
1062                         STATS *gap_stats,      //blob gaps
1063                         STATS *pitch_stats,    //centre-centre stats
1064                         float initial_pitch,   //guess at pitch
1065                         float min_space,       //estimate space size
1066                         BOOL8 ignore_outsize,  //discard big objects
1067                         BOOL8 split_outsize,   //split big objects
1068                         inT32 dm_gap           //ignorable gaps
1069                        ) {
1070   BOOL8 prev_valid;              //not word broken
1071   BLOBNBOX *blob;                //current blob
1072                                  //blobs
1073   BLOBNBOX_IT blob_it = row->blob_list ();
1074   inT32 prev_right;              //end of prev blob
1075   inT32 prev_centre;             //centre of previous blob
1076   inT32 x_centre;                //centre of this blob
1077   inT32 blob_width;              //width of blob
1078   inT32 width_units;             //no of widths in blob
1079   float width;                   //blob width
1080   TBOX blob_box;                  //bounding box
1081   TBOX joined_box;                //of super blob
1082 
1083   gap_stats->clear ();
1084   pitch_stats->clear ();
1085   if (blob_it.empty ())
1086     return FALSE;
1087   prev_valid = FALSE;
1088   prev_centre = 0;
1089   prev_right = 0;                //stop complier warning
1090   joined_box = blob_it.data ()->bounding_box ();
1091   do {
1092     blob_it.forward ();
1093     blob = blob_it.data ();
1094     if (!blob->joined_to_prev ()) {
1095       blob_box = blob->bounding_box ();
1096       if ((blob_box.left () - joined_box.right () < dm_gap
1097         && !blob_it.at_first ())
1098         || (blob->cblob () == NULL && blob->blob () == NULL))
1099         joined_box += blob_box;  //merge blobs
1100       else {
1101         blob_width = joined_box.width ();
1102         if (split_outsize) {
1103           width_units =
1104             (inT32) floor ((float) blob_width / initial_pitch + 0.5);
1105           if (width_units < 1)
1106             width_units = 1;
1107           width_units--;
1108         }
1109         else if (ignore_outsize) {
1110           width = (float) blob_width / initial_pitch;
1111           width_units = width < 1 + words_default_fixed_limit
1112             && width > 1 - words_default_fixed_limit ? 0 : -1;
1113         }
1114         else
1115           width_units = 0;       //everything in
1116         x_centre = (inT32) (joined_box.left ()
1117           + (blob_width -
1118           width_units * initial_pitch) / 2);
1119         if (prev_valid && width_units >= 0) {
1120           //                                              if (width_units>0)
1121           //                                              {
1122           //                                                      tprintf("wu=%d, width=%d, xc=%d, adding %d\n",
1123           //                                                              width_units,blob_width,x_centre,x_centre-prev_centre);
1124           //                                              }
1125           gap_stats->add (joined_box.left () - prev_right, 1);
1126           pitch_stats->add (x_centre - prev_centre, 1);
1127         }
1128         prev_centre = (inT32) (x_centre + width_units * initial_pitch);
1129         prev_right = joined_box.right ();
1130         prev_valid = blob_box.left () - joined_box.right () < min_space;
1131         prev_valid = prev_valid && width_units >= 0;
1132         joined_box = blob_box;
1133       }
1134     }
1135   }
1136   while (!blob_it.at_first ());
1137   return gap_stats->get_total () >= 3;
1138 }
1139 
1140 
1141 /**********************************************************************
1142  * tune_row_pitch
1143  *
1144  * Use a dp algorithm to fit the character cells and return the sd of
1145  * the cell size over the row.
1146  **********************************************************************/
1147 
tune_row_pitch(TO_ROW * row,STATS * projection,inT16 projection_left,inT16 projection_right,float space_size,float & initial_pitch,float & best_sp_sd,inT16 & best_mid_cuts,ICOORDELT_LIST * best_cells,BOOL8 testing_on)1148 float tune_row_pitch(                             //find fp cells
1149                      TO_ROW *row,                 //row to do
1150                      STATS *projection,           //vertical projection
1151                      inT16 projection_left,       //edge of projection
1152                      inT16 projection_right,      //edge of projection
1153                      float space_size,            //size of blank
1154                      float &initial_pitch,        //guess at pitch
1155                      float &best_sp_sd,           //space sd
1156                      inT16 &best_mid_cuts,        //no of cheap cuts
1157                      ICOORDELT_LIST *best_cells,  //row cells
1158                      BOOL8 testing_on             //inidividual words
1159                     ) {
1160   int pitch_delta;               //offset pitch
1161   inT16 mid_cuts;                //cheap cuts
1162   float pitch_sd;                //current sd
1163   float best_sd;                 //best result
1164   float best_pitch;              //pitch for best result
1165   float initial_sd;              //starting error
1166   float sp_sd;                   //space sd
1167   ICOORDELT_LIST test_cells;     //row cells
1168   ICOORDELT_IT best_it;          //start of best list
1169 
1170   if (textord_fast_pitch_test)
1171     return tune_row_pitch2 (row, projection, projection_left,
1172       projection_right, space_size, initial_pitch,
1173       best_sp_sd,
1174     //space sd
1175       best_mid_cuts, best_cells, testing_on);
1176   if (textord_disable_pitch_test) {
1177     best_sp_sd = initial_pitch;
1178     return initial_pitch;
1179   }
1180   initial_sd =
1181     compute_pitch_sd(row,
1182                      projection,
1183                      projection_left,
1184                      projection_right,
1185                      space_size,
1186                      initial_pitch,
1187                      best_sp_sd,
1188                      best_mid_cuts,
1189                      best_cells,
1190                      testing_on);
1191   best_sd = initial_sd;
1192   best_pitch = initial_pitch;
1193   if (testing_on)
1194     tprintf ("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
1195   for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1196     pitch_sd =
1197       compute_pitch_sd (row, projection, projection_left, projection_right,
1198       space_size, initial_pitch + pitch_delta, sp_sd,
1199       mid_cuts, &test_cells, testing_on);
1200     if (testing_on)
1201       tprintf ("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta,
1202         pitch_sd);
1203     if (pitch_sd < best_sd) {
1204       best_sd = pitch_sd;
1205       best_mid_cuts = mid_cuts;
1206       best_sp_sd = sp_sd;
1207       best_pitch = initial_pitch + pitch_delta;
1208       best_cells->clear ();
1209       best_it.set_to_list (best_cells);
1210       best_it.add_list_after (&test_cells);
1211     }
1212     else
1213       test_cells.clear ();
1214     if (pitch_sd > initial_sd)
1215       break;                     //getting worse
1216   }
1217   for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1218     pitch_sd =
1219       compute_pitch_sd (row, projection, projection_left, projection_right,
1220       space_size, initial_pitch - pitch_delta, sp_sd,
1221       mid_cuts, &test_cells, testing_on);
1222     if (testing_on)
1223       tprintf ("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta,
1224         pitch_sd);
1225     if (pitch_sd < best_sd) {
1226       best_sd = pitch_sd;
1227       best_mid_cuts = mid_cuts;
1228       best_sp_sd = sp_sd;
1229       best_pitch = initial_pitch - pitch_delta;
1230       best_cells->clear ();
1231       best_it.set_to_list (best_cells);
1232       best_it.add_list_after (&test_cells);
1233     }
1234     else
1235       test_cells.clear ();
1236     if (pitch_sd > initial_sd)
1237       break;
1238   }
1239   initial_pitch = best_pitch;
1240 
1241   if (textord_debug_pitch_metric)
1242     print_pitch_sd(row,
1243                    projection,
1244                    projection_left,
1245                    projection_right,
1246                    space_size,
1247                    best_pitch);
1248 
1249   return best_sd;
1250 }
1251 
1252 
1253 /**********************************************************************
1254  * tune_row_pitch
1255  *
1256  * Use a dp algorithm to fit the character cells and return the sd of
1257  * the cell size over the row.
1258  **********************************************************************/
1259 
tune_row_pitch2(TO_ROW * row,STATS * projection,inT16 projection_left,inT16 projection_right,float space_size,float & initial_pitch,float & best_sp_sd,inT16 & best_mid_cuts,ICOORDELT_LIST * best_cells,BOOL8 testing_on)1260 float tune_row_pitch2(                             //find fp cells
1261                       TO_ROW *row,                 //row to do
1262                       STATS *projection,           //vertical projection
1263                       inT16 projection_left,       //edge of projection
1264                       inT16 projection_right,      //edge of projection
1265                       float space_size,            //size of blank
1266                       float &initial_pitch,        //guess at pitch
1267                       float &best_sp_sd,           //space sd
1268                       inT16 &best_mid_cuts,        //no of cheap cuts
1269                       ICOORDELT_LIST *best_cells,  //row cells
1270                       BOOL8 testing_on             //inidividual words
1271                      ) {
1272   int pitch_delta;               //offset pitch
1273   inT16 pixel;                   //pixel coord
1274   inT16 best_pixel;              //pixel coord
1275   inT16 best_delta;              //best pitch
1276   inT16 best_pitch;              //best pitch
1277   inT16 start;                   //of good range
1278   inT16 end;                     //of good range
1279   inT32 best_count;              //lowest sum
1280   float best_sd;                 //best result
1281   STATS *sum_proj;               //summed projection
1282 
1283   best_sp_sd = initial_pitch;
1284 
1285   if (textord_disable_pitch_test) {
1286     return initial_pitch;
1287   }
1288   sum_proj = new STATS[textord_pitch_range * 2 + 1];
1289   if (sum_proj == NULL)
1290     return initial_pitch;
1291   best_pitch = (inT32) initial_pitch;
1292 
1293   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
1294     pitch_delta++)
1295   sum_proj[textord_pitch_range + pitch_delta].set_range (0,
1296       best_pitch +
1297       pitch_delta + 1);
1298   for (pixel = projection_left; pixel <= projection_right; pixel++) {
1299     for (pitch_delta = -textord_pitch_range;
1300       pitch_delta <= textord_pitch_range; pitch_delta++)
1301     sum_proj[textord_pitch_range +
1302         pitch_delta].add ((pixel - projection_left) % (best_pitch +
1303         pitch_delta),
1304         projection->pile_count (pixel));
1305   }
1306   best_count = sum_proj[textord_pitch_range].pile_count (0);
1307   best_delta = 0;
1308   best_pixel = 0;
1309   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
1310   pitch_delta++) {
1311     for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
1312       if (sum_proj[textord_pitch_range + pitch_delta].pile_count (pixel)
1313       < best_count) {
1314         best_count =
1315           sum_proj[textord_pitch_range +
1316           pitch_delta].pile_count (pixel);
1317         best_delta = pitch_delta;
1318         best_pixel = pixel;
1319       }
1320     }
1321   }
1322   if (testing_on)
1323     tprintf ("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n",
1324       initial_pitch, best_delta, best_count);
1325   best_pitch += best_delta;
1326   initial_pitch = best_pitch;
1327   best_count++;
1328   best_count += best_count;
1329   for (start = best_pixel - 2; start > best_pixel - best_pitch
1330     && sum_proj[textord_pitch_range +
1331     best_delta].pile_count (start % best_pitch) <= best_count;
1332     start--);
1333   for (end = best_pixel + 2;
1334     end < best_pixel + best_pitch
1335     && sum_proj[textord_pitch_range +
1336     best_delta].pile_count (end % best_pitch) <= best_count;
1337     end++);
1338 
1339   best_sd =
1340     compute_pitch_sd(row,
1341                      projection,
1342                      projection_left,
1343                      projection_right,
1344                      space_size,
1345                      initial_pitch,
1346                      best_sp_sd,
1347                      best_mid_cuts,
1348                      best_cells,
1349                      testing_on,
1350                      start,
1351                      end);
1352   if (testing_on)
1353     tprintf ("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch,
1354       best_sd);
1355 
1356   if (textord_debug_pitch_metric)
1357     print_pitch_sd(row,
1358                    projection,
1359                    projection_left,
1360                    projection_right,
1361                    space_size,
1362                    initial_pitch);
1363 
1364   delete[]sum_proj;
1365 
1366   return best_sd;
1367 }
1368 
1369 
1370 /**********************************************************************
1371  * compute_pitch_sd
1372  *
1373  * Use a dp algorithm to fit the character cells and return the sd of
1374  * the cell size over the row.
1375  **********************************************************************/
1376 
compute_pitch_sd(TO_ROW * row,STATS * projection,inT16 projection_left,inT16 projection_right,float space_size,float initial_pitch,float & sp_sd,inT16 & mid_cuts,ICOORDELT_LIST * row_cells,BOOL8 testing_on,inT16 start,inT16 end)1377 float compute_pitch_sd(                            //find fp cells
1378                        TO_ROW *row,                //row to do
1379                        STATS *projection,          //vertical projection
1380                        inT16 projection_left,      //edge
1381                        inT16 projection_right,     //edge
1382                        float space_size,           //size of blank
1383                        float initial_pitch,        //guess at pitch
1384                        float &sp_sd,               //space sd
1385                        inT16 &mid_cuts,            //no of free cuts
1386                        ICOORDELT_LIST *row_cells,  //list of chop pts
1387                        BOOL8 testing_on,           //inidividual words
1388                        inT16 start,                //start of good range
1389                        inT16 end                   //end of good range
1390                       ) {
1391   inT16 occupation;              //no of cells in word.
1392                                  //blobs
1393   BLOBNBOX_IT blob_it = row->blob_list ();
1394   BLOBNBOX_IT start_it;          //start of word
1395   BLOBNBOX_IT plot_it;           //for plotting
1396   inT16 blob_count;              //no of blobs
1397   TBOX blob_box;                  //bounding box
1398   TBOX prev_box;                  //of super blob
1399   inT32 prev_right;              //of word sync
1400   int scale_factor;              //on scores for big words
1401   inT32 sp_count;                //spaces
1402   FPSEGPT_LIST seg_list;         //char cells
1403   FPSEGPT_IT seg_it;             //iterator
1404   inT16 segpos;                  //position of segment
1405   inT16 cellpos;                 //previous cell boundary
1406                                  //iterator
1407   ICOORDELT_IT cell_it = row_cells;
1408   ICOORDELT *cell;               //new cell
1409   double sqsum;                  //sum of squares
1410   double spsum;                  //of spaces
1411   double sp_var;                 //space error
1412   double word_sync;              //result for word
1413   inT32 total_count;             //total blobs
1414 
1415   if ((pitsync_linear_version & 3) > 1) {
1416     word_sync = compute_pitch_sd2 (row, projection, projection_left,
1417       projection_right, initial_pitch,
1418       occupation, mid_cuts, row_cells,
1419       testing_on, start, end);
1420     sp_sd = occupation;
1421     return word_sync;
1422   }
1423   mid_cuts = 0;
1424   cellpos = 0;
1425   total_count = 0;
1426   sqsum = 0;
1427   sp_count = 0;
1428   spsum = 0;
1429   prev_right = -1;
1430   if (blob_it.empty ())
1431     return space_size * 10;
1432 #ifndef GRAPHICS_DISABLED
1433   if (testing_on && to_win > 0) {
1434     blob_box = blob_it.data ()->bounding_box ();
1435     projection->plot (to_win, projection_left,
1436       row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
1437   }
1438 #endif
1439   start_it = blob_it;
1440   blob_count = 0;
1441   blob_box = box_next (&blob_it);//first blob
1442   blob_it.mark_cycle_pt ();
1443   do {
1444     for (; blob_count > 0; blob_count--)
1445       box_next(&start_it);
1446     do {
1447       prev_box = blob_box;
1448       blob_count++;
1449       blob_box = box_next (&blob_it);
1450     }
1451     while (!blob_it.cycled_list ()
1452       && blob_box.left () - prev_box.right () < space_size);
1453     plot_it = start_it;
1454     if (pitsync_linear_version & 3)
1455       word_sync =
1456         check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2,
1457         projection, projection_left, projection_right,
1458         row->xheight * textord_projection_scale,
1459         occupation, &seg_list, start, end);
1460     else
1461       word_sync =
1462         check_pitch_sync (&start_it, blob_count, (inT16) initial_pitch, 2,
1463         projection, &seg_list);
1464     if (testing_on) {
1465       tprintf ("Word ending at (%d,%d), len=%d, sync rating=%g, ",
1466         prev_box.right (), prev_box.top (),
1467         seg_list.length () - 1, word_sync);
1468       seg_it.set_to_list (&seg_list);
1469       for (seg_it.mark_cycle_pt (); !seg_it.cycled_list ();
1470       seg_it.forward ()) {
1471         if (seg_it.data ()->faked)
1472           tprintf ("(F)");
1473         tprintf ("%d, ", seg_it.data ()->position ());
1474         //                              tprintf("C=%g, s=%g, sq=%g\n",
1475         //                                      seg_it.data()->cost_function(),
1476         //                                      seg_it.data()->sum(),
1477         //                                      seg_it.data()->squares());
1478       }
1479       tprintf ("\n");
1480     }
1481 #ifndef GRAPHICS_DISABLED
1482     if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
1483       plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1484 #endif
1485     seg_it.set_to_list (&seg_list);
1486     if (prev_right >= 0) {
1487       sp_var = seg_it.data ()->position () - prev_right;
1488       sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
1489       sp_var *= sp_var;
1490       spsum += sp_var;
1491       sp_count++;
1492     }
1493     for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
1494       segpos = seg_it.data ()->position ();
1495       if (cell_it.empty () || segpos > cellpos + initial_pitch / 2) {
1496                                  //big gap
1497         while (!cell_it.empty () && segpos > cellpos + initial_pitch * 3 / 2) {
1498           cell = new ICOORDELT (cellpos + (inT16) initial_pitch, 0);
1499           cell_it.add_after_then_move (cell);
1500           cellpos += (inT16) initial_pitch;
1501         }
1502                                  //make new one
1503         cell = new ICOORDELT (segpos, 0);
1504         cell_it.add_after_then_move (cell);
1505         cellpos = segpos;
1506       }
1507       else if (segpos > cellpos - initial_pitch / 2) {
1508         cell = cell_it.data ();
1509                                  //average positions
1510         cell->set_x ((cellpos + segpos) / 2);
1511         cellpos = cell->x ();
1512       }
1513     }
1514     seg_it.move_to_last ();
1515     prev_right = seg_it.data ()->position ();
1516     if (textord_pitch_scalebigwords) {
1517       scale_factor = (seg_list.length () - 2) / 2;
1518       if (scale_factor < 1)
1519         scale_factor = 1;
1520     }
1521     else
1522       scale_factor = 1;
1523     sqsum += word_sync * scale_factor;
1524     total_count += (seg_list.length () - 1) * scale_factor;
1525     seg_list.clear ();
1526   }
1527   while (!blob_it.cycled_list ());
1528   sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
1529   return total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
1530 }
1531 
1532 
1533 /**********************************************************************
1534  * compute_pitch_sd2
1535  *
1536  * Use a dp algorithm to fit the character cells and return the sd of
1537  * the cell size over the row.
1538  **********************************************************************/
1539 
compute_pitch_sd2(TO_ROW * row,STATS * projection,inT16 projection_left,inT16 projection_right,float initial_pitch,inT16 & occupation,inT16 & mid_cuts,ICOORDELT_LIST * row_cells,BOOL8 testing_on,inT16 start,inT16 end)1540 float compute_pitch_sd2(                            //find fp cells
1541                         TO_ROW *row,                //row to do
1542                         STATS *projection,          //vertical projection
1543                         inT16 projection_left,      //edge
1544                         inT16 projection_right,     //edge
1545                         float initial_pitch,        //guess at pitch
1546                         inT16 &occupation,          //no of occupied cells
1547                         inT16 &mid_cuts,            //no of free cuts
1548                         ICOORDELT_LIST *row_cells,  //list of chop pts
1549                         BOOL8 testing_on,           //inidividual words
1550                         inT16 start,                //start of good range
1551                         inT16 end                   //end of good range
1552                        ) {
1553                                  //blobs
1554   BLOBNBOX_IT blob_it = row->blob_list ();
1555   BLOBNBOX_IT plot_it;
1556   inT16 blob_count;              //no of blobs
1557   TBOX blob_box;                  //bounding box
1558   FPSEGPT_LIST seg_list;         //char cells
1559   FPSEGPT_IT seg_it;             //iterator
1560   inT16 segpos;                  //position of segment
1561                                  //iterator
1562   ICOORDELT_IT cell_it = row_cells;
1563   ICOORDELT *cell;               //new cell
1564   double word_sync;              //result for word
1565 
1566   mid_cuts = 0;
1567   if (blob_it.empty ()) {
1568     occupation = 0;
1569     return initial_pitch * 10;
1570   }
1571 #ifndef GRAPHICS_DISABLED
1572   if (testing_on && to_win > 0) {
1573     projection->plot (to_win, projection_left,
1574       row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
1575   }
1576 #endif
1577   blob_count = 0;
1578   blob_it.mark_cycle_pt ();
1579   do {
1580                                  //first blob
1581     blob_box = box_next (&blob_it);
1582     blob_count++;
1583   }
1584   while (!blob_it.cycled_list ());
1585   plot_it = blob_it;
1586   word_sync = check_pitch_sync2 (&blob_it, blob_count, (inT16) initial_pitch,
1587     2, projection, projection_left,
1588     projection_right,
1589     row->xheight * textord_projection_scale,
1590     occupation, &seg_list, start, end);
1591   if (testing_on) {
1592     tprintf ("Row ending at (%d,%d), len=%d, sync rating=%g, ",
1593       blob_box.right (), blob_box.top (),
1594       seg_list.length () - 1, word_sync);
1595     seg_it.set_to_list (&seg_list);
1596     for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
1597       if (seg_it.data ()->faked)
1598         tprintf ("(F)");
1599       tprintf ("%d, ", seg_it.data ()->position ());
1600       //                              tprintf("C=%g, s=%g, sq=%g\n",
1601       //                                      seg_it.data()->cost_function(),
1602       //                                      seg_it.data()->sum(),
1603       //                                      seg_it.data()->squares());
1604     }
1605     tprintf ("\n");
1606   }
1607 #ifndef GRAPHICS_DISABLED
1608   if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
1609     plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1610 #endif
1611   seg_it.set_to_list (&seg_list);
1612   for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
1613     segpos = seg_it.data ()->position ();
1614                                  //make new one
1615     cell = new ICOORDELT (segpos, 0);
1616     cell_it.add_after_then_move (cell);
1617     if (seg_it.at_last ())
1618       mid_cuts = seg_it.data ()->cheap_cuts ();
1619   }
1620   seg_list.clear ();
1621   return occupation > 0 ? sqrt (word_sync / occupation) : initial_pitch * 10;
1622 }
1623 
1624 
1625 /**********************************************************************
1626  * print_pitch_sd
1627  *
1628  * Use a dp algorithm to fit the character cells and return the sd of
1629  * the cell size over the row.
1630  **********************************************************************/
1631 
print_pitch_sd(TO_ROW * row,STATS * projection,inT16 projection_left,inT16 projection_right,float space_size,float initial_pitch)1632 void print_pitch_sd(                        //find fp cells
1633                     TO_ROW *row,            //row to do
1634                     STATS *projection,      //vertical projection
1635                     inT16 projection_left,  //edges //size of blank
1636                     inT16 projection_right,
1637                     float space_size,
1638                     float initial_pitch     //guess at pitch
1639                    ) {
1640   const char *res2;              //pitch result
1641   inT16 occupation;              //used cells
1642   float sp_sd;                   //space sd
1643                                  //blobs
1644   BLOBNBOX_IT blob_it = row->blob_list ();
1645   BLOBNBOX_IT start_it;          //start of word
1646   BLOBNBOX_IT row_start;         //start of row
1647   inT16 blob_count;              //no of blobs
1648   inT16 total_blob_count;        //total blobs in line
1649   TBOX blob_box;                  //bounding box
1650   TBOX prev_box;                  //of super blob
1651   inT32 prev_right;              //of word sync
1652   int scale_factor;              //on scores for big words
1653   inT32 sp_count;                //spaces
1654   FPSEGPT_LIST seg_list;         //char cells
1655   FPSEGPT_IT seg_it;             //iterator
1656   double sqsum;                  //sum of squares
1657   double spsum;                  //of spaces
1658   double sp_var;                 //space error
1659   double word_sync;              //result for word
1660   double total_count;            //total cuts
1661 
1662   if (blob_it.empty ())
1663     return;
1664   row_start = blob_it;
1665   total_blob_count = 0;
1666 
1667   total_count = 0;
1668   sqsum = 0;
1669   sp_count = 0;
1670   spsum = 0;
1671   prev_right = -1;
1672   blob_it = row_start;
1673   start_it = blob_it;
1674   blob_count = 0;
1675   blob_box = box_next (&blob_it);//first blob
1676   blob_it.mark_cycle_pt ();
1677   do {
1678     for (; blob_count > 0; blob_count--)
1679       box_next(&start_it);
1680     do {
1681       prev_box = blob_box;
1682       blob_count++;
1683       blob_box = box_next (&blob_it);
1684     }
1685     while (!blob_it.cycled_list ()
1686       && blob_box.left () - prev_box.right () < space_size);
1687     word_sync =
1688       check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2,
1689       projection, projection_left, projection_right,
1690       row->xheight * textord_projection_scale,
1691       occupation, &seg_list, 0, 0);
1692     total_blob_count += blob_count;
1693     seg_it.set_to_list (&seg_list);
1694     if (prev_right >= 0) {
1695       sp_var = seg_it.data ()->position () - prev_right;
1696       sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
1697       sp_var *= sp_var;
1698       spsum += sp_var;
1699       sp_count++;
1700     }
1701     seg_it.move_to_last ();
1702     prev_right = seg_it.data ()->position ();
1703     if (textord_pitch_scalebigwords) {
1704       scale_factor = (seg_list.length () - 2) / 2;
1705       if (scale_factor < 1)
1706         scale_factor = 1;
1707     }
1708     else
1709       scale_factor = 1;
1710     sqsum += word_sync * scale_factor;
1711     total_count += (seg_list.length () - 1) * scale_factor;
1712     seg_list.clear ();
1713   }
1714   while (!blob_it.cycled_list ());
1715   sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
1716   word_sync = total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
1717   tprintf ("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:",
1718     word_sync, word_sync / initial_pitch, sp_sd,
1719     word_sync < textord_words_pitchsd_threshold * initial_pitch
1720     ? 'F' : 'P');
1721 
1722   start_it = row_start;
1723   blob_it = row_start;
1724   word_sync =
1725     check_pitch_sync2 (&blob_it, total_blob_count, (inT16) initial_pitch, 2,
1726     projection, projection_left, projection_right,
1727     row->xheight * textord_projection_scale, occupation,
1728     &seg_list, 0, 0);
1729   if (occupation > 1)
1730     word_sync /= occupation;
1731   word_sync = sqrt (word_sync);
1732 
1733 #ifndef GRAPHICS_DISABLED
1734   if (textord_show_row_cuts && to_win != NULL)
1735     plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list);
1736 #endif
1737   seg_list.clear ();
1738   if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {
1739     if (word_sync < textord_words_def_fixed * initial_pitch
1740       && !row->all_caps)
1741       res2 = "DF";
1742     else
1743       res2 = "MF";
1744   }
1745   else
1746     res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
1747   tprintf
1748     ("row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, all_caps=%d\n",
1749     word_sync, word_sync / initial_pitch,
1750     word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P',
1751     occupation, res2, initial_pitch, row->fixed_pitch, row->all_caps);
1752 }
1753 
1754 
/**********************************************************************
 * sort_floats2
 *
 * qsort comparison function for floats.
 * Both arguments point to a float. Returns 1 if the first is greater,
 * -1 if it is smaller and 0 otherwise (which includes the case where
 * either value is a NaN).
 **********************************************************************/

int sort_floats2(                   //qsort function
                 const void *arg1,  //ptrs to floats
                 const void *arg2) {
  // Keep the pointees const: qsort only lends us read access.
  const float first = *static_cast<const float *>(arg1);
  const float second = *static_cast<const float *>(arg2);

  if (first > second)
    return 1;
  if (first < second)
    return -1;
  return 0;
}
1774 
1775 
1776 /**********************************************************************
1777  * find_repeated_chars
1778  *
1779  * Find 4 or more adjacent chars which are the same and put them
1780  * into words in advance of fixed pitch checking and word generation.
1781  **********************************************************************/
find_repeated_chars(TO_BLOCK * block,BOOL8 testing_on,tesseract::Tesseract * tess)1782 void find_repeated_chars(                  //search for equal chars
1783                          TO_BLOCK *block,  //block to search
1784                          BOOL8 testing_on,  //dbug mode
1785                          tesseract::Tesseract* tess
1786                         ) {
1787   TO_ROW *row;
1788   BLOBNBOX_IT box_it;
1789   BLOBNBOX_IT search_it;         // forward search
1790   WERD_IT word_it;               //new words
1791   WERD *word;                    //new word
1792   TBOX word_box;                  //for plotting
1793   int blobcount, repeated_set;
1794 
1795   TO_ROW_IT row_it = block->get_rows();
1796   if (row_it.empty()) return;  // empty block
1797   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1798     row = row_it.data ();
1799     box_it.set_to_list (row->blob_list ());
1800     if (box_it.empty())  continue; // no blobs in this row
1801     if (!row->rep_chars_marked()) {
1802       mark_repeated_chars(row, block->xheight, tess);
1803     }
1804     if (row->num_repeated_sets() == 0) continue;  // nothing to do for this row
1805     word_it.set_to_list (&row->rep_words);
1806       do {
1807       if (box_it.data()->repeated_set() != 0 &&
1808           !box_it.data()->joined_to_prev()) {
1809         blobcount = 1;
1810         repeated_set = box_it.data()->repeated_set();
1811         search_it = box_it;
1812         search_it.forward ();
1813         while (!search_it.at_first() &&
1814                search_it.data()->repeated_set() == repeated_set) {
1815               blobcount++;
1816               search_it.forward ();
1817             }
1818         // After the call to make_real_word() all the blobs from this
1819         // repeated set will be removed from the blob list. box_it will be
1820         // set to point to the blob after the end of the extracted sequence.
1821         word = make_real_word(&box_it, blobcount,
1822                               box_it.at_first(), false, false, 1);
1823 #ifndef GRAPHICS_DISABLED
1824             if (testing_on) {
1825               word_box = word->bounding_box ();
1826           tprintf("Found repeated word of %d blobs from (%d,%d)->(%d,%d)\n",
1827                   blobcount, word_box.left(), word_box.bottom(),
1828                   word_box.right(), word_box.top());
1829               //perimeter_color_index(to_win, RED);
1830 	      to_win->Pen(255,0,0);
1831               //interior_style(to_win, INT_HOLLOW, TRUE);
1832           to_win->Rectangle(word_box.left(), word_box.bottom(),
1833                             word_box.right(), word_box.top());
1834             }
1835 #endif
1836         word->set_flag(W_REP_CHAR, true);
1837         word->set_flag(W_DONT_CHOP, true);
1838             word_it.add_after_then_move (word);
1839       } else {
1840         box_it.forward();
1841     }
1842     } while (!box_it.at_first());
1843   }
1844 }
1845 
1846 
1847 /**********************************************************************
1848  * plot_fp_word
1849  *
1850  * Plot a block of words as if fixed pitch.
1851  **********************************************************************/
1852 
1853 #ifndef GRAPHICS_DISABLED
plot_fp_word(TO_BLOCK * block,float pitch,float nonspace)1854 void plot_fp_word(                  //draw block of words
1855                   TO_BLOCK *block,  //block to draw
1856                   float pitch,      //pitch to draw with
1857                   float nonspace    //for space threshold
1858                  ) {
1859   TO_ROW *row;                   //current row
1860   TO_ROW_IT row_it = block->get_rows ();
1861 
1862   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1863     row = row_it.data ();
1864     row->min_space = (inT32) ((pitch + nonspace) / 2);
1865     row->max_nonspace = row->min_space;
1866     row->space_threshold = row->min_space;
1867     plot_word_decisions (to_win, (inT16) pitch, row);
1868   }
1869 }
1870 #endif
1871