1 /**********************************************************************
2 * File: topitch.cpp (Formerly to_pitch.c)
3 * Description: Code to determine fixed pitchness and the pitch if fixed.
4 * Author: Ray Smith
5 * Created: Tue Aug 24 16:57:29 BST 1993
6 *
7 * (C) Copyright 1993, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #include "mfcpch.h"
21 #ifdef __UNIX__
22 #include <assert.h>
23 #endif
24 #include "stderr.h"
25 #include "blobbox.h"
26 #include "lmedsq.h"
27 #include "statistc.h"
28 #include "drawtord.h"
29 #include "makerow.h"
30 #include "pitsync1.h"
31 #include "pithsync.h"
32 #include "blobcmpl.h"
33 #include "tovars.h"
34 #include "wordseg.h"
35 #include "topitch.h"
36 #include "secname.h"
37 #include "tesseractclass.h"
38
39 #define EXTERN
40
41 EXTERN BOOL_VAR (textord_all_prop, FALSE, "All doc is proportial text");
42 EXTERN BOOL_VAR (textord_debug_pitch_test, FALSE,
43 "Debug on fixed pitch test");
44 EXTERN BOOL_VAR (textord_disable_pitch_test, FALSE,
45 "Turn off dp fixed pitch algorithm");
46 EXTERN BOOL_VAR (textord_fast_pitch_test, FALSE,
47 "Do even faster pitch algorithm");
48 EXTERN BOOL_VAR (textord_debug_pitch_metric, FALSE,
49 "Write full metric stuff");
50 EXTERN BOOL_VAR (textord_show_row_cuts, FALSE, "Draw row-level cuts");
51 EXTERN BOOL_VAR (textord_show_page_cuts, FALSE, "Draw page-level cuts");
52 EXTERN BOOL_VAR (textord_pitch_cheat, FALSE,
53 "Use correct answer for fixed/prop");
54 EXTERN BOOL_VAR (textord_blockndoc_fixed, FALSE,
55 "Attempt whole doc/block fixed pitch");
56 EXTERN double_VAR (textord_projection_scale, 0.200, "Ding rate for mid-cuts");
57 EXTERN double_VAR (textord_balance_factor, 1.0,
58 "Ding rate for unbalanced char cells");
59
60 #define FIXED_WIDTH_MULTIPLE 5
61 #define BLOCK_STATS_CLUSTERS 10
62 #define MAX_ALLOWED_PITCH 100 //max pixel pitch.
63
64 /**********************************************************************
65 * compute_fixed_pitch
66 *
67 * Decide whether each row is fixed pitch individually.
68 * Correlate definite and uncertain results to obtain an individual
69 * result for each row in the TO_ROW class.
70 **********************************************************************/
71
compute_fixed_pitch(ICOORD page_tr,TO_BLOCK_LIST * port_blocks,float gradient,FCOORD rotation,BOOL8 testing_on,tesseract::Tesseract * tess)72 void compute_fixed_pitch( //determine pitch
73 ICOORD page_tr, //top right
74 TO_BLOCK_LIST *port_blocks, //input list
75 float gradient, //page skew
76 FCOORD rotation, //for drawing
77 BOOL8 testing_on, //correct orientation
78 tesseract::Tesseract* tess
79 ) {
80 TO_BLOCK_IT block_it; //iterator
81 TO_BLOCK *block; //current block;
82 TO_ROW_IT row_it; //row iterator
83 TO_ROW *row; //current row
84 int block_index; //block number
85 int row_index; //row number
86
87 #ifndef GRAPHICS_DISABLED
88 if (textord_show_initial_words && testing_on) {
89 if (to_win == NULL)
90 create_to_win(page_tr);
91 }
92 #endif
93
94 block_it.set_to_list (port_blocks);
95 block_index = 1;
96 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
97 block_it.forward ()) {
98 block = block_it.data ();
99 compute_block_pitch(block, rotation, block_index, testing_on, tess);
100 block_index++;
101 }
102
103 if (!try_doc_fixed (page_tr, port_blocks, gradient)) {
104 block_index = 1;
105 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
106 block_it.forward ()) {
107 block = block_it.data ();
108 if (!try_block_fixed (block, block_index))
109 try_rows_fixed(block, block_index, testing_on);
110 block_index++;
111 }
112 }
113
114 block_index = 1;
115 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
116 block_it.forward ()) {
117 block = block_it.data ();
118 row_it.set_to_list (block->get_rows ());
119 row_index = 1;
120 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
121 row = row_it.data ();
122 fix_row_pitch(row, block, port_blocks, row_index, block_index);
123 row_index++;
124 }
125 block_index++;
126 }
127 #ifndef GRAPHICS_DISABLED
128 if (textord_show_initial_words && testing_on) {
129 ScrollView::Update();
130 }
131 #endif
132 }
133
134
135 /**********************************************************************
136 * fix_row_pitch
137 *
138 * Get a pitch_decision for this row by voting among similar rows in the
139 * block, then similar rows over all the page, or any other rows at all.
140 **********************************************************************/
141
fix_row_pitch(TO_ROW * bad_row,TO_BLOCK * bad_block,TO_BLOCK_LIST * blocks,inT32 row_target,inT32 block_target)142 void fix_row_pitch(TO_ROW *bad_row, // row to fix
143 TO_BLOCK *bad_block, //block of bad_row
144 TO_BLOCK_LIST *blocks, //blocks to scan
145 inT32 row_target, //number of row
146 inT32 block_target) { // number of block
147 inT16 mid_cuts;
148 int block_votes; //votes in block
149 int like_votes; //votes over page
150 int other_votes; //votes of unlike blocks
151 int block_index; //number of block
152 int row_index; //number of row
153 int maxwidth; //max pitch
154 TO_BLOCK_IT block_it = blocks; //block iterator
155 TO_ROW_IT row_it;
156 TO_BLOCK *block; //current block
157 TO_ROW *row; //current row
158 float sp_sd; //space deviation
159 STATS block_stats; //pitches in block
160 STATS like_stats; //pitches in page
161
162 block_votes = like_votes = other_votes = 0;
163 maxwidth = (inT32) ceil (bad_row->xheight * textord_words_maxspace);
164 if (bad_row->pitch_decision != PITCH_DEF_FIXED
165 && bad_row->pitch_decision != PITCH_DEF_PROP) {
166 block_stats.set_range (0, maxwidth);
167 like_stats.set_range (0, maxwidth);
168 block_index = 1;
169 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
170 block_it.forward ()) {
171 block = block_it.data ();
172 row_index = 1;
173 row_it.set_to_list (block->get_rows ());
174 for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
175 row_it.forward ()) {
176 row = row_it.data ();
177 if ((bad_row->all_caps
178 && row->xheight + row->ascrise
179 <
180 (bad_row->xheight + bad_row->ascrise) * (1 +
181 textord_pitch_rowsimilarity)
182 && row->xheight + row->ascrise >
183 (bad_row->xheight + bad_row->ascrise) * (1 -
184 textord_pitch_rowsimilarity))
185 || (!bad_row->all_caps
186 && row->xheight <
187 bad_row->xheight * (1 + textord_pitch_rowsimilarity)
188 && row->xheight >
189 bad_row->xheight * (1 - textord_pitch_rowsimilarity))) {
190 if (block_index == block_target) {
191 if (row->pitch_decision == PITCH_DEF_FIXED) {
192 block_votes += textord_words_veto_power;
193 block_stats.add ((inT32) row->fixed_pitch,
194 textord_words_veto_power);
195 }
196 else if (row->pitch_decision == PITCH_MAYBE_FIXED
197 || row->pitch_decision == PITCH_CORR_FIXED) {
198 block_votes++;
199 block_stats.add ((inT32) row->fixed_pitch, 1);
200 }
201 else if (row->pitch_decision == PITCH_DEF_PROP)
202 block_votes -= textord_words_veto_power;
203 else if (row->pitch_decision == PITCH_MAYBE_PROP
204 || row->pitch_decision == PITCH_CORR_PROP)
205 block_votes--;
206 }
207 else {
208 if (row->pitch_decision == PITCH_DEF_FIXED) {
209 like_votes += textord_words_veto_power;
210 like_stats.add ((inT32) row->fixed_pitch,
211 textord_words_veto_power);
212 }
213 else if (row->pitch_decision == PITCH_MAYBE_FIXED
214 || row->pitch_decision == PITCH_CORR_FIXED) {
215 like_votes++;
216 like_stats.add ((inT32) row->fixed_pitch, 1);
217 }
218 else if (row->pitch_decision == PITCH_DEF_PROP)
219 like_votes -= textord_words_veto_power;
220 else if (row->pitch_decision == PITCH_MAYBE_PROP
221 || row->pitch_decision == PITCH_CORR_PROP)
222 like_votes--;
223 }
224 }
225 else {
226 if (row->pitch_decision == PITCH_DEF_FIXED)
227 other_votes += textord_words_veto_power;
228 else if (row->pitch_decision == PITCH_MAYBE_FIXED
229 || row->pitch_decision == PITCH_CORR_FIXED)
230 other_votes++;
231 else if (row->pitch_decision == PITCH_DEF_PROP)
232 other_votes -= textord_words_veto_power;
233 else if (row->pitch_decision == PITCH_MAYBE_PROP
234 || row->pitch_decision == PITCH_CORR_PROP)
235 other_votes--;
236 }
237 row_index++;
238 }
239 block_index++;
240 }
241 if (block_votes > textord_words_veto_power) {
242 bad_row->fixed_pitch = block_stats.ile (0.5);
243 bad_row->pitch_decision = PITCH_CORR_FIXED;
244 }
245 else if (block_votes <= textord_words_veto_power && like_votes > 0) {
246 bad_row->fixed_pitch = like_stats.ile (0.5);
247 bad_row->pitch_decision = PITCH_CORR_FIXED;
248 }
249 else {
250 bad_row->pitch_decision = PITCH_CORR_PROP;
251 #ifndef SECURE_NAMES
252 if (block_votes == 0 && like_votes == 0 && other_votes > 0
253 && (textord_debug_pitch_test || textord_debug_pitch_metric))
254 tprintf
255 ("Warning:row %d of block %d set prop with no like rows against trend\n",
256 row_target, block_target);
257 #endif
258 }
259 }
260 if (textord_debug_pitch_metric) {
261 tprintf (":b_votes=%d:l_votes=%d:o_votes=%d",
262 block_votes, like_votes, other_votes);
263 tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise);
264 }
265 if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
266 if (bad_row->fixed_pitch < textord_min_xheight) {
267 if (block_votes > 0)
268 bad_row->fixed_pitch = block_stats.ile (0.5);
269 else if (block_votes == 0 && like_votes > 0)
270 bad_row->fixed_pitch = like_stats.ile (0.5);
271 else {
272 tprintf
273 ("Warning:guessing pitch as xheight on row %d, block %d\n",
274 row_target, block_target);
275 bad_row->fixed_pitch = bad_row->xheight;
276 }
277 }
278 if (bad_row->fixed_pitch < textord_min_xheight)
279 bad_row->fixed_pitch = (float) textord_min_xheight;
280 bad_row->kern_size = bad_row->fixed_pitch / 4;
281 bad_row->min_space = (inT32) (bad_row->fixed_pitch * 0.6);
282 bad_row->max_nonspace = (inT32) (bad_row->fixed_pitch * 0.4);
283 bad_row->space_threshold =
284 (bad_row->min_space + bad_row->max_nonspace) / 2;
285 bad_row->space_size = bad_row->fixed_pitch;
286 if (bad_row->char_cells.empty ())
287 tune_row_pitch (bad_row, &bad_row->projection,
288 bad_row->projection_left, bad_row->projection_right,
289 (bad_row->fixed_pitch +
290 bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
291 sp_sd, mid_cuts, &bad_row->char_cells, FALSE);
292 }
293 else if (bad_row->pitch_decision == PITCH_CORR_PROP
294 || bad_row->pitch_decision == PITCH_DEF_PROP) {
295 bad_row->fixed_pitch = 0.0f;
296 bad_row->char_cells.clear ();
297 }
298 }
299
300
301 /**********************************************************************
302 * compute_block_pitch
303 *
304 * Decide whether each block is fixed pitch individually.
305 **********************************************************************/
306
compute_block_pitch(TO_BLOCK * block,FCOORD rotation,inT32 block_index,BOOL8 testing_on,tesseract::Tesseract * tess)307 void compute_block_pitch( //process each block
308 TO_BLOCK *block, //input list
309 FCOORD rotation, //for drawing
310 inT32 block_index, //block number
311 BOOL8 testing_on, //correct orientation
312 tesseract::Tesseract* tess
313 ) {
314 TBOX block_box; //bounding box
315
316 block_box = block->block->bounding_box ();
317 if (testing_on && textord_debug_pitch_test) {
318 tprintf ("Block %d at (%d,%d)->(%d,%d)\n",
319 block_index,
320 block_box.left (), block_box.bottom (),
321 block_box.right (), block_box.top ());
322 }
323 block->min_space = (inT32) floor (block->xheight
324 * textord_words_default_minspace);
325 block->max_nonspace = (inT32) ceil (block->xheight
326 * textord_words_default_nonspace);
327 block->fixed_pitch = 0.0f;
328 block->space_size = (float) block->min_space;
329 block->kern_size = (float) block->max_nonspace;
330 block->pr_nonsp = block->xheight * words_default_prop_nonspace;
331 block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;
332 if (!block->get_rows ()->empty ()) {
333 ASSERT_HOST (block->xheight > 0);
334 if (textord_repeat_extraction)
335 find_repeated_chars(block, textord_show_initial_words &&testing_on, tess);
336 #ifndef GRAPHICS_DISABLED
337 if (textord_show_initial_words && testing_on)
338 //overlap_picture_ops(TRUE);
339 ScrollView::Update();
340 #endif
341 compute_rows_pitch(block,
342 block_index,
343 textord_debug_pitch_test &&testing_on);
344 }
345 }
346
347
348 /**********************************************************************
349 * compute_rows_pitch
350 *
351 * Decide whether each row is fixed pitch individually.
352 **********************************************************************/
353
compute_rows_pitch(TO_BLOCK * block,inT32 block_index,BOOL8 testing_on)354 BOOL8 compute_rows_pitch( //find line stats
355 TO_BLOCK *block, //block to do
356 inT32 block_index, //block number
357 BOOL8 testing_on //correct orientation
358 ) {
359 inT32 maxwidth; //of spaces
360 TO_ROW *row; //current row
361 inT32 row_index; //row number.
362 float lower, upper; //cluster thresholds
363 TO_ROW_IT row_it = block->get_rows ();
364
365 row_index = 1;
366 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
367 row = row_it.data ();
368 ASSERT_HOST (row->xheight > 0);
369 row->compute_vertical_projection ();
370 maxwidth = (inT32) ceil (row->xheight * textord_words_maxspace);
371 if (row_pitch_stats (row, maxwidth, testing_on)
372 && find_row_pitch (row, maxwidth,
373 textord_dotmatrix_gap + 1, block, block_index,
374 row_index, testing_on)) {
375 if (row->fixed_pitch == 0) {
376 lower = row->pr_nonsp;
377 upper = row->pr_space;
378 row->space_size = upper;
379 row->kern_size = lower;
380 }
381 }
382 else {
383 row->fixed_pitch = 0.0f; //insufficient data
384 row->pitch_decision = PITCH_DUNNO;
385 }
386 row_index++;
387 }
388 return FALSE;
389 }
390
391
392 /**********************************************************************
393 * try_doc_fixed
394 *
395 * Attempt to call the entire document fixed pitch.
396 **********************************************************************/
397
try_doc_fixed(ICOORD page_tr,TO_BLOCK_LIST * port_blocks,float gradient)398 BOOL8 try_doc_fixed( //determine pitch
399 ICOORD page_tr, //top right
400 TO_BLOCK_LIST *port_blocks, //input list
401 float gradient //page skew
402 ) {
403 inT16 master_x; //uniform shifts
404 inT16 pitch; //median pitch.
405 int x; //profile coord
406 int prop_blocks; //correct counts
407 int fixed_blocks;
408 int total_row_count; //total in page
409 //iterator
410 TO_BLOCK_IT block_it = port_blocks;
411 TO_BLOCK *block; //current block;
412 TO_ROW_IT row_it; //row iterator
413 TO_ROW *row; //current row
414 inT16 projection_left; //edges
415 inT16 projection_right;
416 inT16 row_left; //edges of row
417 inT16 row_right;
418 ICOORDELT_LIST *master_cells; //cells for page
419 float master_y; //uniform shifts
420 float shift_factor; //page skew correction
421 float row_shift; //shift for row
422 float final_pitch; //output pitch
423 float row_y; //baseline
424 STATS projection; //entire page
425 STATS pitches (0, MAX_ALLOWED_PITCH);
426 //for median
427 float sp_sd; //space sd
428 inT16 mid_cuts; //no of cheap cuts
429 float pitch_sd; //sync rating
430
431 if (block_it.empty ()
432 // || block_it.data()==block_it.data_relative(1)
433 || !textord_blockndoc_fixed)
434 return FALSE;
435 shift_factor = gradient / (gradient * gradient + 1);
436 row_it.set_to_list (block_it.data ()->get_rows ());
437 master_x = row_it.data ()->projection_left;
438 master_y = row_it.data ()->baseline.y (master_x);
439 projection_left = MAX_INT16;
440 projection_right = -MAX_INT16;
441 prop_blocks = 0;
442 fixed_blocks = 0;
443 total_row_count = 0;
444
445 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
446 block_it.forward ()) {
447 block = block_it.data ();
448 row_it.set_to_list (block->get_rows ());
449 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
450 row = row_it.data ();
451 total_row_count++;
452 if (row->fixed_pitch > 0)
453 pitches.add ((inT32) (row->fixed_pitch), 1);
454 //find median
455 row_y = row->baseline.y (master_x);
456 row_left =
457 (inT16) (row->projection_left -
458 shift_factor * (master_y - row_y));
459 row_right =
460 (inT16) (row->projection_right -
461 shift_factor * (master_y - row_y));
462 if (row_left < projection_left)
463 projection_left = row_left;
464 if (row_right > projection_right)
465 projection_right = row_right;
466 }
467 }
468 if (pitches.get_total () == 0)
469 return FALSE;
470 projection.set_range (projection_left, projection_right);
471
472 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
473 block_it.forward ()) {
474 block = block_it.data ();
475 row_it.set_to_list (block->get_rows ());
476 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
477 row = row_it.data ();
478 row_y = row->baseline.y (master_x);
479 row_left =
480 (inT16) (row->projection_left -
481 shift_factor * (master_y - row_y));
482 for (x = row->projection_left; x < row->projection_right;
483 x++, row_left++) {
484 projection.add (row_left, row->projection.pile_count (x));
485 }
486 }
487 }
488
489 row_it.set_to_list (block_it.data ()->get_rows ());
490 row = row_it.data ();
491 #ifndef GRAPHICS_DISABLED
492 if (textord_show_page_cuts && to_win != NULL)
493 projection.plot (to_win, projection_left,
494 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
495 #endif
496 final_pitch = pitches.ile (0.5);
497 pitch = (inT16) final_pitch;
498 pitch_sd =
499 tune_row_pitch (row, &projection, projection_left, projection_right,
500 pitch * 0.75, final_pitch, sp_sd, mid_cuts,
501 &row->char_cells, FALSE);
502
503 if (textord_debug_pitch_metric)
504 tprintf
505 ("try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
506 prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd,
507 pitch_sd / total_row_count, pitch_sd / pitch,
508 pitch_sd / total_row_count / pitch);
509
510 #ifndef GRAPHICS_DISABLED
511 if (textord_show_page_cuts && to_win != NULL) {
512 master_cells = &row->char_cells;
513 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
514 block_it.forward ()) {
515 block = block_it.data ();
516 row_it.set_to_list (block->get_rows ());
517 for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
518 row_it.forward ()) {
519 row = row_it.data ();
520 row_y = row->baseline.y (master_x);
521 row_shift = shift_factor * (master_y - row_y);
522 plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);
523 }
524 }
525 }
526 #endif
527 row->char_cells.clear ();
528 return FALSE;
529 }
530
531
532 /**********************************************************************
533 * try_block_fixed
534 *
535 * Try to call the entire block fixed.
536 **********************************************************************/
537
try_block_fixed(TO_BLOCK * block,inT32 block_index)538 BOOL8 try_block_fixed( //find line stats
539 TO_BLOCK *block, //block to do
540 inT32 block_index //block number
541 ) {
542 return FALSE;
543 }
544
545
546 /**********************************************************************
547 * try_rows_fixed
548 *
549 * Decide whether each row is fixed pitch individually.
550 **********************************************************************/
551
try_rows_fixed(TO_BLOCK * block,inT32 block_index,BOOL8 testing_on)552 BOOL8 try_rows_fixed( //find line stats
553 TO_BLOCK *block, //block to do
554 inT32 block_index, //block number
555 BOOL8 testing_on //correct orientation
556 ) {
557 inT32 maxwidth; //of spaces
558 TO_ROW *row; //current row
559 inT32 row_index; //row number.
560 inT32 def_fixed = 0; //counters
561 inT32 def_prop = 0;
562 inT32 maybe_fixed = 0;
563 inT32 maybe_prop = 0;
564 inT32 dunno = 0;
565 inT32 corr_fixed = 0;
566 inT32 corr_prop = 0;
567 float lower, upper; //cluster thresholds
568 TO_ROW_IT row_it = block->get_rows ();
569
570 row_index = 1;
571 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
572 row = row_it.data ();
573 ASSERT_HOST (row->xheight > 0);
574 maxwidth = (inT32) ceil (row->xheight * textord_words_maxspace);
575 if (row->fixed_pitch > 0 && fixed_pitch_row (row, block_index)) {
576 if (row->fixed_pitch == 0) {
577 lower = row->pr_nonsp;
578 upper = row->pr_space;
579 row->space_size = upper;
580 row->kern_size = lower;
581 }
582 }
583 row_index++;
584 }
585 count_block_votes(block,
586 def_fixed,
587 def_prop,
588 maybe_fixed,
589 maybe_prop,
590 corr_fixed,
591 corr_prop,
592 dunno);
593 if (testing_on
594 && (textord_debug_pitch_test
595 || textord_blocksall_prop || textord_blocksall_fixed)) {
596 tprintf ("Initially:");
597 print_block_counts(block, block_index);
598 }
599 if (def_fixed > def_prop * textord_words_veto_power)
600 block->pitch_decision = PITCH_DEF_FIXED;
601 else if (def_prop > def_fixed * textord_words_veto_power)
602 block->pitch_decision = PITCH_DEF_PROP;
603 else if (def_fixed > 0 || def_prop > 0)
604 block->pitch_decision = PITCH_DUNNO;
605 else if (maybe_fixed > maybe_prop * textord_words_veto_power)
606 block->pitch_decision = PITCH_MAYBE_FIXED;
607 else if (maybe_prop > maybe_fixed * textord_words_veto_power)
608 block->pitch_decision = PITCH_MAYBE_PROP;
609 else
610 block->pitch_decision = PITCH_DUNNO;
611 return FALSE;
612 }
613
614
615 /**********************************************************************
616 * print_block_counts
617 *
618 * Count up how many rows have what decision and print the results.
619 **********************************************************************/
620
print_block_counts(TO_BLOCK * block,inT32 block_index)621 void print_block_counts( //find line stats
622 TO_BLOCK *block, //block to do
623 inT32 block_index //block number
624 ) {
625 inT32 def_fixed = 0; //counters
626 inT32 def_prop = 0;
627 inT32 maybe_fixed = 0;
628 inT32 maybe_prop = 0;
629 inT32 dunno = 0;
630 inT32 corr_fixed = 0;
631 inT32 corr_prop = 0;
632
633 count_block_votes(block,
634 def_fixed,
635 def_prop,
636 maybe_fixed,
637 maybe_prop,
638 corr_fixed,
639 corr_prop,
640 dunno);
641 tprintf ("Block %d has (%d,%d,%d)",
642 block_index, def_fixed, maybe_fixed, corr_fixed);
643 if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed))
644 tprintf (" (Wrongly)");
645 tprintf (" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
646 if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop))
647 tprintf (" (Wrongly)");
648 tprintf (" prop, %d dunno\n", dunno);
649 }
650
651
652 /**********************************************************************
653 * count_block_votes
654 *
655 * Count the number of rows in the block with each kind of pitch_decision.
656 **********************************************************************/
657
count_block_votes(TO_BLOCK * block,inT32 & def_fixed,inT32 & def_prop,inT32 & maybe_fixed,inT32 & maybe_prop,inT32 & corr_fixed,inT32 & corr_prop,inT32 & dunno)658 void count_block_votes( //find line stats
659 TO_BLOCK *block, //block to do
660 inT32 &def_fixed, //add to counts
661 inT32 &def_prop,
662 inT32 &maybe_fixed,
663 inT32 &maybe_prop,
664 inT32 &corr_fixed,
665 inT32 &corr_prop,
666 inT32 &dunno) {
667 TO_ROW *row; //current row
668 TO_ROW_IT row_it = block->get_rows ();
669
670 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
671 row = row_it.data ();
672 switch (row->pitch_decision) {
673 case PITCH_DUNNO:
674 dunno++;
675 break;
676 case PITCH_DEF_PROP:
677 def_prop++;
678 break;
679 case PITCH_MAYBE_PROP:
680 maybe_prop++;
681 break;
682 case PITCH_DEF_FIXED:
683 def_fixed++;
684 break;
685 case PITCH_MAYBE_FIXED:
686 maybe_fixed++;
687 break;
688 case PITCH_CORR_PROP:
689 corr_prop++;
690 break;
691 case PITCH_CORR_FIXED:
692 corr_fixed++;
693 break;
694 }
695 }
696 }
697
698
699 /**********************************************************************
700 * row_pitch_stats
701 *
702 * Decide whether each row is fixed pitch individually.
703 **********************************************************************/
704
row_pitch_stats(TO_ROW * row,inT32 maxwidth,BOOL8 testing_on)705 BOOL8 row_pitch_stats( //find line stats
706 TO_ROW *row, //current row
707 inT32 maxwidth, //of spaces
708 BOOL8 testing_on //correct orientation
709 ) {
710 BLOBNBOX *blob; //current blob
711 int gap_index; //current gap
712 inT32 prev_x; //end of prev blob
713 inT32 cluster_count; //no of clusters
714 inT32 prev_count; //of clusters
715 inT32 smooth_factor; //for smoothing stats
716 TBOX blob_box; //bounding box
717 float lower, upper; //cluster thresholds
718 //gap sizes
719 float gaps[BLOCK_STATS_CLUSTERS];
720 //blobs
721 BLOBNBOX_IT blob_it = row->blob_list ();
722 STATS gap_stats (0, maxwidth);
723 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
724 //clusters
725
726 smooth_factor =
727 (inT32) (row->xheight * textord_wordstats_smooth_factor + 1.5);
728 if (!blob_it.empty ()) {
729 prev_x = blob_it.data ()->bounding_box ().right ();
730 blob_it.forward ();
731 while (!blob_it.at_first ()) {
732 blob = blob_it.data ();
733 if (!blob->joined_to_prev ()) {
734 blob_box = blob->bounding_box ();
735 if (blob_box.left () - prev_x < maxwidth)
736 gap_stats.add (blob_box.left () - prev_x, 1);
737 prev_x = blob_box.right ();
738 }
739 blob_it.forward ();
740 }
741 }
742 if (gap_stats.get_total () == 0) {
743 return FALSE;
744 }
745 cluster_count = 0;
746 lower = row->xheight * words_initial_lower;
747 upper = row->xheight * words_initial_upper;
748 gap_stats.smooth (smooth_factor);
749 do {
750 prev_count = cluster_count;
751 cluster_count = gap_stats.cluster (lower, upper,
752 textord_spacesize_ratioprop,
753 BLOCK_STATS_CLUSTERS, cluster_stats);
754 }
755 while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
756 if (cluster_count < 1) {
757 return FALSE;
758 }
759 for (gap_index = 0; gap_index < cluster_count; gap_index++)
760 gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
761 //get medians
762 if (testing_on) {
763 tprintf ("cluster_count=%d:", cluster_count);
764 for (gap_index = 0; gap_index < cluster_count; gap_index++)
765 tprintf (" %g(%d)", gaps[gap_index],
766 cluster_stats[gap_index + 1].get_total ());
767 tprintf ("\n");
768 }
769 qsort (gaps, cluster_count, sizeof (float), sort_floats2);
770
771 //Try to find proportional non-space and space for row.
772 lower = row->xheight * words_default_prop_nonspace;
773 upper = row->xheight * textord_words_min_minspace;
774 for (gap_index = 0; gap_index < cluster_count
775 && gaps[gap_index] < lower; gap_index++);
776 if (gap_index == 0) {
777 if (testing_on)
778 tprintf ("No clusters below nonspace threshold!!\n");
779 if (cluster_count > 1) {
780 row->pr_nonsp = gaps[0];
781 row->pr_space = gaps[1];
782 }
783 else {
784 row->pr_nonsp = lower;
785 row->pr_space = gaps[0];
786 }
787 }
788 else {
789 row->pr_nonsp = gaps[gap_index - 1];
790 while (gap_index < cluster_count && gaps[gap_index] < upper)
791 gap_index++;
792 if (gap_index == cluster_count) {
793 if (testing_on)
794 tprintf ("No clusters above nonspace threshold!!\n");
795 row->pr_space = lower * textord_spacesize_ratioprop;
796 }
797 else
798 row->pr_space = gaps[gap_index];
799 }
800
801 //Now try to find the fixed pitch space and non-space.
802 upper = row->xheight * words_default_fixed_space;
803 for (gap_index = 0; gap_index < cluster_count
804 && gaps[gap_index] < upper; gap_index++);
805 if (gap_index == 0) {
806 if (testing_on)
807 tprintf ("No clusters below space threshold!!\n");
808 row->fp_nonsp = upper;
809 row->fp_space = gaps[0];
810 }
811 else {
812 row->fp_nonsp = gaps[gap_index - 1];
813 if (gap_index == cluster_count) {
814 if (testing_on)
815 tprintf ("No clusters above space threshold!!\n");
816 row->fp_space = row->xheight;
817 }
818 else
819 row->fp_space = gaps[gap_index];
820 }
821 if (testing_on) {
822 tprintf
823 ("Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, fp_space=%g\n",
824 row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
825 }
826 return TRUE; //computed some stats
827 }
828
829
830 /**********************************************************************
831 * find_row_pitch
832 *
833 * Check to see if this row could be fixed pitch using the given spacings.
834 * Blobs with gaps smaller than the lower threshold are assumed to be one.
835 * The larger threshold is the word gap threshold.
836 **********************************************************************/
837
find_row_pitch(TO_ROW * row,inT32 maxwidth,inT32 dm_gap,TO_BLOCK * block,inT32 block_index,inT32 row_index,BOOL8 testing_on)838 BOOL8 find_row_pitch( //find lines
839 TO_ROW *row, //row to do
840 inT32 maxwidth, //max permitted space
841 inT32 dm_gap, //ignorable gaps
842 TO_BLOCK *block, //block of row
843 inT32 block_index, //block_number
844 inT32 row_index, //number of row
845 BOOL8 testing_on //correct orientation
846 ) {
847 BOOL8 used_dm_model; //looks lik dot matrix
848 float min_space; //estimate threshold
849 float non_space; //gap size
850 float gap_iqr; //interquartile range
851 float pitch_iqr;
852 float dm_gap_iqr; //interquartile range
853 float dm_pitch_iqr;
854 float dm_pitch; //pitch with dm on
855 float pitch; //revised estimate
856 float initial_pitch; //guess at pitch
857 STATS gap_stats (0, maxwidth);
858 //centre-centre
859 STATS pitch_stats (0, maxwidth);
860
861 row->fixed_pitch = 0.0f;
862 initial_pitch = row->fp_space;
863 if (initial_pitch > row->xheight * (1 + words_default_fixed_limit))
864 initial_pitch = row->xheight;//keep pitch decent
865 non_space = row->fp_nonsp;
866 if (non_space > initial_pitch)
867 non_space = initial_pitch;
868 min_space = (initial_pitch + non_space) / 2;
869
870 if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
871 initial_pitch, min_space, TRUE, FALSE, dm_gap)) {
872 dm_gap_iqr = 0.0001;
873 dm_pitch_iqr = maxwidth * 2.0f;
874 dm_pitch = initial_pitch;
875 }
876 else {
877 dm_gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
878 dm_pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
879 dm_pitch = pitch_stats.ile (0.5);
880 }
881 gap_stats.clear ();
882 pitch_stats.clear ();
883 if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
884 initial_pitch, min_space, TRUE, FALSE, 0)) {
885 gap_iqr = 0.0001;
886 pitch_iqr = maxwidth * 3.0f;
887 }
888 else {
889 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
890 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
891 if (testing_on)
892 tprintf
893 ("First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
894 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
895 initial_pitch = pitch_stats.ile (0.5);
896 if (min_space > initial_pitch
897 && count_pitch_stats (row, &gap_stats, &pitch_stats,
898 initial_pitch, initial_pitch, TRUE, FALSE, 0)) {
899 min_space = initial_pitch;
900 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
901 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
902 if (testing_on)
903 tprintf
904 ("Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
905 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
906 initial_pitch = pitch_stats.ile (0.5);
907 }
908 }
909 if (textord_debug_pitch_metric)
910 tprintf ("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:",
911 block_index, row_index, 'X',
912 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
913 pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth ? 'D' :
914 (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
915 if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
916 row->pitch_decision = PITCH_DUNNO;
917 if (textord_debug_pitch_metric)
918 tprintf ("\n");
919 return FALSE; //insufficient data
920 }
921 if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
922 if (testing_on)
923 tprintf
924 ("Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
925 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
926 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
927 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
928 pitch = pitch_stats.ile (0.5);
929 used_dm_model = FALSE;
930 }
931 else {
932 if (testing_on)
933 tprintf
934 ("Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
935 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
936 gap_iqr = dm_gap_iqr;
937 pitch_iqr = dm_pitch_iqr;
938 pitch = dm_pitch;
939 used_dm_model = TRUE;
940 }
941 if (textord_debug_pitch_metric) {
942 tprintf ("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:",
943 pitch_iqr, gap_iqr, pitch);
944 tprintf ("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:",
945 pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
946 pitch_iqr < gap_iqr * textord_fpiqr_ratio
947 && pitch_iqr < block->xheight * textord_max_pitch_iqr
948 && pitch < block->xheight * textord_words_default_maxspace
949 ? 'F' : 'P');
950 }
951 if (pitch_iqr < gap_iqr * textord_fpiqr_ratio
952 && pitch_iqr < block->xheight * textord_max_pitch_iqr
953 && pitch < block->xheight * textord_words_default_maxspace)
954 row->pitch_decision = PITCH_MAYBE_FIXED;
955 else
956 row->pitch_decision = PITCH_MAYBE_PROP;
957 row->fixed_pitch = pitch;
958 row->kern_size = gap_stats.ile (0.5);
959 row->min_space = (inT32) (row->fixed_pitch + non_space) / 2;
960 if (row->min_space > row->fixed_pitch)
961 row->min_space = (inT32) row->fixed_pitch;
962 row->max_nonspace = row->min_space;
963 row->space_size = row->fixed_pitch;
964 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
965 row->used_dm_model = used_dm_model;
966 return TRUE;
967 }
968
969
970 /**********************************************************************
971 * fixed_pitch_row
972 *
973 * Check to see if this row could be fixed pitch using the given spacings.
974 * Blobs with gaps smaller than the lower threshold are assumed to be one.
975 * The larger threshold is the word gap threshold.
976 **********************************************************************/
977
fixed_pitch_row(TO_ROW * row,inT32 block_index)978 BOOL8 fixed_pitch_row( //find lines
979 TO_ROW *row, //row to do
980 inT32 block_index //block_number
981 ) {
982 const char *res_string; //pitch result
983 inT16 mid_cuts; //no of cheap cuts
984 float non_space; //gap size
985 float pitch_sd; //error on pitch
986 float sp_sd; //space sd
987
988 non_space = row->fp_nonsp;
989 if (non_space > row->fixed_pitch)
990 non_space = row->fixed_pitch;
991 if (textord_all_prop) {
992 // Set the decision to definitely proportional.
993 pitch_sd = textord_words_def_prop * row->fixed_pitch;
994 row->pitch_decision = PITCH_DEF_PROP;
995 } else {
996 pitch_sd = tune_row_pitch (row, &row->projection, row->projection_left,
997 row->projection_right,
998 (row->fixed_pitch + non_space * 3) / 4,
999 row->fixed_pitch, sp_sd, mid_cuts,
1000 &row->char_cells,
1001 block_index == textord_debug_block);
1002 if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch
1003 && ((pitsync_linear_version & 3) < 3
1004 || ((pitsync_linear_version & 3) >= 3 && (row->used_dm_model
1005 || sp_sd > 20
1006 || (pitch_sd == 0 && sp_sd > 10))))) {
1007 if (pitch_sd < textord_words_def_fixed * row->fixed_pitch
1008 && !row->all_caps
1009 && ((pitsync_linear_version & 3) < 3 || sp_sd > 20))
1010 row->pitch_decision = PITCH_DEF_FIXED;
1011 else
1012 row->pitch_decision = PITCH_MAYBE_FIXED;
1013 }
1014 else if ((pitsync_linear_version & 3) < 3
1015 || sp_sd > 20
1016 || mid_cuts > 0
1017 || pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
1018 if (pitch_sd < textord_words_def_prop * row->fixed_pitch)
1019 row->pitch_decision = PITCH_MAYBE_PROP;
1020 else
1021 row->pitch_decision = PITCH_DEF_PROP;
1022 }
1023 else
1024 row->pitch_decision = PITCH_DUNNO;
1025 }
1026
1027 if (textord_debug_pitch_metric) {
1028 res_string = "??";
1029 switch (row->pitch_decision) {
1030 case PITCH_DEF_PROP:
1031 res_string = "DP";
1032 break;
1033 case PITCH_MAYBE_PROP:
1034 res_string = "MP";
1035 break;
1036 case PITCH_DEF_FIXED:
1037 res_string = "DF";
1038 break;
1039 case PITCH_MAYBE_FIXED:
1040 res_string = "MF";
1041 default:
1042 res_string = "??";
1043 }
1044 tprintf (":sd/p=%g:occ=%g:init_res=%s\n",
1045 pitch_sd / row->fixed_pitch, sp_sd, res_string);
1046 }
1047 return TRUE;
1048 }
1049
1050
1051 /**********************************************************************
1052 * count_pitch_stats
1053 *
1054 * Count up the gap and pitch stats on the block to see if it is fixed pitch.
1055 * Blobs with gaps smaller than the lower threshold are assumed to be one.
1056 * The larger threshold is the word gap threshold.
1057 * The return value indicates whether there were any decent values to use.
1058 **********************************************************************/
1059
count_pitch_stats(TO_ROW * row,STATS * gap_stats,STATS * pitch_stats,float initial_pitch,float min_space,BOOL8 ignore_outsize,BOOL8 split_outsize,inT32 dm_gap)1060 BOOL8 count_pitch_stats( //find lines
1061 TO_ROW *row, //row to do
1062 STATS *gap_stats, //blob gaps
1063 STATS *pitch_stats, //centre-centre stats
1064 float initial_pitch, //guess at pitch
1065 float min_space, //estimate space size
1066 BOOL8 ignore_outsize, //discard big objects
1067 BOOL8 split_outsize, //split big objects
1068 inT32 dm_gap //ignorable gaps
1069 ) {
1070 BOOL8 prev_valid; //not word broken
1071 BLOBNBOX *blob; //current blob
1072 //blobs
1073 BLOBNBOX_IT blob_it = row->blob_list ();
1074 inT32 prev_right; //end of prev blob
1075 inT32 prev_centre; //centre of previous blob
1076 inT32 x_centre; //centre of this blob
1077 inT32 blob_width; //width of blob
1078 inT32 width_units; //no of widths in blob
1079 float width; //blob width
1080 TBOX blob_box; //bounding box
1081 TBOX joined_box; //of super blob
1082
1083 gap_stats->clear ();
1084 pitch_stats->clear ();
1085 if (blob_it.empty ())
1086 return FALSE;
1087 prev_valid = FALSE;
1088 prev_centre = 0;
1089 prev_right = 0; //stop complier warning
1090 joined_box = blob_it.data ()->bounding_box ();
1091 do {
1092 blob_it.forward ();
1093 blob = blob_it.data ();
1094 if (!blob->joined_to_prev ()) {
1095 blob_box = blob->bounding_box ();
1096 if ((blob_box.left () - joined_box.right () < dm_gap
1097 && !blob_it.at_first ())
1098 || (blob->cblob () == NULL && blob->blob () == NULL))
1099 joined_box += blob_box; //merge blobs
1100 else {
1101 blob_width = joined_box.width ();
1102 if (split_outsize) {
1103 width_units =
1104 (inT32) floor ((float) blob_width / initial_pitch + 0.5);
1105 if (width_units < 1)
1106 width_units = 1;
1107 width_units--;
1108 }
1109 else if (ignore_outsize) {
1110 width = (float) blob_width / initial_pitch;
1111 width_units = width < 1 + words_default_fixed_limit
1112 && width > 1 - words_default_fixed_limit ? 0 : -1;
1113 }
1114 else
1115 width_units = 0; //everything in
1116 x_centre = (inT32) (joined_box.left ()
1117 + (blob_width -
1118 width_units * initial_pitch) / 2);
1119 if (prev_valid && width_units >= 0) {
1120 // if (width_units>0)
1121 // {
1122 // tprintf("wu=%d, width=%d, xc=%d, adding %d\n",
1123 // width_units,blob_width,x_centre,x_centre-prev_centre);
1124 // }
1125 gap_stats->add (joined_box.left () - prev_right, 1);
1126 pitch_stats->add (x_centre - prev_centre, 1);
1127 }
1128 prev_centre = (inT32) (x_centre + width_units * initial_pitch);
1129 prev_right = joined_box.right ();
1130 prev_valid = blob_box.left () - joined_box.right () < min_space;
1131 prev_valid = prev_valid && width_units >= 0;
1132 joined_box = blob_box;
1133 }
1134 }
1135 }
1136 while (!blob_it.at_first ());
1137 return gap_stats->get_total () >= 3;
1138 }
1139
1140
1141 /**********************************************************************
1142 * tune_row_pitch
1143 *
1144 * Use a dp algorithm to fit the character cells and return the sd of
1145 * the cell size over the row.
1146 **********************************************************************/
1147
tune_row_pitch(TO_ROW * row,STATS * projection,inT16 projection_left,inT16 projection_right,float space_size,float & initial_pitch,float & best_sp_sd,inT16 & best_mid_cuts,ICOORDELT_LIST * best_cells,BOOL8 testing_on)1148 float tune_row_pitch( //find fp cells
1149 TO_ROW *row, //row to do
1150 STATS *projection, //vertical projection
1151 inT16 projection_left, //edge of projection
1152 inT16 projection_right, //edge of projection
1153 float space_size, //size of blank
1154 float &initial_pitch, //guess at pitch
1155 float &best_sp_sd, //space sd
1156 inT16 &best_mid_cuts, //no of cheap cuts
1157 ICOORDELT_LIST *best_cells, //row cells
1158 BOOL8 testing_on //inidividual words
1159 ) {
1160 int pitch_delta; //offset pitch
1161 inT16 mid_cuts; //cheap cuts
1162 float pitch_sd; //current sd
1163 float best_sd; //best result
1164 float best_pitch; //pitch for best result
1165 float initial_sd; //starting error
1166 float sp_sd; //space sd
1167 ICOORDELT_LIST test_cells; //row cells
1168 ICOORDELT_IT best_it; //start of best list
1169
1170 if (textord_fast_pitch_test)
1171 return tune_row_pitch2 (row, projection, projection_left,
1172 projection_right, space_size, initial_pitch,
1173 best_sp_sd,
1174 //space sd
1175 best_mid_cuts, best_cells, testing_on);
1176 if (textord_disable_pitch_test) {
1177 best_sp_sd = initial_pitch;
1178 return initial_pitch;
1179 }
1180 initial_sd =
1181 compute_pitch_sd(row,
1182 projection,
1183 projection_left,
1184 projection_right,
1185 space_size,
1186 initial_pitch,
1187 best_sp_sd,
1188 best_mid_cuts,
1189 best_cells,
1190 testing_on);
1191 best_sd = initial_sd;
1192 best_pitch = initial_pitch;
1193 if (testing_on)
1194 tprintf ("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
1195 for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1196 pitch_sd =
1197 compute_pitch_sd (row, projection, projection_left, projection_right,
1198 space_size, initial_pitch + pitch_delta, sp_sd,
1199 mid_cuts, &test_cells, testing_on);
1200 if (testing_on)
1201 tprintf ("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta,
1202 pitch_sd);
1203 if (pitch_sd < best_sd) {
1204 best_sd = pitch_sd;
1205 best_mid_cuts = mid_cuts;
1206 best_sp_sd = sp_sd;
1207 best_pitch = initial_pitch + pitch_delta;
1208 best_cells->clear ();
1209 best_it.set_to_list (best_cells);
1210 best_it.add_list_after (&test_cells);
1211 }
1212 else
1213 test_cells.clear ();
1214 if (pitch_sd > initial_sd)
1215 break; //getting worse
1216 }
1217 for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1218 pitch_sd =
1219 compute_pitch_sd (row, projection, projection_left, projection_right,
1220 space_size, initial_pitch - pitch_delta, sp_sd,
1221 mid_cuts, &test_cells, testing_on);
1222 if (testing_on)
1223 tprintf ("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta,
1224 pitch_sd);
1225 if (pitch_sd < best_sd) {
1226 best_sd = pitch_sd;
1227 best_mid_cuts = mid_cuts;
1228 best_sp_sd = sp_sd;
1229 best_pitch = initial_pitch - pitch_delta;
1230 best_cells->clear ();
1231 best_it.set_to_list (best_cells);
1232 best_it.add_list_after (&test_cells);
1233 }
1234 else
1235 test_cells.clear ();
1236 if (pitch_sd > initial_sd)
1237 break;
1238 }
1239 initial_pitch = best_pitch;
1240
1241 if (textord_debug_pitch_metric)
1242 print_pitch_sd(row,
1243 projection,
1244 projection_left,
1245 projection_right,
1246 space_size,
1247 best_pitch);
1248
1249 return best_sd;
1250 }
1251
1252
1253 /**********************************************************************
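 * tune_row_pitch2
 *
 * Faster alternative to tune_row_pitch: fold the projection modulo each
 * candidate pitch, pick the pitch whose folded projection has the lowest
 * count, then run compute_pitch_sd once at that pitch and return its sd.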
1258 **********************************************************************/
1259
tune_row_pitch2(TO_ROW * row,STATS * projection,inT16 projection_left,inT16 projection_right,float space_size,float & initial_pitch,float & best_sp_sd,inT16 & best_mid_cuts,ICOORDELT_LIST * best_cells,BOOL8 testing_on)1260 float tune_row_pitch2( //find fp cells
1261 TO_ROW *row, //row to do
1262 STATS *projection, //vertical projection
1263 inT16 projection_left, //edge of projection
1264 inT16 projection_right, //edge of projection
1265 float space_size, //size of blank
1266 float &initial_pitch, //guess at pitch
1267 float &best_sp_sd, //space sd
1268 inT16 &best_mid_cuts, //no of cheap cuts
1269 ICOORDELT_LIST *best_cells, //row cells
1270 BOOL8 testing_on //inidividual words
1271 ) {
1272 int pitch_delta; //offset pitch
1273 inT16 pixel; //pixel coord
1274 inT16 best_pixel; //pixel coord
1275 inT16 best_delta; //best pitch
1276 inT16 best_pitch; //best pitch
1277 inT16 start; //of good range
1278 inT16 end; //of good range
1279 inT32 best_count; //lowest sum
1280 float best_sd; //best result
1281 STATS *sum_proj; //summed projection
1282
1283 best_sp_sd = initial_pitch;
1284
1285 if (textord_disable_pitch_test) {
1286 return initial_pitch;
1287 }
1288 sum_proj = new STATS[textord_pitch_range * 2 + 1];
1289 if (sum_proj == NULL)
1290 return initial_pitch;
1291 best_pitch = (inT32) initial_pitch;
1292
1293 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
1294 pitch_delta++)
1295 sum_proj[textord_pitch_range + pitch_delta].set_range (0,
1296 best_pitch +
1297 pitch_delta + 1);
1298 for (pixel = projection_left; pixel <= projection_right; pixel++) {
1299 for (pitch_delta = -textord_pitch_range;
1300 pitch_delta <= textord_pitch_range; pitch_delta++)
1301 sum_proj[textord_pitch_range +
1302 pitch_delta].add ((pixel - projection_left) % (best_pitch +
1303 pitch_delta),
1304 projection->pile_count (pixel));
1305 }
1306 best_count = sum_proj[textord_pitch_range].pile_count (0);
1307 best_delta = 0;
1308 best_pixel = 0;
1309 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
1310 pitch_delta++) {
1311 for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
1312 if (sum_proj[textord_pitch_range + pitch_delta].pile_count (pixel)
1313 < best_count) {
1314 best_count =
1315 sum_proj[textord_pitch_range +
1316 pitch_delta].pile_count (pixel);
1317 best_delta = pitch_delta;
1318 best_pixel = pixel;
1319 }
1320 }
1321 }
1322 if (testing_on)
1323 tprintf ("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n",
1324 initial_pitch, best_delta, best_count);
1325 best_pitch += best_delta;
1326 initial_pitch = best_pitch;
1327 best_count++;
1328 best_count += best_count;
1329 for (start = best_pixel - 2; start > best_pixel - best_pitch
1330 && sum_proj[textord_pitch_range +
1331 best_delta].pile_count (start % best_pitch) <= best_count;
1332 start--);
1333 for (end = best_pixel + 2;
1334 end < best_pixel + best_pitch
1335 && sum_proj[textord_pitch_range +
1336 best_delta].pile_count (end % best_pitch) <= best_count;
1337 end++);
1338
1339 best_sd =
1340 compute_pitch_sd(row,
1341 projection,
1342 projection_left,
1343 projection_right,
1344 space_size,
1345 initial_pitch,
1346 best_sp_sd,
1347 best_mid_cuts,
1348 best_cells,
1349 testing_on,
1350 start,
1351 end);
1352 if (testing_on)
1353 tprintf ("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch,
1354 best_sd);
1355
1356 if (textord_debug_pitch_metric)
1357 print_pitch_sd(row,
1358 projection,
1359 projection_left,
1360 projection_right,
1361 space_size,
1362 initial_pitch);
1363
1364 delete[]sum_proj;
1365
1366 return best_sd;
1367 }
1368
1369
1370 /**********************************************************************
1371 * compute_pitch_sd
1372 *
1373 * Use a dp algorithm to fit the character cells and return the sd of
1374 * the cell size over the row.
1375 **********************************************************************/
1376
compute_pitch_sd(TO_ROW * row,STATS * projection,inT16 projection_left,inT16 projection_right,float space_size,float initial_pitch,float & sp_sd,inT16 & mid_cuts,ICOORDELT_LIST * row_cells,BOOL8 testing_on,inT16 start,inT16 end)1377 float compute_pitch_sd( //find fp cells
1378 TO_ROW *row, //row to do
1379 STATS *projection, //vertical projection
1380 inT16 projection_left, //edge
1381 inT16 projection_right, //edge
1382 float space_size, //size of blank
1383 float initial_pitch, //guess at pitch
1384 float &sp_sd, //space sd
1385 inT16 &mid_cuts, //no of free cuts
1386 ICOORDELT_LIST *row_cells, //list of chop pts
1387 BOOL8 testing_on, //inidividual words
1388 inT16 start, //start of good range
1389 inT16 end //end of good range
1390 ) {
1391 inT16 occupation; //no of cells in word.
1392 //blobs
1393 BLOBNBOX_IT blob_it = row->blob_list ();
1394 BLOBNBOX_IT start_it; //start of word
1395 BLOBNBOX_IT plot_it; //for plotting
1396 inT16 blob_count; //no of blobs
1397 TBOX blob_box; //bounding box
1398 TBOX prev_box; //of super blob
1399 inT32 prev_right; //of word sync
1400 int scale_factor; //on scores for big words
1401 inT32 sp_count; //spaces
1402 FPSEGPT_LIST seg_list; //char cells
1403 FPSEGPT_IT seg_it; //iterator
1404 inT16 segpos; //position of segment
1405 inT16 cellpos; //previous cell boundary
1406 //iterator
1407 ICOORDELT_IT cell_it = row_cells;
1408 ICOORDELT *cell; //new cell
1409 double sqsum; //sum of squares
1410 double spsum; //of spaces
1411 double sp_var; //space error
1412 double word_sync; //result for word
1413 inT32 total_count; //total blobs
1414
1415 if ((pitsync_linear_version & 3) > 1) {
1416 word_sync = compute_pitch_sd2 (row, projection, projection_left,
1417 projection_right, initial_pitch,
1418 occupation, mid_cuts, row_cells,
1419 testing_on, start, end);
1420 sp_sd = occupation;
1421 return word_sync;
1422 }
1423 mid_cuts = 0;
1424 cellpos = 0;
1425 total_count = 0;
1426 sqsum = 0;
1427 sp_count = 0;
1428 spsum = 0;
1429 prev_right = -1;
1430 if (blob_it.empty ())
1431 return space_size * 10;
1432 #ifndef GRAPHICS_DISABLED
1433 if (testing_on && to_win > 0) {
1434 blob_box = blob_it.data ()->bounding_box ();
1435 projection->plot (to_win, projection_left,
1436 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
1437 }
1438 #endif
1439 start_it = blob_it;
1440 blob_count = 0;
1441 blob_box = box_next (&blob_it);//first blob
1442 blob_it.mark_cycle_pt ();
1443 do {
1444 for (; blob_count > 0; blob_count--)
1445 box_next(&start_it);
1446 do {
1447 prev_box = blob_box;
1448 blob_count++;
1449 blob_box = box_next (&blob_it);
1450 }
1451 while (!blob_it.cycled_list ()
1452 && blob_box.left () - prev_box.right () < space_size);
1453 plot_it = start_it;
1454 if (pitsync_linear_version & 3)
1455 word_sync =
1456 check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2,
1457 projection, projection_left, projection_right,
1458 row->xheight * textord_projection_scale,
1459 occupation, &seg_list, start, end);
1460 else
1461 word_sync =
1462 check_pitch_sync (&start_it, blob_count, (inT16) initial_pitch, 2,
1463 projection, &seg_list);
1464 if (testing_on) {
1465 tprintf ("Word ending at (%d,%d), len=%d, sync rating=%g, ",
1466 prev_box.right (), prev_box.top (),
1467 seg_list.length () - 1, word_sync);
1468 seg_it.set_to_list (&seg_list);
1469 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list ();
1470 seg_it.forward ()) {
1471 if (seg_it.data ()->faked)
1472 tprintf ("(F)");
1473 tprintf ("%d, ", seg_it.data ()->position ());
1474 // tprintf("C=%g, s=%g, sq=%g\n",
1475 // seg_it.data()->cost_function(),
1476 // seg_it.data()->sum(),
1477 // seg_it.data()->squares());
1478 }
1479 tprintf ("\n");
1480 }
1481 #ifndef GRAPHICS_DISABLED
1482 if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
1483 plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1484 #endif
1485 seg_it.set_to_list (&seg_list);
1486 if (prev_right >= 0) {
1487 sp_var = seg_it.data ()->position () - prev_right;
1488 sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
1489 sp_var *= sp_var;
1490 spsum += sp_var;
1491 sp_count++;
1492 }
1493 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
1494 segpos = seg_it.data ()->position ();
1495 if (cell_it.empty () || segpos > cellpos + initial_pitch / 2) {
1496 //big gap
1497 while (!cell_it.empty () && segpos > cellpos + initial_pitch * 3 / 2) {
1498 cell = new ICOORDELT (cellpos + (inT16) initial_pitch, 0);
1499 cell_it.add_after_then_move (cell);
1500 cellpos += (inT16) initial_pitch;
1501 }
1502 //make new one
1503 cell = new ICOORDELT (segpos, 0);
1504 cell_it.add_after_then_move (cell);
1505 cellpos = segpos;
1506 }
1507 else if (segpos > cellpos - initial_pitch / 2) {
1508 cell = cell_it.data ();
1509 //average positions
1510 cell->set_x ((cellpos + segpos) / 2);
1511 cellpos = cell->x ();
1512 }
1513 }
1514 seg_it.move_to_last ();
1515 prev_right = seg_it.data ()->position ();
1516 if (textord_pitch_scalebigwords) {
1517 scale_factor = (seg_list.length () - 2) / 2;
1518 if (scale_factor < 1)
1519 scale_factor = 1;
1520 }
1521 else
1522 scale_factor = 1;
1523 sqsum += word_sync * scale_factor;
1524 total_count += (seg_list.length () - 1) * scale_factor;
1525 seg_list.clear ();
1526 }
1527 while (!blob_it.cycled_list ());
1528 sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
1529 return total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
1530 }
1531
1532
1533 /**********************************************************************
1534 * compute_pitch_sd2
1535 *
1536 * Use a dp algorithm to fit the character cells and return the sd of
1537 * the cell size over the row.
1538 **********************************************************************/
1539
compute_pitch_sd2(TO_ROW * row,STATS * projection,inT16 projection_left,inT16 projection_right,float initial_pitch,inT16 & occupation,inT16 & mid_cuts,ICOORDELT_LIST * row_cells,BOOL8 testing_on,inT16 start,inT16 end)1540 float compute_pitch_sd2( //find fp cells
1541 TO_ROW *row, //row to do
1542 STATS *projection, //vertical projection
1543 inT16 projection_left, //edge
1544 inT16 projection_right, //edge
1545 float initial_pitch, //guess at pitch
1546 inT16 &occupation, //no of occupied cells
1547 inT16 &mid_cuts, //no of free cuts
1548 ICOORDELT_LIST *row_cells, //list of chop pts
1549 BOOL8 testing_on, //inidividual words
1550 inT16 start, //start of good range
1551 inT16 end //end of good range
1552 ) {
1553 //blobs
1554 BLOBNBOX_IT blob_it = row->blob_list ();
1555 BLOBNBOX_IT plot_it;
1556 inT16 blob_count; //no of blobs
1557 TBOX blob_box; //bounding box
1558 FPSEGPT_LIST seg_list; //char cells
1559 FPSEGPT_IT seg_it; //iterator
1560 inT16 segpos; //position of segment
1561 //iterator
1562 ICOORDELT_IT cell_it = row_cells;
1563 ICOORDELT *cell; //new cell
1564 double word_sync; //result for word
1565
1566 mid_cuts = 0;
1567 if (blob_it.empty ()) {
1568 occupation = 0;
1569 return initial_pitch * 10;
1570 }
1571 #ifndef GRAPHICS_DISABLED
1572 if (testing_on && to_win > 0) {
1573 projection->plot (to_win, projection_left,
1574 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
1575 }
1576 #endif
1577 blob_count = 0;
1578 blob_it.mark_cycle_pt ();
1579 do {
1580 //first blob
1581 blob_box = box_next (&blob_it);
1582 blob_count++;
1583 }
1584 while (!blob_it.cycled_list ());
1585 plot_it = blob_it;
1586 word_sync = check_pitch_sync2 (&blob_it, blob_count, (inT16) initial_pitch,
1587 2, projection, projection_left,
1588 projection_right,
1589 row->xheight * textord_projection_scale,
1590 occupation, &seg_list, start, end);
1591 if (testing_on) {
1592 tprintf ("Row ending at (%d,%d), len=%d, sync rating=%g, ",
1593 blob_box.right (), blob_box.top (),
1594 seg_list.length () - 1, word_sync);
1595 seg_it.set_to_list (&seg_list);
1596 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
1597 if (seg_it.data ()->faked)
1598 tprintf ("(F)");
1599 tprintf ("%d, ", seg_it.data ()->position ());
1600 // tprintf("C=%g, s=%g, sq=%g\n",
1601 // seg_it.data()->cost_function(),
1602 // seg_it.data()->sum(),
1603 // seg_it.data()->squares());
1604 }
1605 tprintf ("\n");
1606 }
1607 #ifndef GRAPHICS_DISABLED
1608 if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
1609 plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1610 #endif
1611 seg_it.set_to_list (&seg_list);
1612 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
1613 segpos = seg_it.data ()->position ();
1614 //make new one
1615 cell = new ICOORDELT (segpos, 0);
1616 cell_it.add_after_then_move (cell);
1617 if (seg_it.at_last ())
1618 mid_cuts = seg_it.data ()->cheap_cuts ();
1619 }
1620 seg_list.clear ();
1621 return occupation > 0 ? sqrt (word_sync / occupation) : initial_pitch * 10;
1622 }
1623
1624
1625 /**********************************************************************
1626 * print_pitch_sd
1627 *
 * Debug output only: recompute the word-by-word and whole-row pitch sync
 * ratings for the given pitch and print them with tprintf.
1630 **********************************************************************/
1631
print_pitch_sd(TO_ROW * row,STATS * projection,inT16 projection_left,inT16 projection_right,float space_size,float initial_pitch)1632 void print_pitch_sd( //find fp cells
1633 TO_ROW *row, //row to do
1634 STATS *projection, //vertical projection
1635 inT16 projection_left, //edges //size of blank
1636 inT16 projection_right,
1637 float space_size,
1638 float initial_pitch //guess at pitch
1639 ) {
1640 const char *res2; //pitch result
1641 inT16 occupation; //used cells
1642 float sp_sd; //space sd
1643 //blobs
1644 BLOBNBOX_IT blob_it = row->blob_list ();
1645 BLOBNBOX_IT start_it; //start of word
1646 BLOBNBOX_IT row_start; //start of row
1647 inT16 blob_count; //no of blobs
1648 inT16 total_blob_count; //total blobs in line
1649 TBOX blob_box; //bounding box
1650 TBOX prev_box; //of super blob
1651 inT32 prev_right; //of word sync
1652 int scale_factor; //on scores for big words
1653 inT32 sp_count; //spaces
1654 FPSEGPT_LIST seg_list; //char cells
1655 FPSEGPT_IT seg_it; //iterator
1656 double sqsum; //sum of squares
1657 double spsum; //of spaces
1658 double sp_var; //space error
1659 double word_sync; //result for word
1660 double total_count; //total cuts
1661
1662 if (blob_it.empty ())
1663 return;
1664 row_start = blob_it;
1665 total_blob_count = 0;
1666
1667 total_count = 0;
1668 sqsum = 0;
1669 sp_count = 0;
1670 spsum = 0;
1671 prev_right = -1;
1672 blob_it = row_start;
1673 start_it = blob_it;
1674 blob_count = 0;
1675 blob_box = box_next (&blob_it);//first blob
1676 blob_it.mark_cycle_pt ();
1677 do {
1678 for (; blob_count > 0; blob_count--)
1679 box_next(&start_it);
1680 do {
1681 prev_box = blob_box;
1682 blob_count++;
1683 blob_box = box_next (&blob_it);
1684 }
1685 while (!blob_it.cycled_list ()
1686 && blob_box.left () - prev_box.right () < space_size);
1687 word_sync =
1688 check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2,
1689 projection, projection_left, projection_right,
1690 row->xheight * textord_projection_scale,
1691 occupation, &seg_list, 0, 0);
1692 total_blob_count += blob_count;
1693 seg_it.set_to_list (&seg_list);
1694 if (prev_right >= 0) {
1695 sp_var = seg_it.data ()->position () - prev_right;
1696 sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
1697 sp_var *= sp_var;
1698 spsum += sp_var;
1699 sp_count++;
1700 }
1701 seg_it.move_to_last ();
1702 prev_right = seg_it.data ()->position ();
1703 if (textord_pitch_scalebigwords) {
1704 scale_factor = (seg_list.length () - 2) / 2;
1705 if (scale_factor < 1)
1706 scale_factor = 1;
1707 }
1708 else
1709 scale_factor = 1;
1710 sqsum += word_sync * scale_factor;
1711 total_count += (seg_list.length () - 1) * scale_factor;
1712 seg_list.clear ();
1713 }
1714 while (!blob_it.cycled_list ());
1715 sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
1716 word_sync = total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
1717 tprintf ("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:",
1718 word_sync, word_sync / initial_pitch, sp_sd,
1719 word_sync < textord_words_pitchsd_threshold * initial_pitch
1720 ? 'F' : 'P');
1721
1722 start_it = row_start;
1723 blob_it = row_start;
1724 word_sync =
1725 check_pitch_sync2 (&blob_it, total_blob_count, (inT16) initial_pitch, 2,
1726 projection, projection_left, projection_right,
1727 row->xheight * textord_projection_scale, occupation,
1728 &seg_list, 0, 0);
1729 if (occupation > 1)
1730 word_sync /= occupation;
1731 word_sync = sqrt (word_sync);
1732
1733 #ifndef GRAPHICS_DISABLED
1734 if (textord_show_row_cuts && to_win != NULL)
1735 plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list);
1736 #endif
1737 seg_list.clear ();
1738 if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {
1739 if (word_sync < textord_words_def_fixed * initial_pitch
1740 && !row->all_caps)
1741 res2 = "DF";
1742 else
1743 res2 = "MF";
1744 }
1745 else
1746 res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
1747 tprintf
1748 ("row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, all_caps=%d\n",
1749 word_sync, word_sync / initial_pitch,
1750 word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P',
1751 occupation, res2, initial_pitch, row->fixed_pitch, row->all_caps);
1752 }
1753
1754
1755 /**********************************************************************
 * sort_floats2
 *
 * qsort comparison function for two floats (ascending order).
1759 **********************************************************************/
1760
// qsort-style comparator for floats. Returns 1 when *arg1 - *arg2 is
// positive, -1 when it is negative and 0 otherwise.
int sort_floats2(                   //qsort function
                 const void *arg1,  //ptrs to floats
                 const void *arg2) {
  const float lhs = *static_cast<const float *>(arg1);
  const float rhs = *static_cast<const float *>(arg2);
  const float delta = lhs - rhs;    //difference

  // Each comparison yields 0 or 1, so the result is -1, 0 or 1.
  return (delta > 0) - (delta < 0);
}
1774
1775
1776 /**********************************************************************
1777 * find_repeated_chars
1778 *
1779 * Find 4 or more adjacent chars which are the same and put them
1780 * into words in advance of fixed pitch checking and word generation.
1781 **********************************************************************/
find_repeated_chars(TO_BLOCK * block,BOOL8 testing_on,tesseract::Tesseract * tess)1782 void find_repeated_chars( //search for equal chars
1783 TO_BLOCK *block, //block to search
1784 BOOL8 testing_on, //dbug mode
1785 tesseract::Tesseract* tess
1786 ) {
1787 TO_ROW *row;
1788 BLOBNBOX_IT box_it;
1789 BLOBNBOX_IT search_it; // forward search
1790 WERD_IT word_it; //new words
1791 WERD *word; //new word
1792 TBOX word_box; //for plotting
1793 int blobcount, repeated_set;
1794
1795 TO_ROW_IT row_it = block->get_rows();
1796 if (row_it.empty()) return; // empty block
1797 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1798 row = row_it.data ();
1799 box_it.set_to_list (row->blob_list ());
1800 if (box_it.empty()) continue; // no blobs in this row
1801 if (!row->rep_chars_marked()) {
1802 mark_repeated_chars(row, block->xheight, tess);
1803 }
1804 if (row->num_repeated_sets() == 0) continue; // nothing to do for this row
1805 word_it.set_to_list (&row->rep_words);
1806 do {
1807 if (box_it.data()->repeated_set() != 0 &&
1808 !box_it.data()->joined_to_prev()) {
1809 blobcount = 1;
1810 repeated_set = box_it.data()->repeated_set();
1811 search_it = box_it;
1812 search_it.forward ();
1813 while (!search_it.at_first() &&
1814 search_it.data()->repeated_set() == repeated_set) {
1815 blobcount++;
1816 search_it.forward ();
1817 }
1818 // After the call to make_real_word() all the blobs from this
1819 // repeated set will be removed from the blob list. box_it will be
1820 // set to point to the blob after the end of the extracted sequence.
1821 word = make_real_word(&box_it, blobcount,
1822 box_it.at_first(), false, false, 1);
1823 #ifndef GRAPHICS_DISABLED
1824 if (testing_on) {
1825 word_box = word->bounding_box ();
1826 tprintf("Found repeated word of %d blobs from (%d,%d)->(%d,%d)\n",
1827 blobcount, word_box.left(), word_box.bottom(),
1828 word_box.right(), word_box.top());
1829 //perimeter_color_index(to_win, RED);
1830 to_win->Pen(255,0,0);
1831 //interior_style(to_win, INT_HOLLOW, TRUE);
1832 to_win->Rectangle(word_box.left(), word_box.bottom(),
1833 word_box.right(), word_box.top());
1834 }
1835 #endif
1836 word->set_flag(W_REP_CHAR, true);
1837 word->set_flag(W_DONT_CHOP, true);
1838 word_it.add_after_then_move (word);
1839 } else {
1840 box_it.forward();
1841 }
1842 } while (!box_it.at_first());
1843 }
1844 }
1845
1846
1847 /**********************************************************************
1848 * plot_fp_word
1849 *
1850 * Plot a block of words as if fixed pitch.
1851 **********************************************************************/
1852
1853 #ifndef GRAPHICS_DISABLED
plot_fp_word(TO_BLOCK * block,float pitch,float nonspace)1854 void plot_fp_word( //draw block of words
1855 TO_BLOCK *block, //block to draw
1856 float pitch, //pitch to draw with
1857 float nonspace //for space threshold
1858 ) {
1859 TO_ROW *row; //current row
1860 TO_ROW_IT row_it = block->get_rows ();
1861
1862 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1863 row = row_it.data ();
1864 row->min_space = (inT32) ((pitch + nonspace) / 2);
1865 row->max_nonspace = row->min_space;
1866 row->space_threshold = row->min_space;
1867 plot_word_decisions (to_win, (inT16) pitch, row);
1868 }
1869 }
1870 #endif
1871