1 /**********************************************************************
2 * File: wordseg.cpp (Formerly wspace.c)
3 * Description: Code to segment the blobs into words.
4 * Author: Ray Smith
5 * Created: Fri Oct 16 11:32:28 BST 1992
6 *
7 * (C) Copyright 1992, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #include "mfcpch.h"
21 #ifdef __UNIX__
22 #include <assert.h>
23 #endif
24 #include "stderr.h"
25 #include "blobbox.h"
26 #include "ocrclass.h"
27 #include "lmedsq.h"
28 #include "statistc.h"
29 #include "drawtord.h"
30 #include "makerow.h"
31 #include "pitsync1.h"
32 #include "blobcmpl.h"
33 #include "tovars.h"
34 #include "topitch.h"
35 #include "tospace.h"
36 #include "fpchop.h"
37 #include "wordseg.h"
38
39 #define EXTERN
40
41 EXTERN BOOL_VAR (textord_fp_chopping, TRUE, "Do fixed pitch chopping");
42 EXTERN BOOL_VAR (textord_force_make_prop_words, FALSE,
43 "Force proportional word segmentation on all rows");
44 EXTERN BOOL_VAR (textord_chopper_test, FALSE,
45 "Chopper is being tested.");
46 extern /*"C" */ ETEXT_DESC *global_monitor; //progress monitor
47
48 #define FIXED_WIDTH_MULTIPLE 5
49 #define BLOCK_STATS_CLUSTERS 10
50
51
52 /**********************************************************************
53 * make_single_word
54 *
55 * Arrange the blobs into one word. There is no fixed pitch detection.
56 **********************************************************************/
57
make_single_word(bool one_blob,TO_ROW_LIST * rows,ROW_LIST * real_rows)58 void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
59 TO_ROW_IT to_row_it(rows);
60 TO_ROW* row = to_row_it.data();
61 // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
62 // to create the word.
63 C_BLOB_LIST cblobs;
64 C_BLOB_IT cblob_it(&cblobs);
65 BLOBNBOX_IT box_it(row->blob_list());
66 for (;!box_it.empty(); box_it.forward()) {
67 BLOBNBOX* bblob= box_it.extract();
68 if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
69 if (bblob->cblob() != NULL) {
70 C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
71 cout_it.move_to_last();
72 cout_it.add_list_after(bblob->cblob()->out_list());
73 delete bblob->cblob();
74 }
75 } else {
76 if (bblob->cblob() != NULL)
77 cblob_it.add_after_then_move(bblob->cblob());
78 delete bblob;
79 }
80 }
81 // Convert the TO_ROW to a ROW.
82 ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
83 static_cast<inT16>(row->space_size));
84 WERD_IT word_it(real_row->word_list());
85 WERD* word = new WERD(&cblobs, 0, NULL);
86 word->set_flag(W_BOL, TRUE);
87 word->set_flag(W_EOL, TRUE);
88 word_it.add_after_then_move(word);
89 ROW_IT row_it(real_rows);
90 row_it.add_after_then_move(real_row);
91 }
92
93 /**********************************************************************
94 * make_words
95 *
96 * Arrange the blobs into words.
97 **********************************************************************/
98
make_words(ICOORD page_tr,float gradient,BLOCK_LIST * blocks,TO_BLOCK_LIST * land_blocks,TO_BLOCK_LIST * port_blocks,tesseract::Tesseract * tess)99 void make_words( //make words
100 ICOORD page_tr, //top right
101 float gradient, //page skew
102 BLOCK_LIST *blocks, //block list
103 TO_BLOCK_LIST *land_blocks, //rotated for landscape
104 TO_BLOCK_LIST *port_blocks, //output list
105 tesseract::Tesseract* tess
106 ) {
107 TO_BLOCK_IT block_it; //iterator
108 TO_BLOCK *block; //current block;
109
110 compute_fixed_pitch (page_tr, port_blocks, gradient, FCOORD (0.0f, -1.0f),
111 !(BOOL8) textord_test_landscape, tess);
112 if (global_monitor != NULL) {
113 global_monitor->ocr_alive = TRUE;
114 global_monitor->progress = 25;
115 }
116 to_spacing(page_tr, port_blocks);
117 block_it.set_to_list (port_blocks);
118 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
119 block_it.forward ()) {
120 block = block_it.data ();
121 // set_row_spaces(block,FCOORD(1,0),!(BOOL8)textord_test_landscape);
122 //make proper classes
123 make_real_words (block, FCOORD (1.0f, 0.0f));
124 }
125 }
126
127
128 /**********************************************************************
129 * set_row_spaces
130 *
131 * Set the min_space and max_nonspace members of the row so that
132 * the blobs can be arranged into words.
133 **********************************************************************/
134
set_row_spaces(TO_BLOCK * block,FCOORD rotation,BOOL8 testing_on)135 void set_row_spaces( //find space sizes
136 TO_BLOCK *block, //block to do
137 FCOORD rotation, //for drawing
138 BOOL8 testing_on //correct orientation
139 ) {
140 inT32 maxwidth; //of widest space
141 TO_ROW *row; //current row
142 TO_ROW_IT row_it = block->get_rows ();
143
144 if (row_it.empty ())
145 return; //empty block
146 maxwidth = (inT32) ceil (block->xheight * textord_words_maxspace);
147 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
148 row = row_it.data ();
149 if (row->fixed_pitch == 0) {
150 // if (!textord_test_mode
151 // && row_words(block,row,maxwidth,rotation,testing_on)==0
152 // || textord_test_mode
153 // && row_words2(block,row,maxwidth,rotation,testing_on)==0)
154 // {
155 row->min_space =
156 (inT32) ceil (row->pr_space -
157 (row->pr_space -
158 row->pr_nonsp) * textord_words_definite_spread);
159 row->max_nonspace =
160 (inT32) floor (row->pr_nonsp +
161 (row->pr_space -
162 row->pr_nonsp) * textord_words_definite_spread);
163 if (testing_on && textord_show_initial_words) {
164 tprintf ("Assigning defaults %d non, %d space to row at %g\n",
165 row->max_nonspace, row->min_space, row->intercept ());
166 }
167 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
168 row->space_size = row->pr_space;
169 row->kern_size = row->pr_nonsp;
170 // }
171 }
172 #ifndef GRAPHICS_DISABLED
173 if (textord_show_initial_words && testing_on) {
174 plot_word_decisions (to_win, (inT16) row->fixed_pitch, row);
175 }
176 #endif
177 }
178 }
179
180
181 /**********************************************************************
182 * row_words
183 *
184 * Compute the max nonspace and min space for the row.
185 **********************************************************************/
186
row_words(TO_BLOCK * block,TO_ROW * row,inT32 maxwidth,FCOORD rotation,BOOL8 testing_on)187 inT32 row_words( //compute space size
188 TO_BLOCK *block, //block it came from
189 TO_ROW *row, //row to operate on
190 inT32 maxwidth, //max expected space size
191 FCOORD rotation, //for drawing
192 BOOL8 testing_on //for debug
193 ) {
194 BOOL8 testing_row; //contains testpt
195 BOOL8 prev_valid; //if decent size
196 BOOL8 this_valid; //current blob big enough
197 inT32 prev_x; //end of prev blob
198 inT32 min_gap; //min interesting gap
199 inT32 cluster_count; //no of clusters
200 inT32 gap_index; //which cluster
201 inT32 smooth_factor; //for smoothing stats
202 BLOBNBOX *blob; //current blob
203 float lower, upper; //clustering parameters
204 float gaps[3]; //gap clusers
205 ICOORD testpt;
206 TBOX blob_box; //bounding box
207 //iterator
208 BLOBNBOX_IT blob_it = row->blob_list ();
209 STATS gap_stats (0, maxwidth);
210 STATS cluster_stats[4]; //clusters
211
212 testpt = ICOORD (textord_test_x, textord_test_y);
213 smooth_factor =
214 (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
215 // if (testing_on)
216 // tprintf("Row smooth factor=%d\n",smooth_factor);
217 prev_valid = FALSE;
218 prev_x = -MAX_INT32;
219 testing_row = FALSE;
220 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
221 blob = blob_it.data ();
222 blob_box = blob->bounding_box ();
223 if (blob_box.contains (testpt))
224 testing_row = TRUE;
225 gap_stats.add (blob_box.width (), 1);
226 }
227 min_gap = (inT32) floor (gap_stats.ile (textord_words_width_ile));
228 gap_stats.clear ();
229 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
230 blob = blob_it.data ();
231 if (!blob->joined_to_prev ()) {
232 blob_box = blob->bounding_box ();
233 // this_valid=blob_box.width()>=min_gap;
234 this_valid = TRUE;
235 if (this_valid && prev_valid
236 && blob_box.left () - prev_x < maxwidth) {
237 gap_stats.add (blob_box.left () - prev_x, 1);
238 }
239 prev_x = blob_box.right ();
240 prev_valid = this_valid;
241 }
242 }
243 if (gap_stats.get_total () == 0) {
244 row->min_space = 0; //no evidence
245 row->max_nonspace = 0;
246 return 0;
247 }
248 gap_stats.smooth (smooth_factor);
249 lower = row->xheight * textord_words_initial_lower;
250 upper = row->xheight * textord_words_initial_upper;
251 cluster_count = gap_stats.cluster (lower, upper,
252 textord_spacesize_ratioprop, 3,
253 cluster_stats);
254 while (cluster_count < 2 && ceil (lower) < floor (upper)) {
255 //shrink gap
256 upper = (upper * 3 + lower) / 4;
257 lower = (lower * 3 + upper) / 4;
258 cluster_count = gap_stats.cluster (lower, upper,
259 textord_spacesize_ratioprop, 3,
260 cluster_stats);
261 }
262 if (cluster_count < 2) {
263 row->min_space = 0; //no evidence
264 row->max_nonspace = 0;
265 return 0;
266 }
267 for (gap_index = 0; gap_index < cluster_count; gap_index++)
268 gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
269 //get medians
270 if (cluster_count > 2) {
271 if (testing_on && textord_show_initial_words) {
272 tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n",
273 row->intercept (),
274 cluster_stats[1].ile (0.5),
275 cluster_stats[2].ile (0.5), cluster_stats[3].ile (0.5));
276 }
277 lower = gaps[0];
278 if (gaps[1] > lower) {
279 upper = gaps[1]; //prefer most frequent
280 if (upper < block->xheight * textord_words_min_minspace
281 && gaps[2] > gaps[1]) {
282 upper = gaps[2];
283 }
284 }
285 else if (gaps[2] > lower
286 && gaps[2] >= block->xheight * textord_words_min_minspace)
287 upper = gaps[2];
288 else if (lower >= block->xheight * textord_words_min_minspace) {
289 upper = lower; //not nice
290 lower = gaps[1];
291 if (testing_on && textord_show_initial_words) {
292 tprintf ("Had to switch most common from lower to upper!!\n");
293 gap_stats.print (stdout, TRUE);
294 }
295 }
296 else {
297 row->min_space = 0; //no evidence
298 row->max_nonspace = 0;
299 return 0;
300 }
301 }
302 else {
303 if (gaps[1] < gaps[0]) {
304 if (testing_on && textord_show_initial_words) {
305 tprintf ("Had to switch most common from lower to upper!!\n");
306 gap_stats.print (stdout, TRUE);
307 }
308 lower = gaps[1];
309 upper = gaps[0];
310 }
311 else {
312 upper = gaps[1];
313 lower = gaps[0];
314 }
315 }
316 if (upper < block->xheight * textord_words_min_minspace) {
317 row->min_space = 0; //no evidence
318 row->max_nonspace = 0;
319 return 0;
320 }
321 if (upper * 3 < block->min_space * 2 + block->max_nonspace
322 || lower * 3 > block->min_space * 2 + block->max_nonspace) {
323 if (testing_on && textord_show_initial_words) {
324 tprintf ("Disagreement between block and row at %g!!\n",
325 row->intercept ());
326 tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper);
327 gap_stats.print (stdout, TRUE);
328 }
329 }
330 row->min_space =
331 (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
332 row->max_nonspace =
333 (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
334 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
335 row->space_size = upper;
336 row->kern_size = lower;
337 if (testing_on && textord_show_initial_words) {
338 if (testing_row) {
339 tprintf ("GAP STATS\n");
340 gap_stats.print (stdout, TRUE);
341 tprintf ("SPACE stats\n");
342 cluster_stats[2].print (stdout, FALSE);
343 tprintf ("NONSPACE stats\n");
344 cluster_stats[1].print (stdout, FALSE);
345 }
346 tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
347 row->intercept (), row->min_space, upper,
348 row->max_nonspace, lower);
349 }
350 return cluster_stats[2].get_total ();
351 }
352
353
354 /**********************************************************************
355 * row_words2
356 *
357 * Compute the max nonspace and min space for the row.
358 **********************************************************************/
359
row_words2(TO_BLOCK * block,TO_ROW * row,inT32 maxwidth,FCOORD rotation,BOOL8 testing_on)360 inT32 row_words2( //compute space size
361 TO_BLOCK *block, //block it came from
362 TO_ROW *row, //row to operate on
363 inT32 maxwidth, //max expected space size
364 FCOORD rotation, //for drawing
365 BOOL8 testing_on //for debug
366 ) {
367 BOOL8 testing_row; //contains testpt
368 BOOL8 prev_valid; //if decent size
369 BOOL8 this_valid; //current blob big enough
370 inT32 prev_x; //end of prev blob
371 inT32 min_width; //min interesting width
372 inT32 valid_count; //good gaps
373 inT32 total_count; //total gaps
374 inT32 cluster_count; //no of clusters
375 inT32 prev_count; //previous cluster_count
376 inT32 gap_index; //which cluster
377 inT32 smooth_factor; //for smoothing stats
378 BLOBNBOX *blob; //current blob
379 float lower, upper; //clustering parameters
380 ICOORD testpt;
381 TBOX blob_box; //bounding box
382 //iterator
383 BLOBNBOX_IT blob_it = row->blob_list ();
384 STATS gap_stats (0, maxwidth);
385 //gap sizes
386 float gaps[BLOCK_STATS_CLUSTERS];
387 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
388 //clusters
389
390 testpt = ICOORD (textord_test_x, textord_test_y);
391 smooth_factor =
392 (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
393 // if (testing_on)
394 // tprintf("Row smooth factor=%d\n",smooth_factor);
395 prev_valid = FALSE;
396 prev_x = -MAX_INT16;
397 testing_row = FALSE;
398 //min blob size
399 min_width = (inT32) block->pr_space;
400 total_count = 0;
401 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
402 blob = blob_it.data ();
403 if (!blob->joined_to_prev ()) {
404 blob_box = blob->bounding_box ();
405 this_valid = blob_box.width () >= min_width;
406 this_valid = TRUE;
407 if (this_valid && prev_valid
408 && blob_box.left () - prev_x < maxwidth) {
409 gap_stats.add (blob_box.left () - prev_x, 1);
410 }
411 total_count++; //count possibles
412 prev_x = blob_box.right ();
413 prev_valid = this_valid;
414 }
415 }
416 valid_count = gap_stats.get_total ();
417 if (valid_count < total_count * textord_words_minlarge) {
418 gap_stats.clear ();
419 prev_x = -MAX_INT16;
420 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
421 blob_it.forward ()) {
422 blob = blob_it.data ();
423 if (!blob->joined_to_prev ()) {
424 blob_box = blob->bounding_box ();
425 if (blob_box.left () - prev_x < maxwidth) {
426 gap_stats.add (blob_box.left () - prev_x, 1);
427 }
428 prev_x = blob_box.right ();
429 }
430 }
431 }
432 if (gap_stats.get_total () == 0) {
433 row->min_space = 0; //no evidence
434 row->max_nonspace = 0;
435 return 0;
436 }
437
438 cluster_count = 0;
439 lower = block->xheight * words_initial_lower;
440 upper = block->xheight * words_initial_upper;
441 gap_stats.smooth (smooth_factor);
442 do {
443 prev_count = cluster_count;
444 cluster_count = gap_stats.cluster (lower, upper,
445 textord_spacesize_ratioprop,
446 BLOCK_STATS_CLUSTERS, cluster_stats);
447 }
448 while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
449 if (cluster_count < 1) {
450 row->min_space = 0;
451 row->max_nonspace = 0;
452 return 0;
453 }
454 for (gap_index = 0; gap_index < cluster_count; gap_index++)
455 gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
456 //get medians
457 if (testing_on) {
458 tprintf ("cluster_count=%d:", cluster_count);
459 for (gap_index = 0; gap_index < cluster_count; gap_index++)
460 tprintf (" %g(%d)", gaps[gap_index],
461 cluster_stats[gap_index + 1].get_total ());
462 tprintf ("\n");
463 }
464
465 //Try to find proportional non-space and space for row.
466 for (gap_index = 0; gap_index < cluster_count
467 && gaps[gap_index] > block->max_nonspace; gap_index++);
468 if (gap_index < cluster_count)
469 lower = gaps[gap_index]; //most frequent below
470 else {
471 if (testing_on)
472 tprintf ("No cluster below block threshold!, using default=%g\n",
473 block->pr_nonsp);
474 lower = block->pr_nonsp;
475 }
476 for (gap_index = 0; gap_index < cluster_count
477 && gaps[gap_index] <= block->max_nonspace; gap_index++);
478 if (gap_index < cluster_count)
479 upper = gaps[gap_index]; //most frequent above
480 else {
481 if (testing_on)
482 tprintf ("No cluster above block threshold!, using default=%g\n",
483 block->pr_space);
484 upper = block->pr_space;
485 }
486 row->min_space =
487 (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
488 row->max_nonspace =
489 (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
490 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
491 row->space_size = upper;
492 row->kern_size = lower;
493 if (testing_on) {
494 if (testing_row) {
495 tprintf ("GAP STATS\n");
496 gap_stats.print (stdout, TRUE);
497 tprintf ("SPACE stats\n");
498 cluster_stats[2].print (stdout, FALSE);
499 tprintf ("NONSPACE stats\n");
500 cluster_stats[1].print (stdout, FALSE);
501 }
502 tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
503 row->intercept (), row->min_space, upper,
504 row->max_nonspace, lower);
505 }
506 return 1;
507 }
508
509
510 /**********************************************************************
511 * make_real_words
512 *
513 * Convert a TO_BLOCK to a BLOCK.
514 **********************************************************************/
515
make_real_words(TO_BLOCK * block,FCOORD rotation)516 void make_real_words( //find lines
517 TO_BLOCK *block, //block to do
518 FCOORD rotation //for drawing
519 ) {
520 TO_ROW *row; //current row
521 TO_ROW_IT row_it = block->get_rows ();
522 ROW *real_row = NULL; //output row
523 ROW_IT real_row_it = block->block->row_list ();
524
525 if (row_it.empty ())
526 return; //empty block
527 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
528 row = row_it.data ();
529 if (row->blob_list ()->empty () && !row->rep_words.empty ()) {
530 real_row = make_rep_words (row, block);
531 }
532 else if (!row->blob_list ()->empty ()) {
533 // In a fixed pitch document, some lines may be detected as fixed pitch
534 // while others don't, and will go through different path.
535 // For non-space delimited language like CJK, fixed pitch chop always
536 // leave the entire line as one word. We can force consistent chopping
537 // with force_make_prop_words flag.
538 if (textord_chopper_test) {
539 real_row = make_blob_words (row, rotation);
540 } else if (textord_force_make_prop_words ||
541 row->pitch_decision == PITCH_DEF_PROP ||
542 row->pitch_decision == PITCH_CORR_PROP) {
543 real_row = make_prop_words (row, rotation);
544 } else if (row->pitch_decision == PITCH_DEF_FIXED ||
545 row->pitch_decision == PITCH_CORR_FIXED) {
546 real_row = fixed_pitch_words (row, rotation);
547 } else
548 ASSERT_HOST(FALSE);
549 }
550 if (real_row != NULL) {
551 //put row in block
552 real_row_it.add_after_then_move (real_row);
553 }
554 }
555 block->block->set_stats (block->fixed_pitch == 0, (inT16) block->kern_size,
556 (inT16) block->space_size,
557 (inT16) block->fixed_pitch);
558 block->block->check_pitch ();
559 }
560
561
562 /**********************************************************************
563 * make_rep_words
564 *
565 * Fabricate a real row from only the repeated blob words.
566 * Get the xheight from the block as it may be more meaningful.
567 **********************************************************************/
568
make_rep_words(TO_ROW * row,TO_BLOCK * block)569 ROW *make_rep_words( //make a row
570 TO_ROW *row, //row to convert
571 TO_BLOCK *block //block it lives in
572 ) {
573 inT32 xstarts[2]; //ends of row
574 ROW *real_row; //output row
575 TBOX word_box; //bounding box
576 double coeffs[3]; //spline
577 //iterator
578 WERD_IT word_it = &row->rep_words;
579
580 if (word_it.empty ())
581 return NULL;
582 word_box = word_it.data ()->bounding_box ();
583 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ())
584 word_box += word_it.data ()->bounding_box ();
585 xstarts[0] = word_box.left ();
586 xstarts[1] = word_box.right ();
587 coeffs[0] = 0;
588 coeffs[1] = row->line_m ();
589 coeffs[2] = row->line_c ();
590 row->xheight = block->xheight;
591 real_row = new ROW(row,
592 (inT16) block->kern_size, (inT16) block->space_size);
593 word_it.set_to_list (real_row->word_list ());
594 //put words in row
595 word_it.add_list_after (&row->rep_words);
596 real_row->recalc_bounding_box ();
597 return real_row;
598 }
599
600
601 /**********************************************************************
602 * make_real_word
603 *
604 * Construct a WERD from a given number of adjacent entries in a
605 * list of BLOBNBOXs.
606 **********************************************************************/
607
make_real_word(BLOBNBOX_IT * box_it,inT32 blobcount,BOOL8 bol,BOOL8 fuzzy_sp,BOOL8 fuzzy_non,uinT8 blanks)608 WERD *make_real_word( //make a WERD
609 BLOBNBOX_IT *box_it, //iterator
610 inT32 blobcount, //no of blobs to use
611 BOOL8 bol, //start of line
612 BOOL8 fuzzy_sp, //fuzzy space
613 BOOL8 fuzzy_non, //fuzzy non-space
614 uinT8 blanks //no of blanks
615 ) {
616 OUTLINE_IT out_it; //outlines
617 C_OUTLINE_IT cout_it;
618 PBLOB_LIST blobs; //blobs in word
619 C_BLOB_LIST cblobs;
620 PBLOB_IT blob_it = &blobs; //iterator
621 C_BLOB_IT cblob_it = &cblobs;
622 WERD *word; //new word
623 BLOBNBOX *bblob; //current blob
624 inT32 blobindex; //in row
625
626 for (blobindex = 0; blobindex < blobcount; blobindex++) {
627 bblob = box_it->extract ();
628 if (bblob->joined_to_prev ()) {
629 if (bblob->blob () != NULL) {
630 out_it.set_to_list (blob_it.data ()->out_list ());
631 out_it.move_to_last ();
632 out_it.add_list_after (bblob->blob ()->out_list ());
633 delete bblob->blob ();
634 }
635 else if (bblob->cblob () != NULL) {
636 cout_it.set_to_list (cblob_it.data ()->out_list ());
637 cout_it.move_to_last ();
638 cout_it.add_list_after (bblob->cblob ()->out_list ());
639 delete bblob->cblob ();
640 }
641 }
642 else {
643 if (bblob->blob () != NULL)
644 blob_it.add_after_then_move (bblob->blob ());
645 else if (bblob->cblob () != NULL)
646 cblob_it.add_after_then_move (bblob->cblob ());
647 }
648 delete bblob;
649 box_it->forward (); //next one
650 }
651
652 if (blanks < 1)
653 blanks = 1;
654 if (!blob_it.empty ()) {
655 //make real word
656 word = new WERD (&blobs, blanks, NULL);
657 }
658 else {
659 word = new WERD (&cblobs, blanks, NULL);
660 }
661 if (bol) {
662 word->set_flag (W_BOL, TRUE);
663 }
664 if (fuzzy_sp)
665 //probably space
666 word->set_flag (W_FUZZY_SP, TRUE);
667 else if (fuzzy_non)
668 //probably not
669 word->set_flag (W_FUZZY_NON, TRUE);
670 if (box_it->at_first ()) {
671 word->set_flag (W_EOL, TRUE);//at end of line
672 }
673 return word;
674 }
675
676