• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**********************************************************************
2  * File:        wordseg.cpp  (Formerly wspace.c)
3  * Description: Code to segment the blobs into words.
4  * Author:		Ray Smith
5  * Created:		Fri Oct 16 11:32:28 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #ifdef __UNIX__
22 #include          <assert.h>
23 #endif
24 #include          "stderr.h"
25 #include          "blobbox.h"
26 #include          "ocrclass.h"
27 #include          "lmedsq.h"
28 #include          "statistc.h"
29 #include          "drawtord.h"
30 #include          "makerow.h"
31 #include          "pitsync1.h"
32 #include          "blobcmpl.h"
33 #include          "tovars.h"
34 #include          "topitch.h"
35 #include          "tospace.h"
36 #include          "fpchop.h"
37 #include          "wordseg.h"
38 
// EXTERN is defined as empty in this file, so it adds nothing to the
// statements it prefixes below.
// NOTE(review): presumably this makes the BOOL_VAR lines definitions rather
// than declarations - confirm against the BOOL_VAR macro.
39 #define EXTERN
40 
// Boolean parameters: name, default value, description string.
// textord_force_make_prop_words and textord_chopper_test are read by
// make_real_words() in this file to pick the word-making routine.
41 EXTERN BOOL_VAR (textord_fp_chopping, TRUE, "Do fixed pitch chopping");
42 EXTERN BOOL_VAR (textord_force_make_prop_words, FALSE,
43                  "Force proportional word segmentation on all rows");
44 EXTERN BOOL_VAR (textord_chopper_test, FALSE,
45                  "Chopper is being tested.");
// Defined elsewhere; make_words() updates it when it is not NULL.
46 extern /*"C" */ ETEXT_DESC *global_monitor;     //progress monitor
47 
48 #define FIXED_WIDTH_MULTIPLE  5
// Maximum number of gap clusters requested by row_words2(); also sizes its
// gaps[] and cluster_stats[] arrays.
49 #define BLOCK_STATS_CLUSTERS  10
50 
51 
52 /**********************************************************************
53  * make_single_word
54  *
55  * Arrange the blobs into one word. There is no fixed pitch detection.
56  **********************************************************************/
57 
make_single_word(bool one_blob,TO_ROW_LIST * rows,ROW_LIST * real_rows)58 void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
59   TO_ROW_IT to_row_it(rows);
60   TO_ROW* row = to_row_it.data();
61   // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
62   // to create the word.
63   C_BLOB_LIST cblobs;
64   C_BLOB_IT cblob_it(&cblobs);
65   BLOBNBOX_IT box_it(row->blob_list());
66   for (;!box_it.empty(); box_it.forward()) {
67     BLOBNBOX* bblob= box_it.extract();
68     if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
69       if (bblob->cblob() != NULL) {
70         C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
71         cout_it.move_to_last();
72         cout_it.add_list_after(bblob->cblob()->out_list());
73         delete bblob->cblob();
74       }
75     } else {
76       if (bblob->cblob() != NULL)
77         cblob_it.add_after_then_move(bblob->cblob());
78       delete bblob;
79     }
80   }
81   // Convert the TO_ROW to a ROW.
82   ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
83                           static_cast<inT16>(row->space_size));
84   WERD_IT word_it(real_row->word_list());
85   WERD* word = new WERD(&cblobs, 0, NULL);
86   word->set_flag(W_BOL, TRUE);
87   word->set_flag(W_EOL, TRUE);
88   word_it.add_after_then_move(word);
89   ROW_IT row_it(real_rows);
90   row_it.add_after_then_move(real_row);
91 }
92 
93 /**********************************************************************
94  * make_words
95  *
96  * Arrange the blobs into words.
97  **********************************************************************/
98 
make_words(ICOORD page_tr,float gradient,BLOCK_LIST * blocks,TO_BLOCK_LIST * land_blocks,TO_BLOCK_LIST * port_blocks,tesseract::Tesseract * tess)99 void make_words(                             //make words
100                 ICOORD page_tr,              //top right
101                 float gradient,              //page skew
102                 BLOCK_LIST *blocks,          //block list
103                 TO_BLOCK_LIST *land_blocks,  //rotated for landscape
104                 TO_BLOCK_LIST *port_blocks,  //output list
105                 tesseract::Tesseract* tess
106                ) {
107   TO_BLOCK_IT block_it;          //iterator
108   TO_BLOCK *block;               //current block;
109 
110   compute_fixed_pitch (page_tr, port_blocks, gradient, FCOORD (0.0f, -1.0f),
111     !(BOOL8) textord_test_landscape, tess);
112   if (global_monitor != NULL) {
113     global_monitor->ocr_alive = TRUE;
114     global_monitor->progress = 25;
115   }
116   to_spacing(page_tr, port_blocks);
117   block_it.set_to_list (port_blocks);
118   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
119   block_it.forward ()) {
120     block = block_it.data ();
121     //              set_row_spaces(block,FCOORD(1,0),!(BOOL8)textord_test_landscape);
122                                  //make proper classes
123     make_real_words (block, FCOORD (1.0f, 0.0f));
124   }
125 }
126 
127 
128 /**********************************************************************
129  * set_row_spaces
130  *
131  * Set the min_space and max_nonspace members of the row so that
132  * the blobs can be arranged into words.
133  **********************************************************************/
134 
set_row_spaces(TO_BLOCK * block,FCOORD rotation,BOOL8 testing_on)135 void set_row_spaces(                  //find space sizes
136                     TO_BLOCK *block,  //block to do
137                     FCOORD rotation,  //for drawing
138                     BOOL8 testing_on  //correct orientation
139                    ) {
140   inT32 maxwidth;                //of widest space
141   TO_ROW *row;                   //current row
142   TO_ROW_IT row_it = block->get_rows ();
143 
144   if (row_it.empty ())
145     return;                      //empty block
146   maxwidth = (inT32) ceil (block->xheight * textord_words_maxspace);
147   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
148     row = row_it.data ();
149     if (row->fixed_pitch == 0) {
150       //                      if (!textord_test_mode
151       //                      && row_words(block,row,maxwidth,rotation,testing_on)==0
152       //                      || textord_test_mode
153       //                      && row_words2(block,row,maxwidth,rotation,testing_on)==0)
154       //                      {
155       row->min_space =
156         (inT32) ceil (row->pr_space -
157         (row->pr_space -
158         row->pr_nonsp) * textord_words_definite_spread);
159       row->max_nonspace =
160         (inT32) floor (row->pr_nonsp +
161         (row->pr_space -
162         row->pr_nonsp) * textord_words_definite_spread);
163       if (testing_on && textord_show_initial_words) {
164         tprintf ("Assigning defaults %d non, %d space to row at %g\n",
165           row->max_nonspace, row->min_space, row->intercept ());
166       }
167       row->space_threshold = (row->max_nonspace + row->min_space) / 2;
168       row->space_size = row->pr_space;
169       row->kern_size = row->pr_nonsp;
170       //                      }
171     }
172 #ifndef GRAPHICS_DISABLED
173     if (textord_show_initial_words && testing_on) {
174       plot_word_decisions (to_win, (inT16) row->fixed_pitch, row);
175     }
176 #endif
177   }
178 }
179 
180 
181 /**********************************************************************
182  * row_words
183  *
184  * Compute the max nonspace and min space for the row.
185  **********************************************************************/
186 
row_words(TO_BLOCK * block,TO_ROW * row,inT32 maxwidth,FCOORD rotation,BOOL8 testing_on)187 inT32 row_words(                  //compute space size
188                 TO_BLOCK *block,  //block it came from
189                 TO_ROW *row,      //row to operate on
190                 inT32 maxwidth,   //max expected space size
191                 FCOORD rotation,  //for drawing
192                 BOOL8 testing_on  //for debug
193                ) {
194   BOOL8 testing_row;             //contains testpt
195   BOOL8 prev_valid;              //if decent size
196   BOOL8 this_valid;              //current blob big enough
197   inT32 prev_x;                  //end of prev blob
198   inT32 min_gap;                 //min interesting gap
199   inT32 cluster_count;           //no of clusters
200   inT32 gap_index;               //which cluster
201   inT32 smooth_factor;           //for smoothing stats
202   BLOBNBOX *blob;                //current blob
203   float lower, upper;            //clustering parameters
204   float gaps[3];                 //gap clusers
205   ICOORD testpt;
206   TBOX blob_box;                  //bounding box
207                                  //iterator
208   BLOBNBOX_IT blob_it = row->blob_list ();
209   STATS gap_stats (0, maxwidth);
210   STATS cluster_stats[4];        //clusters
211 
212   testpt = ICOORD (textord_test_x, textord_test_y);
213   smooth_factor =
214     (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
215   //      if (testing_on)
216   //              tprintf("Row smooth factor=%d\n",smooth_factor);
217   prev_valid = FALSE;
218   prev_x = -MAX_INT32;
219   testing_row = FALSE;
220   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
221     blob = blob_it.data ();
222     blob_box = blob->bounding_box ();
223     if (blob_box.contains (testpt))
224       testing_row = TRUE;
225     gap_stats.add (blob_box.width (), 1);
226   }
227   min_gap = (inT32) floor (gap_stats.ile (textord_words_width_ile));
228   gap_stats.clear ();
229   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
230     blob = blob_it.data ();
231     if (!blob->joined_to_prev ()) {
232       blob_box = blob->bounding_box ();
233       //                      this_valid=blob_box.width()>=min_gap;
234       this_valid = TRUE;
235       if (this_valid && prev_valid
236       && blob_box.left () - prev_x < maxwidth) {
237         gap_stats.add (blob_box.left () - prev_x, 1);
238       }
239       prev_x = blob_box.right ();
240       prev_valid = this_valid;
241     }
242   }
243   if (gap_stats.get_total () == 0) {
244     row->min_space = 0;          //no evidence
245     row->max_nonspace = 0;
246     return 0;
247   }
248   gap_stats.smooth (smooth_factor);
249   lower = row->xheight * textord_words_initial_lower;
250   upper = row->xheight * textord_words_initial_upper;
251   cluster_count = gap_stats.cluster (lower, upper,
252     textord_spacesize_ratioprop, 3,
253     cluster_stats);
254   while (cluster_count < 2 && ceil (lower) < floor (upper)) {
255                                  //shrink gap
256     upper = (upper * 3 + lower) / 4;
257     lower = (lower * 3 + upper) / 4;
258     cluster_count = gap_stats.cluster (lower, upper,
259       textord_spacesize_ratioprop, 3,
260       cluster_stats);
261   }
262   if (cluster_count < 2) {
263     row->min_space = 0;          //no evidence
264     row->max_nonspace = 0;
265     return 0;
266   }
267   for (gap_index = 0; gap_index < cluster_count; gap_index++)
268     gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
269   //get medians
270   if (cluster_count > 2) {
271     if (testing_on && textord_show_initial_words) {
272       tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n",
273         row->intercept (),
274         cluster_stats[1].ile (0.5),
275         cluster_stats[2].ile (0.5), cluster_stats[3].ile (0.5));
276     }
277     lower = gaps[0];
278     if (gaps[1] > lower) {
279       upper = gaps[1];           //prefer most frequent
280       if (upper < block->xheight * textord_words_min_minspace
281       && gaps[2] > gaps[1]) {
282         upper = gaps[2];
283       }
284     }
285     else if (gaps[2] > lower
286       && gaps[2] >= block->xheight * textord_words_min_minspace)
287       upper = gaps[2];
288     else if (lower >= block->xheight * textord_words_min_minspace) {
289       upper = lower;             //not nice
290       lower = gaps[1];
291       if (testing_on && textord_show_initial_words) {
292         tprintf ("Had to switch most common from lower to upper!!\n");
293         gap_stats.print (stdout, TRUE);
294       }
295     }
296     else {
297       row->min_space = 0;        //no evidence
298       row->max_nonspace = 0;
299       return 0;
300     }
301   }
302   else {
303     if (gaps[1] < gaps[0]) {
304       if (testing_on && textord_show_initial_words) {
305         tprintf ("Had to switch most common from lower to upper!!\n");
306         gap_stats.print (stdout, TRUE);
307       }
308       lower = gaps[1];
309       upper = gaps[0];
310     }
311     else {
312       upper = gaps[1];
313       lower = gaps[0];
314     }
315   }
316   if (upper < block->xheight * textord_words_min_minspace) {
317     row->min_space = 0;          //no evidence
318     row->max_nonspace = 0;
319     return 0;
320   }
321   if (upper * 3 < block->min_space * 2 + block->max_nonspace
322   || lower * 3 > block->min_space * 2 + block->max_nonspace) {
323     if (testing_on && textord_show_initial_words) {
324       tprintf ("Disagreement between block and row at %g!!\n",
325         row->intercept ());
326       tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper);
327       gap_stats.print (stdout, TRUE);
328     }
329   }
330   row->min_space =
331     (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
332   row->max_nonspace =
333     (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
334   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
335   row->space_size = upper;
336   row->kern_size = lower;
337   if (testing_on && textord_show_initial_words) {
338     if (testing_row) {
339       tprintf ("GAP STATS\n");
340       gap_stats.print (stdout, TRUE);
341       tprintf ("SPACE stats\n");
342       cluster_stats[2].print (stdout, FALSE);
343       tprintf ("NONSPACE stats\n");
344       cluster_stats[1].print (stdout, FALSE);
345     }
346     tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
347       row->intercept (), row->min_space, upper,
348       row->max_nonspace, lower);
349   }
350   return cluster_stats[2].get_total ();
351 }
352 
353 
354 /**********************************************************************
355  * row_words2
356  *
357  * Compute the max nonspace and min space for the row.
358  **********************************************************************/
359 
row_words2(TO_BLOCK * block,TO_ROW * row,inT32 maxwidth,FCOORD rotation,BOOL8 testing_on)360 inT32 row_words2(                  //compute space size
361                  TO_BLOCK *block,  //block it came from
362                  TO_ROW *row,      //row to operate on
363                  inT32 maxwidth,   //max expected space size
364                  FCOORD rotation,  //for drawing
365                  BOOL8 testing_on  //for debug
366                 ) {
367   BOOL8 testing_row;             //contains testpt
368   BOOL8 prev_valid;              //if decent size
369   BOOL8 this_valid;              //current blob big enough
370   inT32 prev_x;                  //end of prev blob
371   inT32 min_width;               //min interesting width
372   inT32 valid_count;             //good gaps
373   inT32 total_count;             //total gaps
374   inT32 cluster_count;           //no of clusters
375   inT32 prev_count;              //previous cluster_count
376   inT32 gap_index;               //which cluster
377   inT32 smooth_factor;           //for smoothing stats
378   BLOBNBOX *blob;                //current blob
379   float lower, upper;            //clustering parameters
380   ICOORD testpt;
381   TBOX blob_box;                  //bounding box
382                                  //iterator
383   BLOBNBOX_IT blob_it = row->blob_list ();
384   STATS gap_stats (0, maxwidth);
385                                  //gap sizes
386   float gaps[BLOCK_STATS_CLUSTERS];
387   STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
388   //clusters
389 
390   testpt = ICOORD (textord_test_x, textord_test_y);
391   smooth_factor =
392     (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
393   //      if (testing_on)
394   //              tprintf("Row smooth factor=%d\n",smooth_factor);
395   prev_valid = FALSE;
396   prev_x = -MAX_INT16;
397   testing_row = FALSE;
398                                  //min blob size
399   min_width = (inT32) block->pr_space;
400   total_count = 0;
401   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
402     blob = blob_it.data ();
403     if (!blob->joined_to_prev ()) {
404       blob_box = blob->bounding_box ();
405       this_valid = blob_box.width () >= min_width;
406       this_valid = TRUE;
407       if (this_valid && prev_valid
408       && blob_box.left () - prev_x < maxwidth) {
409         gap_stats.add (blob_box.left () - prev_x, 1);
410       }
411       total_count++;             //count possibles
412       prev_x = blob_box.right ();
413       prev_valid = this_valid;
414     }
415   }
416   valid_count = gap_stats.get_total ();
417   if (valid_count < total_count * textord_words_minlarge) {
418     gap_stats.clear ();
419     prev_x = -MAX_INT16;
420     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
421     blob_it.forward ()) {
422       blob = blob_it.data ();
423       if (!blob->joined_to_prev ()) {
424         blob_box = blob->bounding_box ();
425         if (blob_box.left () - prev_x < maxwidth) {
426           gap_stats.add (blob_box.left () - prev_x, 1);
427         }
428         prev_x = blob_box.right ();
429       }
430     }
431   }
432   if (gap_stats.get_total () == 0) {
433     row->min_space = 0;          //no evidence
434     row->max_nonspace = 0;
435     return 0;
436   }
437 
438   cluster_count = 0;
439   lower = block->xheight * words_initial_lower;
440   upper = block->xheight * words_initial_upper;
441   gap_stats.smooth (smooth_factor);
442   do {
443     prev_count = cluster_count;
444     cluster_count = gap_stats.cluster (lower, upper,
445       textord_spacesize_ratioprop,
446       BLOCK_STATS_CLUSTERS, cluster_stats);
447   }
448   while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
449   if (cluster_count < 1) {
450     row->min_space = 0;
451     row->max_nonspace = 0;
452     return 0;
453   }
454   for (gap_index = 0; gap_index < cluster_count; gap_index++)
455     gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
456   //get medians
457   if (testing_on) {
458     tprintf ("cluster_count=%d:", cluster_count);
459     for (gap_index = 0; gap_index < cluster_count; gap_index++)
460       tprintf (" %g(%d)", gaps[gap_index],
461         cluster_stats[gap_index + 1].get_total ());
462     tprintf ("\n");
463   }
464 
465   //Try to find proportional non-space and space for row.
466   for (gap_index = 0; gap_index < cluster_count
467     && gaps[gap_index] > block->max_nonspace; gap_index++);
468   if (gap_index < cluster_count)
469     lower = gaps[gap_index];     //most frequent below
470   else {
471     if (testing_on)
472       tprintf ("No cluster below block threshold!, using default=%g\n",
473         block->pr_nonsp);
474     lower = block->pr_nonsp;
475   }
476   for (gap_index = 0; gap_index < cluster_count
477     && gaps[gap_index] <= block->max_nonspace; gap_index++);
478   if (gap_index < cluster_count)
479     upper = gaps[gap_index];     //most frequent above
480   else {
481     if (testing_on)
482       tprintf ("No cluster above block threshold!, using default=%g\n",
483         block->pr_space);
484     upper = block->pr_space;
485   }
486   row->min_space =
487     (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
488   row->max_nonspace =
489     (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
490   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
491   row->space_size = upper;
492   row->kern_size = lower;
493   if (testing_on) {
494     if (testing_row) {
495       tprintf ("GAP STATS\n");
496       gap_stats.print (stdout, TRUE);
497       tprintf ("SPACE stats\n");
498       cluster_stats[2].print (stdout, FALSE);
499       tprintf ("NONSPACE stats\n");
500       cluster_stats[1].print (stdout, FALSE);
501     }
502     tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
503       row->intercept (), row->min_space, upper,
504       row->max_nonspace, lower);
505   }
506   return 1;
507 }
508 
509 
510 /**********************************************************************
511  * make_real_words
512  *
513  * Convert a TO_BLOCK to a BLOCK.
514  **********************************************************************/
515 
make_real_words(TO_BLOCK * block,FCOORD rotation)516 void make_real_words(                  //find lines
517                      TO_BLOCK *block,  //block to do
518                      FCOORD rotation   //for drawing
519                     ) {
520   TO_ROW *row;                   //current row
521   TO_ROW_IT row_it = block->get_rows ();
522   ROW *real_row = NULL;          //output row
523   ROW_IT real_row_it = block->block->row_list ();
524 
525   if (row_it.empty ())
526     return;                      //empty block
527   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
528     row = row_it.data ();
529     if (row->blob_list ()->empty () && !row->rep_words.empty ()) {
530       real_row = make_rep_words (row, block);
531     }
532     else if (!row->blob_list ()->empty ()) {
533       // In a fixed pitch document, some lines may be detected as fixed pitch
534       // while others don't, and will go through different path.
535       // For non-space delimited language like CJK, fixed pitch chop always
536       // leave the entire line as one word.  We can force consistent chopping
537       // with force_make_prop_words flag.
538       if (textord_chopper_test) {
539         real_row = make_blob_words (row, rotation);
540       } else if (textord_force_make_prop_words ||
541           row->pitch_decision == PITCH_DEF_PROP ||
542           row->pitch_decision == PITCH_CORR_PROP) {
543         real_row = make_prop_words (row, rotation);
544       } else if (row->pitch_decision == PITCH_DEF_FIXED ||
545                  row->pitch_decision == PITCH_CORR_FIXED) {
546         real_row = fixed_pitch_words (row, rotation);
547       } else
548         ASSERT_HOST(FALSE);
549     }
550     if (real_row != NULL) {
551                                  //put row in block
552       real_row_it.add_after_then_move (real_row);
553     }
554   }
555   block->block->set_stats (block->fixed_pitch == 0, (inT16) block->kern_size,
556     (inT16) block->space_size,
557     (inT16) block->fixed_pitch);
558   block->block->check_pitch ();
559 }
560 
561 
562 /**********************************************************************
563  * make_rep_words
564  *
565  * Fabricate a real row from only the repeated blob words.
566  * Get the xheight from the block as it may be more meaningful.
567  **********************************************************************/
568 
make_rep_words(TO_ROW * row,TO_BLOCK * block)569 ROW *make_rep_words(                 //make a row
570                     TO_ROW *row,     //row to convert
571                     TO_BLOCK *block  //block it lives in
572                    ) {
573   inT32 xstarts[2];              //ends of row
574   ROW *real_row;                 //output row
575   TBOX word_box;                  //bounding box
576   double coeffs[3];              //spline
577                                  //iterator
578   WERD_IT word_it = &row->rep_words;
579 
580   if (word_it.empty ())
581     return NULL;
582   word_box = word_it.data ()->bounding_box ();
583   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ())
584     word_box += word_it.data ()->bounding_box ();
585   xstarts[0] = word_box.left ();
586   xstarts[1] = word_box.right ();
587   coeffs[0] = 0;
588   coeffs[1] = row->line_m ();
589   coeffs[2] = row->line_c ();
590   row->xheight = block->xheight;
591   real_row = new ROW(row,
592     (inT16) block->kern_size, (inT16) block->space_size);
593   word_it.set_to_list (real_row->word_list ());
594                                  //put words in row
595   word_it.add_list_after (&row->rep_words);
596   real_row->recalc_bounding_box ();
597   return real_row;
598 }
599 
600 
601 /**********************************************************************
602  * make_real_word
603  *
604  * Construct a WERD from a given number of adjacent entries in a
605  * list of BLOBNBOXs.
606  **********************************************************************/
607 
make_real_word(BLOBNBOX_IT * box_it,inT32 blobcount,BOOL8 bol,BOOL8 fuzzy_sp,BOOL8 fuzzy_non,uinT8 blanks)608 WERD *make_real_word(                      //make a WERD
609                      BLOBNBOX_IT *box_it,  //iterator
610                      inT32 blobcount,      //no of blobs to use
611                      BOOL8 bol,            //start of line
612                      BOOL8 fuzzy_sp,       //fuzzy space
613                      BOOL8 fuzzy_non,      //fuzzy non-space
614                      uinT8 blanks          //no of blanks
615                     ) {
616   OUTLINE_IT out_it;             //outlines
617   C_OUTLINE_IT cout_it;
618   PBLOB_LIST blobs;              //blobs in word
619   C_BLOB_LIST cblobs;
620   PBLOB_IT blob_it = &blobs;     //iterator
621   C_BLOB_IT cblob_it = &cblobs;
622   WERD *word;                    //new word
623   BLOBNBOX *bblob;               //current blob
624   inT32 blobindex;               //in row
625 
626   for (blobindex = 0; blobindex < blobcount; blobindex++) {
627     bblob = box_it->extract ();
628     if (bblob->joined_to_prev ()) {
629       if (bblob->blob () != NULL) {
630         out_it.set_to_list (blob_it.data ()->out_list ());
631         out_it.move_to_last ();
632         out_it.add_list_after (bblob->blob ()->out_list ());
633         delete bblob->blob ();
634       }
635       else if (bblob->cblob () != NULL) {
636         cout_it.set_to_list (cblob_it.data ()->out_list ());
637         cout_it.move_to_last ();
638         cout_it.add_list_after (bblob->cblob ()->out_list ());
639         delete bblob->cblob ();
640       }
641     }
642     else {
643       if (bblob->blob () != NULL)
644         blob_it.add_after_then_move (bblob->blob ());
645       else if (bblob->cblob () != NULL)
646         cblob_it.add_after_then_move (bblob->cblob ());
647     }
648     delete bblob;
649     box_it->forward ();          //next one
650   }
651 
652   if (blanks < 1)
653     blanks = 1;
654   if (!blob_it.empty ()) {
655                                  //make real word
656     word = new WERD (&blobs, blanks, NULL);
657   }
658   else {
659     word = new WERD (&cblobs, blanks, NULL);
660   }
661   if (bol) {
662     word->set_flag (W_BOL, TRUE);
663   }
664   if (fuzzy_sp)
665                                  //probably space
666     word->set_flag (W_FUZZY_SP, TRUE);
667   else if (fuzzy_non)
668                                  //probably not
669     word->set_flag (W_FUZZY_NON, TRUE);
670   if (box_it->at_first ()) {
671     word->set_flag (W_EOL, TRUE);//at end of line
672   }
673   return word;
674 }
675 
676