• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************
2  * File:        fixspace.cpp  (Formerly fixspace.c)
3  * Description: Implements a pass over the page res, exploring the alternative
4  *					spacing possibilities, trying to use context to improve the
5           word spacing
6 * Author:		Phil Cheatle
7 * Created:		Thu Oct 21 11:38:43 BST 1993
8 *
9 * (C) Copyright 1993, Hewlett-Packard Ltd.
10 ** Licensed under the Apache License, Version 2.0 (the "License");
11 ** you may not use this file except in compliance with the License.
12 ** You may obtain a copy of the License at
13 ** http://www.apache.org/licenses/LICENSE-2.0
14 ** Unless required by applicable law or agreed to in writing, software
15 ** distributed under the License is distributed on an "AS IS" BASIS,
16 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 ** See the License for the specific language governing permissions and
18 ** limitations under the License.
19 *
20 **********************************************************************/
21 
22 #include "mfcpch.h"
23 #include <ctype.h>
24 #include "reject.h"
25 #include "statistc.h"
26 #include "genblob.h"
27 #include "control.h"
28 #include "fixspace.h"
29 #include "tessvars.h"
30 #include "tessbox.h"
31 #include "secname.h"
32 #include "globals.h"
33 #include "tesseractclass.h"
34 
35 #define EXTERN
36 
37 EXTERN BOOL_VAR (fixsp_check_for_fp_noise_space, TRUE,
38 "Try turning noise to space in fixed pitch");
39 EXTERN BOOL_VAR (fixsp_fp_eval, TRUE, "Use alternate evaluation for fp");
40 EXTERN BOOL_VAR (fixsp_noise_score_fixing, TRUE, "More sophisticated?");
41 EXTERN INT_VAR (fixsp_non_noise_limit, 1,
42 "How many non-noise blbs either side?");
43 EXTERN double_VAR (fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
44 
45 EXTERN BOOL_VAR (fixsp_ignore_punct, TRUE, "In uniform spacing calc");
46 EXTERN BOOL_VAR (fixsp_numeric_fix, TRUE, "Try to deal with numeric punct");
47 EXTERN BOOL_VAR (fixsp_prefer_joined_1s, TRUE, "Arbitrary boost");
48 EXTERN BOOL_VAR (tessedit_test_uniform_wd_spacing, FALSE,
49 "Limit context word spacing");
50 EXTERN BOOL_VAR (tessedit_prefer_joined_punct, FALSE,
51 "Reward punctation joins");
52 EXTERN INT_VAR (fixsp_done_mode, 1, "What constitues done for spacing");
53 EXTERN INT_VAR (debug_fix_space_level, 0, "Contextual fixspace debug");
54 EXTERN STRING_VAR (numeric_punctuation, ".,",
55 "Punct. chs expected WITHIN numbers");
56 
57 #define PERFECT_WERDS   999
58 #define MAXSPACING      128      /*max expected spacing in pix */
59 
60 /*************************************************************************
61  * fix_fuzzy_spaces()
62  * Walk over the page finding sequences of words joined by fuzzy spaces. Extract
63  * them as a sublist, process the sublist to find the optimal arrangement of
64  * spaces then replace the sublist in the ROW_RES.
65  *************************************************************************/
66 namespace tesseract {
fix_fuzzy_spaces(volatile ETEXT_DESC * monitor,inT32 word_count,PAGE_RES * page_res)67 void Tesseract::fix_fuzzy_spaces(                       //find fuzzy words
68                                                         //progress monitor
69                                  volatile ETEXT_DESC *monitor,
70                                                         //count of words in doc
71                                  inT32 word_count,
72                                  PAGE_RES *page_res) {
73   BLOCK_RES_IT block_res_it;     //iterators
74   ROW_RES_IT row_res_it;
75   WERD_RES_IT word_res_it_from;
76   WERD_RES_IT word_res_it_to;
77   WERD_RES *word_res;
78   WERD_RES_LIST fuzzy_space_words;
79   inT16 new_length;
80   BOOL8 prevent_null_wd_fixsp;   //DONT process blobless wds
81   inT32 word_index;              //current word
82 
83   block_res_it.set_to_list (&page_res->block_res_list);
84   word_index = 0;
85   for (block_res_it.mark_cycle_pt ();
86   !block_res_it.cycled_list (); block_res_it.forward ()) {
87     row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
88     for (row_res_it.mark_cycle_pt ();
89     !row_res_it.cycled_list (); row_res_it.forward ()) {
90       word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
91       while (!word_res_it_from.at_last ()) {
92         word_res = word_res_it_from.data ();
93         while (!word_res_it_from.at_last () &&
94           !(word_res->combination ||
95           word_res_it_from.data_relative (1)->
96           word->flag (W_FUZZY_NON) ||
97           word_res_it_from.data_relative (1)->
98         word->flag (W_FUZZY_SP))) {
99           fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
100                          block_res_it.data()->block);
101           word_res = word_res_it_from.forward ();
102           word_index++;
103           if (monitor != NULL) {
104             monitor->ocr_alive = TRUE;
105             monitor->progress = 90 + 5 * word_index / word_count;
106           }
107         }
108 
109         if (!word_res_it_from.at_last ()) {
110           word_res_it_to = word_res_it_from;
111           prevent_null_wd_fixsp =
112             word_res->word->gblob_list ()->empty ();
113           if (check_debug_pt (word_res, 60))
114             debug_fix_space_level.set_value (10);
115           word_res_it_to.forward ();
116           word_index++;
117           if (monitor != NULL) {
118             monitor->ocr_alive = TRUE;
119             monitor->progress = 90 + 5 * word_index / word_count;
120           }
121           while (!word_res_it_to.at_last () &&
122             (word_res_it_to.data_relative (1)->
123             word->flag (W_FUZZY_NON) ||
124             word_res_it_to.data_relative (1)->
125           word->flag (W_FUZZY_SP))) {
126             if (check_debug_pt (word_res, 60))
127               debug_fix_space_level.set_value (10);
128             if (word_res->word->gblob_list ()->empty ())
129               prevent_null_wd_fixsp = TRUE;
130             word_res = word_res_it_to.forward ();
131           }
132           if (check_debug_pt (word_res, 60))
133             debug_fix_space_level.set_value (10);
134           if (word_res->word->gblob_list ()->empty ())
135             prevent_null_wd_fixsp = TRUE;
136           if (prevent_null_wd_fixsp) {
137             word_res_it_from = word_res_it_to;
138           } else {
139             fuzzy_space_words.assign_to_sublist (&word_res_it_from,
140               &word_res_it_to);
141             fix_fuzzy_space_list (fuzzy_space_words,
142                                  row_res_it.data()->row,
143                                  block_res_it.data()->block);
144             new_length = fuzzy_space_words.length ();
145             word_res_it_from.add_list_before (&fuzzy_space_words);
146             for (;
147               (!word_res_it_from.at_last () &&
148             (new_length > 0)); new_length--) {
149               word_res_it_from.forward ();
150             }
151           }
152           if (test_pt)
153             debug_fix_space_level.set_value (0);
154         }
155         fix_sp_fp_word(word_res_it_from, row_res_it.data ()->row,
156                        block_res_it.data()->block);
157         //Last word in row
158       }
159     }
160   }
161 }
162 
fix_fuzzy_space_list(WERD_RES_LIST & best_perm,ROW * row,BLOCK * block)163 void Tesseract::fix_fuzzy_space_list(  //space explorer
164                                      WERD_RES_LIST &best_perm,
165                                      ROW *row,
166                                      BLOCK* block) {
167   inT16 best_score;
168   WERD_RES_LIST current_perm;
169   inT16 current_score;
170   BOOL8 improved = FALSE;
171 
172   best_score = eval_word_spacing(best_perm);  // default score
173   dump_words (best_perm, best_score, 1, improved);
174 
175   if (best_score != PERFECT_WERDS)
176     initialise_search(best_perm, current_perm);
177 
178   while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
179     match_current_words(current_perm, row, block);
180     current_score = eval_word_spacing (current_perm);
181     dump_words (current_perm, current_score, 2, improved);
182     if (current_score > best_score) {
183       best_perm.clear ();
184       best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
185       best_score = current_score;
186       improved = TRUE;
187     }
188     if (current_score < PERFECT_WERDS)
189       transform_to_next_perm(current_perm);
190   }
191   dump_words (best_perm, best_score, 3, improved);
192 }
193 
194 }  // namespace tesseract
195 
initialise_search(WERD_RES_LIST & src_list,WERD_RES_LIST & new_list)196 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
197   WERD_RES_IT src_it(&src_list);
198   WERD_RES_IT new_it(&new_list);
199   WERD_RES *src_wd;
200   WERD_RES *new_wd;
201 
202   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
203     src_wd = src_it.data ();
204     if (!src_wd->combination) {
205       new_wd = new WERD_RES (*src_wd);
206       new_wd->combination = FALSE;
207       new_wd->part_of_combo = FALSE;
208       new_it.add_after_then_move (new_wd);
209     }
210   }
211 }
212 
213 
214 namespace tesseract {
match_current_words(WERD_RES_LIST & words,ROW * row,BLOCK * block)215 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
216                                     BLOCK* block) {
217   WERD_RES_IT word_it(&words);
218   WERD_RES *word;
219 
220   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
221     word = word_it.data ();
222     if ((!word->part_of_combo) && (word->outword == NULL))
223       classify_word_pass2(word, block, row);
224   }
225 }
226 
227 
/*************************************************************************
 * eval_word_spacing()
 * The basic measure is the number of characters in contextually confirmed
 * words (i.e. the word is done).
 * If all words are contextually confirmed the evaluation is deemed perfect.
 *
 * Some fiddles are done to handle "1"s as these are VERY frequent causes of
 * fuzzy spaces. The problem with the basic measure is that "561 63" would score
 * the same as "56163", though given our knowledge that the space is fuzzy, and
 * that there is a "1" next to the fuzzy space, we need to ensure that "56163"
 * is preferred.
 *
 * The solution is to NOT COUNT the score of any word which has a digit at one
 * end and a "1Il" as the character the other side of the space.
 *
 * Conversely, any character next to a "1" within a word is counted as a
 * positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus
 * 1 side of the "1" joined).  "56163" would score 7 - all chars in a numeric
 * word + 2 sides of a "1" joined.
 *
 * The joined 1 rule is applied to any word REGARDLESS of contextual
 * confirmation.  Thus "PS7a71 3/7a" scores 1 (neither word is contextually
 * confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.
 *
 *************************************************************************/
eval_word_spacing(WERD_RES_LIST & word_res_list)253 inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
254   WERD_RES_IT word_res_it(&word_res_list);
255   inT16 total_score = 0;
256   inT16 word_count = 0;
257   inT16 done_word_count = 0;
258   inT16 word_len;
259   inT16 i;
260   inT16 offset;
261   WERD_RES *word;                //current word
262   inT16 prev_word_score = 0;
263   BOOL8 prev_word_done = FALSE;
264   BOOL8 prev_char_1 = FALSE;     //prev ch a "1/I/l"?
265   BOOL8 prev_char_digit = FALSE; //prev ch 2..9 or 0
266   BOOL8 current_char_1 = FALSE;
267   BOOL8 current_word_ok_so_far;
268   STRING punct_chars = "!\"`',.:;";
269   BOOL8 prev_char_punct = FALSE;
270   BOOL8 current_char_punct = FALSE;
271   BOOL8 word_done = FALSE;
272 
273   do {
274     word = word_res_it.data ();
275     word_done = fixspace_thinks_word_done (word);
276     word_count++;
277     if (word->tess_failed) {
278       total_score += prev_word_score;
279       if (prev_word_done)
280         done_word_count++;
281       prev_word_score = 0;
282       prev_char_1 = FALSE;
283       prev_char_digit = FALSE;
284       prev_word_done = FALSE;
285     }
286     else {
287       /*
288         Can we add the prev word score and potentially count this word?
289         Yes IF it didnt end in a 1 when the first char of this word is a digit
290           AND it didnt end in a digit when the first char of this word is a 1
291       */
292       word_len = word->reject_map.length ();
293       current_word_ok_so_far = FALSE;
294       if (!((prev_char_1 &&
295         digit_or_numeric_punct (word, 0)) ||
296         (prev_char_digit &&
297         ((word_done &&
298         (word->best_choice->unichar_lengths().string()[0] == 1 &&
299          word->best_choice->unichar_string()[0] == '1')) ||
300         (!word_done &&
301          STRING(conflict_set_I_l_1).contains(
302              word->best_choice->unichar_string ()[0])))))) {
303         total_score += prev_word_score;
304         if (prev_word_done)
305           done_word_count++;
306         current_word_ok_so_far = word_done;
307       }
308 
309       if ((current_word_ok_so_far) &&
310         (!tessedit_test_uniform_wd_spacing ||
311         ((word->best_choice->permuter () == NUMBER_PERM) ||
312       uniformly_spaced (word)))) {
313         prev_word_done = TRUE;
314         prev_word_score = word_len;
315       }
316       else {
317         prev_word_done = FALSE;
318         prev_word_score = 0;
319       }
320 
321       if (fixsp_prefer_joined_1s) {
322         /* Add 1 to total score for every joined 1 regardless of context and
323            rejtn */
324 
325         for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
326           current_char_1 = word->best_choice->unichar_string()[i] == '1';
327           if (prev_char_1 || (current_char_1 && (i > 0)))
328             total_score++;
329           prev_char_1 = current_char_1;
330         }
331       }
332 
333       /* Add 1 to total score for every joined punctuation regardless of context
334         and rejtn */
335       if (tessedit_prefer_joined_punct) {
336         for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
337              offset += word->best_choice->unichar_lengths()[i++]) {
338           current_char_punct =
339             punct_chars.contains (word->best_choice->unichar_string()[offset]);
340           if (prev_char_punct || (current_char_punct && (i > 0)))
341             total_score++;
342           prev_char_punct = current_char_punct;
343         }
344       }
345       prev_char_digit = digit_or_numeric_punct (word, word_len - 1);
346       for (i = 0, offset = 0; i < word_len - 1;
347            offset += word->best_choice->unichar_lengths()[i++]);
348       prev_char_1 =
349         ((word_done
350         && (word->best_choice->unichar_string()[offset] == '1'))
351         || (!word_done
352         && STRING(conflict_set_I_l_1).contains(
353             word->best_choice->unichar_string()[offset])));
354     }
355     /* Find next word */
356     do
357     word_res_it.forward ();
358     while (word_res_it.data ()->part_of_combo);
359   }
360   while (!word_res_it.at_first ());
361   total_score += prev_word_score;
362   if (prev_word_done)
363     done_word_count++;
364   if (done_word_count == word_count)
365     return PERFECT_WERDS;
366   else
367     return total_score;
368 }
369 
370 
digit_or_numeric_punct(WERD_RES * word,int char_position)371 BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
372   int i;
373   int offset;
374 
375   for (i = 0, offset = 0; i < char_position;
376        offset += word->best_choice->unichar_lengths()[i++]);
377   return (unicharset.get_isdigit(word->best_choice->unichar_string().string() + offset,
378                                  word->best_choice->unichar_lengths()[i]) ||
379     (fixsp_numeric_fix &&
380     (word->best_choice->permuter () == NUMBER_PERM) &&
381     STRING (numeric_punctuation).contains
382      (word->best_choice->unichar_string().string()[offset])));
383 }
384 }  // namespace tesseract
385 
386 
/*************************************************************************
 * transform_to_next_perm()
 * Examines the current word list to find the smallest word gap size. Then walks
 * the word list closing any gaps of this size by either inserting new
 * combination words, or extending existing ones.
 *
 * The routine COULD be limited to stop it building words longer than N blobs.
 *
 * If there are no more gaps then it DELETES the entire list and returns the
 * empty list to cause termination.
 *************************************************************************/
transform_to_next_perm(WERD_RES_LIST & words)398 void transform_to_next_perm(WERD_RES_LIST &words) {
399   WERD_RES_IT word_it(&words);
400   WERD_RES_IT prev_word_it(&words);
401   WERD_RES *word;
402   WERD_RES *prev_word;
403   WERD_RES *combo;
404   WERD *copy_word;
405   inT16 prev_right = -1;
406   TBOX box;
407   inT16 gap;
408   inT16 min_gap = MAX_INT16;
409 
410   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
411     word = word_it.data ();
412     if (!word->part_of_combo) {
413       box = word->word->bounding_box ();
414       if (prev_right >= 0) {
415         gap = box.left () - prev_right;
416         if (gap < min_gap)
417           min_gap = gap;
418       }
419       prev_right = box.right ();
420     }
421   }
422   if (min_gap < MAX_INT16) {
423     prev_right = -1;             //back to start
424     word_it.set_to_list (&words);
425     for (;                       //cant use cycle pt due to inserted combos at start of list
426     (prev_right < 0) || !word_it.at_first (); word_it.forward ()) {
427       word = word_it.data ();
428       if (!word->part_of_combo) {
429         box = word->word->bounding_box ();
430         if (prev_right >= 0) {
431           gap = box.left () - prev_right;
432           if (gap <= min_gap) {
433             prev_word = prev_word_it.data ();
434             if (prev_word->combination)
435               combo = prev_word;
436             else {
437               /* Make a new combination and insert before the first word being joined */
438               copy_word = new WERD;
439               *copy_word = *(prev_word->word);
440               //deep copy
441               combo = new WERD_RES (copy_word);
442               combo->combination = TRUE;
443               combo->x_height = prev_word->x_height;
444               prev_word->part_of_combo = TRUE;
445               prev_word_it.add_before_then_move (combo);
446             }
447             combo->word->set_flag (W_EOL, word->word->flag (W_EOL));
448             if (word->combination) {
449               combo->word->join_on (word->word);
450               //Move blbs to combo
451                                  //old combo no longer needed
452               delete word_it.extract ();
453             }
454             else {
455                                  //Cpy current wd to combo
456               combo->copy_on (word);
457               word->part_of_combo = TRUE;
458             }
459             combo->done = FALSE;
460             if (combo->outword != NULL) {
461               delete combo->outword;
462               delete combo->best_choice;
463               delete combo->raw_choice;
464               combo->outword = NULL;
465               combo->best_choice = NULL;
466               combo->raw_choice = NULL;
467             }
468           }
469           else
470                                  //catch up
471               prev_word_it = word_it;
472         }
473         prev_right = box.right ();
474       }
475     }
476   }
477   else
478     words.clear ();              //signal termination
479 }
480 
481 
dump_words(WERD_RES_LIST & perm,inT16 score,inT16 mode,BOOL8 improved)482 void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved) {
483   WERD_RES_IT word_res_it(&perm);
484   static STRING initial_str;
485 
486   if (debug_fix_space_level > 0) {
487     if (mode == 1) {
488       initial_str = "";
489       for (word_res_it.mark_cycle_pt ();
490       !word_res_it.cycled_list (); word_res_it.forward ()) {
491         if (!word_res_it.data ()->part_of_combo) {
492           initial_str += word_res_it.data()->best_choice->unichar_string();
493           initial_str += ' ';
494         }
495       }
496     }
497 
498     #ifndef SECURE_NAMES
499     if (debug_fix_space_level > 1) {
500       switch (mode) {
501         case 1:
502           tprintf ("EXTRACTED (%d): \"", score);
503           break;
504         case 2:
505           tprintf ("TESTED (%d): \"", score);
506           break;
507         case 3:
508           tprintf ("RETURNED (%d): \"", score);
509           break;
510       }
511 
512       for (word_res_it.mark_cycle_pt ();
513       !word_res_it.cycled_list (); word_res_it.forward ()) {
514         if (!word_res_it.data ()->part_of_combo)
515           tprintf("%s/%1d ",
516                   word_res_it.data()->best_choice->unichar_string().string(),
517                   (int)word_res_it.data()->best_choice->permuter());
518       }
519       tprintf ("\"\n");
520     }
521     else if (improved) {
522       tprintf ("FIX SPACING \"%s\" => \"", initial_str.string ());
523       for (word_res_it.mark_cycle_pt ();
524       !word_res_it.cycled_list (); word_res_it.forward ()) {
525         if (!word_res_it.data ()->part_of_combo)
526           tprintf ("%s/%1d ",
527                    word_res_it.data()->best_choice->unichar_string().string(),
528                    (int)word_res_it.data()->best_choice->permuter());
529       }
530       tprintf ("\"\n");
531     }
532     #endif
533   }
534 }
535 
536 
537 /*************************************************************************
538  * uniformly_spaced()
539  * Return true if one of the following are true:
540  *    - All inter-char gaps are the same width
541  *	- The largest gap is no larger than twice the mean/median of the others
542  *	- The largest gap is < 64/5 = 13 and all others are <= 0
543  * **** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
544  *************************************************************************/
uniformly_spaced(WERD_RES * word)545 BOOL8 uniformly_spaced(  //sensible word
546                        WERD_RES *word) {
547   PBLOB_IT blob_it;
548   TBOX box;
549   inT16 prev_right = -MAX_INT16;
550   inT16 gap;
551   inT16 max_gap = -MAX_INT16;
552   inT16 max_gap_count = 0;
553   STATS gap_stats (0, MAXSPACING);
554   BOOL8 result;
555   const ROW *row = word->denorm.row ();
556   float max_non_space;
557   float normalised_max_nonspace;
558   inT16 i = 0;
559   inT16 offset = 0;
560   STRING punct_chars = "\"`',.:;";
561 
562   blob_it.set_to_list (word->outword->blob_list ());
563 
564   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
565     box = blob_it.data ()->bounding_box ();
566     if ((prev_right > -MAX_INT16) &&
567       (!fixsp_ignore_punct ||
568       (!punct_chars.contains (word->best_choice->unichar_string()
569                               [offset - word->best_choice->unichar_lengths()[i - 1]]) &&
570     !punct_chars.contains (word->best_choice->unichar_string()[offset])))) {
571       gap = box.left () - prev_right;
572       if (gap < max_gap)
573         gap_stats.add (gap, 1);
574       else if (gap == max_gap)
575         max_gap_count++;
576       else {
577         if (max_gap_count > 0)
578           gap_stats.add (max_gap, max_gap_count);
579         max_gap = gap;
580         max_gap_count = 1;
581       }
582     }
583     prev_right = box.right ();
584     offset += word->best_choice->unichar_lengths()[i++];
585   }
586 
587   max_non_space = (row->space () + 3 * row->kern ()) / 4;
588   normalised_max_nonspace = max_non_space * bln_x_height / row->x_height ();
589 
590   result = ((gap_stats.get_total () == 0) ||
591     (max_gap <= normalised_max_nonspace) ||
592     ((gap_stats.get_total () > 2) &&
593     (max_gap <= 2 * gap_stats.median ())) ||
594     ((gap_stats.get_total () <= 2) &&
595     (max_gap <= 2 * gap_stats.mean ())));
596   #ifndef SECURE_NAMES
597   if ((debug_fix_space_level > 1)) {
598     if (result)
599       tprintf
600         ("ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
601         word->best_choice->unichar_string().string (), normalised_max_nonspace,
602         max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
603         gap_stats.median ());
604     else
605       tprintf
606         ("REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
607         word->best_choice->unichar_string().string (), normalised_max_nonspace,
608         max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
609         gap_stats.median ());
610   }
611   #endif
612 
613   return result;
614 }
615 
616 
fixspace_thinks_word_done(WERD_RES * word)617 BOOL8 fixspace_thinks_word_done(WERD_RES *word) {
618   if (word->done)
619     return TRUE;
620 
621   /*
622     Use all the standard pass 2 conditions for mode 5 in set_done() in
623     reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
624     CARE WHETHER WE HAVE of/at on/an etc.
625   */
626   if ((fixsp_done_mode > 0) &&
627     (word->tess_accepted ||
628     ((fixsp_done_mode == 2) &&
629     (word->reject_map.reject_count () == 0)) ||
630     (fixsp_done_mode == 3)) &&
631     (strchr (word->best_choice->unichar_string().string (), ' ') == NULL) &&
632     ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
633     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
634     (word->best_choice->permuter () == USER_DAWG_PERM) ||
635     (word->best_choice->permuter () == NUMBER_PERM)))
636     return TRUE;
637   else
638     return FALSE;
639 }
640 
641 
/*************************************************************************
 * fix_sp_fp_word()
 * Test the current word to see if it can be split by deleting noise blobs. If
 * so, do the business.
 * Return with the iterator pointing to the same place if the word is unchanged,
 * or the last of the replacement words.
 *************************************************************************/
649 namespace tesseract {
fix_sp_fp_word(WERD_RES_IT & word_res_it,ROW * row,BLOCK * block)650 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
651                                BLOCK* block) {
652   WERD_RES *word_res;
653   WERD_RES_LIST sub_word_list;
654   WERD_RES_IT sub_word_list_it(&sub_word_list);
655   inT16 blob_index;
656   inT16 new_length;
657   float junk;
658 
659   word_res = word_res_it.data ();
660   if (!fixsp_check_for_fp_noise_space ||
661     word_res->word->flag (W_REP_CHAR) ||
662     word_res->combination ||
663     word_res->part_of_combo || !word_res->word->flag (W_DONT_CHOP))
664     return;
665 
666   blob_index = worst_noise_blob (word_res, &junk);
667   if (blob_index < 0)
668     return;
669 
670   #ifndef SECURE_NAMES
671   if (debug_fix_space_level > 1) {
672     tprintf ("FP fixspace working on \"%s\"\n",
673       word_res->best_choice->unichar_string().string());
674   }
675   #endif
676   gblob_sort_list ((PBLOB_LIST *) word_res->word->rej_cblob_list (), FALSE);
677   sub_word_list_it.add_after_stay_put (word_res_it.extract ());
678   fix_noisy_space_list(sub_word_list, row, block);
679   new_length = sub_word_list.length ();
680   word_res_it.add_list_before (&sub_word_list);
681   for (; (!word_res_it.at_last () && (new_length > 1)); new_length--) {
682     word_res_it.forward ();
683   }
684 }
685 
fix_noisy_space_list(WERD_RES_LIST & best_perm,ROW * row,BLOCK * block)686 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
687                                      BLOCK* block) {
688   inT16 best_score;
689   WERD_RES_IT best_perm_it(&best_perm);
690   WERD_RES_LIST current_perm;
691   WERD_RES_IT current_perm_it(&current_perm);
692   WERD_RES *old_word_res;
693   WERD_RES *new_word_res;
694   inT16 current_score;
695   BOOL8 improved = FALSE;
696 
697                                  //default score
698   best_score = fp_eval_word_spacing (best_perm);
699 
700   dump_words (best_perm, best_score, 1, improved);
701 
702   new_word_res = new WERD_RES;
703   old_word_res = best_perm_it.data ();
704                                  //Kludge to force deep copy
705   old_word_res->combination = TRUE;
706   *new_word_res = *old_word_res; //deep copy
707                                  //Undo kludge
708   old_word_res->combination = FALSE;
709                                  //Undo kludge
710   new_word_res->combination = FALSE;
711   current_perm_it.add_to_end (new_word_res);
712 
713   break_noisiest_blob_word(current_perm);
714 
715   while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
716     match_current_words(current_perm, row, block);
717     current_score = fp_eval_word_spacing (current_perm);
718     dump_words (current_perm, current_score, 2, improved);
719     if (current_score > best_score) {
720       best_perm.clear ();
721       best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
722       best_score = current_score;
723       improved = TRUE;
724     }
725     if (current_score < PERFECT_WERDS)
726       break_noisiest_blob_word(current_perm);
727   }
728   dump_words (best_perm, best_score, 3, improved);
729 }
730 }  // namespace tesseract
731 
732 
733 /*************************************************************************
734  * break_noisiest_blob_word()
735  * Find the word with the blob which looks like the worst noise.
736  * Break the word into two, deleting the noise blob.
737  *************************************************************************/
break_noisiest_blob_word(WERD_RES_LIST & words)738 void break_noisiest_blob_word(WERD_RES_LIST &words) {
739   WERD_RES_IT word_it(&words);
740   WERD_RES_IT worst_word_it;
741   float worst_noise_score = 9999;
742   int worst_blob_index = -1;     //noisiest blb of noisiest wd
743   int blob_index;                //of wds noisiest blb
744   float noise_score;             //of wds noisiest blb
745   WERD_RES *word_res;
746   C_BLOB_IT blob_it;
747   C_BLOB_IT rej_cblob_it;
748   C_BLOB_LIST new_blob_list;
749   C_BLOB_IT new_blob_it;
750   C_BLOB_IT new_rej_cblob_it;
751   WERD *new_word;
752   inT16 start_of_noise_blob;
753   inT16 i;
754 
755   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
756     blob_index = worst_noise_blob (word_it.data (), &noise_score);
757     if ((blob_index > -1) && (worst_noise_score > noise_score)) {
758       worst_noise_score = noise_score;
759       worst_blob_index = blob_index;
760       worst_word_it = word_it;
761     }
762   }
763   if (worst_blob_index < 0) {
764     words.clear ();              //signal termination
765     return;
766   }
767 
768   /* Now split the worst_word_it */
769 
770   word_res = worst_word_it.data ();
771 
772   /* Move blobs before noise blob to a new bloblist */
773 
774   new_blob_it.set_to_list (&new_blob_list);
775   blob_it.set_to_list (word_res->word->cblob_list ());
776   for (i = 0; i < worst_blob_index; i++, blob_it.forward ()) {
777     new_blob_it.add_after_then_move (blob_it.extract ());
778   }
779   start_of_noise_blob = blob_it.data ()->bounding_box ().left ();
780   delete blob_it.extract ();     //throw out noise blb
781 
782   new_word = new WERD (&new_blob_list, word_res->word);
783   new_word->set_flag (W_EOL, FALSE);
784   word_res->word->set_flag (W_BOL, FALSE);
785   word_res->word->set_blanks (1);//After break
786 
787   new_rej_cblob_it.set_to_list (new_word->rej_cblob_list ());
788   rej_cblob_it.set_to_list (word_res->word->rej_cblob_list ());
789   for (;
790     (!rej_cblob_it.empty () &&
791     (rej_cblob_it.data ()->bounding_box ().left () <
792   start_of_noise_blob)); rej_cblob_it.forward ()) {
793     new_rej_cblob_it.add_after_then_move (rej_cblob_it.extract ());
794   }
795 
796   worst_word_it.add_before_then_move (new WERD_RES (new_word));
797 
798   word_res->done = FALSE;
799   if (word_res->outword != NULL) {
800     delete word_res->outword;
801     delete word_res->best_choice;
802     delete word_res->raw_choice;
803     word_res->outword = NULL;
804     word_res->best_choice = NULL;
805     word_res->raw_choice = NULL;
806   }
807 }
808 
809 
worst_noise_blob(WERD_RES * word_res,float * worst_noise_score)810 inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
811   PBLOB_IT blob_it;
812   inT16 blob_count;
813   float noise_score[512];
814   int i;
815   int min_noise_blob;            //1st contender
816   int max_noise_blob;            //last contender
817   int non_noise_count;
818   int worst_noise_blob;          //Worst blob
819   float small_limit = bln_x_height * fixsp_small_outlines_size;
820   float non_noise_limit = bln_x_height * 0.8;
821 
822   blob_it.set_to_list (word_res->outword->blob_list ());
823   //normalised
824   blob_count = blob_it.length ();
825   ASSERT_HOST (blob_count <= 512);
826   if (blob_count < 5)
827     return -1;                   //too short to split
828   /* Get the noise scores for all blobs */
829 
830   #ifndef SECURE_NAMES
831   if (debug_fix_space_level > 5)
832     tprintf ("FP fixspace Noise metrics for \"%s\": ",
833       word_res->best_choice->unichar_string().string());
834   #endif
835 
836   for (i = 0; i < blob_count; i++, blob_it.forward ()) {
837     if (word_res->reject_map[i].accepted ())
838       noise_score[i] = non_noise_limit;
839     else
840       noise_score[i] = blob_noise_score (blob_it.data ());
841 
842     if (debug_fix_space_level > 5)
843       tprintf ("%1.1f ", noise_score[i]);
844   }
845   if (debug_fix_space_level > 5)
846     tprintf ("\n");
847 
848   /* Now find the worst one which is far enough away from the end of the word */
849 
850   non_noise_count = 0;
851   for (i = 0;
852   (i < blob_count) && (non_noise_count < fixsp_non_noise_limit); i++) {
853     if (noise_score[i] >= non_noise_limit)
854       non_noise_count++;
855   }
856   if (non_noise_count < fixsp_non_noise_limit)
857     return -1;
858   min_noise_blob = i;
859 
860   non_noise_count = 0;
861   for (i = blob_count - 1;
862   (i >= 0) && (non_noise_count < fixsp_non_noise_limit); i--) {
863     if (noise_score[i] >= non_noise_limit)
864       non_noise_count++;
865   }
866   if (non_noise_count < fixsp_non_noise_limit)
867     return -1;
868   max_noise_blob = i;
869 
870   if (min_noise_blob > max_noise_blob)
871     return -1;
872 
873   *worst_noise_score = small_limit;
874   worst_noise_blob = -1;
875   for (i = min_noise_blob; i <= max_noise_blob; i++) {
876     if (noise_score[i] < *worst_noise_score) {
877       worst_noise_blob = i;
878       *worst_noise_score = noise_score[i];
879     }
880   }
881   return worst_noise_blob;
882 }
883 
884 
blob_noise_score(PBLOB * blob)885 float blob_noise_score(PBLOB *blob) {
886   OUTLINE_IT outline_it;
887   TBOX box;                       //BB of outline
888   inT16 outline_count = 0;
889   inT16 max_dimension;
890   inT16 largest_outline_dimension = 0;
891 
892   outline_it.set_to_list (blob->out_list ());
893   for (outline_it.mark_cycle_pt ();
894   !outline_it.cycled_list (); outline_it.forward ()) {
895     outline_count++;
896     box = outline_it.data ()->bounding_box ();
897     if (box.height () > box.width ())
898       max_dimension = box.height ();
899     else
900       max_dimension = box.width ();
901 
902     if (largest_outline_dimension < max_dimension)
903       largest_outline_dimension = max_dimension;
904   }
905 
906   if (fixsp_noise_score_fixing) {
907     if (outline_count > 5)
908                                  //penalise LOTS of blobs
909       largest_outline_dimension *= 2;
910 
911     box = blob->bounding_box ();
912 
913     if ((box.bottom () > bln_baseline_offset * 4) ||
914       (box.top () < bln_baseline_offset / 2))
915                                  //Lax blob is if high or low
916       largest_outline_dimension /= 2;
917   }
918   return largest_outline_dimension;
919 }
920 
921 
fixspace_dbg(WERD_RES * word)922 void fixspace_dbg(WERD_RES *word) {
923   TBOX box = word->word->bounding_box ();
924   BOOL8 show_map_detail = FALSE;
925   inT16 i;
926 
927   box.print ();
928   #ifndef SECURE_NAMES
929   tprintf (" \"%s\" ", word->best_choice->unichar_string().string ());
930   tprintf ("Blob count: %d (word); %d/%d (outword)\n",
931     word->word->gblob_list ()->length (),
932     word->outword->gblob_list ()->length (),
933     word->outword->rej_blob_list ()->length ());
934   word->reject_map.print (debug_fp);
935   tprintf ("\n");
936   if (show_map_detail) {
937     tprintf ("\"%s\"\n", word->best_choice->unichar_string().string ());
938     for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
939       tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
940       word->reject_map[i].full_print (debug_fp);
941     }
942   }
943 
944   tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
945   tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
946   #endif
947 }
948 
949 
950 /*************************************************************************
951  * fp_eval_word_spacing()
952  * Evaluation function for fixed pitch word lists.
953  *
954  * Basically, count the number of "nice" characters - those which are in tess
955  * acceptable words or in dict words and are not rejected.
956  * Penalise any potential noise chars
957  *************************************************************************/
958 namespace tesseract {
fp_eval_word_spacing(WERD_RES_LIST & word_res_list)959 inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
960   WERD_RES_IT word_it(&word_res_list);
961   WERD_RES *word;
962   PBLOB_IT blob_it;
963   inT16 word_length;
964   inT16 score = 0;
965   inT16 i;
966   float small_limit = bln_x_height * fixsp_small_outlines_size;
967 
968   if (!fixsp_fp_eval)
969     return (eval_word_spacing (word_res_list));
970 
971   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
972     word = word_it.data ();
973     word_length = word->reject_map.length ();
974     if ((word->done ||
975       word->tess_accepted) ||
976       (word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
977       (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
978       (word->best_choice->permuter () == USER_DAWG_PERM) ||
979         (safe_dict_word(*(word->best_choice)) > 0)) {
980       blob_it.set_to_list (word->outword->blob_list ());
981       UNICHAR_ID space = getDict().getUnicharset().unichar_to_id(" ");
982       for (i = 0; i < word->best_choice->length(); ++i, blob_it.forward()) {
983         if (word->best_choice->unichar_id(i) == space ||
984             (blob_noise_score(blob_it.data()) < small_limit)) {
985           score -= 1;            //penalise possibly erroneous non-space
986         } else if (word->reject_map[i].accepted()) {
987           score++;
988       }
989     }
990   }
991   }
992   if (score < 0)
993     score = 0;
994   return score;
995 }
996 }  // namespace tesseract
997