1 /******************************************************************
2 * File: fixspace.cpp (Formerly fixspace.c)
 * Description: Implements a pass over the page res, exploring the alternative
 *              spacing possibilities, trying to use context to improve the
 *              word spacing
6 * Author: Phil Cheatle
7 * Created: Thu Oct 21 11:38:43 BST 1993
8 *
9 * (C) Copyright 1993, Hewlett-Packard Ltd.
10 ** Licensed under the Apache License, Version 2.0 (the "License");
11 ** you may not use this file except in compliance with the License.
12 ** You may obtain a copy of the License at
13 ** http://www.apache.org/licenses/LICENSE-2.0
14 ** Unless required by applicable law or agreed to in writing, software
15 ** distributed under the License is distributed on an "AS IS" BASIS,
16 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 ** See the License for the specific language governing permissions and
18 ** limitations under the License.
19 *
20 **********************************************************************/
21
22 #include "mfcpch.h"
23 #include <ctype.h>
24 #include "reject.h"
25 #include "statistc.h"
26 #include "genblob.h"
27 #include "control.h"
28 #include "fixspace.h"
29 #include "tessvars.h"
30 #include "tessbox.h"
31 #include "secname.h"
32 #include "globals.h"
33 #include "tesseractclass.h"
34
35 #define EXTERN
36
37 EXTERN BOOL_VAR (fixsp_check_for_fp_noise_space, TRUE,
38 "Try turning noise to space in fixed pitch");
39 EXTERN BOOL_VAR (fixsp_fp_eval, TRUE, "Use alternate evaluation for fp");
40 EXTERN BOOL_VAR (fixsp_noise_score_fixing, TRUE, "More sophisticated?");
41 EXTERN INT_VAR (fixsp_non_noise_limit, 1,
42 "How many non-noise blbs either side?");
43 EXTERN double_VAR (fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
44
45 EXTERN BOOL_VAR (fixsp_ignore_punct, TRUE, "In uniform spacing calc");
46 EXTERN BOOL_VAR (fixsp_numeric_fix, TRUE, "Try to deal with numeric punct");
47 EXTERN BOOL_VAR (fixsp_prefer_joined_1s, TRUE, "Arbitrary boost");
48 EXTERN BOOL_VAR (tessedit_test_uniform_wd_spacing, FALSE,
49 "Limit context word spacing");
50 EXTERN BOOL_VAR (tessedit_prefer_joined_punct, FALSE,
51 "Reward punctation joins");
52 EXTERN INT_VAR (fixsp_done_mode, 1, "What constitues done for spacing");
53 EXTERN INT_VAR (debug_fix_space_level, 0, "Contextual fixspace debug");
54 EXTERN STRING_VAR (numeric_punctuation, ".,",
55 "Punct. chs expected WITHIN numbers");
56
57 #define PERFECT_WERDS 999
58 #define MAXSPACING 128 /*max expected spacing in pix */
59
60 /*************************************************************************
61 * fix_fuzzy_spaces()
62 * Walk over the page finding sequences of words joined by fuzzy spaces. Extract
63 * them as a sublist, process the sublist to find the optimal arrangement of
64 * spaces then replace the sublist in the ROW_RES.
65 *************************************************************************/
66 namespace tesseract {
fix_fuzzy_spaces(volatile ETEXT_DESC * monitor,inT32 word_count,PAGE_RES * page_res)67 void Tesseract::fix_fuzzy_spaces( //find fuzzy words
68 //progress monitor
69 volatile ETEXT_DESC *monitor,
70 //count of words in doc
71 inT32 word_count,
72 PAGE_RES *page_res) {
73 BLOCK_RES_IT block_res_it; //iterators
74 ROW_RES_IT row_res_it;
75 WERD_RES_IT word_res_it_from;
76 WERD_RES_IT word_res_it_to;
77 WERD_RES *word_res;
78 WERD_RES_LIST fuzzy_space_words;
79 inT16 new_length;
80 BOOL8 prevent_null_wd_fixsp; //DONT process blobless wds
81 inT32 word_index; //current word
82
83 block_res_it.set_to_list (&page_res->block_res_list);
84 word_index = 0;
85 for (block_res_it.mark_cycle_pt ();
86 !block_res_it.cycled_list (); block_res_it.forward ()) {
87 row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
88 for (row_res_it.mark_cycle_pt ();
89 !row_res_it.cycled_list (); row_res_it.forward ()) {
90 word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
91 while (!word_res_it_from.at_last ()) {
92 word_res = word_res_it_from.data ();
93 while (!word_res_it_from.at_last () &&
94 !(word_res->combination ||
95 word_res_it_from.data_relative (1)->
96 word->flag (W_FUZZY_NON) ||
97 word_res_it_from.data_relative (1)->
98 word->flag (W_FUZZY_SP))) {
99 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
100 block_res_it.data()->block);
101 word_res = word_res_it_from.forward ();
102 word_index++;
103 if (monitor != NULL) {
104 monitor->ocr_alive = TRUE;
105 monitor->progress = 90 + 5 * word_index / word_count;
106 }
107 }
108
109 if (!word_res_it_from.at_last ()) {
110 word_res_it_to = word_res_it_from;
111 prevent_null_wd_fixsp =
112 word_res->word->gblob_list ()->empty ();
113 if (check_debug_pt (word_res, 60))
114 debug_fix_space_level.set_value (10);
115 word_res_it_to.forward ();
116 word_index++;
117 if (monitor != NULL) {
118 monitor->ocr_alive = TRUE;
119 monitor->progress = 90 + 5 * word_index / word_count;
120 }
121 while (!word_res_it_to.at_last () &&
122 (word_res_it_to.data_relative (1)->
123 word->flag (W_FUZZY_NON) ||
124 word_res_it_to.data_relative (1)->
125 word->flag (W_FUZZY_SP))) {
126 if (check_debug_pt (word_res, 60))
127 debug_fix_space_level.set_value (10);
128 if (word_res->word->gblob_list ()->empty ())
129 prevent_null_wd_fixsp = TRUE;
130 word_res = word_res_it_to.forward ();
131 }
132 if (check_debug_pt (word_res, 60))
133 debug_fix_space_level.set_value (10);
134 if (word_res->word->gblob_list ()->empty ())
135 prevent_null_wd_fixsp = TRUE;
136 if (prevent_null_wd_fixsp) {
137 word_res_it_from = word_res_it_to;
138 } else {
139 fuzzy_space_words.assign_to_sublist (&word_res_it_from,
140 &word_res_it_to);
141 fix_fuzzy_space_list (fuzzy_space_words,
142 row_res_it.data()->row,
143 block_res_it.data()->block);
144 new_length = fuzzy_space_words.length ();
145 word_res_it_from.add_list_before (&fuzzy_space_words);
146 for (;
147 (!word_res_it_from.at_last () &&
148 (new_length > 0)); new_length--) {
149 word_res_it_from.forward ();
150 }
151 }
152 if (test_pt)
153 debug_fix_space_level.set_value (0);
154 }
155 fix_sp_fp_word(word_res_it_from, row_res_it.data ()->row,
156 block_res_it.data()->block);
157 //Last word in row
158 }
159 }
160 }
161 }
162
fix_fuzzy_space_list(WERD_RES_LIST & best_perm,ROW * row,BLOCK * block)163 void Tesseract::fix_fuzzy_space_list( //space explorer
164 WERD_RES_LIST &best_perm,
165 ROW *row,
166 BLOCK* block) {
167 inT16 best_score;
168 WERD_RES_LIST current_perm;
169 inT16 current_score;
170 BOOL8 improved = FALSE;
171
172 best_score = eval_word_spacing(best_perm); // default score
173 dump_words (best_perm, best_score, 1, improved);
174
175 if (best_score != PERFECT_WERDS)
176 initialise_search(best_perm, current_perm);
177
178 while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
179 match_current_words(current_perm, row, block);
180 current_score = eval_word_spacing (current_perm);
181 dump_words (current_perm, current_score, 2, improved);
182 if (current_score > best_score) {
183 best_perm.clear ();
184 best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy);
185 best_score = current_score;
186 improved = TRUE;
187 }
188 if (current_score < PERFECT_WERDS)
189 transform_to_next_perm(current_perm);
190 }
191 dump_words (best_perm, best_score, 3, improved);
192 }
193
194 } // namespace tesseract
195
initialise_search(WERD_RES_LIST & src_list,WERD_RES_LIST & new_list)196 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
197 WERD_RES_IT src_it(&src_list);
198 WERD_RES_IT new_it(&new_list);
199 WERD_RES *src_wd;
200 WERD_RES *new_wd;
201
202 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
203 src_wd = src_it.data ();
204 if (!src_wd->combination) {
205 new_wd = new WERD_RES (*src_wd);
206 new_wd->combination = FALSE;
207 new_wd->part_of_combo = FALSE;
208 new_it.add_after_then_move (new_wd);
209 }
210 }
211 }
212
213
214 namespace tesseract {
match_current_words(WERD_RES_LIST & words,ROW * row,BLOCK * block)215 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
216 BLOCK* block) {
217 WERD_RES_IT word_it(&words);
218 WERD_RES *word;
219
220 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
221 word = word_it.data ();
222 if ((!word->part_of_combo) && (word->outword == NULL))
223 classify_word_pass2(word, block, row);
224 }
225 }
226
227
228 /*************************************************************************
229 * eval_word_spacing()
230 * The basic measure is the number of characters in contextually confirmed
231 * words. (I.e the word is done)
232 * If all words are contextually confirmed the evaluation is deemed perfect.
233 *
 * Some fiddles are done to handle "1"s as these are VERY frequent causes of
 * fuzzy spaces. The problem with the basic measure is that "561 63" would score
 * the same as "56163", though given our knowledge that the space is fuzzy, and
 * that there is a "1" next to the fuzzy space, we need to ensure that "56163"
 * is preferred.
239 *
240 * The solution is to NOT COUNT the score of any word which has a digit at one
241 * end and a "1Il" as the character the other side of the space.
242 *
 * Conversely, any character next to a "1" within a word is counted as a
 * positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus
 * 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric
 * word + 2 sides of a "1" joined.
247 *
 * The joined 1 rule is applied to any word REGARDLESS of contextual
 * confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually
 * confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.
251 *
252 *************************************************************************/
eval_word_spacing(WERD_RES_LIST & word_res_list)253 inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
254 WERD_RES_IT word_res_it(&word_res_list);
255 inT16 total_score = 0;
256 inT16 word_count = 0;
257 inT16 done_word_count = 0;
258 inT16 word_len;
259 inT16 i;
260 inT16 offset;
261 WERD_RES *word; //current word
262 inT16 prev_word_score = 0;
263 BOOL8 prev_word_done = FALSE;
264 BOOL8 prev_char_1 = FALSE; //prev ch a "1/I/l"?
265 BOOL8 prev_char_digit = FALSE; //prev ch 2..9 or 0
266 BOOL8 current_char_1 = FALSE;
267 BOOL8 current_word_ok_so_far;
268 STRING punct_chars = "!\"`',.:;";
269 BOOL8 prev_char_punct = FALSE;
270 BOOL8 current_char_punct = FALSE;
271 BOOL8 word_done = FALSE;
272
273 do {
274 word = word_res_it.data ();
275 word_done = fixspace_thinks_word_done (word);
276 word_count++;
277 if (word->tess_failed) {
278 total_score += prev_word_score;
279 if (prev_word_done)
280 done_word_count++;
281 prev_word_score = 0;
282 prev_char_1 = FALSE;
283 prev_char_digit = FALSE;
284 prev_word_done = FALSE;
285 }
286 else {
287 /*
288 Can we add the prev word score and potentially count this word?
289 Yes IF it didnt end in a 1 when the first char of this word is a digit
290 AND it didnt end in a digit when the first char of this word is a 1
291 */
292 word_len = word->reject_map.length ();
293 current_word_ok_so_far = FALSE;
294 if (!((prev_char_1 &&
295 digit_or_numeric_punct (word, 0)) ||
296 (prev_char_digit &&
297 ((word_done &&
298 (word->best_choice->unichar_lengths().string()[0] == 1 &&
299 word->best_choice->unichar_string()[0] == '1')) ||
300 (!word_done &&
301 STRING(conflict_set_I_l_1).contains(
302 word->best_choice->unichar_string ()[0])))))) {
303 total_score += prev_word_score;
304 if (prev_word_done)
305 done_word_count++;
306 current_word_ok_so_far = word_done;
307 }
308
309 if ((current_word_ok_so_far) &&
310 (!tessedit_test_uniform_wd_spacing ||
311 ((word->best_choice->permuter () == NUMBER_PERM) ||
312 uniformly_spaced (word)))) {
313 prev_word_done = TRUE;
314 prev_word_score = word_len;
315 }
316 else {
317 prev_word_done = FALSE;
318 prev_word_score = 0;
319 }
320
321 if (fixsp_prefer_joined_1s) {
322 /* Add 1 to total score for every joined 1 regardless of context and
323 rejtn */
324
325 for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
326 current_char_1 = word->best_choice->unichar_string()[i] == '1';
327 if (prev_char_1 || (current_char_1 && (i > 0)))
328 total_score++;
329 prev_char_1 = current_char_1;
330 }
331 }
332
333 /* Add 1 to total score for every joined punctuation regardless of context
334 and rejtn */
335 if (tessedit_prefer_joined_punct) {
336 for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
337 offset += word->best_choice->unichar_lengths()[i++]) {
338 current_char_punct =
339 punct_chars.contains (word->best_choice->unichar_string()[offset]);
340 if (prev_char_punct || (current_char_punct && (i > 0)))
341 total_score++;
342 prev_char_punct = current_char_punct;
343 }
344 }
345 prev_char_digit = digit_or_numeric_punct (word, word_len - 1);
346 for (i = 0, offset = 0; i < word_len - 1;
347 offset += word->best_choice->unichar_lengths()[i++]);
348 prev_char_1 =
349 ((word_done
350 && (word->best_choice->unichar_string()[offset] == '1'))
351 || (!word_done
352 && STRING(conflict_set_I_l_1).contains(
353 word->best_choice->unichar_string()[offset])));
354 }
355 /* Find next word */
356 do
357 word_res_it.forward ();
358 while (word_res_it.data ()->part_of_combo);
359 }
360 while (!word_res_it.at_first ());
361 total_score += prev_word_score;
362 if (prev_word_done)
363 done_word_count++;
364 if (done_word_count == word_count)
365 return PERFECT_WERDS;
366 else
367 return total_score;
368 }
369
370
digit_or_numeric_punct(WERD_RES * word,int char_position)371 BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
372 int i;
373 int offset;
374
375 for (i = 0, offset = 0; i < char_position;
376 offset += word->best_choice->unichar_lengths()[i++]);
377 return (unicharset.get_isdigit(word->best_choice->unichar_string().string() + offset,
378 word->best_choice->unichar_lengths()[i]) ||
379 (fixsp_numeric_fix &&
380 (word->best_choice->permuter () == NUMBER_PERM) &&
381 STRING (numeric_punctuation).contains
382 (word->best_choice->unichar_string().string()[offset])));
383 }
384 } // namespace tesseract
385
386
387 /*************************************************************************
388 * transform_to_next_perm()
 * Examines the current word list to find the smallest word gap size. Then walks
 * the word list closing any gaps of this size by either inserting new
 * combination words, or extending existing ones.
392 *
393 * The routine COULD be limited to stop it building words longer than N blobs.
394 *
395 * If there are no more gaps then it DELETES the entire list and returns the
396 * empty list to cause termination.
397 *************************************************************************/
transform_to_next_perm(WERD_RES_LIST & words)398 void transform_to_next_perm(WERD_RES_LIST &words) {
399 WERD_RES_IT word_it(&words);
400 WERD_RES_IT prev_word_it(&words);
401 WERD_RES *word;
402 WERD_RES *prev_word;
403 WERD_RES *combo;
404 WERD *copy_word;
405 inT16 prev_right = -1;
406 TBOX box;
407 inT16 gap;
408 inT16 min_gap = MAX_INT16;
409
410 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
411 word = word_it.data ();
412 if (!word->part_of_combo) {
413 box = word->word->bounding_box ();
414 if (prev_right >= 0) {
415 gap = box.left () - prev_right;
416 if (gap < min_gap)
417 min_gap = gap;
418 }
419 prev_right = box.right ();
420 }
421 }
422 if (min_gap < MAX_INT16) {
423 prev_right = -1; //back to start
424 word_it.set_to_list (&words);
425 for (; //cant use cycle pt due to inserted combos at start of list
426 (prev_right < 0) || !word_it.at_first (); word_it.forward ()) {
427 word = word_it.data ();
428 if (!word->part_of_combo) {
429 box = word->word->bounding_box ();
430 if (prev_right >= 0) {
431 gap = box.left () - prev_right;
432 if (gap <= min_gap) {
433 prev_word = prev_word_it.data ();
434 if (prev_word->combination)
435 combo = prev_word;
436 else {
437 /* Make a new combination and insert before the first word being joined */
438 copy_word = new WERD;
439 *copy_word = *(prev_word->word);
440 //deep copy
441 combo = new WERD_RES (copy_word);
442 combo->combination = TRUE;
443 combo->x_height = prev_word->x_height;
444 prev_word->part_of_combo = TRUE;
445 prev_word_it.add_before_then_move (combo);
446 }
447 combo->word->set_flag (W_EOL, word->word->flag (W_EOL));
448 if (word->combination) {
449 combo->word->join_on (word->word);
450 //Move blbs to combo
451 //old combo no longer needed
452 delete word_it.extract ();
453 }
454 else {
455 //Cpy current wd to combo
456 combo->copy_on (word);
457 word->part_of_combo = TRUE;
458 }
459 combo->done = FALSE;
460 if (combo->outword != NULL) {
461 delete combo->outword;
462 delete combo->best_choice;
463 delete combo->raw_choice;
464 combo->outword = NULL;
465 combo->best_choice = NULL;
466 combo->raw_choice = NULL;
467 }
468 }
469 else
470 //catch up
471 prev_word_it = word_it;
472 }
473 prev_right = box.right ();
474 }
475 }
476 }
477 else
478 words.clear (); //signal termination
479 }
480
481
dump_words(WERD_RES_LIST & perm,inT16 score,inT16 mode,BOOL8 improved)482 void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved) {
483 WERD_RES_IT word_res_it(&perm);
484 static STRING initial_str;
485
486 if (debug_fix_space_level > 0) {
487 if (mode == 1) {
488 initial_str = "";
489 for (word_res_it.mark_cycle_pt ();
490 !word_res_it.cycled_list (); word_res_it.forward ()) {
491 if (!word_res_it.data ()->part_of_combo) {
492 initial_str += word_res_it.data()->best_choice->unichar_string();
493 initial_str += ' ';
494 }
495 }
496 }
497
498 #ifndef SECURE_NAMES
499 if (debug_fix_space_level > 1) {
500 switch (mode) {
501 case 1:
502 tprintf ("EXTRACTED (%d): \"", score);
503 break;
504 case 2:
505 tprintf ("TESTED (%d): \"", score);
506 break;
507 case 3:
508 tprintf ("RETURNED (%d): \"", score);
509 break;
510 }
511
512 for (word_res_it.mark_cycle_pt ();
513 !word_res_it.cycled_list (); word_res_it.forward ()) {
514 if (!word_res_it.data ()->part_of_combo)
515 tprintf("%s/%1d ",
516 word_res_it.data()->best_choice->unichar_string().string(),
517 (int)word_res_it.data()->best_choice->permuter());
518 }
519 tprintf ("\"\n");
520 }
521 else if (improved) {
522 tprintf ("FIX SPACING \"%s\" => \"", initial_str.string ());
523 for (word_res_it.mark_cycle_pt ();
524 !word_res_it.cycled_list (); word_res_it.forward ()) {
525 if (!word_res_it.data ()->part_of_combo)
526 tprintf ("%s/%1d ",
527 word_res_it.data()->best_choice->unichar_string().string(),
528 (int)word_res_it.data()->best_choice->permuter());
529 }
530 tprintf ("\"\n");
531 }
532 #endif
533 }
534 }
535
536
537 /*************************************************************************
538 * uniformly_spaced()
539 * Return true if one of the following are true:
540 * - All inter-char gaps are the same width
541 * - The largest gap is no larger than twice the mean/median of the others
542 * - The largest gap is < 64/5 = 13 and all others are <= 0
543 * **** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
544 *************************************************************************/
uniformly_spaced(WERD_RES * word)545 BOOL8 uniformly_spaced( //sensible word
546 WERD_RES *word) {
547 PBLOB_IT blob_it;
548 TBOX box;
549 inT16 prev_right = -MAX_INT16;
550 inT16 gap;
551 inT16 max_gap = -MAX_INT16;
552 inT16 max_gap_count = 0;
553 STATS gap_stats (0, MAXSPACING);
554 BOOL8 result;
555 const ROW *row = word->denorm.row ();
556 float max_non_space;
557 float normalised_max_nonspace;
558 inT16 i = 0;
559 inT16 offset = 0;
560 STRING punct_chars = "\"`',.:;";
561
562 blob_it.set_to_list (word->outword->blob_list ());
563
564 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
565 box = blob_it.data ()->bounding_box ();
566 if ((prev_right > -MAX_INT16) &&
567 (!fixsp_ignore_punct ||
568 (!punct_chars.contains (word->best_choice->unichar_string()
569 [offset - word->best_choice->unichar_lengths()[i - 1]]) &&
570 !punct_chars.contains (word->best_choice->unichar_string()[offset])))) {
571 gap = box.left () - prev_right;
572 if (gap < max_gap)
573 gap_stats.add (gap, 1);
574 else if (gap == max_gap)
575 max_gap_count++;
576 else {
577 if (max_gap_count > 0)
578 gap_stats.add (max_gap, max_gap_count);
579 max_gap = gap;
580 max_gap_count = 1;
581 }
582 }
583 prev_right = box.right ();
584 offset += word->best_choice->unichar_lengths()[i++];
585 }
586
587 max_non_space = (row->space () + 3 * row->kern ()) / 4;
588 normalised_max_nonspace = max_non_space * bln_x_height / row->x_height ();
589
590 result = ((gap_stats.get_total () == 0) ||
591 (max_gap <= normalised_max_nonspace) ||
592 ((gap_stats.get_total () > 2) &&
593 (max_gap <= 2 * gap_stats.median ())) ||
594 ((gap_stats.get_total () <= 2) &&
595 (max_gap <= 2 * gap_stats.mean ())));
596 #ifndef SECURE_NAMES
597 if ((debug_fix_space_level > 1)) {
598 if (result)
599 tprintf
600 ("ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
601 word->best_choice->unichar_string().string (), normalised_max_nonspace,
602 max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
603 gap_stats.median ());
604 else
605 tprintf
606 ("REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
607 word->best_choice->unichar_string().string (), normalised_max_nonspace,
608 max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
609 gap_stats.median ());
610 }
611 #endif
612
613 return result;
614 }
615
616
fixspace_thinks_word_done(WERD_RES * word)617 BOOL8 fixspace_thinks_word_done(WERD_RES *word) {
618 if (word->done)
619 return TRUE;
620
621 /*
622 Use all the standard pass 2 conditions for mode 5 in set_done() in
623 reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
624 CARE WHETHER WE HAVE of/at on/an etc.
625 */
626 if ((fixsp_done_mode > 0) &&
627 (word->tess_accepted ||
628 ((fixsp_done_mode == 2) &&
629 (word->reject_map.reject_count () == 0)) ||
630 (fixsp_done_mode == 3)) &&
631 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL) &&
632 ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
633 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
634 (word->best_choice->permuter () == USER_DAWG_PERM) ||
635 (word->best_choice->permuter () == NUMBER_PERM)))
636 return TRUE;
637 else
638 return FALSE;
639 }
640
641
642 /*************************************************************************
643 * fix_sp_fp_word()
 * Test the current word to see if it can be split by deleting noise blobs. If
 * so, do the business.
646 * Return with the iterator pointing to the same place if the word is unchanged,
647 * or the last of the replacement words.
648 *************************************************************************/
649 namespace tesseract {
fix_sp_fp_word(WERD_RES_IT & word_res_it,ROW * row,BLOCK * block)650 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
651 BLOCK* block) {
652 WERD_RES *word_res;
653 WERD_RES_LIST sub_word_list;
654 WERD_RES_IT sub_word_list_it(&sub_word_list);
655 inT16 blob_index;
656 inT16 new_length;
657 float junk;
658
659 word_res = word_res_it.data ();
660 if (!fixsp_check_for_fp_noise_space ||
661 word_res->word->flag (W_REP_CHAR) ||
662 word_res->combination ||
663 word_res->part_of_combo || !word_res->word->flag (W_DONT_CHOP))
664 return;
665
666 blob_index = worst_noise_blob (word_res, &junk);
667 if (blob_index < 0)
668 return;
669
670 #ifndef SECURE_NAMES
671 if (debug_fix_space_level > 1) {
672 tprintf ("FP fixspace working on \"%s\"\n",
673 word_res->best_choice->unichar_string().string());
674 }
675 #endif
676 gblob_sort_list ((PBLOB_LIST *) word_res->word->rej_cblob_list (), FALSE);
677 sub_word_list_it.add_after_stay_put (word_res_it.extract ());
678 fix_noisy_space_list(sub_word_list, row, block);
679 new_length = sub_word_list.length ();
680 word_res_it.add_list_before (&sub_word_list);
681 for (; (!word_res_it.at_last () && (new_length > 1)); new_length--) {
682 word_res_it.forward ();
683 }
684 }
685
fix_noisy_space_list(WERD_RES_LIST & best_perm,ROW * row,BLOCK * block)686 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
687 BLOCK* block) {
688 inT16 best_score;
689 WERD_RES_IT best_perm_it(&best_perm);
690 WERD_RES_LIST current_perm;
691 WERD_RES_IT current_perm_it(¤t_perm);
692 WERD_RES *old_word_res;
693 WERD_RES *new_word_res;
694 inT16 current_score;
695 BOOL8 improved = FALSE;
696
697 //default score
698 best_score = fp_eval_word_spacing (best_perm);
699
700 dump_words (best_perm, best_score, 1, improved);
701
702 new_word_res = new WERD_RES;
703 old_word_res = best_perm_it.data ();
704 //Kludge to force deep copy
705 old_word_res->combination = TRUE;
706 *new_word_res = *old_word_res; //deep copy
707 //Undo kludge
708 old_word_res->combination = FALSE;
709 //Undo kludge
710 new_word_res->combination = FALSE;
711 current_perm_it.add_to_end (new_word_res);
712
713 break_noisiest_blob_word(current_perm);
714
715 while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
716 match_current_words(current_perm, row, block);
717 current_score = fp_eval_word_spacing (current_perm);
718 dump_words (current_perm, current_score, 2, improved);
719 if (current_score > best_score) {
720 best_perm.clear ();
721 best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy);
722 best_score = current_score;
723 improved = TRUE;
724 }
725 if (current_score < PERFECT_WERDS)
726 break_noisiest_blob_word(current_perm);
727 }
728 dump_words (best_perm, best_score, 3, improved);
729 }
730 } // namespace tesseract
731
732
733 /*************************************************************************
734 * break_noisiest_blob_word()
735 * Find the word with the blob which looks like the worst noise.
736 * Break the word into two, deleting the noise blob.
737 *************************************************************************/
break_noisiest_blob_word(WERD_RES_LIST & words)738 void break_noisiest_blob_word(WERD_RES_LIST &words) {
739 WERD_RES_IT word_it(&words);
740 WERD_RES_IT worst_word_it;
741 float worst_noise_score = 9999;
742 int worst_blob_index = -1; //noisiest blb of noisiest wd
743 int blob_index; //of wds noisiest blb
744 float noise_score; //of wds noisiest blb
745 WERD_RES *word_res;
746 C_BLOB_IT blob_it;
747 C_BLOB_IT rej_cblob_it;
748 C_BLOB_LIST new_blob_list;
749 C_BLOB_IT new_blob_it;
750 C_BLOB_IT new_rej_cblob_it;
751 WERD *new_word;
752 inT16 start_of_noise_blob;
753 inT16 i;
754
755 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
756 blob_index = worst_noise_blob (word_it.data (), &noise_score);
757 if ((blob_index > -1) && (worst_noise_score > noise_score)) {
758 worst_noise_score = noise_score;
759 worst_blob_index = blob_index;
760 worst_word_it = word_it;
761 }
762 }
763 if (worst_blob_index < 0) {
764 words.clear (); //signal termination
765 return;
766 }
767
768 /* Now split the worst_word_it */
769
770 word_res = worst_word_it.data ();
771
772 /* Move blobs before noise blob to a new bloblist */
773
774 new_blob_it.set_to_list (&new_blob_list);
775 blob_it.set_to_list (word_res->word->cblob_list ());
776 for (i = 0; i < worst_blob_index; i++, blob_it.forward ()) {
777 new_blob_it.add_after_then_move (blob_it.extract ());
778 }
779 start_of_noise_blob = blob_it.data ()->bounding_box ().left ();
780 delete blob_it.extract (); //throw out noise blb
781
782 new_word = new WERD (&new_blob_list, word_res->word);
783 new_word->set_flag (W_EOL, FALSE);
784 word_res->word->set_flag (W_BOL, FALSE);
785 word_res->word->set_blanks (1);//After break
786
787 new_rej_cblob_it.set_to_list (new_word->rej_cblob_list ());
788 rej_cblob_it.set_to_list (word_res->word->rej_cblob_list ());
789 for (;
790 (!rej_cblob_it.empty () &&
791 (rej_cblob_it.data ()->bounding_box ().left () <
792 start_of_noise_blob)); rej_cblob_it.forward ()) {
793 new_rej_cblob_it.add_after_then_move (rej_cblob_it.extract ());
794 }
795
796 worst_word_it.add_before_then_move (new WERD_RES (new_word));
797
798 word_res->done = FALSE;
799 if (word_res->outword != NULL) {
800 delete word_res->outword;
801 delete word_res->best_choice;
802 delete word_res->raw_choice;
803 word_res->outword = NULL;
804 word_res->best_choice = NULL;
805 word_res->raw_choice = NULL;
806 }
807 }
808
809
worst_noise_blob(WERD_RES * word_res,float * worst_noise_score)810 inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
811 PBLOB_IT blob_it;
812 inT16 blob_count;
813 float noise_score[512];
814 int i;
815 int min_noise_blob; //1st contender
816 int max_noise_blob; //last contender
817 int non_noise_count;
818 int worst_noise_blob; //Worst blob
819 float small_limit = bln_x_height * fixsp_small_outlines_size;
820 float non_noise_limit = bln_x_height * 0.8;
821
822 blob_it.set_to_list (word_res->outword->blob_list ());
823 //normalised
824 blob_count = blob_it.length ();
825 ASSERT_HOST (blob_count <= 512);
826 if (blob_count < 5)
827 return -1; //too short to split
828 /* Get the noise scores for all blobs */
829
830 #ifndef SECURE_NAMES
831 if (debug_fix_space_level > 5)
832 tprintf ("FP fixspace Noise metrics for \"%s\": ",
833 word_res->best_choice->unichar_string().string());
834 #endif
835
836 for (i = 0; i < blob_count; i++, blob_it.forward ()) {
837 if (word_res->reject_map[i].accepted ())
838 noise_score[i] = non_noise_limit;
839 else
840 noise_score[i] = blob_noise_score (blob_it.data ());
841
842 if (debug_fix_space_level > 5)
843 tprintf ("%1.1f ", noise_score[i]);
844 }
845 if (debug_fix_space_level > 5)
846 tprintf ("\n");
847
848 /* Now find the worst one which is far enough away from the end of the word */
849
850 non_noise_count = 0;
851 for (i = 0;
852 (i < blob_count) && (non_noise_count < fixsp_non_noise_limit); i++) {
853 if (noise_score[i] >= non_noise_limit)
854 non_noise_count++;
855 }
856 if (non_noise_count < fixsp_non_noise_limit)
857 return -1;
858 min_noise_blob = i;
859
860 non_noise_count = 0;
861 for (i = blob_count - 1;
862 (i >= 0) && (non_noise_count < fixsp_non_noise_limit); i--) {
863 if (noise_score[i] >= non_noise_limit)
864 non_noise_count++;
865 }
866 if (non_noise_count < fixsp_non_noise_limit)
867 return -1;
868 max_noise_blob = i;
869
870 if (min_noise_blob > max_noise_blob)
871 return -1;
872
873 *worst_noise_score = small_limit;
874 worst_noise_blob = -1;
875 for (i = min_noise_blob; i <= max_noise_blob; i++) {
876 if (noise_score[i] < *worst_noise_score) {
877 worst_noise_blob = i;
878 *worst_noise_score = noise_score[i];
879 }
880 }
881 return worst_noise_blob;
882 }
883
884
blob_noise_score(PBLOB * blob)885 float blob_noise_score(PBLOB *blob) {
886 OUTLINE_IT outline_it;
887 TBOX box; //BB of outline
888 inT16 outline_count = 0;
889 inT16 max_dimension;
890 inT16 largest_outline_dimension = 0;
891
892 outline_it.set_to_list (blob->out_list ());
893 for (outline_it.mark_cycle_pt ();
894 !outline_it.cycled_list (); outline_it.forward ()) {
895 outline_count++;
896 box = outline_it.data ()->bounding_box ();
897 if (box.height () > box.width ())
898 max_dimension = box.height ();
899 else
900 max_dimension = box.width ();
901
902 if (largest_outline_dimension < max_dimension)
903 largest_outline_dimension = max_dimension;
904 }
905
906 if (fixsp_noise_score_fixing) {
907 if (outline_count > 5)
908 //penalise LOTS of blobs
909 largest_outline_dimension *= 2;
910
911 box = blob->bounding_box ();
912
913 if ((box.bottom () > bln_baseline_offset * 4) ||
914 (box.top () < bln_baseline_offset / 2))
915 //Lax blob is if high or low
916 largest_outline_dimension /= 2;
917 }
918 return largest_outline_dimension;
919 }
920
921
fixspace_dbg(WERD_RES * word)922 void fixspace_dbg(WERD_RES *word) {
923 TBOX box = word->word->bounding_box ();
924 BOOL8 show_map_detail = FALSE;
925 inT16 i;
926
927 box.print ();
928 #ifndef SECURE_NAMES
929 tprintf (" \"%s\" ", word->best_choice->unichar_string().string ());
930 tprintf ("Blob count: %d (word); %d/%d (outword)\n",
931 word->word->gblob_list ()->length (),
932 word->outword->gblob_list ()->length (),
933 word->outword->rej_blob_list ()->length ());
934 word->reject_map.print (debug_fp);
935 tprintf ("\n");
936 if (show_map_detail) {
937 tprintf ("\"%s\"\n", word->best_choice->unichar_string().string ());
938 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
939 tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
940 word->reject_map[i].full_print (debug_fp);
941 }
942 }
943
944 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
945 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
946 #endif
947 }
948
949
950 /*************************************************************************
951 * fp_eval_word_spacing()
952 * Evaluation function for fixed pitch word lists.
953 *
954 * Basically, count the number of "nice" characters - those which are in tess
955 * acceptable words or in dict words and are not rejected.
956 * Penalise any potential noise chars
957 *************************************************************************/
958 namespace tesseract {
fp_eval_word_spacing(WERD_RES_LIST & word_res_list)959 inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
960 WERD_RES_IT word_it(&word_res_list);
961 WERD_RES *word;
962 PBLOB_IT blob_it;
963 inT16 word_length;
964 inT16 score = 0;
965 inT16 i;
966 float small_limit = bln_x_height * fixsp_small_outlines_size;
967
968 if (!fixsp_fp_eval)
969 return (eval_word_spacing (word_res_list));
970
971 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
972 word = word_it.data ();
973 word_length = word->reject_map.length ();
974 if ((word->done ||
975 word->tess_accepted) ||
976 (word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
977 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
978 (word->best_choice->permuter () == USER_DAWG_PERM) ||
979 (safe_dict_word(*(word->best_choice)) > 0)) {
980 blob_it.set_to_list (word->outword->blob_list ());
981 UNICHAR_ID space = getDict().getUnicharset().unichar_to_id(" ");
982 for (i = 0; i < word->best_choice->length(); ++i, blob_it.forward()) {
983 if (word->best_choice->unichar_id(i) == space ||
984 (blob_noise_score(blob_it.data()) < small_limit)) {
985 score -= 1; //penalise possibly erroneous non-space
986 } else if (word->reject_map[i].accepted()) {
987 score++;
988 }
989 }
990 }
991 }
992 if (score < 0)
993 score = 0;
994 return score;
995 }
996 } // namespace tesseract
997