1 /******************************************************************
2 * File: docqual.cpp (Formerly docqual.c)
3 * Description: Document Quality Metrics
4 * Author: Phil Cheatle
5 * Created: Mon May 9 11:27:28 BST 1994
6 *
7 * (C) Copyright 1994, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #include "mfcpch.h"
21 #include <ctype.h>
22 #include "docqual.h"
23 #include "tstruct.h"
24 #include "tfacep.h"
25 #include "reject.h"
26 #include "tessvars.h"
27 #include "genblob.h"
28 #include "secname.h"
29 #include "globals.h"
30 #include "tesseractclass.h"
31
32 #define EXTERN
33
34 EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
35 EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
36 "Non standard number of outlines");
37 EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
38 "Allow outline errs in unrejection?");
39 EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
40 "Reduce rejection on good docs");
41 EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
42 EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
43 "%rej allowed before rej whole doc");
44 EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
45 "%rej allowed before rej whole block");
46 EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
47 "%rej allowed before rej whole row");
48 EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
49 "%of row rejects in whole word rejects which prevents whole row rejection");
50 EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
51 "Only rej partially rejected words in block rejection");
52 EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
53 "Only rej partially rejected words in row rejection");
54 EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
55 "Use word segmentation quality metric");
56 EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
57 "Use word segmentation quality metric");
58 EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
59 "Only preserve wds longer than this");
60 EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
61 "Apply row rejection to good docs");
62 EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
63 "rej good doc wd if more than this fraction rejected");
64 EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
65 "Reject all bad quality wds");
66 EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
67 EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
68 "Output data to debug file");
69 EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no chekcs");
70 EXTERN double_VAR (quality_rowrej_pc, 1.1,
71 "good_quality_doc gte good char limit");
72
73 EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
74 "Mark v.bad words for tilde crunch");
75 EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
76 EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
77 "Take out ~^ early?");
78
79 EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
80 EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
81 EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
82 "crunch garbage cert lt this");
83 EXTERN double_VAR (crunch_poor_garbage_rate, 60,
84 "crunch garbage rating lt this");
85
86 EXTERN double_VAR (crunch_pot_poor_rate, 40,
87 "POTENTIAL crunch rating lt this");
88 EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
89 "POTENTIAL crunch cert lt this");
90 EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
91
92 EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
93 EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
94 EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
95 EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
96 EXTERN double_VAR (crunch_del_min_width, 3.0,
97 "Del if word width lt xht x this");
98 EXTERN double_VAR (crunch_del_high_word, 1.5,
99 "Del if word gt xht x this above bl");
100 EXTERN double_VAR (crunch_del_low_word, 0.5,
101 "Del if word gt xht x this below bl");
102 EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");
103
104 EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
105 EXTERN INT_VAR (crunch_pot_indicators, 1,
106 "How many potential indicators needed");
107
108 EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
109 "Dont touch sensible strings");
110 EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
111 EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
112 "Dont pot crunch sensible strings");
113 EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
114 EXTERN INT_VAR (crunch_leave_lc_strings, 4,
115 "Dont crunch words with long lower case strings");
116 EXTERN INT_VAR (crunch_leave_uc_strings, 4,
117 "Dont crunch words with long lower case strings");
118 EXTERN INT_VAR (crunch_long_repetitions, 3,
119 "Crunch words with long repetitions");
120
121 EXTERN INT_VAR (crunch_debug, 0, "As it says");
122
123 /*************************************************************************
124 * word_blob_quality()
125 * How many blobs in the outword are identical to those of the inword?
126 * ASSUME blobs in both initial word and outword are in ascending order of
127 * left hand blob edge.
128 *************************************************************************/
word_blob_quality(WERD_RES * word,ROW * row)129 inT16 word_blob_quality( //Blob seg changes
130 WERD_RES *word,
131 ROW *row) {
132 WERD *bln_word; //BL norm init word
133 TWERD *tessword; //tess format
134 WERD *init_word; //BL norm init word
135 PBLOB_IT outword_it;
136 PBLOB_IT initial_it;
137 inT16 i;
138 inT16 init_blobs_left;
139 inT16 match_count = 0;
140 BOOL8 matched;
141 TBOX out_box;
142 PBLOB *test_blob;
143 DENORM denorm;
144 float bln_xht;
145
146 if (word->word->gblob_list ()->empty ())
147 return 0;
148 //xht used for blnorm
149 bln_xht = bln_x_height / word->denorm.scale ();
150 bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);
151 /*
152 NOTE: Need to convert to tess format and back again to ensure that the
153 same float -> int rounding of coords is done to source wd as out wd before
154 comparison
155 */
156 tessword = make_tess_word(bln_word, NULL); // Convert word.
157 init_word = make_ed_word (tessword, bln_word);
158 delete bln_word;
159 delete_word(tessword);
160 if (init_word == NULL) {
161 // Conversion failed.
162 return 0;
163 }
164
165 initial_it.set_to_list (init_word->blob_list ());
166 init_blobs_left = initial_it.length ();
167 outword_it.set_to_list (word->outword->blob_list ());
168
169 for (outword_it.mark_cycle_pt ();
170 !outword_it.cycled_list (); outword_it.forward ()) {
171 out_box = outword_it.data ()->bounding_box ();
172
173 // Skip any initial blobs LEFT of current outword blob.
174 while (!initial_it.at_last () &&
175 (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
176 initial_it.forward ();
177 init_blobs_left--;
178 }
179
180 /* See if current outword blob matches any initial blob with the same left
181 coord. (Normally only one but possibly more - in unknown order) */
182
183 i = 0;
184 matched = FALSE;
185 do {
186 test_blob = initial_it.data_relative (i++);
187 matched = crude_match_blobs (test_blob, outword_it.data ());
188 if (matched)
189 match_count++;
190 }
191 while (!matched &&
192 (init_blobs_left - i > 0) &&
193 (i < 129) &&
194 !initial_it.at_last () &&
195 test_blob->bounding_box ().left () == out_box.left ());
196 }
197 delete init_word;
198 return match_count;
199 }
200
201
202 /*************************************************************************
203 * crude_match_blobs()
204 * Check bounding boxes are the same and the number of outlines are the same.
205 *************************************************************************/
crude_match_blobs(PBLOB * blob1,PBLOB * blob2)206 BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) {
207 TBOX box1 = blob1->bounding_box ();
208 TBOX box2 = blob2->bounding_box ();
209
210 if (box1.contains (box2) &&
211 box2.contains (box1) &&
212 (blob1->out_list ()->length () == blob1->out_list ()->length ()))
213 return TRUE;
214 else
215 return FALSE;
216 }
217
218
word_outline_errs(WERD_RES * word)219 inT16 word_outline_errs(WERD_RES *word) {
220 PBLOB_IT outword_it;
221 inT16 i = 0;
222 inT16 err_count = 0;
223
224 outword_it.set_to_list (word->outword->blob_list ());
225
226 for (outword_it.mark_cycle_pt ();
227 !outword_it.cycled_list (); outword_it.forward ()) {
228 err_count += count_outline_errs (word->best_choice->unichar_string()[i],
229 outword_it.data()->out_list()->length());
230 i++;
231 }
232 return err_count;
233 }
234
235
236 /*************************************************************************
237 * word_char_quality()
238 * Combination of blob quality and outline quality - how many good chars are
239 * there? - I.e chars which pass the blob AND outline tests.
240 *************************************************************************/
word_char_quality(WERD_RES * word,ROW * row,inT16 * match_count,inT16 * accepted_match_count)241 void word_char_quality(WERD_RES *word,
242 ROW *row,
243 inT16 *match_count,
244 inT16 *accepted_match_count) {
245 WERD *bln_word; //BL norm init word
246 TWERD *tessword; //tess format
247 WERD *init_word; //BL norm init word
248 PBLOB_IT outword_it;
249 PBLOB_IT initial_it;
250 inT16 i;
251 inT16 init_blobs_left;
252 BOOL8 matched;
253 TBOX out_box;
254 PBLOB *test_blob;
255 DENORM denorm;
256 float bln_xht;
257 inT16 j = 0;
258
259 *match_count = 0;
260 *accepted_match_count = 0;
261 if (word->word->gblob_list ()->empty ())
262 return;
263
264 //xht used for blnorm
265 bln_xht = bln_x_height / word->denorm.scale ();
266 bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);
267 /*
268 NOTE: Need to convert to tess format and back again to ensure that the
269 same float -> int rounding of coords is done to source wd as out wd before
270 comparison
271 */
272 tessword = make_tess_word(bln_word, NULL); // Convert word.
273 init_word = make_ed_word (tessword, bln_word);
274 delete bln_word;
275 delete_word(tessword);
276 if (init_word == NULL)
277 return;
278
279 initial_it.set_to_list (init_word->blob_list ());
280 init_blobs_left = initial_it.length ();
281 outword_it.set_to_list (word->outword->blob_list ());
282
283 for (outword_it.mark_cycle_pt ();
284 !outword_it.cycled_list (); outword_it.forward ()) {
285 out_box = outword_it.data ()->bounding_box ();
286
287 /* Skip any initial blobs LEFT of current outword blob */
288 while (!initial_it.at_last () &&
289 (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
290 initial_it.forward ();
291 init_blobs_left--;
292 }
293
294 /* See if current outword blob matches any initial blob with the same left
295 coord. (Normally only one but possibly more - in unknown order) */
296
297 i = 0;
298 matched = FALSE;
299 do {
300 test_blob = initial_it.data_relative (i++);
301 matched = crude_match_blobs (test_blob, outword_it.data ());
302 if (matched &&
303 (count_outline_errs (word->best_choice->unichar_string()[j],
304 outword_it.data ()->out_list ()->length ())
305 == 0)) {
306 (*match_count)++;
307 if (word->reject_map[j].accepted ())
308 (*accepted_match_count)++;
309 }
310 }
311 while (!matched &&
312 (init_blobs_left - i > 0) &&
313 (i < 129) &&
314 !initial_it.at_last () &&
315 test_blob->bounding_box ().left () == out_box.left ());
316 j++;
317 }
318 delete init_word;
319 }
320
321
322 /*************************************************************************
323 * unrej_good_chs()
324 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
325 *************************************************************************/
unrej_good_chs(WERD_RES * word,ROW * row)326 void unrej_good_chs(WERD_RES *word, ROW *row) {
327 WERD *bln_word; //BL norm init word
328 TWERD *tessword; //tess format
329 WERD *init_word; //BL norm init word
330 PBLOB_IT outword_it;
331 PBLOB_IT initial_it;
332 inT16 i;
333 inT16 init_blobs_left;
334 BOOL8 matched;
335 TBOX out_box;
336 PBLOB *test_blob;
337 DENORM denorm;
338 float bln_xht;
339 inT16 j = 0;
340
341 if (word->word->gblob_list ()->empty ())
342 return;
343
344 //xht used for blnorm
345 bln_xht = bln_x_height / word->denorm.scale ();
346 bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);
347 /*
348 NOTE: Need to convert to tess format and back again to ensure that the
349 same float -> int rounding of coords is done to source wd as out wd before
350 comparison
351 */
352 tessword = make_tess_word(bln_word, NULL); // Convert word
353 init_word = make_ed_word (tessword, bln_word);
354 delete bln_word;
355 delete_word(tessword);
356 if (init_word == NULL)
357 return;
358
359 initial_it.set_to_list (init_word->blob_list ());
360 init_blobs_left = initial_it.length ();
361 outword_it.set_to_list (word->outword->blob_list ());
362
363 for (outword_it.mark_cycle_pt ();
364 !outword_it.cycled_list (); outword_it.forward ()) {
365 out_box = outword_it.data ()->bounding_box ();
366
367 /* Skip any initial blobs LEFT of current outword blob */
368 while (!initial_it.at_last () &&
369 (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
370 initial_it.forward ();
371 init_blobs_left--;
372 }
373
374 /* See if current outword blob matches any initial blob with the same left
375 coord. (Normally only one but possibly more - in unknown order) */
376
377 i = 0;
378 matched = FALSE;
379 do {
380 test_blob = initial_it.data_relative (i++);
381 matched = crude_match_blobs (test_blob, outword_it.data ());
382 if (matched &&
383 (word->reject_map[j].accept_if_good_quality ()) &&
384 (docqual_excuse_outline_errs ||
385 (count_outline_errs (word->best_choice->unichar_string()[j],
386 outword_it.data ()->out_list ()->
387 length ()) == 0)))
388 word->reject_map[j].setrej_quality_accept ();
389 }
390 while (!matched &&
391 (init_blobs_left - i > 0) &&
392 (i < 129) &&
393 !initial_it.at_last () &&
394 test_blob->bounding_box ().left () == out_box.left ());
395 j++;
396 }
397 delete init_word;
398 }
399
400
print_boxes(WERD * word)401 void print_boxes(WERD *word) {
402 PBLOB_IT it;
403 TBOX box;
404
405 it.set_to_list (word->blob_list ());
406 for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
407 box = it.data ()->bounding_box ();
408 box.print ();
409 }
410 }
411
412
count_outline_errs(char c,inT16 outline_count)413 inT16 count_outline_errs(char c, inT16 outline_count) {
414 int expected_outline_count;
415
416 if (STRING (outlines_odd).contains (c))
417 return 0; //Dont use this char
418 else if (STRING (outlines_2).contains (c))
419 expected_outline_count = 2;
420 else
421 expected_outline_count = 1;
422 return abs (outline_count - expected_outline_count);
423 }
424
425
426 namespace tesseract {
quality_based_rejection(PAGE_RES_IT & page_res_it,BOOL8 good_quality_doc)427 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
428 BOOL8 good_quality_doc) {
429 if ((tessedit_good_quality_unrej && good_quality_doc))
430 unrej_good_quality_words(page_res_it);
431 doc_and_block_rejection(page_res_it, good_quality_doc);
432
433 page_res_it.restart_page ();
434 while (page_res_it.word () != NULL) {
435 insert_rej_cblobs(page_res_it.word());
436 page_res_it.forward();
437 }
438
439 if (unlv_tilde_crunching) {
440 tilde_crunch(page_res_it);
441 tilde_delete(page_res_it);
442 }
443 }
444
445
446 /*************************************************************************
447 * unrej_good_quality_words()
448 * Accept potential rejects in words which pass the following checks:
449 * - Contains a potential reject
450 * - Word looks like a sensible alpha word.
451 * - Word segmentation is the same as the original image
452 * - All characters have the expected number of outlines
453 * NOTE - the rejection counts are recalculated after unrejection
454 * - CANT do it in a single pass without a bit of fiddling
455 * - keep it simple but inefficient
456 *************************************************************************/
unrej_good_quality_words(PAGE_RES_IT & page_res_it)457 void Tesseract::unrej_good_quality_words( //unreject potential
458 PAGE_RES_IT &page_res_it) {
459 WERD_RES *word;
460 ROW_RES *current_row;
461 BLOCK_RES *current_block;
462 int i;
463
464 page_res_it.restart_page ();
465 while (page_res_it.word () != NULL) {
466 check_debug_pt (page_res_it.word (), 100);
467 if (bland_unrej) {
468 word = page_res_it.word ();
469 for (i = 0; i < word->reject_map.length (); i++) {
470 if (word->reject_map[i].accept_if_good_quality ())
471 word->reject_map[i].setrej_quality_accept ();
472 }
473 page_res_it.forward ();
474 }
475 else if ((page_res_it.row ()->char_count > 0) &&
476 ((page_res_it.row ()->rej_count /
477 (float) page_res_it.row ()->char_count) <=
478 quality_rowrej_pc)) {
479 word = page_res_it.word ();
480 if (word->reject_map.quality_recoverable_rejects () &&
481 (tessedit_unrej_any_wd ||
482 acceptable_word_string (word->best_choice->unichar_string().string(),
483 word->best_choice->unichar_lengths().string())
484 != AC_UNACCEPTABLE)) {
485 unrej_good_chs (word, page_res_it.row ()->row);
486 }
487 page_res_it.forward ();
488 }
489 else {
490 /* Skip to end of dodgy row */
491 current_row = page_res_it.row ();
492 while ((page_res_it.word () != NULL) &&
493 (page_res_it.row () == current_row))
494 page_res_it.forward ();
495 }
496 check_debug_pt (page_res_it.word (), 110);
497 }
498 page_res_it.restart_page ();
499 page_res_it.page_res->char_count = 0;
500 page_res_it.page_res->rej_count = 0;
501 current_block = NULL;
502 current_row = NULL;
503 while (page_res_it.word () != NULL) {
504 if (current_block != page_res_it.block ()) {
505 current_block = page_res_it.block ();
506 current_block->char_count = 0;
507 current_block->rej_count = 0;
508 }
509 if (current_row != page_res_it.row ()) {
510 current_row = page_res_it.row ();
511 current_row->char_count = 0;
512 current_row->rej_count = 0;
513 current_row->whole_word_rej_count = 0;
514 }
515 page_res_it.rej_stat_word ();
516 page_res_it.forward ();
517 }
518 }
519
520
521 /*************************************************************************
522 * doc_and_block_rejection()
523 *
524 * If the page has too many rejects - reject all of it.
525 * If any block has too many rejects - reject all words in the block
526 *************************************************************************/
527
doc_and_block_rejection(PAGE_RES_IT & page_res_it,BOOL8 good_quality_doc)528 void Tesseract::doc_and_block_rejection( //reject big chunks
529 PAGE_RES_IT &page_res_it,
530 BOOL8 good_quality_doc) {
531 inT16 block_no = 0;
532 inT16 row_no = 0;
533 BLOCK_RES *current_block;
534 ROW_RES *current_row;
535
536 BOOL8 rej_word;
537 BOOL8 prev_word_rejected;
538 inT16 char_quality;
539 inT16 accepted_char_quality;
540
541 if ((page_res_it.page_res->rej_count * 100.0 /
542 page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
543 reject_whole_page(page_res_it);
544 #ifndef SECURE_NAMES
545 if (tessedit_debug_doc_rejection) {
546 tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
547 page_res_it.page_res->char_count,
548 page_res_it.page_res->rej_count);
549 }
550 #endif
551 }
552 else {
553 #ifndef SECURE_NAMES
554 if (tessedit_debug_doc_rejection)
555 tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
556 page_res_it.page_res->char_count,
557 page_res_it.page_res->rej_count);
558 #endif
559
560 /* Walk blocks testing for block rejection */
561
562 page_res_it.restart_page ();
563 while (page_res_it.word () != NULL) {
564 current_block = page_res_it.block ();
565 block_no = current_block->block->index();
566 if ((page_res_it.block ()->char_count > 0) &&
567 ((page_res_it.block ()->rej_count * 100.0 /
568 page_res_it.block ()->char_count) >
569 tessedit_reject_block_percent)) {
570 #ifndef SECURE_NAMES
571 if (tessedit_debug_block_rejection)
572 tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
573 block_no,
574 page_res_it.block ()->char_count,
575 page_res_it.block ()->rej_count);
576 #endif
577 prev_word_rejected = FALSE;
578 while ((page_res_it.word () != NULL) &&
579 (page_res_it.block () == current_block)) {
580 if (tessedit_preserve_blk_rej_perfect_wds) {
581 rej_word =
582 (page_res_it.word ()->reject_map.reject_count () > 0)
583 || (page_res_it.word ()->reject_map.length () <
584 tessedit_preserve_min_wd_len);
585 if (rej_word && tessedit_dont_blkrej_good_wds
586 && !(page_res_it.word ()->reject_map.length () <
587 tessedit_preserve_min_wd_len)
588 &&
589 (acceptable_word_string
590 (page_res_it.word()->best_choice->unichar_string().string(),
591 page_res_it.word ()->best_choice->unichar_lengths().string()) !=
592 AC_UNACCEPTABLE)) {
593 word_char_quality (page_res_it.word (),
594 page_res_it.row ()->row,
595 &char_quality,
596 &accepted_char_quality);
597 rej_word = char_quality !=
598 page_res_it.word ()->reject_map.length ();
599 }
600 }
601 else
602 rej_word = TRUE;
603 if (rej_word) {
604 /*
605 Reject spacing if both current and prev words are rejected.
606 NOTE - this is NOT restricted to FUZZY spaces. - When tried this
607 generated more space errors.
608 */
609 if (tessedit_use_reject_spaces &&
610 prev_word_rejected &&
611 (page_res_it.prev_row () == page_res_it.row ()) &&
612 (page_res_it.word ()->word->space () == 1))
613 page_res_it.word ()->reject_spaces = TRUE;
614 page_res_it.word ()->reject_map.rej_word_block_rej ();
615 }
616 prev_word_rejected = rej_word;
617 page_res_it.forward ();
618 }
619 }
620 else {
621 #ifndef SECURE_NAMES
622 if (tessedit_debug_block_rejection)
623 tprintf
624 ("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
625 block_no, page_res_it.block ()->char_count,
626 page_res_it.block ()->rej_count);
627 #endif
628
629 /* Walk rows in block testing for row rejection */
630 row_no = 0;
631 while ((page_res_it.word () != NULL) &&
632 (page_res_it.block () == current_block)) {
633 current_row = page_res_it.row ();
634 row_no++;
635 /* Reject whole row if:
636 fraction of chars on row which are rejected exceed a limit AND
637 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
638 limit
639 */
640 if ((page_res_it.row ()->char_count > 0) &&
641 ((page_res_it.row ()->rej_count * 100.0 /
642 page_res_it.row ()->char_count) >
643 tessedit_reject_row_percent) &&
644 ((page_res_it.row ()->whole_word_rej_count * 100.0 /
645 page_res_it.row ()->rej_count) <
646 tessedit_whole_wd_rej_row_percent)) {
647 #ifndef SECURE_NAMES
648 if (tessedit_debug_block_rejection)
649 tprintf
650 ("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
651 row_no, page_res_it.row ()->char_count,
652 page_res_it.row ()->rej_count);
653 #endif
654 prev_word_rejected = FALSE;
655 while ((page_res_it.word () != NULL) &&
656 (page_res_it.row () == current_row)) {
657 /* Preserve words on good docs unless they are mostly rejected*/
658 if (!tessedit_row_rej_good_docs && good_quality_doc) {
659 rej_word =
660 page_res_it.word ()->reject_map.
661 reject_count () /
662 (float) page_res_it.word ()->reject_map.
663 length () > tessedit_good_doc_still_rowrej_wd;
664 }
665
666 /* Preserve perfect words anyway */
667 else if (tessedit_preserve_row_rej_perfect_wds) {
668 rej_word =
669 (page_res_it.word ()->reject_map.
670 reject_count () > 0)
671 || (page_res_it.word ()->reject_map.
672 length () < tessedit_preserve_min_wd_len);
673 if (rej_word && tessedit_dont_rowrej_good_wds
674 && !(page_res_it.word ()->reject_map.
675 length () <
676 tessedit_preserve_min_wd_len)
677 &&
678 (acceptable_word_string
679 (page_res_it.word ()->best_choice->
680 unichar_string().string(),
681 page_res_it.word ()->best_choice->
682 unichar_lengths().string()) != AC_UNACCEPTABLE)) {
683 word_char_quality (page_res_it.word (),
684 page_res_it.row ()->row,
685 &char_quality,
686 &accepted_char_quality);
687 rej_word = char_quality !=
688 page_res_it.word ()->reject_map.length ();
689 }
690 }
691 else
692 rej_word = TRUE;
693 if (rej_word) {
694 /*
695 Reject spacing if both current and prev words are rejected.
696 NOTE - this is NOT restricted to FUZZY spaces. - When tried
697 this generated more space errors.
698 */
699 if (tessedit_use_reject_spaces &&
700 prev_word_rejected &&
701 (page_res_it.prev_row () ==
702 page_res_it.row ())
703 && (page_res_it.word ()->word->space () ==
704 1))
705 page_res_it.word ()->reject_spaces = TRUE;
706 page_res_it.word ()->reject_map.
707 rej_word_row_rej();
708 }
709 prev_word_rejected = rej_word;
710 page_res_it.forward ();
711 }
712 }
713 else {
714 #ifndef SECURE_NAMES
715 if (tessedit_debug_block_rejection)
716 tprintf
717 ("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
718 row_no, page_res_it.row ()->char_count,
719 page_res_it.row ()->rej_count);
720 #endif
721 while ((page_res_it.word () != NULL) &&
722 (page_res_it.row () == current_row))
723 page_res_it.forward ();
724 }
725 }
726 }
727 }
728 }
729 }
730 } // namespace tesseract
731
732
733 /*************************************************************************
734 * reject_whole_page()
735 * Dont believe any of it - set the reject map to 00..00 in all words
736 *
737 *************************************************************************/
738
reject_whole_page(PAGE_RES_IT & page_res_it)739 void reject_whole_page(PAGE_RES_IT &page_res_it) {
740 page_res_it.restart_page ();
741 while (page_res_it.word () != NULL) {
742 page_res_it.word ()->reject_map.rej_word_doc_rej ();
743 page_res_it.forward ();
744 }
745 //whole page is rejected
746 page_res_it.page_res->rejected = TRUE;
747 }
748
749 namespace tesseract {
tilde_crunch(PAGE_RES_IT & page_res_it)750 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
751 WERD_RES *word;
752 GARBAGE_LEVEL garbage_level;
753 PAGE_RES_IT copy_it;
754 BOOL8 prev_potential_marked = FALSE;
755 BOOL8 found_terrible_word = FALSE;
756 BOOL8 ok_dict_word;
757
758 page_res_it.restart_page ();
759 while (page_res_it.word () != NULL) {
760 word = page_res_it.word ();
761
762 if (crunch_early_convert_bad_unlv_chs)
763 convert_bad_unlv_chs(word);
764
765 if (crunch_early_merge_tess_fails)
766 merge_tess_fails(word);
767
768 if (word->reject_map.accept_count () != 0) {
769 found_terrible_word = FALSE;
770 //Forget earlier potential crunches
771 prev_potential_marked = FALSE;
772 }
773 else {
774 ok_dict_word = safe_dict_word(*(word->best_choice));
775 garbage_level = garbage_word (word, ok_dict_word);
776
777 if ((garbage_level != G_NEVER_CRUNCH) &&
778 (terrible_word_crunch (word, garbage_level))) {
779 if (crunch_debug > 0) {
780 tprintf ("T CRUNCHING: \"%s\"\n",
781 word->best_choice->unichar_string().string());
782 }
783 word->unlv_crunch_mode = CR_KEEP_SPACE;
784 if (prev_potential_marked) {
785 while (copy_it.word () != word) {
786 if (crunch_debug > 0) {
787 tprintf ("P1 CRUNCHING: \"%s\"\n",
788 copy_it.word()->best_choice->unichar_string().string());
789 }
790 copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
791 copy_it.forward ();
792 }
793 prev_potential_marked = FALSE;
794 }
795 found_terrible_word = TRUE;
796 }
797 else if ((garbage_level != G_NEVER_CRUNCH) &&
798 (potential_word_crunch (word,
799 garbage_level, ok_dict_word))) {
800 if (found_terrible_word) {
801 if (crunch_debug > 0) {
802 tprintf ("P2 CRUNCHING: \"%s\"\n",
803 word->best_choice->unichar_string().string());
804 }
805 word->unlv_crunch_mode = CR_KEEP_SPACE;
806 }
807 else if (!prev_potential_marked) {
808 copy_it = page_res_it;
809 prev_potential_marked = TRUE;
810 if (crunch_debug > 1) {
811 tprintf ("P3 CRUNCHING: \"%s\"\n",
812 word->best_choice->unichar_string().string());
813 }
814 }
815 }
816 else {
817 found_terrible_word = FALSE;
818 //Forget earlier potential crunches
819 prev_potential_marked = FALSE;
820 if (crunch_debug > 2) {
821 tprintf ("NO CRUNCH: \"%s\"\n",
822 word->best_choice->unichar_string().string());
823 }
824 }
825 }
826 page_res_it.forward ();
827 }
828 }
829 } // namespace tesseract
830
831
terrible_word_crunch(WERD_RES * word,GARBAGE_LEVEL garbage_level)832 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
833 float rating_per_ch;
834 int adjusted_len;
835 int crunch_mode = 0;
836
837 if ((word->best_choice->unichar_string().length () == 0) ||
838 (strspn (word->best_choice->unichar_string().string(), " ") ==
839 word->best_choice->unichar_string().length ()))
840 crunch_mode = 1;
841 else {
842 adjusted_len = word->reject_map.length ();
843 if (adjusted_len > crunch_rating_max)
844 adjusted_len = crunch_rating_max;
845 rating_per_ch = word->best_choice->rating () / adjusted_len;
846
847 if (rating_per_ch > crunch_terrible_rating)
848 crunch_mode = 2;
849 else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
850 crunch_mode = 3;
851 else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
852 (garbage_level != G_OK))
853 crunch_mode = 4;
854 else if ((rating_per_ch > crunch_poor_garbage_rate) &&
855 (garbage_level != G_OK))
856 crunch_mode = 5;
857 }
858 if (crunch_mode > 0) {
859 if (crunch_debug > 2) {
860 tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
861 crunch_mode, word->best_choice->unichar_string().string());
862 }
863 return TRUE;
864 }
865 else
866 return FALSE;
867 }
868
869 namespace tesseract {
potential_word_crunch(WERD_RES * word,GARBAGE_LEVEL garbage_level,BOOL8 ok_dict_word)870 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word,
871 GARBAGE_LEVEL garbage_level,
872 BOOL8 ok_dict_word) {
873 float rating_per_ch;
874 int adjusted_len;
875 const char *str = word->best_choice->unichar_string().string();
876 const char *lengths = word->best_choice->unichar_lengths().string();
877 BOOL8 word_crunchable;
878 int poor_indicator_count = 0;
879
880 word_crunchable =
881 !crunch_leave_accept_strings ||
882 (word->reject_map.length () < 3) ||
883 ((acceptable_word_string (str, lengths) == AC_UNACCEPTABLE) &&
884 !ok_dict_word);
885
886 adjusted_len = word->reject_map.length ();
887 if (adjusted_len > 10)
888 adjusted_len = 10;
889 rating_per_ch = word->best_choice->rating () / adjusted_len;
890
891 if (rating_per_ch > crunch_pot_poor_rate) {
892 if (crunch_debug > 2) {
893 tprintf ("Potential poor rating on \"%s\"\n",
894 word->best_choice->unichar_string().string());
895 }
896 poor_indicator_count++;
897 }
898
899 if (word_crunchable &&
900 (word->best_choice->certainty () < crunch_pot_poor_cert)) {
901 if (crunch_debug > 2) {
902 tprintf ("Potential poor cert on \"%s\"\n",
903 word->best_choice->unichar_string().string());
904 }
905 poor_indicator_count++;
906 }
907
908 if (garbage_level != G_OK) {
909 if (crunch_debug > 2) {
910 tprintf ("Potential garbage on \"%s\"\n",
911 word->best_choice->unichar_string().string());
912 }
913 poor_indicator_count++;
914 }
915 return (poor_indicator_count >= crunch_pot_indicators);
916 }
917 } // namespace tesseract
918
919
920 namespace tesseract {
tilde_delete(PAGE_RES_IT & page_res_it)921 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
922 WERD_RES *word;
923 PAGE_RES_IT copy_it;
924 BOOL8 deleting_from_bol = FALSE;
925 BOOL8 marked_delete_point = FALSE;
926 inT16 debug_delete_mode;
927 CRUNCH_MODE delete_mode;
928 inT16 x_debug_delete_mode;
929 CRUNCH_MODE x_delete_mode;
930
931 page_res_it.restart_page ();
932 while (page_res_it.word () != NULL) {
933 word = page_res_it.word ();
934
935 delete_mode = word_deletable (word, debug_delete_mode);
936 if (delete_mode != CR_NONE) {
937 if (word->word->flag (W_BOL) || deleting_from_bol) {
938 if (crunch_debug > 0) {
939 tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
940 debug_delete_mode,
941 word->best_choice->unichar_string().string());
942 }
943 word->unlv_crunch_mode = delete_mode;
944 deleting_from_bol = TRUE;
945 }
946 else if (word->word->flag (W_EOL)) {
947 if (marked_delete_point) {
948 while (copy_it.word () != word) {
949 x_delete_mode = word_deletable (copy_it.word (),
950 x_debug_delete_mode);
951 if (crunch_debug > 0) {
952 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
953 x_debug_delete_mode,
954 copy_it.word()->best_choice->unichar_string().string());
955 }
956 copy_it.word ()->unlv_crunch_mode = x_delete_mode;
957 copy_it.forward ();
958 }
959 }
960 if (crunch_debug > 0) {
961 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
962 debug_delete_mode,
963 word->best_choice->unichar_string().string());
964 }
965 word->unlv_crunch_mode = delete_mode;
966 deleting_from_bol = FALSE;
967 marked_delete_point = FALSE;
968 }
969 else {
970 if (!marked_delete_point) {
971 copy_it = page_res_it;
972 marked_delete_point = TRUE;
973 }
974 }
975 }
976 else {
977 deleting_from_bol = FALSE;
978 //Forget earlier potential crunches
979 marked_delete_point = FALSE;
980 }
981 /*
982 The following step has been left till now as the tess fails are used to
983 determine if the word is deletable.
984 */
985 if (!crunch_early_merge_tess_fails)
986 merge_tess_fails(word);
987 page_res_it.forward ();
988 }
989 }
990
991
convert_bad_unlv_chs(WERD_RES * word_res)992 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
993 int i;
994 UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-");
995 UNICHAR_ID unichar_space = unicharset.unichar_to_id(" ");
996 UNICHAR_ID unichar_tilde = unicharset.unichar_to_id("~");
997 UNICHAR_ID unichar_pow = unicharset.unichar_to_id("^");
998 bool modified = false;
999 for (i = 0; i < word_res->reject_map.length(); ++i) {
1000 if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
1001 word_res->best_choice->set_unichar_id(unichar_dash, i);
1002 modified = true;
1003 if (word_res->reject_map[i].accepted ())
1004 word_res->reject_map[i].setrej_unlv_rej ();
1005 }
1006 if (word_res->best_choice->unichar_id(i) == unichar_pow) {
1007 word_res->best_choice->set_unichar_id(unichar_space, i);
1008 modified = true;
1009 if (word_res->reject_map[i].accepted ())
1010 word_res->reject_map[i].setrej_unlv_rej ();
1011 }
1012 }
1013 if (modified) {
1014 word_res->best_choice->populate_unichars(unicharset);
1015 }
1016 }
1017
1018 // Change pairs of tess failures to a single one
merge_tess_fails(WERD_RES * word_res)1019 void Tesseract::merge_tess_fails(WERD_RES *word_res) {
1020 PBLOB_IT blob_it; //blobs
1021 int len = word_res->best_choice->length();
1022 bool modified = false;
1023
1024 ASSERT_HOST (word_res->reject_map.length () == len);
1025 ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
1026
1027 UNICHAR_ID unichar_space = unicharset.unichar_to_id(" ");
1028 blob_it = word_res->outword->blob_list ();
1029 int i = 0;
1030 while (i < word_res->best_choice->length()-1) {
1031 if ((word_res->best_choice->unichar_id(i) == unichar_space) &&
1032 (word_res->best_choice->unichar_id(i+1) == unichar_space)) {
1033 modified = true;
1034 word_res->best_choice->remove_unichar_id(i);
1035 word_res->reject_map.remove_pos (i);
1036 merge_blobs (blob_it.data_relative (1), blob_it.data ());
1037 delete blob_it.extract (); //get rid of spare
1038 } else {
1039 i++;
1040 }
1041 blob_it.forward ();
1042 }
1043 len = word_res->best_choice->length();
1044 ASSERT_HOST (word_res->reject_map.length () == len);
1045 ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
1046 if (modified) {
1047 word_res->best_choice->populate_unichars(unicharset);
1048 }
1049 }
1050
garbage_word(WERD_RES * word,BOOL8 ok_dict_word)1051 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
1052 enum STATES
1053 {
1054 JUNK,
1055 FIRST_UPPER,
1056 FIRST_LOWER,
1057 FIRST_NUM,
1058 SUBSEQUENT_UPPER,
1059 SUBSEQUENT_LOWER,
1060 SUBSEQUENT_NUM
1061 };
1062 const char *str = word->best_choice->unichar_string().string();
1063 const char *lengths = word->best_choice->unichar_lengths().string();
1064 STATES state = JUNK;
1065 int len = 0;
1066 int isolated_digits = 0;
1067 int isolated_alphas = 0;
1068 int bad_char_count = 0;
1069 int tess_rejs = 0;
1070 int dodgy_chars = 0;
1071 int ok_chars;
1072 UNICHAR_ID last_char = -1;
1073 int alpha_repetition_count = 0;
1074 int longest_alpha_repetition_count = 0;
1075 int longest_lower_run_len = 0;
1076 int lower_string_count = 0;
1077 int longest_upper_run_len = 0;
1078 int upper_string_count = 0;
1079 int total_alpha_count = 0;
1080 int total_digit_count = 0;
1081
1082 for (; *str != '\0'; str += *(lengths++)) {
1083 len++;
1084 if (unicharset.get_isupper (str, *lengths)) {
1085 total_alpha_count++;
1086 switch (state) {
1087 case SUBSEQUENT_UPPER:
1088 case FIRST_UPPER:
1089 state = SUBSEQUENT_UPPER;
1090 upper_string_count++;
1091 if (longest_upper_run_len < upper_string_count)
1092 longest_upper_run_len = upper_string_count;
1093 if (last_char == unicharset.unichar_to_id(str, *lengths)) {
1094 alpha_repetition_count++;
1095 if (longest_alpha_repetition_count < alpha_repetition_count) {
1096 longest_alpha_repetition_count = alpha_repetition_count;
1097 }
1098 }
1099 else {
1100 last_char = unicharset.unichar_to_id(str, *lengths);
1101 alpha_repetition_count = 1;
1102 }
1103 break;
1104 case FIRST_NUM:
1105 isolated_digits++;
1106 default:
1107 state = FIRST_UPPER;
1108 last_char = unicharset.unichar_to_id(str, *lengths);
1109 alpha_repetition_count = 1;
1110 upper_string_count = 1;
1111 break;
1112 }
1113 }
1114 else if (unicharset.get_islower (str, *lengths)) {
1115 total_alpha_count++;
1116 switch (state) {
1117 case SUBSEQUENT_LOWER:
1118 case FIRST_LOWER:
1119 state = SUBSEQUENT_LOWER;
1120 lower_string_count++;
1121 if (longest_lower_run_len < lower_string_count)
1122 longest_lower_run_len = lower_string_count;
1123 if (last_char == unicharset.unichar_to_id(str, *lengths)) {
1124 alpha_repetition_count++;
1125 if (longest_alpha_repetition_count < alpha_repetition_count) {
1126 longest_alpha_repetition_count = alpha_repetition_count;
1127 }
1128 }
1129 else {
1130 last_char = unicharset.unichar_to_id(str, *lengths);
1131 alpha_repetition_count = 1;
1132 }
1133 break;
1134 case FIRST_NUM:
1135 isolated_digits++;
1136 default:
1137 state = FIRST_LOWER;
1138 last_char = unicharset.unichar_to_id(str, *lengths);
1139 alpha_repetition_count = 1;
1140 lower_string_count = 1;
1141 break;
1142 }
1143 }
1144 else if (unicharset.get_isdigit (str, *lengths)) {
1145 total_digit_count++;
1146 switch (state) {
1147 case FIRST_NUM:
1148 state = SUBSEQUENT_NUM;
1149 case SUBSEQUENT_NUM:
1150 break;
1151 case FIRST_UPPER:
1152 case FIRST_LOWER:
1153 isolated_alphas++;
1154 default:
1155 state = FIRST_NUM;
1156 break;
1157 }
1158 }
1159 else {
1160 if (*lengths == 1 && *str == ' ')
1161 tess_rejs++;
1162 else
1163 bad_char_count++;
1164 switch (state) {
1165 case FIRST_NUM:
1166 isolated_digits++;
1167 break;
1168 case FIRST_UPPER:
1169 case FIRST_LOWER:
1170 isolated_alphas++;
1171 default:
1172 break;
1173 }
1174 state = JUNK;
1175 }
1176 }
1177
1178 switch (state) {
1179 case FIRST_NUM:
1180 isolated_digits++;
1181 break;
1182 case FIRST_UPPER:
1183 case FIRST_LOWER:
1184 isolated_alphas++;
1185 default:
1186 break;
1187 }
1188
1189 if (crunch_include_numerals) {
1190 total_alpha_count += total_digit_count - isolated_digits;
1191 }
1192
1193 if (crunch_leave_ok_strings &&
1194 (len >= 4) &&
1195 (2 * (total_alpha_count - isolated_alphas) > len) &&
1196 (longest_alpha_repetition_count < crunch_long_repetitions)) {
1197 if ((crunch_accept_ok &&
1198 (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE)) ||
1199 (longest_lower_run_len > crunch_leave_lc_strings) ||
1200 (longest_upper_run_len > crunch_leave_uc_strings))
1201 return G_NEVER_CRUNCH;
1202 }
1203 if ((word->reject_map.length () > 1) &&
1204 (strpbrk (str, " ") == NULL) &&
1205 ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
1206 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
1207 (word->best_choice->permuter () == USER_DAWG_PERM) ||
1208 (word->best_choice->permuter () == NUMBER_PERM) ||
1209 (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE) || ok_dict_word))
1210 return G_OK;
1211
1212 ok_chars = len - bad_char_count - isolated_digits -
1213 isolated_alphas - tess_rejs;
1214
1215 if (crunch_debug > 3) {
1216 tprintf ("garbage_word: \"%s\"\n",
1217 word->best_choice->unichar_string().string());
1218 tprintf ("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
1219 len,
1220 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
1221 }
1222 if ((bad_char_count == 0) &&
1223 (tess_rejs == 0) &&
1224 ((len > isolated_digits + isolated_alphas) || (len <= 2)))
1225 return G_OK;
1226
1227 if ((tess_rejs > ok_chars) ||
1228 ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
1229 return G_TERRIBLE;
1230
1231 if (len > 4) {
1232 dodgy_chars = 2 * tess_rejs + bad_char_count +
1233 isolated_digits + isolated_alphas;
1234 if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
1235 return G_DODGY;
1236 else
1237 return G_OK;
1238 }
1239 else {
1240 dodgy_chars = 2 * tess_rejs + bad_char_count;
1241 if (((len == 4) && (dodgy_chars > 2)) ||
1242 ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
1243 return G_DODGY;
1244 else
1245 return G_OK;
1246 }
1247 }
1248 } // namespace tesseract
1249
1250
1251 /*************************************************************************
1252 * word_deletable()
1253 * DELETE WERDS AT ENDS OF ROWS IF
1254 * Word is crunched &&
1255 * ( string length = 0 OR
1256 * > 50% of chars are "|" (before merging) OR
1257 * certainty < -10 OR
1258 * rating /char > 60 OR
1259 * TOP of word is more than 0.5 xht BELOW baseline OR
1260 * BOTTOM of word is more than 0.5 xht ABOVE xht OR
1261 * length of word < 3xht OR
1262 * height of word < 0.7 xht OR
1263 * height of word > 3.0 xht OR
1264 * >75% of the outline BBs have longest dimension < 0.5xht
1265 *************************************************************************/
1266
word_deletable(WERD_RES * word,inT16 & delete_mode)1267 CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
1268 int word_len = word->reject_map.length ();
1269 float rating_per_ch;
1270 TBOX box; //BB of word
1271
1272 if (word->unlv_crunch_mode == CR_NONE) {
1273 delete_mode = 0;
1274 return CR_NONE;
1275 }
1276
1277 if (word_len == 0) {
1278 delete_mode = 1;
1279 return CR_DELETE;
1280 }
1281
1282 box = word->outword->bounding_box ();
1283 if (box.height () < crunch_del_min_ht * bln_x_height) {
1284 delete_mode = 4;
1285 return CR_DELETE;
1286 }
1287
1288 if (noise_outlines (word->outword)) {
1289 delete_mode = 5;
1290 return CR_DELETE;
1291 }
1292
1293 if ((failure_count (word) * 1.5) > word_len) {
1294 delete_mode = 2;
1295 return CR_LOOSE_SPACE;
1296 }
1297
1298 if (word->best_choice->certainty () < crunch_del_cert) {
1299 delete_mode = 7;
1300 return CR_LOOSE_SPACE;
1301 }
1302
1303 rating_per_ch = word->best_choice->rating () / word_len;
1304
1305 if (rating_per_ch > crunch_del_rating) {
1306 delete_mode = 8;
1307 return CR_LOOSE_SPACE;
1308 }
1309
1310 if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) {
1311 delete_mode = 9;
1312 return CR_LOOSE_SPACE;
1313 }
1314
1315 if (box.bottom () >
1316 bln_baseline_offset + crunch_del_high_word * bln_x_height) {
1317 delete_mode = 10;
1318 return CR_LOOSE_SPACE;
1319 }
1320
1321 if (box.height () > crunch_del_max_ht * bln_x_height) {
1322 delete_mode = 11;
1323 return CR_LOOSE_SPACE;
1324 }
1325
1326 if (box.width () < crunch_del_min_width * bln_x_height) {
1327 delete_mode = 3;
1328 return CR_LOOSE_SPACE;
1329 }
1330
1331 delete_mode = 0;
1332 return CR_NONE;
1333 }
1334
failure_count(WERD_RES * word)1335 inT16 failure_count(WERD_RES *word) {
1336 const char *str = word->best_choice->unichar_string().string();
1337 int tess_rejs = 0;
1338
1339 for (; *str != '\0'; str++) {
1340 if (*str == ' ')
1341 tess_rejs++;
1342 }
1343 return tess_rejs;
1344 }
1345
1346
noise_outlines(WERD * word)1347 BOOL8 noise_outlines(WERD *word) {
1348 PBLOB_IT blob_it;
1349 OUTLINE_IT outline_it;
1350 TBOX box; //BB of outline
1351 inT16 outline_count = 0;
1352 inT16 small_outline_count = 0;
1353 inT16 max_dimension;
1354 float small_limit = bln_x_height * crunch_small_outlines_size;
1355
1356 blob_it.set_to_list (word->blob_list ());
1357 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1358 outline_it.set_to_list (blob_it.data ()->out_list ());
1359 for (outline_it.mark_cycle_pt ();
1360 !outline_it.cycled_list (); outline_it.forward ()) {
1361 outline_count++;
1362 box = outline_it.data ()->bounding_box ();
1363 if (box.height () > box.width ())
1364 max_dimension = box.height ();
1365 else
1366 max_dimension = box.width ();
1367 if (max_dimension < small_limit)
1368 small_outline_count++;
1369 }
1370 }
1371 return (small_outline_count >= outline_count);
1372 }
1373
1374
1375 /*************************************************************************
1376 * insert_rej_cblobs()
1377 * Put rejected word blobs back into the outword.
1378 * NOTE!!! AFTER THIS THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER
1379 * OF ELEMENTS.
1380 *************************************************************************/
1381 namespace tesseract {
insert_rej_cblobs(WERD_RES * word)1382 void Tesseract::insert_rej_cblobs(WERD_RES *word) {
1383 PBLOB_IT blob_it; //blob iterator
1384 PBLOB_IT rej_blob_it;
1385 const STRING *word_str;
1386 const STRING *word_lengths;
1387 int old_len;
1388 int rej_len;
1389 char new_str[512 * UNICHAR_LEN];
1390 char new_lengths[512];
1391 REJMAP new_map;
1392 int i = 0; //new_str index
1393 int j = 0; //old_str index
1394 int i_offset = 0; //new_str offset
1395 int j_offset = 0; //old_str offset
1396 int new_len;
1397
1398 gblob_sort_list (word->outword->rej_blob_list (), TRUE);
1399 rej_blob_it.set_to_list (word->outword->rej_blob_list ());
1400 if (rej_blob_it.empty ())
1401 return;
1402 rej_len = rej_blob_it.length ();
1403 blob_it.set_to_list (word->outword->blob_list ());
1404 word_str = &(word->best_choice->unichar_string());
1405 word_lengths = &(word->best_choice->unichar_lengths());
1406 old_len = word->best_choice->length();
1407 ASSERT_HOST (word->reject_map.length () == old_len);
1408 ASSERT_HOST (blob_it.length () == old_len);
1409 if ((old_len + rej_len) > 511)
1410 return; //Word is garbage anyway prevent abort
1411 new_map.initialise (old_len + rej_len);
1412
1413 while (!rej_blob_it.empty ()) {
1414 if ((j >= old_len) ||
1415 (rej_blob_it.data ()->bounding_box ().left () <=
1416 blob_it.data ()->bounding_box ().left ())) {
1417 /* Insert reject blob */
1418 if (j >= old_len)
1419 blob_it.add_to_end (rej_blob_it.extract ());
1420 else
1421 blob_it.add_before_stay_put (rej_blob_it.extract ());
1422 if (!rej_blob_it.empty ())
1423 rej_blob_it.forward ();
1424 new_str[i_offset] = ' ';
1425 new_lengths[i] = 1;
1426 new_map[i].setrej_rej_cblob ();
1427 i_offset += new_lengths[i++];
1428 }
1429 else {
1430 strncpy(new_str + i_offset, &(*word_str)[j_offset],
1431 (*word_lengths)[j]);
1432 new_lengths[i] = (*word_lengths)[j];
1433 new_map[i] = word->reject_map[j];
1434 i_offset += new_lengths[i++];
1435 j_offset += (*word_lengths)[j++];
1436 blob_it.forward ();
1437 }
1438 }
1439 /* Add any extra normal blobs to strings */
1440 while (j < word_lengths->length ()) {
1441 strncpy(new_str + i_offset, &(*word_str)[j_offset],
1442 (*word_lengths)[j]);
1443 new_lengths[i] = (*word_lengths)[j];
1444 new_map[i] = word->reject_map[j];
1445 i_offset += new_lengths[i++];
1446 j_offset += (*word_lengths)[j++];
1447 }
1448 new_str[i_offset] = '\0';
1449 new_lengths[i] = 0;
1450 /*
1451 tprintf(
1452 "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
1453 old_len, i, new_str, new_map );
1454 */
1455 ASSERT_HOST (i == blob_it.length ());
1456 ASSERT_HOST (i == old_len + rej_len);
1457 word->reject_map = new_map;
1458
1459 // Update word->best_choice if needed.
1460 if (strcmp(new_str, word->best_choice->unichar_string().string()) != 0 ||
1461 strcmp(new_lengths, word->best_choice->unichar_lengths().string()) != 0) {
1462 WERD_CHOICE *new_choice =
1463 new WERD_CHOICE(new_str, new_lengths,
1464 word->best_choice->rating(),
1465 word->best_choice->certainty(),
1466 word->best_choice->permuter(),
1467 getDict().getUnicharset());
1468 new_choice->populate_unichars(getDict().getUnicharset());
1469 delete word->best_choice;
1470 word->best_choice = new_choice;
1471 }
1472 new_len = word->best_choice->length();
1473 ASSERT_HOST (word->reject_map.length () == new_len);
1474 ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
1475
1476 }
1477 } // namespace tesseract
1478