• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************
2  * File:        docqual.cpp  (Formerly docqual.c)
3  * Description: Document Quality Metrics
4  * Author:		Phil Cheatle
5  * Created:		Mon May  9 11:27:28 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #include          <ctype.h>
22 #include          "docqual.h"
23 #include          "tstruct.h"
24 #include          "tfacep.h"
25 #include          "reject.h"
26 #include          "tessvars.h"
27 #include          "genblob.h"
28 #include          "secname.h"
29 #include          "globals.h"
30 #include          "tesseractclass.h"
31 
32 #define EXTERN
33 
34 EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
35 EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
36 "Non standard number of outlines");
37 EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
38 "Allow outline errs in unrejection?");
39 EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
40 "Reduce rejection on good docs");
41 EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
42 EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
43 "%rej allowed before rej whole doc");
44 EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
45 "%rej allowed before rej whole block");
46 EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
47 "%rej allowed before rej whole row");
48 EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
49 "%of row rejects in whole word rejects which prevents whole row rejection");
50 EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
51 "Only rej partially rejected words in block rejection");
52 EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
53 "Only rej partially rejected words in row rejection");
54 EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
55 "Use word segmentation quality metric");
56 EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
57 "Use word segmentation quality metric");
58 EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
59 "Only preserve wds longer than this");
60 EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
61 "Apply row rejection to good docs");
62 EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
63 "rej good doc wd if more than this fraction rejected");
64 EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
65 "Reject all bad quality wds");
66 EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
67 EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
68 "Output data to debug file");
69 EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no chekcs");
70 EXTERN double_VAR (quality_rowrej_pc, 1.1,
71 "good_quality_doc gte good char limit");
72 
73 EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
74 "Mark v.bad words for tilde crunch");
75 EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
76 EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
77 "Take out ~^ early?");
78 
79 EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
80 EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
81 EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
82 "crunch garbage cert lt this");
83 EXTERN double_VAR (crunch_poor_garbage_rate, 60,
84 "crunch garbage rating lt this");
85 
86 EXTERN double_VAR (crunch_pot_poor_rate, 40,
87 "POTENTIAL crunch rating lt this");
88 EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
89 "POTENTIAL crunch cert lt this");
90 EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
91 
92 EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
93 EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
94 EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
95 EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
96 EXTERN double_VAR (crunch_del_min_width, 3.0,
97 "Del if word width lt xht x this");
98 EXTERN double_VAR (crunch_del_high_word, 1.5,
99 "Del if word gt xht x this above bl");
100 EXTERN double_VAR (crunch_del_low_word, 0.5,
101 "Del if word gt xht x this below bl");
102 EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");
103 
104 EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
105 EXTERN INT_VAR (crunch_pot_indicators, 1,
106 "How many potential indicators needed");
107 
108 EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
109 "Dont touch sensible strings");
110 EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
111 EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
112 "Dont pot crunch sensible strings");
113 EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
114 EXTERN INT_VAR (crunch_leave_lc_strings, 4,
115 "Dont crunch words with long lower case strings");
116 EXTERN INT_VAR (crunch_leave_uc_strings, 4,
117 "Dont crunch words with long lower case strings");
118 EXTERN INT_VAR (crunch_long_repetitions, 3,
119 "Crunch words with long repetitions");
120 
121 EXTERN INT_VAR (crunch_debug, 0, "As it says");
122 
123 /*************************************************************************
124  * word_blob_quality()
125  * How many blobs in the outword are identical to those of the inword?
126  * ASSUME blobs in both initial word and outword are in ascending order of
127  * left hand blob edge.
128  *************************************************************************/
word_blob_quality(WERD_RES * word,ROW * row)129 inT16 word_blob_quality(  //Blob seg changes
130                         WERD_RES *word,
131                         ROW *row) {
132   WERD *bln_word;                //BL norm init word
133   TWERD *tessword;               //tess format
134   WERD *init_word;               //BL norm init word
135   PBLOB_IT outword_it;
136   PBLOB_IT initial_it;
137   inT16 i;
138   inT16 init_blobs_left;
139   inT16 match_count = 0;
140   BOOL8 matched;
141   TBOX out_box;
142   PBLOB *test_blob;
143   DENORM denorm;
144   float bln_xht;
145 
146   if (word->word->gblob_list ()->empty ())
147     return 0;
148                                  //xht used for blnorm
149   bln_xht = bln_x_height / word->denorm.scale ();
150   bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);
151   /*
152     NOTE: Need to convert to tess format and back again to ensure that the
153     same float -> int rounding of coords is done to source wd as out wd before
154     comparison
155   */
156   tessword = make_tess_word(bln_word, NULL);  // Convert word.
157   init_word = make_ed_word (tessword, bln_word);
158   delete bln_word;
159   delete_word(tessword);
160   if (init_word == NULL) {
161     // Conversion failed.
162     return 0;
163   }
164 
165   initial_it.set_to_list (init_word->blob_list ());
166   init_blobs_left = initial_it.length ();
167   outword_it.set_to_list (word->outword->blob_list ());
168 
169   for (outword_it.mark_cycle_pt ();
170   !outword_it.cycled_list (); outword_it.forward ()) {
171     out_box = outword_it.data ()->bounding_box ();
172 
173     // Skip any initial blobs LEFT of current outword blob.
174     while (!initial_it.at_last () &&
175     (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
176       initial_it.forward ();
177       init_blobs_left--;
178     }
179 
180     /* See if current outword blob matches any initial blob with the same left
181       coord. (Normally only one but possibly more - in unknown order) */
182 
183     i = 0;
184     matched = FALSE;
185     do {
186       test_blob = initial_it.data_relative (i++);
187       matched = crude_match_blobs (test_blob, outword_it.data ());
188       if (matched)
189         match_count++;
190     }
191     while (!matched &&
192       (init_blobs_left - i > 0) &&
193       (i < 129) &&
194       !initial_it.at_last () &&
195       test_blob->bounding_box ().left () == out_box.left ());
196   }
197   delete init_word;
198   return match_count;
199 }
200 
201 
202 /*************************************************************************
203  * crude_match_blobs()
204  * Check bounding boxes are the same and the number of outlines are the same.
205  *************************************************************************/
crude_match_blobs(PBLOB * blob1,PBLOB * blob2)206 BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) {
207   TBOX box1 = blob1->bounding_box ();
208   TBOX box2 = blob2->bounding_box ();
209 
210   if (box1.contains (box2) &&
211     box2.contains (box1) &&
212     (blob1->out_list ()->length () == blob1->out_list ()->length ()))
213     return TRUE;
214   else
215     return FALSE;
216 }
217 
218 
word_outline_errs(WERD_RES * word)219 inT16 word_outline_errs(WERD_RES *word) {
220   PBLOB_IT outword_it;
221   inT16 i = 0;
222   inT16 err_count = 0;
223 
224   outword_it.set_to_list (word->outword->blob_list ());
225 
226   for (outword_it.mark_cycle_pt ();
227   !outword_it.cycled_list (); outword_it.forward ()) {
228     err_count += count_outline_errs (word->best_choice->unichar_string()[i],
229                                     outword_it.data()->out_list()->length());
230     i++;
231   }
232   return err_count;
233 }
234 
235 
236 /*************************************************************************
237  * word_char_quality()
238  * Combination of blob quality and outline quality - how many good chars are
239  * there? - I.e chars which pass the blob AND outline tests.
240  *************************************************************************/
word_char_quality(WERD_RES * word,ROW * row,inT16 * match_count,inT16 * accepted_match_count)241 void word_char_quality(WERD_RES *word,
242                        ROW *row,
243                        inT16 *match_count,
244                        inT16 *accepted_match_count) {
245   WERD *bln_word;                //BL norm init word
246   TWERD *tessword;               //tess format
247   WERD *init_word;               //BL norm init word
248   PBLOB_IT outword_it;
249   PBLOB_IT initial_it;
250   inT16 i;
251   inT16 init_blobs_left;
252   BOOL8 matched;
253   TBOX out_box;
254   PBLOB *test_blob;
255   DENORM denorm;
256   float bln_xht;
257   inT16 j = 0;
258 
259   *match_count = 0;
260   *accepted_match_count = 0;
261   if (word->word->gblob_list ()->empty ())
262     return;
263 
264                                  //xht used for blnorm
265   bln_xht = bln_x_height / word->denorm.scale ();
266   bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);
267   /*
268     NOTE: Need to convert to tess format and back again to ensure that the
269     same float -> int rounding of coords is done to source wd as out wd before
270     comparison
271   */
272   tessword = make_tess_word(bln_word, NULL);  // Convert word.
273   init_word = make_ed_word (tessword, bln_word);
274   delete bln_word;
275   delete_word(tessword);
276   if (init_word == NULL)
277     return;
278 
279   initial_it.set_to_list (init_word->blob_list ());
280   init_blobs_left = initial_it.length ();
281   outword_it.set_to_list (word->outword->blob_list ());
282 
283   for (outword_it.mark_cycle_pt ();
284   !outword_it.cycled_list (); outword_it.forward ()) {
285     out_box = outword_it.data ()->bounding_box ();
286 
287     /* Skip any initial blobs LEFT of current outword blob */
288     while (!initial_it.at_last () &&
289     (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
290       initial_it.forward ();
291       init_blobs_left--;
292     }
293 
294     /* See if current outword blob matches any initial blob with the same left
295       coord. (Normally only one but possibly more - in unknown order) */
296 
297     i = 0;
298     matched = FALSE;
299     do {
300       test_blob = initial_it.data_relative (i++);
301       matched = crude_match_blobs (test_blob, outword_it.data ());
302       if (matched &&
303         (count_outline_errs (word->best_choice->unichar_string()[j],
304         outword_it.data ()->out_list ()->length ())
305       == 0)) {
306         (*match_count)++;
307         if (word->reject_map[j].accepted ())
308           (*accepted_match_count)++;
309       }
310     }
311     while (!matched &&
312       (init_blobs_left - i > 0) &&
313       (i < 129) &&
314       !initial_it.at_last () &&
315       test_blob->bounding_box ().left () == out_box.left ());
316     j++;
317   }
318   delete init_word;
319 }
320 
321 
322 /*************************************************************************
323  * unrej_good_chs()
324  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
325  *************************************************************************/
unrej_good_chs(WERD_RES * word,ROW * row)326 void unrej_good_chs(WERD_RES *word, ROW *row) {
327   WERD *bln_word;                //BL norm init word
328   TWERD *tessword;               //tess format
329   WERD *init_word;               //BL norm init word
330   PBLOB_IT outword_it;
331   PBLOB_IT initial_it;
332   inT16 i;
333   inT16 init_blobs_left;
334   BOOL8 matched;
335   TBOX out_box;
336   PBLOB *test_blob;
337   DENORM denorm;
338   float bln_xht;
339   inT16 j = 0;
340 
341   if (word->word->gblob_list ()->empty ())
342     return;
343 
344                                  //xht used for blnorm
345   bln_xht = bln_x_height / word->denorm.scale ();
346   bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);
347   /*
348     NOTE: Need to convert to tess format and back again to ensure that the
349     same float -> int rounding of coords is done to source wd as out wd before
350     comparison
351   */
352   tessword = make_tess_word(bln_word, NULL);  // Convert word
353   init_word = make_ed_word (tessword, bln_word);
354   delete bln_word;
355   delete_word(tessword);
356   if (init_word == NULL)
357     return;
358 
359   initial_it.set_to_list (init_word->blob_list ());
360   init_blobs_left = initial_it.length ();
361   outword_it.set_to_list (word->outword->blob_list ());
362 
363   for (outword_it.mark_cycle_pt ();
364   !outword_it.cycled_list (); outword_it.forward ()) {
365     out_box = outword_it.data ()->bounding_box ();
366 
367     /* Skip any initial blobs LEFT of current outword blob */
368     while (!initial_it.at_last () &&
369     (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
370       initial_it.forward ();
371       init_blobs_left--;
372     }
373 
374     /* See if current outword blob matches any initial blob with the same left
375       coord. (Normally only one but possibly more - in unknown order) */
376 
377     i = 0;
378     matched = FALSE;
379     do {
380       test_blob = initial_it.data_relative (i++);
381       matched = crude_match_blobs (test_blob, outword_it.data ());
382       if (matched &&
383         (word->reject_map[j].accept_if_good_quality ()) &&
384         (docqual_excuse_outline_errs ||
385         (count_outline_errs (word->best_choice->unichar_string()[j],
386         outword_it.data ()->out_list ()->
387         length ()) == 0)))
388         word->reject_map[j].setrej_quality_accept ();
389     }
390     while (!matched &&
391       (init_blobs_left - i > 0) &&
392       (i < 129) &&
393       !initial_it.at_last () &&
394       test_blob->bounding_box ().left () == out_box.left ());
395     j++;
396   }
397   delete init_word;
398 }
399 
400 
print_boxes(WERD * word)401 void print_boxes(WERD *word) {
402   PBLOB_IT it;
403   TBOX box;
404 
405   it.set_to_list (word->blob_list ());
406   for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
407     box = it.data ()->bounding_box ();
408     box.print ();
409   }
410 }
411 
412 
count_outline_errs(char c,inT16 outline_count)413 inT16 count_outline_errs(char c, inT16 outline_count) {
414   int expected_outline_count;
415 
416   if (STRING (outlines_odd).contains (c))
417     return 0;                    //Dont use this char
418   else if (STRING (outlines_2).contains (c))
419     expected_outline_count = 2;
420   else
421     expected_outline_count = 1;
422   return abs (outline_count - expected_outline_count);
423 }
424 
425 
426 namespace tesseract {
quality_based_rejection(PAGE_RES_IT & page_res_it,BOOL8 good_quality_doc)427 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
428                                         BOOL8 good_quality_doc) {
429   if ((tessedit_good_quality_unrej && good_quality_doc))
430     unrej_good_quality_words(page_res_it);
431   doc_and_block_rejection(page_res_it, good_quality_doc);
432 
433   page_res_it.restart_page ();
434   while (page_res_it.word () != NULL) {
435     insert_rej_cblobs(page_res_it.word());
436     page_res_it.forward();
437   }
438 
439   if (unlv_tilde_crunching) {
440     tilde_crunch(page_res_it);
441     tilde_delete(page_res_it);
442   }
443 }
444 
445 
446 /*************************************************************************
447  * unrej_good_quality_words()
448  * Accept potential rejects in words which pass the following checks:
449  *    - Contains a potential reject
450  *    - Word looks like a sensible alpha word.
451  *    - Word segmentation is the same as the original image
452  *		- All characters have the expected number of outlines
453  * NOTE - the rejection counts are recalculated after unrejection
454  *      - CANT do it in a single pass without a bit of fiddling
455  *		- keep it simple but inefficient
456  *************************************************************************/
unrej_good_quality_words(PAGE_RES_IT & page_res_it)457 void Tesseract::unrej_good_quality_words(  //unreject potential
458                                          PAGE_RES_IT &page_res_it) {
459   WERD_RES *word;
460   ROW_RES *current_row;
461   BLOCK_RES *current_block;
462   int i;
463 
464   page_res_it.restart_page ();
465   while (page_res_it.word () != NULL) {
466     check_debug_pt (page_res_it.word (), 100);
467     if (bland_unrej) {
468       word = page_res_it.word ();
469       for (i = 0; i < word->reject_map.length (); i++) {
470         if (word->reject_map[i].accept_if_good_quality ())
471           word->reject_map[i].setrej_quality_accept ();
472       }
473       page_res_it.forward ();
474     }
475     else if ((page_res_it.row ()->char_count > 0) &&
476       ((page_res_it.row ()->rej_count /
477       (float) page_res_it.row ()->char_count) <=
478     quality_rowrej_pc)) {
479       word = page_res_it.word ();
480       if (word->reject_map.quality_recoverable_rejects () &&
481         (tessedit_unrej_any_wd ||
482         acceptable_word_string (word->best_choice->unichar_string().string(),
483                                 word->best_choice->unichar_lengths().string())
484       != AC_UNACCEPTABLE)) {
485         unrej_good_chs (word, page_res_it.row ()->row);
486       }
487       page_res_it.forward ();
488     }
489     else {
490       /* Skip to end of dodgy row */
491       current_row = page_res_it.row ();
492       while ((page_res_it.word () != NULL) &&
493         (page_res_it.row () == current_row))
494         page_res_it.forward ();
495     }
496     check_debug_pt (page_res_it.word (), 110);
497   }
498   page_res_it.restart_page ();
499   page_res_it.page_res->char_count = 0;
500   page_res_it.page_res->rej_count = 0;
501   current_block = NULL;
502   current_row = NULL;
503   while (page_res_it.word () != NULL) {
504     if (current_block != page_res_it.block ()) {
505       current_block = page_res_it.block ();
506       current_block->char_count = 0;
507       current_block->rej_count = 0;
508     }
509     if (current_row != page_res_it.row ()) {
510       current_row = page_res_it.row ();
511       current_row->char_count = 0;
512       current_row->rej_count = 0;
513       current_row->whole_word_rej_count = 0;
514     }
515     page_res_it.rej_stat_word ();
516     page_res_it.forward ();
517   }
518 }
519 
520 
521 /*************************************************************************
522  * doc_and_block_rejection()
523  *
524  * If the page has too many rejects - reject all of it.
525  * If any block has too many rejects - reject all words in the block
526  *************************************************************************/
527 
doc_and_block_rejection(PAGE_RES_IT & page_res_it,BOOL8 good_quality_doc)528 void Tesseract::doc_and_block_rejection(  //reject big chunks
529                                         PAGE_RES_IT &page_res_it,
530                                         BOOL8 good_quality_doc) {
531   inT16 block_no = 0;
532   inT16 row_no = 0;
533   BLOCK_RES *current_block;
534   ROW_RES *current_row;
535 
536   BOOL8 rej_word;
537   BOOL8 prev_word_rejected;
538   inT16 char_quality;
539   inT16 accepted_char_quality;
540 
541   if ((page_res_it.page_res->rej_count * 100.0 /
542   page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
543     reject_whole_page(page_res_it);
544     #ifndef SECURE_NAMES
545     if (tessedit_debug_doc_rejection) {
546       tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
547         page_res_it.page_res->char_count,
548         page_res_it.page_res->rej_count);
549     }
550     #endif
551   }
552   else {
553     #ifndef SECURE_NAMES
554     if (tessedit_debug_doc_rejection)
555       tprintf ("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
556         page_res_it.page_res->char_count,
557         page_res_it.page_res->rej_count);
558     #endif
559 
560     /* Walk blocks testing for block rejection */
561 
562     page_res_it.restart_page ();
563     while (page_res_it.word () != NULL) {
564       current_block = page_res_it.block ();
565       block_no = current_block->block->index();
566       if ((page_res_it.block ()->char_count > 0) &&
567         ((page_res_it.block ()->rej_count * 100.0 /
568         page_res_it.block ()->char_count) >
569       tessedit_reject_block_percent)) {
570         #ifndef SECURE_NAMES
571         if (tessedit_debug_block_rejection)
572           tprintf ("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
573             block_no,
574             page_res_it.block ()->char_count,
575             page_res_it.block ()->rej_count);
576         #endif
577         prev_word_rejected = FALSE;
578         while ((page_res_it.word () != NULL) &&
579         (page_res_it.block () == current_block)) {
580           if (tessedit_preserve_blk_rej_perfect_wds) {
581             rej_word =
582               (page_res_it.word ()->reject_map.reject_count () > 0)
583               || (page_res_it.word ()->reject_map.length () <
584               tessedit_preserve_min_wd_len);
585             if (rej_word && tessedit_dont_blkrej_good_wds
586               && !(page_res_it.word ()->reject_map.length () <
587               tessedit_preserve_min_wd_len)
588               &&
589               (acceptable_word_string
590                (page_res_it.word()->best_choice->unichar_string().string(),
591                page_res_it.word ()->best_choice->unichar_lengths().string()) !=
592                AC_UNACCEPTABLE)) {
593               word_char_quality (page_res_it.word (),
594                 page_res_it.row ()->row,
595                 &char_quality,
596                 &accepted_char_quality);
597               rej_word = char_quality !=
598                 page_res_it.word ()->reject_map.length ();
599             }
600           }
601           else
602             rej_word = TRUE;
603           if (rej_word) {
604             /*
605               Reject spacing if both current and prev words are rejected.
606               NOTE - this is NOT restricted to FUZZY spaces. - When tried this
607               generated more space errors.
608             */
609             if (tessedit_use_reject_spaces &&
610               prev_word_rejected &&
611               (page_res_it.prev_row () == page_res_it.row ()) &&
612               (page_res_it.word ()->word->space () == 1))
613               page_res_it.word ()->reject_spaces = TRUE;
614             page_res_it.word ()->reject_map.rej_word_block_rej ();
615           }
616           prev_word_rejected = rej_word;
617           page_res_it.forward ();
618         }
619       }
620       else {
621         #ifndef SECURE_NAMES
622         if (tessedit_debug_block_rejection)
623           tprintf
624             ("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
625             block_no, page_res_it.block ()->char_count,
626             page_res_it.block ()->rej_count);
627         #endif
628 
629         /* Walk rows in block testing for row rejection */
630         row_no = 0;
631         while ((page_res_it.word () != NULL) &&
632         (page_res_it.block () == current_block)) {
633           current_row = page_res_it.row ();
634           row_no++;
635           /* Reject whole row if:
636             fraction of chars on row which are rejected exceed a limit AND
637             fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
638             limit
639           */
640           if ((page_res_it.row ()->char_count > 0) &&
641             ((page_res_it.row ()->rej_count * 100.0 /
642             page_res_it.row ()->char_count) >
643             tessedit_reject_row_percent) &&
644             ((page_res_it.row ()->whole_word_rej_count * 100.0 /
645             page_res_it.row ()->rej_count) <
646           tessedit_whole_wd_rej_row_percent)) {
647             #ifndef SECURE_NAMES
648             if (tessedit_debug_block_rejection)
649               tprintf
650                 ("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
651                 row_no, page_res_it.row ()->char_count,
652                 page_res_it.row ()->rej_count);
653             #endif
654             prev_word_rejected = FALSE;
655             while ((page_res_it.word () != NULL) &&
656             (page_res_it.row () == current_row)) {
657               /* Preserve words on good docs unless they are mostly rejected*/
658               if (!tessedit_row_rej_good_docs && good_quality_doc) {
659                 rej_word =
660                   page_res_it.word ()->reject_map.
661                   reject_count () /
662                   (float) page_res_it.word ()->reject_map.
663                   length () > tessedit_good_doc_still_rowrej_wd;
664               }
665 
666               /* Preserve perfect words anyway */
667               else if (tessedit_preserve_row_rej_perfect_wds) {
668                 rej_word =
669                   (page_res_it.word ()->reject_map.
670                   reject_count () > 0)
671                   || (page_res_it.word ()->reject_map.
672                   length () < tessedit_preserve_min_wd_len);
673                 if (rej_word && tessedit_dont_rowrej_good_wds
674                   && !(page_res_it.word ()->reject_map.
675                   length () <
676                   tessedit_preserve_min_wd_len)
677                   &&
678                   (acceptable_word_string
679                    (page_res_it.word ()->best_choice->
680                     unichar_string().string(),
681                     page_res_it.word ()->best_choice->
682                     unichar_lengths().string()) != AC_UNACCEPTABLE)) {
683                   word_char_quality (page_res_it.word (),
684                     page_res_it.row ()->row,
685                     &char_quality,
686                     &accepted_char_quality);
687                   rej_word = char_quality !=
688                     page_res_it.word ()->reject_map.length ();
689                 }
690               }
691               else
692                 rej_word = TRUE;
693               if (rej_word) {
694                 /*
695                   Reject spacing if both current and prev words are rejected.
696                   NOTE - this is NOT restricted to FUZZY spaces. - When tried
697                   this generated more space errors.
698                 */
699                 if (tessedit_use_reject_spaces &&
700                   prev_word_rejected &&
701                   (page_res_it.prev_row () ==
702                   page_res_it.row ())
703                   && (page_res_it.word ()->word->space () ==
704                   1))
705                   page_res_it.word ()->reject_spaces = TRUE;
706                 page_res_it.word ()->reject_map.
707                   rej_word_row_rej();
708               }
709               prev_word_rejected = rej_word;
710               page_res_it.forward ();
711             }
712           }
713           else {
714             #ifndef SECURE_NAMES
715             if (tessedit_debug_block_rejection)
716               tprintf
717                 ("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
718                 row_no, page_res_it.row ()->char_count,
719                 page_res_it.row ()->rej_count);
720             #endif
721             while ((page_res_it.word () != NULL) &&
722               (page_res_it.row () == current_row))
723               page_res_it.forward ();
724           }
725         }
726       }
727     }
728   }
729 }
730 }  // namespace tesseract
731 
732 
733 /*************************************************************************
734  * reject_whole_page()
735  * Dont believe any of it - set the reject map to 00..00 in all words
736  *
737  *************************************************************************/
738 
reject_whole_page(PAGE_RES_IT & page_res_it)739 void reject_whole_page(PAGE_RES_IT &page_res_it) {
740   page_res_it.restart_page ();
741   while (page_res_it.word () != NULL) {
742     page_res_it.word ()->reject_map.rej_word_doc_rej ();
743     page_res_it.forward ();
744   }
745                                  //whole page is rejected
746   page_res_it.page_res->rejected = TRUE;
747 }
748 
749 namespace tesseract {
tilde_crunch(PAGE_RES_IT & page_res_it)750 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
751   WERD_RES *word;
752   GARBAGE_LEVEL garbage_level;
753   PAGE_RES_IT copy_it;
754   BOOL8 prev_potential_marked = FALSE;
755   BOOL8 found_terrible_word = FALSE;
756   BOOL8 ok_dict_word;
757 
758   page_res_it.restart_page ();
759   while (page_res_it.word () != NULL) {
760     word = page_res_it.word ();
761 
762     if (crunch_early_convert_bad_unlv_chs)
763       convert_bad_unlv_chs(word);
764 
765     if (crunch_early_merge_tess_fails)
766       merge_tess_fails(word);
767 
768     if (word->reject_map.accept_count () != 0) {
769       found_terrible_word = FALSE;
770                                  //Forget earlier potential crunches
771       prev_potential_marked = FALSE;
772     }
773     else {
774       ok_dict_word = safe_dict_word(*(word->best_choice));
775       garbage_level = garbage_word (word, ok_dict_word);
776 
777       if ((garbage_level != G_NEVER_CRUNCH) &&
778       (terrible_word_crunch (word, garbage_level))) {
779         if (crunch_debug > 0) {
780           tprintf ("T CRUNCHING: \"%s\"\n",
781             word->best_choice->unichar_string().string());
782         }
783         word->unlv_crunch_mode = CR_KEEP_SPACE;
784         if (prev_potential_marked) {
785           while (copy_it.word () != word) {
786             if (crunch_debug > 0) {
787               tprintf ("P1 CRUNCHING: \"%s\"\n",
788                 copy_it.word()->best_choice->unichar_string().string());
789             }
790             copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
791             copy_it.forward ();
792           }
793           prev_potential_marked = FALSE;
794         }
795         found_terrible_word = TRUE;
796       }
797       else if ((garbage_level != G_NEVER_CRUNCH) &&
798         (potential_word_crunch (word,
799       garbage_level, ok_dict_word))) {
800         if (found_terrible_word) {
801           if (crunch_debug > 0) {
802             tprintf ("P2 CRUNCHING: \"%s\"\n",
803               word->best_choice->unichar_string().string());
804           }
805           word->unlv_crunch_mode = CR_KEEP_SPACE;
806         }
807         else if (!prev_potential_marked) {
808           copy_it = page_res_it;
809           prev_potential_marked = TRUE;
810           if (crunch_debug > 1) {
811             tprintf ("P3 CRUNCHING: \"%s\"\n",
812               word->best_choice->unichar_string().string());
813           }
814         }
815       }
816       else {
817         found_terrible_word = FALSE;
818                                  //Forget earlier potential crunches
819         prev_potential_marked = FALSE;
820         if (crunch_debug > 2) {
821           tprintf ("NO CRUNCH: \"%s\"\n",
822             word->best_choice->unichar_string().string());
823         }
824       }
825     }
826     page_res_it.forward ();
827   }
828 }
829 }  // namespace tesseract
830 
831 
terrible_word_crunch(WERD_RES * word,GARBAGE_LEVEL garbage_level)832 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
833   float rating_per_ch;
834   int adjusted_len;
835   int crunch_mode = 0;
836 
837   if ((word->best_choice->unichar_string().length () == 0) ||
838     (strspn (word->best_choice->unichar_string().string(), " ") ==
839     word->best_choice->unichar_string().length ()))
840     crunch_mode = 1;
841   else {
842     adjusted_len = word->reject_map.length ();
843     if (adjusted_len > crunch_rating_max)
844       adjusted_len = crunch_rating_max;
845     rating_per_ch = word->best_choice->rating () / adjusted_len;
846 
847     if (rating_per_ch > crunch_terrible_rating)
848       crunch_mode = 2;
849     else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
850       crunch_mode = 3;
851     else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
852       (garbage_level != G_OK))
853       crunch_mode = 4;
854     else if ((rating_per_ch > crunch_poor_garbage_rate) &&
855       (garbage_level != G_OK))
856       crunch_mode = 5;
857   }
858   if (crunch_mode > 0) {
859     if (crunch_debug > 2) {
860       tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
861         crunch_mode, word->best_choice->unichar_string().string());
862     }
863     return TRUE;
864   }
865   else
866     return FALSE;
867 }
868 
869 namespace tesseract {
potential_word_crunch(WERD_RES * word,GARBAGE_LEVEL garbage_level,BOOL8 ok_dict_word)870 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word,
871                                        GARBAGE_LEVEL garbage_level,
872                                        BOOL8 ok_dict_word) {
873   float rating_per_ch;
874   int adjusted_len;
875   const char *str = word->best_choice->unichar_string().string();
876   const char *lengths = word->best_choice->unichar_lengths().string();
877   BOOL8 word_crunchable;
878   int poor_indicator_count = 0;
879 
880   word_crunchable =
881     !crunch_leave_accept_strings ||
882     (word->reject_map.length () < 3) ||
883     ((acceptable_word_string (str, lengths) == AC_UNACCEPTABLE) &&
884      !ok_dict_word);
885 
886   adjusted_len = word->reject_map.length ();
887   if (adjusted_len > 10)
888     adjusted_len = 10;
889   rating_per_ch = word->best_choice->rating () / adjusted_len;
890 
891   if (rating_per_ch > crunch_pot_poor_rate) {
892     if (crunch_debug > 2) {
893       tprintf ("Potential poor rating on \"%s\"\n",
894         word->best_choice->unichar_string().string());
895     }
896     poor_indicator_count++;
897   }
898 
899   if (word_crunchable &&
900   (word->best_choice->certainty () < crunch_pot_poor_cert)) {
901     if (crunch_debug > 2) {
902       tprintf ("Potential poor cert on \"%s\"\n",
903         word->best_choice->unichar_string().string());
904     }
905     poor_indicator_count++;
906   }
907 
908   if (garbage_level != G_OK) {
909     if (crunch_debug > 2) {
910       tprintf ("Potential garbage on \"%s\"\n",
911         word->best_choice->unichar_string().string());
912     }
913     poor_indicator_count++;
914   }
915   return (poor_indicator_count >= crunch_pot_indicators);
916 }
917 }  // namespace tesseract
918 
919 
920 namespace tesseract {
tilde_delete(PAGE_RES_IT & page_res_it)921 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
922   WERD_RES *word;
923   PAGE_RES_IT copy_it;
924   BOOL8 deleting_from_bol = FALSE;
925   BOOL8 marked_delete_point = FALSE;
926   inT16 debug_delete_mode;
927   CRUNCH_MODE delete_mode;
928   inT16 x_debug_delete_mode;
929   CRUNCH_MODE x_delete_mode;
930 
931   page_res_it.restart_page ();
932   while (page_res_it.word () != NULL) {
933     word = page_res_it.word ();
934 
935     delete_mode = word_deletable (word, debug_delete_mode);
936     if (delete_mode != CR_NONE) {
937       if (word->word->flag (W_BOL) || deleting_from_bol) {
938         if (crunch_debug > 0) {
939           tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
940             debug_delete_mode,
941             word->best_choice->unichar_string().string());
942         }
943         word->unlv_crunch_mode = delete_mode;
944         deleting_from_bol = TRUE;
945       }
946       else if (word->word->flag (W_EOL)) {
947         if (marked_delete_point) {
948           while (copy_it.word () != word) {
949             x_delete_mode = word_deletable (copy_it.word (),
950               x_debug_delete_mode);
951             if (crunch_debug > 0) {
952               tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
953                 x_debug_delete_mode,
954                 copy_it.word()->best_choice->unichar_string().string());
955             }
956             copy_it.word ()->unlv_crunch_mode = x_delete_mode;
957             copy_it.forward ();
958           }
959         }
960         if (crunch_debug > 0) {
961           tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
962             debug_delete_mode,
963             word->best_choice->unichar_string().string());
964         }
965         word->unlv_crunch_mode = delete_mode;
966         deleting_from_bol = FALSE;
967         marked_delete_point = FALSE;
968       }
969       else {
970         if (!marked_delete_point) {
971           copy_it = page_res_it;
972           marked_delete_point = TRUE;
973         }
974       }
975     }
976     else {
977       deleting_from_bol = FALSE;
978                                  //Forget earlier potential crunches
979       marked_delete_point = FALSE;
980     }
981     /*
982       The following step has been left till now as the tess fails are used to
983       determine if the word is deletable.
984     */
985     if (!crunch_early_merge_tess_fails)
986       merge_tess_fails(word);
987     page_res_it.forward ();
988   }
989 }
990 
991 
convert_bad_unlv_chs(WERD_RES * word_res)992 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
993   int i;
994   UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-");
995   UNICHAR_ID unichar_space = unicharset.unichar_to_id(" ");
996   UNICHAR_ID unichar_tilde = unicharset.unichar_to_id("~");
997   UNICHAR_ID unichar_pow = unicharset.unichar_to_id("^");
998   bool modified = false;
999   for (i = 0; i < word_res->reject_map.length(); ++i) {
1000     if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
1001       word_res->best_choice->set_unichar_id(unichar_dash, i);
1002       modified = true;
1003       if (word_res->reject_map[i].accepted ())
1004         word_res->reject_map[i].setrej_unlv_rej ();
1005     }
1006     if (word_res->best_choice->unichar_id(i) == unichar_pow) {
1007       word_res->best_choice->set_unichar_id(unichar_space, i);
1008       modified = true;
1009       if (word_res->reject_map[i].accepted ())
1010         word_res->reject_map[i].setrej_unlv_rej ();
1011     }
1012   }
1013   if (modified) {
1014     word_res->best_choice->populate_unichars(unicharset);
1015   }
1016 }
1017 
1018 // Change pairs of tess failures to a single one
merge_tess_fails(WERD_RES * word_res)1019 void Tesseract::merge_tess_fails(WERD_RES *word_res) {
1020   PBLOB_IT blob_it;              //blobs
1021   int len = word_res->best_choice->length();
1022   bool modified = false;
1023 
1024   ASSERT_HOST (word_res->reject_map.length () == len);
1025   ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
1026 
1027   UNICHAR_ID unichar_space = unicharset.unichar_to_id(" ");
1028   blob_it = word_res->outword->blob_list ();
1029   int i = 0;
1030   while (i < word_res->best_choice->length()-1) {
1031     if ((word_res->best_choice->unichar_id(i) == unichar_space) &&
1032         (word_res->best_choice->unichar_id(i+1) == unichar_space)) {
1033       modified = true;
1034       word_res->best_choice->remove_unichar_id(i);
1035       word_res->reject_map.remove_pos (i);
1036       merge_blobs (blob_it.data_relative (1), blob_it.data ());
1037       delete blob_it.extract (); //get rid of spare
1038     } else {
1039       i++;
1040     }
1041     blob_it.forward ();
1042   }
1043   len = word_res->best_choice->length();
1044   ASSERT_HOST (word_res->reject_map.length () == len);
1045   ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
1046   if (modified) {
1047     word_res->best_choice->populate_unichars(unicharset);
1048   }
1049 }
1050 
garbage_word(WERD_RES * word,BOOL8 ok_dict_word)1051 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
1052   enum STATES
1053   {
1054     JUNK,
1055     FIRST_UPPER,
1056     FIRST_LOWER,
1057     FIRST_NUM,
1058     SUBSEQUENT_UPPER,
1059     SUBSEQUENT_LOWER,
1060     SUBSEQUENT_NUM
1061   };
1062   const char *str = word->best_choice->unichar_string().string();
1063   const char *lengths = word->best_choice->unichar_lengths().string();
1064   STATES state = JUNK;
1065   int len = 0;
1066   int isolated_digits = 0;
1067   int isolated_alphas = 0;
1068   int bad_char_count = 0;
1069   int tess_rejs = 0;
1070   int dodgy_chars = 0;
1071   int ok_chars;
1072   UNICHAR_ID last_char = -1;
1073   int alpha_repetition_count = 0;
1074   int longest_alpha_repetition_count = 0;
1075   int longest_lower_run_len = 0;
1076   int lower_string_count = 0;
1077   int longest_upper_run_len = 0;
1078   int upper_string_count = 0;
1079   int total_alpha_count = 0;
1080   int total_digit_count = 0;
1081 
1082   for (; *str != '\0'; str += *(lengths++)) {
1083     len++;
1084     if (unicharset.get_isupper (str, *lengths)) {
1085       total_alpha_count++;
1086       switch (state) {
1087         case SUBSEQUENT_UPPER:
1088         case FIRST_UPPER:
1089           state = SUBSEQUENT_UPPER;
1090           upper_string_count++;
1091           if (longest_upper_run_len < upper_string_count)
1092             longest_upper_run_len = upper_string_count;
1093           if (last_char == unicharset.unichar_to_id(str, *lengths)) {
1094             alpha_repetition_count++;
1095             if (longest_alpha_repetition_count < alpha_repetition_count) {
1096               longest_alpha_repetition_count = alpha_repetition_count;
1097             }
1098           }
1099           else {
1100             last_char = unicharset.unichar_to_id(str, *lengths);
1101             alpha_repetition_count = 1;
1102           }
1103           break;
1104         case FIRST_NUM:
1105           isolated_digits++;
1106         default:
1107           state = FIRST_UPPER;
1108           last_char = unicharset.unichar_to_id(str, *lengths);
1109           alpha_repetition_count = 1;
1110           upper_string_count = 1;
1111           break;
1112       }
1113     }
1114     else if (unicharset.get_islower (str, *lengths)) {
1115       total_alpha_count++;
1116       switch (state) {
1117         case SUBSEQUENT_LOWER:
1118         case FIRST_LOWER:
1119           state = SUBSEQUENT_LOWER;
1120           lower_string_count++;
1121           if (longest_lower_run_len < lower_string_count)
1122             longest_lower_run_len = lower_string_count;
1123           if (last_char == unicharset.unichar_to_id(str, *lengths)) {
1124             alpha_repetition_count++;
1125             if (longest_alpha_repetition_count < alpha_repetition_count) {
1126               longest_alpha_repetition_count = alpha_repetition_count;
1127             }
1128           }
1129           else {
1130             last_char = unicharset.unichar_to_id(str, *lengths);
1131             alpha_repetition_count = 1;
1132           }
1133           break;
1134         case FIRST_NUM:
1135           isolated_digits++;
1136         default:
1137           state = FIRST_LOWER;
1138           last_char = unicharset.unichar_to_id(str, *lengths);
1139           alpha_repetition_count = 1;
1140           lower_string_count = 1;
1141           break;
1142       }
1143     }
1144     else if (unicharset.get_isdigit (str, *lengths)) {
1145       total_digit_count++;
1146       switch (state) {
1147         case FIRST_NUM:
1148           state = SUBSEQUENT_NUM;
1149         case SUBSEQUENT_NUM:
1150           break;
1151         case FIRST_UPPER:
1152         case FIRST_LOWER:
1153           isolated_alphas++;
1154         default:
1155           state = FIRST_NUM;
1156           break;
1157       }
1158     }
1159     else {
1160       if (*lengths == 1 && *str == ' ')
1161         tess_rejs++;
1162       else
1163         bad_char_count++;
1164       switch (state) {
1165         case FIRST_NUM:
1166           isolated_digits++;
1167           break;
1168         case FIRST_UPPER:
1169         case FIRST_LOWER:
1170           isolated_alphas++;
1171         default:
1172           break;
1173       }
1174       state = JUNK;
1175     }
1176   }
1177 
1178   switch (state) {
1179     case FIRST_NUM:
1180       isolated_digits++;
1181       break;
1182     case FIRST_UPPER:
1183     case FIRST_LOWER:
1184       isolated_alphas++;
1185     default:
1186       break;
1187   }
1188 
1189   if (crunch_include_numerals) {
1190     total_alpha_count += total_digit_count - isolated_digits;
1191   }
1192 
1193   if (crunch_leave_ok_strings &&
1194     (len >= 4) &&
1195     (2 * (total_alpha_count - isolated_alphas) > len) &&
1196   (longest_alpha_repetition_count < crunch_long_repetitions)) {
1197     if ((crunch_accept_ok &&
1198       (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE)) ||
1199       (longest_lower_run_len > crunch_leave_lc_strings) ||
1200       (longest_upper_run_len > crunch_leave_uc_strings))
1201       return G_NEVER_CRUNCH;
1202   }
1203   if ((word->reject_map.length () > 1) &&
1204     (strpbrk (str, " ") == NULL) &&
1205     ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
1206     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
1207     (word->best_choice->permuter () == USER_DAWG_PERM) ||
1208     (word->best_choice->permuter () == NUMBER_PERM) ||
1209     (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE) || ok_dict_word))
1210     return G_OK;
1211 
1212   ok_chars = len - bad_char_count - isolated_digits -
1213     isolated_alphas - tess_rejs;
1214 
1215   if (crunch_debug > 3) {
1216     tprintf ("garbage_word: \"%s\"\n",
1217       word->best_choice->unichar_string().string());
1218     tprintf ("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
1219       len,
1220       bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
1221   }
1222   if ((bad_char_count == 0) &&
1223     (tess_rejs == 0) &&
1224     ((len > isolated_digits + isolated_alphas) || (len <= 2)))
1225     return G_OK;
1226 
1227   if ((tess_rejs > ok_chars) ||
1228     ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
1229     return G_TERRIBLE;
1230 
1231   if (len > 4) {
1232     dodgy_chars = 2 * tess_rejs + bad_char_count +
1233       isolated_digits + isolated_alphas;
1234     if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
1235       return G_DODGY;
1236     else
1237       return G_OK;
1238   }
1239   else {
1240     dodgy_chars = 2 * tess_rejs + bad_char_count;
1241     if (((len == 4) && (dodgy_chars > 2)) ||
1242       ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
1243       return G_DODGY;
1244     else
1245       return G_OK;
1246   }
1247 }
1248 }  // namespace tesseract
1249 
1250 
1251 /*************************************************************************
1252  * word_deletable()
1253  *     DELETE WERDS AT ENDS OF ROWS IF
1254  *        Word is crunched &&
1255  *        ( string length = 0                                          OR
1256  *          > 50% of chars are "|" (before merging)                    OR
1257  *          certainty < -10                                            OR
1258  *          rating /char > 60                                          OR
1259  *          TOP of word is more than 0.5 xht BELOW baseline            OR
1260  *          BOTTOM of word is more than 0.5 xht ABOVE xht              OR
1261  *          length of word < 3xht                                      OR
1262  *          height of word < 0.7 xht                                   OR
1263  *          height of word > 3.0 xht                                   OR
1264  *          >75% of the outline BBs have longest dimension < 0.5xht
1265  *************************************************************************/
1266 
word_deletable(WERD_RES * word,inT16 & delete_mode)1267 CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
1268   int word_len = word->reject_map.length ();
1269   float rating_per_ch;
1270   TBOX box;                       //BB of word
1271 
1272   if (word->unlv_crunch_mode == CR_NONE) {
1273     delete_mode = 0;
1274     return CR_NONE;
1275   }
1276 
1277   if (word_len == 0) {
1278     delete_mode = 1;
1279     return CR_DELETE;
1280   }
1281 
1282   box = word->outword->bounding_box ();
1283   if (box.height () < crunch_del_min_ht * bln_x_height) {
1284     delete_mode = 4;
1285     return CR_DELETE;
1286   }
1287 
1288   if (noise_outlines (word->outword)) {
1289     delete_mode = 5;
1290     return CR_DELETE;
1291   }
1292 
1293   if ((failure_count (word) * 1.5) > word_len) {
1294     delete_mode = 2;
1295     return CR_LOOSE_SPACE;
1296   }
1297 
1298   if (word->best_choice->certainty () < crunch_del_cert) {
1299     delete_mode = 7;
1300     return CR_LOOSE_SPACE;
1301   }
1302 
1303   rating_per_ch = word->best_choice->rating () / word_len;
1304 
1305   if (rating_per_ch > crunch_del_rating) {
1306     delete_mode = 8;
1307     return CR_LOOSE_SPACE;
1308   }
1309 
1310   if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) {
1311     delete_mode = 9;
1312     return CR_LOOSE_SPACE;
1313   }
1314 
1315   if (box.bottom () >
1316   bln_baseline_offset + crunch_del_high_word * bln_x_height) {
1317     delete_mode = 10;
1318     return CR_LOOSE_SPACE;
1319   }
1320 
1321   if (box.height () > crunch_del_max_ht * bln_x_height) {
1322     delete_mode = 11;
1323     return CR_LOOSE_SPACE;
1324   }
1325 
1326   if (box.width () < crunch_del_min_width * bln_x_height) {
1327     delete_mode = 3;
1328     return CR_LOOSE_SPACE;
1329   }
1330 
1331   delete_mode = 0;
1332   return CR_NONE;
1333 }
1334 
failure_count(WERD_RES * word)1335 inT16 failure_count(WERD_RES *word) {
1336   const char *str = word->best_choice->unichar_string().string();
1337   int tess_rejs = 0;
1338 
1339   for (; *str != '\0'; str++) {
1340     if (*str == ' ')
1341       tess_rejs++;
1342   }
1343   return tess_rejs;
1344 }
1345 
1346 
noise_outlines(WERD * word)1347 BOOL8 noise_outlines(WERD *word) {
1348   PBLOB_IT blob_it;
1349   OUTLINE_IT outline_it;
1350   TBOX box;                       //BB of outline
1351   inT16 outline_count = 0;
1352   inT16 small_outline_count = 0;
1353   inT16 max_dimension;
1354   float small_limit = bln_x_height * crunch_small_outlines_size;
1355 
1356   blob_it.set_to_list (word->blob_list ());
1357   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1358     outline_it.set_to_list (blob_it.data ()->out_list ());
1359     for (outline_it.mark_cycle_pt ();
1360     !outline_it.cycled_list (); outline_it.forward ()) {
1361       outline_count++;
1362       box = outline_it.data ()->bounding_box ();
1363       if (box.height () > box.width ())
1364         max_dimension = box.height ();
1365       else
1366         max_dimension = box.width ();
1367       if (max_dimension < small_limit)
1368         small_outline_count++;
1369     }
1370   }
1371   return (small_outline_count >= outline_count);
1372 }
1373 
1374 
1375 /*************************************************************************
1376  * insert_rej_cblobs()
1377  * Put rejected word blobs back into the outword.
1378  * NOTE!!! AFTER THIS THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER
1379  * OF ELEMENTS.
1380  *************************************************************************/
1381 namespace tesseract {
insert_rej_cblobs(WERD_RES * word)1382 void Tesseract::insert_rej_cblobs(WERD_RES *word) {
1383   PBLOB_IT blob_it;              //blob iterator
1384   PBLOB_IT rej_blob_it;
1385   const STRING *word_str;
1386   const STRING *word_lengths;
1387   int old_len;
1388   int rej_len;
1389   char new_str[512 * UNICHAR_LEN];
1390   char new_lengths[512];
1391   REJMAP new_map;
1392   int i = 0;                     //new_str index
1393   int j = 0;                     //old_str index
1394   int i_offset = 0;              //new_str offset
1395   int j_offset = 0;              //old_str offset
1396   int new_len;
1397 
1398   gblob_sort_list (word->outword->rej_blob_list (), TRUE);
1399   rej_blob_it.set_to_list (word->outword->rej_blob_list ());
1400   if (rej_blob_it.empty ())
1401     return;
1402   rej_len = rej_blob_it.length ();
1403   blob_it.set_to_list (word->outword->blob_list ());
1404   word_str = &(word->best_choice->unichar_string());
1405   word_lengths = &(word->best_choice->unichar_lengths());
1406   old_len = word->best_choice->length();
1407   ASSERT_HOST (word->reject_map.length () == old_len);
1408   ASSERT_HOST (blob_it.length () == old_len);
1409   if ((old_len + rej_len) > 511)
1410     return;                      //Word is garbage anyway prevent abort
1411   new_map.initialise (old_len + rej_len);
1412 
1413   while (!rej_blob_it.empty ()) {
1414     if ((j >= old_len) ||
1415       (rej_blob_it.data ()->bounding_box ().left () <=
1416     blob_it.data ()->bounding_box ().left ())) {
1417       /* Insert reject blob */
1418       if (j >= old_len)
1419         blob_it.add_to_end (rej_blob_it.extract ());
1420       else
1421         blob_it.add_before_stay_put (rej_blob_it.extract ());
1422       if (!rej_blob_it.empty ())
1423         rej_blob_it.forward ();
1424       new_str[i_offset] = ' ';
1425       new_lengths[i] = 1;
1426       new_map[i].setrej_rej_cblob ();
1427       i_offset += new_lengths[i++];
1428     }
1429     else {
1430       strncpy(new_str + i_offset, &(*word_str)[j_offset],
1431               (*word_lengths)[j]);
1432       new_lengths[i] = (*word_lengths)[j];
1433       new_map[i] = word->reject_map[j];
1434       i_offset += new_lengths[i++];
1435       j_offset += (*word_lengths)[j++];
1436       blob_it.forward ();
1437     }
1438   }
1439   /* Add any extra normal blobs to strings */
1440   while (j < word_lengths->length ()) {
1441     strncpy(new_str + i_offset, &(*word_str)[j_offset],
1442             (*word_lengths)[j]);
1443     new_lengths[i] = (*word_lengths)[j];
1444     new_map[i] = word->reject_map[j];
1445     i_offset += new_lengths[i++];
1446     j_offset += (*word_lengths)[j++];
1447   }
1448   new_str[i_offset] = '\0';
1449   new_lengths[i] = 0;
1450   /*
1451     tprintf(
1452           "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
1453           old_len, i, new_str, new_map );
1454   */
1455   ASSERT_HOST (i == blob_it.length ());
1456   ASSERT_HOST (i == old_len + rej_len);
1457   word->reject_map = new_map;
1458 
1459   // Update word->best_choice if needed.
1460   if (strcmp(new_str, word->best_choice->unichar_string().string()) != 0 ||
1461       strcmp(new_lengths, word->best_choice->unichar_lengths().string()) != 0) {
1462     WERD_CHOICE *new_choice =
1463       new WERD_CHOICE(new_str, new_lengths,
1464                       word->best_choice->rating(),
1465                       word->best_choice->certainty(),
1466                       word->best_choice->permuter(),
1467                       getDict().getUnicharset());
1468    new_choice->populate_unichars(getDict().getUnicharset());
1469    delete word->best_choice;
1470    word->best_choice = new_choice;
1471   }
1472   new_len = word->best_choice->length();
1473   ASSERT_HOST (word->reject_map.length () == new_len);
1474   ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
1475 
1476 }
1477 }  // namespace tesseract
1478