• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************
2  * File:        docqual.h  (Formerly docqual.h)
3  * Description: Document Quality Metrics
4  * Author:		Phil Cheatle
5  * Created:		Mon May  9 11:27:28 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef           DOCQUAL_H
21 #define           DOCQUAL_H
22 
23 #include          "control.h"
24 #include          "notdll.h"
25 
// Garbage classification passed to terrible_word_crunch() (declared below).
// The enumerators have no explicit initialisers, so they take the implicit
// values 0..3 in declaration order.
// NOTE(review): the names suggest increasing severity from G_OK to
// G_TERRIBLE — confirm against the implementation file.
26 enum GARBAGE_LEVEL
27 {
28   G_NEVER_CRUNCH,  // implicit value 0
29   G_OK,  // implicit value 1
30   G_DODGY,  // implicit value 2
31   G_TERRIBLE  // implicit value 3
32 };
33 
// Tunable parameters for outline checks, unrejection and document/block/row
// rejection. Each one is declared through a *_VAR_H macro (STRING, BOOL,
// double or INT) whose definition is not visible in this header — presumably
// it comes from one of the includes above. The arguments are
// (name, value, description string).
// NOTE(review): the second argument looks like the default value — confirm
// against the macro definition and the matching definitions in the .cpp file.

// Character sets for the outline-count check. Both variables carry the same
// description string.
34 extern STRING_VAR_H (outlines_odd, "%| ", "Non standard number of outlines");
35 extern STRING_VAR_H (outlines_2, "ij!?%\":;",
36 "Non standard number of outlines");
37 extern BOOL_VAR_H (docqual_excuse_outline_errs, FALSE,
38 "Allow outline errs in unrejection?");
39 extern BOOL_VAR_H (tessedit_good_quality_unrej, TRUE,
40 "Reduce rejection on good docs");
41 extern BOOL_VAR_H (tessedit_use_reject_spaces, TRUE, "Reject spaces?");

// Percentage thresholds for rejecting a whole document, block or row.
42 extern double_VAR_H (tessedit_reject_doc_percent, 65.00,
43 "%rej allowed before rej whole doc");
44 extern double_VAR_H (tessedit_reject_block_percent, 45.00,
45 "%rej allowed before rej whole block");
46 extern double_VAR_H (tessedit_reject_row_percent, 40.00,
47 "%rej allowed before rej whole row");
48 extern double_VAR_H (tessedit_whole_wd_rej_row_percent, 70.00,
49 "%of row rejects in whole word rejects which prevents whole row rejection");
50 extern BOOL_VAR_H (tessedit_preserve_blk_rej_perfect_wds, TRUE,
51 "Only rej partially rejected words in block rejection");
52 extern BOOL_VAR_H (tessedit_preserve_row_rej_perfect_wds, TRUE,
53 "Only rej partially rejected words in row rejection");
// NOTE(review): the block and row variants below share one description
// string, which mentions neither block nor row.
54 extern BOOL_VAR_H (tessedit_dont_blkrej_good_wds, FALSE,
55 "Use word segmentation quality metric");
56 extern BOOL_VAR_H (tessedit_dont_rowrej_good_wds, FALSE,
57 "Use word segmentation quality metric");
58 extern INT_VAR_H (tessedit_preserve_min_wd_len, 2,
59 "Only preserve wds longer than this");
60 extern BOOL_VAR_H (tessedit_row_rej_good_docs, TRUE,
61 "Apply row rejection to good docs");
// NOTE(review): described as a fraction, yet the value is 1.1 (> 1.0) —
// presumably this makes the condition unreachable by default; confirm.
62 extern double_VAR_H (tessedit_good_doc_still_rowrej_wd, 1.1,
63 "rej good doc wd if more than this fraction rejected");
64 extern BOOL_VAR_H (tessedit_reject_bad_qual_wds, TRUE,
65 "Reject all bad quality wds");

// Debug switches.
66 extern BOOL_VAR_H (tessedit_debug_doc_rejection, FALSE, "Page stats");
67 extern BOOL_VAR_H (tessedit_debug_quality_metrics, FALSE,
68 "Output data to debug file");
// NOTE(review): "chekcs" in the description string is a typo for "checks".
// Left unchanged here so the string stays identical to any matching definition.
69 extern BOOL_VAR_H (bland_unrej, FALSE, "unrej potential with no chekcs");
70 extern double_VAR_H (quality_rowrej_pc, 1.1,
71 "good_quality_doc gte good char limit");

// Early tilde-crunch switches.
72 extern BOOL_VAR_H (unlv_tilde_crunching, TRUE,
73 "Mark v.bad words for tilde crunch");
74 extern BOOL_VAR_H (crunch_early_merge_tess_fails, TRUE,
75 "Before word crunch?");
76 extern BOOL_VAR_H (crunch_early_convert_bad_unlv_chs, FALSE,
77 "Take out ~^ early?");
72 extern BOOL_VAR_H (unlv_tilde_crunching, TRUE,
73 "Mark v.bad words for tilde crunch");
74 extern BOOL_VAR_H (crunch_early_merge_tess_fails, TRUE,
75 "Before word crunch?");
76 extern BOOL_VAR_H (crunch_early_convert_bad_unlv_chs, FALSE,
77 "Take out ~^ early?");
78 extern double_VAR_H (crunch_terrible_rating, 80.0, "crunch rating lt this");
79 extern BOOL_VAR_H (crunch_terrible_garbage, TRUE, "As it says");
80 extern double_VAR_H (crunch_poor_garbage_cert, -9.0,
81 "crunch garbage cert lt this");
82 extern double_VAR_H (crunch_poor_garbage_rate, 60,
83 "crunch garbage rating lt this");
84 extern double_VAR_H (crunch_pot_poor_rate, 40,
85 "POTENTIAL crunch rating lt this");
86 extern double_VAR_H (crunch_pot_poor_cert, -8.0,
87 "POTENTIAL crunch cert lt this");
88 extern BOOL_VAR_H (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
89 extern double_VAR_H (crunch_del_rating, 60,
90 "POTENTIAL crunch rating lt this");
91 extern double_VAR_H (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
92 extern double_VAR_H (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
93 extern double_VAR_H (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
94 extern double_VAR_H (crunch_del_min_width, 3.0,
95 "Del if word width lt xht x this");
96 extern double_VAR_H (crunch_del_high_word, 1.5,
97 "Del if word gt xht x this above bl");
98 extern double_VAR_H (crunch_del_low_word, 0.5,
99 "Del if word gt xht x this below bl");
100 extern double_VAR_H (crunch_small_outlines_size, 0.6,
101 "Small if lt xht x this");
102 extern INT_VAR_H (crunch_rating_max, 10, "For adj length in rating per ch");
103 extern INT_VAR_H (crunch_pot_indicators, 1,
104 "How many potential indicators needed");
105 extern BOOL_VAR_H (crunch_leave_ok_strings, TRUE,
106 "Dont touch sensible strings");
107 extern BOOL_VAR_H (crunch_accept_ok, TRUE, "Use acceptability in okstring");
108 extern BOOL_VAR_H (crunch_leave_accept_strings, FALSE,
109 "Dont pot crunch sensible strings");
110 extern BOOL_VAR_H (crunch_include_numerals, FALSE, "Fiddle alpha figures");
// Integer crunch settings, declared through INT_VAR_H (macro definition not
// visible here); the arguments are (name, value, description string).
// crunch_leave_lc_strings and crunch_leave_uc_strings are a matched
// lower-case / upper-case pair, so each description names its own case.
111 extern INT_VAR_H (crunch_leave_lc_strings, 4,
112 "Dont crunch words with long lower case strings");
// The description used to repeat the lower-case text verbatim (copy-paste).
113 extern INT_VAR_H (crunch_leave_uc_strings, 4,
114 "Dont crunch words with long upper case strings");
115 extern INT_VAR_H (crunch_long_repetitions, 3,
116 "Crunch words with long repetitions");
117 extern INT_VAR_H (crunch_debug, 0, "As it says");
// Word-level quality and rejection routines. Only the declarations are
// visible in this header, so the notes below restate the signatures and the
// existing comments; behaviour must be confirmed in the implementation file.

// Takes a word result and its row; returns an inT16.
// Existing note: "Blob seg changes".
118 inT16 word_blob_quality(  //Blob seg changes
119                         WERD_RES *word,
120                         ROW *row);
// Compares two blobs and returns a BOOL8.
// NOTE(review): "crude" presumably means an approximate match — confirm.
121 BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2);
// Takes a word result; returns an inT16. Existing note: "Outline count errs".
122 inT16 word_outline_errs(  //Outline count errs
123                         WERD_RES *word);
// Returns nothing directly; match_count and accepted_match_count are
// non-const pointers, presumably out-parameters that receive the results.
// NOTE(review): the "Blob seg changes" comment duplicates the one on
// word_blob_quality and looks copy-pasted.
124 void word_char_quality(  //Blob seg changes
125                        WERD_RES *word,
126                        ROW *row,
127                        inT16 *match_count,
128                        inT16 *accepted_match_count);
129 void unrej_good_chs(WERD_RES *word, ROW *row);
130 void print_boxes(WERD *word);
// Takes a character and an outline count; returns an inT16.
131 inT16 count_outline_errs(char c, inT16 outline_count);
// Takes the page-results iterator by non-const reference.
132 void reject_whole_page(PAGE_RES_IT &page_res_it);
// Takes a word result and a GARBAGE_LEVEL; returns a BOOL8.
133 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);
134                                  //word to do  -- NOTE(review): orphaned comment, attaches to no parameter
// Returns a CRUNCH_MODE; delete_mode is a non-const reference, presumably
// written by the callee.
135 CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode);
136 inT16 failure_count(WERD_RES *word);
137 BOOL8 noise_outlines(WERD *word);
138 #endif
139