1 /****************************************************************** 2 * File: docqual.h (Formerly docqual.h) 3 * Description: Document Quality Metrics 4 * Author: Phil Cheatle 5 * Created: Mon May 9 11:27:28 BST 1994 6 * 7 * (C) Copyright 1994, Hewlett-Packard Ltd. 8 ** Licensed under the Apache License, Version 2.0 (the "License"); 9 ** you may not use this file except in compliance with the License. 10 ** You may obtain a copy of the License at 11 ** http://www.apache.org/licenses/LICENSE-2.0 12 ** Unless required by applicable law or agreed to in writing, software 13 ** distributed under the License is distributed on an "AS IS" BASIS, 14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 ** See the License for the specific language governing permissions and 16 ** limitations under the License. 17 * 18 **********************************************************************/ 19 20 #ifndef DOCQUAL_H 21 #define DOCQUAL_H 22 23 #include "control.h" 24 #include "notdll.h" 25 26 enum GARBAGE_LEVEL 27 { 28 G_NEVER_CRUNCH, 29 G_OK, 30 G_DODGY, 31 G_TERRIBLE 32 }; 33 34 extern STRING_VAR_H (outlines_odd, "%| ", "Non standard number of outlines"); 35 extern STRING_VAR_H (outlines_2, "ij!?%\":;", 36 "Non standard number of outlines"); 37 extern BOOL_VAR_H (docqual_excuse_outline_errs, FALSE, 38 "Allow outline errs in unrejection?"); 39 extern BOOL_VAR_H (tessedit_good_quality_unrej, TRUE, 40 "Reduce rejection on good docs"); 41 extern BOOL_VAR_H (tessedit_use_reject_spaces, TRUE, "Reject spaces?"); 42 extern double_VAR_H (tessedit_reject_doc_percent, 65.00, 43 "%rej allowed before rej whole doc"); 44 extern double_VAR_H (tessedit_reject_block_percent, 45.00, 45 "%rej allowed before rej whole block"); 46 extern double_VAR_H (tessedit_reject_row_percent, 40.00, 47 "%rej allowed before rej whole row"); 48 extern double_VAR_H (tessedit_whole_wd_rej_row_percent, 70.00, 49 "%of row rejects in whole word rejects which prevents whole row rejection"); 50 extern BOOL_VAR_H (tessedit_preserve_blk_rej_perfect_wds, TRUE, 51 "Only rej partially rejected words in block rejection"); 52 extern BOOL_VAR_H (tessedit_preserve_row_rej_perfect_wds, TRUE, 53 "Only rej partially rejected words in row rejection"); 54 extern BOOL_VAR_H (tessedit_dont_blkrej_good_wds, FALSE, 55 "Use word segmentation quality metric"); 56 extern BOOL_VAR_H (tessedit_dont_rowrej_good_wds, FALSE, 57 "Use word segmentation quality metric"); 58 extern INT_VAR_H (tessedit_preserve_min_wd_len, 2, 59 "Only preserve wds longer than this"); 60 extern BOOL_VAR_H (tessedit_row_rej_good_docs, TRUE, 61 "Apply row rejection to good docs"); 62 extern double_VAR_H (tessedit_good_doc_still_rowrej_wd, 1.1, 63 "rej good doc wd if more than this fraction rejected"); 64 extern BOOL_VAR_H (tessedit_reject_bad_qual_wds, TRUE, 65 "Reject all bad quality wds"); 66 extern BOOL_VAR_H (tessedit_debug_doc_rejection, FALSE, "Page stats"); 67 extern BOOL_VAR_H (tessedit_debug_quality_metrics, FALSE, 68 "Output data to debug file"); 69 extern BOOL_VAR_H (bland_unrej, FALSE, "unrej potential with no chekcs"); 70 extern double_VAR_H (quality_rowrej_pc, 1.1, 71 "good_quality_doc gte good char limit"); 72 extern BOOL_VAR_H (unlv_tilde_crunching, TRUE, 73 "Mark v.bad words for tilde crunch"); 74 extern BOOL_VAR_H (crunch_early_merge_tess_fails, TRUE, 75 "Before word crunch?"); 76 extern BOOL_VAR_H (crunch_early_convert_bad_unlv_chs, FALSE, 77 "Take out ~^ early?"); 78 extern double_VAR_H (crunch_terrible_rating, 80.0, "crunch rating lt this"); 79 extern BOOL_VAR_H (crunch_terrible_garbage, TRUE, "As it says"); 80 extern double_VAR_H (crunch_poor_garbage_cert, -9.0, 81 "crunch garbage cert lt this"); 82 extern double_VAR_H (crunch_poor_garbage_rate, 60, 83 "crunch garbage rating lt this"); 84 extern double_VAR_H (crunch_pot_poor_rate, 40, 85 "POTENTIAL crunch rating lt this"); 86 extern double_VAR_H (crunch_pot_poor_cert, -8.0, 87 "POTENTIAL crunch cert lt this"); 88 extern BOOL_VAR_H (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage"); 89 extern double_VAR_H (crunch_del_rating, 60, 90 "POTENTIAL crunch rating lt this"); 91 extern double_VAR_H (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this"); 92 extern double_VAR_H (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this"); 93 extern double_VAR_H (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this"); 94 extern double_VAR_H (crunch_del_min_width, 3.0, 95 "Del if word width lt xht x this"); 96 extern double_VAR_H (crunch_del_high_word, 1.5, 97 "Del if word gt xht x this above bl"); 98 extern double_VAR_H (crunch_del_low_word, 0.5, 99 "Del if word gt xht x this below bl"); 100 extern double_VAR_H (crunch_small_outlines_size, 0.6, 101 "Small if lt xht x this"); 102 extern INT_VAR_H (crunch_rating_max, 10, "For adj length in rating per ch"); 103 extern INT_VAR_H (crunch_pot_indicators, 1, 104 "How many potential indicators needed"); 105 extern BOOL_VAR_H (crunch_leave_ok_strings, TRUE, 106 "Dont touch sensible strings"); 107 extern BOOL_VAR_H (crunch_accept_ok, TRUE, "Use acceptability in okstring"); 108 extern BOOL_VAR_H (crunch_leave_accept_strings, FALSE, 109 "Dont pot crunch sensible strings"); 110 extern BOOL_VAR_H (crunch_include_numerals, FALSE, "Fiddle alpha figures"); 111 extern INT_VAR_H (crunch_leave_lc_strings, 4, 112 "Dont crunch words with long lower case strings"); 113 extern INT_VAR_H (crunch_leave_uc_strings, 4, 114 "Dont crunch words with long lower case strings"); 115 extern INT_VAR_H (crunch_long_repetitions, 3, 116 "Crunch words with long repetitions"); 117 extern INT_VAR_H (crunch_debug, 0, "As it says"); 118 inT16 word_blob_quality( //Blob seg changes 119 WERD_RES *word, 120 ROW *row); 121 BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2); 122 inT16 word_outline_errs( //Outline count errs 123 WERD_RES *word); 124 void word_char_quality( //Blob seg changes 125 WERD_RES *word, 126 ROW *row, 127 inT16 *match_count, 128 inT16 *accepted_match_count); 129 void unrej_good_chs(WERD_RES *word, ROW *row); 130 void print_boxes(WERD *word); 131 inT16 count_outline_errs(char c, inT16 outline_count); 132 void reject_whole_page(PAGE_RES_IT &page_res_it); 133 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level); 134 //word to do 135 CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode); 136 inT16 failure_count(WERD_RES *word); 137 BOOL8 noise_outlines(WERD *word); 138 #endif 139