1 /**********************************************************************
2 * File: reject.cpp (Formerly reject.c)
3 * Description: Rejection functions used in tessedit
4 * Author: Phil Cheatle
5 * Created: Wed Sep 23 16:50:21 BST 1992
6 *
7 * (C) Copyright 1992, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #include "mfcpch.h"
21 #include "tessvars.h"
22 #ifdef __UNIX__
23 #include <assert.h>
24 #include <errno.h>
25 #endif
26 #include "scanutils.h"
27 #include <ctype.h>
28 #include <string.h>
29 //#include "tessbox.h"
30 #include "memry.h"
31 #include "reject.h"
32 #include "tfacep.h"
33 #include "mainblk.h"
34 #include "charcut.h"
35 #include "imgs.h"
36 #include "scaleimg.h"
37 #include "control.h"
38 #include "docqual.h"
39 #include "secname.h"
40 #include "globals.h"
41
42 /* #define SECURE_NAMES done in secnames.h when necessary */
43
44 //extern "C" {
45 #include "callnet.h"
46 //}
47 #include "tesseractclass.h"
48 #include "notdll.h"
49
// List support for STRING. char_ambiguities() in this file uses STRING_CLIST
// and STRING_C_IT - presumably generated by these macros; TODO confirm.
CLISTIZEH (STRING) CLISTIZE (STRING)
// EXTERN is defined as empty here, so each declaration below is emitted
// without any extra storage-class prefix.
#define EXTERN

// Algorithm selectors: make_reject_map() only accepts reject mode 0 or 5 and
// set_done() only accepts ok mode 0..5; any other value is treated as fatal.
EXTERN
INT_VAR (tessedit_reject_mode, 0, "Rejection algorithm");
EXTERN
INT_VAR (tessedit_ok_mode, 5, "Acceptance decision algorithm");
// When TRUE, make_reject_map() calls nn_recover_rejects() on pass 2.
EXTERN
BOOL_VAR (tessedit_use_nn, FALSE, "");
// Enables the tprintf diagnostics in set_done(), make_reject_map() and
// char_ambiguities().
EXTERN
BOOL_VAR (tessedit_rejection_debug, FALSE, "Adaption debug");
EXTERN
BOOL_VAR (tessedit_rejection_stats, FALSE, "Show NN stats");
EXTERN
BOOL_VAR (tessedit_flip_0O, TRUE, "Contextual 0O O0 flips");
EXTERN
double_VAR (tessedit_lower_flip_hyphen, 1.5,
            "Aspect ratio dot/hyphen test");
EXTERN
double_VAR (tessedit_upper_flip_hyphen, 1.8,
            "Aspect ratio dot/hyphen test");

// Controls read by one_ell_conflict() when deciding whether the dictionary
// evidence for a word can be trusted.
EXTERN
BOOL_VAR (rej_trust_doc_dawg, FALSE,
          "Use DOC dawg in 11l conf. detector");
EXTERN
BOOL_VAR (rej_1Il_use_dict_word, FALSE, "Use dictword test");
EXTERN
BOOL_VAR (rej_1Il_trust_permuter_type, TRUE, "Dont double check");

// The following NN (neural net) controls are not read by any function in the
// visible part of this file - NOTE(review): their users are presumably
// further down; confirm before changing defaults.
EXTERN
BOOL_VAR (one_ell_conflict_default, TRUE, "one_ell_conflict default");
EXTERN
BOOL_VAR (show_char_clipping, FALSE, "Show clip image window?");
// nn_debug makes nn_recover_rejects() print the reject map before and after.
EXTERN
BOOL_VAR (nn_debug, FALSE, "NN DEBUGGING?");
EXTERN
BOOL_VAR (nn_reject_debug, FALSE, "NN DEBUG each char?");
EXTERN
BOOL_VAR (nn_lax, FALSE, "Use 2nd rate matches");
EXTERN
BOOL_VAR (nn_double_check_dict, FALSE, "Double check");
EXTERN
BOOL_VAR (nn_conf_double_check_dict, TRUE,
          "Double check for confusions");
EXTERN
BOOL_VAR (nn_conf_1Il, TRUE, "NN use 1Il conflicts");
EXTERN
BOOL_VAR (nn_conf_Ss, TRUE, "NN use Ss conflicts");
EXTERN
BOOL_VAR (nn_conf_hyphen, TRUE, "NN hyphen conflicts");
EXTERN
BOOL_VAR (nn_conf_test_good_qual, FALSE, "NN dodgy 1Il cross check");
EXTERN
BOOL_VAR (nn_conf_test_dict, TRUE, "NN dodgy 1Il cross check");
EXTERN
BOOL_VAR (nn_conf_test_sensible, TRUE, "NN dodgy 1Il cross check");
EXTERN
BOOL_VAR (nn_conf_strict_on_dodgy_chs, TRUE,
          "Require stronger NN match");
EXTERN
double_VAR (nn_dodgy_char_threshold, 0.99, "min accept score");
EXTERN
INT_VAR (nn_conf_accept_level, 4, "NN accept dodgy 1Il matches? ");
EXTERN
INT_VAR (nn_conf_initial_i_level, 3,
         "NN accept initial Ii match level ");

// Switches consulted by nn_recover_rejects() after the NN matcher has run.
EXTERN
BOOL_VAR (no_unrej_dubious_chars, TRUE, "Dubious chars next to reject?");
EXTERN
BOOL_VAR (no_unrej_no_alphanum_wds, TRUE, "Stop unrej of non A/N wds?");
EXTERN
BOOL_VAR (no_unrej_1Il, FALSE, "Stop unrej of 1Ilchars?");
// Individual on/off switches for the mode 5 tests in make_reject_map().
EXTERN
BOOL_VAR (rej_use_tess_accepted, TRUE, "Individual rejection control");
EXTERN
BOOL_VAR (rej_use_tess_blanks, TRUE, "Individual rejection control");
EXTERN
BOOL_VAR (rej_use_good_perm, TRUE, "Individual rejection control");
EXTERN
BOOL_VAR (rej_use_sensible_wd, FALSE, "Extend permuter check");
EXTERN
BOOL_VAR (rej_alphas_in_number_perm, FALSE, "Extend permuter check");

EXTERN
double_VAR (rej_whole_of_mostly_reject_word_fract, 0.85,
            "if >this fract");
// nn_recover_rejects() calls reject_mostly_rejects() when this equals 1.
EXTERN
INT_VAR (rej_mostly_reject_mode, 1,
         "0-never, 1-afterNN, 2-after new xht");
EXTERN
double_VAR (tessed_fullstop_aspect_ratio, 1.2,
            "if >this fract then reject");

EXTERN
INT_VAR (net_image_width, 40, "NN input image width");
EXTERN
INT_VAR (net_image_height, 36, "NN input image height");
EXTERN
INT_VAR (net_image_x_height, 22, "NN input image x_height");
// make_reject_map() calls reject_edge_blobs() only when this is > -1;
// reject_edge_blobs() uses it as the margin (in pixels) from each image edge.
EXTERN
INT_VAR (tessedit_image_border, 2, "Rej blbs near image edge limit");

/*
  Net input is assumed to have (net_image_width * net_image_height) input
  units of image pixels, followed by 0, 1, or N units representing the
  baseline position. 0 implies no baseline information. 1 implies a floating
  point value. N implies a "gauge" of N units. For any char an initial set
  of these are ON, the remainder OFF to indicate the "level" of the
  baseline.

  HOWEVER!!! NOTE THAT EACH NEW INPUT LAYER FORMAT EXPECTS TO BE RUN WITH A
  DIFFERENT tessed/netmatch/nmatch.c MODULE. - These are classic C modules
  generated by aspirin with HARD CODED CONSTANTS
*/

EXTERN
INT_VAR (net_bl_nodes, 20, "Number of baseline nodes");

EXTERN
double_VAR (nn_reject_threshold, 0.5, "NN min accept score");
EXTERN
double_VAR (nn_reject_head_and_shoulders, 0.6, "top scores sep factor");

/* NOTE - ctoh doesn't handle "=" properly, hence \075 */
// nn_recover_rejects() tests a one-character word against
// ok_single_ch_non_alphanum_wds before rejecting a word with no alphanumerics.
EXTERN
STRING_VAR (ok_single_ch_non_alphanum_wds, "-?\075",
            "Allow NN to unrej");
EXTERN
STRING_VAR (ok_repeated_ch_non_alphanum_wds, "-?*\075",
            "Allow NN to unrej");
// The set of characters treated as mutually confusable by reject_I_1_L()
// and one_ell_conflict().
EXTERN
STRING_VAR (conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
EXTERN
STRING_VAR (conflict_set_S_s, "Ss$", "Ss conflict set");
EXTERN
STRING_VAR (conflict_set_hyphen, "-_~", "hyphen conflict set");
EXTERN
STRING_VAR (dubious_chars_left_of_reject, "!'+`()-./\\<>;:^_,~\"",
            "Unreliable chars");
EXTERN
STRING_VAR (dubious_chars_right_of_reject, "!'+`()-./\\<>;:^_,~\"",
            "Unreliable chars");

// In reject mode 5, make_reject_map() rejects the whole word when
// bln_x_height / denorm.scale() is less than or equal to this value.
EXTERN
INT_VAR (min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
196
197 /*************************************************************************
198 * set_done()
199 *
200 * Set the done flag based on the word acceptability criteria
201 *************************************************************************/
202
203 namespace tesseract {
set_done(WERD_RES * word,inT16 pass)204 void Tesseract::set_done( //set done flag
205 WERD_RES *word,
206 inT16 pass) {
207 /*
208 0: Original heuristic used in Tesseract and Ray's prototype Resaljet
209 */
210 if (tessedit_ok_mode == 0) {
211 /* NOTE - done even if word contains some or all spaces !!! */
212 word->done = word->tess_accepted;
213 }
214 /*
215 1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
216 */
217 else if (tessedit_ok_mode == 1) {
218 word->done = word->tess_accepted &&
219 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
220
221 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
222 word->done = FALSE;
223 }
224 /*
225 2: as 1 + only accept dict words or numerics in pass 1
226 */
227 else if (tessedit_ok_mode == 2) {
228 word->done = word->tess_accepted &&
229 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
230
231 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
232 word->done = FALSE;
233
234 if (word->done &&
235 (pass == 1) &&
236 (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
237 (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
238 (word->best_choice->permuter () != USER_DAWG_PERM) &&
239 (word->best_choice->permuter () != NUMBER_PERM)) {
240 #ifndef SECURE_NAMES
241 if (tessedit_rejection_debug)
242 tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
243 word->best_choice->unichar_string().string ());
244 #endif
245 word->done = FALSE;
246 }
247 }
248 /*
249 3: as 2 + only accept dict words or numerics in pass 2 as well
250 */
251 else if (tessedit_ok_mode == 3) {
252 word->done = word->tess_accepted &&
253 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
254
255 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
256 word->done = FALSE;
257
258 if (word->done &&
259 (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
260 (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
261 (word->best_choice->permuter () != USER_DAWG_PERM) &&
262 (word->best_choice->permuter () != NUMBER_PERM)) {
263 #ifndef SECURE_NAMES
264 if (tessedit_rejection_debug)
265 tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
266 word->best_choice->unichar_string().string ());
267 #endif
268 word->done = FALSE;
269 }
270 }
271 /*
272 4: as 2 + reject dict ambigs in pass 1
273 */
274 else if (tessedit_ok_mode == 4) {
275 word->done = word->tess_accepted &&
276 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
277
278 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
279 word->done = FALSE;
280
281 if (word->done &&
282 (pass == 1) &&
283 (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
284 (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
285 (word->best_choice->permuter () != USER_DAWG_PERM) &&
286 (word->best_choice->permuter () != NUMBER_PERM)) ||
287 (test_ambig_word (word)))) {
288 #ifndef SECURE_NAMES
289 if (tessedit_rejection_debug)
290 tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
291 word->best_choice->unichar_string().string ());
292 #endif
293 word->done = FALSE;
294 }
295 }
296 /*
297 5: as 3 + reject dict ambigs in both passes
298 */
299 else if (tessedit_ok_mode == 5) {
300 word->done = word->tess_accepted &&
301 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
302
303 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
304 word->done = FALSE;
305
306 if (word->done &&
307 (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
308 (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
309 (word->best_choice->permuter () != USER_DAWG_PERM) &&
310 (word->best_choice->permuter () != NUMBER_PERM)) ||
311 (test_ambig_word (word)))) {
312 #ifndef SECURE_NAMES
313 if (tessedit_rejection_debug)
314 tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
315 word->best_choice->unichar_string().string ());
316 #endif
317 word->done = FALSE;
318 }
319 }
320
321 else {
322 tprintf ("BAD tessedit_ok_mode\n");
323 err_exit();
324 }
325 }
326
327
328 /*************************************************************************
329 * make_reject_map()
330 *
331 * Sets the done flag to indicate whether the resylt is acceptable.
332 *
333 * Sets a reject map for the word.
334 *************************************************************************/
make_reject_map(WERD_RES * word,BLOB_CHOICE_LIST_CLIST * blob_choices,ROW * row,inT16 pass)335 void Tesseract::make_reject_map( //make rej map for wd //detailed results
336 WERD_RES *word,
337 BLOB_CHOICE_LIST_CLIST *blob_choices,
338 ROW *row,
339 inT16 pass //1st or 2nd?
340 ) {
341 int i;
342 int offset;
343
344 flip_0O(word);
345 check_debug_pt (word, -1); //For trap only
346 set_done(word, pass); //Set acceptance
347 word->reject_map.initialise (word->best_choice->unichar_lengths().length ());
348 reject_blanks(word);
349 /*
350 0: Rays original heuristic - the baseline
351 */
352 if (tessedit_reject_mode == 0) {
353 if (!word->done)
354 reject_poor_matches(word, blob_choices);
355 }
356 /*
357 5: Reject I/1/l from words where there is no strong contextual confirmation;
358 the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
359 and the whole of any words which are very small
360 */
361 else if (tessedit_reject_mode == 5) {
362 if (bln_x_height / word->denorm.scale () <= min_sane_x_ht_pixels)
363 word->reject_map.rej_word_small_xht ();
364 else {
365 one_ell_conflict(word, TRUE);
366 /*
367 Originally the code here just used the done flag. Now I have duplicated
368 and unpacked the conditions for setting the done flag so that each
369 mechanism can be turned on or off independently. This works WITHOUT
370 affecting the done flag setting.
371 */
372 if (rej_use_tess_accepted && !word->tess_accepted)
373 word->reject_map.rej_word_not_tess_accepted ();
374
375 if (rej_use_tess_blanks &&
376 (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
377 word->reject_map.rej_word_contains_blanks ();
378
379 if (rej_use_good_perm) {
380 if (((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
381 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
382 (word->best_choice->permuter () == USER_DAWG_PERM)) &&
383 (!rej_use_sensible_wd ||
384 (acceptable_word_string
385 (word->best_choice->unichar_string().string (),
386 word->best_choice->unichar_lengths().string ()) !=
387 AC_UNACCEPTABLE))) {
388 //PASSED TEST
389 }
390 else if (word->best_choice->permuter () == NUMBER_PERM) {
391 if (rej_alphas_in_number_perm) {
392 for (i = 0, offset = 0;
393 word->best_choice->unichar_string()[offset] != '\0';
394 offset += word->best_choice->unichar_lengths()[i++]) {
395 if (word->reject_map[i].accepted () &&
396 unicharset.get_isalpha(
397 word->best_choice->unichar_string().string() + offset,
398 word->best_choice->unichar_lengths()[i]))
399 word->reject_map[i].setrej_bad_permuter ();
400 //rej alpha
401 }
402 }
403 }
404 else {
405 word->reject_map.rej_word_bad_permuter ();
406 }
407 }
408
409 /* Ambig word rejection was here once !!*/
410
411 }
412 }
413 else {
414 tprintf ("BAD tessedit_reject_mode\n");
415 err_exit();
416 }
417
418 if (tessedit_image_border > -1)
419 reject_edge_blobs(word);
420
421 check_debug_pt (word, 10);
422 if (tessedit_rejection_debug) {
423 tprintf ("Permuter Type = %d\n", word->best_choice->permuter ());
424 tprintf ("Certainty: %f Rating: %f\n",
425 word->best_choice->certainty (), word->best_choice->rating ());
426 tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
427 }
428
429 /* Un-reject any rejected characters if NN permits */
430
431 if (tessedit_use_nn && (pass == 2) &&
432 word->reject_map.recoverable_rejects ())
433 nn_recover_rejects(word, row);
434 flip_hyphens(word);
435 check_debug_pt (word, 20);
436 }
437 } // namespace tesseract
438
439
reject_blanks(WERD_RES * word)440 void reject_blanks(WERD_RES *word) {
441 inT16 i;
442 inT16 offset;
443
444 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
445 offset += word->best_choice->unichar_lengths()[i], i += 1) {
446 if (word->best_choice->unichar_string()[offset] == ' ')
447 //rej unrecognised blobs
448 word->reject_map[i].setrej_tess_failure ();
449 }
450 }
451
452
reject_I_1_L(WERD_RES * word)453 void reject_I_1_L(WERD_RES *word) {
454 inT16 i;
455 inT16 offset;
456
457 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
458 offset += word->best_choice->unichar_lengths()[i], i += 1) {
459 if (STRING (conflict_set_I_l_1).
460 contains (word->best_choice->unichar_string()[offset])) {
461 //rej 1Il conflict
462 word->reject_map[i].setrej_1Il_conflict ();
463 }
464 }
465 }
466
467
reject_poor_matches(WERD_RES * word,BLOB_CHOICE_LIST_CLIST * blob_choices)468 void reject_poor_matches( //detailed results
469 WERD_RES *word,
470 BLOB_CHOICE_LIST_CLIST *blob_choices) {
471 float threshold;
472 inT16 i = 0;
473 inT16 offset = 0;
474 //super iterator
475 BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
476 BLOB_CHOICE_IT choice_it; //real iterator
477
478 #ifndef SECURE_NAMES
479 if (strlen(word->best_choice->unichar_lengths().string()) !=
480 list_it.length()) {
481 tprintf
482 ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
483 word->best_choice->unichar_string().string(),
484 strlen (word->best_choice->unichar_lengths().string()), list_it.length(),
485 word->outword->blob_list()->length());
486 }
487 #endif
488 ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) ==
489 list_it.length ());
490 ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ());
491 threshold = compute_reject_threshold (blob_choices);
492
493 for (list_it.mark_cycle_pt ();
494 !list_it.cycled_list (); list_it.forward (), i++,
495 offset += word->best_choice->unichar_lengths()[i]) {
496 /* NB - only compares the threshold against the TOP choice char in the
497 choices list for a blob !! - the selected one may be below the threshold
498 */
499 choice_it.set_to_list (list_it.data ());
500 if ((word->best_choice->unichar_string()[offset] == ' ') ||
501 (choice_it.length () == 0))
502 //rej unrecognised blobs
503 word->reject_map[i].setrej_tess_failure ();
504 else if (choice_it.data ()->certainty () < threshold)
505 //rej poor score blob
506 word->reject_map[i].setrej_poor_match ();
507 }
508 }
509
510
511 /**********************************************************************
512 * compute_reject_threshold
513 *
514 * Set a rejection threshold for this word.
515 * Initially this is a trivial function which looks for the largest
516 * gap in the certainty value.
517 **********************************************************************/
518
compute_reject_threshold(BLOB_CHOICE_LIST_CLIST * blob_choices)519 float compute_reject_threshold( //compute threshold //detailed results
520 BLOB_CHOICE_LIST_CLIST *blob_choices) {
521 inT16 index; //to ratings
522 inT16 blob_count; //no of blobs in word
523 inT16 ok_blob_count = 0; //non TESS rej blobs in word
524 float *ratings; //array of confidences
525 float threshold; //rejection threshold
526 float bestgap; //biggest gap
527 float gapstart; //bottom of gap
528 //super iterator
529 BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
530 BLOB_CHOICE_IT choice_it; //real iterator
531
532 blob_count = blob_choices->length ();
533 ratings = (float *) alloc_mem (blob_count * sizeof (float));
534 for (list_it.mark_cycle_pt (), index = 0;
535 !list_it.cycled_list (); list_it.forward (), index++) {
536 choice_it.set_to_list (list_it.data ());
537 if (choice_it.length () > 0) {
538 ratings[ok_blob_count] = choice_it.data ()->certainty ();
539 //get in an array
540 // tprintf("Rating[%d]=%c %g %g\n",
541 // index,choice_it.data()->char_class(),
542 // choice_it.data()->rating(),choice_it.data()->certainty());
543 ok_blob_count++;
544 }
545 }
546 ASSERT_HOST (index == blob_count);
547 qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
548 //sort them
549 bestgap = 0;
550 gapstart = ratings[0] - 1; //all reject if none better
551 if (ok_blob_count >= 3) {
552 for (index = 0; index < ok_blob_count - 1; index++) {
553 if (ratings[index + 1] - ratings[index] > bestgap) {
554 bestgap = ratings[index + 1] - ratings[index];
555 //find biggest
556 gapstart = ratings[index];
557 }
558 }
559 }
560 threshold = gapstart + bestgap / 2;
561 // tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
562 // ratings[0],ratings[index],bestgap,threshold);
563
564 free_mem(ratings);
565 return threshold;
566 }
567
568
/**********************************************************************
 * sort_floats
 *
 * qsort comparison callback for an array of floats, ascending order.
 * Returns 1 if *arg1 is greater than *arg2, -1 if it is smaller and
 * 0 otherwise.
 **********************************************************************/

int sort_floats(  //qsort function
    const void *arg1,            //ptrs to floats
    const void *arg2) {
  const float *lhs = static_cast<const float *>(arg1);
  const float *rhs = static_cast<const float *>(arg2);
  const float diff = *lhs - *rhs;  //difference

  // Each comparison yields 0 or 1, so this is 1, -1 or 0.
  return (diff > 0) - (diff < 0);
}
588
589
590 /*************************************************************************
591 * reject_edge_blobs()
592 *
593 * If the word is perilously close to the edge of the image, reject those blobs
594 * in the word which are too close to the edge as they could be clipped.
595 *************************************************************************/
596
reject_edge_blobs(WERD_RES * word)597 void reject_edge_blobs(WERD_RES *word) {
598 TBOX word_box = word->word->bounding_box ();
599 TBOX blob_box;
600 PBLOB_IT blob_it = word->outword->blob_list ();
601 //blobs
602 int blobindex = 0;
603 float centre;
604
605 if ((word_box.left () < tessedit_image_border) ||
606 (word_box.bottom () < tessedit_image_border) ||
607 (word_box.right () + tessedit_image_border >
608 page_image.get_xsize () - 1) ||
609 (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) {
610 ASSERT_HOST (word->reject_map.length () == blob_it.length ());
611 for (blobindex = 0, blob_it.mark_cycle_pt ();
612 !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
613 blob_box = blob_it.data ()->bounding_box ();
614 centre = (blob_box.left () + blob_box.right ()) / 2.0;
615 if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) ||
616 (word->denorm.y (blob_box.bottom (), centre) <
617 tessedit_image_border) ||
618 (word->denorm.x (blob_box.right ()) + tessedit_image_border >
619 page_image.get_xsize () - 1) ||
620 (word->denorm.y (blob_box.top (), centre)
621 + tessedit_image_border > page_image.get_ysize () - 1)) {
622 word->reject_map[blobindex].setrej_edge_char ();
623 //close to edge
624 }
625 }
626 }
627 }
628
629
630 /**********************************************************************
631 * one_ell_conflict()
632 *
633 * Identify words where there is a potential I/l/1 error.
634 * - A bundle of contextual heuristics!
635 **********************************************************************/
636 namespace tesseract {
one_ell_conflict(WERD_RES * word_res,BOOL8 update_map)637 BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
638 const char *word;
639 const char *lengths;
640 inT16 word_len; //its length
641 inT16 first_alphanum_index_;
642 inT16 first_alphanum_offset_;
643 inT16 i;
644 inT16 offset;
645 BOOL8 non_conflict_set_char; //non conf set a/n?
646 BOOL8 conflict = FALSE;
647 BOOL8 allow_1s;
648 ACCEPTABLE_WERD_TYPE word_type;
649 BOOL8 dict_perm_type;
650 BOOL8 dict_word_ok;
651 int dict_word_type;
652
653 word = word_res->best_choice->unichar_string().string ();
654 lengths = word_res->best_choice->unichar_lengths().string();
655 word_len = strlen (lengths);
656 /*
657 If there are no occurrences of the conflict set characters then the word
658 is OK.
659 */
660 if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
661 return FALSE;
662
663 /*
664 There is a conflict if there are NO other (confirmed) alphanumerics apart
665 from those in the conflict set.
666 */
667
668 for (i = 0, offset = 0, non_conflict_set_char = FALSE;
669 (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
670 non_conflict_set_char =
671 (unicharset.get_isalpha(word + offset, lengths[i]) ||
672 unicharset.get_isdigit(word + offset, lengths[i])) &&
673 !STRING (conflict_set_I_l_1).contains (word[offset]);
674 if (!non_conflict_set_char) {
675 if (update_map)
676 reject_I_1_L(word_res);
677 return TRUE;
678 }
679
680 /*
681 If the word is accepted by a dawg permuter, and the first alpha character
682 is "I" or "l", check to see if the alternative is also a dawg word. If it
683 is, then there is a potential error otherwise the word is ok.
684 */
685
686 dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
687 (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
688 (rej_trust_doc_dawg &&
689 (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
690 (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
691 dict_word_type = dict_word(*(word_res->best_choice));
692 dict_word_ok = (dict_word_type > 0) &&
693 (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
694
695 if ((rej_1Il_use_dict_word && dict_word_ok) ||
696 (rej_1Il_trust_permuter_type && dict_perm_type) ||
697 (dict_perm_type && dict_word_ok)) {
698 first_alphanum_index_ = first_alphanum_index (word, lengths);
699 first_alphanum_offset_ = first_alphanum_offset (word, lengths);
700 if (lengths[first_alphanum_index_] == 1 &&
701 word[first_alphanum_offset_] == 'I') {
702 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
703 if (safe_dict_word(*(word_res->best_choice)) > 0) {
704 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
705 if (update_map)
706 word_res->reject_map[first_alphanum_index_].
707 setrej_1Il_conflict();
708 return TRUE;
709 }
710 else {
711 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
712 return FALSE;
713 }
714 }
715
716 if (lengths[first_alphanum_index_] == 1 &&
717 word[first_alphanum_offset_] == 'l') {
718 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
719 if (safe_dict_word(*(word_res->best_choice)) > 0) {
720 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
721 if (update_map)
722 word_res->reject_map[first_alphanum_index_].
723 setrej_1Il_conflict();
724 return TRUE;
725 }
726 else {
727 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
728 return FALSE;
729 }
730 }
731 return FALSE;
732 }
733
734 /*
735 NEW 1Il code. The old code relied on permuter types too much. In fact,
736 tess will use TOP_CHOICE permute for good things like "palette".
737 In this code the string is examined independently to see if it looks like
738 a well formed word.
739 */
740
741 /*
742 REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
743 dictionary word.
744 */
745 first_alphanum_index_ = first_alphanum_index (word, lengths);
746 first_alphanum_offset_ = first_alphanum_offset (word, lengths);
747 if (lengths[first_alphanum_index_] == 1 &&
748 word[first_alphanum_offset_] == 'l') {
749 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
750 if (safe_dict_word(*(word_res->best_choice)) > 0)
751 return FALSE;
752 else
753 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
754 }
755 else if (lengths[first_alphanum_index_] == 1 &&
756 word[first_alphanum_offset_] == 'I') {
757 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
758 if (safe_dict_word(*(word_res->best_choice)) > 0)
759 return FALSE;
760 else
761 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
762 }
763 /*
764 For strings containing digits:
765 If there are no alphas OR the numeric permuter liked the word,
766 reject any non 1 conflict chs
767 Else reject all conflict chs
768 */
769 if (word_contains_non_1_digit (word, lengths)) {
770 allow_1s = (alpha_count (word, lengths) == 0) ||
771 (word_res->best_choice->permuter () == NUMBER_PERM);
772
773 inT16 offset;
774 conflict = FALSE;
775 for (i = 0, offset = 0; word[offset] != '\0';
776 offset += word_res->best_choice->unichar_lengths()[i++]) {
777 if ((!allow_1s || (word[offset] != '1')) &&
778 STRING (conflict_set_I_l_1).contains (word[offset])) {
779 if (update_map)
780 word_res->reject_map[i].setrej_1Il_conflict ();
781 conflict = TRUE;
782 }
783 }
784 return conflict;
785 }
786 /*
787 For anything else. See if it conforms to an acceptable word type. If so,
788 treat accordingly.
789 */
790 word_type = acceptable_word_string (word, lengths);
791 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
792 first_alphanum_index_ = first_alphanum_index (word, lengths);
793 first_alphanum_offset_ = first_alphanum_offset (word, lengths);
794 if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
795 if (update_map)
796 word_res->reject_map[first_alphanum_index_].
797 setrej_1Il_conflict ();
798 return TRUE;
799 }
800 else
801 return FALSE;
802 }
803 else if (word_type == AC_UPPER_CASE) {
804 return FALSE;
805 }
806 else {
807 if (update_map)
808 reject_I_1_L(word_res);
809 return TRUE;
810 }
811 }
812
813
first_alphanum_index(const char * word,const char * word_lengths)814 inT16 Tesseract::first_alphanum_index(const char *word,
815 const char *word_lengths) {
816 inT16 i;
817 inT16 offset;
818
819 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
820 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
821 unicharset.get_isdigit(word + offset, word_lengths[i]))
822 return i;
823 }
824 return -1;
825 }
826
first_alphanum_offset(const char * word,const char * word_lengths)827 inT16 Tesseract::first_alphanum_offset(const char *word,
828 const char *word_lengths) {
829 inT16 i;
830 inT16 offset;
831
832 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
833 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
834 unicharset.get_isdigit(word + offset, word_lengths[i]))
835 return offset;
836 }
837 return -1;
838 }
839
alpha_count(const char * word,const char * word_lengths)840 inT16 Tesseract::alpha_count(const char *word,
841 const char *word_lengths) {
842 inT16 i;
843 inT16 offset;
844 inT16 count = 0;
845
846 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
847 if (unicharset.get_isalpha (word + offset, word_lengths[i]))
848 count++;
849 }
850 return count;
851 }
852
853
word_contains_non_1_digit(const char * word,const char * word_lengths)854 BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
855 const char *word_lengths) {
856 inT16 i;
857 inT16 offset;
858
859 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
860 if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
861 (word_lengths[i] != 1 || word[offset] != '1'))
862 return TRUE;
863 }
864 return FALSE;
865 }
866
867
test_ambig_word(WERD_RES * word)868 BOOL8 Tesseract::test_ambig_word( //test for ambiguity
869 WERD_RES *word) {
870 BOOL8 ambig = FALSE;
871
872 if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
873 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
874 (word->best_choice->permuter () == USER_DAWG_PERM)) {
875 ambig = !getDict().NoDangerousAmbig(
876 word->best_choice, NULL, false, NULL, NULL);
877 }
878 return ambig;
879 }
880
881 /*************************************************************************
882 * char_ambiguities()
883 *
884 * Return a pointer to a string containing the full conflict set of characters
885 * which includes the specified character, if there is one. If the specified
886 * character is not a member of a conflict set, return NULL.
887 * (NOTE that a character is assumed to be a member of only ONE conflict set.)
888 *************************************************************************/
char_ambiguities(char c)889 const char *Tesseract::char_ambiguities(char c) {
890 static STRING_CLIST conflict_sets;
891 static BOOL8 read_conflict_sets = FALSE;
892 STRING_C_IT cs_it(&conflict_sets);
893 const char *cs;
894 STRING cs_file_name;
895 FILE *cs_file;
896 char buff[1024];
897
898 if (!read_conflict_sets) {
899 cs_file_name = datadir + "confsets";
900 if (!(cs_file = fopen (cs_file_name.string (), "r"))) {
901 CANTOPENFILE.error ("char_ambiguities", EXIT, "%s %d",
902 cs_file_name.string (), errno);
903 }
904 while (fscanf (cs_file, "%s", buff) == 1) {
905 cs_it.add_after_then_move (new STRING (buff));
906 }
907 read_conflict_sets = TRUE;
908 cs_it.move_to_first ();
909 if (tessedit_rejection_debug) {
910 for (cs_it.mark_cycle_pt ();
911 !cs_it.cycled_list (); cs_it.forward ()) {
912 tprintf ("\"%s\"\n", cs_it.data ()->string ());
913 }
914 }
915 }
916
917 cs_it.move_to_first ();
918 for (cs_it.mark_cycle_pt (); !cs_it.cycled_list (); cs_it.forward ()) {
919 cs = cs_it.data ()->string ();
920 if (strchr (cs, c) != NULL)
921 return cs;
922 }
923 return NULL;
924 }
925
926 /*************************************************************************
927 * nn_recover_rejects()
928 * Generate the nn_reject_map - a copy of the current reject map, but dont
929 * reject previously rejected chars if the NN matcher agrees with the best
930 * choice.
931 *************************************************************************/
932
nn_recover_rejects(WERD_RES * word,ROW * row)933 void Tesseract::nn_recover_rejects(WERD_RES *word, ROW *row) {
934 //copy for debug
935 REJMAP old_map = word->reject_map;
936 /*
937 NOTE THAT THIS IS RELATIVELY INEFFICIENT AS THE WHOLE OF THE WERD IS
938 MATCHED BY THE NN MATCHER. IF COULD EASILY BE RESTRICTED TO JUST THE
939 REJECT CHARACTERS (Though initial use is when words are total rejects
940 anyway).
941 */
942
943 set_global_subsubloc_code(SUBSUBLOC_NN);
944 nn_match_word(word, row);
945
946 if (no_unrej_1Il)
947 dont_allow_1Il(word);
948 if (no_unrej_dubious_chars)
949 dont_allow_dubious_chars(word);
950
951 if (rej_mostly_reject_mode == 1)
952 reject_mostly_rejects(word);
953 /*
954 IF there are no unrejected alphanumerics AND
955 The word is not an acceptable single non alphanum char word AND
956 The word is not an acceptable repeated non alphanum char word
957 THEN Reject whole word
958 */
959 if (no_unrej_no_alphanum_wds &&
960 (count_alphanums (word) < 1) &&
961 !((word->best_choice->unichar_lengths().length () == 1) &&
962 STRING(ok_single_ch_non_alphanum_wds).contains(
963 word->best_choice->unichar_string()[0]))
964 && !repeated_nonalphanum_wd (word, row))
965
966 word->reject_map.rej_word_no_alphanums ();
967
968 #ifndef SECURE_NAMES
969
970 if (nn_debug) {
971 tprintf ("\nTess: \"%s\" MAP ",
972 word->best_choice->unichar_string().string());
973 old_map.print (stdout);
974 tprintf ("->");
975 word->reject_map.print (stdout);
976 tprintf ("\n");
977 }
978 #endif
979 set_global_subsubloc_code(SUBSUBLOC_OTHER);
980 }
981
nn_match_word(WERD_RES * word,ROW * row)982 void Tesseract::nn_match_word( //Match a word
983 WERD_RES *word,
984 ROW *row) {
985 PIXROW_LIST *pixrow_list;
986 PIXROW_IT pixrow_it;
987 IMAGELINE *imlines; //lines of the image
988 TBOX pix_box; //box of imlines extent
989 #ifndef GRAPHICS_DISABLED
990 ScrollView* win = NULL;
991 #endif
992 IMAGE clip_image;
993 IMAGE scaled_image;
994 float baseline_pos;
995 inT16 net_image_size;
996 inT16 clip_image_size;
997 WERD copy_outword; // copy to denorm
998 inT16 i;
999
1000 const char *word_string;
1001 const char *word_string_lengths;
1002 BOOL8 word_in_dict; //Tess wd in dict
1003 BOOL8 checked_dict_word; //Tess wd definitely in dict
1004 BOOL8 sensible_word; //OK char string
1005 BOOL8 centre; //Not at word end chs
1006 BOOL8 good_quality_word;
1007 inT16 char_quality;
1008 inT16 accepted_char_quality;
1009
1010 inT16 conf_level; //0:REJECT
1011 //1:DODGY ACCEPT
1012 //2:DICT ACCEPT
1013 //3:CLEAR ACCEPT
1014 inT16 first_alphanum_index_;
1015 inT16 first_alphanum_offset_;
1016
1017 word_string = word->best_choice->unichar_string().string();
1018 word_string_lengths = word->best_choice->unichar_lengths().string();
1019 first_alphanum_index_ = first_alphanum_index (word_string,
1020 word_string_lengths);
1021 first_alphanum_offset_ = first_alphanum_offset (word_string,
1022 word_string_lengths);
1023 word_in_dict = ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
1024 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
1025 (word->best_choice->permuter () == USER_DAWG_PERM));
1026 checked_dict_word = word_in_dict &&
1027 (safe_dict_word(*(word->best_choice)) > 0);
1028 sensible_word = acceptable_word_string (word_string, word_string_lengths) !=
1029 AC_UNACCEPTABLE;
1030
1031 word_char_quality(word, row, &char_quality, &accepted_char_quality);
1032 good_quality_word =
1033 word->best_choice->unichar_lengths().length () == char_quality;
1034
1035 #ifndef SECURE_NAMES
1036 if (nn_reject_debug) {
1037 tprintf ("Dict: %c Checked Dict: %c Sensible: %c Quality: %c\n",
1038 word_in_dict ? 'T' : 'F',
1039 checked_dict_word ? 'T' : 'F',
1040 sensible_word ? 'T' : 'F', good_quality_word ? 'T' : 'F');
1041 }
1042 #endif
1043
1044 if (word->best_choice->unichar_lengths().length () !=
1045 word->outword->blob_list ()->length ()) {
1046 #ifndef SECURE_NAMES
1047 tprintf ("nn_match_word ASSERT FAIL String:\"%s\"; #Blobs=%d\n",
1048 word->best_choice->unichar_string().string (),
1049 word->outword->blob_list ()->length ());
1050 #endif
1051 err_exit();
1052 }
1053
1054 copy_outword = *(word->outword);
1055 copy_outword.baseline_denormalise (&word->denorm);
1056 /*
1057 For each character, generate and match a new image, containing JUST the
1058 character we have clipped, centered in the image, on a white background.
1059 Note that we MUST have a square image so that we can scale it uniformly in
1060 x and y. We base the size on x_height as this can be found fairly reliably.
1061 */
1062 net_image_size = (net_image_width > net_image_height) ?
1063 net_image_width : net_image_height;
1064 clip_image_size = (inT16) floor (0.5 +
1065 net_image_size * word->x_height /
1066 net_image_x_height);
1067 if ((clip_image_size <= 1) || (net_image_size <= 1)) {
1068 return;
1069 }
1070
1071 /*
1072 Get the image of the word and the pix positions of each char
1073 */
1074 char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box);
1075 #ifndef GRAPHICS_DISABLED
1076 if (show_char_clipping) {
1077 win = display_clip_image (©_outword, page_image,
1078 pixrow_list, pix_box);
1079 }
1080 #endif
1081 pixrow_it.set_to_list (pixrow_list);
1082 pixrow_it.move_to_first ();
1083 for (pixrow_it.mark_cycle_pt (), i = 0;
1084 !pixrow_it.cycled_list (); pixrow_it.forward (), i++) {
1085 if (pixrow_it.data ()->
1086 bad_box (page_image.get_xsize (), page_image.get_ysize ()))
1087 continue;
1088 clip_image.create (clip_image_size, clip_image_size, 1);
1089 //make bin imge
1090 if (!copy_outword.flag (W_INVERSE))
1091 invert_image(&clip_image); //white background for black on white
1092 pixrow_it.data ()->char_clip_image (imlines, pix_box, row,
1093 clip_image, baseline_pos);
1094 if (copy_outword.flag (W_INVERSE))
1095 invert_image(&clip_image); //invert white on black for scaling &NN
1096 scaled_image.create (net_image_size, net_image_size, 1);
1097 scale_image(clip_image, scaled_image);
1098 baseline_pos *= net_image_size / clip_image_size;
1099 //scale with im
1100 centre = !pixrow_it.at_first () && !pixrow_it.at_last ();
1101
1102 conf_level = nn_match_char (scaled_image, baseline_pos,
1103 word_in_dict, checked_dict_word,
1104 sensible_word, centre,
1105 good_quality_word, word_string[i]);
1106 if (word->reject_map[i].recoverable ()) {
1107 if ((i == first_alphanum_index_) &&
1108 word_string_lengths[first_alphanum_index_] == 1 &&
1109 ((word_string[first_alphanum_offset_] == 'I') ||
1110 (word_string[first_alphanum_offset_] == 'i'))) {
1111 if (conf_level >= nn_conf_initial_i_level)
1112 word->reject_map[i].setrej_nn_accept ();
1113 //un-reject char
1114 }
1115 else if (conf_level > 0)
1116 //un-reject char
1117 word->reject_map[i].setrej_nn_accept ();
1118 }
1119 #ifndef GRAPHICS_DISABLED
1120 if (show_char_clipping)
1121 display_images(clip_image, scaled_image);
1122 #endif
1123 clip_image.destroy();
1124 scaled_image.destroy();
1125 }
1126
1127 delete[]imlines; // Free array of imlines
1128 delete pixrow_list;
1129
1130 #ifndef GRAPHICS_DISABLED
1131 if (show_char_clipping) {
1132 // destroy_window(win);
1133 // win->Destroy();
1134 delete win;
1135 }
1136 #endif
1137 }
1138 } // namespace tesseract
1139
1140
1141 /*************************************************************************
1142 * nn_match_char()
1143 * Call Neural Net matcher to match a single character, given a scaled,
1144 * square image
1145 *************************************************************************/
1146
nn_match_char(IMAGE & scaled_image,float baseline_pos,BOOL8 dict_word,BOOL8 checked_dict_word,BOOL8 sensible_word,BOOL8 centre,BOOL8 good_quality_word,char tess_ch)1147 inT16 nn_match_char( //of character
1148 IMAGE &scaled_image,
1149 float baseline_pos, //rel to scaled_image
1150 BOOL8 dict_word, //part of dict wd?
1151 BOOL8 checked_dict_word, //part of dict wd?
1152 BOOL8 sensible_word, //part acceptable str?
1153 BOOL8 centre, //not at word ends?
1154 BOOL8 good_quality_word, //initial segmentation
1155 char tess_ch //confirm this?
1156 ) {
1157 inT16 conf_level; //0..2
1158 inT32 row;
1159 inT32 col;
1160 inT32 y_size = scaled_image.get_ysize ();
1161 inT32 start_y = y_size - (y_size - net_image_height) / 2 - 1;
1162 inT32 end_y = start_y - net_image_height + 1;
1163 IMAGELINE imline;
1164 float *input_vector;
1165 float *input_vec_ptr;
1166 char top;
1167 float top_score;
1168 char next;
1169 float next_score;
1170 inT16 input_nodes = (net_image_height * net_image_width) + net_bl_nodes;
1171 inT16 j;
1172
1173 input_vector = (float *) alloc_mem (input_nodes * sizeof (float));
1174 input_vec_ptr = input_vector;
1175
1176 invert_image(&scaled_image); //cos nns work better
1177 for (row = start_y; row >= end_y; row--) {
1178 scaled_image.fast_get_line (0, row, net_image_width, &imline);
1179 for (col = 0; col < net_image_width; col++)
1180 *input_vec_ptr++ = imline.pixels[col];
1181 }
1182 /*
1183 The bit map presented to the net may be shorter than the image, so shift
1184 the coord to be relative to the bitmap portion.
1185 */
1186 baseline_pos -= (y_size - net_image_height) / 2.0;
1187 /*
1188 Baseline pos is 0 if below bitmap, 1 if above and in proportion otherwise.
1189 This is represented to the net as a set of bl_nodes, an initial proportion
1190 of which are set to 1.0, indicating the level of the baseline. The
1191 remainder are 0.0
1192 */
1193
1194 if (baseline_pos < 0)
1195 baseline_pos = 0;
1196 else if (baseline_pos >= net_image_height)
1197 baseline_pos = net_image_height + 1;
1198 else
1199 baseline_pos = baseline_pos + 1;
1200 baseline_pos = baseline_pos / (net_image_height + 1);
1201
1202 if (net_bl_nodes > 0) {
1203 baseline_pos *= 1.7; //Use a wider range
1204 if (net_bl_nodes > 1) {
1205 /* Multi-node baseline representation */
1206 for (j = 0; j < net_bl_nodes; j++) {
1207 if (baseline_pos > ((float) j / net_bl_nodes))
1208 *input_vec_ptr++ = 1.0;
1209 else
1210 *input_vec_ptr++ = 0.0;
1211 }
1212 }
1213 else {
1214 /* Single node baseline */
1215 *input_vec_ptr++ = baseline_pos;
1216 }
1217 }
1218
1219 callnet(input_vector, &top, &top_score, &next, &next_score);
1220 conf_level = evaluate_net_match (top, top_score, next, next_score,
1221 tess_ch, dict_word, checked_dict_word,
1222 sensible_word, centre, good_quality_word);
1223 #ifndef SECURE_NAMES
1224 if (nn_reject_debug) {
1225 tprintf ("top:\"%c\" %4.2f next:\"%c\" %4.2f TESS:\"%c\" Conf: %d\n",
1226 top, top_score, next, next_score, tess_ch, conf_level);
1227 }
1228 #endif
1229 free_mem(input_vector);
1230 return conf_level;
1231 }
1232
1233
evaluate_net_match(char top,float top_score,char next,float next_score,char tess_ch,BOOL8 dict_word,BOOL8 checked_dict_word,BOOL8 sensible_word,BOOL8 centre,BOOL8 good_quality_word)1234 inT16 evaluate_net_match(char top,
1235 float top_score,
1236 char next,
1237 float next_score,
1238 char tess_ch,
1239 BOOL8 dict_word,
1240 BOOL8 checked_dict_word,
1241 BOOL8 sensible_word,
1242 BOOL8 centre,
1243 BOOL8 good_quality_word) {
1244 inT16 accept_level; //0 Very clearly matched
1245 //1 Clearly top
1246 //2 Top but poor match
1247 //3 Next & poor top match
1248 //4 Next but good top match
1249 //5 No chance
1250 BOOL8 good_top_choice;
1251 BOOL8 excellent_top_choice;
1252 BOOL8 confusion_match = FALSE;
1253 BOOL8 dodgy_char = !isalnum (tess_ch);
1254
1255 good_top_choice = (top_score > nn_reject_threshold) &&
1256 (nn_reject_head_and_shoulders * top_score > next_score);
1257
1258 excellent_top_choice = good_top_choice &&
1259 (top_score > nn_dodgy_char_threshold);
1260
1261 if (top == tess_ch) {
1262 if (excellent_top_choice)
1263 accept_level = 0;
1264 else if (good_top_choice)
1265 accept_level = 1; //Top correct and well matched
1266 else
1267 accept_level = 2; //Top correct but poor match
1268 }
1269 else if ((nn_conf_1Il &&
1270 STRING (conflict_set_I_l_1).contains (tess_ch) &&
1271 STRING (conflict_set_I_l_1).contains (top)) ||
1272 (nn_conf_hyphen &&
1273 STRING (conflict_set_hyphen).contains (tess_ch) &&
1274 STRING (conflict_set_hyphen).contains (top)) ||
1275 (nn_conf_Ss &&
1276 STRING (conflict_set_S_s).contains (tess_ch) &&
1277 STRING (conflict_set_S_s).contains (top))) {
1278 confusion_match = TRUE;
1279 if (good_top_choice)
1280 accept_level = 1; //Good top confusion
1281 else
1282 accept_level = 2; //Poor top confusion
1283 }
1284 else if ((nn_conf_1Il &&
1285 STRING (conflict_set_I_l_1).contains (tess_ch) &&
1286 STRING (conflict_set_I_l_1).contains (next)) ||
1287 (nn_conf_hyphen &&
1288 STRING (conflict_set_hyphen).contains (tess_ch) &&
1289 STRING (conflict_set_hyphen).contains (next)) ||
1290 (nn_conf_Ss &&
1291 STRING (conflict_set_S_s).contains (tess_ch) &&
1292 STRING (conflict_set_S_s).contains (next))) {
1293 confusion_match = TRUE;
1294 if (!good_top_choice)
1295 accept_level = 3; //Next confusion and top match dodgy
1296 else
1297 accept_level = 4; //Next confusion and good top match
1298 }
1299 else if (next == tess_ch) {
1300 if (!good_top_choice)
1301 accept_level = 3; //Next match and top match dodgy
1302 else
1303 accept_level = 4; //Next match and good top match
1304 }
1305 else
1306 accept_level = 5;
1307
1308 /* Could allow some match flexibility here sS$ etc */
1309
1310 /* Now set confirmation level according to how much we can believe the tess
1311 char. */
1312
1313 if ((accept_level == 0) && !confusion_match)
1314 return 3;
1315
1316 if ((accept_level <= 1) &&
1317 (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match)
1318 return 3;
1319
1320 if ((accept_level == 2) &&
1321 !confusion_match && !dodgy_char &&
1322 good_quality_word &&
1323 dict_word &&
1324 (checked_dict_word || !nn_double_check_dict) && sensible_word)
1325 return 2;
1326
1327 if (confusion_match &&
1328 (accept_level <= nn_conf_accept_level) &&
1329 (good_quality_word ||
1330 (!nn_conf_test_good_qual &&
1331 !STRING (conflict_set_I_l_1).contains (tess_ch))) &&
1332 (dict_word || !nn_conf_test_dict) &&
1333 (checked_dict_word || !nn_conf_double_check_dict) &&
1334 (sensible_word || !nn_conf_test_sensible))
1335 return 1;
1336
1337 if (!confusion_match &&
1338 nn_lax &&
1339 (accept_level == 3) &&
1340 (good_quality_word || !nn_conf_test_good_qual) &&
1341 (dict_word || !nn_conf_test_dict) &&
1342 (sensible_word || !nn_conf_test_sensible))
1343 return 1;
1344 else
1345 return 0;
1346 }
1347
1348
1349 /*************************************************************************
1350 * dont_allow_dubious_chars()
1351 * Let Rejects "eat" into adjacent "dubious" chars. I.e those prone to be wrong
1352 * if adjacent to a reject.
1353 *************************************************************************/
dont_allow_dubious_chars(WERD_RES * word)1354 void dont_allow_dubious_chars(WERD_RES *word) {
1355 int i = 0;
1356 int offset = 0;
1357 int rej_pos;
1358 int word_len = word->reject_map.length ();
1359
1360 while (i < word_len) {
1361 /* Find next reject */
1362
1363 while ((i < word_len) && (word->reject_map[i].accepted ()))
1364 {
1365 offset += word->best_choice->unichar_lengths()[i];
1366 i++;
1367 }
1368
1369 if (i < word_len) {
1370 rej_pos = i;
1371
1372 /* Reject dubious chars to the left */
1373 i--;
1374 offset -= word->best_choice->unichar_lengths()[i];
1375 while ((i >= 0) &&
1376 STRING(dubious_chars_left_of_reject).contains(
1377 word->best_choice->unichar_string()[offset])) {
1378 word->reject_map[i--].setrej_dubious ();
1379 offset -= word->best_choice->unichar_lengths()[i];
1380 }
1381
1382 /* Skip adjacent rejects */
1383
1384 for (i = rej_pos;
1385 (i < word_len) && (word->reject_map[i].rejected ());
1386 offset += word->best_choice->unichar_lengths()[i++]);
1387
1388 /* Reject dubious chars to the right */
1389
1390 while ((i < word_len) &&
1391 STRING(dubious_chars_right_of_reject).contains(
1392 word->best_choice->unichar_string()[offset])) {
1393 offset += word->best_choice->unichar_lengths()[i];
1394 word->reject_map[i++].setrej_dubious ();
1395 }
1396 }
1397 }
1398 }
1399
1400
/*************************************************************************
 * dont_allow_1Il()
 * Don't un-reject LONE accepted 1Il conflict set chars
 *************************************************************************/
1405 namespace tesseract {
dont_allow_1Il(WERD_RES * word)1406 void Tesseract::dont_allow_1Il(WERD_RES *word) {
1407 int i = 0;
1408 int offset;
1409 int word_len = word->reject_map.length ();
1410 const char *s = word->best_choice->unichar_string().string ();
1411 const char *lengths = word->best_choice->unichar_lengths().string ();
1412 BOOL8 accepted_1Il = FALSE;
1413
1414 for (i = 0, offset = 0; i < word_len;
1415 offset += word->best_choice->unichar_lengths()[i++]) {
1416 if (word->reject_map[i].accepted ()) {
1417 if (STRING (conflict_set_I_l_1).contains (s[offset]))
1418 accepted_1Il = TRUE;
1419 else {
1420 if (unicharset.get_isalpha (s + offset, lengths[i]) ||
1421 unicharset.get_isdigit (s + offset, lengths[i]))
1422 return; // >=1 non 1Il ch accepted
1423 }
1424 }
1425 }
1426 if (!accepted_1Il)
1427 return; //Nothing to worry about
1428
1429 for (i = 0, offset = 0; i < word_len;
1430 offset += word->best_choice->unichar_lengths()[i++]) {
1431 if (STRING (conflict_set_I_l_1).contains (s[offset]) &&
1432 word->reject_map[i].accepted ())
1433 word->reject_map[i].setrej_postNN_1Il ();
1434 }
1435 }
1436
1437
count_alphanums(WERD_RES * word_res)1438 inT16 Tesseract::count_alphanums( //how many alphanums
1439 WERD_RES *word_res) {
1440 int count = 0;
1441 const WERD_CHOICE *best_choice = word_res->best_choice;
1442 for (int i = 0; i < word_res->reject_map.length(); ++i) {
1443 if ((word_res->reject_map[i].accepted()) &&
1444 (unicharset.get_isalpha(best_choice->unichar_id(i)) ||
1445 unicharset.get_isdigit(best_choice->unichar_id(i)))) {
1446 count++;
1447 }
1448 }
1449 return count;
1450 }
1451 } // namespace tesseract
1452
1453
reject_mostly_rejects(WERD_RES * word)1454 void reject_mostly_rejects( //rej all if most rejectd
1455 WERD_RES *word) {
1456 /* Reject the whole of the word if the fraction of rejects exceeds a limit */
1457
1458 if ((float) word->reject_map.reject_count () / word->reject_map.length () >=
1459 rej_whole_of_mostly_reject_word_fract)
1460 word->reject_map.rej_word_mostly_rej ();
1461 }
1462
1463
1464 namespace tesseract {
repeated_nonalphanum_wd(WERD_RES * word,ROW * row)1465 BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
1466 inT16 char_quality;
1467 inT16 accepted_char_quality;
1468
1469 if (word->best_choice->unichar_lengths().length () <= 1)
1470 return FALSE;
1471
1472 if (!STRING (ok_repeated_ch_non_alphanum_wds).
1473 contains (word->best_choice->unichar_string()[0]))
1474 return FALSE;
1475
1476 if (!repeated_ch_string (word->best_choice->unichar_string().string (),
1477 word->best_choice->unichar_lengths().string ()))
1478 return FALSE;
1479
1480 word_char_quality(word, row, &char_quality, &accepted_char_quality);
1481
1482 if ((word->best_choice->unichar_lengths().length () == char_quality) &&
1483 (char_quality == accepted_char_quality))
1484 return TRUE;
1485 else
1486 return FALSE;
1487 }
1488
repeated_ch_string(const char * rep_ch_str,const char * lengths)1489 BOOL8 Tesseract::repeated_ch_string(const char *rep_ch_str,
1490 const char *lengths) {
1491 UNICHAR_ID c;
1492
1493 if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) {
1494 return FALSE;
1495 }
1496
1497 c = unicharset.unichar_to_id(rep_ch_str, *lengths);
1498 rep_ch_str += *(lengths++);
1499 while (*rep_ch_str != '\0' &&
1500 unicharset.unichar_to_id(rep_ch_str, *lengths) == c) {
1501 rep_ch_str++;
1502 }
1503 if (*rep_ch_str == '\0')
1504 return TRUE;
1505 return FALSE;
1506 }
1507
1508
safe_dict_word(const WERD_CHOICE & word)1509 inT16 Tesseract::safe_dict_word(const WERD_CHOICE &word) {
1510 int dict_word_type = dict_word(word);
1511 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
1512 }
1513
1514
flip_hyphens(WERD_RES * word_res)1515 void Tesseract::flip_hyphens(WERD_RES *word_res) {
1516 WERD_CHOICE *best_choice = word_res->best_choice;
1517 int i;
1518 PBLOB_IT outword_it;
1519 int prev_right = -9999;
1520 int next_left;
1521 TBOX out_box;
1522 float aspect_ratio;
1523
1524 if (tessedit_lower_flip_hyphen <= 1)
1525 return;
1526
1527 outword_it.set_to_list(word_res->outword->blob_list());
1528 UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-");
1529 bool modified = false;
1530 for (i = 0, outword_it.mark_cycle_pt();
1531 i < best_choice->length() && !outword_it.cycled_list();
1532 ++i, outword_it.forward()) {
1533 out_box = outword_it.data()->bounding_box();
1534 if (outword_it.at_last())
1535 next_left = 9999;
1536 else
1537 next_left = outword_it.data_relative(1)->bounding_box().left();
1538 // Dont touch small or touching blobs - it is too dangerous.
1539 if ((out_box.width() > 8 * word_res->denorm.scale()) &&
1540 (out_box.left() > prev_right) && (out_box.right() < next_left)) {
1541 aspect_ratio = out_box.width() / (float) out_box.height();
1542 if (unicharset.eq(best_choice->unichar_id(i), ".")) {
1543 if (aspect_ratio >= tessedit_upper_flip_hyphen &&
1544 unicharset.contains_unichar_id(unichar_dash) &&
1545 unicharset.get_enabled(unichar_dash)) {
1546 /* Certain HYPHEN */
1547 best_choice->set_unichar_id(unichar_dash, i);
1548 modified = true;
1549 if (word_res->reject_map[i].rejected())
1550 word_res->reject_map[i].setrej_hyphen_accept();
1551 }
1552 if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
1553 word_res->reject_map[i].accepted())
1554 //Suspected HYPHEN
1555 word_res->reject_map[i].setrej_hyphen ();
1556 }
1557 else if (best_choice->unichar_id(i) == unichar_dash) {
1558 if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
1559 (word_res->reject_map[i].rejected()))
1560 word_res->reject_map[i].setrej_hyphen_accept();
1561 //Certain HYPHEN
1562
1563 if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
1564 (word_res->reject_map[i].accepted()))
1565 //Suspected HYPHEN
1566 word_res->reject_map[i].setrej_hyphen();
1567 }
1568 }
1569 prev_right = out_box.right();
1570 }
1571 if (modified) {
1572 best_choice->populate_unichars(unicharset);
1573 }
1574 }
1575
flip_0O(WERD_RES * word_res)1576 void Tesseract::flip_0O(WERD_RES *word_res) {
1577 WERD_CHOICE *best_choice = word_res->best_choice;
1578 int i;
1579 PBLOB_IT outword_it;
1580 TBOX out_box;
1581
1582 if (!tessedit_flip_0O)
1583 return;
1584
1585 outword_it.set_to_list(word_res->outword->blob_list ());
1586
1587 for (i = 0, outword_it.mark_cycle_pt ();
1588 i < best_choice->length() && !outword_it.cycled_list ();
1589 ++i, outword_it.forward ()) {
1590 if (unicharset.get_isupper(best_choice->unichar_id(i)) ||
1591 unicharset.get_isdigit(best_choice->unichar_id(i))) {
1592 out_box = outword_it.data()->bounding_box ();
1593 if ((out_box.top() < bln_baseline_offset + bln_x_height) ||
1594 (out_box.bottom() > bln_baseline_offset + bln_x_height / 4))
1595 return; //Beware words with sub/superscripts
1596 }
1597 }
1598 UNICHAR_ID unichar_0 = unicharset.unichar_to_id("0");
1599 UNICHAR_ID unichar_O = unicharset.unichar_to_id("O");
1600 if (unichar_0 == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_0) ||
1601 unichar_O == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_O)) {
1602 return; // 0 or O are not present/enabled in unicharset
1603 }
1604 bool modified = false;
1605 for (i = 1; i < best_choice->length(); ++i, outword_it.forward ()) {
1606 if (best_choice->unichar_id(i) == unichar_0 ||
1607 best_choice->unichar_id(i) == unichar_O) {
1608 /* A0A */
1609 if ((i+1) < best_choice->length() &&
1610 non_O_upper(best_choice->unichar_id(i-1)) &&
1611 non_O_upper(best_choice->unichar_id(i+1))) {
1612 best_choice->set_unichar_id(unichar_O, i);
1613 modified = true;
1614 }
1615 /* A00A */
1616 if (non_O_upper(best_choice->unichar_id(i-1)) &&
1617 (i+1) < best_choice->length() &&
1618 (best_choice->unichar_id(i+1) == unichar_0 ||
1619 best_choice->unichar_id(i+1) == unichar_O) &&
1620 (i+2) < best_choice->length() &&
1621 non_O_upper(best_choice->unichar_id(i+2))) {
1622 best_choice->set_unichar_id(unichar_O, i);
1623 modified = true;
1624 i++;
1625 }
1626 /* AA0<non digit or end of word> */
1627 if ((i > 1) &&
1628 non_O_upper(best_choice->unichar_id(i-2)) &&
1629 non_O_upper(best_choice->unichar_id(i-1)) &&
1630 (((i+1) < best_choice->length() &&
1631 !unicharset.get_isdigit(best_choice->unichar_id(i+1)) &&
1632 !unicharset.eq(best_choice->unichar_id(i+1), "l") &&
1633 !unicharset.eq(best_choice->unichar_id(i+1), "I")) ||
1634 (i == best_choice->length() - 1))) {
1635 best_choice->set_unichar_id(unichar_O, i);
1636 modified = true;
1637 }
1638 /* 9O9 */
1639 if (non_0_digit(best_choice->unichar_id(i-1)) &&
1640 (i+1) < best_choice->length() &&
1641 non_0_digit(best_choice->unichar_id(i+1))) {
1642 best_choice->set_unichar_id(unichar_0, i);
1643 modified = true;
1644 }
1645 /* 9OOO */
1646 if (non_0_digit(best_choice->unichar_id(i-1)) &&
1647 (i+2) < best_choice->length() &&
1648 (best_choice->unichar_id(i+1) == unichar_0 ||
1649 best_choice->unichar_id(i+1) == unichar_O) &&
1650 (best_choice->unichar_id(i+2) == unichar_0 ||
1651 best_choice->unichar_id(i+2) == unichar_O)) {
1652 best_choice->set_unichar_id(unichar_0, i);
1653 best_choice->set_unichar_id(unichar_0, i+1);
1654 best_choice->set_unichar_id(unichar_0, i+2);
1655 modified = true;
1656 i += 2;
1657 }
1658 /* 9OO<non upper> */
1659 if (non_0_digit(best_choice->unichar_id(i-1)) &&
1660 (i+2) < best_choice->length() &&
1661 (best_choice->unichar_id(i+1) == unichar_0 ||
1662 best_choice->unichar_id(i+1) == unichar_O) &&
1663 !unicharset.get_isupper(best_choice->unichar_id(i+2))) {
1664 best_choice->set_unichar_id(unichar_0, i);
1665 best_choice->set_unichar_id(unichar_0, i+1);
1666 modified = true;
1667 i++;
1668 }
1669 /* 9O<non upper> */
1670 if (non_0_digit(best_choice->unichar_id(i-1)) &&
1671 (i+1) < best_choice->length() &&
1672 !unicharset.get_isupper(best_choice->unichar_id(i+1))) {
1673 best_choice->set_unichar_id(unichar_0, i);
1674 }
1675 /* 9[.,]OOO.. */
1676 if ((i > 1) &&
1677 (unicharset.eq(best_choice->unichar_id(i-1), ".") ||
1678 unicharset.eq(best_choice->unichar_id(i-1), ",")) &&
1679 (unicharset.get_isdigit(best_choice->unichar_id(i-2)) ||
1680 best_choice->unichar_id(i-2) == unichar_O)) {
1681 if (best_choice->unichar_id(i-2) == unichar_O) {
1682 best_choice->set_unichar_id(unichar_0, i-2);
1683 modified = true;
1684 }
1685 while (i < best_choice->length() &&
1686 (best_choice->unichar_id(i) == unichar_O ||
1687 best_choice->unichar_id(i) == unichar_0)) {
1688 best_choice->set_unichar_id(unichar_0, i);
1689 modified = true;
1690 i++;
1691 }
1692 i--;
1693 }
1694 }
1695 }
1696 if (modified) {
1697 best_choice->populate_unichars(unicharset);
1698 }
1699 }
1700
non_O_upper(UNICHAR_ID unichar_id)1701 BOOL8 Tesseract::non_O_upper(UNICHAR_ID unichar_id) {
1702 return (unicharset.get_isupper(unichar_id) &&
1703 (!unicharset.eq(unichar_id, "O")));
1704 }
1705
non_0_digit(UNICHAR_ID unichar_id)1706 BOOL8 Tesseract::non_0_digit(UNICHAR_ID unichar_id) {
1707 return (unicharset.get_isdigit(unichar_id) &&
1708 (!unicharset.eq(unichar_id, "0")));
1709 }
1710 } // namespace tesseract
1711