• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**********************************************************************
2  * File:        reject.cpp  (Formerly reject.c)
3  * Description: Rejection functions used in tessedit
4  * Author:		Phil Cheatle
5  * Created:		Wed Sep 23 16:50:21 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #include          "tessvars.h"
22 #ifdef __UNIX__
23 #include          <assert.h>
24 #include          <errno.h>
25 #endif
26 #include          "scanutils.h"
27 #include          <ctype.h>
28 #include          <string.h>
29 //#include                                      "tessbox.h"
30 #include          "memry.h"
31 #include          "reject.h"
32 #include          "tfacep.h"
33 #include          "mainblk.h"
34 #include          "charcut.h"
35 #include          "imgs.h"
36 #include          "scaleimg.h"
37 #include          "control.h"
38 #include          "docqual.h"
39 #include          "secname.h"
40 #include          "globals.h"
41 
42 /* #define SECURE_NAMES done in secnames.h when necessary */
43 
44 //extern "C" {
45 #include          "callnet.h"
46 //}
47 #include "tesseractclass.h"
48 #include          "notdll.h"
49 
50 CLISTIZEH (STRING) CLISTIZE (STRING)
51 #define EXTERN
52 EXTERN
53 INT_VAR (tessedit_reject_mode, 0, "Rejection algorithm");
54 EXTERN
55 INT_VAR (tessedit_ok_mode, 5, "Acceptance decision algorithm");
56 EXTERN
57 BOOL_VAR (tessedit_use_nn, FALSE, "");
58 EXTERN
59 BOOL_VAR (tessedit_rejection_debug, FALSE, "Adaption debug");
60 EXTERN
61 BOOL_VAR (tessedit_rejection_stats, FALSE, "Show NN stats");
62 EXTERN
63 BOOL_VAR (tessedit_flip_0O, TRUE, "Contextual 0O O0 flips");
64 EXTERN
65 double_VAR (tessedit_lower_flip_hyphen, 1.5,
66 "Aspect ratio dot/hyphen test");
67 EXTERN
68 double_VAR (tessedit_upper_flip_hyphen, 1.8,
69 "Aspect ratio dot/hyphen test");
70 
71 EXTERN
72 BOOL_VAR (rej_trust_doc_dawg, FALSE,
73 "Use DOC dawg in 11l conf. detector");
74 EXTERN
75 BOOL_VAR (rej_1Il_use_dict_word, FALSE, "Use dictword test");
76 EXTERN
77 BOOL_VAR (rej_1Il_trust_permuter_type, TRUE, "Dont double check");
78 
79 EXTERN
80 BOOL_VAR (one_ell_conflict_default, TRUE, "one_ell_conflict default");
81 EXTERN
82 BOOL_VAR (show_char_clipping, FALSE, "Show clip image window?");
83 EXTERN
84 BOOL_VAR (nn_debug, FALSE, "NN DEBUGGING?");
85 EXTERN
86 BOOL_VAR (nn_reject_debug, FALSE, "NN DEBUG each char?");
87 EXTERN
88 BOOL_VAR (nn_lax, FALSE, "Use 2nd rate matches");
89 EXTERN
90 BOOL_VAR (nn_double_check_dict, FALSE, "Double check");
91 EXTERN
92 BOOL_VAR (nn_conf_double_check_dict, TRUE,
93 "Double check for confusions");
94 EXTERN
95 BOOL_VAR (nn_conf_1Il, TRUE, "NN use 1Il conflicts");
96 EXTERN
97 BOOL_VAR (nn_conf_Ss, TRUE, "NN use Ss conflicts");
98 EXTERN
99 BOOL_VAR (nn_conf_hyphen, TRUE, "NN hyphen conflicts");
100 EXTERN
101 BOOL_VAR (nn_conf_test_good_qual, FALSE, "NN dodgy 1Il cross check");
102 EXTERN
103 BOOL_VAR (nn_conf_test_dict, TRUE, "NN dodgy 1Il cross check");
104 EXTERN
105 BOOL_VAR (nn_conf_test_sensible, TRUE, "NN dodgy 1Il cross check");
106 EXTERN
107 BOOL_VAR (nn_conf_strict_on_dodgy_chs, TRUE,
108 "Require stronger NN match");
109 EXTERN
110 double_VAR (nn_dodgy_char_threshold, 0.99, "min accept score");
111 EXTERN
112 INT_VAR (nn_conf_accept_level, 4, "NN accept dodgy 1Il matches? ");
113 EXTERN
114 INT_VAR (nn_conf_initial_i_level, 3,
115 "NN accept initial Ii match level ");
116 
117 EXTERN
118 BOOL_VAR (no_unrej_dubious_chars, TRUE, "Dubious chars next to reject?");
119 EXTERN
120 BOOL_VAR (no_unrej_no_alphanum_wds, TRUE, "Stop unrej of non A/N wds?");
121 EXTERN
122 BOOL_VAR (no_unrej_1Il, FALSE, "Stop unrej of 1Ilchars?");
123 EXTERN
124 BOOL_VAR (rej_use_tess_accepted, TRUE, "Individual rejection control");
125 EXTERN
126 BOOL_VAR (rej_use_tess_blanks, TRUE, "Individual rejection control");
127 EXTERN
128 BOOL_VAR (rej_use_good_perm, TRUE, "Individual rejection control");
129 EXTERN
130 BOOL_VAR (rej_use_sensible_wd, FALSE, "Extend permuter check");
131 EXTERN
132 BOOL_VAR (rej_alphas_in_number_perm, FALSE, "Extend permuter check");
133 
134 EXTERN
135 double_VAR (rej_whole_of_mostly_reject_word_fract, 0.85,
136 "if >this fract");
137 EXTERN
138 INT_VAR (rej_mostly_reject_mode, 1,
139 "0-never, 1-afterNN, 2-after new xht");
140 EXTERN
141 double_VAR (tessed_fullstop_aspect_ratio, 1.2,
142 "if >this fract then reject");
143 
144 EXTERN
145 INT_VAR (net_image_width, 40, "NN input image width");
146 EXTERN
147 INT_VAR (net_image_height, 36, "NN input image height");
148 EXTERN
149 INT_VAR (net_image_x_height, 22, "NN input image x_height");
150 EXTERN
151 INT_VAR (tessedit_image_border, 2, "Rej blbs near image edge limit");
152 
153 /*
154   Net input is assumed to have (net_image_width * net_image_height) input
155   units of image pixels, followed by 0, 1, or N units representing the
156   baseline position. 0 implies no baseline information. 1 implies a floating
157   point value. N implies a "guage" of N units. For any char an initial set
158   of these are ON, the remainder OFF to indicate the "level" of the
159   baseline.
160 
161   HOWEVER!!!  NOTE THAT EACH NEW INPUT LAYER FORMAT EXPECTS TO BE RUN WITH A
162   DIFFERENT tessed/netmatch/nmatch.c MODULE. - These are classic C modules
163   generated by aspirin with HARD CODED CONSTANTS
164 */
165 
166 EXTERN
167 INT_VAR (net_bl_nodes, 20, "Number of baseline nodes");
168 
169 EXTERN
170 double_VAR (nn_reject_threshold, 0.5, "NN min accept score");
171 EXTERN
172 double_VAR (nn_reject_head_and_shoulders, 0.6, "top scores sep factor");
173 
174 /* NOTE - ctoh doesn't handle "=" properly, hence \075 */
175 EXTERN
176 STRING_VAR (ok_single_ch_non_alphanum_wds, "-?\075",
177 "Allow NN to unrej");
178 EXTERN
179 STRING_VAR (ok_repeated_ch_non_alphanum_wds, "-?*\075",
180 "Allow NN to unrej");
181 EXTERN
182 STRING_VAR (conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
183 EXTERN
184 STRING_VAR (conflict_set_S_s, "Ss$", "Ss conflict set");
185 EXTERN
186 STRING_VAR (conflict_set_hyphen, "-_~", "hyphen conflict set");
187 EXTERN
188 STRING_VAR (dubious_chars_left_of_reject, "!'+`()-./\\<>;:^_,~\"",
189 "Unreliable chars");
190 EXTERN
191 STRING_VAR (dubious_chars_right_of_reject, "!'+`()-./\\<>;:^_,~\"",
192 "Unreliable chars");
193 
194 EXTERN
195 INT_VAR (min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
196 
197 /*************************************************************************
198  * set_done()
199  *
200  * Set the done flag based on the word acceptability criteria
201  *************************************************************************/
202 
203 namespace tesseract {
set_done(WERD_RES * word,inT16 pass)204 void Tesseract::set_done(  //set done flag
205                          WERD_RES *word,
206                          inT16 pass) {
207   /*
208   0: Original heuristic used in Tesseract and Ray's prototype Resaljet
209   */
210   if (tessedit_ok_mode == 0) {
211     /* NOTE - done even if word contains some or all spaces !!! */
212     word->done = word->tess_accepted;
213   }
214   /*
215   1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
216   */
217   else if (tessedit_ok_mode == 1) {
218     word->done = word->tess_accepted &&
219       (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
220 
221     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
222       word->done = FALSE;
223   }
224   /*
225   2: as 1 + only accept dict words or numerics in pass 1
226   */
227   else if (tessedit_ok_mode == 2) {
228     word->done = word->tess_accepted &&
229       (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
230 
231     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
232       word->done = FALSE;
233 
234     if (word->done &&
235       (pass == 1) &&
236       (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
237       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
238       (word->best_choice->permuter () != USER_DAWG_PERM) &&
239     (word->best_choice->permuter () != NUMBER_PERM)) {
240       #ifndef SECURE_NAMES
241       if (tessedit_rejection_debug)
242         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
243           word->best_choice->unichar_string().string ());
244       #endif
245       word->done = FALSE;
246     }
247   }
248   /*
249   3: as 2 + only accept dict words or numerics in pass 2 as well
250   */
251   else if (tessedit_ok_mode == 3) {
252     word->done = word->tess_accepted &&
253       (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
254 
255     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
256       word->done = FALSE;
257 
258     if (word->done &&
259       (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
260       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
261       (word->best_choice->permuter () != USER_DAWG_PERM) &&
262     (word->best_choice->permuter () != NUMBER_PERM)) {
263       #ifndef SECURE_NAMES
264       if (tessedit_rejection_debug)
265         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
266           word->best_choice->unichar_string().string ());
267       #endif
268       word->done = FALSE;
269     }
270   }
271   /*
272   4: as 2 + reject dict ambigs in pass 1
273   */
274   else if (tessedit_ok_mode == 4) {
275     word->done = word->tess_accepted &&
276       (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
277 
278     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
279       word->done = FALSE;
280 
281     if (word->done &&
282       (pass == 1) &&
283       (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
284       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
285       (word->best_choice->permuter () != USER_DAWG_PERM) &&
286       (word->best_choice->permuter () != NUMBER_PERM)) ||
287     (test_ambig_word (word)))) {
288       #ifndef SECURE_NAMES
289       if (tessedit_rejection_debug)
290         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
291           word->best_choice->unichar_string().string ());
292       #endif
293       word->done = FALSE;
294     }
295   }
296   /*
297   5: as 3 + reject dict ambigs in both passes
298   */
299   else if (tessedit_ok_mode == 5) {
300     word->done = word->tess_accepted &&
301       (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
302 
303     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
304       word->done = FALSE;
305 
306     if (word->done &&
307       (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
308       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
309       (word->best_choice->permuter () != USER_DAWG_PERM) &&
310       (word->best_choice->permuter () != NUMBER_PERM)) ||
311     (test_ambig_word (word)))) {
312       #ifndef SECURE_NAMES
313       if (tessedit_rejection_debug)
314         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
315           word->best_choice->unichar_string().string ());
316       #endif
317       word->done = FALSE;
318     }
319   }
320 
321   else {
322     tprintf ("BAD tessedit_ok_mode\n");
323     err_exit();
324   }
325 }
326 
327 
328 /*************************************************************************
329  * make_reject_map()
330  *
331  * Sets the done flag to indicate whether the result is acceptable.
332  *
333  * Sets a reject map for the word.
334  *************************************************************************/
make_reject_map(WERD_RES * word,BLOB_CHOICE_LIST_CLIST * blob_choices,ROW * row,inT16 pass)335 void Tesseract::make_reject_map(      //make rej map for wd //detailed results
336                                 WERD_RES *word,
337                                 BLOB_CHOICE_LIST_CLIST *blob_choices,
338                                 ROW *row,
339                                 inT16 pass  //1st or 2nd?
340                                ) {
341   int i;
342   int offset;
343 
344   flip_0O(word);
345   check_debug_pt (word, -1);     //For trap only
346   set_done(word, pass);  //Set acceptance
347   word->reject_map.initialise (word->best_choice->unichar_lengths().length ());
348   reject_blanks(word);
349   /*
350   0: Rays original heuristic - the baseline
351   */
352   if (tessedit_reject_mode == 0) {
353     if (!word->done)
354       reject_poor_matches(word, blob_choices);
355   }
356   /*
357   5: Reject I/1/l from words where there is no strong contextual confirmation;
358     the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
359     and the whole of any words which are very small
360   */
361   else if (tessedit_reject_mode == 5) {
362     if (bln_x_height / word->denorm.scale () <= min_sane_x_ht_pixels)
363       word->reject_map.rej_word_small_xht ();
364     else {
365       one_ell_conflict(word, TRUE);
366       /*
367         Originally the code here just used the done flag. Now I have duplicated
368         and unpacked the conditions for setting the done flag so that each
369         mechanism can be turned on or off independently. This works WITHOUT
370         affecting the done flag setting.
371       */
372       if (rej_use_tess_accepted && !word->tess_accepted)
373         word->reject_map.rej_word_not_tess_accepted ();
374 
375       if (rej_use_tess_blanks &&
376         (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
377         word->reject_map.rej_word_contains_blanks ();
378 
379       if (rej_use_good_perm) {
380         if (((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
381           (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
382           (word->best_choice->permuter () == USER_DAWG_PERM)) &&
383           (!rej_use_sensible_wd ||
384           (acceptable_word_string
385           (word->best_choice->unichar_string().string (),
386            word->best_choice->unichar_lengths().string ()) !=
387         AC_UNACCEPTABLE))) {
388           //PASSED TEST
389         }
390         else if (word->best_choice->permuter () == NUMBER_PERM) {
391           if (rej_alphas_in_number_perm) {
392             for (i = 0, offset = 0;
393                  word->best_choice->unichar_string()[offset] != '\0';
394                  offset += word->best_choice->unichar_lengths()[i++]) {
395               if (word->reject_map[i].accepted () &&
396                   unicharset.get_isalpha(
397                       word->best_choice->unichar_string().string() + offset,
398                       word->best_choice->unichar_lengths()[i]))
399                 word->reject_map[i].setrej_bad_permuter ();
400               //rej alpha
401             }
402           }
403         }
404         else {
405           word->reject_map.rej_word_bad_permuter ();
406         }
407       }
408 
409       /* Ambig word rejection was here once !!*/
410 
411     }
412   }
413   else {
414     tprintf ("BAD tessedit_reject_mode\n");
415     err_exit();
416   }
417 
418   if (tessedit_image_border > -1)
419     reject_edge_blobs(word);
420 
421   check_debug_pt (word, 10);
422   if (tessedit_rejection_debug) {
423     tprintf ("Permuter Type = %d\n", word->best_choice->permuter ());
424     tprintf ("Certainty: %f     Rating: %f\n",
425       word->best_choice->certainty (), word->best_choice->rating ());
426     tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
427   }
428 
429   /* Un-reject any rejected characters if NN permits */
430 
431   if (tessedit_use_nn && (pass == 2) &&
432     word->reject_map.recoverable_rejects ())
433     nn_recover_rejects(word, row);
434   flip_hyphens(word);
435   check_debug_pt (word, 20);
436 }
437 }  // namespace tesseract
438 
439 
reject_blanks(WERD_RES * word)440 void reject_blanks(WERD_RES *word) {
441   inT16 i;
442   inT16 offset;
443 
444   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
445        offset += word->best_choice->unichar_lengths()[i], i += 1) {
446     if (word->best_choice->unichar_string()[offset] == ' ')
447                                  //rej unrecognised blobs
448       word->reject_map[i].setrej_tess_failure ();
449   }
450 }
451 
452 
reject_I_1_L(WERD_RES * word)453 void reject_I_1_L(WERD_RES *word) {
454   inT16 i;
455   inT16 offset;
456 
457   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
458        offset += word->best_choice->unichar_lengths()[i], i += 1) {
459     if (STRING (conflict_set_I_l_1).
460     contains (word->best_choice->unichar_string()[offset])) {
461                                  //rej 1Il conflict
462       word->reject_map[i].setrej_1Il_conflict ();
463     }
464   }
465 }
466 
467 
reject_poor_matches(WERD_RES * word,BLOB_CHOICE_LIST_CLIST * blob_choices)468 void reject_poor_matches(  //detailed results
469                          WERD_RES *word,
470                          BLOB_CHOICE_LIST_CLIST *blob_choices) {
471   float threshold;
472   inT16 i = 0;
473   inT16 offset = 0;
474                                  //super iterator
475   BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
476   BLOB_CHOICE_IT choice_it;      //real iterator
477 
478   #ifndef SECURE_NAMES
479   if (strlen(word->best_choice->unichar_lengths().string()) !=
480       list_it.length()) {
481     tprintf
482       ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
483       word->best_choice->unichar_string().string(),
484       strlen (word->best_choice->unichar_lengths().string()), list_it.length(),
485       word->outword->blob_list()->length());
486   }
487   #endif
488   ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) ==
489     list_it.length ());
490   ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ());
491   threshold = compute_reject_threshold (blob_choices);
492 
493   for (list_it.mark_cycle_pt ();
494   !list_it.cycled_list (); list_it.forward (), i++,
495            offset += word->best_choice->unichar_lengths()[i]) {
496     /* NB - only compares the threshold against the TOP choice char in the
497       choices list for a blob !! - the selected one may be below the threshold
498     */
499     choice_it.set_to_list (list_it.data ());
500     if ((word->best_choice->unichar_string()[offset] == ' ') ||
501       (choice_it.length () == 0))
502                                  //rej unrecognised blobs
503       word->reject_map[i].setrej_tess_failure ();
504     else if (choice_it.data ()->certainty () < threshold)
505                                  //rej poor score blob
506       word->reject_map[i].setrej_poor_match ();
507   }
508 }
509 
510 
511 /**********************************************************************
512  * compute_reject_threshold
513  *
514  * Set a rejection threshold for this word.
515  * Initially this is a trivial function which looks for the largest
516  * gap in the certainty value.
517  **********************************************************************/
518 
compute_reject_threshold(BLOB_CHOICE_LIST_CLIST * blob_choices)519 float compute_reject_threshold(  //compute threshold //detailed results
520                                BLOB_CHOICE_LIST_CLIST *blob_choices) {
521   inT16 index;                   //to ratings
522   inT16 blob_count;              //no of blobs in word
523   inT16 ok_blob_count = 0;       //non TESS rej blobs in word
524   float *ratings;                //array of confidences
525   float threshold;               //rejection threshold
526   float bestgap;                 //biggest gap
527   float gapstart;                //bottom of gap
528                                  //super iterator
529   BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
530   BLOB_CHOICE_IT choice_it;      //real iterator
531 
532   blob_count = blob_choices->length ();
533   ratings = (float *) alloc_mem (blob_count * sizeof (float));
534   for (list_it.mark_cycle_pt (), index = 0;
535   !list_it.cycled_list (); list_it.forward (), index++) {
536     choice_it.set_to_list (list_it.data ());
537     if (choice_it.length () > 0) {
538       ratings[ok_blob_count] = choice_it.data ()->certainty ();
539       //get in an array
540       //                 tprintf("Rating[%d]=%c %g %g\n",
541       //                         index,choice_it.data()->char_class(),
542       //                         choice_it.data()->rating(),choice_it.data()->certainty());
543       ok_blob_count++;
544     }
545   }
546   ASSERT_HOST (index == blob_count);
547   qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
548   //sort them
549   bestgap = 0;
550   gapstart = ratings[0] - 1;     //all reject if none better
551   if (ok_blob_count >= 3) {
552     for (index = 0; index < ok_blob_count - 1; index++) {
553       if (ratings[index + 1] - ratings[index] > bestgap) {
554         bestgap = ratings[index + 1] - ratings[index];
555         //find biggest
556         gapstart = ratings[index];
557       }
558     }
559   }
560   threshold = gapstart + bestgap / 2;
561   //      tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
562   //              ratings[0],ratings[index],bestgap,threshold);
563 
564   free_mem(ratings);
565   return threshold;
566 }
567 
568 
569 /**********************************************************************
570  * sort_floats
571  *
572  * qsort function to sort 2 floats.
573  **********************************************************************/
574 
/**
 * qsort() comparison callback ordering two floats ascending.
 *
 * @param arg1  pointer to the first float
 * @param arg2  pointer to the second float
 * @return 1 if *arg1 is larger, -1 if it is smaller, 0 otherwise
 */
int sort_floats(const void *arg1, const void *arg2) {
  // Keep the pointees const: the comparator must never modify the elements.
  const float lhs = *static_cast<const float *>(arg1);
  const float rhs = *static_cast<const float *>(arg2);
  const float diff = lhs - rhs;  //difference

  if (diff > 0)
    return 1;
  if (diff < 0)
    return -1;
  return 0;
}
588 
589 
590 /*************************************************************************
591  * reject_edge_blobs()
592  *
593  * If the word is perilously close to the edge of the image, reject those blobs
594  * in the word which are too close to the edge as they could be clipped.
595  *************************************************************************/
596 
reject_edge_blobs(WERD_RES * word)597 void reject_edge_blobs(WERD_RES *word) {
598   TBOX word_box = word->word->bounding_box ();
599   TBOX blob_box;
600   PBLOB_IT blob_it = word->outword->blob_list ();
601   //blobs
602   int blobindex = 0;
603   float centre;
604 
605   if ((word_box.left () < tessedit_image_border) ||
606     (word_box.bottom () < tessedit_image_border) ||
607     (word_box.right () + tessedit_image_border >
608     page_image.get_xsize () - 1) ||
609   (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) {
610     ASSERT_HOST (word->reject_map.length () == blob_it.length ());
611     for (blobindex = 0, blob_it.mark_cycle_pt ();
612     !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
613       blob_box = blob_it.data ()->bounding_box ();
614       centre = (blob_box.left () + blob_box.right ()) / 2.0;
615       if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) ||
616         (word->denorm.y (blob_box.bottom (), centre) <
617         tessedit_image_border) ||
618         (word->denorm.x (blob_box.right ()) + tessedit_image_border >
619         page_image.get_xsize () - 1) ||
620         (word->denorm.y (blob_box.top (), centre)
621       + tessedit_image_border > page_image.get_ysize () - 1)) {
622         word->reject_map[blobindex].setrej_edge_char ();
623         //close to edge
624       }
625     }
626   }
627 }
628 
629 
630 /**********************************************************************
631  * one_ell_conflict()
632  *
633  * Identify words where there is a potential I/l/1 error.
634  * - A bundle of contextual heuristics!
635  **********************************************************************/
636 namespace tesseract {
one_ell_conflict(WERD_RES * word_res,BOOL8 update_map)637 BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
638   const char *word;
639   const char *lengths;
640   inT16 word_len;                //its length
641   inT16 first_alphanum_index_;
642   inT16 first_alphanum_offset_;
643   inT16 i;
644   inT16 offset;
645   BOOL8 non_conflict_set_char;   //non conf set a/n?
646   BOOL8 conflict = FALSE;
647   BOOL8 allow_1s;
648   ACCEPTABLE_WERD_TYPE word_type;
649   BOOL8 dict_perm_type;
650   BOOL8 dict_word_ok;
651   int dict_word_type;
652 
653   word = word_res->best_choice->unichar_string().string ();
654   lengths = word_res->best_choice->unichar_lengths().string();
655   word_len = strlen (lengths);
656   /*
657     If there are no occurrences of the conflict set characters then the word
658     is OK.
659   */
660   if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
661     return FALSE;
662 
663   /*
664     There is a conflict if there are NO other (confirmed) alphanumerics apart
665     from those in the conflict set.
666   */
667 
668   for (i = 0, offset = 0, non_conflict_set_char = FALSE;
669        (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
670     non_conflict_set_char =
671         (unicharset.get_isalpha(word + offset, lengths[i]) ||
672          unicharset.get_isdigit(word + offset, lengths[i])) &&
673         !STRING (conflict_set_I_l_1).contains (word[offset]);
674   if (!non_conflict_set_char) {
675     if (update_map)
676       reject_I_1_L(word_res);
677     return TRUE;
678   }
679 
680   /*
681     If the word is accepted by a dawg permuter, and the first alpha character
682     is "I" or "l", check to see if the alternative is also a dawg word. If it
683     is, then there is a potential error otherwise the word is ok.
684   */
685 
686   dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
687     (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
688     (rej_trust_doc_dawg &&
689     (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
690     (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
691   dict_word_type = dict_word(*(word_res->best_choice));
692   dict_word_ok = (dict_word_type > 0) &&
693     (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
694 
695   if ((rej_1Il_use_dict_word && dict_word_ok) ||
696     (rej_1Il_trust_permuter_type && dict_perm_type) ||
697   (dict_perm_type && dict_word_ok)) {
698     first_alphanum_index_ = first_alphanum_index (word, lengths);
699     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
700     if (lengths[first_alphanum_index_] == 1 &&
701         word[first_alphanum_offset_] == 'I') {
702       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
703       if (safe_dict_word(*(word_res->best_choice)) > 0) {
704         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
705         if (update_map)
706           word_res->reject_map[first_alphanum_index_].
707             setrej_1Il_conflict();
708         return TRUE;
709       }
710       else {
711         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
712         return FALSE;
713       }
714     }
715 
716     if (lengths[first_alphanum_index_] == 1 &&
717         word[first_alphanum_offset_] == 'l') {
718       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
719       if (safe_dict_word(*(word_res->best_choice)) > 0) {
720         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
721         if (update_map)
722           word_res->reject_map[first_alphanum_index_].
723             setrej_1Il_conflict();
724         return TRUE;
725       }
726       else {
727         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
728         return FALSE;
729       }
730     }
731     return FALSE;
732   }
733 
734   /*
735     NEW 1Il code. The old code relied on permuter types too much. In fact,
736     tess will use TOP_CHOICE permute for good things like "palette".
737     In this code the string is examined independently to see if it looks like
738     a well formed word.
739   */
740 
741   /*
742     REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
743     dictionary word.
744   */
745   first_alphanum_index_ = first_alphanum_index (word, lengths);
746   first_alphanum_offset_ = first_alphanum_offset (word, lengths);
747   if (lengths[first_alphanum_index_] == 1 &&
748       word[first_alphanum_offset_] == 'l') {
749     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
750     if (safe_dict_word(*(word_res->best_choice)) > 0)
751       return FALSE;
752     else
753       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
754   }
755   else if (lengths[first_alphanum_index_] == 1 &&
756            word[first_alphanum_offset_] == 'I') {
757     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
758     if (safe_dict_word(*(word_res->best_choice)) > 0)
759       return FALSE;
760     else
761       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
762   }
763   /*
764     For strings containing digits:
765       If there are no alphas OR the numeric permuter liked the word,
766         reject any non 1 conflict chs
767       Else reject all conflict chs
768   */
769   if (word_contains_non_1_digit (word, lengths)) {
770     allow_1s = (alpha_count (word, lengths) == 0) ||
771       (word_res->best_choice->permuter () == NUMBER_PERM);
772 
773     inT16 offset;
774     conflict = FALSE;
775     for (i = 0, offset = 0; word[offset] != '\0';
776          offset += word_res->best_choice->unichar_lengths()[i++]) {
777       if ((!allow_1s || (word[offset] != '1')) &&
778       STRING (conflict_set_I_l_1).contains (word[offset])) {
779         if (update_map)
780           word_res->reject_map[i].setrej_1Il_conflict ();
781         conflict = TRUE;
782       }
783     }
784     return conflict;
785   }
786   /*
787     For anything else. See if it conforms to an acceptable word type. If so,
788     treat accordingly.
789   */
790   word_type = acceptable_word_string (word, lengths);
791   if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
792     first_alphanum_index_ = first_alphanum_index (word, lengths);
793     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
794     if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
795       if (update_map)
796         word_res->reject_map[first_alphanum_index_].
797             setrej_1Il_conflict ();
798       return TRUE;
799     }
800     else
801       return FALSE;
802   }
803   else if (word_type == AC_UPPER_CASE) {
804     return FALSE;
805   }
806   else {
807     if (update_map)
808       reject_I_1_L(word_res);
809     return TRUE;
810   }
811 }
812 
813 
first_alphanum_index(const char * word,const char * word_lengths)814 inT16 Tesseract::first_alphanum_index(const char *word,
815                                       const char *word_lengths) {
816   inT16 i;
817   inT16 offset;
818 
819   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
820     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
821         unicharset.get_isdigit(word + offset, word_lengths[i]))
822       return i;
823   }
824   return -1;
825 }
826 
first_alphanum_offset(const char * word,const char * word_lengths)827 inT16 Tesseract::first_alphanum_offset(const char *word,
828                                        const char *word_lengths) {
829   inT16 i;
830   inT16 offset;
831 
832   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
833     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
834         unicharset.get_isdigit(word + offset, word_lengths[i]))
835       return offset;
836   }
837   return -1;
838 }
839 
alpha_count(const char * word,const char * word_lengths)840 inT16 Tesseract::alpha_count(const char *word,
841                              const char *word_lengths) {
842   inT16 i;
843   inT16 offset;
844   inT16 count = 0;
845 
846   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
847     if (unicharset.get_isalpha (word + offset, word_lengths[i]))
848       count++;
849   }
850   return count;
851 }
852 
853 
word_contains_non_1_digit(const char * word,const char * word_lengths)854 BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
855                                            const char *word_lengths) {
856   inT16 i;
857   inT16 offset;
858 
859   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
860     if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
861         (word_lengths[i] != 1 || word[offset] != '1'))
862       return TRUE;
863   }
864   return FALSE;
865 }
866 
867 
test_ambig_word(WERD_RES * word)868 BOOL8 Tesseract::test_ambig_word(  //test for ambiguity
869                                  WERD_RES *word) {
870     BOOL8 ambig = FALSE;
871 
872     if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
873       (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
874     (word->best_choice->permuter () == USER_DAWG_PERM)) {
875       ambig = !getDict().NoDangerousAmbig(
876           word->best_choice, NULL, false, NULL, NULL);
877   }
878   return ambig;
879 }
880 
881 /*************************************************************************
882  * char_ambiguities()
883  *
884  * Return a pointer to a string containing the full conflict set of characters
885  * which includes the specified character, if there is one. If the specified
886  * character is not a member of a conflict set, return NULL.
887  * (NOTE that a character is assumed to be a member of only ONE conflict set.)
888  *************************************************************************/
char_ambiguities(char c)889 const char *Tesseract::char_ambiguities(char c) {
890   static STRING_CLIST conflict_sets;
891   static BOOL8 read_conflict_sets = FALSE;
892   STRING_C_IT cs_it(&conflict_sets);
893   const char *cs;
894   STRING cs_file_name;
895   FILE *cs_file;
896   char buff[1024];
897 
898   if (!read_conflict_sets) {
899     cs_file_name = datadir + "confsets";
900     if (!(cs_file = fopen (cs_file_name.string (), "r"))) {
901       CANTOPENFILE.error ("char_ambiguities", EXIT, "%s %d",
902         cs_file_name.string (), errno);
903     }
904     while (fscanf (cs_file, "%s", buff) == 1) {
905       cs_it.add_after_then_move (new STRING (buff));
906     }
907     read_conflict_sets = TRUE;
908     cs_it.move_to_first ();
909     if (tessedit_rejection_debug) {
910       for (cs_it.mark_cycle_pt ();
911       !cs_it.cycled_list (); cs_it.forward ()) {
912         tprintf ("\"%s\"\n", cs_it.data ()->string ());
913       }
914     }
915   }
916 
917   cs_it.move_to_first ();
918   for (cs_it.mark_cycle_pt (); !cs_it.cycled_list (); cs_it.forward ()) {
919     cs = cs_it.data ()->string ();
920     if (strchr (cs, c) != NULL)
921       return cs;
922   }
923   return NULL;
924 }
925 
926 /*************************************************************************
927  * nn_recover_rejects()
928  * Generate the nn_reject_map - a copy of the current reject map, but dont
929  * reject previously rejected chars if the NN matcher agrees with the best
930  * choice.
931  *************************************************************************/
932 
nn_recover_rejects(WERD_RES * word,ROW * row)933 void Tesseract::nn_recover_rejects(WERD_RES *word, ROW *row) {
934                                  //copy for debug
935   REJMAP old_map = word->reject_map;
936   /*
937     NOTE THAT THIS IS RELATIVELY INEFFICIENT AS THE WHOLE OF THE WERD IS
938     MATCHED BY THE NN MATCHER. IF COULD EASILY BE RESTRICTED TO JUST THE
939     REJECT CHARACTERS  (Though initial use is when words are total rejects
940     anyway).
941   */
942 
943   set_global_subsubloc_code(SUBSUBLOC_NN);
944   nn_match_word(word, row);
945 
946   if (no_unrej_1Il)
947     dont_allow_1Il(word);
948   if (no_unrej_dubious_chars)
949     dont_allow_dubious_chars(word);
950 
951   if (rej_mostly_reject_mode == 1)
952     reject_mostly_rejects(word);
953   /*
954     IF there are no unrejected alphanumerics AND
955       The word is not an acceptable single non alphanum char word  AND
956       The word is not an acceptable repeated non alphanum char word
957     THEN Reject whole word
958   */
959   if (no_unrej_no_alphanum_wds &&
960     (count_alphanums (word) < 1) &&
961     !((word->best_choice->unichar_lengths().length () == 1) &&
962       STRING(ok_single_ch_non_alphanum_wds).contains(
963           word->best_choice->unichar_string()[0]))
964     && !repeated_nonalphanum_wd (word, row))
965 
966     word->reject_map.rej_word_no_alphanums ();
967 
968   #ifndef SECURE_NAMES
969 
970   if (nn_debug) {
971     tprintf ("\nTess: \"%s\" MAP ",
972              word->best_choice->unichar_string().string());
973     old_map.print (stdout);
974     tprintf ("->");
975     word->reject_map.print (stdout);
976     tprintf ("\n");
977   }
978   #endif
979   set_global_subsubloc_code(SUBSUBLOC_OTHER);
980 }
981 
nn_match_word(WERD_RES * word,ROW * row)982 void Tesseract::nn_match_word(  //Match a word
983                               WERD_RES *word,
984                               ROW *row) {
985   PIXROW_LIST *pixrow_list;
986   PIXROW_IT pixrow_it;
987   IMAGELINE *imlines;            //lines of the image
988   TBOX pix_box;                   //box of imlines extent
989 #ifndef GRAPHICS_DISABLED
990   ScrollView* win = NULL;
991 #endif
992   IMAGE clip_image;
993   IMAGE scaled_image;
994   float baseline_pos;
995   inT16 net_image_size;
996   inT16 clip_image_size;
997   WERD copy_outword;             // copy to denorm
998   inT16 i;
999 
1000   const char *word_string;
1001   const char *word_string_lengths;
1002   BOOL8 word_in_dict;            //Tess wd in dict
1003   BOOL8 checked_dict_word;       //Tess wd definitely in dict
1004   BOOL8 sensible_word;           //OK char string
1005   BOOL8 centre;                  //Not at word end       chs
1006   BOOL8 good_quality_word;
1007   inT16 char_quality;
1008   inT16 accepted_char_quality;
1009 
1010   inT16 conf_level;              //0:REJECT
1011   //1:DODGY ACCEPT
1012   //2:DICT ACCEPT
1013   //3:CLEAR ACCEPT
1014   inT16 first_alphanum_index_;
1015   inT16 first_alphanum_offset_;
1016 
1017   word_string = word->best_choice->unichar_string().string();
1018   word_string_lengths = word->best_choice->unichar_lengths().string();
1019   first_alphanum_index_ = first_alphanum_index (word_string,
1020                                                 word_string_lengths);
1021   first_alphanum_offset_ = first_alphanum_offset (word_string,
1022                                                   word_string_lengths);
1023   word_in_dict = ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
1024     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
1025     (word->best_choice->permuter () == USER_DAWG_PERM));
1026   checked_dict_word = word_in_dict &&
1027     (safe_dict_word(*(word->best_choice)) > 0);
1028   sensible_word = acceptable_word_string (word_string, word_string_lengths) !=
1029       AC_UNACCEPTABLE;
1030 
1031   word_char_quality(word, row, &char_quality, &accepted_char_quality);
1032   good_quality_word =
1033       word->best_choice->unichar_lengths().length () == char_quality;
1034 
1035   #ifndef SECURE_NAMES
1036   if (nn_reject_debug) {
1037     tprintf ("Dict: %c   Checked Dict: %c   Sensible: %c   Quality: %c\n",
1038       word_in_dict ? 'T' : 'F',
1039       checked_dict_word ? 'T' : 'F',
1040       sensible_word ? 'T' : 'F', good_quality_word ? 'T' : 'F');
1041   }
1042   #endif
1043 
1044   if (word->best_choice->unichar_lengths().length () !=
1045   word->outword->blob_list ()->length ()) {
1046     #ifndef SECURE_NAMES
1047     tprintf ("nn_match_word ASSERT FAIL String:\"%s\";  #Blobs=%d\n",
1048       word->best_choice->unichar_string().string (),
1049       word->outword->blob_list ()->length ());
1050     #endif
1051     err_exit();
1052   }
1053 
1054   copy_outword = *(word->outword);
1055   copy_outword.baseline_denormalise (&word->denorm);
1056   /*
1057     For each character, generate and match a new image, containing JUST the
1058     character we have clipped, centered in the image, on a white background.
1059     Note that we MUST have a square image so that we can scale it uniformly in
1060     x and y.  We base the size on x_height as this can be found fairly reliably.
1061   */
1062   net_image_size = (net_image_width > net_image_height) ?
1063     net_image_width : net_image_height;
1064   clip_image_size = (inT16) floor (0.5 +
1065     net_image_size * word->x_height /
1066     net_image_x_height);
1067   if ((clip_image_size <= 1) || (net_image_size <= 1)) {
1068     return;
1069   }
1070 
1071   /*
1072     Get the image of the word and the pix positions of each char
1073   */
1074   char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
1075 #ifndef GRAPHICS_DISABLED
1076   if (show_char_clipping) {
1077     win = display_clip_image (&copy_outword, page_image,
1078       pixrow_list, pix_box);
1079   }
1080 #endif
1081   pixrow_it.set_to_list (pixrow_list);
1082   pixrow_it.move_to_first ();
1083   for (pixrow_it.mark_cycle_pt (), i = 0;
1084   !pixrow_it.cycled_list (); pixrow_it.forward (), i++) {
1085     if (pixrow_it.data ()->
1086       bad_box (page_image.get_xsize (), page_image.get_ysize ()))
1087       continue;
1088     clip_image.create (clip_image_size, clip_image_size, 1);
1089     //make bin imge
1090     if (!copy_outword.flag (W_INVERSE))
1091       invert_image(&clip_image);  //white background for black on white
1092     pixrow_it.data ()->char_clip_image (imlines, pix_box, row,
1093       clip_image, baseline_pos);
1094     if (copy_outword.flag (W_INVERSE))
1095       invert_image(&clip_image);  //invert white on black for scaling &NN
1096     scaled_image.create (net_image_size, net_image_size, 1);
1097     scale_image(clip_image, scaled_image);
1098     baseline_pos *= net_image_size / clip_image_size;
1099     //scale with im
1100     centre = !pixrow_it.at_first () && !pixrow_it.at_last ();
1101 
1102     conf_level = nn_match_char (scaled_image, baseline_pos,
1103       word_in_dict, checked_dict_word,
1104       sensible_word, centre,
1105       good_quality_word, word_string[i]);
1106     if (word->reject_map[i].recoverable ()) {
1107       if ((i == first_alphanum_index_) &&
1108           word_string_lengths[first_alphanum_index_] == 1 &&
1109       ((word_string[first_alphanum_offset_] == 'I') ||
1110        (word_string[first_alphanum_offset_] == 'i'))) {
1111         if (conf_level >= nn_conf_initial_i_level)
1112           word->reject_map[i].setrej_nn_accept ();
1113         //un-reject char
1114       }
1115       else if (conf_level > 0)
1116                                  //un-reject char
1117         word->reject_map[i].setrej_nn_accept ();
1118     }
1119 #ifndef GRAPHICS_DISABLED
1120     if (show_char_clipping)
1121       display_images(clip_image, scaled_image);
1122 #endif
1123    clip_image.destroy();
1124    scaled_image.destroy();
1125   }
1126 
1127   delete[]imlines;               // Free array of imlines
1128   delete pixrow_list;
1129 
1130 #ifndef GRAPHICS_DISABLED
1131   if (show_char_clipping) {
1132 //    destroy_window(win);
1133 //   win->Destroy();
1134     delete win;
1135   }
1136 #endif
1137 }
1138 }  // namespace tesseract
1139 
1140 
1141 /*************************************************************************
1142  * nn_match_char()
1143  * Call Neural Net matcher to match a single character, given a scaled,
1144  * square image
1145  *************************************************************************/
1146 
nn_match_char(IMAGE & scaled_image,float baseline_pos,BOOL8 dict_word,BOOL8 checked_dict_word,BOOL8 sensible_word,BOOL8 centre,BOOL8 good_quality_word,char tess_ch)1147 inT16 nn_match_char(                          //of character
1148                     IMAGE &scaled_image,
1149                     float baseline_pos,       //rel to scaled_image
1150                     BOOL8 dict_word,          //part of dict wd?
1151                     BOOL8 checked_dict_word,  //part of dict wd?
1152                     BOOL8 sensible_word,      //part acceptable str?
1153                     BOOL8 centre,             //not at word ends?
1154                     BOOL8 good_quality_word,  //initial segmentation
1155                     char tess_ch              //confirm this?
1156                    ) {
1157   inT16 conf_level;              //0..2
1158   inT32 row;
1159   inT32 col;
1160   inT32 y_size = scaled_image.get_ysize ();
1161   inT32 start_y = y_size - (y_size - net_image_height) / 2 - 1;
1162   inT32 end_y = start_y - net_image_height + 1;
1163   IMAGELINE imline;
1164   float *input_vector;
1165   float *input_vec_ptr;
1166   char top;
1167   float top_score;
1168   char next;
1169   float next_score;
1170   inT16 input_nodes = (net_image_height * net_image_width) + net_bl_nodes;
1171   inT16 j;
1172 
1173   input_vector = (float *) alloc_mem (input_nodes * sizeof (float));
1174   input_vec_ptr = input_vector;
1175 
1176   invert_image(&scaled_image);  //cos nns work better
1177   for (row = start_y; row >= end_y; row--) {
1178     scaled_image.fast_get_line (0, row, net_image_width, &imline);
1179     for (col = 0; col < net_image_width; col++)
1180       *input_vec_ptr++ = imline.pixels[col];
1181   }
1182   /*
1183     The bit map presented to the net may be shorter than the image, so shift
1184     the coord to be relative to the bitmap portion.
1185   */
1186   baseline_pos -= (y_size - net_image_height) / 2.0;
1187   /*
1188     Baseline pos is 0 if below bitmap, 1 if above and in proportion otherwise.
1189     This is represented to the net as a set of bl_nodes, an initial proportion
1190     of which are set to 1.0, indicating the level of the baseline. The
1191     remainder are 0.0
1192   */
1193 
1194   if (baseline_pos < 0)
1195     baseline_pos = 0;
1196   else if (baseline_pos >= net_image_height)
1197     baseline_pos = net_image_height + 1;
1198   else
1199     baseline_pos = baseline_pos + 1;
1200   baseline_pos = baseline_pos / (net_image_height + 1);
1201 
1202   if (net_bl_nodes > 0) {
1203     baseline_pos *= 1.7;         //Use a wider range
1204     if (net_bl_nodes > 1) {
1205       /* Multi-node baseline representation */
1206       for (j = 0; j < net_bl_nodes; j++) {
1207         if (baseline_pos > ((float) j / net_bl_nodes))
1208           *input_vec_ptr++ = 1.0;
1209         else
1210           *input_vec_ptr++ = 0.0;
1211       }
1212     }
1213     else {
1214       /* Single node baseline */
1215       *input_vec_ptr++ = baseline_pos;
1216     }
1217   }
1218 
1219   callnet(input_vector, &top, &top_score, &next, &next_score);
1220   conf_level = evaluate_net_match (top, top_score, next, next_score,
1221     tess_ch, dict_word, checked_dict_word,
1222     sensible_word, centre, good_quality_word);
1223   #ifndef SECURE_NAMES
1224   if (nn_reject_debug) {
1225     tprintf ("top:\"%c\" %4.2f   next:\"%c\" %4.2f  TESS:\"%c\" Conf: %d\n",
1226       top, top_score, next, next_score, tess_ch, conf_level);
1227   }
1228   #endif
1229   free_mem(input_vector);
1230   return conf_level;
1231 }
1232 
1233 
evaluate_net_match(char top,float top_score,char next,float next_score,char tess_ch,BOOL8 dict_word,BOOL8 checked_dict_word,BOOL8 sensible_word,BOOL8 centre,BOOL8 good_quality_word)1234 inT16 evaluate_net_match(char top,
1235                          float top_score,
1236                          char next,
1237                          float next_score,
1238                          char tess_ch,
1239                          BOOL8 dict_word,
1240                          BOOL8 checked_dict_word,
1241                          BOOL8 sensible_word,
1242                          BOOL8 centre,
1243                          BOOL8 good_quality_word) {
1244   inT16 accept_level;            //0 Very clearly matched
1245   //1 Clearly top
1246   //2 Top but poor match
1247   //3 Next & poor top match
1248   //4 Next but good top match
1249   //5 No chance
1250   BOOL8 good_top_choice;
1251   BOOL8 excellent_top_choice;
1252   BOOL8 confusion_match = FALSE;
1253   BOOL8 dodgy_char = !isalnum (tess_ch);
1254 
1255   good_top_choice = (top_score > nn_reject_threshold) &&
1256     (nn_reject_head_and_shoulders * top_score > next_score);
1257 
1258   excellent_top_choice = good_top_choice &&
1259     (top_score > nn_dodgy_char_threshold);
1260 
1261   if (top == tess_ch) {
1262     if (excellent_top_choice)
1263       accept_level = 0;
1264     else if (good_top_choice)
1265       accept_level = 1;          //Top correct and well matched
1266     else
1267       accept_level = 2;          //Top correct but poor match
1268   }
1269   else if ((nn_conf_1Il &&
1270     STRING (conflict_set_I_l_1).contains (tess_ch) &&
1271     STRING (conflict_set_I_l_1).contains (top)) ||
1272     (nn_conf_hyphen &&
1273     STRING (conflict_set_hyphen).contains (tess_ch) &&
1274     STRING (conflict_set_hyphen).contains (top)) ||
1275     (nn_conf_Ss &&
1276     STRING (conflict_set_S_s).contains (tess_ch) &&
1277   STRING (conflict_set_S_s).contains (top))) {
1278     confusion_match = TRUE;
1279     if (good_top_choice)
1280       accept_level = 1;          //Good top confusion
1281     else
1282       accept_level = 2;          //Poor top confusion
1283   }
1284   else if ((nn_conf_1Il &&
1285     STRING (conflict_set_I_l_1).contains (tess_ch) &&
1286     STRING (conflict_set_I_l_1).contains (next)) ||
1287     (nn_conf_hyphen &&
1288     STRING (conflict_set_hyphen).contains (tess_ch) &&
1289     STRING (conflict_set_hyphen).contains (next)) ||
1290     (nn_conf_Ss &&
1291     STRING (conflict_set_S_s).contains (tess_ch) &&
1292   STRING (conflict_set_S_s).contains (next))) {
1293     confusion_match = TRUE;
1294     if (!good_top_choice)
1295       accept_level = 3;          //Next confusion and top match dodgy
1296     else
1297       accept_level = 4;          //Next confusion and good top match
1298   }
1299   else if (next == tess_ch) {
1300     if (!good_top_choice)
1301       accept_level = 3;          //Next match and top match dodgy
1302     else
1303       accept_level = 4;          //Next match and good top match
1304   }
1305   else
1306     accept_level = 5;
1307 
1308   /* Could allow some match flexibility here sS$ etc */
1309 
1310   /* Now set confirmation level according to how much we can believe the tess
1311     char. */
1312 
1313   if ((accept_level == 0) && !confusion_match)
1314     return 3;
1315 
1316   if ((accept_level <= 1) &&
1317     (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match)
1318     return 3;
1319 
1320   if ((accept_level == 2) &&
1321     !confusion_match && !dodgy_char &&
1322     good_quality_word &&
1323     dict_word &&
1324     (checked_dict_word || !nn_double_check_dict) && sensible_word)
1325     return 2;
1326 
1327   if (confusion_match &&
1328     (accept_level <= nn_conf_accept_level) &&
1329     (good_quality_word ||
1330     (!nn_conf_test_good_qual &&
1331     !STRING (conflict_set_I_l_1).contains (tess_ch))) &&
1332     (dict_word || !nn_conf_test_dict) &&
1333     (checked_dict_word || !nn_conf_double_check_dict) &&
1334     (sensible_word || !nn_conf_test_sensible))
1335     return 1;
1336 
1337   if (!confusion_match &&
1338     nn_lax &&
1339     (accept_level == 3) &&
1340     (good_quality_word || !nn_conf_test_good_qual) &&
1341     (dict_word || !nn_conf_test_dict) &&
1342     (sensible_word || !nn_conf_test_sensible))
1343     return 1;
1344   else
1345     return 0;
1346 }
1347 
1348 
1349 /*************************************************************************
1350  * dont_allow_dubious_chars()
1351  * Let Rejects "eat" into adjacent "dubious" chars. I.e those prone to be wrong
1352  * if adjacent to a reject.
1353  *************************************************************************/
dont_allow_dubious_chars(WERD_RES * word)1354 void dont_allow_dubious_chars(WERD_RES *word) {
1355   int i = 0;
1356   int offset = 0;
1357   int rej_pos;
1358   int word_len = word->reject_map.length ();
1359 
1360   while (i < word_len) {
1361     /* Find next reject */
1362 
1363     while ((i < word_len) && (word->reject_map[i].accepted ()))
1364     {
1365       offset += word->best_choice->unichar_lengths()[i];
1366       i++;
1367     }
1368 
1369     if (i < word_len) {
1370       rej_pos = i;
1371 
1372       /* Reject dubious chars to the left */
1373       i--;
1374       offset -= word->best_choice->unichar_lengths()[i];
1375       while ((i >= 0) &&
1376         STRING(dubious_chars_left_of_reject).contains(
1377             word->best_choice->unichar_string()[offset])) {
1378         word->reject_map[i--].setrej_dubious ();
1379         offset -= word->best_choice->unichar_lengths()[i];
1380       }
1381 
1382       /* Skip adjacent rejects */
1383 
1384       for (i = rej_pos;
1385         (i < word_len) && (word->reject_map[i].rejected ());
1386            offset += word->best_choice->unichar_lengths()[i++]);
1387 
1388       /* Reject dubious chars to the right */
1389 
1390       while ((i < word_len) &&
1391         STRING(dubious_chars_right_of_reject).contains(
1392             word->best_choice->unichar_string()[offset])) {
1393         offset += word->best_choice->unichar_lengths()[i];
1394         word->reject_map[i++].setrej_dubious ();
1395       }
1396     }
1397   }
1398 }
1399 
1400 
1401 /*************************************************************************
1402  * dont_allow_1Il()
1403  * Dont unreject LONE accepted 1Il conflict set chars
1404  *************************************************************************/
1405 namespace tesseract {
dont_allow_1Il(WERD_RES * word)1406 void Tesseract::dont_allow_1Il(WERD_RES *word) {
1407   int i = 0;
1408   int offset;
1409   int word_len = word->reject_map.length ();
1410   const char *s = word->best_choice->unichar_string().string ();
1411   const char *lengths = word->best_choice->unichar_lengths().string ();
1412   BOOL8 accepted_1Il = FALSE;
1413 
1414   for (i = 0, offset = 0; i < word_len;
1415        offset += word->best_choice->unichar_lengths()[i++]) {
1416     if (word->reject_map[i].accepted ()) {
1417       if (STRING (conflict_set_I_l_1).contains (s[offset]))
1418         accepted_1Il = TRUE;
1419       else {
1420         if (unicharset.get_isalpha (s + offset, lengths[i]) ||
1421             unicharset.get_isdigit (s + offset, lengths[i]))
1422           return;                // >=1 non 1Il ch accepted
1423       }
1424     }
1425   }
1426   if (!accepted_1Il)
1427     return;                      //Nothing to worry about
1428 
1429   for (i = 0, offset = 0; i < word_len;
1430        offset += word->best_choice->unichar_lengths()[i++]) {
1431     if (STRING (conflict_set_I_l_1).contains (s[offset]) &&
1432       word->reject_map[i].accepted ())
1433       word->reject_map[i].setrej_postNN_1Il ();
1434   }
1435 }
1436 
1437 
count_alphanums(WERD_RES * word_res)1438 inT16 Tesseract::count_alphanums(  //how many alphanums
1439                                  WERD_RES *word_res) {
1440   int count = 0;
1441   const WERD_CHOICE *best_choice = word_res->best_choice;
1442   for (int i = 0; i < word_res->reject_map.length(); ++i) {
1443     if ((word_res->reject_map[i].accepted()) &&
1444         (unicharset.get_isalpha(best_choice->unichar_id(i)) ||
1445          unicharset.get_isdigit(best_choice->unichar_id(i)))) {
1446       count++;
1447     }
1448   }
1449   return count;
1450 }
1451 }  // namespace tesseract
1452 
1453 
reject_mostly_rejects(WERD_RES * word)1454 void reject_mostly_rejects(  //rej all if most rejectd
1455                            WERD_RES *word) {
1456   /* Reject the whole of the word if the fraction of rejects exceeds a limit */
1457 
1458   if ((float) word->reject_map.reject_count () / word->reject_map.length () >=
1459     rej_whole_of_mostly_reject_word_fract)
1460     word->reject_map.rej_word_mostly_rej ();
1461 }
1462 
1463 
1464 namespace tesseract {
repeated_nonalphanum_wd(WERD_RES * word,ROW * row)1465 BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
1466   inT16 char_quality;
1467   inT16 accepted_char_quality;
1468 
1469   if (word->best_choice->unichar_lengths().length () <= 1)
1470     return FALSE;
1471 
1472   if (!STRING (ok_repeated_ch_non_alphanum_wds).
1473     contains (word->best_choice->unichar_string()[0]))
1474     return FALSE;
1475 
1476   if (!repeated_ch_string (word->best_choice->unichar_string().string (),
1477                            word->best_choice->unichar_lengths().string ()))
1478     return FALSE;
1479 
1480   word_char_quality(word, row, &char_quality, &accepted_char_quality);
1481 
1482   if ((word->best_choice->unichar_lengths().length () == char_quality) &&
1483     (char_quality == accepted_char_quality))
1484     return TRUE;
1485   else
1486     return FALSE;
1487 }
1488 
repeated_ch_string(const char * rep_ch_str,const char * lengths)1489 BOOL8 Tesseract::repeated_ch_string(const char *rep_ch_str,
1490                                     const char *lengths) {
1491   UNICHAR_ID c;
1492 
1493   if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) {
1494     return FALSE;
1495   }
1496 
1497   c = unicharset.unichar_to_id(rep_ch_str, *lengths);
1498   rep_ch_str += *(lengths++);
1499   while (*rep_ch_str != '\0' &&
1500          unicharset.unichar_to_id(rep_ch_str, *lengths) == c) {
1501     rep_ch_str++;
1502   }
1503   if (*rep_ch_str == '\0')
1504     return TRUE;
1505   return FALSE;
1506 }
1507 
1508 
safe_dict_word(const WERD_CHOICE & word)1509 inT16 Tesseract::safe_dict_word(const WERD_CHOICE &word) {
1510   int dict_word_type = dict_word(word);
1511   return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
1512 }
1513 
1514 
flip_hyphens(WERD_RES * word_res)1515 void Tesseract::flip_hyphens(WERD_RES *word_res) {
1516   WERD_CHOICE *best_choice = word_res->best_choice;
1517   int i;
1518   PBLOB_IT outword_it;
1519   int prev_right = -9999;
1520   int next_left;
1521   TBOX out_box;
1522   float aspect_ratio;
1523 
1524   if (tessedit_lower_flip_hyphen <= 1)
1525     return;
1526 
1527   outword_it.set_to_list(word_res->outword->blob_list());
1528   UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-");
1529   bool modified = false;
1530   for (i = 0, outword_it.mark_cycle_pt();
1531        i < best_choice->length() && !outword_it.cycled_list();
1532        ++i, outword_it.forward()) {
1533     out_box = outword_it.data()->bounding_box();
1534     if (outword_it.at_last())
1535       next_left = 9999;
1536     else
1537       next_left = outword_it.data_relative(1)->bounding_box().left();
1538     // Dont touch small or touching blobs - it is too dangerous.
1539     if ((out_box.width() > 8 * word_res->denorm.scale()) &&
1540         (out_box.left() > prev_right) && (out_box.right() < next_left)) {
1541       aspect_ratio = out_box.width() / (float) out_box.height();
1542       if (unicharset.eq(best_choice->unichar_id(i), ".")) {
1543         if (aspect_ratio >= tessedit_upper_flip_hyphen &&
1544             unicharset.contains_unichar_id(unichar_dash) &&
1545             unicharset.get_enabled(unichar_dash)) {
1546           /* Certain HYPHEN */
1547           best_choice->set_unichar_id(unichar_dash, i);
1548           modified = true;
1549           if (word_res->reject_map[i].rejected())
1550             word_res->reject_map[i].setrej_hyphen_accept();
1551         }
1552         if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
1553           word_res->reject_map[i].accepted())
1554                                  //Suspected HYPHEN
1555           word_res->reject_map[i].setrej_hyphen ();
1556       }
1557       else if (best_choice->unichar_id(i) == unichar_dash) {
1558         if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
1559           (word_res->reject_map[i].rejected()))
1560           word_res->reject_map[i].setrej_hyphen_accept();
1561         //Certain HYPHEN
1562 
1563         if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
1564           (word_res->reject_map[i].accepted()))
1565                                  //Suspected HYPHEN
1566           word_res->reject_map[i].setrej_hyphen();
1567       }
1568     }
1569     prev_right = out_box.right();
1570   }
1571   if (modified) {
1572     best_choice->populate_unichars(unicharset);
1573   }
1574 }
1575 
flip_0O(WERD_RES * word_res)1576 void Tesseract::flip_0O(WERD_RES *word_res) {
1577   WERD_CHOICE *best_choice = word_res->best_choice;
1578   int i;
1579   PBLOB_IT outword_it;
1580   TBOX out_box;
1581 
1582   if (!tessedit_flip_0O)
1583     return;
1584 
1585   outword_it.set_to_list(word_res->outword->blob_list ());
1586 
1587   for (i = 0, outword_it.mark_cycle_pt ();
1588        i < best_choice->length() && !outword_it.cycled_list ();
1589        ++i, outword_it.forward ()) {
1590     if (unicharset.get_isupper(best_choice->unichar_id(i)) ||
1591         unicharset.get_isdigit(best_choice->unichar_id(i))) {
1592       out_box = outword_it.data()->bounding_box ();
1593       if ((out_box.top() < bln_baseline_offset + bln_x_height) ||
1594         (out_box.bottom() > bln_baseline_offset + bln_x_height / 4))
1595         return;                  //Beware words with sub/superscripts
1596     }
1597   }
1598   UNICHAR_ID unichar_0 = unicharset.unichar_to_id("0");
1599   UNICHAR_ID unichar_O = unicharset.unichar_to_id("O");
1600   if (unichar_0 == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_0) ||
1601       unichar_O == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_O)) {
1602     return;  // 0 or O are not present/enabled in unicharset
1603   }
1604   bool modified = false;
1605   for (i = 1; i < best_choice->length(); ++i, outword_it.forward ()) {
1606     if (best_choice->unichar_id(i) == unichar_0 ||
1607         best_choice->unichar_id(i) == unichar_O) {
1608       /* A0A */
1609       if ((i+1) < best_choice->length() &&
1610           non_O_upper(best_choice->unichar_id(i-1)) &&
1611           non_O_upper(best_choice->unichar_id(i+1))) {
1612         best_choice->set_unichar_id(unichar_O, i);
1613         modified = true;
1614       }
1615       /* A00A */
1616       if (non_O_upper(best_choice->unichar_id(i-1)) &&
1617           (i+1) < best_choice->length() &&
1618           (best_choice->unichar_id(i+1) == unichar_0 ||
1619            best_choice->unichar_id(i+1) == unichar_O) &&
1620           (i+2) < best_choice->length() &&
1621           non_O_upper(best_choice->unichar_id(i+2))) {
1622         best_choice->set_unichar_id(unichar_O, i);
1623         modified = true;
1624         i++;
1625       }
1626       /* AA0<non digit or end of word> */
1627       if ((i > 1) &&
1628           non_O_upper(best_choice->unichar_id(i-2)) &&
1629           non_O_upper(best_choice->unichar_id(i-1)) &&
1630           (((i+1) < best_choice->length() &&
1631             !unicharset.get_isdigit(best_choice->unichar_id(i+1)) &&
1632             !unicharset.eq(best_choice->unichar_id(i+1), "l") &&
1633             !unicharset.eq(best_choice->unichar_id(i+1), "I")) ||
1634            (i == best_choice->length() - 1))) {
1635         best_choice->set_unichar_id(unichar_O, i);
1636         modified = true;
1637       }
1638       /* 9O9 */
1639       if (non_0_digit(best_choice->unichar_id(i-1)) &&
1640           (i+1) < best_choice->length() &&
1641           non_0_digit(best_choice->unichar_id(i+1))) {
1642         best_choice->set_unichar_id(unichar_0, i);
1643         modified = true;
1644       }
1645       /* 9OOO */
1646       if (non_0_digit(best_choice->unichar_id(i-1)) &&
1647           (i+2) < best_choice->length() &&
1648           (best_choice->unichar_id(i+1) == unichar_0 ||
1649            best_choice->unichar_id(i+1) == unichar_O) &&
1650           (best_choice->unichar_id(i+2) == unichar_0 ||
1651            best_choice->unichar_id(i+2) == unichar_O)) {
1652         best_choice->set_unichar_id(unichar_0, i);
1653         best_choice->set_unichar_id(unichar_0, i+1);
1654         best_choice->set_unichar_id(unichar_0, i+2);
1655         modified = true;
1656         i += 2;
1657       }
1658       /* 9OO<non upper> */
1659       if (non_0_digit(best_choice->unichar_id(i-1)) &&
1660           (i+2) < best_choice->length() &&
1661           (best_choice->unichar_id(i+1) == unichar_0 ||
1662           best_choice->unichar_id(i+1) == unichar_O) &&
1663           !unicharset.get_isupper(best_choice->unichar_id(i+2))) {
1664         best_choice->set_unichar_id(unichar_0, i);
1665         best_choice->set_unichar_id(unichar_0, i+1);
1666         modified = true;
1667         i++;
1668       }
1669       /* 9O<non upper> */
1670       if (non_0_digit(best_choice->unichar_id(i-1)) &&
1671           (i+1) < best_choice->length() &&
1672           !unicharset.get_isupper(best_choice->unichar_id(i+1))) {
1673         best_choice->set_unichar_id(unichar_0, i);
1674       }
1675       /* 9[.,]OOO.. */
1676       if ((i > 1) &&
1677           (unicharset.eq(best_choice->unichar_id(i-1), ".") ||
1678            unicharset.eq(best_choice->unichar_id(i-1), ",")) &&
1679           (unicharset.get_isdigit(best_choice->unichar_id(i-2)) ||
1680            best_choice->unichar_id(i-2) == unichar_O)) {
1681         if (best_choice->unichar_id(i-2) == unichar_O) {
1682           best_choice->set_unichar_id(unichar_0, i-2);
1683           modified = true;
1684         }
1685         while (i < best_choice->length() &&
1686                (best_choice->unichar_id(i) == unichar_O ||
1687                 best_choice->unichar_id(i) == unichar_0)) {
1688           best_choice->set_unichar_id(unichar_0, i);
1689           modified = true;
1690           i++;
1691         }
1692         i--;
1693       }
1694     }
1695   }
1696   if (modified) {
1697     best_choice->populate_unichars(unicharset);
1698   }
1699 }
1700 
non_O_upper(UNICHAR_ID unichar_id)1701 BOOL8 Tesseract::non_O_upper(UNICHAR_ID unichar_id) {
1702   return (unicharset.get_isupper(unichar_id) &&
1703           (!unicharset.eq(unichar_id, "O")));
1704 }
1705 
non_0_digit(UNICHAR_ID unichar_id)1706 BOOL8 Tesseract::non_0_digit(UNICHAR_ID unichar_id) {
1707   return (unicharset.get_isdigit(unichar_id) &&
1708           (!unicharset.eq(unichar_id, "0")));
1709 }
1710 }  // namespace tesseract
1711