• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**********************************************************************
2  * File:        adaptions.cpp  (Formerly adaptions.c)
3  * Description: Functions used to adapt to blobs already confidently
4  *					identified
5  * Author:		Chris Newton
6  * Created:		Thu Oct  7 10:17:28 BST 1993
7  *
8  * (C) Copyright 1992, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 #include "mfcpch.h"
22 #ifdef __UNIX__
23 #include          <assert.h>
24 #endif
25 #include          <ctype.h>
26 #include          <string.h>
27 #include          "tessbox.h"
28 #include          "tessvars.h"
29 #include          "memry.h"
30 #include          "mainblk.h"
31 #include          "charcut.h"
32 #include          "imgs.h"
33 #include          "scaleimg.h"
34 #include          "reject.h"
35 #include          "control.h"
36 #include          "adaptions.h"
37 #include          "stopper.h"
38 #include          "charsample.h"
39 #include          "matmatch.h"
40 #include          "secname.h"
41 #include          "tesseractclass.h"
42 
43 inT32 demo_word = 0;
44 
45 #define WINDOWNAMESIZE    13     /*max size of name */
46 
47 #define EXTERN
48 
49 EXTERN BOOL_VAR (tessedit_reject_ems, FALSE, "Reject all m's");
50 EXTERN BOOL_VAR (tessedit_reject_suspect_ems, FALSE, "Reject suspect m's");
51 
52 EXTERN double_VAR (tessedit_cluster_t1, 0.20,
53 "t1 threshold for clustering samples");
54 EXTERN double_VAR (tessedit_cluster_t2, 0.40,
55 "t2 threshold for clustering samples");
56 EXTERN double_VAR (tessedit_cluster_t3, 0.12,
57 "Extra threshold for clustering samples, only keep a new sample if best score greater than this value");
58 EXTERN double_VAR (tessedit_cluster_accept_fraction, 0.80,
59 "Largest fraction of characters in cluster for it to be used for adaption");
60 EXTERN INT_VAR (tessedit_cluster_min_size, 3,
61 "Smallest number of samples in a cluster for it to be used for adaption");
62 EXTERN BOOL_VAR (tessedit_cluster_debug, FALSE,
63 "Generate and print debug information for adaption by clustering");
64 EXTERN BOOL_VAR (tessedit_use_best_sample, FALSE,
65 "Use best sample from cluster when adapting");
66 EXTERN BOOL_VAR (tessedit_test_cluster_input, FALSE,
67 "Set reject map to enable cluster input to be measured");
68 
69 EXTERN BOOL_VAR (tessedit_matrix_match, TRUE, "Use matrix matcher");
70 EXTERN BOOL_VAR (tessedit_mm_use_non_adaption_set, FALSE,
71 "Don't try to adapt to characters on this list");
72 EXTERN STRING_VAR (tessedit_non_adaption_set, ",.;:'~@*",
73 "Characters to be avoided when adapting");
74 EXTERN BOOL_VAR (tessedit_mm_adapt_using_prototypes, TRUE,
75 "Use prototypes when adapting");
76 EXTERN BOOL_VAR (tessedit_mm_use_prototypes, TRUE,
77 "Use prototypes as clusters are built");
78 EXTERN BOOL_VAR (tessedit_mm_use_rejmap, FALSE,
79 "Adapt to characters using reject map");
80 EXTERN BOOL_VAR (tessedit_mm_all_rejects, FALSE,
81 "Adapt to all characters using, matrix matcher");
82 EXTERN BOOL_VAR (tessedit_mm_only_match_same_char, FALSE,
83 "Only match samples against clusters for the same character");
84 EXTERN BOOL_VAR (tessedit_process_rns, FALSE, "Handle m - rn ambigs");
85 
86 EXTERN BOOL_VAR (tessedit_demo_adaption, FALSE,
87 "Display cut images and matrix match for demo purposes");
88 EXTERN INT_VAR (tessedit_demo_word1, 62,
89 "Word number of first word to display");
90 EXTERN INT_VAR (tessedit_demo_word2, 64,
91 "Word number of second word to display");
92 EXTERN STRING_VAR (tessedit_demo_file, "academe",
93 "Name of document containing demo words");
94 EXTERN BOOL_VAR(tessedit_adapt_to_char_fragments, TRUE,
95                 "Adapt to words that contain "
96                 " a character composed form fragments");
97 
98 namespace tesseract {
word_adaptable(WERD_RES * word,uinT16 mode)99 BOOL8 Tesseract::word_adaptable(  //should we adapt?
100                                 WERD_RES *word,
101                                 uinT16 mode) {
102   if (tessedit_adaption_debug) {
103     tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
104           word->best_choice == NULL ? "" :
105           word->best_choice->unichar_string().string(),
106           word->best_choice->rating(), word->best_choice->certainty());
107   }
108 
109   BOOL8 status = FALSE;
110   BITS16 flags(mode);
111 
112   enum MODES
113   {
114     ADAPTABLE_WERD,
115     ACCEPTABLE_WERD,
116     CHECK_DAWGS,
117     CHECK_SPACES,
118     CHECK_ONE_ELL_CONFLICT,
119     CHECK_AMBIG_WERD
120   };
121 
122   /*
123   0: NO adaption
124   */
125   if (mode == 0) {
126     if (tessedit_adaption_debug) tprintf("adaption disabled\n");
127     return FALSE;
128   }
129 
130   if (flags.bit (ADAPTABLE_WERD)) {
131     status |= word->tess_would_adapt;  // result of Classify::AdaptableWord()
132     if (tessedit_adaption_debug && !status) {
133       tprintf("tess_would_adapt bit is false\n");
134     }
135   }
136 
137   if (flags.bit (ACCEPTABLE_WERD)) {
138     status |= word->tess_accepted;
139     if (tessedit_adaption_debug && !status) {
140       tprintf("tess_accepted bit is false\n");
141     }
142   }
143 
144   if (!status) {                  // If not set then
145     return FALSE;                // ignore other checks
146   }
147 
148   if (flags.bit (CHECK_DAWGS) &&
149     (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
150     (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
151     (word->best_choice->permuter () != USER_DAWG_PERM) &&
152     (word->best_choice->permuter () != NUMBER_PERM)) {
153     if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
154     return FALSE;
155   }
156 
157   if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
158     if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
159     return FALSE;
160   }
161 
162   if (flags.bit (CHECK_SPACES) &&
163     (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
164     if (tessedit_adaption_debug) tprintf("word contains spaces\n");
165     return FALSE;
166   }
167 
168 //  if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
169   if (flags.bit (CHECK_AMBIG_WERD) &&
170       !getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
171     if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
172     return FALSE;
173   }
174 
175   // Do not adapt to words that are composed from fragments if
176   // tessedit_adapt_to_char_fragments is false.
177   if (!tessedit_adapt_to_char_fragments) {
178     const char *fragment_lengths = word->best_choice->fragment_lengths();
179     if (fragment_lengths != NULL && *fragment_lengths != '\0') {
180       for (int i = 0; i < word->best_choice->length(); ++i) {
181         if (fragment_lengths[i] > 1) {
182           if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
183           return false;  // found a character composed from fragments
184         }
185       }
186     }
187   }
188 
189   if (tessedit_adaption_debug) {
190     tprintf("returning status %d\n", status);
191   }
192   return status;
193 
194 }
195 
196 
collect_ems_for_adaption(WERD_RES * word,CHAR_SAMPLES_LIST * char_clusters,CHAR_SAMPLE_LIST * chars_waiting)197 void Tesseract::collect_ems_for_adaption(WERD_RES *word,
198                                          CHAR_SAMPLES_LIST *char_clusters,
199                                          CHAR_SAMPLE_LIST *chars_waiting) {
200   PBLOB_LIST *blobs = word->outword->blob_list ();
201   PBLOB_IT blob_it(blobs);
202   inT16 i;
203   CHAR_SAMPLE *sample;
204   PIXROW_LIST *pixrow_list;
205   PIXROW_IT pixrow_it;
206   IMAGELINE *imlines;            // lines of the image
207   TBOX pix_box;                   // box of imlines
208   // extent
209   WERD copy_outword;             // copy to denorm
210   PBLOB_IT copy_blob_it;
211   OUTLINE_IT copy_outline_it;
212   inT32 resolution = page_image.get_res ();
213 
214   if (tessedit_reject_ems || tessedit_reject_suspect_ems)
215     return;                      // Do nothing
216 
217   if (word->word->bounding_box ().height () > resolution / 3)
218     return;
219 
220   if (tessedit_demo_adaption)
221                                  // Make sure not set
222     tessedit_display_mm.set_value (FALSE);
223 
224   if (word_adaptable (word, tessedit_em_adaption_mode)
225     && word->reject_map.reject_count () == 0
226     && (strchr (word->best_choice->unichar_string().string (), 'm') != NULL
227     || (tessedit_process_rns
228     && strstr (word->best_choice->unichar_string().string (),
229   "rn") != NULL))) {
230     if (tessedit_process_rns
231     && strstr (word->best_choice->unichar_string().string (), "rn") != NULL) {
232       copy_outword = *(word->outword);
233       copy_blob_it.set_to_list (copy_outword.blob_list ());
234       i = 0;
235       while (word->best_choice->unichar_string()[i] != '\0') {
236         if (word->best_choice->unichar_string()[i] == 'r'
237         && word->best_choice->unichar_string()[i + 1] == 'n') {
238           copy_outline_it.set_to_list (copy_blob_it.data ()->
239             out_list ());
240           copy_outline_it.add_list_after (copy_blob_it.
241             data_relative (1)->
242             out_list ());
243           copy_blob_it.forward ();
244           delete (copy_blob_it.extract ());
245           i++;
246         }
247         copy_blob_it.forward ();
248         i++;
249       }
250     }
251     else
252       copy_outword = *(word->outword);
253 
254     copy_outword.baseline_denormalise (&word->denorm);
255     char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
256     pixrow_it.set_to_list (pixrow_list);
257     pixrow_it.move_to_first ();
258 
259     blob_it.move_to_first ();
260     for (i = 0;
261       word->best_choice->unichar_string()[i] != '\0';
262     i++, pixrow_it.forward (), blob_it.forward ()) {
263 
264       if (word->best_choice->unichar_string()[i] == 'm'
265         || (word->best_choice->unichar_string()[i] == 'r'
266       && word->best_choice->unichar_string()[i + 1] == 'n')) {
267         #ifndef SECURE_NAMES
268         if (tessedit_cluster_debug)
269           tprintf ("Sample %c for adaption found in %s, index %d\n",
270             word->best_choice->unichar_string()[i],
271             word->best_choice->unichar_string().string (), i);
272         #endif
273         if (tessedit_matrix_match) {
274           sample = clip_sample (pixrow_it.data (),
275             imlines,
276             pix_box,
277             copy_outword.flag (W_INVERSE),
278             word->best_choice->unichar_string()[i]);
279 
280           if (sample == NULL) {  //Clip failed
281             #ifndef SECURE_NAMES
282             tprintf ("Unable to clip sample from %s, index %d\n",
283               word->best_choice->unichar_string().string (), i);
284             #endif
285             if (word->best_choice->unichar_string()[i] == 'r')
286               i++;
287 
288             continue;
289           }
290         }
291         else
292           sample = new CHAR_SAMPLE (blob_it.data (),
293             &word->denorm,
294             word->best_choice->unichar_string()[i]);
295 
296         cluster_sample(sample, char_clusters, chars_waiting);
297 
298         if (word->best_choice->unichar_string()[i] == 'r')
299           i++;                   // Skip next character
300       }
301     }
302     delete[]imlines;             // Free array of imlines
303     delete pixrow_list;
304   }
305 }
306 
307 
collect_characters_for_adaption(WERD_RES * word,CHAR_SAMPLES_LIST * char_clusters,CHAR_SAMPLE_LIST * chars_waiting)308 void Tesseract::collect_characters_for_adaption(
309     WERD_RES *word,
310     CHAR_SAMPLES_LIST *char_clusters,
311     CHAR_SAMPLE_LIST *chars_waiting) {
312   PBLOB_LIST *blobs = word->outword->blob_list ();
313   PBLOB_IT blob_it(blobs);
314   inT16 i;
315   CHAR_SAMPLE *sample;
316   PIXROW_LIST *pixrow_list;
317   PIXROW_IT pixrow_it;
318   IMAGELINE *imlines;            // lines of the image
319   TBOX pix_box;                   // box of imlines
320   // extent
321   WERD copy_outword;             // copy to denorm
322   inT32 resolution = page_image.get_res ();
323 
324   if (word->word->bounding_box ().height () > resolution / 3)
325     return;
326 
327   if (tessedit_demo_adaption)
328                                  // Make sure not set
329     tessedit_display_mm.set_value (FALSE);
330 
331   if ((word_adaptable (word, tessedit_cluster_adaption_mode)
332   && word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) {
333     if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
334       return;                    // Reject map set to acceptable
335     /* Collect information about good matches */
336     copy_outword = *(word->outword);
337     copy_outword.baseline_denormalise (&word->denorm);
338     char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
339     pixrow_it.set_to_list (pixrow_list);
340     pixrow_it.move_to_first ();
341 
342     blob_it.move_to_first ();
343     for (i = 0;
344       word->best_choice->unichar_string()[i] != '\0';
345     i++, pixrow_it.forward (), blob_it.forward ()) {
346 
347       if (!(tessedit_mm_use_non_adaption_set
348         && STRING(tessedit_non_adaption_set).contains(
349             word->best_choice->unichar_string()[i]))
350       || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) {
351         #ifndef SECURE_NAMES
352         if (tessedit_cluster_debug)
353           tprintf ("Sample %c for adaption found in %s, index %d\n",
354             word->best_choice->unichar_string()[i],
355             word->best_choice->unichar_string().string (), i);
356         #endif
357         sample = clip_sample (pixrow_it.data (),
358           imlines,
359           pix_box,
360           copy_outword.flag (W_INVERSE),
361           word->best_choice->unichar_string()[i]);
362 
363         if (sample == NULL) {    //Clip failed
364           #ifndef SECURE_NAMES
365           tprintf ("Unable to clip sample from %s, index %d\n",
366             word->best_choice->unichar_string().string (), i);
367           #endif
368           continue;
369         }
370         cluster_sample(sample, char_clusters, chars_waiting);
371       }
372     }
373     delete[]imlines;             // Free array of imlines
374     delete pixrow_list;
375   }
376   else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
377     // Set word to all rejects
378     word->reject_map.rej_word_tess_failure ();
379 
380 }
381 
382 
cluster_sample(CHAR_SAMPLE * sample,CHAR_SAMPLES_LIST * char_clusters,CHAR_SAMPLE_LIST * chars_waiting)383 void Tesseract::cluster_sample(CHAR_SAMPLE *sample,
384                                CHAR_SAMPLES_LIST *char_clusters,
385                                CHAR_SAMPLE_LIST *chars_waiting) {
386   CHAR_SAMPLES *best_cluster = NULL;
387   CHAR_SAMPLES_IT c_it = char_clusters;
388   CHAR_SAMPLE_IT cw_it = chars_waiting;
389   float score;
390   float best_score = MAX_INT32;
391 
392   if (c_it.empty ())
393     c_it.add_to_end (new CHAR_SAMPLES (sample));
394   else {
395     for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
396       score = c_it.data ()->match_score (sample, this);
397       if (score < best_score) {
398         best_score = score;
399         best_cluster = c_it.data ();
400       }
401     }
402 
403     if (tessedit_cluster_debug)
404       tprintf ("Sample's best score %f\n", best_score);
405 
406     if (best_score < tessedit_cluster_t1) {
407       if (best_score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) {
408         best_cluster->add_sample (sample, this);
409         check_wait_list(chars_waiting, sample, best_cluster);
410         #ifndef SECURE_NAMES
411         if (tessedit_cluster_debug)
412           tprintf ("Sample added to an existing cluster\n");
413         #endif
414       }
415       else {
416         #ifndef SECURE_NAMES
417         if (tessedit_cluster_debug)
418           tprintf
419             ("Sample dropped, good match to an existing cluster\n");
420         #endif
421       }
422     }
423     else if (best_score > tessedit_cluster_t2) {
424       c_it.add_to_end (new CHAR_SAMPLES (sample));
425       #ifndef SECURE_NAMES
426       if (tessedit_cluster_debug)
427         tprintf ("New cluster created for this sample\n");
428       #endif
429     }
430     else {
431       cw_it.add_to_end (sample);
432       if (tessedit_cluster_debug)
433         tprintf ("Sample added to the wait list\n");
434     }
435   }
436 }
437 
check_wait_list(CHAR_SAMPLE_LIST * chars_waiting,CHAR_SAMPLE * sample,CHAR_SAMPLES * best_cluster)438 void Tesseract::check_wait_list(CHAR_SAMPLE_LIST *chars_waiting,
439                                 CHAR_SAMPLE *sample,
440                                 CHAR_SAMPLES *best_cluster) {
441   CHAR_SAMPLE *wait_sample;
442   CHAR_SAMPLE *test_sample = sample;
443   CHAR_SAMPLE_IT cw_it = chars_waiting;
444   CHAR_SAMPLE_LIST add_list;     //Samples added to best cluster
445   CHAR_SAMPLE_IT add_it = &add_list;
446   float score;
447 
448   add_list.clear ();
449 
450   if (!cw_it.empty ()) {
451     do {
452       if (!add_list.empty ()) {
453         add_it.forward ();
454         test_sample = add_it.extract ();
455         best_cluster->add_sample (test_sample, this);
456       }
457 
458       for (cw_it.mark_cycle_pt ();
459       !cw_it.cycled_list (); cw_it.forward ()) {
460         wait_sample = cw_it.data ();
461         if (tessedit_mm_use_prototypes)
462           score = best_cluster->match_score (wait_sample, this);
463         else
464           score = sample->match_sample (wait_sample, FALSE, this);
465         if (score < tessedit_cluster_t1) {
466           if (score > tessedit_cluster_t3
467           || tessedit_mm_use_prototypes) {
468             add_it.add_after_stay_put (cw_it.extract ());
469             #ifndef SECURE_NAMES
470             if (tessedit_cluster_debug)
471               tprintf
472                 ("Wait sample added to an existing cluster\n");
473             #endif
474           }
475           else {
476             #ifndef SECURE_NAMES
477             if (tessedit_cluster_debug)
478               tprintf
479                 ("Wait sample dropped, good match to an existing cluster\n");
480             #endif
481           }
482         }
483       }
484     }
485     while (!add_list.empty ());
486   }
487 }
488 
489 
complete_clustering(CHAR_SAMPLES_LIST * char_clusters,CHAR_SAMPLE_LIST * chars_waiting)490 void Tesseract::complete_clustering(CHAR_SAMPLES_LIST *char_clusters,
491                                     CHAR_SAMPLE_LIST *chars_waiting) {
492   CHAR_SAMPLES *best_cluster;
493   CHAR_SAMPLES_IT c_it = char_clusters;
494   CHAR_SAMPLE_IT cw_it = chars_waiting;
495   CHAR_SAMPLE *sample;
496   inT32 total_sample_count = 0;
497 
498   while (!cw_it.empty ()) {
499     cw_it.move_to_first ();
500     sample = cw_it.extract ();
501     best_cluster = new CHAR_SAMPLES (sample);
502     c_it.add_to_end (best_cluster);
503     check_wait_list(chars_waiting, sample, best_cluster);
504   }
505 
506   for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
507     c_it.data ()->assign_to_char ();
508     if (tessedit_use_best_sample)
509       c_it.data ()->find_best_sample ();
510     else if (tessedit_mm_adapt_using_prototypes)
511       c_it.data ()->build_prototype ();
512 
513     if (tessedit_cluster_debug)
514       total_sample_count += c_it.data ()->n_samples ();
515   }
516   #ifndef SECURE_NAMES
517   if (tessedit_cluster_debug)
518     tprintf ("Clustering completed, %d samples in all\n", total_sample_count);
519   #endif
520 
521 #ifndef GRAPHICS_DISABLED
522   if (tessedit_demo_adaption)
523     display_cluster_prototypes(char_clusters);
524 #endif
525 
526 }
527 
adapt_to_good_ems(WERD_RES * word,CHAR_SAMPLES_LIST * char_clusters,CHAR_SAMPLE_LIST * chars_waiting)528 void Tesseract::adapt_to_good_ems(WERD_RES *word,
529                                   CHAR_SAMPLES_LIST *char_clusters,
530                                   CHAR_SAMPLE_LIST *chars_waiting) {
531   PBLOB_LIST *blobs = word->outword->blob_list ();
532   PBLOB_IT blob_it(blobs);
533   inT16 i;
534   CHAR_SAMPLE *sample;
535   CHAR_SAMPLES_IT c_it = char_clusters;
536   CHAR_SAMPLE_IT cw_it = chars_waiting;
537   float score;
538   float best_score;
539   char best_char;
540   CHAR_SAMPLES *best_cluster;
541   PIXROW_LIST *pixrow_list;
542   PIXROW_IT pixrow_it;
543   IMAGELINE *imlines;            // lines of the image
544   TBOX pix_box;                   // box of imlines
545   // extent
546   WERD copy_outword;             // copy to denorm
547   TBOX b_box;
548   PBLOB_IT copy_blob_it;
549   OUTLINE_IT copy_outline_it;
550   PIXROW *pixrow = NULL;
551 
552   static inT32 word_number = 0;
553 
554 #ifndef GRAPHICS_DISABLED
555   ScrollView* demo_win = NULL;
556 #endif
557 
558   inT32 resolution = page_image.get_res ();
559 
560   if (word->word->bounding_box ().height () > resolution / 3)
561     return;
562 
563   word_number++;
564 
565   if (strchr (word->best_choice->unichar_string().string (), 'm') == NULL
566     && (tessedit_process_rns
567     && strstr (word->best_choice->unichar_string().string (), "rn") == NULL))
568     return;
569 
570   if (tessedit_reject_ems)
571     reject_all_ems(word);
572   else if (tessedit_reject_suspect_ems)
573     reject_suspect_ems(word);
574   else {
575     if (char_clusters->length () == 0) {
576       #ifndef SECURE_NAMES
577       if (tessedit_cluster_debug)
578         tprintf ("No clusters to use for em adaption\n");
579       #endif
580       return;
581     }
582 
583     if (!cw_it.empty ()) {
584       complete_clustering(char_clusters, chars_waiting);
585       print_em_stats(char_clusters, chars_waiting);
586     }
587 
588     if ((!word_adaptable (word, tessedit_em_adaption_mode) ||
589       word->reject_map.reject_count () != 0)
590       && (strchr (word->best_choice->unichar_string().string (), 'm') != NULL
591       || (tessedit_process_rns
592       && strstr (word->best_choice->unichar_string().string (),
593     "rn") != NULL))) {
594       if (tessedit_process_rns
595         && strstr (word->best_choice->unichar_string().string (),
596       "rn") != NULL) {
597         copy_outword = *(word->outword);
598         copy_blob_it.set_to_list (copy_outword.blob_list ());
599         i = 0;
600         while (word->best_choice->unichar_string()[i] != '\0') {
601           if (word->best_choice->unichar_string()[i] == 'r'
602           && word->best_choice->unichar_string()[i + 1] == 'n') {
603             copy_outline_it.set_to_list (copy_blob_it.data ()->
604               out_list ());
605             copy_outline_it.add_list_after (copy_blob_it.
606               data_relative (1)->
607               out_list ());
608             copy_blob_it.forward ();
609             delete (copy_blob_it.extract ());
610             i++;
611           }
612           copy_blob_it.forward ();
613           i++;
614         }
615       }
616       else
617         copy_outword = *(word->outword);
618 
619       copy_outword.baseline_denormalise (&word->denorm);
620       copy_blob_it.set_to_list (copy_outword.blob_list ());
621       char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
622       pixrow_it.set_to_list (pixrow_list);
623       pixrow_it.move_to_first ();
624 
625                                  // For debugging only
626       b_box = copy_outword.bounding_box ();
627       pixrow = pixrow_it.data ();
628 
629       blob_it.move_to_first ();
630       copy_blob_it.move_to_first ();
631       for (i = 0;
632         word->best_choice->unichar_string()[i] != '\0';
633         i++, pixrow_it.forward (), blob_it.forward (),
634       copy_blob_it.forward ()) {
635         if ((word->best_choice->unichar_string()[i] == 'm'
636           || (word->best_choice->unichar_string()[i] == 'r'
637           && word->best_choice->unichar_string()[i + 1] == 'n'))
638         && !word->reject_map[i].perm_rejected ()) {
639           if (tessedit_cluster_debug)
640             tprintf ("Sample %c to check found in %s, index %d\n",
641               word->best_choice->unichar_string()[i],
642               word->best_choice->unichar_string().string (), i);
643 
644           if (tessedit_demo_adaption)
645             tprintf
646               ("Sample %c to check found in %s (%d), index %d\n",
647               word->best_choice->unichar_string()[i],
648               word->best_choice->unichar_string().string (), word_number,
649               i);
650 
651           if (tessedit_matrix_match) {
652             TBOX copy_box = copy_blob_it.data ()->bounding_box ();
653 
654             sample = clip_sample (pixrow_it.data (),
655               imlines,
656               pix_box,
657               copy_outword.flag (W_INVERSE),
658               word->best_choice->unichar_string()[i]);
659 
660                                  //Clip failed
661             if (sample == NULL) {
662               tprintf
663                 ("Unable to clip sample from %s, index %d\n",
664                 word->best_choice->unichar_string().string (), i);
665               #ifndef SECURE_NAMES
666               if (tessedit_cluster_debug)
667                 tprintf ("Sample rejected (no sample)\n");
668               #endif
669               word->reject_map[i].setrej_mm_reject ();
670               if (word->best_choice->unichar_string()[i] == 'r') {
671                 word->reject_map[i + 1].setrej_mm_reject ();
672                 i++;
673               }
674               continue;
675             }
676           }
677           else
678             sample = new CHAR_SAMPLE(blob_it.data(),
679                                      &word->denorm,
680                                      word->best_choice->unichar_string()[i]);
681 
682           best_score = MAX_INT32;
683           best_char = '\0';
684           best_cluster = NULL;
685 
686           for (c_it.mark_cycle_pt ();
687           !c_it.cycled_list (); c_it.forward ()) {
688             if (c_it.data ()->character () != '\0') {
689               score = c_it.data ()->match_score (sample, this);
690               if (score < best_score) {
691                 best_cluster = c_it.data ();
692                 best_score = score;
693                 best_char = c_it.data ()->character ();
694               }
695             }
696           }
697 
698           if (best_score > tessedit_cluster_t1) {
699             #ifndef SECURE_NAMES
700             if (tessedit_cluster_debug)
701               tprintf ("Sample rejected (score %f)\n", best_score);
702             if (tessedit_demo_adaption)
703               tprintf ("Sample rejected (score %f)\n", best_score);
704             #endif
705             word->reject_map[i].setrej_mm_reject ();
706             if (word->best_choice->unichar_string()[i] == 'r')
707               word->reject_map[i + 1].setrej_mm_reject ();
708           }
709           else {
710             if (word->best_choice->unichar_string()[i] == best_char) {
711               #ifndef SECURE_NAMES
712               if (tessedit_cluster_debug)
713                 tprintf ("Sample accepted (score %f)\n",
714                   best_score);
715               if (tessedit_demo_adaption)
716                 tprintf ("Sample accepted (score %f)\n",
717                   best_score);
718               #endif
719               word->reject_map[i].setrej_mm_accept ();
720               if (word->best_choice->unichar_string()[i] == 'r')
721                 word->reject_map[i + 1].setrej_mm_accept ();
722             }
723             else {
724               #ifndef SECURE_NAMES
725               if (tessedit_cluster_debug)
726                 tprintf ("Sample rejected (char %c, score %f)\n",
727                   best_char, best_score);
728               if (tessedit_demo_adaption)
729                 tprintf ("Sample rejected (char %c, score %f)\n",
730                   best_char, best_score);
731               #endif
732               word->reject_map[i].setrej_mm_reject ();
733               if (word->best_choice->unichar_string()[i] == 'r')
734                 word->reject_map[i + 1].setrej_mm_reject ();
735             }
736           }
737 
738           if (tessedit_demo_adaption) {
739             if (strcmp (imagebasename.string (),
740               tessedit_demo_file.string ()) != 0
741               || word_number == tessedit_demo_word1
742             || word_number == tessedit_demo_word2) {
743 #ifndef GRAPHICS_DISABLED
744               demo_win =
745                 display_clip_image(&copy_outword,
746                                    page_image,
747                                    pixrow_list,
748                                    pix_box);
749 #endif
750               demo_word = word_number;
751               best_cluster->match_score (sample, this);
752               demo_word = 0;
753             }
754           }
755           if (word->best_choice->unichar_string()[i] == 'r')
756             i++;                 // Skip next character
757         }
758       }
759       delete[]imlines;           // Free array of imlines
760       delete pixrow_list;
761     }
762   }
763 }
764 
765 
766 
adapt_to_good_samples(WERD_RES * word,CHAR_SAMPLES_LIST * char_clusters,CHAR_SAMPLE_LIST * chars_waiting)767 void Tesseract::adapt_to_good_samples(WERD_RES *word,
768                                       CHAR_SAMPLES_LIST *char_clusters,
769                                       CHAR_SAMPLE_LIST *chars_waiting) {
770   PBLOB_LIST *blobs = word->outword->blob_list ();
771   PBLOB_IT blob_it(blobs);
772   inT16 i;
773   CHAR_SAMPLE *sample;
774   CHAR_SAMPLES_IT c_it = char_clusters;
775   CHAR_SAMPLE_IT cw_it = chars_waiting;
776   float score;
777   float best_score;
778   char best_char;
779   CHAR_SAMPLES *best_cluster;
780   PIXROW_LIST *pixrow_list;
781   PIXROW_IT pixrow_it;
782   IMAGELINE *imlines;            // lines of the image
783   TBOX pix_box;                   // box of imlines
784   // extent
785   WERD copy_outword;             // copy to denorm
786   TBOX b_box;
787   PBLOB_IT copy_blob_it;
788   PIXROW *pixrow = NULL;
789 
790   static inT32 word_number = 0;
791 
792 #ifndef GRAPHICS_DISABLED
793   ScrollView* demo_win = NULL;
794 #endif
795 
796   inT32 resolution = page_image.get_res ();
797 
798   word_number++;
799 
800   if (tessedit_test_cluster_input)
801     return;
802 
803   if (word->word->bounding_box ().height () > resolution / 3)
804     return;
805 
806   if (char_clusters->length () == 0) {
807     #ifndef SECURE_NAMES
808     if (tessedit_cluster_debug)
809       tprintf ("No clusters to use for adaption\n");
810     #endif
811     return;
812   }
813 
814   if (!cw_it.empty ()) {
815     complete_clustering(char_clusters, chars_waiting);
816     print_em_stats(char_clusters, chars_waiting);
817   }
818 
819   if ((!word_adaptable (word, tessedit_cluster_adaption_mode)
820   && word->reject_map.reject_count () != 0) || tessedit_mm_use_rejmap) {
821     if (tessedit_cluster_debug) {
822       tprintf ("\nChecking: \"%s\"  MAP ",
823         word->best_choice->unichar_string().string ());
824       word->reject_map.print (debug_fp);
825       tprintf ("\n");
826     }
827 
828     copy_outword = *(word->outword);
829     copy_outword.baseline_denormalise (&word->denorm);
830     copy_blob_it.set_to_list (copy_outword.blob_list ());
831     char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
832     pixrow_it.set_to_list (pixrow_list);
833     pixrow_it.move_to_first ();
834 
835                                  // For debugging only
836     b_box = copy_outword.bounding_box ();
837     pixrow = pixrow_it.data ();
838 
839     blob_it.move_to_first ();
840     copy_blob_it.move_to_first ();
841     for (i = 0;
842       word->best_choice->unichar_string()[i] != '\0';
843       i++, pixrow_it.forward (), blob_it.forward (),
844     copy_blob_it.forward ()) {
845       if (word->reject_map[i].recoverable ()
846       || (tessedit_mm_all_rejects && word->reject_map[i].rejected ())) {
847         TBOX copy_box = copy_blob_it.data ()->bounding_box ();
848 
849         if (tessedit_cluster_debug)
850           tprintf ("Sample %c to check found in %s, index %d\n",
851             word->best_choice->unichar_string()[i],
852             word->best_choice->unichar_string().string (), i);
853 
854         if (tessedit_demo_adaption)
855           tprintf ("Sample %c to check found in %s (%d), index %d\n",
856             word->best_choice->unichar_string()[i],
857             word->best_choice->unichar_string().string (),
858             word_number, i);
859 
860         sample = clip_sample (pixrow_it.data (),
861           imlines,
862           pix_box,
863           copy_outword.flag (W_INVERSE),
864           word->best_choice->unichar_string()[i]);
865 
866         if (sample == NULL) {    //Clip failed
867           tprintf ("Unable to clip sample from %s, index %d\n",
868             word->best_choice->unichar_string().string (), i);
869           #ifndef SECURE_NAMES
870           if (tessedit_cluster_debug)
871             tprintf ("Sample rejected (no sample)\n");
872           #endif
873           word->reject_map[i].setrej_mm_reject ();
874 
875           continue;
876         }
877 
878         best_score = MAX_INT32;
879         best_char = '\0';
880         best_cluster = NULL;
881 
882         for (c_it.mark_cycle_pt ();
883         !c_it.cycled_list (); c_it.forward ()) {
884           if (c_it.data ()->character () != '\0') {
885             score = c_it.data ()->match_score (sample, this);
886             if (score < best_score) {
887               best_cluster = c_it.data ();
888               best_score = score;
889               best_char = c_it.data ()->character ();
890             }
891           }
892         }
893 
894         if (best_score > tessedit_cluster_t1) {
895           #ifndef SECURE_NAMES
896           if (tessedit_cluster_debug)
897             tprintf ("Sample rejected (score %f)\n", best_score);
898           if (tessedit_demo_adaption)
899             tprintf ("Sample rejected (score %f)\n", best_score);
900           #endif
901           word->reject_map[i].setrej_mm_reject ();
902         }
903         else {
904           if (word->best_choice->unichar_string()[i] == best_char) {
905             #ifndef SECURE_NAMES
906             if (tessedit_cluster_debug)
907               tprintf ("Sample accepted (score %f)\n", best_score);
908             if (tessedit_demo_adaption)
909               tprintf ("Sample accepted (score %f)\n", best_score);
910             #endif
911             if (tessedit_test_adaption)
912               word->reject_map[i].setrej_minimal_rej_accept ();
913             else
914               word->reject_map[i].setrej_mm_accept ();
915           }
916           else {
917             #ifndef SECURE_NAMES
918             if (tessedit_cluster_debug)
919               tprintf ("Sample rejected (char %c, score %f)\n",
920                 best_char, best_score);
921             if (tessedit_demo_adaption)
922               tprintf ("Sample rejected (char %c, score %f)\n",
923                 best_char, best_score);
924             #endif
925             word->reject_map[i].setrej_mm_reject ();
926           }
927         }
928 
929         if (tessedit_demo_adaption) {
930           if (strcmp (imagebasename.string (),
931             tessedit_demo_file.string ()) != 0
932             || word_number == tessedit_demo_word1
933           || word_number == tessedit_demo_word2) {
934 #ifndef GRAPHICS_DISABLED
935             demo_win =
936               display_clip_image(&copy_outword,
937                                  page_image,
938                                  pixrow_list,
939                                  pix_box);
940 #endif
941             demo_word = word_number;
942             best_cluster->match_score (sample, this);
943             demo_word = 0;
944           }
945         }
946       }
947     }
948     delete[]imlines;             // Free array of imlines
949     delete pixrow_list;
950 
951     if (tessedit_cluster_debug) {
952       tprintf ("\nFinal: \"%s\"  MAP ",
953         word->best_choice->unichar_string().string ());
954       word->reject_map.print (debug_fp);
955       tprintf ("\n");
956     }
957   }
958 }
959 }  // namespace tesseract
960 
961 
print_em_stats(CHAR_SAMPLES_LIST * char_clusters,CHAR_SAMPLE_LIST * chars_waiting)962 void print_em_stats(CHAR_SAMPLES_LIST *char_clusters,
963                     CHAR_SAMPLE_LIST *chars_waiting) {
964   CHAR_SAMPLES_IT c_it = char_clusters;
965 
966   if (!tessedit_cluster_debug)
967     return;
968   #ifndef SECURE_NAMES
969   tprintf ("There are %d clusters and %d samples waiting\n",
970     char_clusters->length (), chars_waiting->length ());
971 
972   for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ())
973     c_it.data ()->print (debug_fp);
974   #endif
975   tprintf ("\n");
976 }
977 
978 
clip_sample(PIXROW * pixrow,IMAGELINE * imlines,TBOX pix_box,BOOL8 white_on_black,char c)979 CHAR_SAMPLE *clip_sample(              //lines of the image
980                          PIXROW *pixrow,
981                          IMAGELINE *imlines,
982                          TBOX pix_box,  //box of imlines extent
983                          BOOL8 white_on_black,
984                          char c) {
985   TBOX b_box = pixrow->bounding_box ();
986   float baseline_pos = 0;
987   inT32 resolution = page_image.get_res ();
988 
989   if (!b_box.null_box ()) {
990     ASSERT_HOST (b_box.width () < page_image.get_xsize () &&
991       b_box.height () < page_image.get_ysize ());
992 
993     if (b_box.width () > resolution || b_box.height () > resolution) {
994       tprintf ("clip sample: sample too big (%d x %d)\n",
995         b_box.width (), b_box.height ());
996 
997       return NULL;
998     }
999 
1000     IMAGE *image = new (IMAGE);
1001     if (image->create (b_box.width (), b_box.height (), 1) == -1) {
1002       tprintf ("clip sample: create image failed (%d x %d)\n",
1003         b_box.width (), b_box.height ());
1004 
1005       delete image;
1006       return NULL;
1007     }
1008 
1009     if (!white_on_black)
1010       invert_image(image);  // Set background to white
1011     pixrow->char_clip_image (imlines, pix_box, NULL, *image, baseline_pos);
1012     if (white_on_black)
1013       invert_image(image);  //invert white on black for scaling &NN
1014     return new CHAR_SAMPLE (image, c);
1015   }
1016   else
1017     return NULL;
1018 }
1019 
1020 
1021 #ifndef GRAPHICS_DISABLED
display_cluster_prototypes(CHAR_SAMPLES_LIST * char_clusters)1022 void display_cluster_prototypes(CHAR_SAMPLES_LIST *char_clusters) {
1023   inT16 proto_number = 0;
1024   CHAR_SAMPLES_IT c_it = char_clusters;
1025   char title[WINDOWNAMESIZE];
1026 
1027   for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
1028     proto_number++;
1029 
1030     #ifndef SECURE_NAMES
1031     tprintf ("Displaying proto number %d\n", proto_number);
1032     #endif
1033 
1034     if (c_it.data ()->prototype () != NULL) {
1035       sprintf (title, "Proto - %d", proto_number);
1036       display_image (c_it.data ()->prototype ()->make_image (),
1037         title, (proto_number - 1) * 400, 0, FALSE);
1038     }
1039   }
1040 }
1041 #endif
1042 
1043 // *********************************************************************
1044 // Simplistic routines to test the effect of rejecting ems and fullstops
1045 // *********************************************************************
1046 
reject_all_ems(WERD_RES * word)1047 void reject_all_ems(WERD_RES *word) {
1048   inT16 i;
1049 
1050   for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1051     if (word->best_choice->unichar_string()[i] == 'm')
1052                                  // reject all ems
1053       word->reject_map[i].setrej_mm_reject ();
1054   }
1055 }
1056 
1057 
reject_all_fullstops(WERD_RES * word)1058 void reject_all_fullstops(WERD_RES *word) {
1059   inT16 i;
1060 
1061   for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1062     if (word->best_choice->unichar_string()[i] == '.')
1063                                  // reject all fullstops
1064       word->reject_map[i].setrej_mm_reject ();
1065   }
1066 }
1067 
1068 namespace tesseract {
reject_suspect_ems(WERD_RES * word)1069 void Tesseract::reject_suspect_ems(WERD_RES *word) {
1070   inT16 i;
1071 
1072   if (!word_adaptable (word, tessedit_cluster_adaption_mode))
1073   for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1074     if (word->best_choice->unichar_string()[i] == 'm' && suspect_em (word, i))
1075                                  // reject all ems
1076       word->reject_map[i].setrej_mm_reject ();
1077   }
1078 }
1079 }  // namespace tesseract
1080 
1081 
reject_suspect_fullstops(WERD_RES * word)1082 void reject_suspect_fullstops(WERD_RES *word) {
1083   inT16 i;
1084 
1085   for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1086     if (word->best_choice->unichar_string()[i] == '.'
1087       && suspect_fullstop (word, i))
1088                                  // reject all commas
1089       word->reject_map[i].setrej_mm_reject ();
1090   }
1091 }
1092 
1093 
suspect_em(WERD_RES * word,inT16 index)1094 BOOL8 suspect_em(WERD_RES *word, inT16 index) {
1095   PBLOB_LIST *blobs = word->outword->blob_list ();
1096   PBLOB_IT blob_it(blobs);
1097   inT16 j;
1098 
1099   for (j = 0; j < index; j++)
1100     blob_it.forward ();
1101 
1102   return (blob_it.data ()->out_list ()->length () != 1);
1103 }
1104 
1105 
suspect_fullstop(WERD_RES * word,inT16 i)1106 BOOL8 suspect_fullstop(WERD_RES *word, inT16 i) {
1107   float aspect_ratio;
1108   PBLOB_LIST *blobs = word->outword->blob_list ();
1109   PBLOB_IT blob_it(blobs);
1110   inT16 j;
1111   TBOX box;
1112   inT16 width;
1113   inT16 height;
1114 
1115   for (j = 0; j < i; j++)
1116     blob_it.forward ();
1117 
1118   box = blob_it.data ()->bounding_box ();
1119 
1120   width = box.width ();
1121   height = box.height ();
1122 
1123   aspect_ratio = ((width > height) ? ((float) width) / height :
1124   ((float) height) / width);
1125 
1126   return (aspect_ratio > tessed_fullstop_aspect_ratio);
1127 }
1128