• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        genericvector.h
3 // Description: Functions for producing classifications
4 //              for the input to ambigstraining.
5 // Author:      Daria Antonova
6 // Created:     Mon Jun 23 11:26:43 PDT 2008
7 //
8 // (C) Copyright 2007, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
19 ///////////////////////////////////////////////////////////////////////
20 
21 #include "ambigs.h"
22 
23 #include "applybox.h"
24 #include "boxread.h"
25 #include "control.h"
26 #include "permute.h"
27 #include "ratngs.h"
28 #include "reject.h"
29 #include "stopper.h"
30 #include "tesseractclass.h"
31 
32 namespace tesseract {
33 
34 // Sets flags necessary for ambigs training mode.
35 // Opens and returns the pointer to the output file.
init_ambigs_training(const STRING & fname)36 FILE *Tesseract::init_ambigs_training(const STRING &fname) {
37   permute_only_top = 1;                        // use only top choice permuter
38   tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
39   tessedit_ok_mode.set_value(0);               // turn off context checking
40   tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
41   save_best_choices.set_value(1);              // save individual char choices
42   stopper_no_acceptable_choices.set_value(1);  // explore all segmentations
43   save_raw_choices.set_value(1);               // save raw choices
44 
45   // Open ambigs output file.
46   STRING output_fname = fname;
47   const char *lastdot = strrchr(output_fname.string(), '.');
48   if (lastdot != NULL) {
49     output_fname[lastdot - output_fname.string()] = '\0';
50   }
51   output_fname += ".txt";
52   FILE *output_file;
53   if (!(output_file = fopen(output_fname.string(), "a+"))) {
54     CANTOPENFILE.error("ambigs_training", EXIT,
55                        "Can't open box file %s\n", output_fname.string());
56   }
57   return output_file;
58 }
59 
60 // This function takes tif/box pair of files and runs recognition on the image,
61 // while making sure that the word bounds that tesseract identified roughly
62 // match to those specified by the input box file. For each word (ngram in a
63 // single bounding box from the input box file) it outputs the ocred result,
64 // the correct label, rating and certainty.
ambigs_training_segmented(const STRING & fname,PAGE_RES * page_res,volatile ETEXT_DESC * monitor,FILE * output_file)65 void Tesseract::ambigs_training_segmented(const STRING &fname,
66                                           PAGE_RES *page_res,
67                                           volatile ETEXT_DESC *monitor,
68                                           FILE *output_file) {
69   STRING box_fname = fname;
70   const char *lastdot = strrchr(box_fname.string(), '.');
71   if (lastdot != NULL) {
72     box_fname[lastdot - box_fname.string()] = '\0';
73   }
74   box_fname += ".box";
75   FILE *box_file;
76   if (!(box_file = fopen(box_fname.string(), "r"))) {
77     CANTOPENFILE.error("ambigs_training", EXIT,
78                        "Can't open box file %s\n", box_fname.string());
79   }
80 
81   static PAGE_RES_IT page_res_it;
82   page_res_it.page_res = page_res;
83   page_res_it.restart_page();
84   int x_min, y_min, x_max, y_max;
85   char label[UNICHAR_LEN * 10];
86 
87   // Process all the words on this page.
88   while (page_res_it.word() != NULL &&
89          read_next_box(applybox_page, box_file, label,
90                        &x_min, &y_min, &x_max, &y_max)) {
91     // Init bounding box of the current word bounding box and from box file.
92     TBOX box = TBOX(ICOORD(x_min, y_min), ICOORD(x_max, y_max));
93     TBOX word_box(page_res_it.word()->word->bounding_box());
94     bool one_word = true;
95     // Check whether the bounding box of the next word overlaps with the
96     // current box from box file.
97     while (page_res_it.next_word() != NULL &&
98            box.x_overlap(page_res_it.next_word()->word->bounding_box())) {
99       word_box = word_box.bounding_union(
100           page_res_it.next_word()->word->bounding_box());
101       page_res_it.forward();
102       one_word = false;
103     }
104     if (!word_box.major_overlap(box)) {
105       if (!word_box.x_overlap(box)) {
106         // We must be looking at the word that belongs in the "next" bounding
107         // box from the box file. The ngram that was supposed to appear in
108         // the current box read from the box file must have been dropped by
109         // tesseract as noise.
110         tprintf("Word %s was dropped as noise.\n", label);
111         continue;  // stay on this blob, but read next box from box file
112       } else {
113         tprintf("Error: Insufficient overlap for word box"
114                 " and box from file for %s\n", label);
115         word_box.print();
116         box.print();
117         exit(1);
118       }
119     }
120     // Skip recognizing the ngram if tesseract is sure it's not
121     // one word, otherwise run one recognition pass on this word.
122     if (!one_word) {
123       tprintf("Tesseract segmented %s as multiple words\n", label);
124     } else {
125       ambigs_classify_and_output(&page_res_it, label, output_file);
126     }
127     page_res_it.forward();
128   }
129   fclose(box_file);
130 }
131 
132 // Run classify_word_pass1() on the current word. Output tesseract's raw choice
133 // as a result of the classification. For words labeled with a single unichar
134 // also output all alternatives from blob_choices of the best choice.
ambigs_classify_and_output(PAGE_RES_IT * page_res_it,const char * label,FILE * output_file)135 void Tesseract::ambigs_classify_and_output(PAGE_RES_IT *page_res_it,
136                                            const char *label,
137                                            FILE *output_file) {
138   int offset;
139   // Classify word.
140   classify_word_pass1(page_res_it->word(), page_res_it->row()->row,
141                       page_res_it->block()->block,
142                       FALSE, NULL, NULL);
143   WERD_CHOICE *best_choice = page_res_it->word()->best_choice;
144   ASSERT_HOST(best_choice != NULL);
145   ASSERT_HOST(best_choice->blob_choices() != NULL);
146 
147   // Compute the number of unichars in the label.
148   int label_num_unichars = 0;
149   int step = 1;  // should be non-zero on the first iteration
150   for (offset = 0; label[offset] != '\0' && step > 0;
151        step = getDict().getUnicharset().step(label + offset),
152        offset += step, ++label_num_unichars);
153   if (step == 0) {
154     tprintf("Not outputting illegal unichar %s\n", label);
155     return;
156   }
157 
158   // Output all classifier choices for the unigrams (1-1 classifications).
159   if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
160     BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
161     outer_blob_choice_it.set_to_list(best_choice->blob_choices());
162     BLOB_CHOICE_IT blob_choice_it;
163     blob_choice_it.set_to_list(outer_blob_choice_it.data());
164     for (blob_choice_it.mark_cycle_pt();
165          !blob_choice_it.cycled_list();
166          blob_choice_it.forward()) {
167       BLOB_CHOICE *blob_choice = blob_choice_it.data();
168       if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
169         fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
170                unicharset.id_to_unichar(blob_choice->unichar_id()),
171                label, blob_choice->rating(), blob_choice->certainty());
172       }
173     }
174   }
175   // Output the raw choice for succesful non 1-1 classifications.
176   getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
177 }
178 
179 }  // namespace tesseract
180