• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**********************************************************************
2  * File:        tessedit.cpp  (Formerly tessedit.c)
3  * Description: Main program for merge of tess and editor.
4  * Author:					Ray Smith
5  * Created:					Tue Jan 07 15:21:46 GMT 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 //#include                                                      <osfcn.h>
22 //#include                                                      <signal.h>
23 //#include                                                      <time.h>
24 //#include                                                      <unistd.h>
25 #include          "tfacep.h"     //must be before main.h
26 //#include                                                      "fileerr.h"
27 #include          "stderr.h"
28 #include          "basedir.h"
29 #include          "tessvars.h"
30 //#include                                                      "debgwin.h"
31 //#include                                      "epapdest.h"
32 #include          "control.h"
33 #include          "imgs.h"
34 #include          "reject.h"
35 #include          "pageres.h"
36 //#include                                                      "gpapdest.h"
37 #include          "mainblk.h"
38 #include          "nwmain.h"
39 #include          "pgedit.h"
40 #include          "ocrshell.h"
41 #include          "tprintf.h"
42 //#include                                      "ipeerr.h"
43 //#include                                                      "restart.h"
44 #include          "tessedit.h"
45 //#include                                                      "fontfind.h"
46 #include "permute.h"
47 #include "permdawg.h"
48 #include "stopper.h"
49 #include "adaptmatch.h"
50 #include "intmatcher.h"
51 #include "chop.h"
52 #include "efio.h"
53 #include "danerror.h"
54 #include "globals.h"
55 #include "tesseractclass.h"
56 #include "varable.h"
57 
58 /*
59 ** Include automatically generated configuration file if running autoconf
60 */
61 #ifdef HAVE_CONFIG_H
62 #include "config_auto.h"
63 #endif
64 // Includes libtiff if HAVE_LIBTIFF is defined
65 #ifdef HAVE_LIBTIFF
66 #include "tiffio.h"
67 
68 #endif
69 
70 #include          "notdll.h"     //phils nn stuff
71 
72 #define VARDIR        "configs/" /*variables files */
73                                  //config under api
74 #define API_CONFIG      "configs/api_config"
75 #define EXTERN
76 
77 EXTERN BOOL_EVAR (tessedit_write_vars, FALSE, "Write all vars to file");
78 
79 ETEXT_DESC *global_monitor = NULL;  // progress monitor
80 
81 namespace tesseract {
82 
83 // Read a "config" file containing a set of variable, value pairs.
84 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
85 // and also accepts a relative or absolute path name.
read_config_file(const char * filename,bool global_only)86 void Tesseract::read_config_file(const char *filename, bool global_only) {
87   STRING path = datadir;
88   path += "configs/";
89   path += filename;
90   FILE* fp;
91   if ((fp = fopen(path.string(), "r")) != NULL) {
92     fclose(fp);
93   } else {
94     path = datadir;
95     path += "tessconfigs/";
96     path += filename;
97     if ((fp = fopen(path.string(), "r")) != NULL) {
98       fclose(fp);
99     } else {
100       path = filename;
101     }
102   }
103   read_variables_file(path.string(), global_only);
104 }
105 
106 // Returns false if a unicharset file for the specified language was not found
107 // or was invalid.
108 // This function initializes TessdataManager. After TessdataManager is
109 // no longer needed, TessdataManager::End() should be called.
init_tesseract_lang_data(const char * arg0,const char * textbase,const char * language,char ** configs,int configs_size,bool configs_global_only)110 bool Tesseract::init_tesseract_lang_data(
111     const char *arg0, const char *textbase, const char *language,
112     char **configs, int configs_size, bool configs_global_only) {
113   FILE *var_file;
114   static char c_path[MAX_PATH];  //path for c code
115 
116   // Set the basename, compute the data directory.
117   main_setup(arg0, textbase);
118   debug_window_on.set_value (FALSE);
119 
120   if (tessedit_write_vars) {
121     var_file = fopen ("edited.cfg", "w");
122     if (var_file != NULL) {
123       print_variables(var_file);
124       fclose(var_file);
125     }
126   }
127   strcpy (c_path, datadir.string ());
128   c_path[strlen (c_path) - strlen (m_data_sub_dir.string ())] = '\0';
129   demodir = c_path;
130 
131   // Set the language data path prefix
132   lang = language != NULL ? language : "eng";
133   language_data_path_prefix = datadir;
134   language_data_path_prefix += lang;
135   language_data_path_prefix += ".";
136 
137   // Load tesseract variables from config files.
138   for (int i = 0; i < configs_size; ++i) {
139     read_config_file(configs[i], configs_global_only);
140   }
141 
142   // Initialize TessdataManager.
143   STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
144   tessdata_manager.Init(tessdata_path.string());
145 
146   // If a language specific config file (lang.config) exists, load it in.
147   if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
148     read_variables_from_fp(tessdata_manager.GetDataFilePtr(),
149                            tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
150                            false);
151     if (global_tessdata_manager_debug_level) {
152       tprintf("Loaded language config file\n");
153   }
154 }
155 
156   // Load the unicharset
157   if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
158       !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
159     return false;
160   }
161   if (unicharset.size() > MAX_NUM_CLASSES) {
162     tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
163     return false;
164   }
165   if (global_tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
166 
167   if (!global_tessedit_ambigs_training &&
168       tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
169     unichar_ambigs.LoadUnicharAmbigs(
170         tessdata_manager.GetDataFilePtr(),
171         tessdata_manager.GetEndOffset(TESSDATA_AMBIGS),
172         &unicharset);
173     if (global_tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
174   }
175   return true;
176 }
177 
init_tesseract(const char * arg0,const char * textbase,const char * language,char ** configs,int configs_size,bool configs_global_only)178 int Tesseract::init_tesseract(
179     const char *arg0, const char *textbase, const char *language,
180     char **configs, int configs_size, bool configs_global_only) {
181   if (!init_tesseract_lang_data(arg0, textbase, language, configs,
182                                 configs_size, configs_global_only)) {
183     return -1;
184   }
185   start_recog(textbase);
186   tessdata_manager.End();
187   return 0;                      //Normal exit
188 }
189 
190 // Init everything except the language model
init_tesseract_classifier(const char * arg0,const char * textbase,const char * language,char ** configs,int configs_size,bool configs_global_only)191 int Tesseract::init_tesseract_classifier(
192     const char *arg0, const char *textbase, const char *language,
193     char **configs, int configs_size, bool configs_global_only) {
194   if (!init_tesseract_lang_data (arg0, textbase, language, configs,
195                                  configs_size, configs_global_only)) {
196     return -1;
197   }
198   // Dont initialize the permuter.
199   program_editup(textbase, false);
200   tessdata_manager.End();
201   return 0;
202 }
203 
204 // init the LM component
init_tesseract_lm(const char * arg0,const char * textbase,const char * language)205 int Tesseract::init_tesseract_lm(const char *arg0,
206                    const char *textbase,
207                    const char *language) {
208   init_tesseract_lang_data(arg0, textbase, language, NULL, 0, false);
209   getDict().init_permute();
210   tessdata_manager.End();
211   return 0;
212 }
213 
end_tesseract()214 void Tesseract::end_tesseract() {
215   end_recog();
216 }
217 
// Command type identifiers. The values are consecutive, starting at zero.
enum CMD_EVENTS {
  ACTION_1_CMD_EVENT = 0,
  RECOG_WERDS = 1,
  RECOG_PSEUDO = 2,
  ACTION_2_CMD_EVENT = 3
};
227 
228 }  // namespace tesseract
229 
#ifdef _TIFFIO_
// Reads the current directory of an open TIFF file into *image.
//
// The image is (re)created with the TIFF's width, height and bits per pixel
// (bits-per-sample times samples-per-pixel), then filled one scanline at a
// time by a raw byte copy. If the photometric interpretation is anything
// other than min-is-black the image is inverted afterwards, because
// Tesseract's internal representation is 0-is-black.
//
// On error (missing width/height tag, no image buffer, allocation failure,
// scanline read failure) a message is printed with tprintf and the function
// returns early; rows not yet read are left as image->create() set them.
//
// This will go badly wrong with one of the more exotic tiff formats,
// but the majority will work OK.
void read_tiff_image(TIFF* tif, IMAGE* image) {
  // Width and length have no default value, so give up if either is absent
  // rather than sizing the image from uninitialized data.
  uint32 image_width = 0;
  uint32 image_height = 0;
  if (!TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &image_width) ||
      !TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &image_height)) {
    tprintf("Error: TIFF image has no width or height\n");
    return;
  }

  // These tags are optional; the defaulted getter fills in the TIFF default
  // (1 for both) when the file omits them.
  uint16 bits_per_sample = 1;
  uint16 samples_per_pixel = 1;
  TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bits_per_sample);
  TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &samples_per_pixel);

  // NOTE(review): min-is-black is assumed when the tag is missing - confirm
  // this is the preferred fallback.
  uint16 photometric = PHOTOMETRIC_MINISBLACK;
  TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometric);

  inT16 bpp = static_cast<inT16>(bits_per_sample);
  if (samples_per_pixel > 1)
    bpp *= samples_per_pixel;

  // Tesseract's internal representation is 0-is-black, so only a
  // min-is-black image can be used as-is; everything else is inverted.
  const bool min_is_black = (photometric == PHOTOMETRIC_MINISBLACK);

  image->create(image_width, image_height, bpp);
  uinT8* dest_buf = image->get_buffer();
  if (dest_buf == NULL) {
    tprintf("Error: no image buffer available for TIFF data\n");
    return;
  }

  const tsize_t scanline_size = TIFFScanlineSize(tif);
  tdata_t buf = scanline_size > 0 ? _TIFFmalloc(scanline_size) : NULL;
  if (buf == NULL) {
    tprintf("Error: could not allocate TIFF scanline buffer\n");
    return;
  }

  // Destination rows are assumed to be packed at bytes_per_line each (as in
  // the original code); never copy more than libtiff actually decoded.
  const size_t bytes_per_line =
      (static_cast<size_t>(image_width) * bpp + 7) / 8;
  const size_t copy_bytes =
      static_cast<size_t>(scanline_size) < bytes_per_line
          ? static_cast<size_t>(scanline_size)
          : bytes_per_line;

  for (uint32 y = 0; y < image_height; ++y) {
    if (TIFFReadScanline(tif, buf, y) < 0) {
      tprintf("Error: failed to read TIFF scanline\n");
      break;
    }
    memcpy(dest_buf, buf, copy_bytes);
    dest_buf += bytes_per_line;
  }
  _TIFFfree(buf);

  if (!min_is_black)
    invert_image(image);
}
#endif
267