• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author:      Daria Antonova
5 // Created:     Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #include "tessdatamanager.h"
21 
22 #include <stdio.h>
23 
24 #include "serialis.h"
25 #include "strngs.h"
26 #include "tprintf.h"
27 #include "varable.h"
28 
// Global boolean parameters declared through the BOOL_VAR macro, one per
// dawg kind (system words, frequent words, punctuation patterns, number
// patterns). Each is given `true` as its second argument (presumably the
// default value - confirm against the BOOL_VAR definition) and a short help
// string. None of them is referenced by the code visible in this file;
// presumably they are read by the dawg-loading code elsewhere.
29 BOOL_VAR(global_load_system_dawg, true, "Load system word dawg.");
30 BOOL_VAR(global_load_freq_dawg, true, "Load frequent word dawg.");
31 BOOL_VAR(global_load_punc_dawg, true, "Load dawg with punctuation patterns.");
32 BOOL_VAR(global_load_number_dawg, true, "Load dawg with number patterns.");
33 
// Global integer parameter declared through the INT_VAR macro with 0 as its
// second argument. TessdataManager::Init tests it for non-zero and, when set,
// prints the number of entries it loaded followed by every offset.
34 INT_VAR(global_tessdata_manager_debug_level, 0,
35         "Debug level for TessdataManager functions.");
36 
37 namespace tesseract {
38 
Init(const char * data_file_name)39 void TessdataManager::Init(const char *data_file_name) {
40   int i;
41   data_file_ = fopen(data_file_name, "rb");
42   if (data_file_ == NULL) {
43     tprintf("Error openning data file %s\n", data_file_name);
44     exit(1);
45   }
46   fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
47   bool swap = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
48   if (swap) {
49     actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
50   }
51   ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
52   fread(offset_table_, sizeof(inT64),
53         actual_tessdata_num_entries_, data_file_);
54   if (swap) {
55     for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
56       offset_table_[i] = reverse64(offset_table_[i]);
57     }
58   }
59   if (global_tessdata_manager_debug_level) {
60     tprintf("TessdataManager loaded %d types of tesseract data files.\n",
61             actual_tessdata_num_entries_);
62     for (i = 0; i < actual_tessdata_num_entries_; ++i) {
63       tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
64     }
65   }
66 }
67 
GetFilePtr(const char * language_data_path_prefix,const char * file_suffix,bool required_file,bool text_file)68 FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix,
69                                   const char *file_suffix, bool required_file,
70                                   bool text_file) {
71   STRING file_name = language_data_path_prefix;
72   file_name += file_suffix;
73   FILE *file_ptr = fopen(file_name.string(), text_file ? "r" : "rb");
74   if (required_file && (file_ptr == NULL)) {
75     tprintf("Error openning required file %s\n", file_name.string());
76     exit(1);
77   }
78   return file_ptr;
79 }
80 
CopyFile(FILE * input_file,FILE * output_file,bool newline_end)81 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
82                                bool newline_end) {
83   int buffer_size = 1024;
84   char *chunk = new char[buffer_size];
85   int bytes_read;
86   char last_char = 0x0;
87   while ((bytes_read = fread(chunk, sizeof(char),
88                              buffer_size, input_file))) {
89     fwrite(chunk, sizeof(char), bytes_read, output_file);
90     last_char = chunk[bytes_read-1];
91   }
92   if (newline_end) ASSERT_HOST(last_char == '\n');
93   delete[] chunk;
94 }
95 
CombineDataFiles(const char * language_data_path_prefix,const char * output_filename)96 void TessdataManager::CombineDataFiles(
97     const char *language_data_path_prefix,
98     const char *output_filename) {
99   FILE *file_ptr;
100   STRING file_name;
101   int i;
102   inT64 offset_table[TESSDATA_NUM_ENTRIES];
103   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
104   FILE *output_file = fopen(output_filename, "wb");
105   // Leave some space for recording the offset_table.
106   fseek(output_file,
107         sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
108 
109   // Record language-specific tesseract config file.
110   file_ptr = GetFilePtr(language_data_path_prefix,
111                         kLangConfigFileSuffix, false, true);
112   if (file_ptr != NULL) {
113     offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file);
114     CopyFile(file_ptr, output_file, true);
115     fclose(file_ptr);
116   }
117 
118   // Record unicharset.
119   file_ptr = GetFilePtr(language_data_path_prefix,
120                         kUnicharsetFileSuffix, true, true);
121   offset_table[TESSDATA_UNICHARSET] = ftell(output_file);
122   CopyFile(file_ptr, output_file, true);
123   fclose(file_ptr);
124 
125   // Record ambiguities.
126   file_ptr = GetFilePtr(language_data_path_prefix,
127                         kAmbigsFileSuffix, false, true);
128   if (file_ptr != NULL) {
129     offset_table[TESSDATA_AMBIGS] = ftell(output_file);
130     CopyFile(file_ptr, output_file, true);
131     fclose(file_ptr);
132   }
133 
134   // Record inttemp.
135   file_ptr =
136     GetFilePtr(language_data_path_prefix,
137                kBuiltInTemplatesFileSuffix, false, false);
138   if (file_ptr != NULL) {
139     offset_table[TESSDATA_INTTEMP] = ftell(output_file);
140     CopyFile(file_ptr, output_file, false);
141     fclose(file_ptr);
142 
143     // Record pffmtable.
144     file_ptr = GetFilePtr(language_data_path_prefix,
145                           kBuiltInCutoffsFileSuffix, true, true);
146     offset_table[TESSDATA_PFFMTABLE] = ftell(output_file);
147     CopyFile(file_ptr, output_file, true);
148     fclose(file_ptr);
149 
150     // Record normproto.
151     file_ptr = GetFilePtr(language_data_path_prefix,
152                           kNormProtoFileSuffix, true, true);
153     offset_table[TESSDATA_NORMPROTO] = ftell(output_file);
154     CopyFile(file_ptr, output_file, true);
155     fclose(file_ptr);
156   }
157 
158   // Record dawgs.
159   file_ptr = GetFilePtr(language_data_path_prefix,
160                         kPuncDawgFileSuffix, false, false);
161   if (file_ptr != NULL) {
162     offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file);
163     CopyFile(file_ptr, output_file, false);
164     fclose(file_ptr);
165   }
166 
167   file_ptr = GetFilePtr(language_data_path_prefix,
168                         kSystemDawgFileSuffix, false, false);
169   if (file_ptr != NULL) {
170     offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file);
171     CopyFile(file_ptr, output_file, false);
172     fclose(file_ptr);
173   }
174 
175   file_ptr = GetFilePtr(language_data_path_prefix,
176                         kNumberDawgFileSuffix, false, false);
177   if (file_ptr != NULL) {
178     offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file);
179     CopyFile(file_ptr, output_file, false);
180     fclose(file_ptr);
181   }
182 
183   file_ptr = GetFilePtr(language_data_path_prefix,
184                         kFreqDawgFileSuffix, false, false);
185   if (file_ptr != NULL) {
186     offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file);
187     CopyFile(file_ptr, output_file, false);
188     fclose(file_ptr);
189   }
190 
191   fseek(output_file, 0, SEEK_SET);
192   inT32 num_entries = TESSDATA_NUM_ENTRIES;
193   fwrite(&num_entries, sizeof(inT32), 1, output_file);
194   fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
195   fclose(output_file);
196 
197   tprintf("TessdataManager combined tesseract data files.\n");
198   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
199     tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
200   }
201 }
202 
203 }  // namespace tesseract
204