1 ///////////////////////////////////////////////////////////////////////
2 // File: tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19
20 #include "tessdatamanager.h"
21
22 #include <stdio.h>
23
24 #include "serialis.h"
25 #include "strngs.h"
26 #include "tprintf.h"
27 #include "varable.h"
28
// Boolean flags declared through the BOOL_VAR macro. Each defaults to true
// and carries the quoted text as its description. None of them is read in
// the visible part of this file; presumably the dawg-loading code elsewhere
// consults them -- TODO confirm against the callers.
29 BOOL_VAR(global_load_system_dawg, true, "Load system word dawg.");
30 BOOL_VAR(global_load_freq_dawg, true, "Load frequent word dawg.");
31 BOOL_VAR(global_load_punc_dawg, true, "Load dawg with punctuation patterns.");
32 BOOL_VAR(global_load_number_dawg, true, "Load dawg with number patterns.");
33
// Integer flag, default 0. When non-zero, TessdataManager::Init prints the
// number of entries and every offset it read from the data file.
34 INT_VAR(global_tessdata_manager_debug_level, 0,
35 "Debug level for TessdataManager functions.");
36
37 namespace tesseract {
38
Init(const char * data_file_name)39 void TessdataManager::Init(const char *data_file_name) {
40 int i;
41 data_file_ = fopen(data_file_name, "rb");
42 if (data_file_ == NULL) {
43 tprintf("Error openning data file %s\n", data_file_name);
44 exit(1);
45 }
46 fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
47 bool swap = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
48 if (swap) {
49 actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
50 }
51 ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
52 fread(offset_table_, sizeof(inT64),
53 actual_tessdata_num_entries_, data_file_);
54 if (swap) {
55 for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
56 offset_table_[i] = reverse64(offset_table_[i]);
57 }
58 }
59 if (global_tessdata_manager_debug_level) {
60 tprintf("TessdataManager loaded %d types of tesseract data files.\n",
61 actual_tessdata_num_entries_);
62 for (i = 0; i < actual_tessdata_num_entries_; ++i) {
63 tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
64 }
65 }
66 }
67
GetFilePtr(const char * language_data_path_prefix,const char * file_suffix,bool required_file,bool text_file)68 FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix,
69 const char *file_suffix, bool required_file,
70 bool text_file) {
71 STRING file_name = language_data_path_prefix;
72 file_name += file_suffix;
73 FILE *file_ptr = fopen(file_name.string(), text_file ? "r" : "rb");
74 if (required_file && (file_ptr == NULL)) {
75 tprintf("Error openning required file %s\n", file_name.string());
76 exit(1);
77 }
78 return file_ptr;
79 }
80
CopyFile(FILE * input_file,FILE * output_file,bool newline_end)81 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
82 bool newline_end) {
83 int buffer_size = 1024;
84 char *chunk = new char[buffer_size];
85 int bytes_read;
86 char last_char = 0x0;
87 while ((bytes_read = fread(chunk, sizeof(char),
88 buffer_size, input_file))) {
89 fwrite(chunk, sizeof(char), bytes_read, output_file);
90 last_char = chunk[bytes_read-1];
91 }
92 if (newline_end) ASSERT_HOST(last_char == '\n');
93 delete[] chunk;
94 }
95
CombineDataFiles(const char * language_data_path_prefix,const char * output_filename)96 void TessdataManager::CombineDataFiles(
97 const char *language_data_path_prefix,
98 const char *output_filename) {
99 FILE *file_ptr;
100 STRING file_name;
101 int i;
102 inT64 offset_table[TESSDATA_NUM_ENTRIES];
103 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
104 FILE *output_file = fopen(output_filename, "wb");
105 // Leave some space for recording the offset_table.
106 fseek(output_file,
107 sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
108
109 // Record language-specific tesseract config file.
110 file_ptr = GetFilePtr(language_data_path_prefix,
111 kLangConfigFileSuffix, false, true);
112 if (file_ptr != NULL) {
113 offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file);
114 CopyFile(file_ptr, output_file, true);
115 fclose(file_ptr);
116 }
117
118 // Record unicharset.
119 file_ptr = GetFilePtr(language_data_path_prefix,
120 kUnicharsetFileSuffix, true, true);
121 offset_table[TESSDATA_UNICHARSET] = ftell(output_file);
122 CopyFile(file_ptr, output_file, true);
123 fclose(file_ptr);
124
125 // Record ambiguities.
126 file_ptr = GetFilePtr(language_data_path_prefix,
127 kAmbigsFileSuffix, false, true);
128 if (file_ptr != NULL) {
129 offset_table[TESSDATA_AMBIGS] = ftell(output_file);
130 CopyFile(file_ptr, output_file, true);
131 fclose(file_ptr);
132 }
133
134 // Record inttemp.
135 file_ptr =
136 GetFilePtr(language_data_path_prefix,
137 kBuiltInTemplatesFileSuffix, false, false);
138 if (file_ptr != NULL) {
139 offset_table[TESSDATA_INTTEMP] = ftell(output_file);
140 CopyFile(file_ptr, output_file, false);
141 fclose(file_ptr);
142
143 // Record pffmtable.
144 file_ptr = GetFilePtr(language_data_path_prefix,
145 kBuiltInCutoffsFileSuffix, true, true);
146 offset_table[TESSDATA_PFFMTABLE] = ftell(output_file);
147 CopyFile(file_ptr, output_file, true);
148 fclose(file_ptr);
149
150 // Record normproto.
151 file_ptr = GetFilePtr(language_data_path_prefix,
152 kNormProtoFileSuffix, true, true);
153 offset_table[TESSDATA_NORMPROTO] = ftell(output_file);
154 CopyFile(file_ptr, output_file, true);
155 fclose(file_ptr);
156 }
157
158 // Record dawgs.
159 file_ptr = GetFilePtr(language_data_path_prefix,
160 kPuncDawgFileSuffix, false, false);
161 if (file_ptr != NULL) {
162 offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file);
163 CopyFile(file_ptr, output_file, false);
164 fclose(file_ptr);
165 }
166
167 file_ptr = GetFilePtr(language_data_path_prefix,
168 kSystemDawgFileSuffix, false, false);
169 if (file_ptr != NULL) {
170 offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file);
171 CopyFile(file_ptr, output_file, false);
172 fclose(file_ptr);
173 }
174
175 file_ptr = GetFilePtr(language_data_path_prefix,
176 kNumberDawgFileSuffix, false, false);
177 if (file_ptr != NULL) {
178 offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file);
179 CopyFile(file_ptr, output_file, false);
180 fclose(file_ptr);
181 }
182
183 file_ptr = GetFilePtr(language_data_path_prefix,
184 kFreqDawgFileSuffix, false, false);
185 if (file_ptr != NULL) {
186 offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file);
187 CopyFile(file_ptr, output_file, false);
188 fclose(file_ptr);
189 }
190
191 fseek(output_file, 0, SEEK_SET);
192 inT32 num_entries = TESSDATA_NUM_ENTRIES;
193 fwrite(&num_entries, sizeof(inT32), 1, output_file);
194 fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
195 fclose(output_file);
196
197 tprintf("TessdataManager combined tesseract data files.\n");
198 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
199 tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
200 }
201 }
202
203 } // namespace tesseract
204