• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        tessdatamanager.h
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author:      Daria Antonova
5 // Created:     Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
22 
23 #include <stdio.h>
24 #include "host.h"
25 #include "tprintf.h"
26 #include "varable.h"
27 
28 extern BOOL_VAR_H(global_load_punc_dawg, true,
29                   "Load dawg with punctuation patterns.");
30 extern BOOL_VAR_H(global_load_system_dawg, true, "Load system word dawg.");
31 extern BOOL_VAR_H(global_load_number_dawg, true,
32                   "Load dawg with number patterns.");
33 extern BOOL_VAR_H(global_load_freq_dawg, true, "Load frequent word dawg.");
34 
35 extern INT_VAR_H(global_tessdata_manager_debug_level, 0,
36                  "Debug level for TessdataManager functions.");
37 
// Suffix of the combined (bundled) data file.
static const char kTrainedDataSuffix[] = "traineddata";

// Suffixes of the individual per-language files.
// NOTE(review): presumably these are appended to a language path prefix
// (see the file_suffix parameter of TessdataManager::GetFilePtr) -- confirm
// against the implementation file.
static const char kLangConfigFileSuffix[]       = "config";
static const char kUnicharsetFileSuffix[]       = "unicharset";
static const char kAmbigsFileSuffix[]           = "unicharambigs";
static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
static const char kBuiltInCutoffsFileSuffix[]   = "pffmtable";
static const char kNormProtoFileSuffix[]        = "normproto";
static const char kPuncDawgFileSuffix[]         = "punc-dawg";
static const char kSystemDawgFileSuffix[]       = "word-dawg";
static const char kNumberDawgFileSuffix[]       = "number-dawg";
static const char kFreqDawgFileSuffix[]         = "freq-dawg";
50 
51 namespace tesseract {
52 
// Identifies one kind of data inside the combined data file. The numeric
// value of each entry is its index into TessdataManager::offset_table_, so
// existing values must not be renumbered.
enum TessdataType {
  TESSDATA_LANG_CONFIG = 0,
  TESSDATA_UNICHARSET  = 1,
  TESSDATA_AMBIGS      = 2,
  TESSDATA_INTTEMP     = 3,
  TESSDATA_PFFMTABLE   = 4,
  TESSDATA_NORMPROTO   = 5,
  TESSDATA_PUNC_DAWG   = 6,
  TESSDATA_SYSTEM_DAWG = 7,
  TESSDATA_NUMBER_DAWG = 8,
  TESSDATA_FREQ_DAWG   = 9,

  // Count of the entries above; must remain the last enumerator because it
  // is used as the size of the offset table.
  TESSDATA_NUM_ENTRIES
};
67 
// Upper bound on a plausible entry count in a combined data file.
// TessdataType may grow over time, but the number of entries is not
// expected to become astronomically high. To detect endianness
// automatically, TessdataManager flips the bits when
// actual_tessdata_num_entries_ reads as larger than this bound.
static const int kMaxNumTessdataEntries = 1000;
74 
75 
76 class TessdataManager {
77  public:
TessdataManager()78   TessdataManager() {
79     data_file_ = NULL;
80     actual_tessdata_num_entries_ = 0;
81     for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
82       offset_table_[i] = -1;
83     }
84   }
~TessdataManager()85   ~TessdataManager() {}
86 
87   // Opens the given data file and reads the offset table.
88   void Init(const char *data_file_name);
89 
90   // Returns data file pointer.
GetDataFilePtr()91   inline FILE *GetDataFilePtr() const { return data_file_; }
92 
93   // Returns false if there is no data of the given type.
94   // Otherwise does a seek on the data_file_ to position the pointer
95   // at the start of the data of the given type.
SeekToStart(TessdataType tessdata_type)96   inline bool SeekToStart(TessdataType tessdata_type) {
97     if (global_tessdata_manager_debug_level) {
98       tprintf("TessdataManager: seek to offset %lld (start of tessdata"
99               "type %d)\n", offset_table_[tessdata_type], tessdata_type);
100     }
101     if (offset_table_[tessdata_type] < 0) {
102       return false;
103     } else {
104       ASSERT_HOST(fseek(data_file_,
105                         offset_table_[tessdata_type], SEEK_SET) == 0);
106       return true;
107     }
108   }
109   // Returns the end offset for the given tesseract data file type.
GetEndOffset(TessdataType tessdata_type)110   inline inT64 GetEndOffset(TessdataType tessdata_type) const {
111     int index = tessdata_type + 1;
112     while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
113       ++index;  // skip tessdata types not present in the combined file
114     }
115     if (global_tessdata_manager_debug_level) {
116       tprintf("TessdataManager: end offset for type %d is %lld\n",
117               tessdata_type,
118               (index == actual_tessdata_num_entries_) ? -1
119               : offset_table_[index]);
120     }
121     return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
122   }
123   // Closes data_file_ (if it was opened by Init()).
End()124   inline void End() {
125     if (data_file_ != NULL) {
126       fclose(data_file_);
127       data_file_ = NULL;
128     }
129   }
130 
131   // Reads all the standard tesseract config and data files for a language
132   // at the given path and bundles them up into one binary data file.
133   static void CombineDataFiles(const char *language_data_path_prefix,
134                                const char *output_filename);
135 
136  private:
137 
138   // Opens the file whose name is a concatentation of language_data_path_prefix
139   // and file_suffix. Terminates the program if required_file is set to true,
140   // but the file could not be found or opened for reading.
141   // Returns a file pointer to the opened file.
142   static FILE *GetFilePtr(const char *language_data_path_prefix,
143                           const char *file_suffix, bool required_file,
144                           bool text_file);
145 
146   // Copies all the bytes in the given input file to the output_file provided.
147   static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end);
148 
149   // Each offset_table_[i] contains a file offset in the combined data file
150   // where the data of TessdataFileType i is stored.
151   inT64 offset_table_[TESSDATA_NUM_ENTRIES];
152   // Actual number of entries in the tessdata table. This value can only be
153   // same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger,
154   // since then it would be impossible to interpret the type of tessdata at
155   // indices same and higher than TESSDATA_NUM_ENTRIES.
156   // This parameter is used to allow for backward compatiblity
157   // when new tessdata types are introduced.
158   inT32 actual_tessdata_num_entries_;
159   FILE *data_file_;  // pointer to the data file.
160 };
161 
162 
163 }  // namespace tesseract
164 
165 #endif  // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
166