1 /////////////////////////////////////////////////////////////////////// 2 // File: tessdatamanager.h 3 // Description: Functions to handle loading/combining tesseract data files. 4 // Author: Daria Antonova 5 // Created: Wed Jun 03 11:26:43 PST 2009 6 // 7 // (C) Copyright 2009, Google Inc. 8 // Licensed under the Apache License, Version 2.0 (the "License"); 9 // you may not use this file except in compliance with the License. 10 // You may obtain a copy of the License at 11 // http://www.apache.org/licenses/LICENSE-2.0 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 // 18 /////////////////////////////////////////////////////////////////////// 19 20 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 21 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 22 23 #include <stdio.h> 24 #include "host.h" 25 #include "tprintf.h" 26 #include "varable.h" 27 28 extern BOOL_VAR_H(global_load_punc_dawg, true, 29 "Load dawg with punctuation patterns."); 30 extern BOOL_VAR_H(global_load_system_dawg, true, "Load system word dawg."); 31 extern BOOL_VAR_H(global_load_number_dawg, true, 32 "Load dawg with number patterns."); 33 extern BOOL_VAR_H(global_load_freq_dawg, true, "Load frequent word dawg."); 34 35 extern INT_VAR_H(global_tessdata_manager_debug_level, 0, 36 "Debug level for TessdataManager functions."); 37 38 static const char kTrainedDataSuffix[] = "traineddata"; 39 40 static const char kLangConfigFileSuffix[] = "config"; 41 static const char kUnicharsetFileSuffix[] = "unicharset"; 42 static const char kAmbigsFileSuffix[] = "unicharambigs"; 43 static const char kBuiltInTemplatesFileSuffix[] = "inttemp"; 44 static const char kBuiltInCutoffsFileSuffix[] = "pffmtable"; 45 static const char kNormProtoFileSuffix[] = "normproto"; 46 static const char kPuncDawgFileSuffix[] = "punc-dawg"; 47 static const char kSystemDawgFileSuffix[] = "word-dawg"; 48 static const char kNumberDawgFileSuffix[] = "number-dawg"; 49 static const char kFreqDawgFileSuffix[] = "freq-dawg"; 50 51 namespace tesseract { 52 53 enum TessdataType { 54 TESSDATA_LANG_CONFIG, // 0 55 TESSDATA_UNICHARSET, // 1 56 TESSDATA_AMBIGS, // 2 57 TESSDATA_INTTEMP, // 3 58 TESSDATA_PFFMTABLE, // 4 59 TESSDATA_NORMPROTO, // 5 60 TESSDATA_PUNC_DAWG, // 6 61 TESSDATA_SYSTEM_DAWG, // 7 62 TESSDATA_NUMBER_DAWG, // 8 63 TESSDATA_FREQ_DAWG, // 9 64 65 TESSDATA_NUM_ENTRIES 66 }; 67 68 // TessdataType could be updated to contain more entries, however 69 // we do not expect that number to be astronomically high. 70 // In order to automatically detect endianness TessdataManager will 71 // flip the bits if actual_tessdata_num_entries_ is larger than 72 // kMaxNumTessdataEntries. 73 static const int kMaxNumTessdataEntries = 1000; 74 75 76 class TessdataManager { 77 public: TessdataManager()78 TessdataManager() { 79 data_file_ = NULL; 80 actual_tessdata_num_entries_ = 0; 81 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 82 offset_table_[i] = -1; 83 } 84 } ~TessdataManager()85 ~TessdataManager() {} 86 87 // Opens the given data file and reads the offset table. 88 void Init(const char *data_file_name); 89 90 // Returns data file pointer. GetDataFilePtr()91 inline FILE *GetDataFilePtr() const { return data_file_; } 92 93 // Returns false if there is no data of the given type. 94 // Otherwise does a seek on the data_file_ to position the pointer 95 // at the start of the data of the given type. SeekToStart(TessdataType tessdata_type)96 inline bool SeekToStart(TessdataType tessdata_type) { 97 if (global_tessdata_manager_debug_level) { 98 tprintf("TessdataManager: seek to offset %lld (start of tessdata" 99 "type %d)\n", offset_table_[tessdata_type], tessdata_type); 100 } 101 if (offset_table_[tessdata_type] < 0) { 102 return false; 103 } else { 104 ASSERT_HOST(fseek(data_file_, 105 offset_table_[tessdata_type], SEEK_SET) == 0); 106 return true; 107 } 108 } 109 // Returns the end offset for the given tesseract data file type. GetEndOffset(TessdataType tessdata_type)110 inline inT64 GetEndOffset(TessdataType tessdata_type) const { 111 int index = tessdata_type + 1; 112 while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) { 113 ++index; // skip tessdata types not present in the combined file 114 } 115 if (global_tessdata_manager_debug_level) { 116 tprintf("TessdataManager: end offset for type %d is %lld\n", 117 tessdata_type, 118 (index == actual_tessdata_num_entries_) ? -1 119 : offset_table_[index]); 120 } 121 return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1; 122 } 123 // Closes data_file_ (if it was opened by Init()). End()124 inline void End() { 125 if (data_file_ != NULL) { 126 fclose(data_file_); 127 data_file_ = NULL; 128 } 129 } 130 131 // Reads all the standard tesseract config and data files for a language 132 // at the given path and bundles them up into one binary data file. 133 static void CombineDataFiles(const char *language_data_path_prefix, 134 const char *output_filename); 135 136 private: 137 138 // Opens the file whose name is a concatentation of language_data_path_prefix 139 // and file_suffix. Terminates the program if required_file is set to true, 140 // but the file could not be found or opened for reading. 141 // Returns a file pointer to the opened file. 142 static FILE *GetFilePtr(const char *language_data_path_prefix, 143 const char *file_suffix, bool required_file, 144 bool text_file); 145 146 // Copies all the bytes in the given input file to the output_file provided. 147 static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end); 148 149 // Each offset_table_[i] contains a file offset in the combined data file 150 // where the data of TessdataFileType i is stored. 151 inT64 offset_table_[TESSDATA_NUM_ENTRIES]; 152 // Actual number of entries in the tessdata table. This value can only be 153 // same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger, 154 // since then it would be impossible to interpret the type of tessdata at 155 // indices same and higher than TESSDATA_NUM_ENTRIES. 156 // This parameter is used to allow for backward compatiblity 157 // when new tessdata types are introduced. 158 inT32 actual_tessdata_num_entries_; 159 FILE *data_file_; // pointer to the data file. 160 }; 161 162 163 } // namespace tesseract 164 165 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 166