1 /////////////////////////////////////////////////////////////////////// 2 // File: baseapi.h 3 // Description: Simple API for calling tesseract. 4 // Author: Ray Smith 5 // Created: Fri Oct 06 15:35:01 PDT 2006 6 // 7 // (C) Copyright 2006, Google Inc. 8 // Licensed under the Apache License, Version 2.0 (the "License"); 9 // you may not use this file except in compliance with the License. 10 // You may obtain a copy of the License at 11 // http://www.apache.org/licenses/LICENSE-2.0 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 // 18 /////////////////////////////////////////////////////////////////////// 19 20 #ifndef TESSERACT_CCMAIN_BASEAPI_H__ 21 #define TESSERACT_CCMAIN_BASEAPI_H__ 22 23 #include "thresholder.h" 24 25 class PAGE_RES; 26 class PAGE_RES_IT; 27 class BLOCK_LIST; 28 class IMAGE; 29 class STRING; 30 struct Pix; 31 struct Box; 32 struct Pixa; 33 struct Boxa; 34 struct ETEXT_STRUCT; 35 struct OSResults; 36 struct TBOX; 37 38 #define MAX_NUM_INT_FEATURES 512 39 struct INT_FEATURE_STRUCT; 40 typedef INT_FEATURE_STRUCT *INT_FEATURE; 41 typedef INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]; 42 43 #ifdef TESSDLL_EXPORTS 44 #define TESSDLL_API __declspec(dllexport) 45 #elif defined(TESSDLL_IMPORTS) 46 #define TESSDLL_API __declspec(dllimport) 47 #else 48 #define TESSDLL_API 49 #endif 50 51 52 namespace tesseract { 53 54 class Dict; 55 class Tesseract; 56 class Trie; 57 class CubeRecoContext; 58 class TesseractCubeCombiner; 59 class CubeObject; 60 class CubeLineObject; 61 class Dawg; 62 63 typedef int (Dict::*DictFunc)(void* void_dawg_args, int char_index, 64 const void *word, bool word_end); 65 66 enum PageSegMode { 67 PSM_AUTO, // Fully automatic page segmentation. 68 PSM_SINGLE_COLUMN, // Assume a single column of text of variable sizes. 69 PSM_SINGLE_BLOCK, // Assume a single uniform block of text. (Default.) 70 PSM_SINGLE_LINE, // Treat the image as a single text line. 71 PSM_SINGLE_WORD, // Treat the image as a single word. 72 PSM_SINGLE_CHAR, // Treat the image as a single character. 73 74 PSM_COUNT // Number of enum entries. 75 }; 76 77 // The values in the AccuracyVSpeed enum provide hints for how the engine 78 // should trade speed for accuracy. There is no guarantee of any effect. 79 enum AccuracyVSpeed { 80 AVS_FASTEST = 0, // Fastest speed, but lowest accuracy. 81 AVS_MOST_ACCURATE = 100 // Greatest accuracy, but slowest speed. 82 }; 83 84 // Base class for all tesseract APIs. 85 // Specific classes can add ability to work on different inputs or produce 86 // different outputs. 87 // This class is mostly an interface layer on top of the Tesseract instance 88 // class to hide the data types so that users of this class don't have to 89 // include any other Tesseract headers. 90 91 class TESSDLL_API TessBaseAPI { 92 public: 93 TessBaseAPI(); 94 virtual ~TessBaseAPI(); 95 96 // Set the name of the input file. Needed only for training and 97 // reading a UNLV zone file. 98 void SetInputName(const char* name); 99 100 // Set the name of the bonus output files. Needed only for debugging. 101 void SetOutputName(const char* name); 102 103 // Set the value of an internal "variable" (of either old or new types). 104 // Supply the name of the variable and the value as a string, just as 105 // you would in a config file. 106 // Returns false if the name lookup failed. 107 // Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. 108 // Or SetVariable("bln_numericmode", "1"); to set numeric-only mode. 109 // SetVariable may be used before Init, but settings will revert to 110 // defaults on End(). 111 bool SetVariable(const char* variable, const char* value); 112 113 // Eventually instances will be thread-safe and totally independent, 114 // but for now, they all point to the same underlying engine, 115 // and are NOT RE-ENTRANT OR THREAD-SAFE. For now: 116 // it is safe to Init multiple TessBaseAPIs in the same language, use them 117 // sequentially, and End or delete them all, but once one is Ended, you can't 118 // do anything other than End the others. After End, it is safe to Init 119 // again on the same one. 120 // 121 // Start tesseract. Returns zero on success and -1 on failure. 122 // NOTE that the only members that may be called before Init are those 123 // listed above here in the class definition. 124 // 125 // The datapath must be the name of the data directory (no ending /) or 126 // some other file in which the data directory resides (for instance argv[0].) 127 // The language is (usually) an ISO 639-3 string or NULL will default to eng. 128 // It is entirely safe (and eventually will be efficient too) to call 129 // Init multiple times on the same instance to change language, or just 130 // to reset the classifier. 131 // WARNING: On changing languages, all Variables are reset back to their 132 // default values. If you have a rare need to set a Variable that controls 133 // initialization for a second call to Init you should explicitly 134 // call End() and then use SetVariable before Init. This is only a very 135 // rare use case, since there are very few uses that require any variables 136 // to be set before Init. 137 int Init(const char* datapath, const char* language, 138 char **configs, int configs_size, bool configs_global_only); Init(const char * datapath,const char * language)139 int Init(const char* datapath, const char* language) { 140 return Init(datapath, language, 0, 0, false); 141 } 142 143 // Init only the lang model component of Tesseract. The only functions 144 // that work after this init are SetVariable and IsValidWord. 145 // WARNING: temporary! This function will be removed from here and placed 146 // in a separate API at some future time. 147 int InitLangMod(const char* datapath, const char* language); 148 149 // Init everything except the language model. Used to allow initialization for 150 // the specified language without any available dawg models. 151 int InitWithoutLangModel(const char* datapath, const char* language); 152 153 // Read a "config" file containing a set of variable, value pairs. 154 // Searches the standard places: tessdata/configs, tessdata/tessconfigs 155 // and also accepts a relative or absolute path name. 156 void ReadConfigFile(const char* filename, bool global_only); 157 158 // Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. 159 // The mode is stored as an INT_VARIABLE so it can also be modified by 160 // ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). 161 void SetPageSegMode(PageSegMode mode); 162 163 // Return the current page segmentation mode. 164 PageSegMode GetPageSegMode() const; 165 166 // Set the hint for trading accuracy against speed. 167 // Default is AVS_FASTEST, which is the old behaviour. 168 // Note that this is only a hint. Depending on the language and/or 169 // build configuration, speed and accuracy may not be tradeable. 170 // Also note that despite being an enum, any value in the range 171 // AVS_FASTEST to AVS_MOST_ACCURATE can be provided, and may or may not 172 // have an effect, depending on the implementation. 173 // The mode is stored as an INT_VARIABLE so it can also be modified by 174 // ReadConfigFile or SetVariable("tessedit_accuracyvspeed", mode as string). 175 void SetAccuracyVSpeed(AccuracyVSpeed mode); 176 177 // Recognize a rectangle from an image and return the result as a string. 178 // May be called many times for a single Init. 179 // Currently has no error checking. 180 // Greyscale of 8 and color of 24 or 32 bits per pixel may be given. 181 // Palette color images will not work properly and must be converted to 182 // 24 bit. 183 // Binary images of 1 bit per pixel may also be given but they must be 184 // byte packed with the MSB of the first byte being the first pixel, and a 185 // 1 represents WHITE. For binary images set bytes_per_pixel=0. 186 // The recognized text is returned as a char* which is coded 187 // as UTF8 and must be freed with the delete [] operator. 188 // 189 // Note that TesseractRect is the simplified convenience interface. 190 // For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, 191 // and one or more of the Get*Text functions below. 192 char* TesseractRect(const unsigned char* imagedata, 193 int bytes_per_pixel, int bytes_per_line, 194 int left, int top, int width, int height); 195 196 // Call between pages or documents etc to free up memory and forget 197 // adaptive data. 198 void ClearAdaptiveClassifier(); 199 200 // ------------------------Advanced API-------------------------------- 201 // The following methods break TesseractRect into pieces, so you can 202 // get hold of the thresholded image, get the text in different formats, 203 // get bounding boxes, confidences etc. 204 205 // Provide an image for Tesseract to recognize. Format is as 206 // TesseractRect above. Does not copy the image buffer, or take 207 // ownership. The source image may be destroyed after Recognize is called, 208 // either explicitly or implicitly via one of the Get*Text functions. 209 // SetImage clears all recognition results, and sets the rectangle to the 210 // full image, so it may be followed immediately by a GetUTF8Text, and it 211 // will automatically perform recognition. 212 void SetImage(const unsigned char* imagedata, int width, int height, 213 int bytes_per_pixel, int bytes_per_line); 214 215 // Provide an image for Tesseract to recognize. As with SetImage above, 216 // Tesseract doesn't take a copy or ownership or pixDestroy the image, so 217 // it must persist until after Recognize. 218 // Pix vs raw, which to use? 219 // Use Pix where possible. A future version of Tesseract may choose to use Pix 220 // as its internal representation and discard IMAGE altogether. 221 // Because of that, an implementation that sources and targets Pix may end up 222 // with less copies than an implementation that does not. 223 void SetImage(const Pix* pix); 224 225 // Restrict recognition to a sub-rectangle of the image. Call after SetImage. 226 // Each SetRectangle clears the recogntion results so multiple rectangles 227 // can be recognized with the same image. 228 void SetRectangle(int left, int top, int width, int height); 229 230 // In extreme cases only, usually with a subclass of Thresholder, it 231 // is possible to provide a different Thresholder. The Thresholder may 232 // be preloaded with an image, settings etc, or they may be set after. 233 // Note that Tesseract takes ownership of the Thresholder and will 234 // delete it when it it is replaced or the API is destructed. SetThresholder(ImageThresholder * thresholder)235 void SetThresholder(ImageThresholder* thresholder) { 236 if (thresholder_ != 0) 237 delete thresholder_; 238 thresholder_ = thresholder; 239 ClearResults(); 240 } 241 242 // Get a copy of the internal thresholded image from Tesseract. 243 // Caller takes ownership of the Pix and must pixDestroy it. 244 // May be called any time after SetImage, or after TesseractRect. 245 Pix* GetThresholdedImage(); 246 247 // Get the result of page layout analysis as a leptonica-style 248 // Boxa, Pixa pair, in reading order. 249 // Can be called before or after Recognize. 250 Boxa* GetRegions(Pixa** pixa); 251 252 // Get the textlines as a leptonica-style 253 // Boxa, Pixa pair, in reading order. 254 // Can be called before or after Recognize. 255 // If blockids is not NULL, the block-id of each line is also returned as an 256 // array of one element per line. delete [] after use. 257 Boxa* GetTextlines(Pixa** pixa, int** blockids); 258 259 // Get the words as a leptonica-style 260 // Boxa, Pixa pair, in reading order. 261 // Can be called before or after Recognize. 262 Boxa* GetWords(Pixa** pixa); 263 264 // Dump the internal binary image to a PGM file. 265 // Deprecated. Use GetThresholdedImage and write the image using pixWrite 266 // instead if possible. 267 void DumpPGM(const char* filename); 268 269 // Recognize the image from SetAndThresholdImage, generating Tesseract 270 // internal structures. Returns 0 on success. 271 // Optional. The Get*Text functions below will call Recognize if needed. 272 // After Recognize, the output is kept internally until the next SetImage. 273 int Recognize(ETEXT_STRUCT* monitor); 274 275 // Methods to retrieve information after SetAndThresholdImage(), 276 // Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) 277 278 // Variant on Recognize used for testing chopper. 279 int RecognizeForChopTest(struct ETEXT_STRUCT* monitor); 280 281 // The recognized text is returned as a char* which is coded 282 // as UTF8 and must be freed with the delete [] operator. 283 char* GetUTF8Text(); 284 // The recognized text is returned as a char* which is coded in the same 285 // format as a box file used in training. Returned string must be freed with 286 // the delete [] operator. 287 // Constructs coordinates in the original image - not just the rectangle. 288 char* GetBoxText(); 289 // The recognized text is returned as a char* which is coded 290 // as UNLV format Latin-1 with specific reject and suspect codes 291 // and must be freed with the delete [] operator. 292 char* GetUNLVText(); 293 // Returns the (average) confidence value between 0 and 100. 294 int MeanTextConf(); 295 // Returns all word confidences (between 0 and 100) in an array, terminated 296 // by -1. The calling function must delete [] after use. 297 // The number of confidences should correspond to the number of space- 298 // delimited words in GetUTF8Text. 299 int* AllWordConfidences(); 300 301 // Free up recognition results and any stored image data, without actually 302 // freeing any recognition data that would be time-consuming to reload. 303 // Afterwards, you must call SetImage or TesseractRect before doing 304 // any Recognize or Get* operation. 305 void Clear(); 306 307 // Close down tesseract and free up all memory. End() is equivalent to 308 // destructing and reconstructing your TessBaseAPI. 309 // Once End() has been used, none of the other API functions may be used 310 // other than Init and anything declared above it in the class definition. 311 void End(); 312 313 // Check whether a word is valid according to Tesseract's language model 314 // returns 0 if the word is invalid, non-zero if valid. 315 // WARNING: temporary! This function will be removed from here and placed 316 // in a separate API at some future time. 317 int IsValidWord(const char *word); 318 319 bool GetTextDirection(int* out_offset, float* out_slope); 320 321 // Set the letter_is_okay function to point somewhere else. 322 void SetDictFunc(DictFunc f); 323 324 // Estimates the Orientation And Script of the image. 325 // Returns true if the image was processed successfully. 326 bool DetectOS(OSResults*); 327 328 // This method returns the features associated with the input image. 329 void GetFeatures(INT_FEATURE_ARRAY int_features, 330 int* num_features); 331 332 // Return the pointer to the i-th dawg loaded into tesseract_ object. 333 const Dawg *GetDawg(int i) const; 334 335 // Return the number of dawgs loaded into tesseract_ object. 336 int NumDawgs() const; 337 338 // Return the language used in the last valid initialization. 339 const char* GetLastInitLanguage() const; 340 341 protected: 342 343 // Common code for setting the image. Returns true if Init has been called. 344 bool InternalSetImage(); 345 346 // Run the thresholder to make the thresholded image. If pix is not NULL, 347 // the source is thresholded to pix instead of the internal IMAGE. 348 virtual void Threshold(Pix** pix); 349 350 // Find lines from the image making the BLOCK_LIST. 351 // Returns 0 on success. 352 int FindLines(); 353 354 // Delete the pageres and block list ready for a new page. 355 void ClearResults(); 356 357 // Return the length of the output text string, as UTF8, assuming 358 // one newline per line and one per block, with a terminator, 359 // and assuming a single character reject marker for each rejected character. 360 // Also return the number of recognized blobs in blob_count. 361 int TextLength(int* blob_count); 362 363 // __________________________ ocropus add-ons ___________________________ 364 365 // Find lines from the image making the BLOCK_LIST. 366 BLOCK_LIST* FindLinesCreateBlockList(); 367 368 // Delete a block list. 369 // This is to keep BLOCK_LIST pointer opaque 370 // and let go of including the other headers. 371 static void DeleteBlockList(BLOCK_LIST* block_list); 372 373 // Adapt to recognize the current image as the given character. 374 // The image must be preloaded and be just an image of a single character. 375 void AdaptToCharacter(const char *unichar_repr, 376 int length, 377 float baseline, 378 float xheight, 379 float descender, 380 float ascender); 381 382 // Recognize text doing one pass only, using settings for a given pass. 383 /*static*/ PAGE_RES* RecognitionPass1(BLOCK_LIST* block_list); 384 /*static*/ PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list, 385 PAGE_RES* pass1_result); 386 387 // Extract the OCR results, costs (penalty points for uncertainty), 388 // and the bounding boxes of the characters. 389 static int TesseractExtractResult(char** text, 390 int** lengths, 391 float** costs, 392 int** x0, 393 int** y0, 394 int** x1, 395 int** y1, 396 PAGE_RES* page_res); 397 398 // Call the Cube OCR engine. Takes the Region, line and word segmentation 399 // information from Tesseract as inputs. Makes changes or populates the 400 // output PAGE_RES object which contains the recogntion results. 401 // The behavior of this function depends on the 402 // current language and the value of the tessedit_accuracyvspeed: 403 // For English (and other Latin based scripts): 404 // If the accuracyvspeed flag is set to any value other than AVS_FASTEST, 405 // Cube uses the word information passed by Tesseract. 406 // Cube will run on a subset of the words segmented and recognized by 407 // Tesseract. The value of the accuracyvspeed and the Tesseract 408 // confidence of a word determines whether Cube runs on it or not and 409 // whether Cube's results override Tesseract's 410 // For Arabic & Hindi: 411 // Cube uses the Region information passed by Tesseract. It then performs 412 // its own line segmentation. This will change once Tesseract's line 413 // segmentation works for Arabic. Cube then segments each line into 414 // phrases. Each phrase is then recognized in phrase mode which allows 415 // spaces in the results. 416 // Note that at this point, the line segmentation algorithm might have 417 // some problems with ill spaced Arabic document. 418 int Cube(); 419 // Run Cube on the lines extracted by Tesseract. 420 int RunCubeOnLines(); 421 // Run Cube on a subset of the words already present in the page_res_ object 422 // The subset, and whether Cube overrides the results is determined by 423 // the SpeedVsAccuracy flag 424 int CubePostProcessWords(); 425 // Create a Cube line object for each line 426 CubeLineObject **CreateLineObjects(Pixa* pixa_lines); 427 // Create a TBox array corresponding to the phrases in the array of 428 // line objects 429 TBOX *CreatePhraseBoxes(Boxa* boxa_lines, CubeLineObject **line_objs, 430 int *phrase_cnt); 431 // Recognize the phrases saving the results to the page_res_ object 432 bool RecognizePhrases(int line_cnt, int phrase_cnt, 433 CubeLineObject **line_objs, TBOX *phrase_boxes); 434 // Recognize a single phrase saving the results to the page_res_ object 435 bool RecognizePhrase(CubeObject *phrase, PAGE_RES_IT *result); 436 // Create the necessary Cube Objects 437 bool CreateCubeObjects(); 438 439 protected: 440 Tesseract* tesseract_; // The underlying data object. 441 ImageThresholder* thresholder_; // Image thresholding module. 442 bool threshold_done_; // Image has been passed to page_image. 443 BLOCK_LIST* block_list_; // The page layout. 444 PAGE_RES* page_res_; // The page-level data. 445 STRING* input_file_; // Name used by training code. 446 STRING* output_file_; // Name used by debug code. 447 STRING* datapath_; // Current location of tessdata. 448 STRING* language_; // Last initialized language. 449 // Parameters saved from the Thresholder. Needed to rebuild coordinates. 450 int rect_left_; 451 int rect_top_; 452 int rect_width_; 453 int rect_height_; 454 int image_width_; 455 int image_height_; 456 }; 457 458 } // namespace tesseract. 459 460 #endif // TESSERACT_CCMAIN_BASEAPI_H__ 461