///////////////////////////////////////////////////////////////////////
// File:        baseapi.h
// Description: Simple API for calling tesseract.
// Author:      Ray Smith
// Created:     Fri Oct 06 15:35:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

20 #ifndef TESSERACT_CCMAIN_BASEAPI_H__
21 #define TESSERACT_CCMAIN_BASEAPI_H__
22 
23 #include "thresholder.h"
24 
25 class PAGE_RES;
26 class PAGE_RES_IT;
27 class BLOCK_LIST;
28 class IMAGE;
29 class STRING;
30 struct Pix;
31 struct Box;
32 struct Pixa;
33 struct Boxa;
34 struct ETEXT_STRUCT;
35 struct OSResults;
36 struct TBOX;
37 
38 #define MAX_NUM_INT_FEATURES 512
39 struct INT_FEATURE_STRUCT;
40 typedef INT_FEATURE_STRUCT *INT_FEATURE;
41 typedef INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES];
42 
43 #ifdef TESSDLL_EXPORTS
44 #define TESSDLL_API __declspec(dllexport)
45 #elif defined(TESSDLL_IMPORTS)
46 #define TESSDLL_API __declspec(dllimport)
47 #else
48 #define TESSDLL_API
49 #endif
50 
51 
52 namespace tesseract {
53 
54 class Dict;
55 class Tesseract;
56 class Trie;
57 class CubeRecoContext;
58 class TesseractCubeCombiner;
59 class CubeObject;
60 class CubeLineObject;
61 class Dawg;
62 
63 typedef int (Dict::*DictFunc)(void* void_dawg_args, int char_index,
64                               const void *word, bool word_end);
65 
// Page segmentation modes, as taken by TessBaseAPI::SetPageSegMode and
// returned by TessBaseAPI::GetPageSegMode.
enum PageSegMode {
  PSM_AUTO,           // Fully automatic page segmentation.
  PSM_SINGLE_COLUMN,  // Assume a single column of text of variable sizes.
  PSM_SINGLE_BLOCK,   // Assume a single uniform block of text. (Default.)
  PSM_SINGLE_LINE,    // Treat the image as a single text line.
  PSM_SINGLE_WORD,    // Treat the image as a single word.
  PSM_SINGLE_CHAR,    // Treat the image as a single character.

  PSM_COUNT           // Number of enum entries.
};

// The values in the AccuracyVSpeed enum provide hints for how the engine
// should trade speed for accuracy. There is no guarantee of any effect.
// Taken by TessBaseAPI::SetAccuracyVSpeed.
enum AccuracyVSpeed {
  AVS_FASTEST = 0,         // Fastest speed, but lowest accuracy.
  AVS_MOST_ACCURATE = 100  // Greatest accuracy, but slowest speed.
};

84 // Base class for all tesseract APIs.
85 // Specific classes can add ability to work on different inputs or produce
86 // different outputs.
87 // This class is mostly an interface layer on top of the Tesseract instance
88 // class to hide the data types so that users of this class don't have to
89 // include any other Tesseract headers.
90 
91 class TESSDLL_API TessBaseAPI {
92  public:
93   TessBaseAPI();
94   virtual ~TessBaseAPI();
95 
96   // Set the name of the input file. Needed only for training and
97   // reading a UNLV zone file.
98   void SetInputName(const char* name);
99 
100   // Set the name of the bonus output files. Needed only for debugging.
101   void SetOutputName(const char* name);
102 
103   // Set the value of an internal "variable" (of either old or new types).
104   // Supply the name of the variable and the value as a string, just as
105   // you would in a config file.
106   // Returns false if the name lookup failed.
107   // Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
108   // Or SetVariable("bln_numericmode", "1"); to set numeric-only mode.
109   // SetVariable may be used before Init, but settings will revert to
110   // defaults on End().
111   bool SetVariable(const char* variable, const char* value);
112 
113   // Eventually instances will be thread-safe and totally independent,
114   // but for now, they all point to the same underlying engine,
115   // and are NOT RE-ENTRANT OR THREAD-SAFE. For now:
116   // it is safe to Init multiple TessBaseAPIs in the same language, use them
117   // sequentially, and End or delete them all, but once one is Ended, you can't
118   // do anything other than End the others. After End, it is safe to Init
119   // again on the same one.
120   //
121   // Start tesseract. Returns zero on success and -1 on failure.
122   // NOTE that the only members that may be called before Init are those
123   // listed above here in the class definition.
124   //
125   // The datapath must be the name of the data directory (no ending /) or
126   // some other file in which the data directory resides (for instance argv[0].)
127   // The language is (usually) an ISO 639-3 string or NULL will default to eng.
128   // It is entirely safe (and eventually will be efficient too) to call
129   // Init multiple times on the same instance to change language, or just
130   // to reset the classifier.
131   // WARNING: On changing languages, all Variables are reset back to their
132   // default values. If you have a rare need to set a Variable that controls
133   // initialization for a second call to Init you should explicitly
134   // call End() and then use SetVariable before Init. This is only a very
135   // rare use case, since there are very few uses that require any variables
136   // to be set before Init.
137   int Init(const char* datapath, const char* language,
138            char **configs, int configs_size, bool configs_global_only);
Init(const char * datapath,const char * language)139   int Init(const char* datapath, const char* language) {
140     return Init(datapath, language, 0, 0, false);
141   }
142 
143   // Init only the lang model component of Tesseract. The only functions
144   // that work after this init are SetVariable and IsValidWord.
145   // WARNING: temporary! This function will be removed from here and placed
146   // in a separate API at some future time.
147   int InitLangMod(const char* datapath, const char* language);
148 
149   // Init everything except the language model. Used to allow initialization for
150   // the specified language without any available dawg models.
151   int InitWithoutLangModel(const char* datapath, const char* language);
152 
153   // Read a "config" file containing a set of variable, value pairs.
154   // Searches the standard places: tessdata/configs, tessdata/tessconfigs
155   // and also accepts a relative or absolute path name.
156   void ReadConfigFile(const char* filename, bool global_only);
157 
158   // Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
159   // The mode is stored as an INT_VARIABLE so it can also be modified by
160   // ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
161   void SetPageSegMode(PageSegMode mode);
162 
163   // Return the current page segmentation mode.
164   PageSegMode GetPageSegMode() const;
165 
166   // Set the hint for trading accuracy against speed.
167   // Default is AVS_FASTEST, which is the old behaviour.
168   // Note that this is only a hint. Depending on the language and/or
169   // build configuration, speed and accuracy may not be tradeable.
170   // Also note that despite being an enum, any value in the range
171   // AVS_FASTEST to AVS_MOST_ACCURATE can be provided, and may or may not
172   // have an effect, depending on the implementation.
173   // The mode is stored as an INT_VARIABLE so it can also be modified by
174   // ReadConfigFile or SetVariable("tessedit_accuracyvspeed", mode as string).
175   void SetAccuracyVSpeed(AccuracyVSpeed mode);
176 
177   // Recognize a rectangle from an image and return the result as a string.
178   // May be called many times for a single Init.
179   // Currently has no error checking.
180   // Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
181   // Palette color images will not work properly and must be converted to
182   // 24 bit.
183   // Binary images of 1 bit per pixel may also be given but they must be
184   // byte packed with the MSB of the first byte being the first pixel, and a
185   // 1 represents WHITE. For binary images set bytes_per_pixel=0.
186   // The recognized text is returned as a char* which is coded
187   // as UTF8 and must be freed with the delete [] operator.
188   //
189   // Note that TesseractRect is the simplified convenience interface.
190   // For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
191   // and one or more of the Get*Text functions below.
192   char* TesseractRect(const unsigned char* imagedata,
193                       int bytes_per_pixel, int bytes_per_line,
194                       int left, int top, int width, int height);
195 
196   // Call between pages or documents etc to free up memory and forget
197   // adaptive data.
198   void ClearAdaptiveClassifier();
199 
200   // ------------------------Advanced API--------------------------------
201   // The following methods break TesseractRect into pieces, so you can
202   // get hold of the thresholded image, get the text in different formats,
203   // get bounding boxes, confidences etc.
204 
205   // Provide an image for Tesseract to recognize. Format is as
206   // TesseractRect above. Does not copy the image buffer, or take
207   // ownership. The source image may be destroyed after Recognize is called,
208   // either explicitly or implicitly via one of the Get*Text functions.
209   // SetImage clears all recognition results, and sets the rectangle to the
210   // full image, so it may be followed immediately by a GetUTF8Text, and it
211   // will automatically perform recognition.
212   void SetImage(const unsigned char* imagedata, int width, int height,
213                 int bytes_per_pixel, int bytes_per_line);
214 
215   // Provide an image for Tesseract to recognize. As with SetImage above,
216   // Tesseract doesn't take a copy or ownership or pixDestroy the image, so
217   // it must persist until after Recognize.
218   // Pix vs raw, which to use?
219   // Use Pix where possible. A future version of Tesseract may choose to use Pix
220   // as its internal representation and discard IMAGE altogether.
221   // Because of that, an implementation that sources and targets Pix may end up
222   // with less copies than an implementation that does not.
223   void SetImage(const Pix* pix);
224 
225   // Restrict recognition to a sub-rectangle of the image. Call after SetImage.
226   // Each SetRectangle clears the recogntion results so multiple rectangles
227   // can be recognized with the same image.
228   void SetRectangle(int left, int top, int width, int height);
229 
230   // In extreme cases only, usually with a subclass of Thresholder, it
231   // is possible to provide a different Thresholder. The Thresholder may
232   // be preloaded with an image, settings etc, or they may be set after.
233   // Note that Tesseract takes ownership of the Thresholder and will
234   // delete it when it it is replaced or the API is destructed.
SetThresholder(ImageThresholder * thresholder)235   void SetThresholder(ImageThresholder* thresholder) {
236     if (thresholder_ != 0)
237       delete thresholder_;
238     thresholder_ = thresholder;
239     ClearResults();
240   }
241 
242   // Get a copy of the internal thresholded image from Tesseract.
243   // Caller takes ownership of the Pix and must pixDestroy it.
244   // May be called any time after SetImage, or after TesseractRect.
245   Pix* GetThresholdedImage();
246 
247   // Get the result of page layout analysis as a leptonica-style
248   // Boxa, Pixa pair, in reading order.
249   // Can be called before or after Recognize.
250   Boxa* GetRegions(Pixa** pixa);
251 
252   // Get the textlines as a leptonica-style
253   // Boxa, Pixa pair, in reading order.
254   // Can be called before or after Recognize.
255   // If blockids is not NULL, the block-id of each line is also returned as an
256   // array of one element per line. delete [] after use.
257   Boxa* GetTextlines(Pixa** pixa, int** blockids);
258 
259   // Get the words as a leptonica-style
260   // Boxa, Pixa pair, in reading order.
261   // Can be called before or after Recognize.
262   Boxa* GetWords(Pixa** pixa);
263 
264   // Dump the internal binary image to a PGM file.
265   // Deprecated. Use GetThresholdedImage and write the image using pixWrite
266   // instead if possible.
267   void DumpPGM(const char* filename);
268 
269   // Recognize the image from SetAndThresholdImage, generating Tesseract
270   // internal structures. Returns 0 on success.
271   // Optional. The Get*Text functions below will call Recognize if needed.
272   // After Recognize, the output is kept internally until the next SetImage.
273   int Recognize(ETEXT_STRUCT* monitor);
274 
275   // Methods to retrieve information after SetAndThresholdImage(),
276   // Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
277 
278   // Variant on Recognize used for testing chopper.
279   int RecognizeForChopTest(struct ETEXT_STRUCT* monitor);
280 
281   // The recognized text is returned as a char* which is coded
282   // as UTF8 and must be freed with the delete [] operator.
283   char* GetUTF8Text();
284   // The recognized text is returned as a char* which is coded in the same
285   // format as a box file used in training. Returned string must be freed with
286   // the delete [] operator.
287   // Constructs coordinates in the original image - not just the rectangle.
288   char* GetBoxText();
289   // The recognized text is returned as a char* which is coded
290   // as UNLV format Latin-1 with specific reject and suspect codes
291   // and must be freed with the delete [] operator.
292   char* GetUNLVText();
293   // Returns the (average) confidence value between 0 and 100.
294   int MeanTextConf();
295   // Returns all word confidences (between 0 and 100) in an array, terminated
296   // by -1.  The calling function must delete [] after use.
297   // The number of confidences should correspond to the number of space-
298   // delimited words in GetUTF8Text.
299   int* AllWordConfidences();
300 
301   // Free up recognition results and any stored image data, without actually
302   // freeing any recognition data that would be time-consuming to reload.
303   // Afterwards, you must call SetImage or TesseractRect before doing
304   // any Recognize or Get* operation.
305   void Clear();
306 
307   // Close down tesseract and free up all memory. End() is equivalent to
308   // destructing and reconstructing your TessBaseAPI.
309   // Once End() has been used, none of the other API functions may be used
310   // other than Init and anything declared above it in the class definition.
311   void End();
312 
313   // Check whether a word is valid according to Tesseract's language model
314   // returns 0 if the word is invalid, non-zero if valid.
315   // WARNING: temporary! This function will be removed from here and placed
316   // in a separate API at some future time.
317   int IsValidWord(const char *word);
318 
319   bool GetTextDirection(int* out_offset, float* out_slope);
320 
321   // Set the letter_is_okay function to point somewhere else.
322   void SetDictFunc(DictFunc f);
323 
324   // Estimates the Orientation And Script of the image.
325   // Returns true if the image was processed successfully.
326   bool DetectOS(OSResults*);
327 
328   // This method returns the features associated with the input image.
329   void GetFeatures(INT_FEATURE_ARRAY int_features,
330                    int* num_features);
331 
332   // Return the pointer to the i-th dawg loaded into tesseract_ object.
333   const Dawg *GetDawg(int i) const;
334 
335   // Return the number of dawgs loaded into tesseract_ object.
336   int NumDawgs() const;
337 
338   // Return the language used in the last valid initialization.
339   const char* GetLastInitLanguage() const;
340 
341  protected:
342 
343   // Common code for setting the image. Returns true if Init has been called.
344   bool InternalSetImage();
345 
346   // Run the thresholder to make the thresholded image. If pix is not NULL,
347   // the source is thresholded to pix instead of the internal IMAGE.
348   virtual void Threshold(Pix** pix);
349 
350   // Find lines from the image making the BLOCK_LIST.
351   // Returns 0 on success.
352   int FindLines();
353 
354   // Delete the pageres and block list ready for a new page.
355   void ClearResults();
356 
357   // Return the length of the output text string, as UTF8, assuming
358   // one newline per line and one per block, with a terminator,
359   // and assuming a single character reject marker for each rejected character.
360   // Also return the number of recognized blobs in blob_count.
361   int TextLength(int* blob_count);
362 
363   // __________________________   ocropus add-ons   ___________________________
364 
365   // Find lines from the image making the BLOCK_LIST.
366   BLOCK_LIST* FindLinesCreateBlockList();
367 
368   // Delete a block list.
369   // This is to keep BLOCK_LIST pointer opaque
370   // and let go of including the other headers.
371   static void DeleteBlockList(BLOCK_LIST* block_list);
372 
373   // Adapt to recognize the current image as the given character.
374   // The image must be preloaded and be just an image of a single character.
375   void AdaptToCharacter(const char *unichar_repr,
376                         int length,
377                         float baseline,
378                         float xheight,
379                         float descender,
380                         float ascender);
381 
382   // Recognize text doing one pass only, using settings for a given pass.
383   /*static*/ PAGE_RES* RecognitionPass1(BLOCK_LIST* block_list);
384   /*static*/ PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list,
385                                     PAGE_RES* pass1_result);
386 
387   // Extract the OCR results, costs (penalty points for uncertainty),
388   // and the bounding boxes of the characters.
389   static int TesseractExtractResult(char** text,
390                                     int** lengths,
391                                     float** costs,
392                                     int** x0,
393                                     int** y0,
394                                     int** x1,
395                                     int** y1,
396                                     PAGE_RES* page_res);
397 
398   // Call the Cube OCR engine. Takes the Region, line and word segmentation
399   // information from Tesseract as inputs. Makes changes or populates the
400   // output PAGE_RES object which contains the recogntion results.
401   // The behavior of this function depends on the
402   // current language and the value of the tessedit_accuracyvspeed:
403   // For English (and other Latin based scripts):
404   //    If the accuracyvspeed flag is set to any value other than AVS_FASTEST,
405   //    Cube uses the word information passed by Tesseract.
406   //    Cube will run on a subset of the words segmented and recognized by
407   //    Tesseract. The value of the accuracyvspeed and the Tesseract
408   //    confidence of a word determines whether Cube runs on it or not and
409   //    whether Cube's results override Tesseract's
410   // For Arabic & Hindi:
411   //    Cube uses the Region information passed by Tesseract. It then performs
412   //    its own line segmentation. This will change once Tesseract's line
413   //    segmentation works for Arabic. Cube then segments each line into
414   //    phrases. Each phrase is then recognized in phrase mode which allows
415   //    spaces in the results.
416   //    Note that at this point, the line segmentation algorithm might have
417   //    some problems with ill spaced Arabic document.
418   int Cube();
419   // Run Cube on the lines extracted by Tesseract.
420   int RunCubeOnLines();
421   // Run Cube on a subset of the words already present in the page_res_ object
422   // The subset, and whether Cube overrides the results is determined by
423   // the SpeedVsAccuracy flag
424   int CubePostProcessWords();
425   // Create a Cube line object for each line
426   CubeLineObject **CreateLineObjects(Pixa* pixa_lines);
427   // Create a TBox array corresponding to the phrases in the array of
428   // line objects
429   TBOX *CreatePhraseBoxes(Boxa* boxa_lines, CubeLineObject **line_objs,
430                           int *phrase_cnt);
431   // Recognize the phrases saving the results to the page_res_ object
432   bool RecognizePhrases(int line_cnt, int phrase_cnt,
433                         CubeLineObject **line_objs, TBOX *phrase_boxes);
434   // Recognize a single phrase saving the results to the page_res_ object
435   bool RecognizePhrase(CubeObject *phrase, PAGE_RES_IT *result);
436   // Create the necessary Cube Objects
437   bool CreateCubeObjects();
438 
439  protected:
440    Tesseract*        tesseract_;       // The underlying data object.
441    ImageThresholder* thresholder_;     // Image thresholding module.
442    bool              threshold_done_;  // Image has been passed to page_image.
443    BLOCK_LIST*       block_list_;      // The page layout.
444    PAGE_RES*         page_res_;        // The page-level data.
445    STRING*           input_file_;      // Name used by training code.
446    STRING*           output_file_;     // Name used by debug code.
447    STRING*           datapath_;        // Current location of tessdata.
448    STRING*           language_;        // Last initialized language.
449   // Parameters saved from the Thresholder. Needed to rebuild coordinates.
450   int rect_left_;
451   int rect_top_;
452   int rect_width_;
453   int rect_height_;
454   int image_width_;
455   int image_height_;
456 };
457 
458 }  // namespace tesseract.
459 
460 #endif  // TESSERACT_CCMAIN_BASEAPI_H__
461