• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**********************************************************************
2  * File:        baseapi.cpp
3  * Description: Simple API for calling tesseract.
4  * Author:      Ray Smith
5  * Created:     Fri Oct 06 15:35:01 PDT 2006
6  *
7  * (C) Copyright 2006, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #ifdef HAVE_LIBLEPT
26 // Include leptonica library only if autoconf (or makefile etc) tell us to.
27 #include "allheaders.h"
28 #endif
29 
30 #include "baseapi.h"
31 
32 #include "thresholder.h"
33 #include "tesseractmain.h"
34 #include "tesseractclass.h"
35 #include "tessedit.h"
36 #include "ocrclass.h"
37 #include "pageres.h"
38 #include "tessvars.h"
39 #include "control.h"
40 #include "applybox.h"
41 #include "pgedit.h"
42 #include "varabled.h"
43 #include "output.h"
44 #include "mainblk.h"
45 #include "globals.h"
46 #include "adaptmatch.h"
47 #include "edgblob.h"
48 #include "tessbox.h"
49 #include "tordvars.h"
50 #include "imgs.h"
51 #include "makerow.h"
52 #include "tstruct.h"
53 #include "tessout.h"
54 #include "tface.h"
55 #include "permute.h"
56 #include "otsuthr.h"
57 #include "osdetect.h"
58 #include "chopper.h"
59 #include "matchtab.h"
60 
61 namespace tesseract {
62 
// Smallest width or height (in pixels) of a rectangle that is worth handing
// to tesseract. TesseractRect returns NULL for anything smaller.
const int kMinRectSize = 10;
// Reject character: what Tesseract emits when it could not recognize
// a symbol as anything.
const char kTesseractReject = '~';
// Reject character as understood by the UNLV error counter.
const char kUNLVReject = '~';
// Suspect marker as understood by UNLV.
const char kUNLVSuspect = '^';
// Stand-in name for the input image file when SetInputName has not supplied
// one. It is the name from which a possible UNLV zone file name is derived.
const char* kInputFile = "noname.tif";
74 
TessBaseAPI()75 TessBaseAPI::TessBaseAPI()
76   : tesseract_(NULL),
77     // Thresholder is initialized to NULL here, but will be set before use by:
78     // A constructor of a derived API,  SetThresholder(), or
79     // created implicitly when used in InternalSetImage.
80     thresholder_(NULL),
81     threshold_done_(false),
82     block_list_(NULL),
83     page_res_(NULL),
84     input_file_(NULL),
85     output_file_(NULL),
86     datapath_(NULL),
87     language_(NULL),
88     rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0),
89     image_width_(0), image_height_(0) {
90 }
91 
~TessBaseAPI()92 TessBaseAPI::~TessBaseAPI() {
93   End();
94 }
95 
96 // Set the name of the input file. Needed only for training and
97 // loading a UNLV zone file.
SetInputName(const char * name)98 void TessBaseAPI::SetInputName(const char* name) {
99   if (input_file_ == NULL)
100     input_file_ = new STRING(name);
101   else
102     *input_file_ = name;
103 }
104 
105 // Set the name of the output files. Needed only for debugging.
SetOutputName(const char * name)106 void TessBaseAPI::SetOutputName(const char* name) {
107   if (output_file_ == NULL)
108     output_file_ = new STRING(name);
109   else
110     *output_file_ = name;
111 }
112 
113 // Set the value of an internal "variable" (of either old or new types).
114 // Supply the name of the variable and the value as a string, just as
115 // you would in a config file.
116 // Returns false if the name lookup failed.
117 // SetVariable may be used before Init, to set things that control
118 // initialization, but note that on End all settings are lost and
119 // the next Init will use the defaults unless SetVariable is used again.
SetVariable(const char * variable,const char * value)120 bool TessBaseAPI::SetVariable(const char* variable, const char* value) {
121   if (tesseract_ == NULL)
122     tesseract_ = new Tesseract;
123   return set_variable(variable, value);
124 }
125 
126 // The datapath must be the name of the data directory (no ending /) or
127 // some other file in which the data directory resides (for instance argv[0].)
128 // The language is (usually) an ISO 639-3 string or NULL will default to eng.
129 // If numeric_mode is true, then only digits and Roman numerals will
130 // be returned.
131 // Returns 0 on success and -1 on initialization failure.
Init(const char * datapath,const char * language,char ** configs,int configs_size,bool configs_global_only)132 int TessBaseAPI::Init(const char* datapath, const char* language,
133                       char **configs, int configs_size,
134                       bool configs_global_only) {
135   // If the datapath or the language have changed, then start again.
136   // Note that the language_ field stores the last requested language that was
137   // initialized successfully, while tesseract_->lang stores the language
138   // actually used. They differ only if the requested language was NULL, in
139   // which case tesseract_->lang is set to the Tesseract default ("eng").
140   if (tesseract_ != NULL &&
141       (datapath_ == NULL || language_ == NULL || *datapath_ != datapath
142        || (*language_ != language && tesseract_->lang != language))) {
143     tesseract_->end_tesseract();
144     delete tesseract_;
145     tesseract_ = NULL;
146   }
147 
148   bool reset_classifier = true;
149   if (tesseract_ == NULL) {
150     reset_classifier = false;
151     tesseract_ = new Tesseract;
152     if (tesseract_->init_tesseract(
153             datapath, output_file_ != NULL ? output_file_->string() : NULL,
154             language, configs, configs_size, configs_global_only) != 0) {
155       return -1;
156     }
157   }
158   // Update datapath and language requested for the last valid initialization.
159   if (datapath_ == NULL)
160     datapath_ = new STRING(datapath);
161   else
162     *datapath_ = datapath;
163   if (language_ == NULL)
164     language_ = new STRING(language);
165   else
166     *language_ = language;
167 
168   // For same language and datapath, just reset the adaptive classifier.
169   if (reset_classifier) tesseract_->ResetAdaptiveClassifier();
170 
171   return 0;
172 }
173 
174 // Init only the lang model component of Tesseract. The only functions
175 // that work after this init are SetVariable and IsValidWord.
176 // WARNING: temporary! This function will be removed from here and placed
177 // in a separate API at some future time.
InitLangMod(const char * datapath,const char * language)178 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
179   if (tesseract_ == NULL)
180     tesseract_ = new Tesseract;
181   return tesseract_->init_tesseract_lm(datapath, NULL, language);
182 }
183 
184 // Init only the classifer component of Tesseract. Used to initialize the
185 // specified language when no dawg models are available.
InitWithoutLangModel(const char * datapath,const char * language)186 int TessBaseAPI::InitWithoutLangModel(const char* datapath,
187                                       const char* language) {
188   // If the datapath or the language have changed, then start again.
189   if (tesseract_ != NULL &&
190       (datapath_ == NULL || language_ == NULL ||
191        *datapath_ != datapath || *language_ != language)) {
192     tesseract_->end_tesseract();
193     delete tesseract_;
194     tesseract_ = NULL;
195   }
196   if (datapath_ == NULL)
197     datapath_ = new STRING(datapath);
198   else
199     *datapath_ = datapath;
200   if (language_ == NULL)
201     language_ = new STRING(language);
202   else
203     *language_ = language;
204   if (tesseract_ == NULL) {
205     tesseract_ = new Tesseract;
206     return tesseract_->init_tesseract_classifier(
207         datapath, output_file_ != NULL ? output_file_->string() : NULL,
208         language, NULL, 0, false);
209   }
210   // For same language and datapath, just reset the adaptive classifier.
211   tesseract_->ResetAdaptiveClassifier();
212   return 0;
213 }
214 
215 // Read a "config" file containing a set of variable, value pairs.
216 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
217 // and also accepts a relative or absolute path name.
ReadConfigFile(const char * filename,bool global_only)218 void TessBaseAPI::ReadConfigFile(const char* filename, bool global_only) {
219   tesseract_->read_config_file(filename, global_only);
220 }
221 
222 // Set the current page segmentation mode. Defaults to PSM_AUTO.
223 // The mode is stored as an INT_VARIABLE so it can also be modified by
224 // ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
SetPageSegMode(PageSegMode mode)225 void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
226   if (tesseract_ == NULL)
227     tesseract_ = new Tesseract;
228   tesseract_->tessedit_pageseg_mode.set_value(mode);
229 }
230 
231 // Return the current page segmentation mode.
GetPageSegMode() const232 PageSegMode TessBaseAPI::GetPageSegMode() const {
233   if (tesseract_ == NULL)
234     return PSM_SINGLE_BLOCK;
235   return static_cast<PageSegMode>(
236     static_cast<int>(tesseract_->tessedit_pageseg_mode));
237 }
238 
239 // Set the hint for trading accuracy against speed.
240 // Default is AVS_FASTEST, which is the old behaviour.
241 // Note that this is only a hint. Depending on the language and/or
242 // build configuration, speed and accuracy may not be tradeable.
243 // Also note that despite being an enum, any value in the range
244 // AVS_FASTEST to AVS_MOST_ACCURATE can be provided, and may or may not
245 // have an effect, depending on the implementation.
246 // The mode is stored as an INT_VARIABLE so it can also be modified by
247 // ReadConfigFile or SetVariable("tessedit_accuracyvspeed", mode as string).
SetAccuracyVSpeed(AccuracyVSpeed mode)248 void TessBaseAPI::SetAccuracyVSpeed(AccuracyVSpeed mode) {
249   if (tesseract_ == NULL)
250     tesseract_ = new Tesseract;
251   tesseract_->tessedit_accuracyvspeed.set_value(mode);
252 }
253 
254 // Recognize a rectangle from an image and return the result as a string.
255 // May be called many times for a single Init.
256 // Currently has no error checking.
257 // Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
258 // Palette color images will not work properly and must be converted to
259 // 24 bit.
260 // Binary images of 1 bit per pixel may also be given but they must be
261 // byte packed with the MSB of the first byte being the first pixel, and a
262 // one pixel is WHITE. For binary images set bytes_per_pixel=0.
263 // The recognized text is returned as a char* which is coded
264 // as UTF8 and must be freed with the delete [] operator.
TesseractRect(const unsigned char * imagedata,int bytes_per_pixel,int bytes_per_line,int left,int top,int width,int height)265 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
266                                  int bytes_per_pixel,
267                                  int bytes_per_line,
268                                  int left, int top,
269                                  int width, int height) {
270   if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize)
271     return NULL;  // Nothing worth doing.
272 
273   // Since this original api didn't give the exact size of the image,
274   // we have to invent a reasonable value.
275   int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
276   SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height,
277            bytes_per_pixel, bytes_per_line);
278   SetRectangle(left, top, width, height);
279 
280   return GetUTF8Text();
281 }
282 
283 // Call between pages or documents etc to free up memory and forget
284 // adaptive data.
ClearAdaptiveClassifier()285 void TessBaseAPI::ClearAdaptiveClassifier() {
286   if (tesseract_ == NULL)
287     return;
288   tesseract_->ResetAdaptiveClassifier();
289 }
290 
291 // Provide an image for Tesseract to recognize. Format is as
292 // TesseractRect above. Does not copy the image buffer, or take
293 // ownership. The source image may be destroyed after Recognize is called,
294 // either explicitly or implicitly via one of the Get*Text functions.
295 // SetImage clears all recognition results, and sets the rectangle to the
296 // full image, so it may be followed immediately by a GetUTF8Text, and it
297 // will automatically perform recognition.
SetImage(const unsigned char * imagedata,int width,int height,int bytes_per_pixel,int bytes_per_line)298 void TessBaseAPI::SetImage(const unsigned char* imagedata,
299                            int width, int height,
300                            int bytes_per_pixel, int bytes_per_line) {
301   if (InternalSetImage())
302     thresholder_->SetImage(imagedata, width, height,
303                            bytes_per_pixel, bytes_per_line);
304 }
305 
306 // Provide an image for Tesseract to recognize. As with SetImage above,
307 // Tesseract doesn't take a copy or ownership or pixDestroy the image, so
308 // it must persist until after Recognize.
309 // Pix vs raw, which to use?
310 // Use Pix where possible. A future version of Tesseract may choose to use Pix
311 // as its internal representation and discard IMAGE altogether.
312 // Because of that, an implementation that sources and targets Pix may end up
313 // with less copies than an implementation that does not.
SetImage(const Pix * pix)314 void TessBaseAPI::SetImage(const Pix* pix) {
315 #ifdef HAVE_LIBLEPT
316   if (InternalSetImage())
317     thresholder_->SetImage(pix);
318 #endif
319 }
320 
321 // Restrict recognition to a sub-rectangle of the image. Call after SetImage.
322 // Each SetRectangle clears the recogntion results so multiple rectangles
323 // can be recognized with the same image.
SetRectangle(int left,int top,int width,int height)324 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
325   if (thresholder_ == NULL)
326     return;
327   thresholder_->SetRectangle(left, top, width, height);
328   ClearResults();
329 }
330 
331 // ONLY available if you have Leptonica installed.
332 // Get a copy of the internal thresholded image from Tesseract.
GetThresholdedImage()333 Pix* TessBaseAPI::GetThresholdedImage() {
334 #ifdef HAVE_LIBLEPT
335   if (tesseract_ == NULL)
336     return NULL;
337   if (tesseract_->pix_binary() == NULL)
338     Threshold(tesseract_->mutable_pix_binary());
339   return pixClone(tesseract_->pix_binary());
340 #else
341   return NULL;
342 #endif
343 }
344 
345 // Get the result of page layout analysis as a leptonica-style
346 // Boxa, Pixa pair, in reading order.
347 // Can be called before or after Recognize.
348 // For now only gets text regions.
GetRegions(Pixa ** pixa)349 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
350 #ifdef HAVE_LIBLEPT
351   if (block_list_ == NULL || block_list_->empty()) {
352     FindLines();
353   }
354   int im_height = pixGetHeight(tesseract_->pix_binary());
355   Boxa* boxa = boxaCreate(block_list_->length());
356   if (pixa != NULL) {
357     *pixa = pixaCreate(boxaGetCount(boxa));
358   }
359   BLOCK_IT it(block_list_);
360   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
361     BLOCK* block = it.data();
362     POLY_BLOCK* poly = block->poly_block();
363     TBOX box;
364     if (poly != NULL) {
365       if (!poly->IsText())
366         continue;  // Use only text blocks.
367       POLY_BLOCK image_block(poly->points(), poly->isA());
368       image_block.rotate(block->re_rotation());
369       box = *image_block.bounding_box();
370       if (pixa != NULL) {
371         Pix* pix = pixCreate(box.width(), box.height(), 1);
372         PB_LINE_IT *lines;
373         // Block outline is a polygon, so use a PC_LINE_IT to get the
374         // rasterized interior. (Runs of interior pixels on a line.)
375         lines = new PB_LINE_IT(&image_block);
376         for (int y = box.bottom(); y < box.top(); ++y) {
377           ICOORDELT_LIST* segments = lines->get_line(y);
378           if (!segments->empty()) {
379             ICOORDELT_IT s_it(segments);
380             // Each element of segments is a start x and x size of the
381             // run of interior pixels.
382             for (s_it.mark_cycle_pt(); !s_it.cycled_list(); s_it.forward()) {
383               int start = s_it.data()->x();
384               int xext = s_it.data()->y();
385               // Copy the run from the source image to the block image.
386               pixRasterop(pix, start - box.left(),
387                           box.height() - 1 - (y - box.bottom()),
388                           xext, 1, PIX_SRC, tesseract_->pix_binary(),
389                           start, im_height - 1 - y);
390             }
391           }
392           delete segments;
393         }
394         delete lines;
395         pixaAddPix(*pixa, pix, L_INSERT);
396       }
397     } else {
398       if (!block_list_->singleton())
399         continue;  // A null poly block can only be used if it is the only block.
400       box = block->bounding_box();
401       if (pixa != NULL) {
402         Pix* pix = pixCreate(box.width(), box.height(), 1);
403         // Just copy the whole block as there is only a bounding box.
404         pixRasterop(pix, 0, 0, box.width(), box.height(),
405                     PIX_SRC, tesseract_->pix_binary(),
406                     box.left(), im_height - box.top());
407         pixaAddPix(*pixa, pix, L_INSERT);
408       }
409     }
410     Box* lbox = boxCreate(box.left(), im_height - box.top(),
411                           box.width(), box.height());
412     boxaAddBox(boxa, lbox, L_INSERT);
413   }
414   return boxa;
415 #else
416   return NULL;
417 #endif
418 }
419 
420 // Get the textlines as a leptonica-style
421 // Boxa, Pixa pair, in reading order.
422 // Can be called before or after Recognize.
423 // If blockids is not NULL, the block-id of each line is also returned as an
424 // array of one element per line. delete [] after use.
GetTextlines(Pixa ** pixa,int ** blockids)425 Boxa* TessBaseAPI::GetTextlines(Pixa** pixa, int** blockids) {
426 #ifdef HAVE_LIBLEPT
427   if (block_list_ == NULL || block_list_->empty()) {
428     FindLines();
429   }
430   // A local PAGE_RES prevents the clear if Recognize is called after.
431   PAGE_RES page_res(block_list_);
432   PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res);
433   // Count the lines to get a size for the arrays.
434   int line_count = 0;
435   for (page_res_it.restart_page(); page_res_it.word() != NULL;
436        page_res_it.forward()) {
437     if (page_res_it.row() != page_res_it.next_row()) {
438       ++line_count;
439     }
440   }
441 
442   int im_height = pixGetHeight(tesseract_->pix_binary());
443   Boxa* boxa = boxaCreate(line_count);
444   if (pixa != NULL)
445     *pixa = pixaCreate(line_count);
446   if (blockids != NULL)
447     *blockids = new int[line_count];
448   int blockid = 0;
449   int lineindex = 0;
450   for (page_res_it.restart_page(); page_res_it.word() != NULL;
451        page_res_it.forward(), ++lineindex) {
452     WERD_RES *word = page_res_it.word();
453     BLOCK* block = page_res_it.block()->block;
454     // Get the line bounding box.
455     PAGE_RES_IT word_it(page_res_it);  // Save start of line.
456     TBOX line_box = word->word->bounding_box();
457     while (page_res_it.next_row() == page_res_it.row()) {
458       page_res_it.forward();
459       word = page_res_it.word();
460       TBOX word_box = word->word->bounding_box();
461       word_box.rotate(block->re_rotation());
462       line_box += word_box;
463     }
464     Box* lbox = boxCreate(line_box.left(), im_height - line_box.top(),
465                           line_box.width(), line_box.height());
466     boxaAddBox(boxa, lbox, L_INSERT);
467     if (pixa != NULL) {
468       Pix* pix = pixCreate(line_box.width(), line_box.height(), 1);
469       // Copy all the words to the output pix.
470       while (word_it.row() == page_res_it.row()) {
471         word = word_it.word();
472         TBOX word_box = word->word->bounding_box();
473         word_box.rotate(block->re_rotation());
474         pixRasterop(pix, word_box.left() - line_box.left(),
475                     line_box.top() - word_box.top(),
476                     word_box.width(), word_box.height(),
477                     PIX_SRC, tesseract_->pix_binary(),
478                     word_box.left(), im_height - word_box.top());
479         word_it.forward();
480       }
481       pixaAddPix(*pixa, pix, L_INSERT);
482       pixaAddBox(*pixa, lbox, L_CLONE);
483     }
484     if (blockids != NULL) {
485       (*blockids)[lineindex] = blockid;
486       if (page_res_it.block() != page_res_it.next_block())
487         ++blockid;
488     }
489   }
490   return boxa;
491 #else
492   return NULL;
493 #endif
494 }
495 
496 // Get the words as a leptonica-style
497 // Boxa, Pixa pair, in reading order.
498 // Can be called before or after Recognize.
GetWords(Pixa ** pixa)499 Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
500 #ifdef HAVE_LIBLEPT
501   if (block_list_ == NULL || block_list_->empty()) {
502     FindLines();
503   }
504   // A local PAGE_RES prevents the clear if Recognize is called after.
505   PAGE_RES page_res(block_list_);
506   PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res);
507   // Count the words to get a size for the arrays.
508   int word_count = 0;
509   for (page_res_it.restart_page(); page_res_it.word () != NULL;
510        page_res_it.forward())
511     ++word_count;
512 
513   int im_height = pixGetHeight(tesseract_->pix_binary());
514   Boxa* boxa = boxaCreate(word_count);
515   if (pixa != NULL) {
516     *pixa = pixaCreate(word_count);
517   }
518   for (page_res_it.restart_page(); page_res_it.word () != NULL;
519        page_res_it.forward()) {
520     WERD_RES *word = page_res_it.word();
521     BLOCK* block = page_res_it.block()->block;
522     TBOX box = word->word->bounding_box();
523     box.rotate(block->re_rotation());
524     Box* lbox = boxCreate(box.left(), im_height - box.top(),
525                           box.width(), box.height());
526     boxaAddBox(boxa, lbox, L_INSERT);
527     if (pixa != NULL) {
528       Pix* pix = pixCreate(box.width(), box.height(), 1);
529       // Copy the whole word bounding box to the output pix.
530       pixRasterop(pix, 0, 0, box.width(), box.height(),
531                   PIX_SRC, tesseract_->pix_binary(),
532                   box.left(), im_height - box.top());
533       pixaAddPix(*pixa, pix, L_INSERT);
534       pixaAddBox(*pixa, lbox, L_CLONE);
535     }
536   }
537   return boxa;
538 #else
539   return NULL;
540 #endif  // HAVE_LIBLEPT
541 }
542 
543 // Dump the internal binary image to a PGM file.
DumpPGM(const char * filename)544 void TessBaseAPI::DumpPGM(const char* filename) {
545   if (tesseract_ == NULL)
546     return;
547   IMAGELINE line;
548   line.init(page_image.get_xsize());
549   FILE *fp = fopen(filename, "w");
550   fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n",
551           page_image.get_xsize(), page_image.get_ysize());
552   for (int j = page_image.get_ysize()-1; j >= 0 ; --j) {
553     page_image.get_line(0, j, page_image.get_xsize(), &line, 0);
554     for (int i = 0; i < page_image.get_xsize(); ++i) {
555       uinT8 b = line.pixels[i] ? 255 : 0;
556       fwrite(&b, 1, 1, fp);
557     }
558   }
559   fclose(fp);
560 }
561 
562 // Recognize the tesseract global image and return the result as Tesseract
563 // internal structures.
Recognize(struct ETEXT_STRUCT * monitor)564 int TessBaseAPI::Recognize(struct ETEXT_STRUCT* monitor) {
565   if (tesseract_ == NULL)
566     return -1;
567   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
568     tprintf("Please call SetImage before attempting recognition.");
569     return -1;
570   }
571   if (page_res_ != NULL)
572     ClearResults();
573   if (FindLines() != 0)
574     return -1;
575   if (tesseract_->tessedit_resegment_from_boxes)
576     tesseract_->apply_boxes(*input_file_, block_list_);
577   tesseract_->SetBlackAndWhitelist();
578 
579   page_res_ = new PAGE_RES(block_list_);
580   int result = 0;
581   if (interactive_mode) {
582 #ifndef GRAPHICS_DISABLED
583     tesseract_->pgeditor_main(block_list_);
584 #endif
585     // The page_res is invalid after an interactive session, so cleanup
586     // in a way that lets us continue to the next page without crashing.
587     delete page_res_;
588     page_res_ = NULL;
589     return -1;
590   } else if (tesseract_->tessedit_train_from_boxes) {
591     apply_box_training(*output_file_, block_list_);
592   } else if (tesseract_->global_tessedit_ambigs_training) {
593     FILE *ambigs_output_file = tesseract_->init_ambigs_training(*input_file_);
594     // OCR the page segmented into words by tesseract.
595     tesseract_->ambigs_training_segmented(
596         *input_file_, page_res_, monitor, ambigs_output_file);
597     fclose(ambigs_output_file);
598   } else {
599     // Now run the main recognition.
600     // Running base tesseract if the inttemp for the current language loaded.
601     if (tesseract_->inttemp_loaded_) {
602       tesseract_->recog_all_words(page_res_, monitor);
603     }
604   }
605   return result;
606 }
607 
608 // Tests the chopper by exhaustively running chop_one_blob.
RecognizeForChopTest(struct ETEXT_STRUCT * monitor)609 int TessBaseAPI::RecognizeForChopTest(struct ETEXT_STRUCT* monitor) {
610   if (tesseract_ == NULL)
611     return -1;
612   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
613     tprintf("Please call SetImage before attempting recognition.");
614     return -1;
615   }
616   if (page_res_ != NULL)
617     ClearResults();
618   if (FindLines() != 0)
619     return -1;
620   // Additional conditions under which chopper test cannot be run
621   if (tesseract_->tessedit_train_from_boxes_word_level || interactive_mode)
622     return -1;
623   ASSERT_HOST(tesseract_->inttemp_loaded_);
624 
625   page_res_ = new PAGE_RES(block_list_);
626 
627   PAGE_RES_IT page_res_it(page_res_);
628 
629   tesseract_->tess_matcher = &Tesseract::tess_default_matcher;
630   tesseract_->tess_tester = NULL;
631   tesseract_->tess_trainer = NULL;
632 
633   while (page_res_it.word() != NULL) {
634     WERD_RES *word_res = page_res_it.word();
635     WERD *word = word_res->word;
636     if (word->cblob_list()->empty()) {
637       page_res_it.forward();
638       continue;
639     }
640     WERD *bln_word = make_bln_copy(word, page_res_it.row()->row,
641                                    page_res_it.block()->block,
642                                    word_res->x_height, &word_res->denorm);
643     ASSERT_HOST(!bln_word->blob_list()->empty());
644     TWERD *tessword = make_tess_word(bln_word, NULL);
645     if (tessword->blobs == NULL) {
646       make_tess_word(bln_word, NULL);
647     }
648     TBLOB *pblob;
649     TBLOB *blob;
650     init_match_table();
651     BLOB_CHOICE_LIST *match_result;
652     BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
653     tesseract_->tess_denorm = &word_res->denorm;
654     tesseract_->tess_word = bln_word;
655     ASSERT_HOST(tessword->blobs != NULL);
656     for (blob = tessword->blobs, pblob = NULL;
657          blob != NULL; blob = blob->next) {
658       match_result = tesseract_->classify_blob(pblob, blob, blob->next, NULL,
659                                    "chop_word:", Green);
660       if (match_result == NULL)
661         tprintf("Null classifier output!\n");
662       tesseract_->modify_blob_choice(match_result, 0);
663       ASSERT_HOST(!match_result->empty());
664       *char_choices += match_result;
665       pblob = blob;
666     }
667     inT32 blob_number;
668     SEAMS seam_list = start_seam_list(tessword->blobs);
669     int right_chop_index = 0;
670     while (tesseract_->chop_one_blob(tessword, char_choices,
671                                     &blob_number, &seam_list,
672                                     &right_chop_index))   {
673     }
674 
675     word_res->best_choice = new WERD_CHOICE();
676     word_res->raw_choice = new WERD_CHOICE();
677     word_res->best_choice->make_bad();
678     word_res->raw_choice->make_bad();
679     tesseract_->getDict().permute_characters(*char_choices, 1000.0,
680                                              word_res->best_choice,
681                                              word_res->raw_choice);
682 
683     word_res->outword = make_ed_word(tessword, bln_word);
684     page_res_it.forward();
685   }
686   return 0;
687 }
688 
689 // Make a text string from the internal data structures.
GetUTF8Text()690 char* TessBaseAPI::GetUTF8Text() {
691   if (tesseract_ == NULL ||
692       (page_res_ == NULL && Recognize(NULL) < 0))
693     return NULL;
694   int total_length = TextLength(NULL);
695   PAGE_RES_IT   page_res_it(page_res_);
696   char* result = new char[total_length];
697   char* ptr = result;
698   for (page_res_it.restart_page(); page_res_it.word () != NULL;
699        page_res_it.forward()) {
700     WERD_RES *word = page_res_it.word();
701     WERD_CHOICE* choice = word->best_choice;
702     if (choice != NULL) {
703       strcpy(ptr, choice->unichar_string().string());
704       ptr += choice->unichar_string().length();
705       if (word->word->flag(W_EOL))
706         *ptr++ = '\n';
707       else
708         *ptr++ = ' ';
709     }
710   }
711   *ptr++ = '\n';
712   *ptr = '\0';
713   return result;
714 }
715 
ConvertWordToBoxText(WERD_RES * word,ROW_RES * row,int left,int bottom,char * word_str)716 static int ConvertWordToBoxText(WERD_RES *word,
717                                 ROW_RES* row,
718                                 int left,
719                                 int bottom,
720                                 char* word_str) {
721   // Copy the output word and denormalize it back to image coords.
722   WERD copy_outword;
723   copy_outword = *(word->outword);
724   copy_outword.baseline_denormalise(&word->denorm);
725   PBLOB_IT blob_it;
726   blob_it.set_to_list(copy_outword.blob_list());
727   int length = copy_outword.blob_list()->length();
728   int output_size = 0;
729 
730   if (length > 0) {
731     for (int index = 0, offset = 0; index < length;
732          offset += word->best_choice->unichar_lengths()[index++],
733          blob_it.forward()) {
734       PBLOB* blob = blob_it.data();
735       TBOX blob_box = blob->bounding_box();
736       if (word->tess_failed ||
737           blob_box.left() < 0 ||
738           blob_box.right() > page_image.get_xsize() ||
739           blob_box.bottom() < 0 ||
740           blob_box.top() > page_image.get_ysize()) {
741         // Bounding boxes can be illegal when tess fails on a word.
742         blob_box = word->word->bounding_box();  // Use original word as backup.
743         tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
744                 blob_box.left(), blob_box.bottom(),
745                 blob_box.right(), blob_box.top());
746       }
747 
748       // A single classification unit can be composed of several UTF-8
749       // characters. Append each of them to the result.
750       for (int sub = 0;
751            sub < word->best_choice->unichar_lengths()[index]; ++sub) {
752         char ch = word->best_choice->unichar_string()[offset + sub];
753         // Tesseract uses space for recognition failure. Fix to a reject
754         // character, kTesseractReject so we don't create illegal box files.
755         if (ch == ' ')
756           ch = kTesseractReject;
757         word_str[output_size++] = ch;
758       }
759       sprintf(word_str + output_size, " %d %d %d %d\n",
760               blob_box.left() + left, blob_box.bottom() + bottom,
761               blob_box.right() + left, blob_box.top() + bottom);
762       output_size += strlen(word_str + output_size);
763     }
764   }
765   return output_size;
766 }
767 
768 // Multiplier for max expected textlength assumes typically 4 numbers @
769 // (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the
770 // orginal UTF8 characters, and one kMaxCharsPerChar.
771 const int kCharsPerChar = 25;
772 // A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a
773 // space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR.
774 // Test against this on each iteration for safety.
775 const int kMaxCharsPerChar = 85 + UNICHAR_LEN;
776 
777 // The recognized text is returned as a char* which is coded
778 // as a UTF8 box file and must be freed with the delete [] operator.
GetBoxText()779 char* TessBaseAPI::GetBoxText() {
780   int bottom = image_height_ - (rect_top_ + rect_height_);
781   if (tesseract_ == NULL ||
782       (page_res_ == NULL && Recognize(NULL) < 0))
783     return NULL;
784   int blob_count;
785   int utf8_length = TextLength(&blob_count);
786   int total_length = blob_count*kCharsPerChar + utf8_length + kMaxCharsPerChar;
787   PAGE_RES_IT   page_res_it(page_res_);
788   char* result = new char[total_length];
789   char* ptr = result;
790   for (page_res_it.restart_page(); page_res_it.word () != NULL;
791        page_res_it.forward()) {
792     WERD_RES *word = page_res_it.word();
793     ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
794                                 ptr);
795     // Just in case...
796     if (ptr - result + kMaxCharsPerChar > total_length)
797       break;
798   }
799   *ptr = '\0';
800   return result;
801 }
802 
803 // Conversion table for non-latin characters.
804 // Maps characters out of the latin set into the latin set.
805 // TODO(rays) incorporate this translation into unicharset.
806 const int kUniChs[] = {
807   0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
808 };
809 // Latin chars corresponding to the unicode chars above.
810 const int kLatinChs[] = {
811   0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
812 };
813 
814 // The recognized text is returned as a char* which is coded
815 // as UNLV format Latin-1 with specific reject and suspect codes
816 // and must be freed with the delete [] operator.
GetUNLVText()817 char* TessBaseAPI::GetUNLVText() {
818   if (tesseract_ == NULL ||
819       (page_res_ == NULL && Recognize(NULL) < 0))
820     return NULL;
821   bool tilde_crunch_written = false;
822   bool last_char_was_newline = true;
823   bool last_char_was_tilde = false;
824 
825   int total_length = TextLength(NULL);
826   PAGE_RES_IT   page_res_it(page_res_);
827   char* result = new char[total_length];
828   char* ptr = result;
829   for (page_res_it.restart_page(); page_res_it.word () != NULL;
830        page_res_it.forward()) {
831     WERD_RES *word = page_res_it.word();
832     // Process the current word.
833     if (word->unlv_crunch_mode != CR_NONE) {
834       if (word->unlv_crunch_mode != CR_DELETE &&
835           (!tilde_crunch_written ||
836            (word->unlv_crunch_mode == CR_KEEP_SPACE &&
837             word->word->space() > 0 &&
838             !word->word->flag(W_FUZZY_NON) &&
839             !word->word->flag(W_FUZZY_SP)))) {
840         if (!word->word->flag(W_BOL) &&
841             word->word->space() > 0 &&
842             !word->word->flag(W_FUZZY_NON) &&
843             !word->word->flag(W_FUZZY_SP)) {
844           /* Write a space to separate from preceeding good text */
845           *ptr++ = ' ';
846           last_char_was_tilde = false;
847         }
848         if (!last_char_was_tilde) {
849           // Write a reject char.
850           last_char_was_tilde = true;
851           *ptr++ = kUNLVReject;
852           tilde_crunch_written = true;
853           last_char_was_newline = false;
854         }
855       }
856     } else {
857       // NORMAL PROCESSING of non tilde crunched words.
858       tilde_crunch_written = false;
859 
860       if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
861         ensure_rep_chars_are_consistent(word);
862 
863       tesseract_->set_unlv_suspects(word);
864       const char* wordstr = word->best_choice->unichar_string().string();
865       const STRING& lengths = word->best_choice->unichar_lengths();
866       int length = lengths.length();
867       int i = 0;
868       int offset = 0;
869 
870       if (last_char_was_tilde &&
871           word->word->space() == 0 && wordstr[offset] == ' ') {
872         // Prevent adjacent tilde across words - we know that adjacent tildes
873         // within words have been removed.
874         // Skip the first character.
875         offset = lengths[i++];
876       }
877       if (i < length && wordstr[offset] != 0) {
878         if (!last_char_was_newline)
879           *ptr++ = ' ';
880         else
881           last_char_was_newline = false;
882         for (; i < length; offset += lengths[i++]) {
883           if (wordstr[offset] == ' ' ||
884               wordstr[offset] == kTesseractReject) {
885             *ptr++ = kUNLVReject;
886             last_char_was_tilde = true;
887           } else {
888             if (word->reject_map[i].rejected())
889               *ptr++ = kUNLVSuspect;
890             UNICHAR ch(wordstr + offset, lengths[i]);
891             int uni_ch = ch.first_uni();
892             for (int j = 0; kUniChs[j] != 0; ++j) {
893               if (kUniChs[j] == uni_ch) {
894                 uni_ch = kLatinChs[j];
895                 break;
896               }
897             }
898             if (uni_ch <= 0xff) {
899               *ptr++ = static_cast<char>(uni_ch);
900               last_char_was_tilde = false;
901             } else {
902               *ptr++ = kUNLVReject;
903               last_char_was_tilde = true;
904             }
905           }
906         }
907       }
908     }
909     if (word->word->flag(W_EOL) && !last_char_was_newline) {
910       /* Add a new line output */
911       *ptr++ = '\n';
912       tilde_crunch_written = false;
913       last_char_was_newline = true;
914       last_char_was_tilde = false;
915     }
916   }
917   *ptr++ = '\n';
918   *ptr = '\0';
919   return result;
920 }
921 
922 // Returns the average word confidence for Tesseract page result.
MeanTextConf()923 int TessBaseAPI::MeanTextConf() {
924   int* conf = AllWordConfidences();
925   if (!conf) return 0;
926   int sum = 0;
927   int *pt = conf;
928   while (*pt >= 0) sum += *pt++;
929   if (pt != conf) sum /= pt - conf;
930   delete [] conf;
931   return sum;
932 }
933 
934 // Returns an array of all word confidences, terminated by -1.
AllWordConfidences()935 int* TessBaseAPI::AllWordConfidences() {
936   if (tesseract_ == NULL ||
937       (page_res_ == NULL && Recognize(NULL) < 0))
938     return NULL;
939   int n_word = 0;
940   PAGE_RES_IT res_it(page_res_);
941   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
942     n_word++;
943 
944   int* conf = new int[n_word+1];
945   n_word = 0;
946   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
947     WERD_RES *word = res_it.word();
948     WERD_CHOICE* choice = word->best_choice;
949     int w_conf = static_cast<int>(100 + 5 * choice->certainty());
950                  // This is the eq for converting Tesseract confidence to 1..100
951     if (w_conf < 0) w_conf = 0;
952     if (w_conf > 100) w_conf = 100;
953     conf[n_word++] = w_conf;
954   }
955   conf[n_word] = -1;
956   return conf;
957 }
958 
959 // Free up recognition results and any stored image data, without actually
960 // freeing any recognition data that would be time-consuming to reload.
961 // Afterwards, you must call SetImage or TesseractRect before doing
962 // any Recognize or Get* operation.
Clear()963 void TessBaseAPI::Clear() {
964   if (thresholder_ != NULL)
965     thresholder_->Clear();
966   ClearResults();
967   page_image.destroy();
968 }
969 
970 // Close down tesseract and free up all memory. End() is equivalent to
971 // destructing and reconstructing your TessBaseAPI.
972 // Once End() has been used, none of the other API functions may be used
973 // other than Init and anything declared above it in the class definition.
End()974 void TessBaseAPI::End() {
975   if (thresholder_ != NULL) {
976     delete thresholder_;
977     thresholder_ = NULL;
978   }
979   if (page_res_ != NULL) {
980     delete page_res_;
981     page_res_ = NULL;
982   }
983   if (block_list_ != NULL) {
984     delete block_list_;
985     block_list_ = NULL;
986   }
987   if (tesseract_ != NULL) {
988     tesseract_->end_tesseract();
989     delete tesseract_;
990     tesseract_ = NULL;
991   }
992   if (input_file_ != NULL) {
993     delete input_file_;
994     input_file_ = NULL;
995   }
996   if (output_file_ != NULL) {
997     delete output_file_;
998     output_file_ = NULL;
999   }
1000   if (datapath_ != NULL) {
1001     delete datapath_;
1002     datapath_ = NULL;
1003   }
1004   if (language_ != NULL) {
1005     delete language_;
1006     language_ = NULL;
1007   }
1008 }
1009 
1010 // Check whether a word is valid according to Tesseract's language model
1011 // returns 0 if the word is invalid, non-zero if valid
IsValidWord(const char * word)1012 int TessBaseAPI::IsValidWord(const char *word) {
1013   return tesseract_->getDict().valid_word(word);
1014 }
1015 
1016 
GetTextDirection(int * out_offset,float * out_slope)1017 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
1018   if (page_res_ == NULL)
1019     FindLines();
1020   if (block_list_->length() < 1) {
1021     return false;
1022   }
1023 
1024   // Get first block
1025   BLOCK_IT block_it(block_list_);
1026   block_it.move_to_first();
1027   ROW_LIST* rows = block_it.data()->row_list();
1028   if (rows->length() != 1) {
1029     return false;
1030   }
1031 
1032   // Get first line of block
1033   ROW_IT row_it(rows);
1034   row_it.move_to_first();
1035   ROW* row = row_it.data();
1036 
1037   // Calculate offset and slope (NOTE: Kind of ugly)
1038   *out_offset = static_cast<int>(row->base_line(0.0));
1039   *out_slope = row->base_line(1.0) - row->base_line(0.0);
1040 
1041   return true;
1042 }
1043 
1044 // Set the letter_is_okay function to point somewhere else.
SetDictFunc(DictFunc f)1045 void TessBaseAPI::SetDictFunc(DictFunc f) {
1046   if (tesseract_ != NULL) {
1047     tesseract_->getDict().letter_is_okay_ = f;
1048   }
1049 }
1050 
1051 // Common code for setting the image.
InternalSetImage()1052 bool TessBaseAPI::InternalSetImage() {
1053   if (tesseract_ == NULL) {
1054     tprintf("Please call Init before attempting to send an image.");
1055     return false;
1056   }
1057   if (thresholder_ == NULL)
1058     thresholder_ = new ImageThresholder;
1059   ClearResults();
1060   return true;
1061 }
1062 
1063 // Run the thresholder to make the thresholded image. If pix is not NULL,
1064 // the source is thresholded to pix instead of the internal IMAGE.
Threshold(Pix ** pix)1065 void TessBaseAPI::Threshold(Pix** pix) {
1066 #ifdef HAVE_LIBLEPT
1067   if (pix != NULL)
1068     thresholder_->ThresholdToPix(pix);
1069   else
1070     thresholder_->ThresholdToIMAGE(&page_image);
1071 #else
1072   thresholder_->ThresholdToIMAGE(&page_image);
1073 #endif
1074   thresholder_->GetImageSizes(&rect_left_, &rect_top_,
1075                               &rect_width_, &rect_height_,
1076                               &image_width_, &image_height_);
1077   threshold_done_ = true;
1078 }
1079 
1080 // Find lines from the image making the BLOCK_LIST.
FindLines()1081 int TessBaseAPI::FindLines() {
1082   if (!block_list_->empty()) {
1083     return 0;
1084   }
1085   if (tesseract_ == NULL) {
1086     tesseract_ = new Tesseract;
1087     tesseract_->InitAdaptiveClassifier();
1088   }
1089 #ifdef HAVE_LIBLEPT
1090   if (tesseract_->pix_binary() == NULL)
1091     Threshold(tesseract_->mutable_pix_binary());
1092 #endif
1093   if (!threshold_done_)
1094     Threshold(NULL);
1095 
1096   if (tesseract_->SegmentPage(input_file_, &page_image, block_list_) < 0)
1097     return -1;
1098   ASSERT_HOST(page_image.get_xsize() == rect_width_ ||
1099               page_image.get_xsize() == rect_width_ - 1);
1100   ASSERT_HOST(page_image.get_ysize() == rect_height_ ||
1101               page_image.get_ysize() == rect_height_ - 1);
1102   return 0;
1103 }
1104 
1105 // Delete the pageres and clear the block list ready for a new page.
ClearResults()1106 void TessBaseAPI::ClearResults() {
1107   threshold_done_ = false;
1108   if (tesseract_ != NULL)
1109     tesseract_->Clear();
1110   if (page_res_ != NULL) {
1111     delete page_res_;
1112     page_res_ = NULL;
1113   }
1114   if (block_list_ == NULL)
1115     block_list_ = new BLOCK_LIST;
1116   else
1117     block_list_->clear();
1118 }
1119 
1120 // Return the length of the output text string, as UTF8, assuming
1121 // one newline per line and one per block, with a terminator,
1122 // and assuming a single character reject marker for each rejected character.
1123 // Also return the number of recognized blobs in blob_count.
TextLength(int * blob_count)1124 int TessBaseAPI::TextLength(int* blob_count) {
1125   if (tesseract_ == NULL || page_res_ == NULL)
1126     return 0;
1127 
1128   PAGE_RES_IT   page_res_it(page_res_);
1129   int total_length = 2;
1130   int total_blobs = 0;
1131   // Iterate over the data structures to extract the recognition result.
1132   for (page_res_it.restart_page(); page_res_it.word () != NULL;
1133        page_res_it.forward()) {
1134     WERD_RES *word = page_res_it.word();
1135     WERD_CHOICE* choice = word->best_choice;
1136     if (choice != NULL) {
1137       total_blobs += choice->length() + 1;
1138       total_length += choice->unichar_string().length() + 1;
1139       for (int i = 0; i < word->reject_map.length(); ++i) {
1140         if (word->reject_map[i].rejected())
1141           ++total_length;
1142       }
1143     }
1144   }
1145   if (blob_count != NULL)
1146     *blob_count = total_blobs;
1147   return total_length;
1148 }
1149 
1150 // Estimates the Orientation And Script of the image.
1151 // Returns true if the image was processed successfully.
DetectOS(OSResults * osr)1152 bool TessBaseAPI::DetectOS(OSResults* osr) {
1153   if (tesseract_ == NULL)
1154     return false;
1155   ClearResults();
1156   Threshold(NULL);
1157   if (input_file_ == NULL)
1158     input_file_ = new STRING(kInputFile);
1159   return orientation_and_script_detection(*input_file_, osr, tesseract_);
1160 }
1161 
1162 // ____________________________________________________________________________
1163 // Ocropus add-ons.
1164 
1165 // Find lines from the image making the BLOCK_LIST.
FindLinesCreateBlockList()1166 BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
1167   FindLines();
1168   BLOCK_LIST* result = block_list_;
1169   block_list_ = NULL;
1170   return result;
1171 }
1172 
1173 // Delete a block list.
1174 // This is to keep BLOCK_LIST pointer opaque
1175 // and let go of including the other headers.
DeleteBlockList(BLOCK_LIST * block_list)1176 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
1177   delete block_list;
1178 }
1179 
1180 
make_tess_ocrrow(float baseline,float xheight,float descender,float ascender)1181 static ROW *make_tess_ocrrow(float baseline,
1182                              float xheight,
1183                              float descender,
1184                              float ascender) {
1185   inT32 xstarts[] = {-32000};
1186   double quad_coeffs[] = {0, 0, baseline};
1187   return new ROW(1,
1188                  xstarts,
1189                  quad_coeffs,
1190                  xheight,
1191                  ascender - (baseline + xheight),
1192                  descender - baseline,
1193                  0,
1194                  0);
1195 }
1196 
1197 // Almost a copy of make_tess_row() from ccmain/tstruct.cpp.
fill_dummy_row(float baseline,float xheight,float descender,float ascender,TEXTROW * tessrow)1198 static void fill_dummy_row(float baseline, float xheight,
1199                            float descender, float ascender,
1200                            TEXTROW* tessrow) {
1201   tessrow->baseline.segments = 1;
1202   tessrow->baseline.xstarts[0] = -32767;
1203   tessrow->baseline.xstarts[1] = 32767;
1204   tessrow->baseline.quads[0].a = 0;
1205   tessrow->baseline.quads[0].b = 0;
1206   tessrow->baseline.quads[0].c = bln_baseline_offset;
1207   tessrow->xheight.segments = 1;
1208   tessrow->xheight.xstarts[0] = -32767;
1209   tessrow->xheight.xstarts[1] = 32767;
1210   tessrow->xheight.quads[0].a = 0;
1211   tessrow->xheight.quads[0].b = 0;
1212   tessrow->xheight.quads[0].c = bln_baseline_offset + bln_x_height;
1213   tessrow->lineheight = bln_x_height;
1214   tessrow->ascrise = bln_x_height * (ascender - (xheight + baseline)) / xheight;
1215   tessrow->descdrop = bln_x_height * (descender - baseline) / xheight;
1216 }
1217 
1218 
1219 // Return a TBLOB * from the whole page_image.
1220 // To be freed later with free_blob().
make_tesseract_blob(float baseline,float xheight,float descender,float ascender)1221 TBLOB *make_tesseract_blob(float baseline, float xheight,
1222                            float descender, float ascender) {
1223   BLOCK *block = new BLOCK("a character",
1224                            TRUE,
1225                            0, 0,
1226                            0, 0,
1227                            page_image.get_xsize(),
1228                            page_image.get_ysize());
1229 
1230   // Create C_BLOBs from the page
1231   extract_edges(
1232 #ifndef GRAPHICS_DISABLED
1233 		NULL,
1234 #endif
1235 		&page_image, &page_image,
1236                 ICOORD(page_image.get_xsize(), page_image.get_ysize()),
1237                 block);
1238 
1239   // Create one PBLOB from all C_BLOBs
1240   C_BLOB_LIST *list = block->blob_list();
1241   C_BLOB_IT c_blob_it(list);
1242   PBLOB *pblob = new PBLOB;  // will be (hopefully) deleted by the pblob_list
1243   for (c_blob_it.mark_cycle_pt();
1244        !c_blob_it.cycled_list();
1245        c_blob_it.forward()) {
1246       C_BLOB *c_blob = c_blob_it.data();
1247       PBLOB c_as_p(c_blob, baseline + xheight);
1248       merge_blobs(pblob, &c_as_p);
1249   }
1250   PBLOB_LIST *pblob_list = new PBLOB_LIST;  // will be deleted by the word
1251   PBLOB_IT pblob_it(pblob_list);
1252   pblob_it.add_after_then_move(pblob);
1253 
1254   // Normalize PBLOB
1255   WERD word(pblob_list, 0, " ");
1256   ROW *row = make_tess_ocrrow(baseline, xheight, descender, ascender);
1257   word.baseline_normalise(row);
1258   delete row;
1259 
1260   // Create a TBLOB from PBLOB
1261   return make_tess_blob(pblob, /* flatten: */ TRUE);
1262 }
1263 
1264 
1265 // Adapt to recognize the current image as the given character.
1266 // The image must be preloaded and be just an image of a single character.
AdaptToCharacter(const char * unichar_repr,int length,float baseline,float xheight,float descender,float ascender)1267 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
1268                                    int length,
1269                                    float baseline,
1270                                    float xheight,
1271                                    float descender,
1272                                    float ascender) {
1273   UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
1274   LINE_STATS LineStats;
1275   TEXTROW row;
1276   fill_dummy_row(baseline, xheight, descender, ascender, &row);
1277   GetLineStatsFromRow(&row, &LineStats);
1278 
1279   TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender);
1280   float threshold;
1281   UNICHAR_ID best_class = 0;
1282   float best_rating = -100;
1283 
1284 
1285   // Classify to get a raw choice.
1286   BLOB_CHOICE_LIST choices;
1287   tesseract_->AdaptiveClassifier(blob, NULL, &row, &choices, NULL);
1288   BLOB_CHOICE_IT choice_it;
1289   choice_it.set_to_list(&choices);
1290   for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1291        choice_it.forward()) {
1292     if (choice_it.data()->rating() > best_rating) {
1293       best_rating = choice_it.data()->rating();
1294       best_class = choice_it.data()->unichar_id();
1295     }
1296   }
1297 
1298   if (id == best_class) {
1299     threshold = matcher_good_threshold;
1300   } else {
1301     /* the blob was incorrectly classified - find the rating threshold
1302        needed to create a template which will correct the error with
1303        some margin.  However, don't waste time trying to make
1304        templates which are too tight. */
1305     threshold = tesseract_->GetBestRatingFor(blob, &LineStats, id);
1306     threshold *= .9;
1307     const float max_threshold = .125;
1308     const float min_threshold = .02;
1309 
1310     if (threshold > max_threshold)
1311         threshold = max_threshold;
1312 
1313     // I have cuddled the following line to set it out of the strike
1314     // of the coverage testing tool. I have no idea how to trigger
1315     // this situation nor I have any necessity to do it. --mezhirov
1316     if (threshold < min_threshold) threshold = min_threshold;
1317   }
1318 
1319   if (blob->outlines)
1320     tesseract_->AdaptToChar(blob, &LineStats, id, threshold);
1321   free_blob(blob);
1322 }
1323 
1324 
RecognitionPass1(BLOCK_LIST * block_list)1325 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
1326   PAGE_RES *page_res = new PAGE_RES(block_list);
1327   tesseract_->recog_all_words(page_res, NULL, NULL, 1);
1328   return page_res;
1329 }
1330 
RecognitionPass2(BLOCK_LIST * block_list,PAGE_RES * pass1_result)1331 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
1332                                         PAGE_RES* pass1_result) {
1333   if (!pass1_result)
1334     pass1_result = new PAGE_RES(block_list);
1335   tesseract_->recog_all_words(pass1_result, NULL, NULL, 2);
1336   return pass1_result;
1337 }
1338 
1339 struct TESS_CHAR : ELIST_LINK {
1340   char *unicode_repr;
1341   int length;  // of unicode_repr
1342   float cost;
1343   TBOX box;
1344 
TESS_CHARtesseract::TESS_CHAR1345   TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
1346     length = (len == -1 ? strlen(repr) : len);
1347     unicode_repr = new char[length + 1];
1348     strncpy(unicode_repr, repr, length);
1349   }
1350 
TESS_CHARtesseract::TESS_CHAR1351   TESS_CHAR() {  // Satisfies ELISTIZE.
1352   }
~TESS_CHARtesseract::TESS_CHAR1353   ~TESS_CHAR() {
1354     delete [] unicode_repr;
1355   }
1356 };
1357 
1358 ELISTIZEH(TESS_CHAR)
ELISTIZE(TESS_CHAR)1359 ELISTIZE(TESS_CHAR)
1360 
1361 static void add_space(TESS_CHAR_IT* it) {
1362   TESS_CHAR *t = new TESS_CHAR(0, " ");
1363   it->add_after_then_move(t);
1364 }
1365 
1366 
// Maps a rating to a cost of 100 + 5 * rating, with negative results
// raised to 0. There is no upper limit.
static float rating_to_cost(float rating) {
  const float cost = 100 + 5 * rating;
  // Ratings below -20 would give a negative cost; floor those at zero.
  return cost < 0 ? 0.0f : cost;
}
1375 
1376 
1377 // Extract the OCR results, costs (penalty points for uncertainty),
1378 // and the bounding boxes of the characters.
extract_result(TESS_CHAR_IT * out,PAGE_RES * page_res)1379 static void extract_result(TESS_CHAR_IT* out,
1380                            PAGE_RES* page_res) {
1381   PAGE_RES_IT page_res_it(page_res);
1382   int word_count = 0;
1383   while (page_res_it.word() != NULL) {
1384     WERD_RES *word = page_res_it.word();
1385     const char *str = word->best_choice->unichar_string().string();
1386     const char *len = word->best_choice->unichar_lengths().string();
1387 
1388     if (word_count)
1389       add_space(out);
1390     TBOX bln_rect;
1391     PBLOB_LIST *blobs = word->outword->blob_list();
1392     PBLOB_IT it(blobs);
1393     int n = strlen(len);
1394     TBOX** boxes_to_fix = new TBOX*[n];
1395     for (int i = 0; i < n; i++) {
1396       PBLOB *blob = it.data();
1397       TBOX current = blob->bounding_box();
1398       bln_rect = bln_rect.bounding_union(current);
1399       TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->certainty()),
1400                                     str, *len);
1401       tc->box = current;
1402       boxes_to_fix[i] = &tc->box;
1403 
1404       out->add_after_then_move(tc);
1405       it.forward();
1406       str += *len;
1407       len++;
1408     }
1409 
1410     // Find the word bbox before normalization.
1411     // Here we can't use the C_BLOB bboxes directly,
1412     // since connected letters are not yet cut.
1413     TBOX real_rect = word->word->bounding_box();
1414 
1415     // Denormalize boxes by transforming the bbox of the whole bln word
1416     // into the denorm bbox (`real_rect') of the whole word.
1417     double x_stretch = static_cast<double>(real_rect.width())
1418                      / bln_rect.width();
1419     double y_stretch = static_cast<double>(real_rect.height())
1420                      / bln_rect.height();
1421     for (int j = 0; j < n; j++) {
1422       TBOX *box = boxes_to_fix[j];
1423       int x0 = static_cast<int>(real_rect.left() +
1424                    x_stretch * (box->left() - bln_rect.left()) + 0.5);
1425       int x1 = static_cast<int>(real_rect.left() +
1426                    x_stretch * (box->right() - bln_rect.left()) + 0.5);
1427       int y0 = static_cast<int>(real_rect.bottom() +
1428                    y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5);
1429       int y1 = static_cast<int>(real_rect.bottom() +
1430                    y_stretch * (box->top() - bln_rect.bottom()) + 0.5);
1431       *box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1));
1432     }
1433     delete [] boxes_to_fix;
1434 
1435     page_res_it.forward();
1436     word_count++;
1437   }
1438 }
1439 
1440 
1441 // Extract the OCR results, costs (penalty points for uncertainty),
1442 // and the bounding boxes of the characters.
TesseractExtractResult(char ** text,int ** lengths,float ** costs,int ** x0,int ** y0,int ** x1,int ** y1,PAGE_RES * page_res)1443 int TessBaseAPI::TesseractExtractResult(char** text,
1444                                         int** lengths,
1445                                         float** costs,
1446                                         int** x0,
1447                                         int** y0,
1448                                         int** x1,
1449                                         int** y1,
1450                                         PAGE_RES* page_res) {
1451   TESS_CHAR_LIST tess_chars;
1452   TESS_CHAR_IT tess_chars_it(&tess_chars);
1453   extract_result(&tess_chars_it, page_res);
1454   tess_chars_it.move_to_first();
1455   int n = tess_chars.length();
1456   int text_len = 0;
1457   *lengths = new int[n];
1458   *costs = new float[n];
1459   *x0 = new int[n];
1460   *y0 = new int[n];
1461   *x1 = new int[n];
1462   *y1 = new int[n];
1463   int i = 0;
1464   for (tess_chars_it.mark_cycle_pt();
1465        !tess_chars_it.cycled_list();
1466        tess_chars_it.forward(), i++) {
1467     TESS_CHAR *tc = tess_chars_it.data();
1468     text_len += (*lengths)[i] = tc->length;
1469     (*costs)[i] = tc->cost;
1470     (*x0)[i] = tc->box.left();
1471     (*y0)[i] = tc->box.bottom();
1472     (*x1)[i] = tc->box.right();
1473     (*y1)[i] = tc->box.top();
1474   }
1475   char *p = *text = new char[text_len];
1476 
1477   tess_chars_it.move_to_first();
1478   for (tess_chars_it.mark_cycle_pt();
1479         !tess_chars_it.cycled_list();
1480        tess_chars_it.forward()) {
1481     TESS_CHAR *tc = tess_chars_it.data();
1482     strncpy(p, tc->unicode_repr, tc->length);
1483     p += tc->length;
1484   }
1485   return n;
1486 }
1487 
1488 // This method returns the features associated with the current image.
1489 // Make sure setimage has been called before calling this method.
GetFeatures(INT_FEATURE_ARRAY int_features,int * num_features)1490 void TessBaseAPI::GetFeatures(INT_FEATURE_ARRAY int_features,
1491                               int* num_features) {
1492   if (page_res_ != NULL)
1493     ClearResults();
1494   if (!threshold_done_)
1495     Threshold(NULL);
1496   // We have only one block, which is of the size of the page.
1497   BLOCK_LIST* blocks = new BLOCK_LIST;
1498   BLOCK *block = new BLOCK("",                       // filename.
1499                            TRUE,                     // proportional.
1500                            0,                        // kerning.
1501                            0,                        // spacing.
1502                            0,                        // Left.
1503                            0,                        // Bottom.
1504                            page_image.get_xsize(),   // Right.
1505                            page_image.get_ysize());  // Top.
1506   ICOORD bleft, tright;
1507   block->bounding_box (bleft, tright);
1508 
1509   BLOCK_IT block_it_add = blocks;
1510   block_it_add.add_to_end(block);
1511 
1512   ICOORD page_tr(page_image.get_xsize(), page_image.get_ysize());
1513   TEXTROW tessrow;
1514   make_tess_row(NULL,       // Denormalizer.
1515                 &tessrow);  // Output row.
1516   LINE_STATS line_stats;
1517   GetLineStatsFromRow(&tessrow, &line_stats);
1518 
1519   // Perform a CC analysis to detect the blobs.
1520   BLOCK_IT block_it = blocks;
1521   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
1522        block_it.forward ()) {
1523     BLOCK* block = block_it.data();
1524 #ifndef GRAPHICS_DISABLED
1525     extract_edges(NULL,         // Scrollview window.
1526                   &page_image,  // Image.
1527                   &page_image,  // Thresholded image.
1528                   page_tr,      // corner of page.
1529                   block);       // block.
1530 #else
1531     extract_edges(&page_image,  // Image.
1532                   &page_image,  // Thresholded image.
1533                   page_tr,      // corner of page.
1534                   block);       // block.
1535 #endif
1536     C_BLOB_IT blob_it = block->blob_list();
1537     PBLOB *pblob = new PBLOB;
1538     // Iterate over all blobs found and get their features.
1539     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
1540          blob_it.forward()) {
1541       C_BLOB* blob = blob_it.data();
1542       blob = blob;
1543       PBLOB c_as_p(blob, page_image.get_ysize());
1544       merge_blobs(pblob, &c_as_p);
1545     }
1546 
1547     PBLOB_LIST *pblob_list = new PBLOB_LIST;
1548     PBLOB_IT pblob_it(pblob_list);
1549     pblob_it.add_after_then_move(pblob);
1550     WERD word(pblob_list,  // Blob list.
1551               0,           // Blanks in front.
1552               " ");        // Correct text.
1553     ROW *row = make_tess_ocrrow(0,                       // baseline.
1554                                 page_image.get_ysize(),  // xheight.
1555                                 0,                       // ascent.
1556                                 0);                      // descent.
1557     word.baseline_normalise(row);
1558     delete row;
1559     if (pblob->out_list () == NULL) {
1560       tprintf("Blob list is empty");
1561     }
1562     TBLOB* tblob = make_tess_blob(pblob,  // Blob.
1563                                   TRUE);  // Flatten.
1564 
1565     CLASS_NORMALIZATION_ARRAY norm_array;
1566     inT32 len;
1567     *num_features = tesseract_->GetCharNormFeatures(
1568         tblob, &line_stats,
1569         tesseract_->PreTrainedTemplates,
1570         int_features, norm_array, &len);
1571   }
1572   delete blocks;
1573 }
1574 
1575 // Return the pointer to the i-th dawg loaded into tesseract_ object.
GetDawg(int i) const1576 const Dawg *TessBaseAPI::GetDawg(int i) const {
1577   if (tesseract_ == NULL || i >= NumDawgs()) return NULL;
1578   return tesseract_->getDict().GetDawg(i);
1579 }
1580 
1581 // Return the number of dawgs loaded into tesseract_ object.
NumDawgs() const1582 int TessBaseAPI::NumDawgs() const {
1583   return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
1584 }
1585 
1586 // Return the language used in the last valid initialization.
GetLastInitLanguage() const1587 const char* TessBaseAPI::GetLastInitLanguage() const {
1588   return (tesseract_ == NULL || tesseract_->lang.string() == NULL) ?
1589       "" : tesseract_->lang.string();
1590 }
1591 }  // namespace tesseract.
1592