1 /**********************************************************************
2 * File: baseapi.cpp
3 * Description: Simple API for calling tesseract.
4 * Author: Ray Smith
5 * Created: Fri Oct 06 15:35:01 PDT 2006
6 *
7 * (C) Copyright 2006, Google Inc.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24
25 #ifdef HAVE_LIBLEPT
26 // Include leptonica library only if autoconf (or makefile etc) tell us to.
27 #include "allheaders.h"
28 #endif
29
30 #include "baseapi.h"
31
32 #include "thresholder.h"
33 #include "tesseractmain.h"
34 #include "tesseractclass.h"
35 #include "tessedit.h"
36 #include "ocrclass.h"
37 #include "pageres.h"
38 #include "tessvars.h"
39 #include "control.h"
40 #include "applybox.h"
41 #include "pgedit.h"
42 #include "varabled.h"
43 #include "output.h"
44 #include "mainblk.h"
45 #include "globals.h"
46 #include "adaptmatch.h"
47 #include "edgblob.h"
48 #include "tessbox.h"
49 #include "tordvars.h"
50 #include "imgs.h"
51 #include "makerow.h"
52 #include "tstruct.h"
53 #include "tessout.h"
54 #include "tface.h"
55 #include "permute.h"
56 #include "otsuthr.h"
57 #include "osdetect.h"
58 #include "chopper.h"
59 #include "matchtab.h"
60
61 namespace tesseract {
62
// Smallest rectangle width or height that is worth running tesseract on.
const int kMinRectSize = 10;
// Character Tesseract emits when it could not recognize something.
const char kTesseractReject = '~';
// Reject character understood by the UNLV error counter.
const char kUNLVReject = '~';
// Suspect marker character used in UNLV output.
const char kUNLVSuspect = '^';
// Input image file name to fall back on when SetInputName has not supplied
// one; the name of a possible UNLV zone file is derived from it.
const char* kInputFile = "noname.tif";
74
TessBaseAPI()75 TessBaseAPI::TessBaseAPI()
76 : tesseract_(NULL),
77 // Thresholder is initialized to NULL here, but will be set before use by:
78 // A constructor of a derived API, SetThresholder(), or
79 // created implicitly when used in InternalSetImage.
80 thresholder_(NULL),
81 threshold_done_(false),
82 block_list_(NULL),
83 page_res_(NULL),
84 input_file_(NULL),
85 output_file_(NULL),
86 datapath_(NULL),
87 language_(NULL),
88 rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0),
89 image_width_(0), image_height_(0) {
90 }
91
~TessBaseAPI()92 TessBaseAPI::~TessBaseAPI() {
93 End();
94 }
95
96 // Set the name of the input file. Needed only for training and
97 // loading a UNLV zone file.
SetInputName(const char * name)98 void TessBaseAPI::SetInputName(const char* name) {
99 if (input_file_ == NULL)
100 input_file_ = new STRING(name);
101 else
102 *input_file_ = name;
103 }
104
105 // Set the name of the output files. Needed only for debugging.
SetOutputName(const char * name)106 void TessBaseAPI::SetOutputName(const char* name) {
107 if (output_file_ == NULL)
108 output_file_ = new STRING(name);
109 else
110 *output_file_ = name;
111 }
112
113 // Set the value of an internal "variable" (of either old or new types).
114 // Supply the name of the variable and the value as a string, just as
115 // you would in a config file.
116 // Returns false if the name lookup failed.
117 // SetVariable may be used before Init, to set things that control
118 // initialization, but note that on End all settings are lost and
119 // the next Init will use the defaults unless SetVariable is used again.
SetVariable(const char * variable,const char * value)120 bool TessBaseAPI::SetVariable(const char* variable, const char* value) {
121 if (tesseract_ == NULL)
122 tesseract_ = new Tesseract;
123 return set_variable(variable, value);
124 }
125
126 // The datapath must be the name of the data directory (no ending /) or
127 // some other file in which the data directory resides (for instance argv[0].)
128 // The language is (usually) an ISO 639-3 string or NULL will default to eng.
129 // If numeric_mode is true, then only digits and Roman numerals will
130 // be returned.
131 // Returns 0 on success and -1 on initialization failure.
Init(const char * datapath,const char * language,char ** configs,int configs_size,bool configs_global_only)132 int TessBaseAPI::Init(const char* datapath, const char* language,
133 char **configs, int configs_size,
134 bool configs_global_only) {
135 // If the datapath or the language have changed, then start again.
136 // Note that the language_ field stores the last requested language that was
137 // initialized successfully, while tesseract_->lang stores the language
138 // actually used. They differ only if the requested language was NULL, in
139 // which case tesseract_->lang is set to the Tesseract default ("eng").
140 if (tesseract_ != NULL &&
141 (datapath_ == NULL || language_ == NULL || *datapath_ != datapath
142 || (*language_ != language && tesseract_->lang != language))) {
143 tesseract_->end_tesseract();
144 delete tesseract_;
145 tesseract_ = NULL;
146 }
147
148 bool reset_classifier = true;
149 if (tesseract_ == NULL) {
150 reset_classifier = false;
151 tesseract_ = new Tesseract;
152 if (tesseract_->init_tesseract(
153 datapath, output_file_ != NULL ? output_file_->string() : NULL,
154 language, configs, configs_size, configs_global_only) != 0) {
155 return -1;
156 }
157 }
158 // Update datapath and language requested for the last valid initialization.
159 if (datapath_ == NULL)
160 datapath_ = new STRING(datapath);
161 else
162 *datapath_ = datapath;
163 if (language_ == NULL)
164 language_ = new STRING(language);
165 else
166 *language_ = language;
167
168 // For same language and datapath, just reset the adaptive classifier.
169 if (reset_classifier) tesseract_->ResetAdaptiveClassifier();
170
171 return 0;
172 }
173
174 // Init only the lang model component of Tesseract. The only functions
175 // that work after this init are SetVariable and IsValidWord.
176 // WARNING: temporary! This function will be removed from here and placed
177 // in a separate API at some future time.
InitLangMod(const char * datapath,const char * language)178 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
179 if (tesseract_ == NULL)
180 tesseract_ = new Tesseract;
181 return tesseract_->init_tesseract_lm(datapath, NULL, language);
182 }
183
184 // Init only the classifer component of Tesseract. Used to initialize the
185 // specified language when no dawg models are available.
InitWithoutLangModel(const char * datapath,const char * language)186 int TessBaseAPI::InitWithoutLangModel(const char* datapath,
187 const char* language) {
188 // If the datapath or the language have changed, then start again.
189 if (tesseract_ != NULL &&
190 (datapath_ == NULL || language_ == NULL ||
191 *datapath_ != datapath || *language_ != language)) {
192 tesseract_->end_tesseract();
193 delete tesseract_;
194 tesseract_ = NULL;
195 }
196 if (datapath_ == NULL)
197 datapath_ = new STRING(datapath);
198 else
199 *datapath_ = datapath;
200 if (language_ == NULL)
201 language_ = new STRING(language);
202 else
203 *language_ = language;
204 if (tesseract_ == NULL) {
205 tesseract_ = new Tesseract;
206 return tesseract_->init_tesseract_classifier(
207 datapath, output_file_ != NULL ? output_file_->string() : NULL,
208 language, NULL, 0, false);
209 }
210 // For same language and datapath, just reset the adaptive classifier.
211 tesseract_->ResetAdaptiveClassifier();
212 return 0;
213 }
214
215 // Read a "config" file containing a set of variable, value pairs.
216 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
217 // and also accepts a relative or absolute path name.
ReadConfigFile(const char * filename,bool global_only)218 void TessBaseAPI::ReadConfigFile(const char* filename, bool global_only) {
219 tesseract_->read_config_file(filename, global_only);
220 }
221
222 // Set the current page segmentation mode. Defaults to PSM_AUTO.
223 // The mode is stored as an INT_VARIABLE so it can also be modified by
224 // ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
SetPageSegMode(PageSegMode mode)225 void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
226 if (tesseract_ == NULL)
227 tesseract_ = new Tesseract;
228 tesseract_->tessedit_pageseg_mode.set_value(mode);
229 }
230
231 // Return the current page segmentation mode.
GetPageSegMode() const232 PageSegMode TessBaseAPI::GetPageSegMode() const {
233 if (tesseract_ == NULL)
234 return PSM_SINGLE_BLOCK;
235 return static_cast<PageSegMode>(
236 static_cast<int>(tesseract_->tessedit_pageseg_mode));
237 }
238
239 // Set the hint for trading accuracy against speed.
240 // Default is AVS_FASTEST, which is the old behaviour.
241 // Note that this is only a hint. Depending on the language and/or
242 // build configuration, speed and accuracy may not be tradeable.
243 // Also note that despite being an enum, any value in the range
244 // AVS_FASTEST to AVS_MOST_ACCURATE can be provided, and may or may not
245 // have an effect, depending on the implementation.
246 // The mode is stored as an INT_VARIABLE so it can also be modified by
247 // ReadConfigFile or SetVariable("tessedit_accuracyvspeed", mode as string).
SetAccuracyVSpeed(AccuracyVSpeed mode)248 void TessBaseAPI::SetAccuracyVSpeed(AccuracyVSpeed mode) {
249 if (tesseract_ == NULL)
250 tesseract_ = new Tesseract;
251 tesseract_->tessedit_accuracyvspeed.set_value(mode);
252 }
253
254 // Recognize a rectangle from an image and return the result as a string.
255 // May be called many times for a single Init.
256 // Currently has no error checking.
257 // Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
258 // Palette color images will not work properly and must be converted to
259 // 24 bit.
260 // Binary images of 1 bit per pixel may also be given but they must be
261 // byte packed with the MSB of the first byte being the first pixel, and a
262 // one pixel is WHITE. For binary images set bytes_per_pixel=0.
263 // The recognized text is returned as a char* which is coded
264 // as UTF8 and must be freed with the delete [] operator.
TesseractRect(const unsigned char * imagedata,int bytes_per_pixel,int bytes_per_line,int left,int top,int width,int height)265 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
266 int bytes_per_pixel,
267 int bytes_per_line,
268 int left, int top,
269 int width, int height) {
270 if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize)
271 return NULL; // Nothing worth doing.
272
273 // Since this original api didn't give the exact size of the image,
274 // we have to invent a reasonable value.
275 int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
276 SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height,
277 bytes_per_pixel, bytes_per_line);
278 SetRectangle(left, top, width, height);
279
280 return GetUTF8Text();
281 }
282
283 // Call between pages or documents etc to free up memory and forget
284 // adaptive data.
ClearAdaptiveClassifier()285 void TessBaseAPI::ClearAdaptiveClassifier() {
286 if (tesseract_ == NULL)
287 return;
288 tesseract_->ResetAdaptiveClassifier();
289 }
290
291 // Provide an image for Tesseract to recognize. Format is as
292 // TesseractRect above. Does not copy the image buffer, or take
293 // ownership. The source image may be destroyed after Recognize is called,
294 // either explicitly or implicitly via one of the Get*Text functions.
295 // SetImage clears all recognition results, and sets the rectangle to the
296 // full image, so it may be followed immediately by a GetUTF8Text, and it
297 // will automatically perform recognition.
SetImage(const unsigned char * imagedata,int width,int height,int bytes_per_pixel,int bytes_per_line)298 void TessBaseAPI::SetImage(const unsigned char* imagedata,
299 int width, int height,
300 int bytes_per_pixel, int bytes_per_line) {
301 if (InternalSetImage())
302 thresholder_->SetImage(imagedata, width, height,
303 bytes_per_pixel, bytes_per_line);
304 }
305
306 // Provide an image for Tesseract to recognize. As with SetImage above,
307 // Tesseract doesn't take a copy or ownership or pixDestroy the image, so
308 // it must persist until after Recognize.
309 // Pix vs raw, which to use?
310 // Use Pix where possible. A future version of Tesseract may choose to use Pix
311 // as its internal representation and discard IMAGE altogether.
312 // Because of that, an implementation that sources and targets Pix may end up
313 // with less copies than an implementation that does not.
SetImage(const Pix * pix)314 void TessBaseAPI::SetImage(const Pix* pix) {
315 #ifdef HAVE_LIBLEPT
316 if (InternalSetImage())
317 thresholder_->SetImage(pix);
318 #endif
319 }
320
321 // Restrict recognition to a sub-rectangle of the image. Call after SetImage.
322 // Each SetRectangle clears the recogntion results so multiple rectangles
323 // can be recognized with the same image.
SetRectangle(int left,int top,int width,int height)324 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
325 if (thresholder_ == NULL)
326 return;
327 thresholder_->SetRectangle(left, top, width, height);
328 ClearResults();
329 }
330
331 // ONLY available if you have Leptonica installed.
332 // Get a copy of the internal thresholded image from Tesseract.
GetThresholdedImage()333 Pix* TessBaseAPI::GetThresholdedImage() {
334 #ifdef HAVE_LIBLEPT
335 if (tesseract_ == NULL)
336 return NULL;
337 if (tesseract_->pix_binary() == NULL)
338 Threshold(tesseract_->mutable_pix_binary());
339 return pixClone(tesseract_->pix_binary());
340 #else
341 return NULL;
342 #endif
343 }
344
345 // Get the result of page layout analysis as a leptonica-style
346 // Boxa, Pixa pair, in reading order.
347 // Can be called before or after Recognize.
348 // For now only gets text regions.
GetRegions(Pixa ** pixa)349 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
350 #ifdef HAVE_LIBLEPT
351 if (block_list_ == NULL || block_list_->empty()) {
352 FindLines();
353 }
354 int im_height = pixGetHeight(tesseract_->pix_binary());
355 Boxa* boxa = boxaCreate(block_list_->length());
356 if (pixa != NULL) {
357 *pixa = pixaCreate(boxaGetCount(boxa));
358 }
359 BLOCK_IT it(block_list_);
360 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
361 BLOCK* block = it.data();
362 POLY_BLOCK* poly = block->poly_block();
363 TBOX box;
364 if (poly != NULL) {
365 if (!poly->IsText())
366 continue; // Use only text blocks.
367 POLY_BLOCK image_block(poly->points(), poly->isA());
368 image_block.rotate(block->re_rotation());
369 box = *image_block.bounding_box();
370 if (pixa != NULL) {
371 Pix* pix = pixCreate(box.width(), box.height(), 1);
372 PB_LINE_IT *lines;
373 // Block outline is a polygon, so use a PC_LINE_IT to get the
374 // rasterized interior. (Runs of interior pixels on a line.)
375 lines = new PB_LINE_IT(&image_block);
376 for (int y = box.bottom(); y < box.top(); ++y) {
377 ICOORDELT_LIST* segments = lines->get_line(y);
378 if (!segments->empty()) {
379 ICOORDELT_IT s_it(segments);
380 // Each element of segments is a start x and x size of the
381 // run of interior pixels.
382 for (s_it.mark_cycle_pt(); !s_it.cycled_list(); s_it.forward()) {
383 int start = s_it.data()->x();
384 int xext = s_it.data()->y();
385 // Copy the run from the source image to the block image.
386 pixRasterop(pix, start - box.left(),
387 box.height() - 1 - (y - box.bottom()),
388 xext, 1, PIX_SRC, tesseract_->pix_binary(),
389 start, im_height - 1 - y);
390 }
391 }
392 delete segments;
393 }
394 delete lines;
395 pixaAddPix(*pixa, pix, L_INSERT);
396 }
397 } else {
398 if (!block_list_->singleton())
399 continue; // A null poly block can only be used if it is the only block.
400 box = block->bounding_box();
401 if (pixa != NULL) {
402 Pix* pix = pixCreate(box.width(), box.height(), 1);
403 // Just copy the whole block as there is only a bounding box.
404 pixRasterop(pix, 0, 0, box.width(), box.height(),
405 PIX_SRC, tesseract_->pix_binary(),
406 box.left(), im_height - box.top());
407 pixaAddPix(*pixa, pix, L_INSERT);
408 }
409 }
410 Box* lbox = boxCreate(box.left(), im_height - box.top(),
411 box.width(), box.height());
412 boxaAddBox(boxa, lbox, L_INSERT);
413 }
414 return boxa;
415 #else
416 return NULL;
417 #endif
418 }
419
420 // Get the textlines as a leptonica-style
421 // Boxa, Pixa pair, in reading order.
422 // Can be called before or after Recognize.
423 // If blockids is not NULL, the block-id of each line is also returned as an
424 // array of one element per line. delete [] after use.
GetTextlines(Pixa ** pixa,int ** blockids)425 Boxa* TessBaseAPI::GetTextlines(Pixa** pixa, int** blockids) {
426 #ifdef HAVE_LIBLEPT
427 if (block_list_ == NULL || block_list_->empty()) {
428 FindLines();
429 }
430 // A local PAGE_RES prevents the clear if Recognize is called after.
431 PAGE_RES page_res(block_list_);
432 PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res);
433 // Count the lines to get a size for the arrays.
434 int line_count = 0;
435 for (page_res_it.restart_page(); page_res_it.word() != NULL;
436 page_res_it.forward()) {
437 if (page_res_it.row() != page_res_it.next_row()) {
438 ++line_count;
439 }
440 }
441
442 int im_height = pixGetHeight(tesseract_->pix_binary());
443 Boxa* boxa = boxaCreate(line_count);
444 if (pixa != NULL)
445 *pixa = pixaCreate(line_count);
446 if (blockids != NULL)
447 *blockids = new int[line_count];
448 int blockid = 0;
449 int lineindex = 0;
450 for (page_res_it.restart_page(); page_res_it.word() != NULL;
451 page_res_it.forward(), ++lineindex) {
452 WERD_RES *word = page_res_it.word();
453 BLOCK* block = page_res_it.block()->block;
454 // Get the line bounding box.
455 PAGE_RES_IT word_it(page_res_it); // Save start of line.
456 TBOX line_box = word->word->bounding_box();
457 while (page_res_it.next_row() == page_res_it.row()) {
458 page_res_it.forward();
459 word = page_res_it.word();
460 TBOX word_box = word->word->bounding_box();
461 word_box.rotate(block->re_rotation());
462 line_box += word_box;
463 }
464 Box* lbox = boxCreate(line_box.left(), im_height - line_box.top(),
465 line_box.width(), line_box.height());
466 boxaAddBox(boxa, lbox, L_INSERT);
467 if (pixa != NULL) {
468 Pix* pix = pixCreate(line_box.width(), line_box.height(), 1);
469 // Copy all the words to the output pix.
470 while (word_it.row() == page_res_it.row()) {
471 word = word_it.word();
472 TBOX word_box = word->word->bounding_box();
473 word_box.rotate(block->re_rotation());
474 pixRasterop(pix, word_box.left() - line_box.left(),
475 line_box.top() - word_box.top(),
476 word_box.width(), word_box.height(),
477 PIX_SRC, tesseract_->pix_binary(),
478 word_box.left(), im_height - word_box.top());
479 word_it.forward();
480 }
481 pixaAddPix(*pixa, pix, L_INSERT);
482 pixaAddBox(*pixa, lbox, L_CLONE);
483 }
484 if (blockids != NULL) {
485 (*blockids)[lineindex] = blockid;
486 if (page_res_it.block() != page_res_it.next_block())
487 ++blockid;
488 }
489 }
490 return boxa;
491 #else
492 return NULL;
493 #endif
494 }
495
496 // Get the words as a leptonica-style
497 // Boxa, Pixa pair, in reading order.
498 // Can be called before or after Recognize.
GetWords(Pixa ** pixa)499 Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
500 #ifdef HAVE_LIBLEPT
501 if (block_list_ == NULL || block_list_->empty()) {
502 FindLines();
503 }
504 // A local PAGE_RES prevents the clear if Recognize is called after.
505 PAGE_RES page_res(block_list_);
506 PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res);
507 // Count the words to get a size for the arrays.
508 int word_count = 0;
509 for (page_res_it.restart_page(); page_res_it.word () != NULL;
510 page_res_it.forward())
511 ++word_count;
512
513 int im_height = pixGetHeight(tesseract_->pix_binary());
514 Boxa* boxa = boxaCreate(word_count);
515 if (pixa != NULL) {
516 *pixa = pixaCreate(word_count);
517 }
518 for (page_res_it.restart_page(); page_res_it.word () != NULL;
519 page_res_it.forward()) {
520 WERD_RES *word = page_res_it.word();
521 BLOCK* block = page_res_it.block()->block;
522 TBOX box = word->word->bounding_box();
523 box.rotate(block->re_rotation());
524 Box* lbox = boxCreate(box.left(), im_height - box.top(),
525 box.width(), box.height());
526 boxaAddBox(boxa, lbox, L_INSERT);
527 if (pixa != NULL) {
528 Pix* pix = pixCreate(box.width(), box.height(), 1);
529 // Copy the whole word bounding box to the output pix.
530 pixRasterop(pix, 0, 0, box.width(), box.height(),
531 PIX_SRC, tesseract_->pix_binary(),
532 box.left(), im_height - box.top());
533 pixaAddPix(*pixa, pix, L_INSERT);
534 pixaAddBox(*pixa, lbox, L_CLONE);
535 }
536 }
537 return boxa;
538 #else
539 return NULL;
540 #endif // HAVE_LIBLEPT
541 }
542
543 // Dump the internal binary image to a PGM file.
DumpPGM(const char * filename)544 void TessBaseAPI::DumpPGM(const char* filename) {
545 if (tesseract_ == NULL)
546 return;
547 IMAGELINE line;
548 line.init(page_image.get_xsize());
549 FILE *fp = fopen(filename, "w");
550 fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n",
551 page_image.get_xsize(), page_image.get_ysize());
552 for (int j = page_image.get_ysize()-1; j >= 0 ; --j) {
553 page_image.get_line(0, j, page_image.get_xsize(), &line, 0);
554 for (int i = 0; i < page_image.get_xsize(); ++i) {
555 uinT8 b = line.pixels[i] ? 255 : 0;
556 fwrite(&b, 1, 1, fp);
557 }
558 }
559 fclose(fp);
560 }
561
562 // Recognize the tesseract global image and return the result as Tesseract
563 // internal structures.
Recognize(struct ETEXT_STRUCT * monitor)564 int TessBaseAPI::Recognize(struct ETEXT_STRUCT* monitor) {
565 if (tesseract_ == NULL)
566 return -1;
567 if (thresholder_ == NULL || thresholder_->IsEmpty()) {
568 tprintf("Please call SetImage before attempting recognition.");
569 return -1;
570 }
571 if (page_res_ != NULL)
572 ClearResults();
573 if (FindLines() != 0)
574 return -1;
575 if (tesseract_->tessedit_resegment_from_boxes)
576 tesseract_->apply_boxes(*input_file_, block_list_);
577 tesseract_->SetBlackAndWhitelist();
578
579 page_res_ = new PAGE_RES(block_list_);
580 int result = 0;
581 if (interactive_mode) {
582 #ifndef GRAPHICS_DISABLED
583 tesseract_->pgeditor_main(block_list_);
584 #endif
585 // The page_res is invalid after an interactive session, so cleanup
586 // in a way that lets us continue to the next page without crashing.
587 delete page_res_;
588 page_res_ = NULL;
589 return -1;
590 } else if (tesseract_->tessedit_train_from_boxes) {
591 apply_box_training(*output_file_, block_list_);
592 } else if (tesseract_->global_tessedit_ambigs_training) {
593 FILE *ambigs_output_file = tesseract_->init_ambigs_training(*input_file_);
594 // OCR the page segmented into words by tesseract.
595 tesseract_->ambigs_training_segmented(
596 *input_file_, page_res_, monitor, ambigs_output_file);
597 fclose(ambigs_output_file);
598 } else {
599 // Now run the main recognition.
600 // Running base tesseract if the inttemp for the current language loaded.
601 if (tesseract_->inttemp_loaded_) {
602 tesseract_->recog_all_words(page_res_, monitor);
603 }
604 }
605 return result;
606 }
607
608 // Tests the chopper by exhaustively running chop_one_blob.
RecognizeForChopTest(struct ETEXT_STRUCT * monitor)609 int TessBaseAPI::RecognizeForChopTest(struct ETEXT_STRUCT* monitor) {
610 if (tesseract_ == NULL)
611 return -1;
612 if (thresholder_ == NULL || thresholder_->IsEmpty()) {
613 tprintf("Please call SetImage before attempting recognition.");
614 return -1;
615 }
616 if (page_res_ != NULL)
617 ClearResults();
618 if (FindLines() != 0)
619 return -1;
620 // Additional conditions under which chopper test cannot be run
621 if (tesseract_->tessedit_train_from_boxes_word_level || interactive_mode)
622 return -1;
623 ASSERT_HOST(tesseract_->inttemp_loaded_);
624
625 page_res_ = new PAGE_RES(block_list_);
626
627 PAGE_RES_IT page_res_it(page_res_);
628
629 tesseract_->tess_matcher = &Tesseract::tess_default_matcher;
630 tesseract_->tess_tester = NULL;
631 tesseract_->tess_trainer = NULL;
632
633 while (page_res_it.word() != NULL) {
634 WERD_RES *word_res = page_res_it.word();
635 WERD *word = word_res->word;
636 if (word->cblob_list()->empty()) {
637 page_res_it.forward();
638 continue;
639 }
640 WERD *bln_word = make_bln_copy(word, page_res_it.row()->row,
641 page_res_it.block()->block,
642 word_res->x_height, &word_res->denorm);
643 ASSERT_HOST(!bln_word->blob_list()->empty());
644 TWERD *tessword = make_tess_word(bln_word, NULL);
645 if (tessword->blobs == NULL) {
646 make_tess_word(bln_word, NULL);
647 }
648 TBLOB *pblob;
649 TBLOB *blob;
650 init_match_table();
651 BLOB_CHOICE_LIST *match_result;
652 BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
653 tesseract_->tess_denorm = &word_res->denorm;
654 tesseract_->tess_word = bln_word;
655 ASSERT_HOST(tessword->blobs != NULL);
656 for (blob = tessword->blobs, pblob = NULL;
657 blob != NULL; blob = blob->next) {
658 match_result = tesseract_->classify_blob(pblob, blob, blob->next, NULL,
659 "chop_word:", Green);
660 if (match_result == NULL)
661 tprintf("Null classifier output!\n");
662 tesseract_->modify_blob_choice(match_result, 0);
663 ASSERT_HOST(!match_result->empty());
664 *char_choices += match_result;
665 pblob = blob;
666 }
667 inT32 blob_number;
668 SEAMS seam_list = start_seam_list(tessword->blobs);
669 int right_chop_index = 0;
670 while (tesseract_->chop_one_blob(tessword, char_choices,
671 &blob_number, &seam_list,
672 &right_chop_index)) {
673 }
674
675 word_res->best_choice = new WERD_CHOICE();
676 word_res->raw_choice = new WERD_CHOICE();
677 word_res->best_choice->make_bad();
678 word_res->raw_choice->make_bad();
679 tesseract_->getDict().permute_characters(*char_choices, 1000.0,
680 word_res->best_choice,
681 word_res->raw_choice);
682
683 word_res->outword = make_ed_word(tessword, bln_word);
684 page_res_it.forward();
685 }
686 return 0;
687 }
688
689 // Make a text string from the internal data structures.
GetUTF8Text()690 char* TessBaseAPI::GetUTF8Text() {
691 if (tesseract_ == NULL ||
692 (page_res_ == NULL && Recognize(NULL) < 0))
693 return NULL;
694 int total_length = TextLength(NULL);
695 PAGE_RES_IT page_res_it(page_res_);
696 char* result = new char[total_length];
697 char* ptr = result;
698 for (page_res_it.restart_page(); page_res_it.word () != NULL;
699 page_res_it.forward()) {
700 WERD_RES *word = page_res_it.word();
701 WERD_CHOICE* choice = word->best_choice;
702 if (choice != NULL) {
703 strcpy(ptr, choice->unichar_string().string());
704 ptr += choice->unichar_string().length();
705 if (word->word->flag(W_EOL))
706 *ptr++ = '\n';
707 else
708 *ptr++ = ' ';
709 }
710 }
711 *ptr++ = '\n';
712 *ptr = '\0';
713 return result;
714 }
715
ConvertWordToBoxText(WERD_RES * word,ROW_RES * row,int left,int bottom,char * word_str)716 static int ConvertWordToBoxText(WERD_RES *word,
717 ROW_RES* row,
718 int left,
719 int bottom,
720 char* word_str) {
721 // Copy the output word and denormalize it back to image coords.
722 WERD copy_outword;
723 copy_outword = *(word->outword);
724 copy_outword.baseline_denormalise(&word->denorm);
725 PBLOB_IT blob_it;
726 blob_it.set_to_list(copy_outword.blob_list());
727 int length = copy_outword.blob_list()->length();
728 int output_size = 0;
729
730 if (length > 0) {
731 for (int index = 0, offset = 0; index < length;
732 offset += word->best_choice->unichar_lengths()[index++],
733 blob_it.forward()) {
734 PBLOB* blob = blob_it.data();
735 TBOX blob_box = blob->bounding_box();
736 if (word->tess_failed ||
737 blob_box.left() < 0 ||
738 blob_box.right() > page_image.get_xsize() ||
739 blob_box.bottom() < 0 ||
740 blob_box.top() > page_image.get_ysize()) {
741 // Bounding boxes can be illegal when tess fails on a word.
742 blob_box = word->word->bounding_box(); // Use original word as backup.
743 tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
744 blob_box.left(), blob_box.bottom(),
745 blob_box.right(), blob_box.top());
746 }
747
748 // A single classification unit can be composed of several UTF-8
749 // characters. Append each of them to the result.
750 for (int sub = 0;
751 sub < word->best_choice->unichar_lengths()[index]; ++sub) {
752 char ch = word->best_choice->unichar_string()[offset + sub];
753 // Tesseract uses space for recognition failure. Fix to a reject
754 // character, kTesseractReject so we don't create illegal box files.
755 if (ch == ' ')
756 ch = kTesseractReject;
757 word_str[output_size++] = ch;
758 }
759 sprintf(word_str + output_size, " %d %d %d %d\n",
760 blob_box.left() + left, blob_box.bottom() + bottom,
761 blob_box.right() + left, blob_box.top() + bottom);
762 output_size += strlen(word_str + output_size);
763 }
764 }
765 return output_size;
766 }
767
768 // Multiplier for max expected textlength assumes typically 4 numbers @
769 // (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the
770 // orginal UTF8 characters, and one kMaxCharsPerChar.
771 const int kCharsPerChar = 25;
772 // A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a
773 // space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR.
774 // Test against this on each iteration for safety.
775 const int kMaxCharsPerChar = 85 + UNICHAR_LEN;
776
777 // The recognized text is returned as a char* which is coded
778 // as a UTF8 box file and must be freed with the delete [] operator.
GetBoxText()779 char* TessBaseAPI::GetBoxText() {
780 int bottom = image_height_ - (rect_top_ + rect_height_);
781 if (tesseract_ == NULL ||
782 (page_res_ == NULL && Recognize(NULL) < 0))
783 return NULL;
784 int blob_count;
785 int utf8_length = TextLength(&blob_count);
786 int total_length = blob_count*kCharsPerChar + utf8_length + kMaxCharsPerChar;
787 PAGE_RES_IT page_res_it(page_res_);
788 char* result = new char[total_length];
789 char* ptr = result;
790 for (page_res_it.restart_page(); page_res_it.word () != NULL;
791 page_res_it.forward()) {
792 WERD_RES *word = page_res_it.word();
793 ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
794 ptr);
795 // Just in case...
796 if (ptr - result + kMaxCharsPerChar > total_length)
797 break;
798 }
799 *ptr = '\0';
800 return result;
801 }
802
// Conversion table for non-latin characters: the unicode values listed
// here are mapped into the latin set, entry by entry, via kLatinChs.
// Both tables end with a 0 entry.
// TODO(rays) incorporate this translation into unicharset.
const int kUniChs[] = {
  0x20ac,  // euro sign
  0x201c,  // left double quotation mark
  0x201d,  // right double quotation mark
  0x2018,  // left single quotation mark
  0x2019,  // right single quotation mark
  0x2022,  // bullet
  0x2014,  // em dash
  0
};
// Latin replacements, in the same order as kUniChs above.
const int kLatinChs[] = {
  0x00a2,  // cent sign
  0x0022,  // quotation mark
  0x0022,  // quotation mark
  0x0027,  // apostrophe
  0x0027,  // apostrophe
  0x00b7,  // middle dot
  0x002d,  // hyphen-minus
  0
};
813
814 // The recognized text is returned as a char* which is coded
815 // as UNLV format Latin-1 with specific reject and suspect codes
816 // and must be freed with the delete [] operator.
GetUNLVText()817 char* TessBaseAPI::GetUNLVText() {
818 if (tesseract_ == NULL ||
819 (page_res_ == NULL && Recognize(NULL) < 0))
820 return NULL;
821 bool tilde_crunch_written = false;
822 bool last_char_was_newline = true;
823 bool last_char_was_tilde = false;
824
825 int total_length = TextLength(NULL);
826 PAGE_RES_IT page_res_it(page_res_);
827 char* result = new char[total_length];
828 char* ptr = result;
829 for (page_res_it.restart_page(); page_res_it.word () != NULL;
830 page_res_it.forward()) {
831 WERD_RES *word = page_res_it.word();
832 // Process the current word.
833 if (word->unlv_crunch_mode != CR_NONE) {
834 if (word->unlv_crunch_mode != CR_DELETE &&
835 (!tilde_crunch_written ||
836 (word->unlv_crunch_mode == CR_KEEP_SPACE &&
837 word->word->space() > 0 &&
838 !word->word->flag(W_FUZZY_NON) &&
839 !word->word->flag(W_FUZZY_SP)))) {
840 if (!word->word->flag(W_BOL) &&
841 word->word->space() > 0 &&
842 !word->word->flag(W_FUZZY_NON) &&
843 !word->word->flag(W_FUZZY_SP)) {
844 /* Write a space to separate from preceeding good text */
845 *ptr++ = ' ';
846 last_char_was_tilde = false;
847 }
848 if (!last_char_was_tilde) {
849 // Write a reject char.
850 last_char_was_tilde = true;
851 *ptr++ = kUNLVReject;
852 tilde_crunch_written = true;
853 last_char_was_newline = false;
854 }
855 }
856 } else {
857 // NORMAL PROCESSING of non tilde crunched words.
858 tilde_crunch_written = false;
859
860 if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
861 ensure_rep_chars_are_consistent(word);
862
863 tesseract_->set_unlv_suspects(word);
864 const char* wordstr = word->best_choice->unichar_string().string();
865 const STRING& lengths = word->best_choice->unichar_lengths();
866 int length = lengths.length();
867 int i = 0;
868 int offset = 0;
869
870 if (last_char_was_tilde &&
871 word->word->space() == 0 && wordstr[offset] == ' ') {
872 // Prevent adjacent tilde across words - we know that adjacent tildes
873 // within words have been removed.
874 // Skip the first character.
875 offset = lengths[i++];
876 }
877 if (i < length && wordstr[offset] != 0) {
878 if (!last_char_was_newline)
879 *ptr++ = ' ';
880 else
881 last_char_was_newline = false;
882 for (; i < length; offset += lengths[i++]) {
883 if (wordstr[offset] == ' ' ||
884 wordstr[offset] == kTesseractReject) {
885 *ptr++ = kUNLVReject;
886 last_char_was_tilde = true;
887 } else {
888 if (word->reject_map[i].rejected())
889 *ptr++ = kUNLVSuspect;
890 UNICHAR ch(wordstr + offset, lengths[i]);
891 int uni_ch = ch.first_uni();
892 for (int j = 0; kUniChs[j] != 0; ++j) {
893 if (kUniChs[j] == uni_ch) {
894 uni_ch = kLatinChs[j];
895 break;
896 }
897 }
898 if (uni_ch <= 0xff) {
899 *ptr++ = static_cast<char>(uni_ch);
900 last_char_was_tilde = false;
901 } else {
902 *ptr++ = kUNLVReject;
903 last_char_was_tilde = true;
904 }
905 }
906 }
907 }
908 }
909 if (word->word->flag(W_EOL) && !last_char_was_newline) {
910 /* Add a new line output */
911 *ptr++ = '\n';
912 tilde_crunch_written = false;
913 last_char_was_newline = true;
914 last_char_was_tilde = false;
915 }
916 }
917 *ptr++ = '\n';
918 *ptr = '\0';
919 return result;
920 }
921
922 // Returns the average word confidence for Tesseract page result.
MeanTextConf()923 int TessBaseAPI::MeanTextConf() {
924 int* conf = AllWordConfidences();
925 if (!conf) return 0;
926 int sum = 0;
927 int *pt = conf;
928 while (*pt >= 0) sum += *pt++;
929 if (pt != conf) sum /= pt - conf;
930 delete [] conf;
931 return sum;
932 }
933
934 // Returns an array of all word confidences, terminated by -1.
AllWordConfidences()935 int* TessBaseAPI::AllWordConfidences() {
936 if (tesseract_ == NULL ||
937 (page_res_ == NULL && Recognize(NULL) < 0))
938 return NULL;
939 int n_word = 0;
940 PAGE_RES_IT res_it(page_res_);
941 for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
942 n_word++;
943
944 int* conf = new int[n_word+1];
945 n_word = 0;
946 for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
947 WERD_RES *word = res_it.word();
948 WERD_CHOICE* choice = word->best_choice;
949 int w_conf = static_cast<int>(100 + 5 * choice->certainty());
950 // This is the eq for converting Tesseract confidence to 1..100
951 if (w_conf < 0) w_conf = 0;
952 if (w_conf > 100) w_conf = 100;
953 conf[n_word++] = w_conf;
954 }
955 conf[n_word] = -1;
956 return conf;
957 }
958
959 // Free up recognition results and any stored image data, without actually
960 // freeing any recognition data that would be time-consuming to reload.
961 // Afterwards, you must call SetImage or TesseractRect before doing
962 // any Recognize or Get* operation.
Clear()963 void TessBaseAPI::Clear() {
964 if (thresholder_ != NULL)
965 thresholder_->Clear();
966 ClearResults();
967 page_image.destroy();
968 }
969
970 // Close down tesseract and free up all memory. End() is equivalent to
971 // destructing and reconstructing your TessBaseAPI.
972 // Once End() has been used, none of the other API functions may be used
973 // other than Init and anything declared above it in the class definition.
End()974 void TessBaseAPI::End() {
975 if (thresholder_ != NULL) {
976 delete thresholder_;
977 thresholder_ = NULL;
978 }
979 if (page_res_ != NULL) {
980 delete page_res_;
981 page_res_ = NULL;
982 }
983 if (block_list_ != NULL) {
984 delete block_list_;
985 block_list_ = NULL;
986 }
987 if (tesseract_ != NULL) {
988 tesseract_->end_tesseract();
989 delete tesseract_;
990 tesseract_ = NULL;
991 }
992 if (input_file_ != NULL) {
993 delete input_file_;
994 input_file_ = NULL;
995 }
996 if (output_file_ != NULL) {
997 delete output_file_;
998 output_file_ = NULL;
999 }
1000 if (datapath_ != NULL) {
1001 delete datapath_;
1002 datapath_ = NULL;
1003 }
1004 if (language_ != NULL) {
1005 delete language_;
1006 language_ = NULL;
1007 }
1008 }
1009
1010 // Check whether a word is valid according to Tesseract's language model
1011 // returns 0 if the word is invalid, non-zero if valid
IsValidWord(const char * word)1012 int TessBaseAPI::IsValidWord(const char *word) {
1013 return tesseract_->getDict().valid_word(word);
1014 }
1015
1016
GetTextDirection(int * out_offset,float * out_slope)1017 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
1018 if (page_res_ == NULL)
1019 FindLines();
1020 if (block_list_->length() < 1) {
1021 return false;
1022 }
1023
1024 // Get first block
1025 BLOCK_IT block_it(block_list_);
1026 block_it.move_to_first();
1027 ROW_LIST* rows = block_it.data()->row_list();
1028 if (rows->length() != 1) {
1029 return false;
1030 }
1031
1032 // Get first line of block
1033 ROW_IT row_it(rows);
1034 row_it.move_to_first();
1035 ROW* row = row_it.data();
1036
1037 // Calculate offset and slope (NOTE: Kind of ugly)
1038 *out_offset = static_cast<int>(row->base_line(0.0));
1039 *out_slope = row->base_line(1.0) - row->base_line(0.0);
1040
1041 return true;
1042 }
1043
1044 // Set the letter_is_okay function to point somewhere else.
SetDictFunc(DictFunc f)1045 void TessBaseAPI::SetDictFunc(DictFunc f) {
1046 if (tesseract_ != NULL) {
1047 tesseract_->getDict().letter_is_okay_ = f;
1048 }
1049 }
1050
1051 // Common code for setting the image.
InternalSetImage()1052 bool TessBaseAPI::InternalSetImage() {
1053 if (tesseract_ == NULL) {
1054 tprintf("Please call Init before attempting to send an image.");
1055 return false;
1056 }
1057 if (thresholder_ == NULL)
1058 thresholder_ = new ImageThresholder;
1059 ClearResults();
1060 return true;
1061 }
1062
1063 // Run the thresholder to make the thresholded image. If pix is not NULL,
1064 // the source is thresholded to pix instead of the internal IMAGE.
Threshold(Pix ** pix)1065 void TessBaseAPI::Threshold(Pix** pix) {
1066 #ifdef HAVE_LIBLEPT
1067 if (pix != NULL)
1068 thresholder_->ThresholdToPix(pix);
1069 else
1070 thresholder_->ThresholdToIMAGE(&page_image);
1071 #else
1072 thresholder_->ThresholdToIMAGE(&page_image);
1073 #endif
1074 thresholder_->GetImageSizes(&rect_left_, &rect_top_,
1075 &rect_width_, &rect_height_,
1076 &image_width_, &image_height_);
1077 threshold_done_ = true;
1078 }
1079
1080 // Find lines from the image making the BLOCK_LIST.
FindLines()1081 int TessBaseAPI::FindLines() {
1082 if (!block_list_->empty()) {
1083 return 0;
1084 }
1085 if (tesseract_ == NULL) {
1086 tesseract_ = new Tesseract;
1087 tesseract_->InitAdaptiveClassifier();
1088 }
1089 #ifdef HAVE_LIBLEPT
1090 if (tesseract_->pix_binary() == NULL)
1091 Threshold(tesseract_->mutable_pix_binary());
1092 #endif
1093 if (!threshold_done_)
1094 Threshold(NULL);
1095
1096 if (tesseract_->SegmentPage(input_file_, &page_image, block_list_) < 0)
1097 return -1;
1098 ASSERT_HOST(page_image.get_xsize() == rect_width_ ||
1099 page_image.get_xsize() == rect_width_ - 1);
1100 ASSERT_HOST(page_image.get_ysize() == rect_height_ ||
1101 page_image.get_ysize() == rect_height_ - 1);
1102 return 0;
1103 }
1104
1105 // Delete the pageres and clear the block list ready for a new page.
ClearResults()1106 void TessBaseAPI::ClearResults() {
1107 threshold_done_ = false;
1108 if (tesseract_ != NULL)
1109 tesseract_->Clear();
1110 if (page_res_ != NULL) {
1111 delete page_res_;
1112 page_res_ = NULL;
1113 }
1114 if (block_list_ == NULL)
1115 block_list_ = new BLOCK_LIST;
1116 else
1117 block_list_->clear();
1118 }
1119
1120 // Return the length of the output text string, as UTF8, assuming
1121 // one newline per line and one per block, with a terminator,
1122 // and assuming a single character reject marker for each rejected character.
1123 // Also return the number of recognized blobs in blob_count.
TextLength(int * blob_count)1124 int TessBaseAPI::TextLength(int* blob_count) {
1125 if (tesseract_ == NULL || page_res_ == NULL)
1126 return 0;
1127
1128 PAGE_RES_IT page_res_it(page_res_);
1129 int total_length = 2;
1130 int total_blobs = 0;
1131 // Iterate over the data structures to extract the recognition result.
1132 for (page_res_it.restart_page(); page_res_it.word () != NULL;
1133 page_res_it.forward()) {
1134 WERD_RES *word = page_res_it.word();
1135 WERD_CHOICE* choice = word->best_choice;
1136 if (choice != NULL) {
1137 total_blobs += choice->length() + 1;
1138 total_length += choice->unichar_string().length() + 1;
1139 for (int i = 0; i < word->reject_map.length(); ++i) {
1140 if (word->reject_map[i].rejected())
1141 ++total_length;
1142 }
1143 }
1144 }
1145 if (blob_count != NULL)
1146 *blob_count = total_blobs;
1147 return total_length;
1148 }
1149
1150 // Estimates the Orientation And Script of the image.
1151 // Returns true if the image was processed successfully.
DetectOS(OSResults * osr)1152 bool TessBaseAPI::DetectOS(OSResults* osr) {
1153 if (tesseract_ == NULL)
1154 return false;
1155 ClearResults();
1156 Threshold(NULL);
1157 if (input_file_ == NULL)
1158 input_file_ = new STRING(kInputFile);
1159 return orientation_and_script_detection(*input_file_, osr, tesseract_);
1160 }
1161
1162 // ____________________________________________________________________________
1163 // Ocropus add-ons.
1164
1165 // Find lines from the image making the BLOCK_LIST.
FindLinesCreateBlockList()1166 BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
1167 FindLines();
1168 BLOCK_LIST* result = block_list_;
1169 block_list_ = NULL;
1170 return result;
1171 }
1172
1173 // Delete a block list.
1174 // This is to keep BLOCK_LIST pointer opaque
1175 // and let go of including the other headers.
DeleteBlockList(BLOCK_LIST * block_list)1176 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
1177 delete block_list;
1178 }
1179
1180
make_tess_ocrrow(float baseline,float xheight,float descender,float ascender)1181 static ROW *make_tess_ocrrow(float baseline,
1182 float xheight,
1183 float descender,
1184 float ascender) {
1185 inT32 xstarts[] = {-32000};
1186 double quad_coeffs[] = {0, 0, baseline};
1187 return new ROW(1,
1188 xstarts,
1189 quad_coeffs,
1190 xheight,
1191 ascender - (baseline + xheight),
1192 descender - baseline,
1193 0,
1194 0);
1195 }
1196
1197 // Almost a copy of make_tess_row() from ccmain/tstruct.cpp.
fill_dummy_row(float baseline,float xheight,float descender,float ascender,TEXTROW * tessrow)1198 static void fill_dummy_row(float baseline, float xheight,
1199 float descender, float ascender,
1200 TEXTROW* tessrow) {
1201 tessrow->baseline.segments = 1;
1202 tessrow->baseline.xstarts[0] = -32767;
1203 tessrow->baseline.xstarts[1] = 32767;
1204 tessrow->baseline.quads[0].a = 0;
1205 tessrow->baseline.quads[0].b = 0;
1206 tessrow->baseline.quads[0].c = bln_baseline_offset;
1207 tessrow->xheight.segments = 1;
1208 tessrow->xheight.xstarts[0] = -32767;
1209 tessrow->xheight.xstarts[1] = 32767;
1210 tessrow->xheight.quads[0].a = 0;
1211 tessrow->xheight.quads[0].b = 0;
1212 tessrow->xheight.quads[0].c = bln_baseline_offset + bln_x_height;
1213 tessrow->lineheight = bln_x_height;
1214 tessrow->ascrise = bln_x_height * (ascender - (xheight + baseline)) / xheight;
1215 tessrow->descdrop = bln_x_height * (descender - baseline) / xheight;
1216 }
1217
1218
1219 // Return a TBLOB * from the whole page_image.
1220 // To be freed later with free_blob().
make_tesseract_blob(float baseline,float xheight,float descender,float ascender)1221 TBLOB *make_tesseract_blob(float baseline, float xheight,
1222 float descender, float ascender) {
1223 BLOCK *block = new BLOCK("a character",
1224 TRUE,
1225 0, 0,
1226 0, 0,
1227 page_image.get_xsize(),
1228 page_image.get_ysize());
1229
1230 // Create C_BLOBs from the page
1231 extract_edges(
1232 #ifndef GRAPHICS_DISABLED
1233 NULL,
1234 #endif
1235 &page_image, &page_image,
1236 ICOORD(page_image.get_xsize(), page_image.get_ysize()),
1237 block);
1238
1239 // Create one PBLOB from all C_BLOBs
1240 C_BLOB_LIST *list = block->blob_list();
1241 C_BLOB_IT c_blob_it(list);
1242 PBLOB *pblob = new PBLOB; // will be (hopefully) deleted by the pblob_list
1243 for (c_blob_it.mark_cycle_pt();
1244 !c_blob_it.cycled_list();
1245 c_blob_it.forward()) {
1246 C_BLOB *c_blob = c_blob_it.data();
1247 PBLOB c_as_p(c_blob, baseline + xheight);
1248 merge_blobs(pblob, &c_as_p);
1249 }
1250 PBLOB_LIST *pblob_list = new PBLOB_LIST; // will be deleted by the word
1251 PBLOB_IT pblob_it(pblob_list);
1252 pblob_it.add_after_then_move(pblob);
1253
1254 // Normalize PBLOB
1255 WERD word(pblob_list, 0, " ");
1256 ROW *row = make_tess_ocrrow(baseline, xheight, descender, ascender);
1257 word.baseline_normalise(row);
1258 delete row;
1259
1260 // Create a TBLOB from PBLOB
1261 return make_tess_blob(pblob, /* flatten: */ TRUE);
1262 }
1263
1264
1265 // Adapt to recognize the current image as the given character.
1266 // The image must be preloaded and be just an image of a single character.
AdaptToCharacter(const char * unichar_repr,int length,float baseline,float xheight,float descender,float ascender)1267 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
1268 int length,
1269 float baseline,
1270 float xheight,
1271 float descender,
1272 float ascender) {
1273 UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
1274 LINE_STATS LineStats;
1275 TEXTROW row;
1276 fill_dummy_row(baseline, xheight, descender, ascender, &row);
1277 GetLineStatsFromRow(&row, &LineStats);
1278
1279 TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender);
1280 float threshold;
1281 UNICHAR_ID best_class = 0;
1282 float best_rating = -100;
1283
1284
1285 // Classify to get a raw choice.
1286 BLOB_CHOICE_LIST choices;
1287 tesseract_->AdaptiveClassifier(blob, NULL, &row, &choices, NULL);
1288 BLOB_CHOICE_IT choice_it;
1289 choice_it.set_to_list(&choices);
1290 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1291 choice_it.forward()) {
1292 if (choice_it.data()->rating() > best_rating) {
1293 best_rating = choice_it.data()->rating();
1294 best_class = choice_it.data()->unichar_id();
1295 }
1296 }
1297
1298 if (id == best_class) {
1299 threshold = matcher_good_threshold;
1300 } else {
1301 /* the blob was incorrectly classified - find the rating threshold
1302 needed to create a template which will correct the error with
1303 some margin. However, don't waste time trying to make
1304 templates which are too tight. */
1305 threshold = tesseract_->GetBestRatingFor(blob, &LineStats, id);
1306 threshold *= .9;
1307 const float max_threshold = .125;
1308 const float min_threshold = .02;
1309
1310 if (threshold > max_threshold)
1311 threshold = max_threshold;
1312
1313 // I have cuddled the following line to set it out of the strike
1314 // of the coverage testing tool. I have no idea how to trigger
1315 // this situation nor I have any necessity to do it. --mezhirov
1316 if (threshold < min_threshold) threshold = min_threshold;
1317 }
1318
1319 if (blob->outlines)
1320 tesseract_->AdaptToChar(blob, &LineStats, id, threshold);
1321 free_blob(blob);
1322 }
1323
1324
RecognitionPass1(BLOCK_LIST * block_list)1325 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
1326 PAGE_RES *page_res = new PAGE_RES(block_list);
1327 tesseract_->recog_all_words(page_res, NULL, NULL, 1);
1328 return page_res;
1329 }
1330
RecognitionPass2(BLOCK_LIST * block_list,PAGE_RES * pass1_result)1331 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
1332 PAGE_RES* pass1_result) {
1333 if (!pass1_result)
1334 pass1_result = new PAGE_RES(block_list);
1335 tesseract_->recog_all_words(pass1_result, NULL, NULL, 2);
1336 return pass1_result;
1337 }
1338
1339 struct TESS_CHAR : ELIST_LINK {
1340 char *unicode_repr;
1341 int length; // of unicode_repr
1342 float cost;
1343 TBOX box;
1344
TESS_CHARtesseract::TESS_CHAR1345 TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
1346 length = (len == -1 ? strlen(repr) : len);
1347 unicode_repr = new char[length + 1];
1348 strncpy(unicode_repr, repr, length);
1349 }
1350
TESS_CHARtesseract::TESS_CHAR1351 TESS_CHAR() { // Satisfies ELISTIZE.
1352 }
~TESS_CHARtesseract::TESS_CHAR1353 ~TESS_CHAR() {
1354 delete [] unicode_repr;
1355 }
1356 };
1357
1358 ELISTIZEH(TESS_CHAR)
ELISTIZE(TESS_CHAR)1359 ELISTIZE(TESS_CHAR)
1360
1361 static void add_space(TESS_CHAR_IT* it) {
1362 TESS_CHAR *t = new TESS_CHAR(0, " ");
1363 it->add_after_then_move(t);
1364 }
1365
1366
// Maps a rating to a cost of 100 + 5 * rating, with negative results
// raised to 0.
static float rating_to_cost(float rating) {
  const float cost = 100 + 5 * rating;
  // Ratings worse than -20 would give a negative cost; floor those at 0.
  if (cost < 0)
    return 0;
  return cost;
}
1375
1376
1377 // Extract the OCR results, costs (penalty points for uncertainty),
1378 // and the bounding boxes of the characters.
extract_result(TESS_CHAR_IT * out,PAGE_RES * page_res)1379 static void extract_result(TESS_CHAR_IT* out,
1380 PAGE_RES* page_res) {
1381 PAGE_RES_IT page_res_it(page_res);
1382 int word_count = 0;
1383 while (page_res_it.word() != NULL) {
1384 WERD_RES *word = page_res_it.word();
1385 const char *str = word->best_choice->unichar_string().string();
1386 const char *len = word->best_choice->unichar_lengths().string();
1387
1388 if (word_count)
1389 add_space(out);
1390 TBOX bln_rect;
1391 PBLOB_LIST *blobs = word->outword->blob_list();
1392 PBLOB_IT it(blobs);
1393 int n = strlen(len);
1394 TBOX** boxes_to_fix = new TBOX*[n];
1395 for (int i = 0; i < n; i++) {
1396 PBLOB *blob = it.data();
1397 TBOX current = blob->bounding_box();
1398 bln_rect = bln_rect.bounding_union(current);
1399 TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->certainty()),
1400 str, *len);
1401 tc->box = current;
1402 boxes_to_fix[i] = &tc->box;
1403
1404 out->add_after_then_move(tc);
1405 it.forward();
1406 str += *len;
1407 len++;
1408 }
1409
1410 // Find the word bbox before normalization.
1411 // Here we can't use the C_BLOB bboxes directly,
1412 // since connected letters are not yet cut.
1413 TBOX real_rect = word->word->bounding_box();
1414
1415 // Denormalize boxes by transforming the bbox of the whole bln word
1416 // into the denorm bbox (`real_rect') of the whole word.
1417 double x_stretch = static_cast<double>(real_rect.width())
1418 / bln_rect.width();
1419 double y_stretch = static_cast<double>(real_rect.height())
1420 / bln_rect.height();
1421 for (int j = 0; j < n; j++) {
1422 TBOX *box = boxes_to_fix[j];
1423 int x0 = static_cast<int>(real_rect.left() +
1424 x_stretch * (box->left() - bln_rect.left()) + 0.5);
1425 int x1 = static_cast<int>(real_rect.left() +
1426 x_stretch * (box->right() - bln_rect.left()) + 0.5);
1427 int y0 = static_cast<int>(real_rect.bottom() +
1428 y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5);
1429 int y1 = static_cast<int>(real_rect.bottom() +
1430 y_stretch * (box->top() - bln_rect.bottom()) + 0.5);
1431 *box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1));
1432 }
1433 delete [] boxes_to_fix;
1434
1435 page_res_it.forward();
1436 word_count++;
1437 }
1438 }
1439
1440
1441 // Extract the OCR results, costs (penalty points for uncertainty),
1442 // and the bounding boxes of the characters.
TesseractExtractResult(char ** text,int ** lengths,float ** costs,int ** x0,int ** y0,int ** x1,int ** y1,PAGE_RES * page_res)1443 int TessBaseAPI::TesseractExtractResult(char** text,
1444 int** lengths,
1445 float** costs,
1446 int** x0,
1447 int** y0,
1448 int** x1,
1449 int** y1,
1450 PAGE_RES* page_res) {
1451 TESS_CHAR_LIST tess_chars;
1452 TESS_CHAR_IT tess_chars_it(&tess_chars);
1453 extract_result(&tess_chars_it, page_res);
1454 tess_chars_it.move_to_first();
1455 int n = tess_chars.length();
1456 int text_len = 0;
1457 *lengths = new int[n];
1458 *costs = new float[n];
1459 *x0 = new int[n];
1460 *y0 = new int[n];
1461 *x1 = new int[n];
1462 *y1 = new int[n];
1463 int i = 0;
1464 for (tess_chars_it.mark_cycle_pt();
1465 !tess_chars_it.cycled_list();
1466 tess_chars_it.forward(), i++) {
1467 TESS_CHAR *tc = tess_chars_it.data();
1468 text_len += (*lengths)[i] = tc->length;
1469 (*costs)[i] = tc->cost;
1470 (*x0)[i] = tc->box.left();
1471 (*y0)[i] = tc->box.bottom();
1472 (*x1)[i] = tc->box.right();
1473 (*y1)[i] = tc->box.top();
1474 }
1475 char *p = *text = new char[text_len];
1476
1477 tess_chars_it.move_to_first();
1478 for (tess_chars_it.mark_cycle_pt();
1479 !tess_chars_it.cycled_list();
1480 tess_chars_it.forward()) {
1481 TESS_CHAR *tc = tess_chars_it.data();
1482 strncpy(p, tc->unicode_repr, tc->length);
1483 p += tc->length;
1484 }
1485 return n;
1486 }
1487
1488 // This method returns the features associated with the current image.
1489 // Make sure setimage has been called before calling this method.
GetFeatures(INT_FEATURE_ARRAY int_features,int * num_features)1490 void TessBaseAPI::GetFeatures(INT_FEATURE_ARRAY int_features,
1491 int* num_features) {
1492 if (page_res_ != NULL)
1493 ClearResults();
1494 if (!threshold_done_)
1495 Threshold(NULL);
1496 // We have only one block, which is of the size of the page.
1497 BLOCK_LIST* blocks = new BLOCK_LIST;
1498 BLOCK *block = new BLOCK("", // filename.
1499 TRUE, // proportional.
1500 0, // kerning.
1501 0, // spacing.
1502 0, // Left.
1503 0, // Bottom.
1504 page_image.get_xsize(), // Right.
1505 page_image.get_ysize()); // Top.
1506 ICOORD bleft, tright;
1507 block->bounding_box (bleft, tright);
1508
1509 BLOCK_IT block_it_add = blocks;
1510 block_it_add.add_to_end(block);
1511
1512 ICOORD page_tr(page_image.get_xsize(), page_image.get_ysize());
1513 TEXTROW tessrow;
1514 make_tess_row(NULL, // Denormalizer.
1515 &tessrow); // Output row.
1516 LINE_STATS line_stats;
1517 GetLineStatsFromRow(&tessrow, &line_stats);
1518
1519 // Perform a CC analysis to detect the blobs.
1520 BLOCK_IT block_it = blocks;
1521 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
1522 block_it.forward ()) {
1523 BLOCK* block = block_it.data();
1524 #ifndef GRAPHICS_DISABLED
1525 extract_edges(NULL, // Scrollview window.
1526 &page_image, // Image.
1527 &page_image, // Thresholded image.
1528 page_tr, // corner of page.
1529 block); // block.
1530 #else
1531 extract_edges(&page_image, // Image.
1532 &page_image, // Thresholded image.
1533 page_tr, // corner of page.
1534 block); // block.
1535 #endif
1536 C_BLOB_IT blob_it = block->blob_list();
1537 PBLOB *pblob = new PBLOB;
1538 // Iterate over all blobs found and get their features.
1539 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
1540 blob_it.forward()) {
1541 C_BLOB* blob = blob_it.data();
1542 blob = blob;
1543 PBLOB c_as_p(blob, page_image.get_ysize());
1544 merge_blobs(pblob, &c_as_p);
1545 }
1546
1547 PBLOB_LIST *pblob_list = new PBLOB_LIST;
1548 PBLOB_IT pblob_it(pblob_list);
1549 pblob_it.add_after_then_move(pblob);
1550 WERD word(pblob_list, // Blob list.
1551 0, // Blanks in front.
1552 " "); // Correct text.
1553 ROW *row = make_tess_ocrrow(0, // baseline.
1554 page_image.get_ysize(), // xheight.
1555 0, // ascent.
1556 0); // descent.
1557 word.baseline_normalise(row);
1558 delete row;
1559 if (pblob->out_list () == NULL) {
1560 tprintf("Blob list is empty");
1561 }
1562 TBLOB* tblob = make_tess_blob(pblob, // Blob.
1563 TRUE); // Flatten.
1564
1565 CLASS_NORMALIZATION_ARRAY norm_array;
1566 inT32 len;
1567 *num_features = tesseract_->GetCharNormFeatures(
1568 tblob, &line_stats,
1569 tesseract_->PreTrainedTemplates,
1570 int_features, norm_array, &len);
1571 }
1572 delete blocks;
1573 }
1574
1575 // Return the pointer to the i-th dawg loaded into tesseract_ object.
GetDawg(int i) const1576 const Dawg *TessBaseAPI::GetDawg(int i) const {
1577 if (tesseract_ == NULL || i >= NumDawgs()) return NULL;
1578 return tesseract_->getDict().GetDawg(i);
1579 }
1580
1581 // Return the number of dawgs loaded into tesseract_ object.
NumDawgs() const1582 int TessBaseAPI::NumDawgs() const {
1583 return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
1584 }
1585
1586 // Return the language used in the last valid initialization.
GetLastInitLanguage() const1587 const char* TessBaseAPI::GetLastInitLanguage() const {
1588 return (tesseract_ == NULL || tesseract_->lang.string() == NULL) ?
1589 "" : tesseract_->lang.string();
1590 }
1591 } // namespace tesseract.
1592