1 /******************************************************************
2 * File: output.cpp (Formerly output.c)
3 * Description: Output pass
4 * Author: Phil Cheatle
5 * Created: Thu Aug 4 10:56:08 BST 1994
6 *
7 * (C) Copyright 1994, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #include "mfcpch.h"
21 #include "ocrshell.h"
22 #include <string.h>
23 #include <ctype.h>
24 #ifdef __UNIX__
25 #include <assert.h>
26 #include <unistd.h>
27 #include <errno.h>
28 #endif
29 #include "mainblk.h"
30 #include "tfacep.h"
31 #include "tessvars.h"
32 #include "control.h"
33 #include "secname.h"
34 #include "reject.h"
35 #include "docqual.h"
36 #include "output.h"
37 #include "bestfirst.h"
38 #include "globals.h"
39 #include "tesseractclass.h"
40
// EXTERN is defined empty here, ahead of the parameter declarations that are
// prefixed with it further down this file.
#define EXTERN

#define EPAPER_EXT      ".ep"      //file extension (NOTE(review): not referenced in the code visible here)
#define PAGE_YSIZE      3508       //NOTE(review): units/meaning not stated in this file - confirm before use
// Control characters embedded in the epaper output strings, written as octal
// character escapes.
#define CTRL_INSET      '\024'     //dc4=text inset
#define CTRL_FONT       '\016'     //so=font change
#define CTRL_DEFAULT      '\017'   //si=default font
#define CTRL_SHIFT      '\022'     //dc2=x shift
#define CTRL_TAB        '\011'     //tab
#define CTRL_NEWLINE      '\012'   //newline
#define CTRL_HARDLINE   '\015'     //cr
52
// Run-time parameters used by the output pass. Each declaration gives the
// variable name, its default value and a help string.
// NOTE(review): the *_VAR / *_EVAR macros are defined elsewhere; what
// distinguishes the two families is not visible from this file.

// --- What gets written ---
EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
"Write block separators in output");
EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
"Write raw stuff to name.raw");
EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
"Return ratings in IPEOCRAPI data");
EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
"Write .txt to .etx map file");
EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
"Write repetition char code");
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
// Only the first character of this string is used by the functions below.
EXTERN STRING_EVAR (unrecognised_char, "|",
"Output char for unidentified blobs");

// --- Suspect marking ---
EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
EXTERN INT_VAR (suspect_space_level, 100,
"Min suspect level for rejecting spaces");
EXTERN INT_VAR (suspect_short_words, 2,
"Dont Suspect dict wds longer than this");
EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
"UNLV keep 1Il chars rejected");
EXTERN double_VAR (suspect_rating_per_ch, 999.9,
"Dont touch bad rating limit");
EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");

// --- Rejection overrides ---
EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
"Only reject tess failures");
EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
"Make output have exactly one word per WERD");
EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
"Dont reject ANYTHING AT ALL");
EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
"Force all rep chars the same");
87
// Output streams shared by the functions in this file. Both start out NULL.
FILE *txt_mapfile = NULL;        //reject map (.map), opened and closed by Tesseract::output_pass
FILE *unlv_file = NULL;          //.unlv output; in the code visible here only the disabled write_unlv_text uses it
90
91 /**********************************************************************
92 * pixels_to_pts
93 *
94 * Convert an integer number of pixels to the nearest integer
95 * number of points.
96 **********************************************************************/
97
pixels_to_pts(inT32 pixels,inT32 pix_res)98 inT32 pixels_to_pts( //convert coords
99 inT32 pixels,
100 inT32 pix_res //resolution
101 ) {
102 float pts; //converted value
103
104 pts = pixels * 72.0 / pix_res;
105 return (inT32) (pts + 0.5); //round it
106 }
107
108 namespace tesseract {
output_pass(PAGE_RES_IT & page_res_it,BOOL8 write_to_shm,TBOX * target_word_box)109 void Tesseract::output_pass( //Tess output pass //send to api
110 PAGE_RES_IT &page_res_it,
111 BOOL8 write_to_shm,
112 TBOX *target_word_box) {
113 BLOCK_RES *block_of_last_word;
114 inT16 block_id;
115 BOOL8 force_eol; //During output
116 BLOCK *nextblock; //block of next word
117 WERD *nextword; //next word
118
119 if (tessedit_write_txt_map)
120 txt_mapfile = open_outfile (".map");
121
122 page_res_it.restart_page ();
123 block_of_last_word = NULL;
124 while (page_res_it.word () != NULL) {
125 check_debug_pt (page_res_it.word (), 120);
126
127 if (target_word_box)
128 {
129
130 TBOX current_word_box=page_res_it.word ()->word->bounding_box();
131 FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
132 if (!target_word_box->contains(center_pt))
133 {
134 page_res_it.forward ();
135 continue;
136 }
137
138 }
139 if (tessedit_write_block_separators &&
140 block_of_last_word != page_res_it.block ()) {
141 block_of_last_word = page_res_it.block ();
142 block_id = block_of_last_word->block->index();
143 if (!wordrec_no_block)
144 fprintf (textfile, "|^~tr%d\n", block_id);
145 fprintf (txt_mapfile, "|^~tr%d\n", block_id);
146 }
147
148 force_eol = (tessedit_write_block_separators &&
149 (page_res_it.block () != page_res_it.next_block ())) ||
150 (page_res_it.next_word () == NULL);
151
152 if (page_res_it.next_word () != NULL)
153 nextword = page_res_it.next_word ()->word;
154 else
155 nextword = NULL;
156 if (page_res_it.next_block () != NULL)
157 nextblock = page_res_it.next_block ()->block;
158 else
159 nextblock = NULL;
160 //regardless of tilde crunching
161 write_results (page_res_it, determine_newline_type (page_res_it.word ()->word, page_res_it.block ()->block, nextword, nextblock), force_eol,
162 write_to_shm);
163 page_res_it.forward ();
164 }
165 if (write_to_shm)
166 ocr_send_text(FALSE);
167 if (tessedit_write_block_separators) {
168 if (!wordrec_no_block)
169 fprintf (textfile, "|^~tr\n");
170 fprintf (txt_mapfile, "|^~tr\n");
171 }
172 if (tessedit_write_txt_map) {
173 fprintf (txt_mapfile, "\n"); //because txt gets one
174 #ifdef __UNIX__
175 fsync (fileno (txt_mapfile));
176 #endif
177 fclose(txt_mapfile);
178 }
179 }
180
181
182 /*************************************************************************
183 * write_results()
184 *
185 * All recognition and rejection has now been done. Generate the following:
186 * .txt file - giving the final best choices with NO highlighting
187 * .raw file - giving the tesseract top choice output for each word
188 * .map file - showing how the .txt file has been rejected in the .ep file
189 * epchoice list - a list of one element per word, containing the text for the
190 * epaper. Reject strings are inserted.
191 * inset list - a list of bounding boxes of reject insets - indexed by the
192 * reject strings in the epchoice text.
193 *************************************************************************/
194
write_results(PAGE_RES_IT & page_res_it,char newline_type,BOOL8 force_eol,BOOL8 write_to_shm)195 void Tesseract::write_results( //output a word
196 //full info
197 PAGE_RES_IT &page_res_it,
198 char newline_type, //type of newline
199 //override tilde crunch?
200 BOOL8 force_eol,
201 BOOL8 write_to_shm //send to api
202 ) {
203 //word to do
204 WERD_RES *word = page_res_it.word ();
205 // WERD_CHOICE *ep_choice; //ep format
206 STRING repetition_code;
207 const STRING *wordstr;
208 STRING wordstr_lengths;
209 int i;
210 char unrecognised = STRING (unrecognised_char)[0];
211 char ep_chars[32]; //Only for unlv_tilde_crunch
212 int ep_chars_index = 0;
213 char txt_chs[32]; //Only for unlv_tilde_crunch
214 char map_chs[32]; //Only for unlv_tilde_crunch
215 int txt_index = 0;
216 static BOOL8 tilde_crunch_written = FALSE;
217 static BOOL8 last_char_was_newline = TRUE;
218 static BOOL8 last_char_was_tilde = FALSE;
219 static BOOL8 empty_block = TRUE;
220 BOOL8 need_reject = FALSE;
221 PBLOB_IT blob_it; //blobs
222 UNICHAR_ID space = unicharset.unichar_to_id(" ");
223
224 /* if (word->best_choice->string().length() == 0)
225 {
226 tprintf("No output: to output\n");
227 }
228 else if (word->best_choice->string()[0]==' ')
229 {
230 tprintf("spaceword to output\n");
231 }
232 else if (word->best_choice->string()[0]=='\0')
233 {
234 tprintf("null to output\n");
235 }*/
236 if (word->unlv_crunch_mode != CR_NONE
237 && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
238 if ((word->unlv_crunch_mode != CR_DELETE) &&
239 (!tilde_crunch_written ||
240 ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
241 (word->word->space () > 0) &&
242 !word->word->flag (W_FUZZY_NON) &&
243 !word->word->flag (W_FUZZY_SP)))) {
244 if (!word->word->flag (W_BOL) &&
245 (word->word->space () > 0) &&
246 !word->word->flag (W_FUZZY_NON) &&
247 !word->word->flag (W_FUZZY_SP)) {
248 /* Write a space to separate from preceeding good text */
249 txt_chs[txt_index] = ' ';
250 map_chs[txt_index++] = '1';
251 ep_chars[ep_chars_index++] = ' ';
252 last_char_was_tilde = FALSE;
253 }
254 need_reject = TRUE;
255 }
256 if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {
257 /* Write a reject char - mark as rejected unless zero_rejection mode */
258 last_char_was_tilde = TRUE;
259 txt_chs[txt_index] = unrecognised;
260 if (tessedit_zero_rejection || (suspect_level == 0)) {
261 map_chs[txt_index++] = '1';
262 ep_chars[ep_chars_index++] = unrecognised;
263 }
264 else {
265 map_chs[txt_index++] = '0';
266 /*
267 The ep_choice string is a faked reject to allow newdiff to sync the
268 .etx with the .txt and .map files.
269 */
270 ep_chars[ep_chars_index++] = CTRL_INSET;
271 //escape code
272 //dummy reject
273 ep_chars[ep_chars_index++] = 1;
274 //dummy reject
275 ep_chars[ep_chars_index++] = 1;
276 //type
277 ep_chars[ep_chars_index++] = 2;
278 //dummy reject
279 ep_chars[ep_chars_index++] = 1;
280 //dummy reject
281 ep_chars[ep_chars_index++] = 1;
282 }
283 tilde_crunch_written = TRUE;
284 last_char_was_newline = FALSE;
285 empty_block = FALSE;
286 }
287
288 if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {
289 /* Add a new line output */
290 txt_chs[txt_index] = '\n';
291 map_chs[txt_index++] = '\n';
292 //end line
293 ep_chars[ep_chars_index++] = newline_type;
294
295 //Cos of the real newline
296 tilde_crunch_written = FALSE;
297 last_char_was_newline = TRUE;
298 last_char_was_tilde = FALSE;
299 }
300 txt_chs[txt_index] = '\0';
301 map_chs[txt_index] = '\0';
302 //xiaofan
303 if (tessedit_write_output && !wordrec_no_block)
304 fprintf (textfile, "%s", txt_chs);
305
306 if (tessedit_write_txt_map)
307 fprintf (txt_mapfile, "%s", map_chs);
308
309 //terminate string
310 ep_chars[ep_chars_index] = '\0';
311 word->ep_choice = new WERD_CHOICE(ep_chars, unicharset);
312
313 if (force_eol)
314 empty_block = TRUE;
315 return;
316 }
317
318 /* NORMAL PROCESSING of non tilde crunched words */
319
320 tilde_crunch_written = FALSE;
321 if (newline_type)
322 last_char_was_newline = TRUE;
323 else
324 last_char_was_newline = FALSE;
325 empty_block = force_eol; //About to write a real word
326
327 if (unlv_tilde_crunching &&
328 last_char_was_tilde &&
329 (word->word->space() == 0) &&
330 !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
331 (word->best_choice->unichar_id(0) == space)) {
332 /* Prevent adjacent tilde across words - we know that adjacent tildes within
333 words have been removed */
334 word->best_choice->remove_unichar_id(0);
335 word->best_choice->populate_unichars(getDict().getUnicharset());
336 word->reject_map.remove_pos (0);
337 blob_it = word->outword->blob_list ();
338 delete blob_it.extract (); //get rid of reject blob
339 }
340 if (newline_type ||
341 (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
342 last_char_was_tilde = FALSE;
343 else {
344 if (word->reject_map.length () > 0) {
345 if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
346 last_char_was_tilde = TRUE;
347 else
348 last_char_was_tilde = FALSE;
349 }
350 else if (word->word->space () > 0)
351 last_char_was_tilde = FALSE;
352 /* else it is unchanged as there are no output chars */
353 }
354
355 ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
356
357 if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
358 ensure_rep_chars_are_consistent(word);
359
360 set_unlv_suspects(word);
361 check_debug_pt (word, 120);
362 if (tessedit_rejection_debug) {
363 tprintf ("Dict word: \"%s\": %d\n",
364 word->best_choice->debug_string(unicharset).string(),
365 dict_word(*(word->best_choice)));
366 }
367
368 #if 0
369 if (tessedit_write_unlv) {
370 write_unlv_text(word);
371 }
372 #endif
373
374 if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
375 repetition_code = "|^~R";
376 wordstr_lengths = "\001\001\001\001";
377 repetition_code += unicharset.id_to_unichar(get_rep_char (word));
378 wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
379 wordstr = &repetition_code;
380 }
381 else {
382 if (tessedit_zero_rejection) {
383 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
384 for (i = 0; i < word->best_choice->length(); ++i) {
385 if (word->reject_map[i].rejected())
386 word->reject_map[i].setrej_minimal_rej_accept();
387 }
388 }
389 if (tessedit_minimal_rejection) {
390 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
391 for (i = 0; i < word->best_choice->length(); ++i) {
392 if ((word->best_choice->unichar_id(i) != space) &&
393 word->reject_map[i].rejected())
394 word->reject_map[i].setrej_minimal_rej_accept();
395 }
396 }
397 }
398
399 if (write_to_shm)
400 write_shm_text (word, page_res_it.block ()->block,
401 page_res_it.row (), *wordstr, wordstr_lengths);
402
403 #if 0
404 if (tessedit_write_output)
405 write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
406
407 if (tessedit_write_raw_output)
408 write_cooked_text (word->word, word->raw_choice->string (),
409 TRUE, FALSE, rawfile);
410
411 if (tessedit_write_txt_map)
412 write_map(txt_mapfile, word);
413
414 ep_choice = make_epaper_choice (word, newline_type);
415 word->ep_choice = ep_choice;
416 #endif
417
418 character_count += word->best_choice->length();
419 word_count++;
420 }
421 } // namespace tesseract
422
423 /**********************************************************************
424 * make_epaper_choice
425 *
426 * Construct the epaper text string for a word, using the reject map to
427 * determine whether each blob should be rejected.
428 **********************************************************************/
429
// Disabled: the whole function is compiled out. It is written against a
// WERD_CHOICE that exposes string(), and builds the epaper string for one
// word:
//   - leading blanks (one per word->word->space()) when the word has blobs,
//   - either the repetition code "|^~R<char>" or, blob by blob, the accepted
//     character (an accepted ' ' becomes the unrecognised char) or a reject
//     escape produced by make_reject for each run of rejected blobs,
//   - the newline code, if any.
// Returns a newly allocated WERD_CHOICE holding that string.
#if 0
WERD_CHOICE *make_epaper_choice(                   //convert one word
                                WERD_RES *word,    //word to do
                                char newline_type  //type of newline
                               ) {
  inT16 index = 0;               //to string
  inT16 blobindex;               //to word
  inT16 prevright = 0;           //right of previous blob
  inT16 nextleft;                //left of next blob
  PBLOB *blob;
  TBOX inset_box;                //bounding box
  PBLOB_IT blob_it;              //blob iterator
  char word_string[MAX_PATH];    //converted string; NOTE(review): writes below are not bounds-checked
  BOOL8 force_total_reject;
  char unrecognised = STRING (unrecognised_char)[0];

  blob_it.set_to_list (word->outword->blob_list ());

  ASSERT_HOST (word->reject_map.length () ==
    word->best_choice->string ().length ());
  /*
  tprintf( "\"%s\" -> length: %d; blobcount: %d (%d)\n",
      word->best_choice->string().string(),
        word->best_choice->string().length(),
        blob_it.length(),
      blob_count( word->outword ) );
  */

  // An empty choice means every blob of the word is rejected as one inset.
  if (word->best_choice->string ().length () == 0)
    force_total_reject = TRUE;
  else {
    force_total_reject = FALSE;
    ASSERT_HOST (blob_it.length () ==
      word->best_choice->string ().length ());
  }
  if (!blob_it.empty ()) {
    for (index = 0; index < word->word->space (); index++)
      word_string[index] = ' ';  //leading blanks
  }
  /* Why does this generate leading blanks regardless of whether the
  word_choice string is empty, when write_cooked_text only generates leading
  blanks when the string is NOT empty???. */

  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    strcpy (word_string + index, "|^~R");
    index += 4;
    strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
    index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
  }
  else {
    if (!blob_it.empty ())
      prevright = blob_it.data ()->bounding_box ().left ();
    //actually first left
    for (blobindex = 0, blob_it.mark_cycle_pt ();
    !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
      blob = blob_it.data ();
      if (word->reject_map[blobindex].accepted ()) {
        if (word->best_choice->string ()[blobindex] == ' ')
                                 //but not rejected!!
          word_string[index++] = unrecognised;
        else
          word_string[index++] =
            word->best_choice->string ()[blobindex];
      }
      else {                     // start reject
        inset_box = blob->bounding_box ();
        /* Extend reject box to include rejected neighbours */
        while (!blob_it.at_last () &&
          (force_total_reject ||
        (word->reject_map[blobindex + 1].rejected ()))) {
          blobindex++;
          blob = blob_it.forward ();
                                 //get total box
          inset_box += blob->bounding_box ();
        }
        // The blob following the inset gives the right-hand gap; at the end
        // of the word the inset's own right edge is used instead.
        if (blob_it.at_last ())
          nextleft = inset_box.right ();
        else
          nextleft = blob_it.data_relative (1)->bounding_box ().left ();

        //       tprintf("Making reject from (%d,%d)->(%d,%d)\n",
        //          inset_box.left(),inset_box.bottom(),
        //          inset_box.right(),inset_box.top());

        index += make_reject (&inset_box, prevright, nextleft,
          &word->denorm, &word_string[index]);
      }
      prevright = blob->bounding_box ().right ();
    }
  }
  if (newline_type)
                                 //end line
    word_string[index++] = newline_type;
  word_string[index] = '\0';     //terminate string
  // A mismatch here means a zero byte was written inside the string.
  if (strlen (word_string) != index) {
    tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
      word_string, index, strlen (word_string));
  }
                                 //don't pass any zeros
  ASSERT_HOST (strlen (word_string) == index);
  return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
}
#endif
533
534 /**********************************************************************
535 * make_reject
536 *
537 * Add the escape code to the string for the reject.
538 **********************************************************************/
539
540 inT16
make_reject(TBOX * inset_box,inT16 prevright,inT16 nextleft,DENORM * denorm,char word_string[])541 make_reject ( //make reject code
542 TBOX * inset_box, //bounding box
543 inT16 prevright, //previous char
544 inT16 nextleft, //next char
545 DENORM * denorm, //de-normalizer
546 char word_string[] //output string
547 ) {
548 inT16 index; //to string
549 inT16 xpos; //start of inset
550 inT16 ypos;
551 inT16 width; //size of inset
552 inT16 height;
553 inT16 left_offset; //shift form prev char
554 inT16 right_offset; //shift to next char
555 inT16 baseline_offset; //shift from baseline
556 inT16 inset_index = 0; //number of inset
557 inT16 min_chars; //min width estimate
558 inT16 max_chars; //max width estimate
559 float x_centre; //centre of box
560
561 index = 0;
562 x_centre = (inset_box->left () + inset_box->right ()) / 2.0;
563 left_offset =
564 (inT16) (denorm->x (inset_box->left ()) - denorm->x (prevright));
565 right_offset =
566 (inT16) (denorm->x (nextleft) - denorm->x (inset_box->right ()));
567 xpos = (inT16) floor (denorm->x (inset_box->left ()));
568 width = (inT16) ceil (denorm->x (inset_box->right ())) - xpos;
569 ypos = (inT16) floor (denorm->y (inset_box->bottom (), x_centre));
570 height = (inT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos;
571 baseline_offset = ypos - (inT16) denorm->y (bln_baseline_offset, x_centre);
572 //escape code
573 word_string[index++] = CTRL_INSET;
574 min_chars = (inT16) ceil (0.27 * width / denorm->row ()->x_height ());
575 max_chars = (inT16) floor (1.8 * width / denorm->row ()->x_height ());
576 /*
577 Ensure min_chars and max_chars are in the range 0..254. This ensures that
578 we can add 1 to them to avoid putting \0 in a string, and still not exceed
579 the max value in a byte.
580 */
581 if (min_chars < 0)
582 min_chars = 0;
583 if (min_chars > 254)
584 min_chars = 254;
585 if (max_chars < min_chars)
586 max_chars = min_chars;
587 if (max_chars > 254)
588 max_chars = 254;
589 //min chars
590 word_string[index++] = min_chars + 1;
591 //max chars
592 word_string[index++] = max_chars + 1;
593 word_string[index++] = 2; //type?
594 //store index
595 word_string[index++] = inset_index / 255 + 1;
596 word_string[index++] = inset_index % 255 + 1;
597 return index; //size of string
598 }
599
600
601 /**********************************************************************
602 * determine_newline_type
603 *
604 * Find whether we have a wrapping or hard newline.
605 * Return FALSE if not at end of line.
606 **********************************************************************/
607
determine_newline_type(WERD * word,BLOCK * block,WERD * next_word,BLOCK * next_block)608 char determine_newline_type( //test line ends
609 WERD *word, //word to do
610 BLOCK *block, //current block
611 WERD *next_word, //next word
612 BLOCK *next_block //block of next word
613 ) {
614 inT16 end_gap; //to right edge
615 inT16 width; //of next word
616 TBOX word_box; //bounding
617 TBOX next_box; //next word
618 TBOX block_box; //block bounding
619
620 if (!word->flag (W_EOL))
621 return FALSE; //not end of line
622 if (next_word == NULL || next_block == NULL || block != next_block)
623 return CTRL_NEWLINE;
624 if (next_word->space () > 0)
625 return CTRL_HARDLINE; //it is tabbed
626 word_box = word->bounding_box ();
627 next_box = next_word->bounding_box ();
628 block_box = block->bounding_box ();
629 //gap to eol
630 end_gap = block_box.right () - word_box.right ();
631 end_gap -= (inT32) block->space ();
632 width = next_box.right () - next_box.left ();
633 // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
634 // block_box.right(),word_box.right(),end_gap,
635 // next_box.right(),next_box.left(),width,
636 // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
637 return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
638 }
639
640 /**********************************************************************
641 * write_shm_text
642 *
643 * Write the cooked text to the shared memory for the api.
644 **********************************************************************/
645
write_shm_text(WERD_RES * word,BLOCK * block,ROW_RES * row,const STRING & text,const STRING & text_lengths)646 void write_shm_text( //write output
647 WERD_RES *word, //word to do
648 BLOCK *block, //block it is from
649 ROW_RES *row, //row it is from
650 const STRING &text, //text to write
651 const STRING &text_lengths
652 ) {
653 inT32 index; //char counter
654 inT32 index2; //char counter
655 inT32 length; //chars in word
656 inT32 ptsize; //font size
657 inT8 blanks; //blanks in word
658 uinT8 enhancement; //bold etc
659 uinT8 font; //font index
660 char unrecognised = STRING (unrecognised_char)[0];
661 PBLOB *blob;
662 TBOX blob_box; //bounding box
663 PBLOB_IT blob_it; //blob iterator
664 WERD copy_outword; // copy to denorm
665 uinT32 rating; //of char
666 BOOL8 lineend; //end of line
667 int offset;
668 int offset2;
669
670 //point size
671 ptsize = pixels_to_pts ((inT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
672 if (word->word->flag (W_BOL) && ocr_char_space () < 128
673 && ocr_send_text (TRUE) != OKAY)
674 return; //release failed
675 copy_outword = *(word->outword);
676 copy_outword.baseline_denormalise (&word->denorm);
677 blob_it.set_to_list (copy_outword.blob_list ());
678 length = text_lengths.length ();
679
680 if (length > 0) {
681 blanks = word->word->space ();
682 if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
683 blanks = 1;
684 for (index = 0, offset = 0; index < length;
685 offset += text_lengths[index++], blob_it.forward ()) {
686 blob = blob_it.data ();
687 blob_box = blob->bounding_box ();
688
689 enhancement = 0;
690 if (word->italic > 0 || (word->italic == 0 && row->italic > 0))
691 enhancement |= EUC_ITALIC;
692 if (word->bold > 0 || (word->bold == 0 && row->bold > 0))
693 enhancement |= EUC_BOLD;
694 if (tessedit_write_ratings)
695 rating = (uinT32) (-word->best_choice->certainty () / 0.035);
696 else if (tessedit_zero_rejection)
697 rating = text[offset] == ' ' ? 100 : 0;
698 else
699 rating = word->reject_map[index].accepted ()? 0 : 100;
700 if (rating > 255)
701 rating = 255;
702 if (word->font1_count > 2)
703 font = word->font1;
704 else if (row->font1_count > 8)
705 font = row->font1;
706 else
707 //font index
708 font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
709
710 lineend = word->word->flag (W_EOL) && index == length - 1;
711 if (word->word->flag (W_EOL) && tessedit_zero_rejection
712 && index < length - 1 && text[index + text_lengths[index]] == ' ') {
713 for (index2 = index + 1, offset2 = offset + text_lengths[index];
714 index2 < length && text[offset2] == ' ';
715 offset2 += text_lengths[index2++]);
716 if (index2 == length)
717 lineend = TRUE;
718 }
719
720 if (!tessedit_zero_rejection || text[offset] != ' '
721 || tessedit_word_for_word) {
722 //confidence
723 if (text[offset] == ' ') {
724 ocr_append_char (unrecognised,
725 blob_box.left (), blob_box.right (),
726 page_image.get_ysize () - 1 - blob_box.top (),
727 page_image.get_ysize () - 1 - blob_box.bottom (),
728 font, (uinT8) rating,
729 ptsize, //point size
730 blanks, enhancement, //enhancement
731 OCR_CDIR_LEFT_RIGHT,
732 OCR_LDIR_DOWN_RIGHT,
733 lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
734 } else {
735 for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
736 ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
737 blob_box.left (), blob_box.right (),
738 page_image.get_ysize () - 1 - blob_box.top (),
739 page_image.get_ysize () - 1 - blob_box.bottom (),
740 font, (uinT8) rating,
741 ptsize, //point size
742 blanks, enhancement, //enhancement
743 OCR_CDIR_LEFT_RIGHT,
744 OCR_LDIR_DOWN_RIGHT,
745 lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
746 }
747 blanks = 0;
748 }
749
750 }
751 }
752 else if (tessedit_word_for_word) {
753 blanks = word->word->space ();
754 if (blanks == 0 && !word->word->flag (W_BOL))
755 blanks = 1;
756 blob_box = word->word->bounding_box ();
757
758 enhancement = 0;
759 if (word->italic > 0)
760 enhancement |= EUC_ITALIC;
761 if (word->bold > 0)
762 enhancement |= EUC_BOLD;
763 rating = 100;
764 if (word->font1_count > 2)
765 font = word->font1;
766 else if (row->font1_count > 8)
767 font = row->font1;
768 else
769 //font index
770 font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
771
772 lineend = word->word->flag (W_EOL);
773
774 //font index
775 ocr_append_char (unrecognised,
776 blob_box.left (), blob_box.right (),
777 page_image.get_ysize () - 1 - blob_box.top (),
778 page_image.get_ysize () - 1 - blob_box.bottom (),
779 font,
780 rating, //confidence
781 ptsize, //point size
782 blanks, enhancement, //enhancement
783 OCR_CDIR_LEFT_RIGHT,
784 OCR_LDIR_DOWN_RIGHT,
785 lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
786 }
787 }
788
789
790 /**********************************************************************
791 * write_map
792 *
 * Write a map file of 0's and 1's which associates characters from the .txt
794 * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char
795 * is kept. Note that there may be reject regions in the .etx file WITHOUT
796 * .txt chars being rejected. The map file should be the same length, and
797 * the same number of lines as the .txt file
798 *
 * The parameterised input is because I thought I might be able to generate
800 * multiple map files in a single run. However, it didn't work because
801 * newdiff needs etx files!
802 **********************************************************************/
803
// Disabled: the whole function is compiled out. It is written against a
// WERD_CHOICE that exposes string(). For a non-empty word it writes one map
// character per leading space and per character of the word ('1' = kept,
// '0' = rejected), then a newline for end-of-line words, and flushes the
// file. Any failed write is reported through WRITEFAILED with EXIT.
#if 0
void write_map(                //output a map file
               FILE *mapfile,  //mapfile to write to
               WERD_RES *word) {
  inT16 index;
  int status;
  STRING mapstr = "";

  if (word->best_choice->string ().length () > 0) {
    for (index = 0; index < word->word->space (); index++) {
      if (word->reject_spaces &&
        (suspect_level >= suspect_space_level) &&
        !tessedit_minimal_rejection && !tessedit_zero_rejection)
        /* Write rejected spaces to .map file ONLY. Newdiff converts these back to
        accepted spaces AFTER generating basic space stats but BEFORE using .etx */
        status = fprintf (mapfile, "0");
      else
        status = fprintf (mapfile, "1");
      if (status < 0)
        WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno);
    }

    if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) {
      // Five accepted characters for a repetition word.
      for (index = 0; index < 5; index++)
        mapstr += '1';
    }
    else {
      ASSERT_HOST (word->reject_map.length () ==
        word->best_choice->string ().length ());

      for (index = 0; index < word->reject_map.length (); index++) {
        if (word->reject_map[index].accepted ())
          mapstr += '1';
        else
          mapstr += '0';
      }
    }
    status = fprintf (mapfile, "%s", mapstr.string ());
    if (status < 0)
      WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno);
  }
  // The newline is written even when the word itself produced no output.
  if (word->word->flag (W_EOL)) {
    status = fprintf (mapfile, "\n");
    if (status < 0)
      WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno);
  }
  status = fflush (mapfile);
  if (status != 0)
    WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
}
#endif
855
856
857 /*************************************************************************
 * open_outfile()
859 *************************************************************************/
860
861 namespace tesseract {
open_outfile(const char * extension)862 FILE *Tesseract::open_outfile( //open .map & .unlv file
863 const char *extension) {
864 STRING file_name;
865 FILE *outfile;
866
867 file_name = imagebasename + extension;
868 if (!(outfile = fopen (file_name.string (), "w"))) {
869 CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
870 file_name.string (), errno);
871 }
872 return outfile;
873 }
874 } // namespace tesseract
875
876
// Disabled: the whole function is compiled out. It is written against a
// WERD_CHOICE that exposes string(). It writes one word to unlv_file:
//   - blanks and the characters '~', '^' and '|' become the unrecognised char,
//   - rejected characters are prefixed with the suspect marker '^',
//   - leading spaces are written as " " or, when spaces are suspect, "^ ",
//   - a newline follows end-of-line words,
// and the file is flushed. Any failed write is reported through WRITEFAILED
// with EXIT.
#if 0
void write_unlv_text(WERD_RES *word) {
  const char *wordstr;

  char buff[512];                //string to output; NOTE(review): writes below are not bounds-checked
  int i = 0;
  int j = 0;
  char unrecognised = STRING (unrecognised_char)[0];
  int status;
  char space_str[3];

  wordstr = word->best_choice->string ().string ();

  /* DONT need to do anything special for repeated char words - at this stage
   the repetition char has been identified and any other chars have been
   rejected.
  */

  for (; wordstr[i] != '\0'; i++) {
    if ((wordstr[i] == ' ') ||
      (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|'))
      buff[j++] = unrecognised;
    else {
      if (word->reject_map[i].rejected ())
        buff[j++] = '^';         //Add suspect marker
      buff[j++] = wordstr[i];
    }
  }
  buff[j] = '\0';

  if (strlen (wordstr) > 0) {
    if (word->reject_spaces &&
      (suspect_level >= suspect_space_level) &&
      !tessedit_minimal_rejection && !tessedit_zero_rejection)
      strcpy (space_str, "^ ");  //Suspect space
    else
      strcpy (space_str, " ");   //Certain space

    for (i = 0; i < word->word->space (); i++) {
      status = fprintf (unlv_file, "%s", space_str);
      if (status < 0)
        WRITEFAILED.error ("write_unlv_text", EXIT,
          "Space Errno: %d", errno);
    }

    status = fprintf (unlv_file, "%s", buff);
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
  }
  // The newline is written even when the word itself produced no output.
  if (word->word->flag (W_EOL)) {
    status = fprintf (unlv_file, "\n");
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT,
        "Newline Errno: %d", errno);
  }
  status = fflush (unlv_file);
  if (status != 0)
    WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
}
#endif
937
938
939 /*************************************************************************
940 * get_rep_char()
941 * Return the first accepted character from the repetition string. This is the
942 * character which is repeated - as determined earlier by fix_rep_char()
943 *************************************************************************/
944 namespace tesseract {
get_rep_char(WERD_RES * word)945 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
946 int i;
947 for (i = 0; ((i < word->reject_map.length()) &&
948 (word->reject_map[i].rejected())); ++i);
949
950 if (i < word->reject_map.length()) {
951 return word->best_choice->unichar_id(i);
952 } else {
953 return unicharset.unichar_to_id(unrecognised_char.string());
954 }
955 }
956 } // namespace tesseract
957
ensure_rep_chars_are_consistent(WERD_RES * word)958 void ensure_rep_chars_are_consistent(WERD_RES *word) {
959 #if 0
960 char rep_char = get_rep_char (word);
961 char *ptr;
962
963 ptr = (char *) word->best_choice->string ().string ();
964 for (; *ptr != '\0'; ptr++) {
965 if (*ptr != rep_char)
966 *ptr = rep_char;
967 }
968 #endif
969
970 #if 0
971 UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
972 int i;
973 char *ptr;
974 STRING consistent_string;
975 STRING consistent_string_lengths;
976
977 ptr = (char *) word->best_choice->string ().string ();
978 for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
979 consistent_string += unicharset.id_to_unichar(rep_char);
980 consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
981 }
982 word->best_choice->string() = consistent_string;
983 word->best_choice->lengths() = consistent_string_lengths;
984 #endif
985 }
986
/*************************************************************************
 * SUSPECT LEVELS
 *
 * 0 - don't reject ANYTHING
 * 1,2 - partial rejection
 * 3 - BEST
 *
 * NOTE: to reject JUST tess failures in the .map file, set suspect_level to 3
 * and enable tessedit_minimal_rejection.
 *************************************************************************/
997
998 namespace tesseract {
set_unlv_suspects(WERD_RES * word_res)999 void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
1000 int len = word_res->reject_map.length();
1001 const WERD_CHOICE &word = *(word_res->best_choice);
1002 int i;
1003 float rating_per_ch;
1004
1005 if (suspect_level == 0) {
1006 for (i = 0; i < len; i++) {
1007 if (word_res->reject_map[i].rejected())
1008 word_res->reject_map[i].setrej_minimal_rej_accept();
1009 }
1010 return;
1011 }
1012
1013 if (suspect_level >= 3)
1014 return; //Use defaults
1015
1016 /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
1017
1018 if (safe_dict_word(word) &&
1019 (count_alphas(word) > suspect_short_words)) {
1020 /* Unreject alphas in dictionary words */
1021 for (i = 0; i < len; ++i) {
1022 if (word_res->reject_map[i].rejected() &&
1023 unicharset.get_isalpha(word.unichar_id(i)))
1024 word_res->reject_map[i].setrej_minimal_rej_accept();
1025 }
1026 }
1027
1028 rating_per_ch = word.rating() / word_res->reject_map.length();
1029
1030 if (rating_per_ch >= suspect_rating_per_ch)
1031 return; //Dont touch bad ratings
1032
1033 if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
1034 /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
1035 for (i = 0; i < len; ++i) {
1036 if (word_res->reject_map[i].rejected() &&
1037 (!unicharset.eq(word.unichar_id(i), " ")))
1038 word_res->reject_map[i].setrej_minimal_rej_accept();
1039 }
1040 }
1041
1042 for (i = 0; i < len; i++) {
1043 if (word_res->reject_map[i].rejected()) {
1044 if (word_res->reject_map[i].flag(R_DOC_REJ))
1045 word_res->reject_map[i].setrej_minimal_rej_accept();
1046 if (word_res->reject_map[i].flag(R_BLOCK_REJ))
1047 word_res->reject_map[i].setrej_minimal_rej_accept();
1048 if (word_res->reject_map[i].flag(R_ROW_REJ))
1049 word_res->reject_map[i].setrej_minimal_rej_accept();
1050 }
1051 }
1052
1053 if (suspect_level == 2)
1054 return;
1055
1056 if (!suspect_constrain_1Il ||
1057 (word_res->reject_map.length() <= suspect_short_words)) {
1058 for (i = 0; i < len; i++) {
1059 if (word_res->reject_map[i].rejected()) {
1060 if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
1061 word_res->reject_map[i].flag(R_POSTNN_1IL)))
1062 word_res->reject_map[i].setrej_minimal_rej_accept();
1063
1064 if (!suspect_constrain_1Il &&
1065 word_res->reject_map[i].flag(R_MM_REJECT))
1066 word_res->reject_map[i].setrej_minimal_rej_accept();
1067 }
1068 }
1069 }
1070
1071 if ((acceptable_word_string(word.unichar_string().string(),
1072 word.unichar_lengths().string()) !=
1073 AC_UNACCEPTABLE) ||
1074 acceptable_number_string(word.unichar_string().string(),
1075 word.unichar_lengths().string())) {
1076 if (word_res->reject_map.length() > suspect_short_words) {
1077 for (i = 0; i < len; i++) {
1078 if (word_res->reject_map[i].rejected() &&
1079 (!word_res->reject_map[i].perm_rejected() ||
1080 word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
1081 word_res->reject_map[i].flag (R_POSTNN_1IL) ||
1082 word_res->reject_map[i].flag (R_MM_REJECT))) {
1083 word_res->reject_map[i].setrej_minimal_rej_accept();
1084 }
1085 }
1086 }
1087 }
1088 }
1089
count_alphas(const WERD_CHOICE & word)1090 inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
1091 int count = 0;
1092 for (int i = 0; i < word.length(); ++i) {
1093 if (unicharset.get_isalpha(word.unichar_id(i)))
1094 count++;
1095 }
1096 return count;
1097 }
1098
1099
count_alphanums(const WERD_CHOICE & word)1100 inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) {
1101 int count = 0;
1102 for (int i = 0; i < word.length(); ++i) {
1103 if (unicharset.get_isalpha(word.unichar_id(i)) ||
1104 unicharset.get_isdigit(word.unichar_id(i)))
1105 count++;
1106 }
1107 return count;
1108 }
1109
1110
acceptable_number_string(const char * s,const char * lengths)1111 BOOL8 Tesseract::acceptable_number_string(const char *s,
1112 const char *lengths) {
1113 BOOL8 prev_digit = FALSE;
1114
1115 if (*lengths == 1 && *s == '(')
1116 s++;
1117
1118 if (*lengths == 1 &&
1119 ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
1120 s++;
1121
1122 for (; *s != '\0'; s += *(lengths++)) {
1123 if (unicharset.get_isdigit (s, *lengths))
1124 prev_digit = TRUE;
1125 else if (prev_digit &&
1126 (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
1127 prev_digit = FALSE;
1128 else if (prev_digit && *lengths == 1 &&
1129 (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
1130 return TRUE;
1131 else if (prev_digit &&
1132 *lengths == 1 && (*s == '%') &&
1133 (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
1134 (*(s + *lengths + *(lengths + 1)) == '\0'))
1135 return TRUE;
1136 else
1137 return FALSE;
1138 }
1139 return TRUE;
1140 }
1141 } // namespace tesseract
1142