• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************
2  * File:        output.cpp  (Formerly output.c)
3  * Description: Output pass
4  * Author:					Phil Cheatle
5  * Created:					Thu Aug  4 10:56:08 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #include          "ocrshell.h"
22 #include          <string.h>
23 #include          <ctype.h>
24 #ifdef __UNIX__
25 #include          <assert.h>
26 #include          <unistd.h>
27 #include                    <errno.h>
28 #endif
29 #include          "mainblk.h"
30 #include          "tfacep.h"
31 #include          "tessvars.h"
32 #include          "control.h"
33 #include          "secname.h"
34 #include          "reject.h"
35 #include          "docqual.h"
36 #include          "output.h"
37 #include "bestfirst.h"
38 #include "globals.h"
39 #include "tesseractclass.h"
40 
41 #define EXTERN
42 
43 #define EPAPER_EXT      ".ep"
44 #define PAGE_YSIZE      3508
45 #define CTRL_INSET      '\024'   //dc4=text inset
46 #define CTRL_FONT       '\016'   //so=font change
47 #define CTRL_DEFAULT      '\017' //si=default font
48 #define CTRL_SHIFT      '\022'   //dc2=x shift
49 #define CTRL_TAB        '\011'   //tab
50 #define CTRL_NEWLINE      '\012' //newline
51 #define CTRL_HARDLINE   '\015'   //cr
52 
53 EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
54 "Write block separators in output");
55 EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
56 "Write raw stuff to name.raw");
57 EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
58 EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
59 "Return ratings in IPEOCRAPI data");
60 EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
61 "Write .txt to .etx map file");
62 EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
63 "Write repetition char code");
64 EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
65 EXTERN STRING_EVAR (unrecognised_char, "|",
66 "Output char for unidentified blobs");
67 EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
68 EXTERN INT_VAR (suspect_space_level, 100,
69 "Min suspect level for rejecting spaces");
70 EXTERN INT_VAR (suspect_short_words, 2,
71 "Dont Suspect dict wds longer than this");
72 EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
73 "UNLV keep 1Il chars rejected");
74 EXTERN double_VAR (suspect_rating_per_ch, 999.9,
75 "Dont touch bad rating limit");
76 EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");
77 
78 EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
79 "Only reject tess failures");
80 EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
81 EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
82 "Make output have exactly one word per WERD");
83 EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
84 "Dont reject ANYTHING AT ALL");
85 EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
86 "Force all rep chars the same");
87 
88 FILE *txt_mapfile = NULL;        //reject map
89 FILE *unlv_file = NULL;          //reject map
90 
91 /**********************************************************************
92  * pixels_to_pts
93  *
94  * Convert an integer number of pixels to the nearest integer
95  * number of points.
96  **********************************************************************/
97 
pixels_to_pts(inT32 pixels,inT32 pix_res)98 inT32 pixels_to_pts(               //convert coords
99                     inT32 pixels,
100                     inT32 pix_res  //resolution
101                    ) {
102   float pts;                     //converted value
103 
104   pts = pixels * 72.0 / pix_res;
105   return (inT32) (pts + 0.5);    //round it
106 }
107 
108 namespace tesseract {
output_pass(PAGE_RES_IT & page_res_it,BOOL8 write_to_shm,TBOX * target_word_box)109 void Tesseract::output_pass(  //Tess output pass //send to api
110                             PAGE_RES_IT &page_res_it,
111                             BOOL8 write_to_shm,
112                             TBOX *target_word_box) {
113   BLOCK_RES *block_of_last_word;
114   inT16 block_id;
115   BOOL8 force_eol;               //During output
116   BLOCK *nextblock;              //block of next word
117   WERD *nextword;                //next word
118 
119   if (tessedit_write_txt_map)
120     txt_mapfile = open_outfile (".map");
121 
122   page_res_it.restart_page ();
123   block_of_last_word = NULL;
124   while (page_res_it.word () != NULL) {
125     check_debug_pt (page_res_it.word (), 120);
126 
127 	if (target_word_box)
128 	{
129 
130 		TBOX current_word_box=page_res_it.word ()->word->bounding_box();
131 		FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
132 		if (!target_word_box->contains(center_pt))
133 		{
134 			page_res_it.forward ();
135 			continue;
136 		}
137 
138 	}
139     if (tessedit_write_block_separators &&
140     block_of_last_word != page_res_it.block ()) {
141       block_of_last_word = page_res_it.block ();
142       block_id = block_of_last_word->block->index();
143       if (!wordrec_no_block)
144         fprintf (textfile, "|^~tr%d\n", block_id);
145       fprintf (txt_mapfile, "|^~tr%d\n", block_id);
146     }
147 
148     force_eol = (tessedit_write_block_separators &&
149       (page_res_it.block () != page_res_it.next_block ())) ||
150       (page_res_it.next_word () == NULL);
151 
152     if (page_res_it.next_word () != NULL)
153       nextword = page_res_it.next_word ()->word;
154     else
155       nextword = NULL;
156     if (page_res_it.next_block () != NULL)
157       nextblock = page_res_it.next_block ()->block;
158     else
159       nextblock = NULL;
160                                  //regardless of tilde crunching
161     write_results (page_res_it, determine_newline_type (page_res_it.word ()->word, page_res_it.block ()->block, nextword, nextblock), force_eol,
162       write_to_shm);
163     page_res_it.forward ();
164   }
165   if (write_to_shm)
166     ocr_send_text(FALSE);
167   if (tessedit_write_block_separators) {
168     if (!wordrec_no_block)
169       fprintf (textfile, "|^~tr\n");
170     fprintf (txt_mapfile, "|^~tr\n");
171   }
172   if (tessedit_write_txt_map) {
173     fprintf (txt_mapfile, "\n"); //because txt gets one
174     #ifdef __UNIX__
175     fsync (fileno (txt_mapfile));
176     #endif
177     fclose(txt_mapfile);
178   }
179 }
180 
181 
182 /*************************************************************************
183  * write_results()
184  *
185  * All recognition and rejection has now been done. Generate the following:
186  *   .txt file     - giving the final best choices with NO highlighting
187  *   .raw file     - giving the tesseract top choice output for each word
188  *   .map file     - showing how the .txt file has been rejected in the .ep file
189  *   epchoice list - a list of one element per word, containing the text for the
190  *                   epaper. Reject strings are inserted.
191  *   inset list    - a list of bounding boxes of reject insets - indexed by the
192  *                   reject strings in the epchoice text.
193  *************************************************************************/
194 
write_results(PAGE_RES_IT & page_res_it,char newline_type,BOOL8 force_eol,BOOL8 write_to_shm)195 void Tesseract::write_results(                        //output a word
196                                                       //full info
197                               PAGE_RES_IT &page_res_it,
198                               char newline_type,      //type of newline
199                                                       //override tilde crunch?
200                               BOOL8 force_eol,
201                               BOOL8 write_to_shm      //send to api
202                   ) {
203                                  //word to do
204   WERD_RES *word = page_res_it.word ();
205 //   WERD_CHOICE *ep_choice;        //ep format
206   STRING repetition_code;
207   const STRING *wordstr;
208   STRING wordstr_lengths;
209   int i;
210   char unrecognised = STRING (unrecognised_char)[0];
211   char ep_chars[32];             //Only for unlv_tilde_crunch
212   int ep_chars_index = 0;
213   char txt_chs[32];              //Only for unlv_tilde_crunch
214   char map_chs[32];              //Only for unlv_tilde_crunch
215   int txt_index = 0;
216   static BOOL8 tilde_crunch_written = FALSE;
217   static BOOL8 last_char_was_newline = TRUE;
218   static BOOL8 last_char_was_tilde = FALSE;
219   static BOOL8 empty_block = TRUE;
220   BOOL8 need_reject = FALSE;
221   PBLOB_IT blob_it;              //blobs
222   UNICHAR_ID space = unicharset.unichar_to_id(" ");
223 
224   /*	if (word->best_choice->string().length() == 0)
225     {
226       tprintf("No output: to output\n");
227     }
228     else if (word->best_choice->string()[0]==' ')
229     {
230       tprintf("spaceword to output\n");
231     }
232     else if (word->best_choice->string()[0]=='\0')
233     {
234       tprintf("null to output\n");
235     }*/
236   if (word->unlv_crunch_mode != CR_NONE
237   && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
238     if ((word->unlv_crunch_mode != CR_DELETE) &&
239       (!tilde_crunch_written ||
240       ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
241       (word->word->space () > 0) &&
242       !word->word->flag (W_FUZZY_NON) &&
243     !word->word->flag (W_FUZZY_SP)))) {
244       if (!word->word->flag (W_BOL) &&
245         (word->word->space () > 0) &&
246         !word->word->flag (W_FUZZY_NON) &&
247       !word->word->flag (W_FUZZY_SP)) {
248         /* Write a space to separate from preceeding good text */
249         txt_chs[txt_index] = ' ';
250         map_chs[txt_index++] = '1';
251         ep_chars[ep_chars_index++] = ' ';
252         last_char_was_tilde = FALSE;
253       }
254       need_reject = TRUE;
255     }
256     if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {
257       /* Write a reject char - mark as rejected unless zero_rejection mode */
258       last_char_was_tilde = TRUE;
259       txt_chs[txt_index] = unrecognised;
260       if (tessedit_zero_rejection || (suspect_level == 0)) {
261         map_chs[txt_index++] = '1';
262         ep_chars[ep_chars_index++] = unrecognised;
263       }
264       else {
265         map_chs[txt_index++] = '0';
266         /*
267            The ep_choice string is a faked reject to allow newdiff to sync the
268            .etx with the .txt and .map files.
269          */
270         ep_chars[ep_chars_index++] = CTRL_INSET;
271         //escape code
272                                  //dummy reject
273         ep_chars[ep_chars_index++] = 1;
274                                  //dummy reject
275         ep_chars[ep_chars_index++] = 1;
276                                  //type
277         ep_chars[ep_chars_index++] = 2;
278                                  //dummy reject
279         ep_chars[ep_chars_index++] = 1;
280                                  //dummy reject
281         ep_chars[ep_chars_index++] = 1;
282       }
283       tilde_crunch_written = TRUE;
284       last_char_was_newline = FALSE;
285       empty_block = FALSE;
286     }
287 
288     if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {
289       /* Add a new line output */
290       txt_chs[txt_index] = '\n';
291       map_chs[txt_index++] = '\n';
292                                  //end line
293       ep_chars[ep_chars_index++] = newline_type;
294 
295                                  //Cos of the real newline
296       tilde_crunch_written = FALSE;
297       last_char_was_newline = TRUE;
298       last_char_was_tilde = FALSE;
299     }
300     txt_chs[txt_index] = '\0';
301     map_chs[txt_index] = '\0';
302                                  //xiaofan
303     if (tessedit_write_output && !wordrec_no_block)
304       fprintf (textfile, "%s", txt_chs);
305 
306     if (tessedit_write_txt_map)
307       fprintf (txt_mapfile, "%s", map_chs);
308 
309                                  //terminate string
310     ep_chars[ep_chars_index] = '\0';
311     word->ep_choice = new WERD_CHOICE(ep_chars, unicharset);
312 
313     if (force_eol)
314       empty_block = TRUE;
315     return;
316   }
317 
318   /* NORMAL PROCESSING of non tilde crunched words */
319 
320   tilde_crunch_written = FALSE;
321   if (newline_type)
322     last_char_was_newline = TRUE;
323   else
324     last_char_was_newline = FALSE;
325   empty_block = force_eol;       //About to write a real word
326 
327   if (unlv_tilde_crunching &&
328       last_char_was_tilde &&
329       (word->word->space() == 0) &&
330       !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
331       (word->best_choice->unichar_id(0) == space)) {
332     /* Prevent adjacent tilde across words - we know that adjacent tildes within
333        words have been removed */
334     word->best_choice->remove_unichar_id(0);
335     word->best_choice->populate_unichars(getDict().getUnicharset());
336     word->reject_map.remove_pos (0);
337     blob_it = word->outword->blob_list ();
338     delete blob_it.extract ();   //get rid of reject blob
339   }
340   if (newline_type ||
341     (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
342     last_char_was_tilde = FALSE;
343   else {
344     if (word->reject_map.length () > 0) {
345       if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
346         last_char_was_tilde = TRUE;
347       else
348         last_char_was_tilde = FALSE;
349     }
350     else if (word->word->space () > 0)
351       last_char_was_tilde = FALSE;
352     /* else it is unchanged as there are no output chars */
353   }
354 
355   ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
356 
357   if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
358     ensure_rep_chars_are_consistent(word);
359 
360   set_unlv_suspects(word);
361   check_debug_pt (word, 120);
362   if (tessedit_rejection_debug) {
363     tprintf ("Dict word: \"%s\": %d\n",
364              word->best_choice->debug_string(unicharset).string(),
365              dict_word(*(word->best_choice)));
366   }
367 
368 #if 0
369   if (tessedit_write_unlv) {
370     write_unlv_text(word);
371   }
372 #endif
373 
374   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
375     repetition_code = "|^~R";
376     wordstr_lengths = "\001\001\001\001";
377     repetition_code += unicharset.id_to_unichar(get_rep_char (word));
378     wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
379     wordstr = &repetition_code;
380   }
381   else {
382     if (tessedit_zero_rejection) {
383       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
384       for (i = 0; i < word->best_choice->length(); ++i) {
385         if (word->reject_map[i].rejected())
386           word->reject_map[i].setrej_minimal_rej_accept();
387       }
388     }
389     if (tessedit_minimal_rejection) {
390       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
391       for (i = 0; i < word->best_choice->length(); ++i) {
392         if ((word->best_choice->unichar_id(i) != space) &&
393             word->reject_map[i].rejected())
394           word->reject_map[i].setrej_minimal_rej_accept();
395       }
396     }
397   }
398 
399   if (write_to_shm)
400     write_shm_text (word, page_res_it.block ()->block,
401       page_res_it.row (), *wordstr, wordstr_lengths);
402 
403 #if 0
404   if (tessedit_write_output)
405     write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
406 
407   if (tessedit_write_raw_output)
408     write_cooked_text (word->word, word->raw_choice->string (),
409       TRUE, FALSE, rawfile);
410 
411   if (tessedit_write_txt_map)
412     write_map(txt_mapfile, word);
413 
414   ep_choice = make_epaper_choice (word, newline_type);
415   word->ep_choice = ep_choice;
416 #endif
417 
418   character_count += word->best_choice->length();
419   word_count++;
420 }
421 }  // namespace tesseract
422 
/**********************************************************************
 * make_epaper_choice
 *
 * Build the epaper text string for a word. Accepted blobs contribute
 * their character (or the unrecognised char if that character is a
 * space); each run of rejected blobs contributes one reject escape
 * sequence from make_reject. The whole function is compiled out.
 **********************************************************************/

#if 0
WERD_CHOICE *make_epaper_choice(                   //convert one word
                                WERD_RES *word,    //word to do
                                char newline_type  //type of newline
                               ) {
  char word_string[MAX_PATH];    //text being built
  inT16 index = 0;               //chars written so far
  inT16 prevright = 0;           //right edge of previous blob
  char unrecognised = STRING (unrecognised_char)[0];
  PBLOB_IT blob_it;              //walks the output blobs

  blob_it.set_to_list (word->outword->blob_list ());

  ASSERT_HOST (word->reject_map.length () ==
               word->best_choice->string ().length ());
  /*
  tprintf( "\"%s\" -> length: %d;  blobcount: %d (%d)\n",
      word->best_choice->string().string(),
        word->best_choice->string().length(),
      blob_it.length(),
        blob_count( word->outword ) );
  */

  //an empty choice means everything is rejected; otherwise there must be
  //exactly one blob per character
  BOOL8 force_total_reject = (word->best_choice->string ().length () == 0);
  if (!force_total_reject) {
    ASSERT_HOST (blob_it.length () ==
                 word->best_choice->string ().length ());
  }

  /* Why does this generate leading blanks regardless of whether the
  word_choice string is empty, when write_cooked_text ony generates leading
  blanks when the string is NOT empty???. */
  if (!blob_it.empty ()) {
    while (index < word->word->space ())
      word_string[index++] = ' ';  //leading blanks
  }

  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    //repetition code followed by the repeated character
    strcpy (word_string + index, "|^~R");
    index += 4;
    strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
    index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
  }
  else {
    //starts out as the left edge of the first blob
    if (!blob_it.empty ())
      prevright = blob_it.data ()->bounding_box ().left ();

    inT16 blobindex = 0;
    for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
         blobindex++, blob_it.forward ()) {
      PBLOB *blob = blob_it.data ();

      if (!word->reject_map[blobindex].accepted ()) {
        //start of a reject: grow the box over all rejected neighbours
        TBOX inset_box = blob->bounding_box ();
        while (!blob_it.at_last () &&
               (force_total_reject ||
                (word->reject_map[blobindex + 1].rejected ()))) {
          blobindex++;
          blob = blob_it.forward ();
          inset_box += blob->bounding_box ();
        }

        inT16 nextleft;          //left edge of the blob after the reject
        if (blob_it.at_last ())
          nextleft = inset_box.right ();
        else
          nextleft = blob_it.data_relative (1)->bounding_box ().left ();

        //       tprintf("Making reject from (%d,%d)->(%d,%d)\n",
        //          inset_box.left(),inset_box.bottom(),
        //          inset_box.right(),inset_box.top());

        index += make_reject (&inset_box, prevright, nextleft,
                              &word->denorm, &word_string[index]);
      }
      else if (word->best_choice->string ()[blobindex] == ' ') {
        //a space that was not rejected still has no real character
        word_string[index++] = unrecognised;
      }
      else {
        word_string[index++] = word->best_choice->string ()[blobindex];
      }
      prevright = blob->bounding_box ().right ();
    }
  }

  if (newline_type)
    word_string[index++] = newline_type;  //end line
  word_string[index] = '\0';     //terminate string

  //an embedded zero would cut the string short
  if (strlen (word_string) != index) {
    tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
      word_string, index, strlen (word_string));
  }
  ASSERT_HOST (strlen (word_string) == index);
  return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
}
#endif
533 
534 /**********************************************************************
535  * make_reject
536  *
537  * Add the escape code to the string for the reject.
538  **********************************************************************/
539 
540 inT16
make_reject(TBOX * inset_box,inT16 prevright,inT16 nextleft,DENORM * denorm,char word_string[])541 make_reject (                    //make reject code
542 TBOX * inset_box,                 //bounding box
543 inT16 prevright,                 //previous char
544 inT16 nextleft,                  //next char
545 DENORM * denorm,                 //de-normalizer
546 char word_string[]               //output string
547 ) {
548   inT16 index;                   //to string
549   inT16 xpos;                    //start of inset
550   inT16 ypos;
551   inT16 width;                   //size of inset
552   inT16 height;
553   inT16 left_offset;             //shift form prev char
554   inT16 right_offset;            //shift to next char
555   inT16 baseline_offset;         //shift from baseline
556   inT16 inset_index = 0;         //number of inset
557   inT16 min_chars;               //min width estimate
558   inT16 max_chars;               //max width estimate
559   float x_centre;                //centre of box
560 
561   index = 0;
562   x_centre = (inset_box->left () + inset_box->right ()) / 2.0;
563   left_offset =
564     (inT16) (denorm->x (inset_box->left ()) - denorm->x (prevright));
565   right_offset =
566     (inT16) (denorm->x (nextleft) - denorm->x (inset_box->right ()));
567   xpos = (inT16) floor (denorm->x (inset_box->left ()));
568   width = (inT16) ceil (denorm->x (inset_box->right ())) - xpos;
569   ypos = (inT16) floor (denorm->y (inset_box->bottom (), x_centre));
570   height = (inT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos;
571   baseline_offset = ypos - (inT16) denorm->y (bln_baseline_offset, x_centre);
572                                  //escape code
573   word_string[index++] = CTRL_INSET;
574   min_chars = (inT16) ceil (0.27 * width / denorm->row ()->x_height ());
575   max_chars = (inT16) floor (1.8 * width / denorm->row ()->x_height ());
576   /*
577   Ensure min_chars and max_chars are in the range 0..254. This ensures that
578   we can add 1 to them to avoid putting \0 in a string, and still not exceed
579   the max value in a byte.
580   */
581   if (min_chars < 0)
582     min_chars = 0;
583   if (min_chars > 254)
584     min_chars = 254;
585   if (max_chars < min_chars)
586     max_chars = min_chars;
587   if (max_chars > 254)
588     max_chars = 254;
589                                  //min chars
590   word_string[index++] = min_chars + 1;
591                                  //max chars
592   word_string[index++] = max_chars + 1;
593   word_string[index++] = 2;      //type?
594                                  //store index
595   word_string[index++] = inset_index / 255 + 1;
596   word_string[index++] = inset_index % 255 + 1;
597   return index;                  //size of string
598 }
599 
600 
601 /**********************************************************************
602  * determine_newline_type
603  *
604  * Find whether we have a wrapping or hard newline.
605  * Return FALSE if not at end of line.
606  **********************************************************************/
607 
determine_newline_type(WERD * word,BLOCK * block,WERD * next_word,BLOCK * next_block)608 char determine_newline_type(                   //test line ends
609                             WERD *word,        //word to do
610                             BLOCK *block,      //current block
611                             WERD *next_word,   //next word
612                             BLOCK *next_block  //block of next word
613                            ) {
614   inT16 end_gap;                 //to right edge
615   inT16 width;                   //of next word
616   TBOX word_box;                  //bounding
617   TBOX next_box;                  //next word
618   TBOX block_box;                 //block bounding
619 
620   if (!word->flag (W_EOL))
621     return FALSE;                //not end of line
622   if (next_word == NULL || next_block == NULL || block != next_block)
623     return CTRL_NEWLINE;
624   if (next_word->space () > 0)
625     return CTRL_HARDLINE;        //it is tabbed
626   word_box = word->bounding_box ();
627   next_box = next_word->bounding_box ();
628   block_box = block->bounding_box ();
629                                  //gap to eol
630   end_gap = block_box.right () - word_box.right ();
631   end_gap -= (inT32) block->space ();
632   width = next_box.right () - next_box.left ();
633   //      tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
634   //              block_box.right(),word_box.right(),end_gap,
635   //              next_box.right(),next_box.left(),width,
636   //              end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
637   return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
638 }
639 
640 /**********************************************************************
641  * write_shm_text
642  *
643  * Write the cooked text to the shared memory for the api.
644  **********************************************************************/
645 
write_shm_text(WERD_RES * word,BLOCK * block,ROW_RES * row,const STRING & text,const STRING & text_lengths)646 void write_shm_text(                    //write output
647                     WERD_RES *word,     //word to do
648                     BLOCK *block,       //block it is from
649                     ROW_RES *row,       //row it is from
650                     const STRING &text, //text to write
651                     const STRING &text_lengths
652                    ) {
653   inT32 index;                   //char counter
654   inT32 index2;                  //char counter
655   inT32 length;                  //chars in word
656   inT32 ptsize;                  //font size
657   inT8 blanks;                   //blanks in word
658   uinT8 enhancement;             //bold etc
659   uinT8 font;                    //font index
660   char unrecognised = STRING (unrecognised_char)[0];
661   PBLOB *blob;
662   TBOX blob_box;                  //bounding box
663   PBLOB_IT blob_it;              //blob iterator
664   WERD copy_outword;             // copy to denorm
665   uinT32 rating;                 //of char
666   BOOL8 lineend;                 //end of line
667   int offset;
668   int offset2;
669 
670                                  //point size
671   ptsize = pixels_to_pts ((inT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
672   if (word->word->flag (W_BOL) && ocr_char_space () < 128
673     && ocr_send_text (TRUE) != OKAY)
674     return;                      //release failed
675   copy_outword = *(word->outword);
676   copy_outword.baseline_denormalise (&word->denorm);
677   blob_it.set_to_list (copy_outword.blob_list ());
678   length = text_lengths.length ();
679 
680   if (length > 0) {
681     blanks = word->word->space ();
682     if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
683       blanks = 1;
684     for (index = 0, offset = 0; index < length;
685          offset += text_lengths[index++], blob_it.forward ()) {
686       blob = blob_it.data ();
687       blob_box = blob->bounding_box ();
688 
689       enhancement = 0;
690       if (word->italic > 0 || (word->italic == 0 && row->italic > 0))
691         enhancement |= EUC_ITALIC;
692       if (word->bold > 0 || (word->bold == 0 && row->bold > 0))
693         enhancement |= EUC_BOLD;
694       if (tessedit_write_ratings)
695         rating = (uinT32) (-word->best_choice->certainty () / 0.035);
696       else if (tessedit_zero_rejection)
697         rating = text[offset] == ' ' ? 100 : 0;
698       else
699         rating = word->reject_map[index].accepted ()? 0 : 100;
700       if (rating > 255)
701         rating = 255;
702       if (word->font1_count > 2)
703         font = word->font1;
704       else if (row->font1_count > 8)
705         font = row->font1;
706       else
707                                  //font index
708         font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
709 
710       lineend = word->word->flag (W_EOL) && index == length - 1;
711       if (word->word->flag (W_EOL) && tessedit_zero_rejection
712       && index < length - 1 && text[index + text_lengths[index]] == ' ') {
713         for (index2 = index + 1, offset2 = offset + text_lengths[index];
714              index2 < length && text[offset2] == ' ';
715              offset2 += text_lengths[index2++]);
716         if (index2 == length)
717           lineend = TRUE;
718       }
719 
720       if (!tessedit_zero_rejection || text[offset] != ' '
721       || tessedit_word_for_word) {
722                                  //confidence
723         if (text[offset] == ' ') {
724         ocr_append_char (unrecognised,
725                          blob_box.left (), blob_box.right (),
726                          page_image.get_ysize () - 1 - blob_box.top (),
727                          page_image.get_ysize () - 1 - blob_box.bottom (),
728                          font, (uinT8) rating,
729                          ptsize,                //point size
730                          blanks, enhancement,   //enhancement
731                          OCR_CDIR_LEFT_RIGHT,
732                          OCR_LDIR_DOWN_RIGHT,
733                          lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
734         } else {
735           for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
736             ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
737                              blob_box.left (), blob_box.right (),
738                              page_image.get_ysize () - 1 - blob_box.top (),
739                              page_image.get_ysize () - 1 - blob_box.bottom (),
740                              font, (uinT8) rating,
741                              ptsize,                //point size
742                              blanks, enhancement,   //enhancement
743                              OCR_CDIR_LEFT_RIGHT,
744                              OCR_LDIR_DOWN_RIGHT,
745                              lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
746         }
747         blanks = 0;
748       }
749 
750     }
751   }
752   else if (tessedit_word_for_word) {
753     blanks = word->word->space ();
754     if (blanks == 0 && !word->word->flag (W_BOL))
755       blanks = 1;
756     blob_box = word->word->bounding_box ();
757 
758     enhancement = 0;
759     if (word->italic > 0)
760       enhancement |= EUC_ITALIC;
761     if (word->bold > 0)
762       enhancement |= EUC_BOLD;
763     rating = 100;
764     if (word->font1_count > 2)
765       font = word->font1;
766     else if (row->font1_count > 8)
767       font = row->font1;
768     else
769                                  //font index
770       font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
771 
772     lineend = word->word->flag (W_EOL);
773 
774                                  //font index
775     ocr_append_char (unrecognised,
776                      blob_box.left (), blob_box.right (),
777                      page_image.get_ysize () - 1 - blob_box.top (),
778                      page_image.get_ysize () - 1 - blob_box.bottom (),
779                      font,
780                      rating,                    //confidence
781                      ptsize,                    //point size
782                      blanks, enhancement,       //enhancement
783                      OCR_CDIR_LEFT_RIGHT,
784                      OCR_LDIR_DOWN_RIGHT,
785                      lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
786   }
787 }
788 
789 
/**********************************************************************
 * write_map
 *
 * Write a map file of 0's and 1's which associates characters from the .txt
 * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char
 * is kept.  Note that there may be reject regions in the .etx file WITHOUT
 * .txt chars being rejected.  The map file should be the same length, and
 * have the same number of lines, as the .txt file.
 *
 * The parameterised input is because I thought I might be able to generate
 * multiple map files in a single run.  However, it didn't work because
 * newdiff needs etx files!
 **********************************************************************/
803 
#if 0
/* Disabled code: appends the map entries for one word to mapfile.
 * One mark is written per leading space, then one mark per character of the
 * word ('1' = kept, '0' = rejected), then a newline if the word ends a line.
 * Any failed write or flush is reported through WRITEFAILED with EXIT. */
void write_map(FILE *mapfile,     //mapfile to write to
               WERD_RES *word) {  //word to be mapped
  int rc;

  if (word->best_choice->string().length() > 0) {
    /* Rejected spaces are written to the .map file ONLY. Newdiff converts
       these back to accepted spaces AFTER generating basic space stats but
       BEFORE using .etx */
    const bool spaces_rejected =
        word->reject_spaces &&
        (suspect_level >= suspect_space_level) &&
        !tessedit_minimal_rejection && !tessedit_zero_rejection;
    const char *space_mark = spaces_rejected ? "0" : "1";

    for (inT16 sp = 0; sp < word->word->space(); sp++) {
      rc = fprintf(mapfile, "%s", space_mark);
      if (rc < 0)
        WRITEFAILED.error("write_map", EXIT, "Space Errno: %d", errno);
    }

    STRING marks = "";
    if (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) {
      // A repetition code is always mapped as five kept characters.
      for (inT16 n = 0; n < 5; n++)
        marks += '1';
    } else {
      ASSERT_HOST(word->reject_map.length() ==
                  word->best_choice->string().length());

      for (inT16 ch = 0; ch < word->reject_map.length(); ch++)
        marks += word->reject_map[ch].accepted() ? '1' : '0';
    }

    rc = fprintf(mapfile, "%s", marks.string());
    if (rc < 0)
      WRITEFAILED.error("write_map", EXIT, "Map str Errno: %d", errno);
  }

  if (word->word->flag(W_EOL)) {
    rc = fprintf(mapfile, "\n");
    if (rc < 0)
      WRITEFAILED.error("write_map", EXIT, "Newline Errno: %d", errno);
  }

  rc = fflush(mapfile);
  if (rc != 0)
    WRITEFAILED.error("write_map", EXIT, "fflush Errno: %d", errno);
}
#endif
855 
856 
/*************************************************************************
 * open_outfile()
 *************************************************************************/
860 
861 namespace tesseract {
open_outfile(const char * extension)862 FILE *Tesseract::open_outfile(  //open .map & .unlv file
863                    const char *extension) {
864   STRING file_name;
865   FILE *outfile;
866 
867   file_name = imagebasename + extension;
868   if (!(outfile = fopen (file_name.string (), "w"))) {
869     CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
870       file_name.string (), errno);
871   }
872   return outfile;
873 }
874 }  // namespace tesseract
875 
876 
#if 0
/* Disabled code: writes one word to unlv_file in UNLV text form.
 *
 * Characters ' ', '~', '^' and '|' in the recognised string are replaced by
 * the first character of unrecognised_char. Any other character whose
 * reject_map entry is rejected is preceded by a '^' suspect marker.
 * For a non-empty word, word->word->space() leading spaces are written first
 * ("^ " when the spaces themselves are suspect, " " otherwise). A newline
 * follows when the word is flagged W_EOL, and the stream is then flushed.
 * Any failed write or flush is reported through WRITEFAILED with EXIT. */
void write_unlv_text(WERD_RES *word) {
  const char *wordstr = word->best_choice->string().string();
  // Growable string rather than a fixed char[512]: each input character can
  // produce two output characters ('^' + char), so a long word could overrun
  // a fixed-size stack buffer.
  STRING buff = "";
  int i;
  char unrecognised = STRING (unrecognised_char)[0];
  int status;
  const char *space_str;

  /* DONT need to do anything special for repeated char words - at this stage
  the repetition char has been identified and any other chars have been
  rejected.
  */

  for (i = 0; wordstr[i] != '\0'; i++) {
    if ((wordstr[i] == ' ') ||
      (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|'))
      buff += unrecognised;
    else {
      if (word->reject_map[i].rejected ())
        buff += '^';             //Add suspect marker
      buff += wordstr[i];
    }
  }

  if (strlen (wordstr) > 0) {
    if (word->reject_spaces &&
      (suspect_level >= suspect_space_level) &&
      !tessedit_minimal_rejection && !tessedit_zero_rejection)
      space_str = "^ ";          //Suspect space
    else
      space_str = " ";           //Certain space

    for (i = 0; i < word->word->space (); i++) {
      status = fprintf (unlv_file, "%s", space_str);
      if (status < 0)
        WRITEFAILED.error ("write_unlv_text", EXIT,
          "Space Errno: %d", errno);
    }

    status = fprintf (unlv_file, "%s", buff.string ());
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
  }
  if (word->word->flag (W_EOL)) {
    status = fprintf (unlv_file, "\n");
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT,
        "Newline Errno: %d", errno);
  }
  status = fflush (unlv_file);
  if (status != 0)
    WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
}
#endif
937 
938 
939 /*************************************************************************
940  * get_rep_char()
941  * Return the first accepted character from the repetition string. This is the
942  * character which is repeated - as determined earlier by fix_rep_char()
943  *************************************************************************/
944 namespace tesseract {
get_rep_char(WERD_RES * word)945 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) {  // what char is repeated?
946   int i;
947   for (i = 0; ((i < word->reject_map.length()) &&
948                (word->reject_map[i].rejected())); ++i);
949 
950   if (i < word->reject_map.length()) {
951     return word->best_choice->unichar_id(i);
952   } else {
953     return unicharset.unichar_to_id(unrecognised_char.string());
954   }
955 }
956 }  // namespace tesseract
957 
ensure_rep_chars_are_consistent(WERD_RES * word)958 void ensure_rep_chars_are_consistent(WERD_RES *word) {
959 #if 0
960   char rep_char = get_rep_char (word);
961   char *ptr;
962 
963   ptr = (char *) word->best_choice->string ().string ();
964   for (; *ptr != '\0'; ptr++) {
965     if (*ptr != rep_char)
966       *ptr = rep_char;
967   }
968 #endif
969 
970 #if 0
971   UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
972   int i;
973   char *ptr;
974   STRING consistent_string;
975   STRING consistent_string_lengths;
976 
977   ptr = (char *) word->best_choice->string ().string ();
978   for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
979     consistent_string += unicharset.id_to_unichar(rep_char);
980     consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
981   }
982   word->best_choice->string() = consistent_string;
983   word->best_choice->lengths() = consistent_string_lengths;
984 #endif
985 }
986 
/*************************************************************************
 * SUSPECT LEVELS
 *
 * 0 - don't reject ANYTHING
 * 1,2 - partial rejection
 * 3 - BEST
 *
 * NOTE: to reject JUST tess failures in the .map file, set suspect_level to 3
 * and set tessedit_minimal_rejection.
 *************************************************************************/
997 
998 namespace tesseract {
set_unlv_suspects(WERD_RES * word_res)999 void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
1000   int len = word_res->reject_map.length();
1001   const WERD_CHOICE &word = *(word_res->best_choice);
1002   int i;
1003   float rating_per_ch;
1004 
1005   if (suspect_level == 0) {
1006     for (i = 0; i < len; i++) {
1007       if (word_res->reject_map[i].rejected())
1008         word_res->reject_map[i].setrej_minimal_rej_accept();
1009     }
1010     return;
1011   }
1012 
1013   if (suspect_level >= 3)
1014     return;                      //Use defaults
1015 
1016   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
1017 
1018   if (safe_dict_word(word) &&
1019       (count_alphas(word) > suspect_short_words)) {
1020     /* Unreject alphas in dictionary words */
1021     for (i = 0; i < len; ++i) {
1022       if (word_res->reject_map[i].rejected() &&
1023           unicharset.get_isalpha(word.unichar_id(i)))
1024         word_res->reject_map[i].setrej_minimal_rej_accept();
1025     }
1026   }
1027 
1028   rating_per_ch = word.rating() / word_res->reject_map.length();
1029 
1030   if (rating_per_ch >= suspect_rating_per_ch)
1031     return;                      //Dont touch bad ratings
1032 
1033   if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
1034     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
1035     for (i = 0; i < len; ++i) {
1036       if (word_res->reject_map[i].rejected() &&
1037           (!unicharset.eq(word.unichar_id(i), " ")))
1038         word_res->reject_map[i].setrej_minimal_rej_accept();
1039     }
1040   }
1041 
1042   for (i = 0; i < len; i++) {
1043     if (word_res->reject_map[i].rejected()) {
1044       if (word_res->reject_map[i].flag(R_DOC_REJ))
1045         word_res->reject_map[i].setrej_minimal_rej_accept();
1046       if (word_res->reject_map[i].flag(R_BLOCK_REJ))
1047         word_res->reject_map[i].setrej_minimal_rej_accept();
1048       if (word_res->reject_map[i].flag(R_ROW_REJ))
1049         word_res->reject_map[i].setrej_minimal_rej_accept();
1050     }
1051   }
1052 
1053   if (suspect_level == 2)
1054     return;
1055 
1056   if (!suspect_constrain_1Il ||
1057       (word_res->reject_map.length() <= suspect_short_words)) {
1058     for (i = 0; i < len; i++) {
1059       if (word_res->reject_map[i].rejected()) {
1060         if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
1061           word_res->reject_map[i].flag(R_POSTNN_1IL)))
1062           word_res->reject_map[i].setrej_minimal_rej_accept();
1063 
1064         if (!suspect_constrain_1Il &&
1065           word_res->reject_map[i].flag(R_MM_REJECT))
1066           word_res->reject_map[i].setrej_minimal_rej_accept();
1067       }
1068     }
1069   }
1070 
1071   if ((acceptable_word_string(word.unichar_string().string(),
1072                               word.unichar_lengths().string()) !=
1073        AC_UNACCEPTABLE) ||
1074       acceptable_number_string(word.unichar_string().string(),
1075                                word.unichar_lengths().string())) {
1076     if (word_res->reject_map.length() > suspect_short_words) {
1077       for (i = 0; i < len; i++) {
1078         if (word_res->reject_map[i].rejected() &&
1079           (!word_res->reject_map[i].perm_rejected() ||
1080            word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
1081            word_res->reject_map[i].flag (R_POSTNN_1IL) ||
1082            word_res->reject_map[i].flag (R_MM_REJECT))) {
1083           word_res->reject_map[i].setrej_minimal_rej_accept();
1084         }
1085       }
1086     }
1087   }
1088 }
1089 
count_alphas(const WERD_CHOICE & word)1090 inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
1091   int count = 0;
1092   for (int i = 0; i < word.length(); ++i) {
1093     if (unicharset.get_isalpha(word.unichar_id(i)))
1094       count++;
1095   }
1096   return count;
1097 }
1098 
1099 
count_alphanums(const WERD_CHOICE & word)1100 inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) {
1101   int count = 0;
1102   for (int i = 0; i < word.length(); ++i) {
1103     if (unicharset.get_isalpha(word.unichar_id(i)) ||
1104         unicharset.get_isdigit(word.unichar_id(i)))
1105       count++;
1106   }
1107   return count;
1108 }
1109 
1110 
acceptable_number_string(const char * s,const char * lengths)1111 BOOL8 Tesseract::acceptable_number_string(const char *s,
1112                                           const char *lengths) {
1113   BOOL8 prev_digit = FALSE;
1114 
1115   if (*lengths == 1 && *s == '(')
1116     s++;
1117 
1118   if (*lengths == 1 &&
1119       ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
1120     s++;
1121 
1122   for (; *s != '\0'; s += *(lengths++)) {
1123     if (unicharset.get_isdigit (s, *lengths))
1124       prev_digit = TRUE;
1125     else if (prev_digit &&
1126              (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
1127       prev_digit = FALSE;
1128     else if (prev_digit && *lengths == 1 &&
1129              (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
1130       return TRUE;
1131     else if (prev_digit &&
1132              *lengths == 1 && (*s == '%') &&
1133              (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
1134              (*(s + *lengths + *(lengths + 1)) == '\0'))
1135       return TRUE;
1136     else
1137       return FALSE;
1138   }
1139   return TRUE;
1140 }
1141 }  // namespace tesseract
1142