• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************
2  * File:        control.cpp  (Formerly control.c)
3  * Description: Module-independent matcher controller.
4  * Author:					Ray Smith
5  * Created:					Thu Apr 23 11:09:58 BST 1992
6  * ReHacked:    Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
7  *
8  * (C) Copyright 1992, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 #include          "mfcpch.h"
22 #include          "mainblk.h"
23 #include          <string.h>
24 #include          <math.h>
25 #ifdef __UNIX__
26 #include          <assert.h>
27 #include          <unistd.h>
28 #include                    <errno.h>
29 #endif
30 #include          <ctype.h>
31 #include          "ocrclass.h"
32 #include          "werdit.h"
33 #include          "drawfx.h"
34 #include          "tfacep.h"
35 #include          "tessbox.h"
36 #include          "tessvars.h"
37 //#include                                      "fxtop.h"
38 #include          "pgedit.h"
39 #include          "reject.h"
40 #include          "adaptions.h"
41 #include          "charcut.h"
42 #include          "fixxht.h"
43 #include          "fixspace.h"
44 #include          "genblob.h"
45 #include          "docqual.h"
46 #include          "control.h"
47 #include          "secname.h"
48 #include          "output.h"
49 #include          "callcpp.h"
50 #include          "notdll.h"
51 #include "tordvars.h"
52 #include "adaptmatch.h"
53 #include "globals.h"
54 #include "tesseractclass.h"
55 
56 #define MIN_FONT_ROW_COUNT  8
57 #define MAX_XHEIGHT_DIFF  3
58 
59 #define EXTERN
60 //extern "C" {
61 //EXTERN BOOL_VAR(tessedit_small_match,FALSE,"Use small matrix matcher");
62 
63 //extern FILE*                          matcher_fp;
64 //extern FILE*                          correct_fp;
65 //};
66 BOOL_VAR (tessedit_small_match, FALSE, "Use small matrix matcher");
67 EXTERN BOOL_VAR (tessedit_print_text, FALSE, "Write text to stdout");
68 EXTERN BOOL_VAR (tessedit_draw_words, FALSE, "Draw source words");
69 EXTERN BOOL_VAR (tessedit_draw_outwords, FALSE, "Draw output words");
70 EXTERN BOOL_VAR (tessedit_training_wiseowl, FALSE, "Call WO to learn blobs");
71 EXTERN BOOL_VAR (tessedit_training_tess, FALSE, "Call Tess to learn blobs");
72 EXTERN BOOL_VAR (tessedit_matcher_is_wiseowl, FALSE, "Call WO to classify");
73 EXTERN BOOL_VAR (tessedit_dump_choices, FALSE, "Dump char choices");
74 EXTERN BOOL_VAR (tessedit_fix_fuzzy_spaces, TRUE,
75 "Try to improve fuzzy spaces");
76 EXTERN BOOL_VAR (tessedit_unrej_any_wd, FALSE,
77 "Dont bother with word plausibility");
78 EXTERN BOOL_VAR (tessedit_fix_hyphens, TRUE, "Crunch double hyphens?");
79 
80 EXTERN BOOL_VAR (tessedit_reject_fullstops, FALSE, "Reject all fullstops");
81 EXTERN BOOL_VAR (tessedit_reject_suspect_fullstops, FALSE,
82 "Reject suspect fullstops");
83 EXTERN BOOL_VAR (tessedit_redo_xheight, TRUE, "Check/Correct x-height");
84 EXTERN BOOL_VAR (tessedit_cluster_adaption_on, TRUE,
85 "Do our own adaption - ems only");
86 EXTERN BOOL_VAR (tessedit_enable_doc_dict, TRUE,
87 "Add words to the document dictionary");
88 EXTERN BOOL_VAR (word_occ_first, FALSE, "Do word occ before re-est xht");
89 EXTERN BOOL_VAR (tessedit_debug_fonts, FALSE, "Output font info per char");
90 EXTERN BOOL_VAR (tessedit_xht_fiddles_on_done_wds, TRUE,
91 "Apply xht fix up even if done");
92 EXTERN BOOL_VAR (tessedit_xht_fiddles_on_no_rej_wds, TRUE,
93 "Apply xht fix up even in no rejects");
94 EXTERN INT_VAR (x_ht_check_word_occ, 2, "Check Char Block occupancy");
95 EXTERN INT_VAR (x_ht_stringency, 1, "How many confirmed a/n to accept?");
96 EXTERN BOOL_VAR (x_ht_quality_check, TRUE, "Dont allow worse quality");
97 EXTERN BOOL_VAR (tessedit_debug_block_rejection, FALSE,
98 "Block and Row stats");
99 EXTERN INT_VAR (debug_x_ht_level, 0, "Reestimate debug");
100 EXTERN BOOL_VAR (rej_use_xht, TRUE, "Individual rejection control");
101 EXTERN BOOL_VAR (debug_acceptable_wds, FALSE, "Dump word pass/fail chk");
102 
103 EXTERN STRING_VAR (chs_leading_punct, "('`\"", "Leading punctuation");
104 EXTERN
105 STRING_VAR (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
106 EXTERN STRING_VAR (chs_trailing_punct2, ")'`\"",
107 "2nd Trailing punctuation");
108 
109 EXTERN double_VAR (quality_rej_pc, 0.08,
110 "good_quality_doc lte rejection limit");
111 EXTERN double_VAR (quality_blob_pc, 0.0,
112 "good_quality_doc gte good blobs limit");
113 EXTERN double_VAR (quality_outline_pc, 1.0,
114 "good_quality_doc lte outline error limit");
115 EXTERN double_VAR (quality_char_pc, 0.95,
116 "good_quality_doc gte good char limit");
117 EXTERN INT_VAR (quality_min_initial_alphas_reqd, 2,
118 "alphas in a good word");
119 
120 EXTERN BOOL_VAR (tessedit_tess_adapt_to_rejmap, FALSE,
121 "Use reject map to control Tesseract adaption");
122 EXTERN INT_VAR (tessedit_tess_adaption_mode, 0x27,
123 "Adaptation decision algorithm for tess");
124 EXTERN INT_VAR (tessedit_em_adaption_mode, 0,
125 "Adaptation decision algorithm for ems matrix matcher");
126 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass1, FALSE,
127 "Adapt using clusterer after pass 1");
128 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass2, FALSE,
129 "Adapt using clusterer after pass 1");
130 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass3, FALSE,
131 "Adapt using clusterer after pass 1");
132 EXTERN BOOL_VAR (tessedit_cluster_adapt_before_pass1, FALSE,
133 "Adapt using clusterer before Tess adaping during pass 1");
134 EXTERN INT_VAR (tessedit_cluster_adaption_mode, 0,
135 "Adaptation decision algorithm for matrix matcher");
136 EXTERN BOOL_VAR (tessedit_adaption_debug, FALSE,
137 "Generate and print debug information for adaption");
138 EXTERN BOOL_VAR (tessedit_minimal_rej_pass1, FALSE,
139 "Do minimal rejection on pass 1 output");
140 EXTERN BOOL_VAR (tessedit_test_adaption, FALSE,
141 "Test adaption criteria");
142 EXTERN BOOL_VAR (tessedit_global_adaption, FALSE,
143 "Adapt to all docs over time");
144 EXTERN BOOL_VAR (tessedit_matcher_log, FALSE, "Log matcher activity");
145 EXTERN INT_VAR (tessedit_test_adaption_mode, 3,
146 "Adaptation decision algorithm for tess");
147 EXTERN BOOL_VAR(save_best_choices, FALSE,
148                 "Save the results of the recognition step"
149 " (blob_choices) within the corresponding WERD_CHOICE");
150 
151 EXTERN BOOL_VAR (test_pt, FALSE, "Test for point");
152 EXTERN double_VAR (test_pt_x, 99999.99, "xcoord");
153 EXTERN double_VAR (test_pt_y, 99999.99, "ycoord");
154 
155 extern int display_ratings;
156 extern int number_debug;
157 FILE *choice_file = NULL;        //Choice file ptr
158 
CLISTIZE(PBLOB)159 CLISTIZEH (PBLOB) CLISTIZE (PBLOB)
160 /* DEBUGGING */
161 inT16 blob_count(WERD *w) {
162   return w->blob_list ()->length ();
163 }
164 
165 
166 /**********************************************************************
167  * recog_pseudo_word
168  *
169  * Make a word from the selected blobs and run Tess on them.
170  **********************************************************************/
171 namespace tesseract {
recog_pseudo_word(BLOCK_LIST * block_list,TBOX & selection_box)172 void Tesseract::recog_pseudo_word(                         //recognize blobs
173                                   BLOCK_LIST *block_list,  //blocks to check
174                                   TBOX &selection_box) {
175   WERD *word;
176   ROW *pseudo_row;               //row of word
177   BLOCK *pseudo_block;           //block of word
178 
179   word = make_pseudo_word (block_list, selection_box,
180     pseudo_block, pseudo_row);
181   if (word != NULL) {
182     recog_interactive(pseudo_block, pseudo_row, word);
183     delete word;
184   }
185 }
186 
187 
188 /**********************************************************************
189  * recog_interactive
190  *
191  * Recognize a single word in interactive mode.
192  **********************************************************************/
recog_interactive(BLOCK * block,ROW * row,WERD * word)193 BOOL8 Tesseract::recog_interactive(            //recognize blobs
194                                    BLOCK *block,    //block
195                                    ROW *row,   //row of word
196                                    WERD *word  //word to recognize
197                        ) {
198   WERD_RES word_res(word);
199   inT16 char_qual;
200   inT16 good_char_qual;
201 
202   classify_word_pass2(&word_res, block, row);
203   #ifndef SECURE_NAMES
204   if (tessedit_debug_quality_metrics) {
205     word_char_quality(&word_res, row, &char_qual, &good_char_qual);
206     tprintf
207       ("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
208       word_res.reject_map.length (), word_blob_quality (&word_res, row),
209       word_outline_errs (&word_res), char_qual, good_char_qual);
210   }
211   #endif
212   return TRUE;
213 }
214 
215 
216 /**********************************************************************
217  * recog_all_words()
218  *
219  * Walk the current block list applying the specified word processor function
220  * to all words
221  **********************************************************************/
222 
recog_all_words(PAGE_RES * page_res,volatile ETEXT_DESC * monitor,TBOX * target_word_box,inT16 dopasses)223 void Tesseract::recog_all_words(                            //process words
224                                 PAGE_RES *page_res,         //page structure
225                                                             //progress monitor
226                                 volatile ETEXT_DESC *monitor,
227                                         // specifies just to extract a rectangle
228                                 TBOX *target_word_box,
229                                 //0 - all, 1 just pass 1, 2 passes 2 and higher
230                                 inT16 dopasses
231                                ) {
232                                  //reset page iterator
233   static PAGE_RES_IT page_res_it;
234   inT16 chars_in_word;
235   inT16 rejects_in_word;
236   static CHAR_SAMPLES_LIST em_clusters;
237   static CHAR_SAMPLE_LIST ems_waiting;
238   static CHAR_SAMPLES_LIST char_clusters;
239   static CHAR_SAMPLE_LIST chars_waiting;
240   inT16 blob_quality = 0;
241   inT16 outline_errs = 0;
242   static inT16 doc_blob_quality = 0;
243   static inT16 doc_outline_errs = 0;
244   static inT16 doc_char_quality = 0;
245   inT16 all_char_quality;
246   inT16 accepted_all_char_quality;
247   static inT16 good_char_count = 0;
248   static inT16 doc_good_char_quality = 0;
249   int i;
250 
251 
252   inT32 tess_adapt_mode = 0;
253   static inT32 word_count;              //count of words in doc
254   inT32 word_index;              //current word
255   static int dict_words;
256 
257   if (tessedit_minimal_rej_pass1) {
258     tessedit_test_adaption.set_value (TRUE);
259     tessedit_minimal_rejection.set_value (TRUE);
260   }
261 
262   if (tessedit_cluster_adapt_before_pass1) {
263     tess_adapt_mode = tessedit_tess_adaption_mode;
264     tessedit_tess_adaption_mode.set_value (0);
265     tessedit_tess_adapt_to_rejmap.set_value (TRUE);
266   }
267 
268 
269 if (dopasses==0 || dopasses==1)
270 {
271 	page_res_it.page_res=page_res;
272 	page_res_it.restart_page();
273 
274   /* Pass 1 */
275   word_count = 0;
276   if (monitor != NULL) {
277     monitor->ocr_alive = TRUE;
278     while (page_res_it.word () != NULL) {
279       word_count++;
280       page_res_it.forward ();
281     }
282     page_res_it.restart_page ();
283   }
284   else
285     word_count = 1;
286 
287   word_index = 0;
288 
289 	em_clusters.clear();
290     ems_waiting.clear();
291     char_clusters.clear();
292     chars_waiting.clear();
293     dict_words = 0;
294 	doc_blob_quality = 0;
295 	doc_outline_errs = 0;
296 	doc_char_quality = 0;
297 	good_char_count = 0;
298 	doc_good_char_quality = 0;
299 
300   while (page_res_it.word () != NULL) {
301     set_global_loc_code(LOC_PASS1);
302     word_index++;
303     if (monitor != NULL) {
304       monitor->ocr_alive = TRUE;
305       monitor->progress = 30 + 50 * word_index / word_count;
306       if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
307           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
308                                                          dict_words)))
309         return;
310     }
311     classify_word_pass1(page_res_it.word(), page_res_it.row()->row,
312                         page_res_it.block()->block, FALSE, NULL, NULL);
313     if (tessedit_dump_choices) {
314 #ifndef GRAPHICS_DISABLED
315       word_dumper(NULL, page_res_it.row()->row, page_res_it.word()->word);
316 #endif
317       tprintf("Pass1: %s [%s]\n",
318               page_res_it.word()->best_choice->unichar_string().string(),
319               page_res_it.word()->best_choice->
320                 debug_string(unicharset).string());
321     }
322 
323     if (tessedit_test_adaption && !tessedit_minimal_rejection) {
324       if (!word_adaptable (page_res_it.word (),
325         tessedit_test_adaption_mode)) {
326         page_res_it.word ()->reject_map.rej_word_tess_failure ();
327       //FAKE PERM REJ
328       } else {
329         // Override rejection mechanisms for this word.
330         UNICHAR_ID space = unicharset.unichar_to_id(" ");
331         for (i = 0; i < page_res_it.word()->best_choice->length(); i++) {
332           if ((page_res_it.word()->best_choice->unichar_id(i) != space) &&
333               page_res_it.word()->reject_map[i].rejected())
334             page_res_it.word ()->reject_map[i].setrej_minimal_rej_accept();
335         }
336       }
337     }
338 
339     if ((tessedit_cluster_adapt_after_pass1
340       || tessedit_cluster_adapt_after_pass3
341       || tessedit_cluster_adapt_before_pass1)
342     && tessedit_cluster_adaption_mode != 0) {
343       collect_characters_for_adaption (page_res_it.word (),
344         &char_clusters, &chars_waiting);
345     }
346     // Count dict words.
347     if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
348       ++dict_words;
349     page_res_it.forward ();
350   }
351 
352   if (tessedit_cluster_adapt_before_pass1)
353     tessedit_tess_adaption_mode.set_value (tess_adapt_mode);
354 
355   page_res_it.restart_page ();
356   while ((tessedit_cluster_adapt_after_pass1
357     || tessedit_cluster_adapt_before_pass1)
358   && page_res_it.word () != NULL) {
359     if (monitor != NULL)
360       monitor->ocr_alive = TRUE;
361     if (tessedit_cluster_adapt_after_pass1)
362       adapt_to_good_samples (page_res_it.word (),
363         &char_clusters, &chars_waiting);
364     else
365       classify_word_pass1 (page_res_it.word (),
366         page_res_it.row ()->row,
367                           page_res_it.block()->block,
368         TRUE, &char_clusters, &chars_waiting);
369 
370     page_res_it.forward ();
371   }
372 
373   //
374 
375 
376  }
377 
378 if (dopasses==1) return;
379 
380   /* Pass 2 */
381   page_res_it.restart_page ();
382   word_index = 0;
383   while (!tessedit_test_adaption && page_res_it.word () != NULL) {
384     set_global_loc_code(LOC_PASS2);
385     word_index++;
386     if (monitor != NULL) {
387       monitor->ocr_alive = TRUE;
388       monitor->progress = 80 + 10 * word_index / word_count;
389       if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
390           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
391                                                          dict_words)))
392         return;
393     }
394 //changed by jetsoft
395 //specific to its needs to extract one word when need
396 
397 	if (target_word_box)
398 	{
399 
400 		TBOX current_word_box=page_res_it.word ()->word->bounding_box();
401 		FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
402 		if (!target_word_box->contains(center_pt))
403 		{
404 			page_res_it.forward ();
405 			continue;
406 		}
407 
408 	}
409 //end jetsoft
410 
411     classify_word_pass2(page_res_it.word(), page_res_it.block()->block,
412                         page_res_it.row()->row);
413     if (tessedit_dump_choices) {
414 #ifndef GRAPHICS_DISABLED
415       word_dumper(NULL, page_res_it.row()->row, page_res_it.word()->word);
416 #endif
417       tprintf("Pass2: %s [%s]\n",
418               page_res_it.word()->best_choice->unichar_string().string(),
419               page_res_it.word()->best_choice->
420                 debug_string(unicharset).string());
421     }
422 
423     if (tessedit_em_adaption_mode > 0)
424       collect_ems_for_adaption (page_res_it.word (),
425         &em_clusters, &ems_waiting);
426 
427     if (tessedit_cluster_adapt_after_pass2
428       && tessedit_cluster_adaption_mode != 0)
429       collect_characters_for_adaption (page_res_it.word (),
430         &char_clusters, &chars_waiting);
431     page_res_it.forward ();
432   }
433 
434   /* Another pass */
435   set_global_loc_code(LOC_FUZZY_SPACE);
436 
437   if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
438     && !tessedit_word_for_word)
439     fix_fuzzy_spaces(monitor, word_count, page_res);
440 
441   if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0)
442                                  // Initially ems only
443     print_em_stats(&em_clusters, &ems_waiting);
444 
445   /* Pass 3 - used for checking confusion sets */
446   page_res_it.restart_page ();
447   word_index = 0;
448   while (!tessedit_test_adaption && page_res_it.word () != NULL) {
449     set_global_loc_code(LOC_MM_ADAPT);
450     word_index++;
451     if (monitor != NULL) {
452       monitor->ocr_alive = TRUE;
453       monitor->progress = 95 + 5 * word_index / word_count;
454     }
455     check_debug_pt (page_res_it.word (), 70);
456     /* Use good matches to sort out confusions */
457 
458 
459 //changed by jetsoft
460 //specific to its needs to extract one word when need
461 
462 	if (target_word_box)
463 	{
464 
465 		TBOX current_word_box=page_res_it.word ()->word->bounding_box();
466 		FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
467 		if (!target_word_box->contains(center_pt))
468 		{
469 			page_res_it.forward ();
470 			continue;
471 		}
472 
473 	}
474 // end jetsoft
475 
476     if (tessedit_em_adaption_mode != 0)
477       adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting);
478 
479     if (tessedit_cluster_adapt_after_pass2
480       && tessedit_cluster_adaption_mode != 0)
481       adapt_to_good_samples (page_res_it.word (),
482         &char_clusters, &chars_waiting);
483 
484     UNICHAR_ID dot = unicharset.unichar_to_id(".");
485     if (tessedit_reject_fullstops &&
486         page_res_it.word()->best_choice->contains_unichar_id(dot)) {
487       reject_all_fullstops (page_res_it.word ());
488     } else if (tessedit_reject_suspect_fullstops &&
489              page_res_it.word()->best_choice->contains_unichar_id(dot)) {
490       reject_suspect_fullstops (page_res_it.word ());
491     }
492 
493     page_res_it.rej_stat_word ();
494     chars_in_word = page_res_it.word ()->reject_map.length ();
495     rejects_in_word = page_res_it.word ()->reject_map.reject_count ();
496 
497     blob_quality = word_blob_quality (page_res_it.word (),
498       page_res_it.row ()->row);
499     doc_blob_quality += blob_quality;
500     outline_errs = word_outline_errs (page_res_it.word ());
501     doc_outline_errs += outline_errs;
502     word_char_quality (page_res_it.word (),
503       page_res_it.row ()->row,
504       &all_char_quality, &accepted_all_char_quality);
505     doc_char_quality += all_char_quality;
506     uinT8 permuter_type = page_res_it.word ()->best_choice->permuter ();
507     if ((permuter_type == SYSTEM_DAWG_PERM) ||
508       (permuter_type == FREQ_DAWG_PERM) ||
509     (permuter_type == USER_DAWG_PERM)) {
510       good_char_count += chars_in_word - rejects_in_word;
511       doc_good_char_quality += accepted_all_char_quality;
512     }
513     check_debug_pt (page_res_it.word (), 80);
514     if (tessedit_reject_bad_qual_wds &&
515       (blob_quality == 0) && (outline_errs >= chars_in_word))
516       page_res_it.word ()->reject_map.rej_word_bad_quality ();
517     check_debug_pt (page_res_it.word (), 90);
518     page_res_it.forward ();
519   }
520 
521   page_res_it.restart_page ();
522   while (!tessedit_test_adaption
523   && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) {
524     if (monitor != NULL)
525       monitor->ocr_alive = TRUE;
526 
527 //changed by jetsoft
528 //specific to its needs to extract one word when need
529 
530 	if (target_word_box)
531 	{
532 
533 		TBOX current_word_box=page_res_it.word ()->word->bounding_box();
534 		FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
535 		if (!target_word_box->contains(center_pt))
536 		{
537 			page_res_it.forward ();
538 			continue;
539 		}
540 
541 	}
542 
543 //end jetsoft
544     if (tessedit_cluster_adaption_mode != 0)
545       adapt_to_good_samples (page_res_it.word (),
546         &char_clusters, &chars_waiting);
547     page_res_it.forward ();
548   }
549 
550   #ifndef SECURE_NAMES
551   if (tessedit_debug_quality_metrics) {
552     tprintf
553       ("QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
554       page_res->char_count, page_res->rej_count,
555       page_res->rej_count / (float) page_res->char_count, doc_blob_quality,
556       doc_blob_quality / (float) page_res->char_count, doc_outline_errs,
557       doc_outline_errs / (float) page_res->char_count, doc_char_quality,
558       doc_char_quality / (float) page_res->char_count,
559       doc_good_char_quality,
560       good_char_count >
561       0 ? doc_good_char_quality / (float) good_char_count : 0.0);
562   }
563   #endif
564   BOOL8 good_quality_doc =
565     (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc)
566     &&
567     (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) &&
568     (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) &&
569     (doc_char_quality / (float) page_res->char_count >= quality_char_pc);
570 
571   /* Do whole document or whole block rejection pass*/
572 
573   if (!tessedit_test_adaption) {
574     set_global_loc_code(LOC_DOC_BLK_REJ);
575     quality_based_rejection(page_res_it, good_quality_doc);
576   }
577   font_recognition_pass(page_res_it);
578 
579   /* Write results pass */
580   set_global_loc_code(LOC_WRITE_RESULTS);
581   // This is now redundant, but retained commented so show how to obtain
582   // bounding boxes and style information.
583 
584   // changed by jetsoft
585   // needed for dll to output memory structure
586   if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
587     output_pass(page_res_it, ocr_char_space() > 0, target_word_box);
588   // end jetsoft
589 }
590 
591 
592 /**********************************************************************
593  * classify_word_pass1
594  *
595  * Baseline normalize the word and pass it to Tess.
596  **********************************************************************/
597 
classify_word_pass1(WERD_RES * word,ROW * row,BLOCK * block,BOOL8 cluster_adapt,CHAR_SAMPLES_LIST * char_clusters,CHAR_SAMPLE_LIST * chars_waiting)598 void Tesseract::classify_word_pass1(                 //recog one word
599                                     WERD_RES *word,  //word to do
600                                     ROW *row,
601                                     BLOCK* block,
602                                     BOOL8 cluster_adapt,
603                                     CHAR_SAMPLES_LIST *char_clusters,
604                                     CHAR_SAMPLE_LIST *chars_waiting) {
605   WERD *bln_word;                //baseline norm copy
606                                  //detailed results
607   BLOB_CHOICE_LIST_CLIST local_blob_choices;
608   BLOB_CHOICE_LIST_CLIST *blob_choices;
609   BOOL8 adapt_ok;
610   const char *rejmap;
611   inT16 index;
612   STRING mapstr = "";
613   char *match_string;
614   char word_string[1024];
615 
616   if (save_best_choices)
617     blob_choices = new BLOB_CHOICE_LIST_CLIST();
618   else
619     blob_choices = &local_blob_choices;
620 
621   if (matcher_fp != NULL) {
622     fgets (word_string, 1023, correct_fp);
623     if ((match_string = strchr (word_string, '\r')) != NULL)
624       *match_string = '\0';
625     if ((match_string = strchr (word_string, '\n')) != NULL)
626       *match_string = '\0';
627     if (word_string[0] != '\0') {
628       word->word->set_text (word_string);
629       word_answer = (char *) word->word->text ();
630     }
631     else
632       word_answer = NULL;
633   }
634 
635   check_debug_pt (word, 0);
636   bln_word = make_bln_copy(word->word, row, block, word->x_height,
637                            &word->denorm);
638 
639   word->best_choice = tess_segment_pass1 (bln_word, &word->denorm,
640     &Tesseract::tess_default_matcher,
641     word->raw_choice, blob_choices,
642     word->outword);
643   /*
644      Test for TESS screw up on word. Recog_word has already ensured that the
645      choice list, outword blob lists and best_choice string are the same
646      length. A TESS screw up is indicated by a blank filled or 0 length string.
647    */
648   if ((word->best_choice->length() == 0) ||
649       (strspn (word->best_choice->unichar_string().string(), " ") ==
650        word->best_choice->length())) {
651     word->done = FALSE;          // Try again on pass2 - adaption may help.
652     word->tess_failed = TRUE;
653     word->reject_map.initialise(word->best_choice->length());
654     word->reject_map.rej_word_tess_failure ();
655   } else {
656     word->tess_failed = FALSE;
657     if ((word->best_choice->length() !=
658          word->outword->blob_list()->length()) ||
659         (word->best_choice->length() != blob_choices->length())) {
660       tprintf
661         ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
662         word->best_choice->debug_string(unicharset).string(),
663         word->best_choice->length(),
664         word->outword->blob_list()->length(),
665         blob_choices->length());
666     }
667     ASSERT_HOST (word->best_choice->length() ==
668                  word->outword->blob_list()->length());
669     ASSERT_HOST (word->best_choice->length() == blob_choices->length ());
670 
671     /*
672        The adaption step used to be here. It has been moved to after
673        make_reject_map so that we know whether the word will be accepted in the
674        first pass or not.   This move will PREVENT adaption to words containing
675        double quotes because the word will not be identical to what tess thinks
676        its best choice is. (See CurrentBestChoiceIs in
677        danj/microfeatures/stopper.c which is used by AdaptableWord in
678        danj/microfeatures/adaptmatch.c)
679      */
680 
681     if (word->word->flag (W_REP_CHAR)) {
682       fix_rep_char(word);
683     } else {
684       // TODO(daria) delete these hacks when replaced by more generic code.
685       // Convert '' (double single) to " (single double).
686       fix_quotes(word->best_choice, word->outword, blob_choices);
687       if (tessedit_fix_hyphens)  // turn -- to -
688         fix_hyphens (word->best_choice, word->outword, blob_choices);
689       record_certainty (word->best_choice->certainty (), 1);
690       // accounting.
691 
692       word->tess_accepted = tess_acceptable_word (word->best_choice,
693         word->raw_choice);
694 
695       word->tess_would_adapt = tess_adaptable_word (word->outword,
696         word->best_choice,
697         word->raw_choice);
698                                  // Also sets word->done flag
699       make_reject_map (word, blob_choices, row, 1);
700 
701       adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode);
702 
703       if (cluster_adapt)
704         adapt_to_good_samples(word, char_clusters, chars_waiting);
705 
706       if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
707         if (!tessedit_tess_adapt_to_rejmap) {
708           rejmap = NULL;
709         } else {
710           ASSERT_HOST(word->reject_map.length() ==
711                       word->best_choice->length());
712 
713           for (index = 0; index < word->reject_map.length (); index++) {
714             if (adapt_ok || word->reject_map[index].accepted ())
715               mapstr += '1';
716             else
717               mapstr += '0';
718           }
719           rejmap = mapstr.string ();
720         }
721 
722                                  // adapt to it.
723         tess_adapter (word->outword, &word->denorm,
724                       *word->best_choice,
725                       *word->raw_choice, rejmap);
726       }
727 
728       if (tessedit_enable_doc_dict)
729         tess_add_doc_word (word->best_choice);
730       set_word_fonts(word, blob_choices);
731     }
732   }
733 #if 0
734   if (tessedit_print_text) {
735     write_cooked_text (bln_word, word->best_choice->string (),
736       word->done, FALSE, stdout);
737   }
738 #endif
739   delete bln_word;
740 
741   // Save best choices in the WERD_CHOICE if needed
742   if (blob_choices != &local_blob_choices) {
743     word->best_choice->set_blob_choices(blob_choices);
744   } else {
745     blob_choices->deep_clear();
746   }
747 }
748 
749 /**********************************************************************
750  * classify_word_pass2
751  *
752  * Control what to do with the word in pass 2
753  **********************************************************************/
754 
classify_word_pass2(WERD_RES * word,BLOCK * block,ROW * row)755 void Tesseract::classify_word_pass2(WERD_RES *word, BLOCK* block, ROW *row) {
756   BOOL8 done_this_pass = FALSE;
757   WERD_RES new_x_ht_word (word->word);
758   float new_x_ht = 0.0;
759   inT16 old_xht_reject_count;
760   inT16 new_xht_reject_count;
761   inT16 old_xht_accept_count;
762   inT16 new_xht_accept_count;
763   BOOL8 accept_new_x_ht = FALSE;
764   inT16 old_chs_in_wd;
765   inT16 new_chs_in_wd;
766   inT16 old_word_quality;
767   inT16 new_word_quality;
768   inT16 dummy;
769 
770   set_global_subloc_code(SUBLOC_NORM);
771   check_debug_pt (word, 30);
772   if (!word->done ||
773     tessedit_training_tess ||
774   tessedit_training_wiseowl) {
775     word->caps_height = 0.0;
776     if (word->x_height == 0.0f)
777       word->x_height = row->x_height();
778     if (word->outword != NULL) {
779       delete word->outword;      //get rid of junk
780       delete word->best_choice;
781       delete word->raw_choice;
782     }
783     match_word_pass2 (word, row, block, word->x_height);
784     done_this_pass = TRUE;
785     check_debug_pt (word, 40);
786   }
787 
788   if (!word->tess_failed && !word->word->flag (W_REP_CHAR)) {
789     set_global_subloc_code(SUBLOC_FIX_XHT);
790     if ((tessedit_xht_fiddles_on_done_wds || !word->done) &&
791       (tessedit_xht_fiddles_on_no_rej_wds ||
792     (word->reject_map.reject_count () > 0))) {
793       if ((x_ht_check_word_occ >= 2) && word_occ_first)
794         check_block_occ(word);
795 
796       if (tessedit_redo_xheight)
797         re_estimate_x_ht(word, &new_x_ht);
798 
799       if (((x_ht_check_word_occ >= 2) && !word_occ_first) ||
800         ((x_ht_check_word_occ >= 1) && (new_x_ht > 0)))
801         check_block_occ(word);
802     }
803     if (new_x_ht > 0) {
804       old_chs_in_wd = word->reject_map.length ();
805 
806       /* Re-estimated x_ht error suggests a rematch is worthwhile. */
807       new_x_ht_word.x_height = new_x_ht;
808       new_x_ht_word.caps_height = 0.0;
809       match_word_pass2(&new_x_ht_word, row, block, new_x_ht_word.x_height);
810       if (!new_x_ht_word.tess_failed) {
811         if ((x_ht_check_word_occ >= 1) && word_occ_first)
812           check_block_occ(&new_x_ht_word);
813 
814         re_estimate_x_ht(&new_x_ht_word, &new_x_ht);
815 
816         if ((x_ht_check_word_occ >= 1) && !word_occ_first)
817           check_block_occ(&new_x_ht_word);
818 
819         old_xht_reject_count = word->reject_map.reject_count ();
820         old_xht_accept_count = old_chs_in_wd - old_xht_reject_count;
821         new_xht_reject_count = new_x_ht_word.reject_map.reject_count ();
822         new_chs_in_wd = new_x_ht_word.reject_map.length ();
823         new_xht_accept_count = new_chs_in_wd - new_xht_reject_count;
824         accept_new_x_ht =
825           ((new_xht_accept_count > old_xht_accept_count) ||
826           ((new_xht_accept_count == old_xht_accept_count) &&
827           (new_xht_accept_count > 0))) &&
828           (!new_x_ht_word.guessed_x_ht ||
829           !new_x_ht_word.guessed_caps_ht);
830 
831         if (accept_new_x_ht && x_ht_quality_check) {
832           word_char_quality(word, row, &old_word_quality, &dummy);
833           word_char_quality(&new_x_ht_word, row, &new_word_quality, &dummy);
834           if (old_word_quality > new_word_quality)
835             accept_new_x_ht = FALSE;
836         }
837 
838         if (accept_new_x_ht && (x_ht_stringency > 0)) {
839           accept_new_x_ht =
840             (count_alphanums (&new_x_ht_word) > x_ht_stringency);
841           if (!accept_new_x_ht && rej_use_xht) {
842             if (debug_x_ht_level >= 1)
843               tprintf
844                 ("Failed stringency test so reject original word\n");
845             word->reject_map.rej_word_xht_fixup ();
846           }
847         }
848 
849         #ifndef SECURE_NAMES
850         if (debug_x_ht_level >= 1) {
851           tprintf ("New XHT Match:: %s ",
852                    word->best_choice->debug_string(unicharset).string());
853           word->reject_map.print (debug_fp);
854           tprintf (" -> %s ",
855                    new_x_ht_word.best_choice->debug_string(
856                        unicharset).string());
857           new_x_ht_word.reject_map.print (debug_fp);
858           tprintf (" %s->%s %s %s\n",
859             word->guessed_x_ht ? "GUESS" : "CERT",
860             new_x_ht_word.guessed_x_ht ? "GUESS" : "CERT",
861             new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
862             accept_new_x_ht ? "ACCEPTED" : "");
863         }
864         #endif
865       }
866       if (accept_new_x_ht) {
867         /*
868            The new x_ht is deemed superior so put the final results in the real
869            word and destroy the old results
870          */
871         delete word->outword;    //get rid of junk
872         word->outword = new_x_ht_word.outword;
873         word->denorm = new_x_ht_word.denorm;
874         delete word->best_choice;
875         word->best_choice = new_x_ht_word.best_choice;
876         delete word->raw_choice;
877         word->raw_choice = new_x_ht_word.raw_choice;
878         word->reject_map = new_x_ht_word.reject_map;
879         word->done = new_x_ht_word.done;
880         done_this_pass = TRUE;
881       }
882       else {
883       /*
884          The new x_ht is no better, so destroy the copy word and put any
885          uncertain x or cap ht estimate back to default. (I.e. dont blame
886          me if its bad!) Conditionally, use any ammended block occ chars.
887        */
888                                  //get rid of junk
889         delete new_x_ht_word.outword;
890         delete new_x_ht_word.best_choice;
891         delete new_x_ht_word.raw_choice;
892       }
893                                  //to keep new destructor happy
894       new_x_ht_word.outword = NULL;
895                                  //to keep new destructor happy
896       new_x_ht_word.best_choice = NULL;
897                                  //to keep new destructor happy
898       new_x_ht_word.raw_choice = NULL;
899 
900       if (rej_mostly_reject_mode == 2) {
901         reject_mostly_rejects(word);
902         tprintf("Rejecting mostly rejects on %s ",
903                 word->best_choice->debug_string(unicharset).string());
904       }
905     }
906 
907     set_global_subloc_code(SUBLOC_NORM);
908 
909     if (done_this_pass && !word->done && tessedit_save_stats) {
910       STRING word_str;
911       word->best_choice->string_and_lengths(unicharset, &word_str, NULL);
912       SaveBadWord(word_str.string(), word->best_choice->certainty());
913     }
914     record_certainty (word->best_choice->certainty(), 2);
915     //accounting
916   }
917 #ifndef GRAPHICS_DISABLED
918   if (tessedit_draw_outwords) {
919     if (fx_win == NULL)
920       create_fx_win();
921     clear_fx_win();
922     word->outword->plot (fx_win);
923     TBOX wbox = word->outword->bounding_box();
924     fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
925                             wbox.right(), wbox.bottom());
926     //make_picture_current(fx_win);
927     ScrollView::Update();
928   }
929 #endif
930 
931   set_global_subloc_code(SUBLOC_NORM);
932 #if 0
933   if (tessedit_print_text) {
934     write_cooked_text (word->outword, word->best_choice->string (),
935       word->done, done_this_pass, stdout);
936   }
937 #endif
938   check_debug_pt (word, 50);
939 }
940 
941 
942 /**********************************************************************
943  * match_word_pass2
944  *
945  * Baseline normalize the word and pass it to Tess.
946  **********************************************************************/
947 
match_word_pass2(WERD_RES * word,ROW * row,BLOCK * block,float x_height)948 void Tesseract::match_word_pass2(                 //recog one word
949                                  WERD_RES *word,  //word to do
950                                  ROW *row,
951                                  BLOCK* block,
952                                  float x_height) {
953   WERD *bln_word;                //baseline norm copy
954                                  //detailed results
955   BLOB_CHOICE_LIST_CLIST local_blob_choices;
956   BLOB_CHOICE_LIST_CLIST *blob_choices;
957 
958   if (save_best_choices)
959     blob_choices = new BLOB_CHOICE_LIST_CLIST();
960   else
961     blob_choices = &local_blob_choices;
962 
963   set_global_subsubloc_code(SUBSUBLOC_OTHER);
964   if (matcher_fp != NULL) {
965     word_answer = (char *) word->word->text ();
966     if (word_answer != NULL && word_answer[0] == '\0')
967       word_answer = NULL;
968   }
969   bln_word = make_bln_copy (word->word, row, block, x_height, &word->denorm);
970   set_global_subsubloc_code(SUBSUBLOC_TESS);
971   if (tessedit_training_tess)
972     word->best_choice = correct_segment_pass2 (bln_word,
973       &word->denorm,
974       &Tesseract::tess_default_matcher,
975       tess_training_tester,
976       word->raw_choice,
977       blob_choices, word->outword);
978   else {
979     word->best_choice = tess_segment_pass2 (bln_word, &word->denorm,
980       &Tesseract::tess_default_matcher,
981       word->raw_choice, blob_choices,
982       word->outword);
983   }
984   set_global_subsubloc_code(SUBSUBLOC_OTHER);
985   /*
986      Test for TESS screw up on word. Recog_word has already ensured that the
987      choice list, outword blob lists and best_choice string are the same
988      length. A TESS screw up is indicated by a blank filled or 0 length string.
989    */
990   if ((word->best_choice->length() == 0) ||
991       (strspn (word->best_choice->unichar_string().string (), " ") ==
992        word->best_choice->length())) {
993     word->tess_failed = TRUE;
994     word->reject_map.initialise (word->best_choice->length());
995     word->reject_map.rej_word_tess_failure ();
996     //              tprintf("Empty word produced\n");
997   }
998   else {
999     if ((word->best_choice->length() !=
1000       word->outword->blob_list()->length ()) ||
1001         (word->best_choice->length() != blob_choices->length())) {
1002       tprintf
1003         ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
1004         word->best_choice->debug_string(unicharset).string(),
1005         word->best_choice->length(),
1006         word->outword->blob_list()->length(), blob_choices->length());
1007     }
1008     ASSERT_HOST (word->best_choice->length() ==
1009                  word->outword->blob_list()->length());
1010     ASSERT_HOST (word->best_choice->length() == blob_choices->length());
1011 
1012     word->tess_failed = FALSE;
1013     if (word->word->flag (W_REP_CHAR)) {
1014       fix_rep_char(word);
1015     }
1016     else {
1017       fix_quotes (word->best_choice,
1018         word->outword, blob_choices);
1019       if (tessedit_fix_hyphens)
1020         fix_hyphens (word->best_choice,
1021           word->outword, blob_choices);
1022       /* Dont trust fix_quotes! - though I think I've fixed the bug */
1023       if ((word->best_choice->length() !=
1024            word->outword->blob_list()->length()) ||
1025           (word->best_choice->length() != blob_choices->length())) {
1026         #ifndef SECURE_NAMES
1027         tprintf
1028           ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
1029            word->best_choice->debug_string(unicharset).string(),
1030            word->best_choice->length(),
1031            word->outword->blob_list()->length(), blob_choices->length());
1032         #endif
1033 
1034       }
1035       ASSERT_HOST (word->best_choice->length() ==
1036                    word->outword->blob_list()->length());
1037       ASSERT_HOST (word->best_choice->length() == blob_choices->length());
1038 
1039       word->tess_accepted = tess_acceptable_word(word->best_choice,
1040                                                  word->raw_choice);
1041 
1042       make_reject_map (word, blob_choices, row, 2);
1043     }
1044   }
1045 
1046   // Save best choices in the WERD_CHOICE if needed
1047   if (blob_choices != &local_blob_choices)
1048     word->best_choice->set_blob_choices(blob_choices);
1049   else
1050     blob_choices->deep_clear();
1051 
1052   delete bln_word;
1053   assert (word->raw_choice != NULL);
1054 }
1055 }  // namespace tesseract
1056 
1057 
1058 /*************************************************************************
1059  * fix_rep_char()
1060  * The word is a repeated char. Find the repeated char character. Make a reject
1061  * string which rejects any char other than the voted char. Set the word to done
1062  * to stop rematching it.
1063  *
1064  *************************************************************************/
1065 namespace tesseract {
fix_rep_char(WERD_RES * word_res)1066 void Tesseract::fix_rep_char(WERD_RES *word_res) {
1067   struct REP_CH {
1068     UNICHAR_ID unichar_id;
1069     int count;
1070   };
1071   const WERD_CHOICE &word = *(word_res->best_choice);
1072   REP_CH *rep_ch;        // array of char counts
1073   int rep_ch_count = 0;  // how many unique chs
1074   int i, j;
1075   int total = 0;
1076   int max = 0;
1077   UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1078   UNICHAR_ID space = unicharset.unichar_to_id(" ");
1079 
1080   rep_ch = new REP_CH[word.length()];
1081   for (i = 0; i < word.length(); ++i) {
1082     for (j = 0; j < rep_ch_count &&
1083          rep_ch[j].unichar_id != word.unichar_id(i); ++j);
1084     if (j < rep_ch_count) {
1085       rep_ch[j].count++;
1086     } else {
1087       rep_ch[rep_ch_count].unichar_id = word.unichar_id(i);
1088       rep_ch[rep_ch_count].count = 1;
1089       rep_ch_count++;
1090     }
1091   }
1092 
1093   for (j = 0; j < rep_ch_count; j++) {
1094     total += rep_ch[j].count;
1095     if ((rep_ch[j].count > max) && (rep_ch[j].unichar_id != space)) {
1096       max = rep_ch[j].count;
1097       maxch_id = rep_ch[j].unichar_id;
1098     }
1099   }
1100   //      tprintf( "REPEATED CHAR %s len=%d total=%d choice=%c\n",
1101   //                        word_str, word_len, total, maxch );
1102   delete[] rep_ch;
1103 
1104   word_res->reject_map.initialise(word.length());
1105   for (i = 0; i < word.length(); ++i) {
1106     if (word.unichar_id(i) != maxch_id)
1107       word_res->reject_map[i].setrej_bad_repetition(); // rej unrecognised blobs
1108   }
1109   word_res->done = TRUE;
1110 }
1111 
1112 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
1113 // training data.
1114 
1115 // Utility function for fix_quotes
1116 // Return true if the next character in the string (given the UTF8 length in
1117 // bytes) is a quote character.
// Returns 1 when the `length` bytes at signed_str are one of the quote
// characters merged by fix_quotes, otherwise 0:
//   - the 1-byte quotes ' and `
//   - the 3-byte UTF-8 sequences E2 80 98 and E2 80 99
// Bytes are compared as unsigned so values above 0x7f match regardless of
// the signedness of plain char.
static int is_simple_quote(const char* signed_str, int length) {
  const unsigned char *bytes = (const unsigned char *) signed_str;

  if (length == 1) {
    // Standard 1-byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // The two curved quotes share their first two bytes.
    return bytes[0] == 0xe2 &&
           bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return 0;
}
1131 
1132 /**********************************************************************
1133  * fix_quotes
1134  *
1135  * Change pairs of quotes to double quotes.
1136  **********************************************************************/
fix_quotes(WERD_CHOICE * choice,WERD * word,BLOB_CHOICE_LIST_CLIST * blob_choices)1137 void Tesseract::fix_quotes(WERD_CHOICE *choice,  //choice to fix
1138                            WERD *word,    //word to do //char choices
1139                            BLOB_CHOICE_LIST_CLIST *blob_choices) {
1140   if (!unicharset.contains_unichar("\"") ||
1141       !unicharset.get_enabled(unicharset.unichar_to_id("\"")))
1142     return;  // Don't create it if it is disallowed.
1143 
1144   PBLOB_IT blob_it = word->blob_list();  // blobs
1145   BLOB_CHOICE_LIST_C_IT blob_choices_it = blob_choices;  // choices
1146   BLOB_CHOICE_IT it1;  // first choices
1147   BLOB_CHOICE_IT it2;  // second choices
1148 
1149   int i;
1150   int modified = false;
1151   for (i = 0; i < choice->length()-1;
1152        ++i, blob_it.forward(), blob_choices_it.forward()) {
1153     const char *ch = unicharset.id_to_unichar(choice->unichar_id(i));
1154     const char *next_ch = unicharset.id_to_unichar(choice->unichar_id(i+1));
1155     if (is_simple_quote(ch, strlen(ch)) &&
1156         is_simple_quote(next_ch, strlen(next_ch))) {
1157       choice->set_unichar_id(unicharset.unichar_to_id("\""), i);
1158       choice->remove_unichar_id(i+1);
1159       modified = true;
1160       merge_blobs(blob_it.data(), blob_it.data_relative(1));
1161       blob_it.forward();
1162       delete blob_it.extract();  // get rid of spare
1163 
1164       it1.set_to_list(blob_choices_it.data());
1165       it2.set_to_list(blob_choices_it.data_relative(1));
1166       if (it1.data()->certainty() < it2.data()->certainty()) {
1167         blob_choices_it.forward();
1168         delete blob_choices_it.extract();  // get rid of spare
1169       } else {
1170         delete blob_choices_it.extract();  // get rid of spare
1171         blob_choices_it.forward();
1172       }
1173     }
1174   }
1175   if (modified) {
1176     choice->populate_unichars(unicharset);
1177   }
1178 }
1179 
1180 
1181 /**********************************************************************
1182  * fix_hyphens
1183  *
1184  * Change pairs of hyphens to a single hyphen if the bounding boxes touch
1185  * Typically a long dash which has been segmented.
1186  **********************************************************************/
fix_hyphens(WERD_CHOICE * choice,WERD * word,BLOB_CHOICE_LIST_CLIST * blob_choices)1187 void Tesseract::fix_hyphens(               //crunch double hyphens
1188                             WERD_CHOICE *choice,  //choice to fix
1189                             WERD *word,    //word to do //char choices
1190                             BLOB_CHOICE_LIST_CLIST *blob_choices) {
1191   if (!unicharset.contains_unichar("-") ||
1192       !unicharset.get_enabled(unicharset.unichar_to_id("-")))
1193     return;  // Don't create it if it is disallowed.
1194 
1195   PBLOB_IT blob_it = word->blob_list();
1196   BLOB_CHOICE_LIST_C_IT blob_choices_it = blob_choices;
1197   BLOB_CHOICE_IT it1;            // first choices
1198   BLOB_CHOICE_IT it2;            // second choices
1199 
1200   bool modified = false;
1201   for (int i = 0; i+1 < choice->length();
1202        ++i, blob_it.forward (), blob_choices_it.forward ()) {
1203     const char *ch = unicharset.id_to_unichar(choice->unichar_id(i));
1204     const char *next_ch = unicharset.id_to_unichar(choice->unichar_id(i+1));
1205     if (strlen(ch) != 1 || strlen(next_ch) != 1) continue;
1206     if ((*ch == '-' || *ch == '~') &&
1207         (*next_ch == '-' || *next_ch == '~') &&
1208         (blob_it.data()->bounding_box().right() >=
1209          blob_it.data_relative(1)->bounding_box().left ())) {
1210       choice->set_unichar_id(unicharset.unichar_to_id("-"), i);
1211       choice->remove_unichar_id(i+1);
1212       modified = true;
1213       merge_blobs(blob_it.data(), blob_it.data_relative(1));
1214       blob_it.forward();
1215       delete blob_it.extract();  // get rid of spare
1216 
1217       it1.set_to_list(blob_choices_it.data());
1218       it2.set_to_list(blob_choices_it.data_relative(1));
1219       if (it1.data()->certainty() < it2.data()->certainty()) {
1220         blob_choices_it.forward();
1221         delete blob_choices_it.extract();  // get rid of spare
1222       } else {
1223         delete blob_choices_it.extract();  // get rid of spare
1224         blob_choices_it.forward();
1225       }
1226     }
1227   }
1228   if (modified) {
1229     choice->populate_unichars(unicharset);
1230   }
1231 }
1232 }  // namespace tesseract
1233 
1234 
1235 /**********************************************************************
1236  * merge_blobs
1237  *
1238  * Add the outlines from blob2 to blob1. Blob2 is emptied but not deleted.
1239  **********************************************************************/
1240 
merge_blobs(PBLOB * blob1,PBLOB * blob2)1241 void merge_blobs(               //combine 2 blobs
1242                  PBLOB *blob1,  //dest blob
1243                  PBLOB *blob2   //source blob
1244                 ) {
1245   OUTLINE_IT outline_it = blob1->out_list ();
1246   //iterator
1247 
1248   outline_it.move_to_last ();    //go to end
1249                                  //do it
1250   outline_it.add_list_after (blob2->out_list ());
1251 }
1252 
1253 
1254 /**********************************************************************
1255  * choice_dump_tester
1256  *
1257  * Matcher tester function which generates .chc file entries.
1258  * Called via test_segment_pass2 for every blob tested by tess in a word.
1259  * (But only for words for which a correct segmentation could be found.)
1260  **********************************************************************/
1261 /* DEADCODE
1262 void choice_dump_tester(                           //dump chars in word
1263                         PBLOB *,                   //blob
1264                         DENORM *,                  //de-normaliser
1265                         BOOL8 correct,             //ly segmented
1266                         char *text,                //correct text
1267                         inT32 count,               //chars in text
1268                         BLOB_CHOICE_LIST *ratings  //list of results
1269                        ) {
1270   STRING choice_file_name;
1271   BLOB_CHOICE *blob_choice;
1272   BLOB_CHOICE_IT it;
1273   char source_chars[20];
1274   char correct_char[3];
1275 
1276   if (choice_file == NULL) {
1277     choice_file_name = imagebasename + ".chc";
1278     if (!(choice_file = fopen (choice_file_name.string (), "w"))) {
1279       CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d",
1280         choice_file_name.string (), errno);
1281     }
1282   }
1283 
1284   if ((count == 0) || (text == NULL) || (text[0] == '\0')) {
1285     strcpy (source_chars, "$$");
1286     strcpy (correct_char, "$$");
1287   }
1288   else {
1289     strncpy(source_chars, text, count);
1290     source_chars[count] = '\0';
1291     if (correct) {
1292       correct_char[0] = text[0];
1293       correct_char[1] = '\0';
1294     }
1295     else {
1296       strcpy (correct_char, "$$");
1297     }
1298   }
1299   fprintf (choice_file, "%s\t%s", source_chars, correct_char);
1300 
1301   it.set_to_list (ratings);
1302   for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
1303     blob_choice = it.data ();
1304     fprintf (choice_file, "\t%s\t%f\t%f",
1305              blob_choice->unichar (),
1306              blob_choice->rating (), blob_choice->certainty ());
1307   }
1308   fprintf (choice_file, "\n");
1309 }
1310 */
1311 
1312 /*************************************************************************
1313  * make_bln_copy()
1314  *
1315  * Generate a baseline normalised copy of the source word. The copy is done so
1316  * that whatever format the original word is in, a polygonal bln version is
1317  * generated as output.
1318  *************************************************************************/
1319 
make_bln_copy(WERD * src_word,ROW * row,BLOCK * block,float x_height,DENORM * denorm)1320 WERD *make_bln_copy(WERD *src_word, ROW *row, BLOCK* block,
1321                     float x_height, DENORM *denorm) {
1322   WERD *result = src_word->poly_copy(row->x_height());
1323 
1324   result->baseline_normalise_x (row, x_height, denorm);
1325   if (block != NULL)
1326     denorm->set_block(block);
1327   return result;
1328 }
1329 
1330 
1331 namespace tesseract {
acceptable_word_string(const char * s,const char * lengths)1332 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s,
1333                                                        const char *lengths) {
1334   int i = 0;
1335   int offset = 0;
1336   int leading_punct_count;
1337   int upper_count = 0;
1338   int hyphen_pos = -1;
1339   ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1340 
1341   if (strlen (lengths) > 20)
1342     return word_type;
1343 
1344   /* Single Leading punctuation char*/
1345 
1346   if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset])))
1347     offset += lengths[i++];
1348   leading_punct_count = i;
1349 
1350   /* Initial cap */
1351   while ((s[offset] != '\0') &&
1352          unicharset.get_isupper(s + offset, lengths[i])) {
1353     offset += lengths[i++];
1354     upper_count++;
1355   }
1356   if (upper_count > 1)
1357     word_type = AC_UPPER_CASE;
1358   else {
1359     /* Lower case word, possibly with an initial cap */
1360     while ((s[offset] != '\0') &&
1361            unicharset.get_islower (s + offset, lengths[i])) {
1362       offset += lengths[i++];
1363     }
1364     if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1365       goto not_a_word;
1366     /*
1367     Allow a single hyphen in a lower case word
1368     - dont trust upper case - I've seen several cases of "H" -> "I-I"
1369     */
1370     if (lengths[i] == 1 && s[offset] == '-') {
1371       hyphen_pos = i;
1372       offset += lengths[i++];
1373       if (s[offset] != '\0') {
1374         while ((s[offset] != '\0') &&
1375                unicharset.get_islower(s + offset, lengths[i])) {
1376           offset += lengths[i++];
1377         }
1378         if (i < hyphen_pos + 3)
1379           goto not_a_word;
1380       }
1381     }
1382     else {
1383       /* Allow "'s" in NON hyphenated lower case words */
1384       if (lengths[i] == 1 && (s[offset] == '\'') &&
1385           lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1386         offset += lengths[i++];
1387         offset += lengths[i++];
1388       }
1389     }
1390     if (upper_count > 0)
1391       word_type = AC_INITIAL_CAP;
1392     else
1393       word_type = AC_LOWER_CASE;
1394   }
1395 
1396   /* Up to two different, constrained trailing punctuation chars */
1397   if (lengths[i] == 1 && (s[offset] != '\0') &&
1398       (STRING (chs_trailing_punct1).contains (s[offset])))
1399     offset += lengths[i++];
1400   if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 &&
1401     (s[offset - lengths[i - 1]] != s[offset]) &&
1402       (STRING (chs_trailing_punct2).contains (s[offset])))
1403     offset += lengths[i++];
1404 
1405   if (s[offset] != '\0')
1406     word_type = AC_UNACCEPTABLE;
1407 
1408   not_a_word:
1409 
1410   if (word_type == AC_UNACCEPTABLE) {
1411     /* Look for abbreviation string */
1412     i = 0;
1413     offset = 0;
1414     if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) {
1415       word_type = AC_UC_ABBREV;
1416       while ((s[offset] != '\0') &&
1417              unicharset.get_isupper(s + offset, lengths[i]) &&
1418              (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
1419         offset += lengths[i++];
1420         offset += lengths[i++];
1421       }
1422     }
1423     else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) {
1424       word_type = AC_LC_ABBREV;
1425       while ((s[offset] != '\0') &&
1426              unicharset.get_islower(s + offset, lengths[i]) &&
1427              (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
1428         offset += lengths[i++];
1429         offset += lengths[i++];
1430       }
1431     }
1432     if (s[offset] != '\0')
1433       word_type = AC_UNACCEPTABLE;
1434   }
1435 
1436   return word_type;
1437 }
1438 
1439 }  // namespace tesseract
1440 
1441 /* DEBUGGING ROUTINE */
1442 
check_debug_pt(WERD_RES * word,int location)1443 BOOL8 check_debug_pt(WERD_RES *word, int location) {
1444   BOOL8 show_map_detail = FALSE;
1445   inT16 i;
1446 
1447   #ifndef SECURE_NAMES
1448   if (!test_pt)
1449     return FALSE;
1450 
1451   tessedit_rejection_debug.set_value (FALSE);
1452   debug_x_ht_level.set_value (0);
1453   tessedit_cluster_debug.set_value (FALSE);
1454   nn_debug.set_value (FALSE);
1455   nn_reject_debug.set_value (FALSE);
1456 
1457   if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1458     if (location < 0)
1459       return TRUE;               //For breakpoint use
1460     tessedit_rejection_debug.set_value (TRUE);
1461     debug_x_ht_level.set_value (20);
1462     tessedit_cluster_debug.set_value (TRUE);
1463     nn_debug.set_value (TRUE);
1464     nn_reject_debug.set_value (TRUE);
1465     tprintf ("\n\nTESTWD::");
1466     switch (location) {
1467       case 0:
1468         tprintf ("classify_word_pass1 start\n");
1469         word->word->print (debug_fp);
1470         break;
1471       case 10:
1472         tprintf ("make_reject_map: initial map");
1473         break;
1474       case 20:
1475         tprintf ("make_reject_map: after NN");
1476         break;
1477       case 30:
1478         tprintf ("classify_word_pass2 - START");
1479         break;
1480       case 40:
1481         tprintf ("classify_word_pass2 - Pre Xht");
1482         break;
1483       case 50:
1484         tprintf ("classify_word_pass2 - END");
1485         show_map_detail = TRUE;
1486         break;
1487       case 60:
1488         tprintf ("fixspace");
1489         break;
1490       case 70:
1491         tprintf ("MM pass START");
1492         break;
1493       case 80:
1494         tprintf ("MM pass END");
1495         break;
1496       case 90:
1497         tprintf ("After Poor quality rejection");
1498         break;
1499       case 100:
1500         tprintf ("unrej_good_quality_words - START");
1501         break;
1502       case 110:
1503         tprintf ("unrej_good_quality_words - END");
1504         break;
1505       case 120:
1506         tprintf ("Write results pass");
1507         show_map_detail = TRUE;
1508         break;
1509     }
1510     tprintf(" \"%s\" ",
1511             word->best_choice->unichar_string().string());
1512     word->reject_map.print (debug_fp);
1513     tprintf ("\n");
1514     if (show_map_detail) {
1515       tprintf ("\"%s\"\n", word->best_choice->unichar_string().string());
1516       for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1517         tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1518         word->reject_map[i].full_print(debug_fp);
1519       }
1520     }
1521 
1522     tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1523     tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1524     return TRUE;
1525   }
1526   else
1527   #endif
1528     return FALSE;
1529 }
1530 
1531 
1532 /**********************************************************************
1533  * set_word_fonts
1534  *
1535  * Get the fonts for the word.
1536  **********************************************************************/
1537 namespace tesseract {
set_word_fonts(WERD_RES * word,BLOB_CHOICE_LIST_CLIST * blob_choices)1538 void Tesseract::set_word_fonts(
1539     WERD_RES *word,  // word to adapt to
1540     BLOB_CHOICE_LIST_CLIST *blob_choices  // detailed results
1541     ) {
1542   inT32 index;                   // char id index
1543   UNICHAR_ID choice_char_id;     // char id from word
1544   inT8 config;                   // font of char
1545                                  // character iterator
1546   BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
1547   BLOB_CHOICE_IT choice_it;      // choice iterator
1548   int fontinfo_size = get_fontinfo_table().size();
1549   int fontset_size = get_fontset_table().size();
1550   if (fontinfo_size == 0 || fontset_size == 0)
1551     return;
1552   STATS fonts(0, fontinfo_size);  // font counters
1553 
1554   word->italic = 0;
1555   word->bold = 0;
1556   for (char_it.mark_cycle_pt(), index = 0;
1557        !char_it.cycled_list(); ++index, char_it.forward()) {
1558     choice_char_id = word->best_choice->unichar_id(index);
1559     choice_it.set_to_list(char_it.data());
1560     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1561          choice_it.forward()) {
1562       if (choice_it.data()->unichar_id() == choice_char_id) {
1563         config = choice_it.data()->config();
1564         int class_id = choice_it.data()->unichar_id();
1565         int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
1566         if (font_set_id >= 0 && config >= 0 && font_set_id < fontset_size) {
1567           FontSet font_set = get_fontset_table().get(font_set_id);
1568           if (tessedit_debug_fonts) {
1569             tprintf("%s(%d=%d%c%c)", unicharset.id_to_unichar(choice_char_id),
1570                     config, (config & 31) >> 2,
1571                     config & 2 ? 'N' : 'B', config & 1 ? 'N' : 'I');
1572             const char* fontname;
1573             if (config >= font_set.size) {
1574               fontname = "Unknown";
1575             } else {
1576               fontname = get_fontinfo_table().get(
1577                 font_set.configs[config]).name;
1578             }
1579             tprintf("%s(%d,%d=%s)\n",
1580                     unicharset.id_to_unichar(choice_it.data()->unichar_id()),
1581                     font_set_id, config, fontname);
1582           }
1583           if (config < font_set.size) {
1584             int fontinfo_id = font_set.configs[config];
1585             if (fontinfo_id < fontinfo_size) {
1586               FontInfo fi = get_fontinfo_table().get(fontinfo_id);
1587               word->italic += fi.is_italic();
1588               word->bold += fi.is_bold();
1589               fonts.add(fontinfo_id, 1);
1590             }
1591           }
1592         }
1593         break;
1594       }
1595     }
1596   }
1597   find_modal_font(&fonts, &word->font1, &word->font1_count);
1598   find_modal_font(&fonts, &word->font2, &word->font2_count);
1599   if (tessedit_debug_fonts)
1600     tprintf("\n");
1601   if (word->font1_count > 0) {
1602     word->italic = word->bold = 0;
1603     for (char_it.mark_cycle_pt(), index = 0;
1604          !char_it.cycled_list();  char_it.forward(), ++index) {
1605       choice_char_id = word->best_choice->unichar_id(index);
1606       choice_it.set_to_list(char_it.data());
1607       for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1608            choice_it.forward()) {
1609         if (choice_it.data()->unichar_id() == choice_char_id) {
1610           config = choice_it.data()->config();
1611           int class_id = choice_it.data()->unichar_id();
1612           int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
1613           if (font_set_id >= 0 && config >= 0 && font_set_id < fontset_size) {
1614             int fontinfo_id = get_fontset_table().get(font_set_id).
1615                 configs[config];
1616             if (fontinfo_id == word->font1 && fontinfo_id < fontinfo_size) {
1617               FontInfo fi = fontinfo_table_.get(fontinfo_id);
1618               word->italic += fi.is_italic();
1619               word->bold += fi.is_bold();
1620             }
1621           }
1622           break;
1623         }
1624       }
1625     }
1626   }
1627 }
1628 
1629 
1630 /**********************************************************************
1631  * font_recognition_pass
1632  *
1633  * Smooth the fonts for the document.
1634  **********************************************************************/
1635 
font_recognition_pass(PAGE_RES_IT & page_res_it)1636 void Tesseract::font_recognition_pass(  //good chars in word
1637                                       PAGE_RES_IT &page_res_it) {
1638   inT32 length;                  //of word
1639   inT32 count;                   //of a feature
1640   inT8 doc_font;                 //modal font
1641   inT8 doc_font_count;           //modal font
1642   inT32 doc_italic;              //total italics
1643   inT32 doc_bold;                //total bolds
1644   ROW_RES *row = NULL;           //current row
1645   WERD_RES *word;                //current word
1646   STATS fonts (0, get_fontinfo_table().size() ?
1647                get_fontinfo_table().size() : 32);           // font counters
1648   STATS doc_fonts (0, get_fontinfo_table().size() ?
1649                get_fontinfo_table().size() : 32);           // font counters
1650 
1651   doc_italic = 0;
1652   doc_bold = 0;
1653   page_res_it.restart_page ();
1654   while (page_res_it.word () != NULL) {
1655     if (row != page_res_it.row ()) {
1656       if (row != NULL) {
1657         find_modal_font (&fonts, &row->font1, &row->font1_count);
1658         find_modal_font (&fonts, &row->font2, &row->font2_count);
1659       }
1660       row = page_res_it.row ();  //current row
1661       fonts.clear ();            //clear counters
1662       row->italic = 0;
1663       row->bold = 0;
1664     }
1665     word = page_res_it.word ();
1666     row->italic += word->italic;
1667     row->bold += word->bold;
1668     fonts.add (word->font1, word->font1_count);
1669     fonts.add (word->font2, word->font2_count);
1670     doc_italic += word->italic;
1671     doc_bold += word->bold;
1672     doc_fonts.add (word->font1, word->font1_count);
1673     doc_fonts.add (word->font2, word->font2_count);
1674     page_res_it.forward ();
1675   }
1676   if (row != NULL) {
1677     find_modal_font (&fonts, &row->font1, &row->font1_count);
1678     find_modal_font (&fonts, &row->font2, &row->font2_count);
1679   }
1680   find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
1681   /*
1682     row=NULL;
1683     page_res_it.restart_page();
1684     while (page_res_it.word() != NULL)
1685     {
1686       if (row!=page_res_it.row())
1687       {
1688         row2=row;
1689         row=page_res_it.row();
1690         if (row->font1_count<MIN_FONT_ROW_COUNT)
1691         {
1692           fonts.clear();
1693           italic=0;
1694           bold=0;
1695           add_in_one_row(row,&fonts,&italic,&bold);
1696           if (row2!=NULL)
1697           {
1698             hdiff=row->row->x_height()-row2->row->x_height();
1699             if (hdiff<0)
1700               hdiff=-hdiff;
1701             if (hdiff<MAX_XHEIGHT_DIFF)
1702               add_in_one_row(row2,&fonts,&italic,&bold);
1703           }
1704           do
1705             page_res_it.forward();
1706           while (page_res_it.row()==row);
1707           row2=page_res_it.row();
1708           if (row2!=NULL)
1709           {
1710             hdiff=row->row->x_height()-row2->row->x_height();
1711             if (hdiff<0)
1712               hdiff=-hdiff;
1713             if (hdiff<MAX_XHEIGHT_DIFF)
1714               add_in_one_row(row2,&fonts,&italic,&bold);
1715           }
1716           row->italic=italic;
1717           row->bold=bold;
1718           find_modal_font(&fonts,&row->font1,&row->font1_count);
1719           find_modal_font(&fonts,&row->font2,&row->font2_count);
1720         }
1721         else
1722           page_res_it.forward();
1723       }
1724       else
1725         page_res_it.forward();
1726     }*/
1727 
1728   page_res_it.restart_page ();
1729   while (page_res_it.word () != NULL) {
1730     row = page_res_it.row ();    //current row
1731     word = page_res_it.word ();
1732     length = word->best_choice->length();
1733 
1734     count = word->italic;
1735     if (count < 0)
1736       count = -count;
1737     if (!(count == length || (length > 3 && count >= length * 3 / 4)))
1738       word->italic = doc_italic > 0 ? 1 : -1;
1739 
1740     count = word->bold;
1741     if (count < 0)
1742       count = -count;
1743     if (!(count == length || (length > 3 && count >= length * 3 / 4)))
1744       word->bold = doc_bold > 0 ? 1 : -1;
1745 
1746     count = word->font1_count;
1747     if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
1748       word->font1 = doc_font;
1749       word->font1_count = doc_font_count;
1750     }
1751 
1752     page_res_it.forward ();
1753   }
1754 }
1755 }  // namespace tesseract
1756 
1757 
1758 /**********************************************************************
1759  * add_in_one_row
1760  *
1761  * Add into the stats for one row.
1762  **********************************************************************/
1763 
add_in_one_row(ROW_RES * row,STATS * fonts,inT8 * italic,inT8 * bold)1764 void add_in_one_row(               //good chars in word
1765                     ROW_RES *row,  //current row
1766                     STATS *fonts,  //font stats
1767                     inT8 *italic,  //output count
1768                     inT8 *bold     //output count
1769                    ) {
1770   WERD_RES *word;                //current word
1771   WERD_RES_IT word_it = &row->word_res_list;
1772 
1773   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
1774     word = word_it.data ();
1775     *italic += word->italic;
1776     *bold += word->bold;
1777     if (word->font1_count > 0)
1778       fonts->add (word->font1, word->font1_count);
1779     if (word->font2_count > 0)
1780       fonts->add (word->font2, word->font2_count);
1781 
1782   }
1783 }
1784 
1785 
1786 /**********************************************************************
1787  * find_modal_font
1788  *
1789  * Find the modal font and remove from the stats.
1790  **********************************************************************/
1791 
find_modal_font(STATS * fonts,inT8 * font_out,inT8 * font_count)1792 void find_modal_font(                  //good chars in word
1793                      STATS *fonts,     //font stats
1794                      inT8 *font_out,   //output font
1795                      inT8 *font_count  //output count
1796                     ) {
1797   inT8 font;                     //font index
1798   inT32 count;                   //pile couat
1799 
1800   if (fonts->get_total () > 0) {
1801     font = (inT8) fonts->mode ();
1802     *font_out = font;
1803     count = fonts->pile_count (font);
1804     *font_count = count < MAX_INT8 ? count : MAX_INT8;
1805     fonts->add (font, -*font_count);
1806   }
1807   else {
1808     *font_out = -1;
1809     *font_count = 0;
1810   }
1811 }
1812