• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**********************************************************************
2  * File:        fixxht.cpp  (Formerly fixxht.c)
3  * Description: Improve x_ht and look out for case inconsistencies
4  * Author:		Phil Cheatle
5  * Created:		Thu Aug  5 14:11:08 BST 1993
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #include          <string.h>
22 #include          <ctype.h>
23 #include          "varable.h"
24 #include          "tessvars.h"
25 #include          "control.h"
26 #include          "reject.h"
27 #include          "fixxht.h"
28 #include          "secname.h"
29 #include          "tesseractclass.h"
30 
31 #define EXTERN
32 
33 EXTERN double_VAR (x_ht_fraction_of_caps_ht, 0.7,
34 "Fract of cps ht est of xht");
35 EXTERN double_VAR (x_ht_variation, 0.35,
36 "Err band as fract of caps/xht dist");
37 EXTERN double_VAR (x_ht_sub_variation, 0.5,
38 "Err band as fract of caps/xht dist");
39 EXTERN BOOL_VAR (rej_trial_ambigs, TRUE,
40 "reject x-ht ambigs when under trial");
41 EXTERN BOOL_VAR (x_ht_conservative_ambigs, FALSE,
42 "Dont rely on ambigs + maxht");
43 EXTERN BOOL_VAR (x_ht_check_est, TRUE, "Cross check estimates");
44 EXTERN BOOL_VAR (x_ht_case_flip, FALSE, "Flip or reject suspect case");
45 EXTERN BOOL_VAR (x_ht_include_dodgy_blobs, TRUE,
46 "Include blobs with possible noise?");
47 EXTERN BOOL_VAR (x_ht_limit_flip_trials, TRUE,
48 "Dont do trial flips when ambigs are close to xht?");
49 EXTERN BOOL_VAR (rej_use_check_block_occ, TRUE,
50 "Analyse rejection behaviour");
51 
52 EXTERN STRING_VAR (chs_non_ambig_caps_ht,
53 "!#$%&()/12346789?ABDEFGHIKLNQRT[]\\bdfhkl",
54 "Reliable ascenders");
55 EXTERN STRING_VAR (chs_x_ht, "acegmnopqrsuvwxyz", "X height chars");
56 EXTERN STRING_VAR (chs_non_ambig_x_ht, "aenqr", "reliable X height chars");
57 EXTERN STRING_VAR (chs_ambig_caps_x, "cCmMoO05sSuUvVwWxXzZ",
58 "X ht or caps ht chars");
59 EXTERN STRING_VAR (chs_bl_ambig_caps_x, "pPyY", " Caps or descender ambigs");
60 
61 /* The following arent used in this module but are used in applybox.c */
62 EXTERN STRING_VAR (chs_caps_ht,
63 "!#$%&()/0123456789?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]\\bdfhkl{|}",
64 "Ascender chars");
65 EXTERN STRING_VAR (chs_desc, "gjpqy", "Descender chars");
66 EXTERN STRING_VAR (chs_non_ambig_bl,
67 "!#$%&01246789?ABCDEFGHIKLMNORSTUVWXYZabcdehiklmnorstuvwxz",
68 "Reliable baseline chars");
69 EXTERN STRING_VAR (chs_odd_top, "ijt", "Chars with funny ascender region");
70 EXTERN STRING_VAR (chs_odd_bot, "()35JQ[]\\/{}|", "Chars with funny base");
71 
72 /* The following arent used but are defined for completeness */
73 EXTERN STRING_VAR (chs_bl,
74 "!#$%&()/01246789?ABCDEFGHIJKLMNOPRSTUVWXYZ[]\\abcdefhiklmnorstuvwxz{}",
75 "Baseline chars");
76 EXTERN STRING_VAR (chs_non_ambig_desc, "gq", "Reliable descender chars");
77 
78 /*************************************************************************
79  * re_estimate_x_ht()
80  *
81  * Walk the blobs in the word together with the text string and reject map.
82  * NOTE: All evaluation is done on the baseline normalised word. This is so that
83  * the TBOX class can be used (integer). The reasons for this are:
84  *   a) We must use the outword - ie the Tess result
85  *   b) The outword is always converted to integer representation as that is how
86  *      Tess works
87  *   c) We would like to use the TBOX class, cos its there - this is integer
88  *      precision.
89  *   d) If we de-normed the outword we would get rounding errors and would find
90  *      that integers are too imprecise (x-height around 15 pixels instead of a
91  *      scale of 128 in bln form.
92  *   CONVINCED?
93  *
94  * A) Try to re-estimate x-ht and caps ht from confirmed pts in word.
95  *
96  *    FOR each non reject blob
97  *       IF char is baseline posn ambiguous
98  *			Remove ambiguity by comparing its posn with respect to baseline.
99  *		IF char is a confirmed x-ht char
100  *			Add x-ht posn to confirmed_x_ht pts for word
101  *    IF char is a confirmed caps-ht char
102  *			Add blob_ht to caps ht pts for word
103  *
104  *    IF Std Dev of caps hts < 2  (AND # samples > 0)
105  *		Use mean as caps ht estimate (Dont use median as we can expect a
106  *			fair variation between the heights of the NON_AMBIG_CAPS_HT_CHS)
107  *    IF Std Dev of caps hts >= 2  (AND # samples > 0)
108  *			Suspect small caps font.
109  *			Look for 2 clusters,	each with Std Dev < 2.
110  *			IF 2 clusters found
111  *			Pick the smaller median as the caps ht estimate of the smallcaps.
112  *
113  *    IF failed to estimate a caps ht
114  *       Use the median caps ht if there is one,
115  *		ELSE use the caps ht estimate of the previous word. NO!!!
116  *
117  *
118  *    IF there are confirmed x-height chars
119  *			Estimate confirmed x-height as the median value
120  *    ELSE IF there is a confirmed caps ht
121  *			Estimate confirmed x-height as a fraction of confirmed caps ht value
122  *		ELSE
123  *			Use the value for the previous word or the row value if this is the
124  *			first word in the block. NO!!!
125  *
126  * B) Add in case ambiguous blobs based on confirmed x-ht/caps ht, changing case
127  *    as necessary. Reestimate caps ht and x-ht as in A, using the extended
128  *    clusters.
129  *
130  * C) If word contains rejects, and x-ht estimate significantly differs from
131  *    original estimate, set *trial_x_ht (else 0.0) so the word can be rematched
132  *************************************************************************/
133 
re_estimate_x_ht(WERD_RES * word_res,float * trial_x_ht)134 void re_estimate_x_ht(                     //improve for 1 word
135                       WERD_RES *word_res,  //word to do
136                       float *trial_x_ht    //new match value
137                      ) {
138   PBLOB_IT blob_it;
139   inT16 blob_ht_above_baseline;
140 
141   const char *word_str;
142   inT16 i;
143   inT16 offset;
144 
145   STATS all_blobs_ht (0, 300);   //every blob in word
146   STATS x_ht (0, 300);           //confirmed pts in wd
147   STATS caps_ht (0, 300);        //confirmed pts in wd
148   STATS case_ambig (0, 300);     //lower case ambigs
149 
150   inT16 rej_blobs_count = 0;
151   inT16 rej_blobs_max_height = 0;
152   inT32 rej_blobs_max_area = 0;
153   float x_ht_ok_variation;
154   float max_blob_ht;
155   float marginally_above_x_ht;
156 
157   TBOX blob_box;                  //blob bounding box
158   float est_x_ht = 0.0;          //word estimate
159   float est_caps_ht = 0.0;       //word estimate
160                                  //based on hard data?
161   BOOL8 est_caps_ht_certain = FALSE;
162   BOOL8 est_x_ht_certain = FALSE;//based on hard data?
163   BOOL8 trial = FALSE;           //Sepeculative values?
164   BOOL8 no_comment = FALSE;      //No change in xht
165   float ambig_lc_x_est;
166   float ambig_uc_caps_est;
167   inT16 x_ht_ambigs = 0;
168   inT16 caps_ht_ambigs = 0;
169 
170   /* Calculate default variation of blob x_ht from bln x_ht for bln word */
171   x_ht_ok_variation =
172     (bln_x_height / x_ht_fraction_of_caps_ht - bln_x_height) * x_ht_variation;
173 
174   word_str = word_res->best_choice->unichar_string().string();
175   /*
176     Cycle blobs, allocating to one of the stats sets when possible.
177   */
178   blob_it.set_to_list (word_res->outword->blob_list ());
179   for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
180   !blob_it.cycled_list (); blob_it.forward (),
181            offset += word_res->best_choice->unichar_lengths()[i++]) {
182     if (!dodgy_blob (blob_it.data ())) {
183       blob_box = blob_it.data ()->bounding_box ();
184       blob_ht_above_baseline = blob_box.top () - bln_baseline_offset;
185       all_blobs_ht.add (blob_ht_above_baseline, 1);
186 
187       if (word_res->reject_map[i].rejected ()) {
188         rej_blobs_count++;
189         if (blob_box.height () > rej_blobs_max_height)
190           rej_blobs_max_height = blob_box.height ();
191         if (blob_box.area () > rej_blobs_max_area)
192           rej_blobs_max_area = blob_box.area ();
193       }
194       else {
195         if (STRING (chs_non_ambig_x_ht).contains (word_str[offset]))
196           x_ht.add (blob_ht_above_baseline, 1);
197 
198         if (STRING (chs_non_ambig_caps_ht).contains (word_str[offset]))
199           caps_ht.add (blob_ht_above_baseline, 1);
200 
201         if (STRING (chs_ambig_caps_x).contains (word_str[offset])) {
202           case_ambig.add (blob_ht_above_baseline, 1);
203           if (STRING (chs_x_ht).contains (word_str[offset]))
204             x_ht_ambigs++;
205           else
206             caps_ht_ambigs++;
207         }
208 
209         if (STRING (chs_bl_ambig_caps_x).contains (word_str[offset])) {
210           if (STRING (chs_x_ht).contains (word_str[offset])) {
211             /* confirm x_height provided > 15% total height below baseline */
212             if ((bln_baseline_offset - blob_box.bottom ()) /
213               (float) blob_box.height () > 0.15)
214               x_ht.add (blob_ht_above_baseline, 1);
215           }
216           else {
217             /* confirm caps_height provided < 5% total height below baseline */
218             if ((bln_baseline_offset - blob_box.bottom ()) /
219               (float) blob_box.height () < 0.05)
220               caps_ht.add (blob_ht_above_baseline, 1);
221           }
222         }
223       }
224     }
225   }
226   est_caps_ht = estimate_from_stats (caps_ht);
227   est_x_ht = estimate_from_stats (x_ht);
228   est_ambigs(word_res, case_ambig, &ambig_lc_x_est, &ambig_uc_caps_est);
229   max_blob_ht = all_blobs_ht.ile (0.9999);
230 
231   #ifndef SECURE_NAMES
232   if (debug_x_ht_level >= 20) {
233     tprintf ("Mode20:A: %s ", word_str);
234     word_res->reject_map.print (debug_fp);
235     tprintf (" XHT:%f CAP:%f MAX:%f AMBIG X:%f CAP:%f\n",
236       est_x_ht, est_caps_ht, max_blob_ht,
237       ambig_lc_x_est, ambig_uc_caps_est);
238   }
239   #endif
240   if (!x_ht_conservative_ambigs &&
241     (ambig_lc_x_est > 0) &&
242     (ambig_lc_x_est == ambig_uc_caps_est) &&
243   (max_blob_ht > ambig_lc_x_est + x_ht_ok_variation)) {
244                                  //may be zero but believe xht
245     ambig_uc_caps_est = est_caps_ht;
246     #ifndef SECURE_NAMES
247     if (debug_x_ht_level >= 20)
248       tprintf ("Mode20:B: Fiddle ambig_uc_caps_est to %f\n",
249         ambig_lc_x_est);
250     #endif
251   }
252 
253   /* Now make some estimates */
254 
255   if ((est_x_ht > 0) ||
256     (est_caps_ht > 0) ||
257   ((ambig_lc_x_est > 0) && (ambig_lc_x_est != ambig_uc_caps_est))) {
258     /* There is some sensible data to go on so make the most of it. */
259     if (debug_x_ht_level >= 20)
260       tprintf ("Mode20:C: Sensible Data\n", ambig_lc_x_est);
261     if (est_x_ht > 0) {
262       est_x_ht_certain = TRUE;
263       if (est_caps_ht == 0) {
264         if ((ambig_uc_caps_est > ambig_lc_x_est) &&
265           (ambig_uc_caps_est > est_x_ht + x_ht_ok_variation))
266           est_caps_ht = ambig_uc_caps_est;
267         else
268           est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
269       }
270       if (case_ambig.get_total () > 0)
271         improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht);
272       est_caps_ht_certain = caps_ht.get_total () > 0;
273       #ifndef SECURE_NAMES
274       if (debug_x_ht_level >= 20)
275         tprintf ("Mode20:D: Est from xht XHT:%f CAP:%f\n",
276           est_x_ht, est_caps_ht);
277       #endif
278     }
279     else if (est_caps_ht > 0) {
280       est_caps_ht_certain = TRUE;
281       if ((ambig_lc_x_est > 0) &&
282         (ambig_lc_x_est < est_caps_ht - x_ht_ok_variation))
283         est_x_ht = ambig_lc_x_est;
284       else
285         est_x_ht = est_caps_ht * x_ht_fraction_of_caps_ht;
286       if (ambig_lc_x_est + ambig_uc_caps_est > 0)
287         improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht);
288       est_x_ht_certain = x_ht.get_total () > 0;
289       #ifndef SECURE_NAMES
290       if (debug_x_ht_level >= 20)
291         tprintf ("Mode20:E: Est from caps XHT:%f CAP:%f\n",
292           est_x_ht, est_caps_ht);
293       #endif
294     }
295     else {
296       /* Do something based on case ambig chars alone - we have guessed that the
297         ambigs are lower case. */
298       est_x_ht = ambig_lc_x_est;
299       est_x_ht_certain = TRUE;
300       if (ambig_uc_caps_est > ambig_lc_x_est) {
301         est_caps_ht = ambig_uc_caps_est;
302         est_caps_ht_certain = TRUE;
303       }
304       else
305         est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
306 
307       #ifndef SECURE_NAMES
308       if (debug_x_ht_level >= 20)
309         tprintf ("Mode20:F: Est from ambigs XHT:%f CAP:%f\n",
310           est_x_ht, est_caps_ht);
311       #endif
312     }
313     /* Check for sane interpretation of evidence:
314       Try shifting caps ht if min certain caps ht is not significantly greater
315       than the estimated x ht or the max certain x ht is not significantly less
316       than the estimated caps ht. */
317     if (x_ht_check_est) {
318       if ((caps_ht.get_total () > 0) &&
319       (est_x_ht + x_ht_ok_variation >= caps_ht.ile (0.0001))) {
320         trial = TRUE;
321         est_caps_ht = est_x_ht;
322         est_x_ht = x_ht_fraction_of_caps_ht * est_caps_ht;
323 
324         #ifndef SECURE_NAMES
325         if (debug_x_ht_level >= 20)
326           tprintf ("Mode20:G: Trial XHT:%f CAP:%f\n",
327             est_x_ht, est_caps_ht);
328         #endif
329       }
330       else if ((x_ht.get_total () > 0) &&
331       (est_caps_ht - x_ht_ok_variation <= x_ht.ile (0.9999))) {
332         trial = TRUE;
333         est_x_ht = est_caps_ht;
334         est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
335         #ifndef SECURE_NAMES
336         if (debug_x_ht_level >= 20)
337           tprintf ("Mode20:H: Trial XHT:%f CAP:%f\n",
338             est_x_ht, est_caps_ht);
339         #endif
340       }
341     }
342   }
343 
344   else {
345     /* There is no sensible data so we're in the dark. */
346 
347     marginally_above_x_ht = bln_x_height +
348       x_ht_ok_variation * x_ht_sub_variation;
349     /*
350       If there are no rejects, or the only rejects have a narrow height, or have
351       a small area compared to a normal char, then estimate the x-height as the
352       original one. (I.e dont fiddle about if the only rejects look like
353       punctuation) - we use max height as mean or median will be too low if
354       there are only two blobs - Eg "F."
355     */
356 
357     if (debug_x_ht_level >= 20)
358       tprintf ("Mode20:I: In the dark\n");
359 
360     if ((rej_blobs_count == 0) ||
361       (rej_blobs_max_height < 0.3 * max_blob_ht) ||
362     (rej_blobs_max_area < 0.3 * max_blob_ht * max_blob_ht)) {
363       no_comment = TRUE;
364       if (debug_x_ht_level >= 20)
365         tprintf ("Mode20:J: No comment due to no rejects\n");
366     }
367     else if (x_ht_limit_flip_trials &&
368       ((max_blob_ht < marginally_above_x_ht) ||
369       ((ambig_lc_x_est > 0) &&
370       (ambig_lc_x_est == ambig_uc_caps_est) &&
371     (ambig_lc_x_est < marginally_above_x_ht)))) {
372       no_comment = TRUE;
373       if (debug_x_ht_level >= 20)
374         tprintf ("Mode20:K: No comment as close to xht %f < %f\n",
375           ambig_lc_x_est, marginally_above_x_ht);
376     }
377     else if (x_ht_conservative_ambigs && (ambig_uc_caps_est > 0)) {
378       trial = TRUE;
379       est_caps_ht = ambig_lc_x_est;
380       est_x_ht = x_ht_fraction_of_caps_ht * est_caps_ht;
381 
382       #ifndef SECURE_NAMES
383       if (debug_x_ht_level >= 20)
384         tprintf ("Mode20:L: Trial XHT:%f CAP:%f\n",
385           est_x_ht, est_caps_ht);
386       #endif
387     }
388     /*
389       If the top of the word is nowhere near where we expect ascenders to be
390       (less than half the x_ht -> caps_ht distance) - suspect an all caps word
391       at the x-ht. Estimate x-ht accordingly - but only as a TRIAL!
392       NOTE we do NOT check location of baseline. Commas can descend as much as
393       real descenders so we would need to do something to make sure that any
394       disqualifying descenders were not at the end.
395     */
396     else {
397       if (max_blob_ht <
398       (bln_x_height + bln_x_height / x_ht_fraction_of_caps_ht) / 2.0) {
399         trial = TRUE;
400         est_x_ht = x_ht_fraction_of_caps_ht * max_blob_ht;
401         est_caps_ht = max_blob_ht;
402 
403         #ifndef SECURE_NAMES
404         if (debug_x_ht_level >= 20)
405           tprintf ("Mode20:M: Trial XHT:%f CAP:%f\n",
406             est_x_ht, est_caps_ht);
407         #endif
408       }
409       else {
410         no_comment = TRUE;
411         if (debug_x_ht_level >= 20)
412           tprintf ("Mode20:N: No comment as nothing else matched\n");
413       }
414     }
415   }
416 
417   /* Sanity check - reject word if fails */
418 
419   if (!no_comment &&
420     ((est_x_ht > 2 * bln_x_height) ||
421     (est_x_ht / word_res->denorm.scale () <= min_sane_x_ht_pixels) ||
422   (est_caps_ht <= est_x_ht) || (est_caps_ht >= 2.5 * est_x_ht))) {
423     no_comment = TRUE;
424     if (!trial && rej_use_xht) {
425       if (debug_x_ht_level >= 2) {
426         tprintf ("Sanity check rejecting %s ", word_str);
427         word_res->reject_map.print (debug_fp);
428         tprintf ("\n");
429       }
430       word_res->reject_map.rej_word_xht_fixup ();
431 
432     }
433     if (debug_x_ht_level >= 20)
434       tprintf ("Mode20:O: No comment as nothing else matched\n");
435   }
436 
437   if (no_comment || trial) {
438     word_res->x_height = bln_x_height / word_res->denorm.scale ();
439     word_res->guessed_x_ht = TRUE;
440     word_res->caps_height = (bln_x_height / x_ht_fraction_of_caps_ht) /
441       word_res->denorm.scale ();
442     word_res->guessed_caps_ht = TRUE;
443     /*
444     Reject ambigs in the current word if we are uncertain and:
445         there are rejects OR
446         there is only one char which is an ambig OR
447         there is conflict between the case of the ambigs even though there is
448         no height separation Eg "Ms" recognised from "MS"
449     */
450     if (rej_trial_ambigs &&
451       ((word_res->reject_map.reject_count () > 0) ||
452       (word_res->reject_map.length () == 1) ||
453     ((x_ht_ambigs > 0) && (caps_ht_ambigs > 0)))) {
454       #ifndef SECURE_NAMES
455       if (debug_x_ht_level >= 2) {
456         tprintf ("TRIAL Rej Ambigs %s ", word_str);
457         word_res->reject_map.print (debug_fp);
458       }
459       #endif
460       reject_ambigs(word_res);
461       if (debug_x_ht_level >= 2) {
462         tprintf (" ");
463         word_res->reject_map.print (debug_fp);
464         tprintf ("\n");
465       }
466     }
467   }
468   else {
469     word_res->x_height = est_x_ht / word_res->denorm.scale ();
470     word_res->guessed_x_ht = !est_x_ht_certain;
471     word_res->caps_height = est_caps_ht / word_res->denorm.scale ();
472     word_res->guessed_caps_ht = !est_caps_ht_certain;
473   }
474 
475   if (!no_comment && (fabs (est_x_ht - bln_x_height) > x_ht_ok_variation))
476     *trial_x_ht = est_x_ht / word_res->denorm.scale ();
477   else
478     *trial_x_ht = 0.0;
479 
480   #ifndef SECURE_NAMES
481   if (((*trial_x_ht > 0) && (debug_x_ht_level >= 3)) ||
482   (debug_x_ht_level >= 5)) {
483     tprintf ("%s ", word_str);
484     word_res->reject_map.print (debug_fp);
485     tprintf
486       (" X:%0.2f Cps:%0.2f Mxht:%0.2f RJ MxHt:%d MxAr:%d Rematch:%c\n",
487       est_x_ht, est_caps_ht, max_blob_ht, rej_blobs_max_height,
488       rej_blobs_max_area, *trial_x_ht > 0 ? '*' : ' ');
489   }
490   #endif
491 
492 }
493 
494 
495 /*************************************************************************
496  * check_block_occ()
497  * Checks word for coarse block occupancy, rejecting more chars and flipping
498  * case of case ambiguous chars as required.
499  *************************************************************************/
500 namespace tesseract {
check_block_occ(WERD_RES * word_res)501 void Tesseract::check_block_occ(WERD_RES *word_res) {
502   PBLOB_IT blob_it;
503   STRING new_string;
504   STRING new_string_lengths(word_res->best_choice->unichar_lengths());
505   REJMAP new_map = word_res->reject_map;
506   WERD_CHOICE *new_choice;
507 
508   const char *word_str = word_res->best_choice->unichar_string().string();
509   inT16 i;
510   inT16 offset;
511   inT16 reject_count = 0;
512   char confirmed_char[UNICHAR_LEN + 1];
513   char temp_char[UNICHAR_LEN + 1];
514   float x_ht;
515   float caps_ht;
516 
517   new_string_lengths[0] = 0;
518 
519   if (word_res->x_height > 0)
520     x_ht = word_res->x_height * word_res->denorm.scale ();
521   else
522     x_ht = bln_x_height;
523 
524   if (word_res->caps_height > 0)
525     caps_ht = word_res->caps_height * word_res->denorm.scale ();
526   else
527     caps_ht = x_ht / x_ht_fraction_of_caps_ht;
528 
529   blob_it.set_to_list (word_res->outword->blob_list ());
530 
531   for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
532   !blob_it.cycled_list (); blob_it.forward (),
533            offset += word_res->best_choice->unichar_lengths()[i++]) {
534     strncpy(temp_char, word_str + offset,
535             word_res->best_choice->unichar_lengths()[i]); //default copy
536     temp_char[word_res->best_choice->unichar_lengths()[i]] = '\0';
537     if (word_res->reject_map[i].accepted ()) {
538       check_blob_occ (temp_char,
539                       blob_it.data ()->bounding_box ().
540                       top () - bln_baseline_offset, x_ht,
541                       caps_ht, confirmed_char);
542 
543       if (strcmp(confirmed_char, "") == 0) {
544         if (rej_use_check_block_occ) {
545           new_map[i].setrej_xht_fixup ();
546           reject_count++;
547         }
548       }
549       else
550         strcpy(temp_char, confirmed_char);
551     }
552     new_string += temp_char;
553     new_string_lengths[i] = strlen(temp_char);
554     new_string_lengths[i + 1] = 0;
555 
556   }
557   if ((reject_count > 0) || (new_string != word_str)) {
558     if (debug_x_ht_level >= 2) {
559       tprintf ("Shape Verification: %s ", word_str);
560       word_res->reject_map.print (debug_fp);
561       tprintf (" -> %s ", new_string.string ());
562       new_map.print (debug_fp);
563       tprintf ("\n");
564     }
565     new_choice = new WERD_CHOICE(new_string.string(),
566                                  new_string_lengths.string(),
567                                  word_res->best_choice->rating(),
568                                  word_res->best_choice->certainty(),
569                                  word_res->best_choice->permuter(),
570                                  unicharset);
571     new_choice->populate_unichars(unicharset);
572     delete word_res->best_choice;
573     word_res->best_choice = new_choice;
574     word_res->reject_map = new_map;
575   }
576 }
577 }  // namespace tesseract
578 
579 /*************************************************************************
580  * check_blob_occ()
581  *
582  * Checks blob for position relative to position above baseline
583  * Return 0 for reject, or (possibly case shifted) confirmed char
584  *************************************************************************/
585 
check_blob_occ(char * proposed_char,inT16 blob_ht_above_baseline,float x_ht,float caps_ht,char * confirmed_char)586 void check_blob_occ(char* proposed_char,
587                     inT16 blob_ht_above_baseline,
588                     float x_ht,
589                     float caps_ht,
590                     char* confirmed_char) {
591   BOOL8 blob_definite_x_ht;
592   BOOL8 blob_definite_caps_ht;
593   float acceptable_variation;
594 
595   acceptable_variation = (caps_ht - x_ht) * x_ht_variation;
596   /* ??? REJECT if expected descender and nothing significantly below BL */
597 
598   /* ??? REJECT if expected ascender and nothing significantly above x-ht */
599 
600   /*
601     IF AMBIG_CAPS_X_CHS
602       IF blob is definitely an ascender ( > xht + xht err )AND
603         char is an x-ht char
604       THEN
605         flip case
606       IF blob is defintiely an x-ht ( <= xht + xht err ) AND
607         char is an ascender char
608       THEN
609         flip case
610   */
611   blob_definite_x_ht = blob_ht_above_baseline <= x_ht + acceptable_variation;
612   blob_definite_caps_ht = blob_ht_above_baseline >=
613     caps_ht - acceptable_variation;
614 
615   if (STRING (chs_ambig_caps_x).contains (*proposed_char)) {
616     if ((!blob_definite_x_ht && !blob_definite_caps_ht) ||
617         ((strcmp(proposed_char, "0") == 0) && !blob_definite_caps_ht) ||
618         ((strcmp(proposed_char, "o") == 0) && !blob_definite_x_ht)) {
619       strcpy(confirmed_char, "");
620       return;
621     }
622 
623     else if (blob_definite_caps_ht &&
624     STRING (chs_x_ht).contains (*proposed_char)) {
625       if (x_ht_case_flip) {
626                                  //flip to upper case
627         proposed_char[0] = (char) toupper (*proposed_char);
628         return;
629       } else {
630         strcpy(confirmed_char, "");
631         return;
632       }
633     }
634 
635     else if (blob_definite_x_ht &&
636     !STRING (chs_x_ht).contains (*proposed_char)) {
637       if (x_ht_case_flip) {
638                                  //flip to lower case
639         proposed_char[0] = (char) tolower (*proposed_char);
640       } else {
641         strcpy(confirmed_char, "");
642         return;
643       }
644     }
645   }
646   else
647   if ((STRING (chs_non_ambig_x_ht).contains (*proposed_char)
648     && !blob_definite_x_ht)
649     || (STRING (chs_non_ambig_caps_ht).contains (*proposed_char)
650         && !blob_definite_caps_ht)) {
651     strcpy(confirmed_char, "");
652     return;
653   }
654   strcpy(confirmed_char, proposed_char);
655   return;
656 }
657 
658 
estimate_from_stats(STATS & stats)659 float estimate_from_stats(STATS &stats) {
660   if (stats.get_total () <= 0)
661     return 0.0;
662   else if (stats.get_total () >= 3)
663     return stats.ile (0.5);      //median
664   else
665     return stats.mean ();
666 }
667 
668 
improve_estimate(WERD_RES * word_res,float & est_x_ht,float & est_caps_ht,STATS & x_ht,STATS & caps_ht)669 void improve_estimate(WERD_RES *word_res,
670                       float &est_x_ht,
671                       float &est_caps_ht,
672                       STATS &x_ht,
673                       STATS &caps_ht) {
674   PBLOB_IT blob_it;
675   inT16 blob_ht_above_baseline;
676 
677   const char *word_str;
678   inT16 i;
679   inT16 offset;
680   TBOX blob_box;                  //blob bounding box
681   char confirmed_char[UNICHAR_LEN + 1];
682   char temp_char[UNICHAR_LEN + 1];
683   float new_val;
684 
685   /* IMPROVE estimates here - if good estimates, and case ambig chars,
686     rescan blobs to fix case ambig blobs, re-estimate hts  ??? maybe always do
687     it after deciding x-height
688   */
689 
690   blob_it.set_to_list (word_res->outword->blob_list ());
691   word_str = word_res->best_choice->unichar_string().string();
692   for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
693        !blob_it.cycled_list (); blob_it.forward (),
694            offset += word_res->best_choice->unichar_lengths()[i++]) {
695     if ((STRING (chs_ambig_caps_x).contains (word_str[offset])) &&
696     (!dodgy_blob (blob_it.data ()))) {
697       blob_box = blob_it.data ()->bounding_box ();
698       blob_ht_above_baseline = blob_box.top () - bln_baseline_offset;
699       strncpy(temp_char, word_str + offset,
700               word_res->best_choice->unichar_lengths()[i]);
701       temp_char[word_res->best_choice->unichar_lengths()[i]] = '\0';
702       check_blob_occ (temp_char,
703                       blob_ht_above_baseline,
704                       est_x_ht, est_caps_ht, confirmed_char);
705       if (strcmp(confirmed_char, "") != 0) {
706         if (STRING (chs_x_ht).contains (*confirmed_char))
707           x_ht.add (blob_ht_above_baseline, 1);
708         else
709           caps_ht.add (blob_ht_above_baseline, 1);
710       }
711     }
712   }
713   new_val = estimate_from_stats (x_ht);
714   if (new_val > 0)
715     est_x_ht = new_val;
716   new_val = estimate_from_stats (caps_ht);
717   if (new_val > 0)
718     est_caps_ht = new_val;
719 }
720 
721 
reject_ambigs(WERD_RES * word)722 void reject_ambigs(  //rej any accepted xht ambig chars
723                    WERD_RES *word) {
724   const char *word_str;
725   int i = 0;
726 
727   word_str = word->best_choice->unichar_string().string();
728   while (*word_str != '\0') {
729     if (STRING (chs_ambig_caps_x).contains (*word_str))
730       word->reject_map[i].setrej_xht_fixup ();
731     word_str += word->best_choice->unichar_lengths()[i++];
732   }
733 }
734 
735 
est_ambigs(WERD_RES * word_res,STATS & stats,float * ambig_lc_x_est,float * ambig_uc_caps_est)736 void est_ambigs(                          //xht ambig ht stats
737                 WERD_RES *word_res,
738                 STATS &stats,
739                 float *ambig_lc_x_est,    //xht est
740                 float *ambig_uc_caps_est  //caps est
741                ) {
742   float x_ht_ok_variation;
743   STATS short_ambigs (0, 300);
744   STATS tall_ambigs (0, 300);
745   PBLOB_IT blob_it;
746   TBOX blob_box;                  //blob bounding box
747   inT16 blob_ht_above_baseline;
748 
749   const char *word_str;
750   inT16 i;
751   inT16 offset;
752   float min;                     //min ambig ch ht
753   float max;                     //max ambig ch ht
754   float short_limit;             // for lower case
755   float tall_limit;              // for upper case
756 
757   x_ht_ok_variation =
758     (bln_x_height / x_ht_fraction_of_caps_ht - bln_x_height) * x_ht_variation;
759 
760   if (stats.get_total () == 0) {
761     *ambig_lc_x_est = 0;
762     *ambig_uc_caps_est = 0;
763   }
764   else {
765     min = stats.ile (0.0);
766     max = stats.ile (0.99999);
767     if ((max - min) < x_ht_ok_variation) {
768       *ambig_lc_x_est = *ambig_uc_caps_est = stats.mean ();
769       //close enough
770     }
771     else {
772     /* Try reclustering into lower and upper case chars */
773       short_limit = min + (max - min) * x_ht_variation;
774       tall_limit = max - (max - min) * x_ht_variation;
775       word_str = word_res->best_choice->unichar_string().string();
776       blob_it.set_to_list (word_res->outword->blob_list ());
777       for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
778       !blob_it.cycled_list (); blob_it.forward (),
779                offset += word_res->best_choice->unichar_lengths()[i++]) {
780         if (word_res->reject_map[i].accepted () &&
781           STRING (chs_ambig_caps_x).contains (word_str[offset]) &&
782         (!dodgy_blob (blob_it.data ()))) {
783           blob_box = blob_it.data ()->bounding_box ();
784           blob_ht_above_baseline =
785             blob_box.top () - bln_baseline_offset;
786           if (blob_ht_above_baseline <= short_limit)
787             short_ambigs.add (blob_ht_above_baseline, 1);
788           else if (blob_ht_above_baseline >= tall_limit)
789             tall_ambigs.add (blob_ht_above_baseline, 1);
790         }
791       }
792       *ambig_lc_x_est = short_ambigs.mean ();
793       *ambig_uc_caps_est = tall_ambigs.mean ();
794       /* Cop out if we havent got sensible clusters. */
795       if (*ambig_uc_caps_est - *ambig_lc_x_est <= x_ht_ok_variation)
796         *ambig_lc_x_est = *ambig_uc_caps_est = stats.mean ();
797       //close enough
798     }
799   }
800 }
801 
802 
803 /*************************************************************************
804  * dodgy_blob()
805  * Returns true if the blob has more than one outline, one above the other.
806  * These are dodgy as the top blob could be noise, causing the bounding box xht
807  * to be misleading
808  *************************************************************************/
809 
dodgy_blob(PBLOB * blob)810 BOOL8 dodgy_blob(PBLOB *blob) {
811   OUTLINE_IT outline_it = blob->out_list ();
812   inT16 highest_bottom = -MAX_INT16;
813   inT16 lowest_top = MAX_INT16;
814   TBOX outline_box;
815 
816   if (x_ht_include_dodgy_blobs)
817     return FALSE;                //no blob is ever dodgy
818   for (outline_it.mark_cycle_pt ();
819   !outline_it.cycled_list (); outline_it.forward ()) {
820     outline_box = outline_it.data ()->bounding_box ();
821     if (lowest_top > outline_box.top ())
822       lowest_top = outline_box.top ();
823     if (highest_bottom < outline_box.bottom ())
824       highest_bottom = outline_box.bottom ();
825   }
826   return highest_bottom >= lowest_top;
827 }
828