1 /**********************************************************************
2 * File: fixxht.cpp (Formerly fixxht.c)
3 * Description: Improve x_ht and look out for case inconsistencies
4 * Author: Phil Cheatle
5 * Created: Thu Aug 5 14:11:08 BST 1993
6 *
7 * (C) Copyright 1992, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #include "mfcpch.h"
21 #include <string.h>
22 #include <ctype.h>
23 #include "varable.h"
24 #include "tessvars.h"
25 #include "control.h"
26 #include "reject.h"
27 #include "fixxht.h"
28 #include "secname.h"
29 #include "tesseractclass.h"
30
31 #define EXTERN
32
/* Numeric tolerances used when comparing blob heights against x-height and
   caps-height estimates. */
EXTERN double_VAR (x_ht_fraction_of_caps_ht, 0.7,
"Fract of cps ht est of xht");
EXTERN double_VAR (x_ht_variation, 0.35,
"Err band as fract of caps/xht dist");
/* Scales the error band when deciding whether a word top is only
   "marginally above" the x-height (see re_estimate_x_ht). */
EXTERN double_VAR (x_ht_sub_variation, 0.5,
"Err band as fract of caps/xht dist");

/* Behaviour switches for the x-height re-estimation and occupancy checks. */
EXTERN BOOL_VAR (rej_trial_ambigs, TRUE,
"reject x-ht ambigs when under trial");
EXTERN BOOL_VAR (x_ht_conservative_ambigs, FALSE,
"Dont rely on ambigs + maxht");
EXTERN BOOL_VAR (x_ht_check_est, TRUE, "Cross check estimates");
EXTERN BOOL_VAR (x_ht_case_flip, FALSE, "Flip or reject suspect case");
/* When TRUE, dodgy_blob() reports every blob as acceptable. */
EXTERN BOOL_VAR (x_ht_include_dodgy_blobs, TRUE,
"Include blobs with possible noise?");
EXTERN BOOL_VAR (x_ht_limit_flip_trials, TRUE,
"Dont do trial flips when ambigs are close to xht?");
EXTERN BOOL_VAR (rej_use_check_block_occ, TRUE,
"Analyse rejection behaviour");

/* Character classes used in this module to decide which height statistic
   (x-height, caps-height, or case-ambiguous) a recognised blob feeds. */
EXTERN STRING_VAR (chs_non_ambig_caps_ht,
"!#$%&()/12346789?ABDEFGHIKLNQRT[]\\bdfhkl",
"Reliable ascenders");
EXTERN STRING_VAR (chs_x_ht, "acegmnopqrsuvwxyz", "X height chars");
EXTERN STRING_VAR (chs_non_ambig_x_ht, "aenqr", "reliable X height chars");
/* Characters whose upper and lower case forms differ mainly in height. */
EXTERN STRING_VAR (chs_ambig_caps_x, "cCmMoO05sSuUvVwWxXzZ",
"X ht or caps ht chars");
/* Characters disambiguated by how far they extend below the baseline. */
EXTERN STRING_VAR (chs_bl_ambig_caps_x, "pPyY", " Caps or descender ambigs");

/* The following aren't used in this module but are used in applybox.c */
EXTERN STRING_VAR (chs_caps_ht,
"!#$%&()/0123456789?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]\\bdfhkl{|}",
"Ascender chars");
EXTERN STRING_VAR (chs_desc, "gjpqy", "Descender chars");
EXTERN STRING_VAR (chs_non_ambig_bl,
"!#$%&01246789?ABCDEFGHIKLMNORSTUVWXYZabcdehiklmnorstuvwxz",
"Reliable baseline chars");
EXTERN STRING_VAR (chs_odd_top, "ijt", "Chars with funny ascender region");
EXTERN STRING_VAR (chs_odd_bot, "()35JQ[]\\/{}|", "Chars with funny base");

/* The following aren't used but are defined for completeness */
EXTERN STRING_VAR (chs_bl,
"!#$%&()/01246789?ABCDEFGHIJKLMNOPRSTUVWXYZ[]\\abcdefhiklmnorstuvwxz{}",
"Baseline chars");
EXTERN STRING_VAR (chs_non_ambig_desc, "gq", "Reliable descender chars");
77
78 /*************************************************************************
79 * re_estimate_x_ht()
80 *
81 * Walk the blobs in the word together with the text string and reject map.
82 * NOTE: All evaluation is done on the baseline normalised word. This is so that
83 * the TBOX class can be used (integer). The reasons for this are:
84 * a) We must use the outword - ie the Tess result
85 * b) The outword is always converted to integer representation as that is how
86 * Tess works
87 * c) We would like to use the TBOX class, cos its there - this is integer
88 * precision.
 * d) If we de-normed the outword we would get rounding errors and would find
 * that integers are too imprecise (x-height around 15 pixels instead of a
 * scale of 128 in bln form).
92 * CONVINCED?
93 *
 * A) Try to re-estimate x-ht and caps ht from confirmed pts in word.
95 *
96 * FOR each non reject blob
97 * IF char is baseline posn ambiguous
98 * Remove ambiguity by comparing its posn with respect to baseline.
99 * IF char is a confirmed x-ht char
100 * Add x-ht posn to confirmed_x_ht pts for word
101 * IF char is a confirmed caps-ht char
102 * Add blob_ht to caps ht pts for word
103 *
104 * IF Std Dev of caps hts < 2 (AND # samples > 0)
105 * Use mean as caps ht estimate (Dont use median as we can expect a
106 * fair variation between the heights of the NON_AMBIG_CAPS_HT_CHS)
107 * IF Std Dev of caps hts >= 2 (AND # samples > 0)
108 * Suspect small caps font.
109 * Look for 2 clusters, each with Std Dev < 2.
110 * IF 2 clusters found
111 * Pick the smaller median as the caps ht estimate of the smallcaps.
112 *
113 * IF failed to estimate a caps ht
114 * Use the median caps ht if there is one,
115 * ELSE use the caps ht estimate of the previous word. NO!!!
116 *
117 *
118 * IF there are confirmed x-height chars
119 * Estimate confirmed x-height as the median value
120 * ELSE IF there is a confirmed caps ht
121 * Estimate confirmed x-height as a fraction of confirmed caps ht value
122 * ELSE
123 * Use the value for the previous word or the row value if this is the
124 * first word in the block. NO!!!
125 *
126 * B) Add in case ambiguous blobs based on confirmed x-ht/caps ht, changing case
127 * as necessary. Reestimate caps ht and x-ht as in A, using the extended
128 * clusters.
129 *
130 * C) If word contains rejects, and x-ht estimate significantly differs from
131 * original estimate, return TRUE so that the word can be rematched
132 *************************************************************************/
133
re_estimate_x_ht(WERD_RES * word_res,float * trial_x_ht)134 void re_estimate_x_ht( //improve for 1 word
135 WERD_RES *word_res, //word to do
136 float *trial_x_ht //new match value
137 ) {
138 PBLOB_IT blob_it;
139 inT16 blob_ht_above_baseline;
140
141 const char *word_str;
142 inT16 i;
143 inT16 offset;
144
145 STATS all_blobs_ht (0, 300); //every blob in word
146 STATS x_ht (0, 300); //confirmed pts in wd
147 STATS caps_ht (0, 300); //confirmed pts in wd
148 STATS case_ambig (0, 300); //lower case ambigs
149
150 inT16 rej_blobs_count = 0;
151 inT16 rej_blobs_max_height = 0;
152 inT32 rej_blobs_max_area = 0;
153 float x_ht_ok_variation;
154 float max_blob_ht;
155 float marginally_above_x_ht;
156
157 TBOX blob_box; //blob bounding box
158 float est_x_ht = 0.0; //word estimate
159 float est_caps_ht = 0.0; //word estimate
160 //based on hard data?
161 BOOL8 est_caps_ht_certain = FALSE;
162 BOOL8 est_x_ht_certain = FALSE;//based on hard data?
163 BOOL8 trial = FALSE; //Sepeculative values?
164 BOOL8 no_comment = FALSE; //No change in xht
165 float ambig_lc_x_est;
166 float ambig_uc_caps_est;
167 inT16 x_ht_ambigs = 0;
168 inT16 caps_ht_ambigs = 0;
169
170 /* Calculate default variation of blob x_ht from bln x_ht for bln word */
171 x_ht_ok_variation =
172 (bln_x_height / x_ht_fraction_of_caps_ht - bln_x_height) * x_ht_variation;
173
174 word_str = word_res->best_choice->unichar_string().string();
175 /*
176 Cycle blobs, allocating to one of the stats sets when possible.
177 */
178 blob_it.set_to_list (word_res->outword->blob_list ());
179 for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
180 !blob_it.cycled_list (); blob_it.forward (),
181 offset += word_res->best_choice->unichar_lengths()[i++]) {
182 if (!dodgy_blob (blob_it.data ())) {
183 blob_box = blob_it.data ()->bounding_box ();
184 blob_ht_above_baseline = blob_box.top () - bln_baseline_offset;
185 all_blobs_ht.add (blob_ht_above_baseline, 1);
186
187 if (word_res->reject_map[i].rejected ()) {
188 rej_blobs_count++;
189 if (blob_box.height () > rej_blobs_max_height)
190 rej_blobs_max_height = blob_box.height ();
191 if (blob_box.area () > rej_blobs_max_area)
192 rej_blobs_max_area = blob_box.area ();
193 }
194 else {
195 if (STRING (chs_non_ambig_x_ht).contains (word_str[offset]))
196 x_ht.add (blob_ht_above_baseline, 1);
197
198 if (STRING (chs_non_ambig_caps_ht).contains (word_str[offset]))
199 caps_ht.add (blob_ht_above_baseline, 1);
200
201 if (STRING (chs_ambig_caps_x).contains (word_str[offset])) {
202 case_ambig.add (blob_ht_above_baseline, 1);
203 if (STRING (chs_x_ht).contains (word_str[offset]))
204 x_ht_ambigs++;
205 else
206 caps_ht_ambigs++;
207 }
208
209 if (STRING (chs_bl_ambig_caps_x).contains (word_str[offset])) {
210 if (STRING (chs_x_ht).contains (word_str[offset])) {
211 /* confirm x_height provided > 15% total height below baseline */
212 if ((bln_baseline_offset - blob_box.bottom ()) /
213 (float) blob_box.height () > 0.15)
214 x_ht.add (blob_ht_above_baseline, 1);
215 }
216 else {
217 /* confirm caps_height provided < 5% total height below baseline */
218 if ((bln_baseline_offset - blob_box.bottom ()) /
219 (float) blob_box.height () < 0.05)
220 caps_ht.add (blob_ht_above_baseline, 1);
221 }
222 }
223 }
224 }
225 }
226 est_caps_ht = estimate_from_stats (caps_ht);
227 est_x_ht = estimate_from_stats (x_ht);
228 est_ambigs(word_res, case_ambig, &ambig_lc_x_est, &ambig_uc_caps_est);
229 max_blob_ht = all_blobs_ht.ile (0.9999);
230
231 #ifndef SECURE_NAMES
232 if (debug_x_ht_level >= 20) {
233 tprintf ("Mode20:A: %s ", word_str);
234 word_res->reject_map.print (debug_fp);
235 tprintf (" XHT:%f CAP:%f MAX:%f AMBIG X:%f CAP:%f\n",
236 est_x_ht, est_caps_ht, max_blob_ht,
237 ambig_lc_x_est, ambig_uc_caps_est);
238 }
239 #endif
240 if (!x_ht_conservative_ambigs &&
241 (ambig_lc_x_est > 0) &&
242 (ambig_lc_x_est == ambig_uc_caps_est) &&
243 (max_blob_ht > ambig_lc_x_est + x_ht_ok_variation)) {
244 //may be zero but believe xht
245 ambig_uc_caps_est = est_caps_ht;
246 #ifndef SECURE_NAMES
247 if (debug_x_ht_level >= 20)
248 tprintf ("Mode20:B: Fiddle ambig_uc_caps_est to %f\n",
249 ambig_lc_x_est);
250 #endif
251 }
252
253 /* Now make some estimates */
254
255 if ((est_x_ht > 0) ||
256 (est_caps_ht > 0) ||
257 ((ambig_lc_x_est > 0) && (ambig_lc_x_est != ambig_uc_caps_est))) {
258 /* There is some sensible data to go on so make the most of it. */
259 if (debug_x_ht_level >= 20)
260 tprintf ("Mode20:C: Sensible Data\n", ambig_lc_x_est);
261 if (est_x_ht > 0) {
262 est_x_ht_certain = TRUE;
263 if (est_caps_ht == 0) {
264 if ((ambig_uc_caps_est > ambig_lc_x_est) &&
265 (ambig_uc_caps_est > est_x_ht + x_ht_ok_variation))
266 est_caps_ht = ambig_uc_caps_est;
267 else
268 est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
269 }
270 if (case_ambig.get_total () > 0)
271 improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht);
272 est_caps_ht_certain = caps_ht.get_total () > 0;
273 #ifndef SECURE_NAMES
274 if (debug_x_ht_level >= 20)
275 tprintf ("Mode20:D: Est from xht XHT:%f CAP:%f\n",
276 est_x_ht, est_caps_ht);
277 #endif
278 }
279 else if (est_caps_ht > 0) {
280 est_caps_ht_certain = TRUE;
281 if ((ambig_lc_x_est > 0) &&
282 (ambig_lc_x_est < est_caps_ht - x_ht_ok_variation))
283 est_x_ht = ambig_lc_x_est;
284 else
285 est_x_ht = est_caps_ht * x_ht_fraction_of_caps_ht;
286 if (ambig_lc_x_est + ambig_uc_caps_est > 0)
287 improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht);
288 est_x_ht_certain = x_ht.get_total () > 0;
289 #ifndef SECURE_NAMES
290 if (debug_x_ht_level >= 20)
291 tprintf ("Mode20:E: Est from caps XHT:%f CAP:%f\n",
292 est_x_ht, est_caps_ht);
293 #endif
294 }
295 else {
296 /* Do something based on case ambig chars alone - we have guessed that the
297 ambigs are lower case. */
298 est_x_ht = ambig_lc_x_est;
299 est_x_ht_certain = TRUE;
300 if (ambig_uc_caps_est > ambig_lc_x_est) {
301 est_caps_ht = ambig_uc_caps_est;
302 est_caps_ht_certain = TRUE;
303 }
304 else
305 est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
306
307 #ifndef SECURE_NAMES
308 if (debug_x_ht_level >= 20)
309 tprintf ("Mode20:F: Est from ambigs XHT:%f CAP:%f\n",
310 est_x_ht, est_caps_ht);
311 #endif
312 }
313 /* Check for sane interpretation of evidence:
314 Try shifting caps ht if min certain caps ht is not significantly greater
315 than the estimated x ht or the max certain x ht is not significantly less
316 than the estimated caps ht. */
317 if (x_ht_check_est) {
318 if ((caps_ht.get_total () > 0) &&
319 (est_x_ht + x_ht_ok_variation >= caps_ht.ile (0.0001))) {
320 trial = TRUE;
321 est_caps_ht = est_x_ht;
322 est_x_ht = x_ht_fraction_of_caps_ht * est_caps_ht;
323
324 #ifndef SECURE_NAMES
325 if (debug_x_ht_level >= 20)
326 tprintf ("Mode20:G: Trial XHT:%f CAP:%f\n",
327 est_x_ht, est_caps_ht);
328 #endif
329 }
330 else if ((x_ht.get_total () > 0) &&
331 (est_caps_ht - x_ht_ok_variation <= x_ht.ile (0.9999))) {
332 trial = TRUE;
333 est_x_ht = est_caps_ht;
334 est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
335 #ifndef SECURE_NAMES
336 if (debug_x_ht_level >= 20)
337 tprintf ("Mode20:H: Trial XHT:%f CAP:%f\n",
338 est_x_ht, est_caps_ht);
339 #endif
340 }
341 }
342 }
343
344 else {
345 /* There is no sensible data so we're in the dark. */
346
347 marginally_above_x_ht = bln_x_height +
348 x_ht_ok_variation * x_ht_sub_variation;
349 /*
350 If there are no rejects, or the only rejects have a narrow height, or have
351 a small area compared to a normal char, then estimate the x-height as the
352 original one. (I.e dont fiddle about if the only rejects look like
353 punctuation) - we use max height as mean or median will be too low if
354 there are only two blobs - Eg "F."
355 */
356
357 if (debug_x_ht_level >= 20)
358 tprintf ("Mode20:I: In the dark\n");
359
360 if ((rej_blobs_count == 0) ||
361 (rej_blobs_max_height < 0.3 * max_blob_ht) ||
362 (rej_blobs_max_area < 0.3 * max_blob_ht * max_blob_ht)) {
363 no_comment = TRUE;
364 if (debug_x_ht_level >= 20)
365 tprintf ("Mode20:J: No comment due to no rejects\n");
366 }
367 else if (x_ht_limit_flip_trials &&
368 ((max_blob_ht < marginally_above_x_ht) ||
369 ((ambig_lc_x_est > 0) &&
370 (ambig_lc_x_est == ambig_uc_caps_est) &&
371 (ambig_lc_x_est < marginally_above_x_ht)))) {
372 no_comment = TRUE;
373 if (debug_x_ht_level >= 20)
374 tprintf ("Mode20:K: No comment as close to xht %f < %f\n",
375 ambig_lc_x_est, marginally_above_x_ht);
376 }
377 else if (x_ht_conservative_ambigs && (ambig_uc_caps_est > 0)) {
378 trial = TRUE;
379 est_caps_ht = ambig_lc_x_est;
380 est_x_ht = x_ht_fraction_of_caps_ht * est_caps_ht;
381
382 #ifndef SECURE_NAMES
383 if (debug_x_ht_level >= 20)
384 tprintf ("Mode20:L: Trial XHT:%f CAP:%f\n",
385 est_x_ht, est_caps_ht);
386 #endif
387 }
388 /*
389 If the top of the word is nowhere near where we expect ascenders to be
390 (less than half the x_ht -> caps_ht distance) - suspect an all caps word
391 at the x-ht. Estimate x-ht accordingly - but only as a TRIAL!
392 NOTE we do NOT check location of baseline. Commas can descend as much as
393 real descenders so we would need to do something to make sure that any
394 disqualifying descenders were not at the end.
395 */
396 else {
397 if (max_blob_ht <
398 (bln_x_height + bln_x_height / x_ht_fraction_of_caps_ht) / 2.0) {
399 trial = TRUE;
400 est_x_ht = x_ht_fraction_of_caps_ht * max_blob_ht;
401 est_caps_ht = max_blob_ht;
402
403 #ifndef SECURE_NAMES
404 if (debug_x_ht_level >= 20)
405 tprintf ("Mode20:M: Trial XHT:%f CAP:%f\n",
406 est_x_ht, est_caps_ht);
407 #endif
408 }
409 else {
410 no_comment = TRUE;
411 if (debug_x_ht_level >= 20)
412 tprintf ("Mode20:N: No comment as nothing else matched\n");
413 }
414 }
415 }
416
417 /* Sanity check - reject word if fails */
418
419 if (!no_comment &&
420 ((est_x_ht > 2 * bln_x_height) ||
421 (est_x_ht / word_res->denorm.scale () <= min_sane_x_ht_pixels) ||
422 (est_caps_ht <= est_x_ht) || (est_caps_ht >= 2.5 * est_x_ht))) {
423 no_comment = TRUE;
424 if (!trial && rej_use_xht) {
425 if (debug_x_ht_level >= 2) {
426 tprintf ("Sanity check rejecting %s ", word_str);
427 word_res->reject_map.print (debug_fp);
428 tprintf ("\n");
429 }
430 word_res->reject_map.rej_word_xht_fixup ();
431
432 }
433 if (debug_x_ht_level >= 20)
434 tprintf ("Mode20:O: No comment as nothing else matched\n");
435 }
436
437 if (no_comment || trial) {
438 word_res->x_height = bln_x_height / word_res->denorm.scale ();
439 word_res->guessed_x_ht = TRUE;
440 word_res->caps_height = (bln_x_height / x_ht_fraction_of_caps_ht) /
441 word_res->denorm.scale ();
442 word_res->guessed_caps_ht = TRUE;
443 /*
444 Reject ambigs in the current word if we are uncertain and:
445 there are rejects OR
446 there is only one char which is an ambig OR
447 there is conflict between the case of the ambigs even though there is
448 no height separation Eg "Ms" recognised from "MS"
449 */
450 if (rej_trial_ambigs &&
451 ((word_res->reject_map.reject_count () > 0) ||
452 (word_res->reject_map.length () == 1) ||
453 ((x_ht_ambigs > 0) && (caps_ht_ambigs > 0)))) {
454 #ifndef SECURE_NAMES
455 if (debug_x_ht_level >= 2) {
456 tprintf ("TRIAL Rej Ambigs %s ", word_str);
457 word_res->reject_map.print (debug_fp);
458 }
459 #endif
460 reject_ambigs(word_res);
461 if (debug_x_ht_level >= 2) {
462 tprintf (" ");
463 word_res->reject_map.print (debug_fp);
464 tprintf ("\n");
465 }
466 }
467 }
468 else {
469 word_res->x_height = est_x_ht / word_res->denorm.scale ();
470 word_res->guessed_x_ht = !est_x_ht_certain;
471 word_res->caps_height = est_caps_ht / word_res->denorm.scale ();
472 word_res->guessed_caps_ht = !est_caps_ht_certain;
473 }
474
475 if (!no_comment && (fabs (est_x_ht - bln_x_height) > x_ht_ok_variation))
476 *trial_x_ht = est_x_ht / word_res->denorm.scale ();
477 else
478 *trial_x_ht = 0.0;
479
480 #ifndef SECURE_NAMES
481 if (((*trial_x_ht > 0) && (debug_x_ht_level >= 3)) ||
482 (debug_x_ht_level >= 5)) {
483 tprintf ("%s ", word_str);
484 word_res->reject_map.print (debug_fp);
485 tprintf
486 (" X:%0.2f Cps:%0.2f Mxht:%0.2f RJ MxHt:%d MxAr:%d Rematch:%c\n",
487 est_x_ht, est_caps_ht, max_blob_ht, rej_blobs_max_height,
488 rej_blobs_max_area, *trial_x_ht > 0 ? '*' : ' ');
489 }
490 #endif
491
492 }
493
494
495 /*************************************************************************
496 * check_block_occ()
497 * Checks word for coarse block occupancy, rejecting more chars and flipping
498 * case of case ambiguous chars as required.
499 *************************************************************************/
500 namespace tesseract {
check_block_occ(WERD_RES * word_res)501 void Tesseract::check_block_occ(WERD_RES *word_res) {
502 PBLOB_IT blob_it;
503 STRING new_string;
504 STRING new_string_lengths(word_res->best_choice->unichar_lengths());
505 REJMAP new_map = word_res->reject_map;
506 WERD_CHOICE *new_choice;
507
508 const char *word_str = word_res->best_choice->unichar_string().string();
509 inT16 i;
510 inT16 offset;
511 inT16 reject_count = 0;
512 char confirmed_char[UNICHAR_LEN + 1];
513 char temp_char[UNICHAR_LEN + 1];
514 float x_ht;
515 float caps_ht;
516
517 new_string_lengths[0] = 0;
518
519 if (word_res->x_height > 0)
520 x_ht = word_res->x_height * word_res->denorm.scale ();
521 else
522 x_ht = bln_x_height;
523
524 if (word_res->caps_height > 0)
525 caps_ht = word_res->caps_height * word_res->denorm.scale ();
526 else
527 caps_ht = x_ht / x_ht_fraction_of_caps_ht;
528
529 blob_it.set_to_list (word_res->outword->blob_list ());
530
531 for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
532 !blob_it.cycled_list (); blob_it.forward (),
533 offset += word_res->best_choice->unichar_lengths()[i++]) {
534 strncpy(temp_char, word_str + offset,
535 word_res->best_choice->unichar_lengths()[i]); //default copy
536 temp_char[word_res->best_choice->unichar_lengths()[i]] = '\0';
537 if (word_res->reject_map[i].accepted ()) {
538 check_blob_occ (temp_char,
539 blob_it.data ()->bounding_box ().
540 top () - bln_baseline_offset, x_ht,
541 caps_ht, confirmed_char);
542
543 if (strcmp(confirmed_char, "") == 0) {
544 if (rej_use_check_block_occ) {
545 new_map[i].setrej_xht_fixup ();
546 reject_count++;
547 }
548 }
549 else
550 strcpy(temp_char, confirmed_char);
551 }
552 new_string += temp_char;
553 new_string_lengths[i] = strlen(temp_char);
554 new_string_lengths[i + 1] = 0;
555
556 }
557 if ((reject_count > 0) || (new_string != word_str)) {
558 if (debug_x_ht_level >= 2) {
559 tprintf ("Shape Verification: %s ", word_str);
560 word_res->reject_map.print (debug_fp);
561 tprintf (" -> %s ", new_string.string ());
562 new_map.print (debug_fp);
563 tprintf ("\n");
564 }
565 new_choice = new WERD_CHOICE(new_string.string(),
566 new_string_lengths.string(),
567 word_res->best_choice->rating(),
568 word_res->best_choice->certainty(),
569 word_res->best_choice->permuter(),
570 unicharset);
571 new_choice->populate_unichars(unicharset);
572 delete word_res->best_choice;
573 word_res->best_choice = new_choice;
574 word_res->reject_map = new_map;
575 }
576 }
577 } // namespace tesseract
578
579 /*************************************************************************
580 * check_blob_occ()
581 *
 * Checks that the height of a blob above the baseline is consistent with
 * the proposed character. Writes "" to confirmed_char for a reject, or the
 * (possibly case shifted) confirmed char otherwise.
584 *************************************************************************/
585
check_blob_occ(char * proposed_char,inT16 blob_ht_above_baseline,float x_ht,float caps_ht,char * confirmed_char)586 void check_blob_occ(char* proposed_char,
587 inT16 blob_ht_above_baseline,
588 float x_ht,
589 float caps_ht,
590 char* confirmed_char) {
591 BOOL8 blob_definite_x_ht;
592 BOOL8 blob_definite_caps_ht;
593 float acceptable_variation;
594
595 acceptable_variation = (caps_ht - x_ht) * x_ht_variation;
596 /* ??? REJECT if expected descender and nothing significantly below BL */
597
598 /* ??? REJECT if expected ascender and nothing significantly above x-ht */
599
600 /*
601 IF AMBIG_CAPS_X_CHS
602 IF blob is definitely an ascender ( > xht + xht err )AND
603 char is an x-ht char
604 THEN
605 flip case
606 IF blob is defintiely an x-ht ( <= xht + xht err ) AND
607 char is an ascender char
608 THEN
609 flip case
610 */
611 blob_definite_x_ht = blob_ht_above_baseline <= x_ht + acceptable_variation;
612 blob_definite_caps_ht = blob_ht_above_baseline >=
613 caps_ht - acceptable_variation;
614
615 if (STRING (chs_ambig_caps_x).contains (*proposed_char)) {
616 if ((!blob_definite_x_ht && !blob_definite_caps_ht) ||
617 ((strcmp(proposed_char, "0") == 0) && !blob_definite_caps_ht) ||
618 ((strcmp(proposed_char, "o") == 0) && !blob_definite_x_ht)) {
619 strcpy(confirmed_char, "");
620 return;
621 }
622
623 else if (blob_definite_caps_ht &&
624 STRING (chs_x_ht).contains (*proposed_char)) {
625 if (x_ht_case_flip) {
626 //flip to upper case
627 proposed_char[0] = (char) toupper (*proposed_char);
628 return;
629 } else {
630 strcpy(confirmed_char, "");
631 return;
632 }
633 }
634
635 else if (blob_definite_x_ht &&
636 !STRING (chs_x_ht).contains (*proposed_char)) {
637 if (x_ht_case_flip) {
638 //flip to lower case
639 proposed_char[0] = (char) tolower (*proposed_char);
640 } else {
641 strcpy(confirmed_char, "");
642 return;
643 }
644 }
645 }
646 else
647 if ((STRING (chs_non_ambig_x_ht).contains (*proposed_char)
648 && !blob_definite_x_ht)
649 || (STRING (chs_non_ambig_caps_ht).contains (*proposed_char)
650 && !blob_definite_caps_ht)) {
651 strcpy(confirmed_char, "");
652 return;
653 }
654 strcpy(confirmed_char, proposed_char);
655 return;
656 }
657
658
estimate_from_stats(STATS & stats)659 float estimate_from_stats(STATS &stats) {
660 if (stats.get_total () <= 0)
661 return 0.0;
662 else if (stats.get_total () >= 3)
663 return stats.ile (0.5); //median
664 else
665 return stats.mean ();
666 }
667
668
improve_estimate(WERD_RES * word_res,float & est_x_ht,float & est_caps_ht,STATS & x_ht,STATS & caps_ht)669 void improve_estimate(WERD_RES *word_res,
670 float &est_x_ht,
671 float &est_caps_ht,
672 STATS &x_ht,
673 STATS &caps_ht) {
674 PBLOB_IT blob_it;
675 inT16 blob_ht_above_baseline;
676
677 const char *word_str;
678 inT16 i;
679 inT16 offset;
680 TBOX blob_box; //blob bounding box
681 char confirmed_char[UNICHAR_LEN + 1];
682 char temp_char[UNICHAR_LEN + 1];
683 float new_val;
684
685 /* IMPROVE estimates here - if good estimates, and case ambig chars,
686 rescan blobs to fix case ambig blobs, re-estimate hts ??? maybe always do
687 it after deciding x-height
688 */
689
690 blob_it.set_to_list (word_res->outword->blob_list ());
691 word_str = word_res->best_choice->unichar_string().string();
692 for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
693 !blob_it.cycled_list (); blob_it.forward (),
694 offset += word_res->best_choice->unichar_lengths()[i++]) {
695 if ((STRING (chs_ambig_caps_x).contains (word_str[offset])) &&
696 (!dodgy_blob (blob_it.data ()))) {
697 blob_box = blob_it.data ()->bounding_box ();
698 blob_ht_above_baseline = blob_box.top () - bln_baseline_offset;
699 strncpy(temp_char, word_str + offset,
700 word_res->best_choice->unichar_lengths()[i]);
701 temp_char[word_res->best_choice->unichar_lengths()[i]] = '\0';
702 check_blob_occ (temp_char,
703 blob_ht_above_baseline,
704 est_x_ht, est_caps_ht, confirmed_char);
705 if (strcmp(confirmed_char, "") != 0) {
706 if (STRING (chs_x_ht).contains (*confirmed_char))
707 x_ht.add (blob_ht_above_baseline, 1);
708 else
709 caps_ht.add (blob_ht_above_baseline, 1);
710 }
711 }
712 }
713 new_val = estimate_from_stats (x_ht);
714 if (new_val > 0)
715 est_x_ht = new_val;
716 new_val = estimate_from_stats (caps_ht);
717 if (new_val > 0)
718 est_caps_ht = new_val;
719 }
720
721
reject_ambigs(WERD_RES * word)722 void reject_ambigs( //rej any accepted xht ambig chars
723 WERD_RES *word) {
724 const char *word_str;
725 int i = 0;
726
727 word_str = word->best_choice->unichar_string().string();
728 while (*word_str != '\0') {
729 if (STRING (chs_ambig_caps_x).contains (*word_str))
730 word->reject_map[i].setrej_xht_fixup ();
731 word_str += word->best_choice->unichar_lengths()[i++];
732 }
733 }
734
735
est_ambigs(WERD_RES * word_res,STATS & stats,float * ambig_lc_x_est,float * ambig_uc_caps_est)736 void est_ambigs( //xht ambig ht stats
737 WERD_RES *word_res,
738 STATS &stats,
739 float *ambig_lc_x_est, //xht est
740 float *ambig_uc_caps_est //caps est
741 ) {
742 float x_ht_ok_variation;
743 STATS short_ambigs (0, 300);
744 STATS tall_ambigs (0, 300);
745 PBLOB_IT blob_it;
746 TBOX blob_box; //blob bounding box
747 inT16 blob_ht_above_baseline;
748
749 const char *word_str;
750 inT16 i;
751 inT16 offset;
752 float min; //min ambig ch ht
753 float max; //max ambig ch ht
754 float short_limit; // for lower case
755 float tall_limit; // for upper case
756
757 x_ht_ok_variation =
758 (bln_x_height / x_ht_fraction_of_caps_ht - bln_x_height) * x_ht_variation;
759
760 if (stats.get_total () == 0) {
761 *ambig_lc_x_est = 0;
762 *ambig_uc_caps_est = 0;
763 }
764 else {
765 min = stats.ile (0.0);
766 max = stats.ile (0.99999);
767 if ((max - min) < x_ht_ok_variation) {
768 *ambig_lc_x_est = *ambig_uc_caps_est = stats.mean ();
769 //close enough
770 }
771 else {
772 /* Try reclustering into lower and upper case chars */
773 short_limit = min + (max - min) * x_ht_variation;
774 tall_limit = max - (max - min) * x_ht_variation;
775 word_str = word_res->best_choice->unichar_string().string();
776 blob_it.set_to_list (word_res->outword->blob_list ());
777 for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
778 !blob_it.cycled_list (); blob_it.forward (),
779 offset += word_res->best_choice->unichar_lengths()[i++]) {
780 if (word_res->reject_map[i].accepted () &&
781 STRING (chs_ambig_caps_x).contains (word_str[offset]) &&
782 (!dodgy_blob (blob_it.data ()))) {
783 blob_box = blob_it.data ()->bounding_box ();
784 blob_ht_above_baseline =
785 blob_box.top () - bln_baseline_offset;
786 if (blob_ht_above_baseline <= short_limit)
787 short_ambigs.add (blob_ht_above_baseline, 1);
788 else if (blob_ht_above_baseline >= tall_limit)
789 tall_ambigs.add (blob_ht_above_baseline, 1);
790 }
791 }
792 *ambig_lc_x_est = short_ambigs.mean ();
793 *ambig_uc_caps_est = tall_ambigs.mean ();
794 /* Cop out if we havent got sensible clusters. */
795 if (*ambig_uc_caps_est - *ambig_lc_x_est <= x_ht_ok_variation)
796 *ambig_lc_x_est = *ambig_uc_caps_est = stats.mean ();
797 //close enough
798 }
799 }
800 }
801
802
803 /*************************************************************************
804 * dodgy_blob()
805 * Returns true if the blob has more than one outline, one above the other.
806 * These are dodgy as the top blob could be noise, causing the bounding box xht
807 * to be misleading
808 *************************************************************************/
809
dodgy_blob(PBLOB * blob)810 BOOL8 dodgy_blob(PBLOB *blob) {
811 OUTLINE_IT outline_it = blob->out_list ();
812 inT16 highest_bottom = -MAX_INT16;
813 inT16 lowest_top = MAX_INT16;
814 TBOX outline_box;
815
816 if (x_ht_include_dodgy_blobs)
817 return FALSE; //no blob is ever dodgy
818 for (outline_it.mark_cycle_pt ();
819 !outline_it.cycled_list (); outline_it.forward ()) {
820 outline_box = outline_it.data ()->bounding_box ();
821 if (lowest_top > outline_box.top ())
822 lowest_top = outline_box.top ();
823 if (highest_bottom < outline_box.bottom ())
824 highest_bottom = outline_box.bottom ();
825 }
826 return highest_bottom >= lowest_top;
827 }
828