• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File:        heuristic.c  (Formerly heuristic.c)
5  * Description:
6  * Author:       Mark Seaman, OCR Technology
7  * Created:      Fri Oct 16 14:37:00 1987
8  * Modified:     Wed Jul 10 14:15:08 1991 (Mark Seaman) marks@hpgrlt
9  * Language:     C
10  * Package:      N/A
11  * Status:       Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26               I n c l u d e s
27 ----------------------------------------------------------------------*/
28 #include <math.h>
29 
30 #include "heuristic.h"
31 
32 #include "baseline.h"
33 #include "freelist.h"
34 #include "metrics.h"
35 #include "measure.h"
36 #include "ratngs.h"
37 #include "wordrec.h"
38 
39 /*----------------------------------------------------------------------
40               M a c r o s
41 ----------------------------------------------------------------------*/
42 #define MAX_SQUAT       2.0      /* Width ratio */
43 #define BAD_RATING   1000.0      /* No valid blob */
44 
45 INT_VAR(segment_adjust_debug, 0,
46         "Segmentation adjustment debug");
47 BOOL_VAR(assume_fixed_pitch_char_segment, 0,
48          "include fixed-pitch heuristics in char segmentation");
49 BOOL_VAR(use_new_state_cost, 0,
50          "use new state cost heuristics for segmentation state evaluation");
51 double_VAR(heuristic_segcost_rating_base, 1.25,
52            "base factor for adding segmentation cost into word rating."
53            "It's a multiplying factor, the larger the value above 1, "
54            "the bigger the effect of segmentation cost.");
55 double_VAR(heuristic_weight_rating, 1,
56            "weight associated with char rating in combined cost of state");
57 double_VAR(heuristic_weight_width, 0,
58            "weight associated with width evidence in combined cost of state");
59 double_VAR(heuristic_weight_seamcut, 0,
60            "weight associated with seam cut in combined cost of state");
61 double_VAR(heuristic_max_char_wh_ratio, MAX_SQUAT,
62            "max char width-to-height ratio allowed in segmentation");
63 
64 namespace tesseract {
65 
66 /*----------------------------------------------------------------------
67 // Some static helpers used only in this file
68 ----------------------------------------------------------------------*/
69 
// Return a character width record corresponding to the character
// width that will be generated in this segmentation state.
// The calling function needs to memfree the returned WIDTH_RECORD when
// finished.  This is the same as the original function, with only
// cosmetic changes, except that instead of passing chunks back to be
// freed, it deallocates them internally.
WIDTH_RECORD *state_char_widths(WIDTH_RECORD *chunk_widths,
                                STATE *state,
                                int num_joints) {
  // Expand the segmentation bit-vector: chunks[0] is the number of
  // character pieces minus one; chunks[1..] are chunk counts per piece.
  SEARCH_STATE chunks = bin_to_chunks(state, num_joints);
  int num_chars = chunks[0] + 1;

  // allocate and store (n+1,w0,g0,w1,g1...,wn) in int[2*(n+1)] as a
  // struct { num_chars, widths[2*n+1]; }
  WIDTH_RECORD *char_widths = (WIDTH_RECORD*) memalloc(sizeof(int)*num_chars*2);
  char_widths->num_chars = num_chars;

  int first_blob = 0;
  int last_blob;
  for (int i = 1; i <= num_chars; i++) {
    // The last character extends to the final joint; earlier ones span
    // the chunk run recorded for this character.
    last_blob = (i > chunks[0]) ? num_joints : first_blob + chunks[i];

    // Even slots hold character widths; odd slots hold the gaps between
    // consecutive characters (so there is no gap after the last char).
    char_widths->widths[2*i-2] = chunks_width(chunk_widths,
                                              first_blob, last_blob);
    if (i <= chunks[0])
      char_widths->widths[2*i-1] = chunks_gap(chunk_widths, last_blob);

    // NOTE(review): on the final iteration (i == num_chars) the gap slot
    // widths[2*i-1] was never written and lies one int past the allocated
    // span; this debug print reads it anyway — confirm before relying on
    // the printed gap value.
    if (segment_adjust_debug > 3)
      tprintf("width_record[%d]s%d--s%d(%d) %d %d:%d\n",
              i-1, first_blob, last_blob, chunks[i],
              char_widths->widths[2*i-2], char_widths->widths[2*i-1],
              chunk_widths->widths[2*last_blob+1]);
    first_blob = last_blob + 1;
  }

  // chunks was allocated by bin_to_chunks; ownership of char_widths
  // passes to the caller.
  memfree(chunks);
  return char_widths;
}
108 
109 // Computes the variance of the char widths normalized to the given height
110 // TODO(dsl): Do this in a later stage and use char choice info to skip
111 // punctuations.
get_width_variance(WIDTH_RECORD * wrec,float norm_height)112 FLOAT32 get_width_variance(WIDTH_RECORD *wrec, float norm_height) {
113   MEASUREMENT ws;
114   new_measurement(ws);
115   for (int x = 0; x < wrec->num_chars; x++) {
116     FLOAT32 wh_ratio = wrec->widths[2 * x] * 1.0f / norm_height;
117     if (x == wrec->num_chars - 1 && wh_ratio > 0.3)
118       continue;   // exclude trailing punctuation from stats
119     ADD_SAMPLE(ws, wh_ratio);
120   }
121   if (segment_adjust_debug > 2)
122     tprintf("Width Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws));
123   return VARIANCE(ws);
124 }
125 
126 // Computes the variance of char positioning (width + spacing) wrt norm_height
get_gap_variance(WIDTH_RECORD * wrec,float norm_height)127 FLOAT32 get_gap_variance(WIDTH_RECORD *wrec, float norm_height) {
128   MEASUREMENT ws;
129   new_measurement(ws);
130   for (int x = 0; x < wrec->num_chars - 1; x++) {
131     FLOAT32 gap_ratio = (wrec->widths[2 * x] + wrec->widths[ 2*x + 1])
132                         * 1.0 / norm_height;
133     ADD_SAMPLE(ws, gap_ratio);
134   }
135   if (segment_adjust_debug > 2)
136     tprintf("Gap Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws));
137   return VARIANCE(ws);
138 }
139 
140 
141 /*----------------------------------------------------------------------
142  Below are the new state prioritization functions, which incorporates
143  segmentation cost and width distribution for fixed pitch languages.
144  They are included as methods in class Wordrec to obtain more context.
145  ----------------------------------------------------------------------*/
146 
147 /**********************************************************************
148  * Returns the cost associated with the segmentation state.
149  *
150  * The number of states should be the same as the number of seams.
151  * If state[i] is 1, then seams[i] is present; otherwise, it is hidden.
152  * This function returns the sum of priority for active seams.
153  * TODO(dsl): To keep this clean, this function will just return the
154  * sum of raw "priority" as seam cost.  The normalization of this score
155  * relative to other costs will be done in bestfirst.cpp evaluate_seg().
156  **********************************************************************/
157 
FLOAT32 Wordrec::seamcut_priority(SEAMS seams,
                                  STATE *state,
                                  int num_joints) {
  int x;
  // The state is stored as two 32-bit halves (part1 holds the high bits).
  // Start the mask at the bit for the highest joint within its half.
  // NOTE(review): `1 << 31` left-shifts into the sign bit of an int
  // before the implicit conversion to unsigned — confirm the compilers
  // in use treat this as intended.
  unsigned int mask = (num_joints > 32) ? (1 << (num_joints - 1 - 32))
                                        : (1 << (num_joints - 1));
  float seam_cost = 0.0f;
  // Walk joints from highest to lowest; i indexes seams low-to-high.
  for (x = num_joints - 1; x >= 0; x--) {
    int i = num_joints - 1 - x;
    // Low 32 joints live in part2, the rest in part1.
    uinT32 value = (x < 32) ? state->part2 : state->part1;
    bool state_on = value & mask;
    if (state_on) {
      // Active seam: add its raw priority (normalization is done later
      // in evaluate_seg, per the header comment above).
      SEAM* seam = (SEAM *) array_value(seams, i);
      seam_cost += seam->priority;
    }
    // Rotate the mask down one bit, wrapping to the top when crossing
    // the 32-bit boundary between part1 and part2.
    if (mask == 1)
      mask = 1 << 31;
    else
      mask >>= 1;
  }
  if (segment_adjust_debug > 2)
    tprintf("seam_cost: %f\n", seam_cost);
  return seam_cost;
}
182 
183 /**********************************************************************
184  * rating_priority
185  *
186  * Assign a segmentation priority based on the ratings of the blobs
187  * (in that segmentation) that have been classified.  The average
188  * "goodness" (i.e. rating / weight) for each blob is used to indicate
189  * the segmentation priority.  This is the original.
190  **********************************************************************/
rating_priority(CHUNKS_RECORD * chunks_record,STATE * state,int num_joints)191 FLOAT32 Wordrec::rating_priority(CHUNKS_RECORD *chunks_record,
192                         STATE *state,
193                         int num_joints) {
194   BLOB_CHOICE_LIST *blob_choices;
195   BLOB_CHOICE_IT blob_choice_it;
196   inT16 first_chunk = 0;
197   inT16 last_chunk;
198   inT16 ratings = 0;
199   inT16 weights = 0;
200 
201   PIECES_STATE blob_chunks;
202   bin_to_pieces(state, num_joints, blob_chunks);
203 
204   for (int x = 0; blob_chunks[x]; x++) {
205     last_chunk = first_chunk + blob_chunks[x];
206 
207     blob_choices = chunks_record->ratings->get(first_chunk, last_chunk - 1);
208     if (blob_choices != NOT_CLASSIFIED && blob_choices->length() > 0) {
209       blob_choice_it.set_to_list(blob_choices);
210       ratings += (inT16) blob_choice_it.data()->rating();
211       for (int y = first_chunk; y < last_chunk; y++) {
212         weights += (inT16) (chunks_record->weights[y]);
213       }
214     }
215     first_chunk = last_chunk;
216   }
217   if (weights <= 0)
218     weights = 1;
219   FLOAT32 rating_cost = static_cast<FLOAT32>(ratings) /
220                         static_cast<FLOAT32>(weights);
221   if (segment_adjust_debug > 2)
222     tprintf("rating_cost: r%f / w%f = %f\n", ratings, weights, rating_cost);
223   return rating_cost;
224 }
225 
226 // Returns the cost, eg. -log(p), of a given value in char width distribution.
FLOAT32 fp_width_cost(float norm_width, bool end_pos) {
  // use_old_hack is hard-coded true, so the distribution-based code
  // after the first return is currently unreachable scaffolding.
  bool use_old_hack = true;
  if (use_old_hack) {
    float cost = 0;
    // Linear penalty once a blob is wider than the configured ratio.
    if (norm_width > heuristic_max_char_wh_ratio)
      cost += norm_width;
    if (norm_width > MAX_SQUAT)  // extra penalty for merging two CJK chars
      cost += norm_width * norm_width;
    // penalize skinny blobs, except for punctuation in the last position
    if (norm_width < 0.5 && !end_pos)
      cost += 1 - norm_width;
    return cost;
}

  // otherwise, approximate with our not-so-normal distribution
  // (mean 0.85, std 0.35, squared-deviation penalty outside 1 std).
  float s = fabs((norm_width - 0.85) / 0.35);
  if (s < 1)   // clip penalty to zero for anything within 1 std
    return 0.0f;
  // Allow smaller chars at begin or end position for punctuations
  if (end_pos && norm_width < 0.3)
    return 0.0f;
  if (segment_adjust_debug > 2)
    tprintf("fp_width_cost(%f) = %f**2 = %f\n", norm_width, s, s*s);
  return s*s;
}
252 
FLOAT32 fp_gap_cost(float norm_gap, bool end_pos) {
  // use_old_hack is hard-coded true, so only the overlap check below runs.
  bool use_old_hack = true;
  if (use_old_hack) {
    if (norm_gap < 0.05 && !end_pos)
      return 5;   // penalize vertically overlapping components
    else
      return 0;
  }
  float s = fabs((norm_gap - 0.1) / 0.02);
  // NOTE(review): s = fabs(...) is never negative, so `s > -1` is always
  // true and this (already-dead) path always returns 0 — the condition
  // was likely meant to be something like `s < 1`; confirm the intent
  // before ever enabling this branch.
  if (s > -1) return 0.0f;  // no penalty for wider gaps
  if (segment_adjust_debug > 2)
    tprintf("fp_gap_cost(%f) = %f**2 = %f\n", norm_gap, s, s*s);
  return s*s;
}
267 
268 /**********************************************************************
269  * width_priority
270  *
271  * Return a priority value for this word segmentation based on the
272  * character widths present in the new segmentation.
273  * For variable-pitch fonts, this should do the same thing as before:
274  * ie. penalize only on really wide squatting blobs.
275  * For fixed-pitch fonts, this will include a measure of char & gap
276  * width consistency.
277  * TODO(dsl): generalize this to use a PDF estimate for proportional and
278  * fixed pitch mode.
279  **********************************************************************/
FLOAT32 Wordrec::width_priority(CHUNKS_RECORD *chunks_record,
                                STATE *state,
                                int num_joints) {
  FLOAT32 penalty = 0.0;
  // Build the per-character width/gap record for this segmentation;
  // owned locally and released via free_widths below.
  WIDTH_RECORD *width_rec = state_char_widths(chunks_record->chunk_widths,
                                              state, num_joints);
  // When baseline_enable==True, which is the current default for Tesseract,
  // a fixed value of 128 (BASELINE_SCALE) is always used.
  FLOAT32 normalizing_height = BASELINE_SCALE;
  if (!classify_baseline_normalized)   // this doesn't work and is never invoked
    normalizing_height = chunks_record->row->lineheight;
  if (assume_fixed_pitch_char_segment) {
    // For fixed pitch language like CJK, we use the full text height as the
    // normalizing factor so we are not dependent on xheight calculation.
    // In the normalized coord. xheight * scale == BASELINE_SCALE(128),
    // so add proportionally scaled ascender zone to get full text height.
    normalizing_height = tess_denorm->scale() *
        (tess_denorm->row()->x_height() + tess_denorm->row()->ascenders());
    if (segment_adjust_debug > 1)
      tprintf("WidthPriority: %f %f normalizing height = %f\n",
              tess_denorm->row()->x_height(), tess_denorm->row()->ascenders(),
              normalizing_height);
    // Impose additional segmentation penalties if blob widths or gaps
    // distribution don't fit a fixed-pitch model.
    FLOAT32 width_var = get_width_variance(width_rec, normalizing_height);
    FLOAT32 gap_var = get_gap_variance(width_rec, normalizing_height);
    penalty += width_var;
    penalty += gap_var;
  }

  // Per-character penalties: even slots are widths, odd slots gaps;
  // the final character has no trailing gap.
  for (int x = 0; x < width_rec->num_chars; x++) {
    FLOAT32 squat = width_rec->widths[2*x];
    FLOAT32 gap = (x < width_rec->num_chars-1) ? width_rec->widths[2*x+1] : 0;
    squat /= normalizing_height;
    gap /= normalizing_height;
    if (assume_fixed_pitch_char_segment) {
      // end_pos is true for the first and last characters (punctuation
      // is tolerated there); gap end_pos only for the last character.
      penalty += fp_width_cost(squat, x == 0 || x == width_rec->num_chars -1);
      penalty += fp_gap_cost(gap, x == width_rec->num_chars - 1);
      // A single merged blob wider than MAX_SQUAT is almost certainly
      // an under-segmentation; penalize it heavily.
      if (width_rec->num_chars == 1 && squat > MAX_SQUAT)
        penalty += 10;
    } else {
      // original equation when heuristic_max_char_ratio == MAX_SQUAT
      if (squat > heuristic_max_char_wh_ratio)
        penalty += squat - heuristic_max_char_wh_ratio;
    }
  }

  free_widths(width_rec);
  return (penalty);
}
330 
331 
332 /**********************************************************************
333  * prioritize_state
334  *
335  * Create a priority for this state.  It represents the urgency of
336  * checking this state.  The larger the priority value, the worse the
337  * state is (lower priority).  The "value" of this priority should be
338  * somewhat consistent with the final word rating.
339  * The question is how to normalize the different scores, and adjust
340  * the relative importance among them.
341  **********************************************************************/
FLOAT32 Wordrec::prioritize_state(CHUNKS_RECORD *chunks_record,
                                  SEARCH_RECORD *the_search) {
  FLOAT32 shape_cost;   // average rating per unit weight (rating_priority)
  FLOAT32 width_cost;   // width/gap distribution penalty (width_priority)
  FLOAT32 seam_cost;    // summed priority of active seams (seamcut_priority)

  shape_cost = rating_priority(chunks_record,
                               the_search->this_state,
                               the_search->num_joints);

  width_cost = width_priority(chunks_record,
                              the_search->this_state,
                              the_search->num_joints);

  // The rating_priority is the same as the original, and the width_priority
  // is the same as before if assume_fixed_pitch_char_segment == FALSE.
  // So this would return the original state priority.
  if (!use_new_state_cost)
    return width_cost * 1000 + shape_cost;

  seam_cost = seamcut_priority(chunks_record->splits,
                               the_search->this_state,
                               the_search->num_joints);
  record_priorities(the_search, shape_cost, width_cost);

  // TODO(dsl): how do we normalize the scores for these separate evidence?
  // FLOAT32 total_cost = shape_cost + width_cost * 0.01 + seam_cost * 0.001;
  FLOAT32 total_cost = shape_cost * heuristic_weight_rating +
                       width_cost * heuristic_weight_width +
                       seam_cost * heuristic_weight_seamcut;

  // We don't have an adjustment model for variable pitch segmentation cost
  // into word rating
  if (assume_fixed_pitch_char_segment) {
    // Multiplicative bias folded into the word rating later: rewarded
    // slightly (<1) for clean widths, inflated geometrically by
    // heuristic_segcost_rating_base as each cost exceeds its threshold.
    float seg_bias = 1.0;
    if (width_cost < 1) seg_bias *= 0.85;
    if (width_cost > 3)
      seg_bias *= pow(heuristic_segcost_rating_base, width_cost/3.0);
    if (seam_cost > 10)
      seg_bias *= pow(heuristic_segcost_rating_base, log(seam_cost)/log(10.0));
    if (shape_cost > 5)
      seg_bias *= pow(heuristic_segcost_rating_base, shape_cost/5.0);
    if (segment_adjust_debug) {
      tprintf("SegCost: %g Weight: %g rating: %g  width: %g  seam: %g\n",
               total_cost, seg_bias, shape_cost, width_cost, seam_cost);
    }
    the_search->segcost_bias = seg_bias;
  } else {
    the_search->segcost_bias = 0;
  }

  return total_cost;
}
395 
396 
397 /*----------------------------------------------------------------------
398               F u n c t i o n s
399 
400  Below are the original state prioritization functions for reference.
401  Since they work well for Latin, we need to keep them around until the
402  new path is verified to do no worse than before.
403 
404 //  Assign a segmentation priority based on the ratings of the blobs
405 //  (in that segmentation) that have been classified.  The average
406 //  "goodness" (i.e. rating / weight) for each blob is used to indicate
407 //  the segmentation priority.
408 FLOAT32 rating_priority(CHUNKS_RECORD *chunks_record,
409                         STATE *state,
410                         STATE *old_state,
411                         int num_joints) {
412   PIECES_STATE blob_chunks;
413   inT16 x;
414   inT16 y;
415   BLOB_CHOICE_LIST *blob_choices;
416   BLOB_CHOICE_IT blob_choice_it;
417   inT16 first_chunk = 0;
418   inT16 last_chunk;
419   inT16 ratings = 0;
420   inT16 weights = 0;
421 
422   bin_to_pieces(state, num_joints, blob_chunks);
423 
424   for (x = 0; blob_chunks[x]; x++) {
425                                  // Iterate each blob
426     last_chunk = first_chunk + blob_chunks[x] - 1;
427 
428     blob_choices = chunks_record->ratings->get(first_chunk, last_chunk);
429 
430     if (blob_choices != NOT_CLASSIFIED) {
431       blob_choice_it.set_to_list(blob_choices);
432       ratings += (inT16) blob_choice_it.data()->rating();
433       for (y = first_chunk; y <= last_chunk; y++) {
434         weights += (inT16) (chunks_record->weights[y]);
435       }
436     }
437     first_chunk += blob_chunks[x];
438   }
439   if (weights <= 0)
440     weights = 1;
441   return ((FLOAT32) ratings / weights);
442 }
443 
444 // Return a priority value for this word segmentation based on the
445 // character widths present in the new segmentation.
446 FLOAT32 width_priority(CHUNKS_RECORD *chunks_record,
447                        STATE *state,
448                        int num_joints) {
449   FLOAT32 result = 0.0;
450   WIDTH_RECORD *width_record;
451   FLOAT32 squat;
452   int x;
453 
454   width_record = state_char_widths (chunks_record->chunk_widths,
455     state, num_joints);
456   for (x = 0; x < width_record->num_chars; x++) {
457 
458     squat = width_record->widths[2 * x];
459     if (!classify_baseline_normalized) {
460       squat /= chunks_record->row->lineheight;
461     }
462     else {
463       squat /= BASELINE_SCALE;
464     }
465 
466     if (squat > MAX_SQUAT)
467       result += squat - MAX_SQUAT;
468 
469   }
470 
471   free_widths(width_record);
472 
473   return (result);
474 }
475 
476 //  Create a priority for this state.  It represents the urgency of
477 //  checking this state.
478 FLOAT32 prioritize_state(CHUNKS_RECORD *chunks_record,
479                          SEARCH_RECORD *the_search,
480                          STATE *old_state) {
481   FLOAT32 width_pri;
482   FLOAT32 match_pri;
483 
484   match_pri = rating_priority (chunks_record, the_search->this_state,
485     old_state, the_search->num_joints);
486 
487   width_pri = width_priority (chunks_record, the_search->this_state,
488     the_search->num_joints) * 1000.0;
489 
490   record_priorities(the_search, old_state, match_pri, width_pri);
491 
492   return (width_pri + match_pri);
493 }
494 ------------------   Original Rating Functions   -----------------*/
495 
496 }  // namespace tesseract
497