1 /* -*-C-*-
2 ********************************************************************************
3 *
4 * File: heuristic.c (Formerly heuristic.c)
5 * Description:
6 * Author: Mark Seaman, OCR Technology
7 * Created: Fri Oct 16 14:37:00 1987
8 * Modified: Wed Jul 10 14:15:08 1991 (Mark Seaman) marks@hpgrlt
9 * Language: C
10 * Package: N/A
11 * Status: Reusable Software Component
12 *
13 * (c) Copyright 1987, Hewlett-Packard Company.
14 ** Licensed under the Apache License, Version 2.0 (the "License");
15 ** you may not use this file except in compliance with the License.
16 ** You may obtain a copy of the License at
17 ** http://www.apache.org/licenses/LICENSE-2.0
18 ** Unless required by applicable law or agreed to in writing, software
19 ** distributed under the License is distributed on an "AS IS" BASIS,
20 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 ** See the License for the specific language governing permissions and
22 ** limitations under the License.
23 *
24 *********************************************************************************/
25 /*----------------------------------------------------------------------
26 I n c l u d e s
27 ----------------------------------------------------------------------*/
28 #include <math.h>
29
30 #include "heuristic.h"
31
32 #include "baseline.h"
33 #include "freelist.h"
34 #include "metrics.h"
35 #include "measure.h"
36 #include "ratngs.h"
37 #include "wordrec.h"
38
39 /*----------------------------------------------------------------------
40 M a c r o s
41 ----------------------------------------------------------------------*/
#define MAX_SQUAT 2.0     /* Width ratio */
#define BAD_RATING 1000.0 /* No valid blob */

// Tunable configuration variables (defined via Tesseract's *_VAR macros:
// name, default value, description).  They control how segmentation
// states are scored by the priority functions below.
INT_VAR(segment_adjust_debug, 0,
        "Segmentation adjustment debug");
BOOL_VAR(assume_fixed_pitch_char_segment, 0,
         "include fixed-pitch heuristics in char segmentation");
BOOL_VAR(use_new_state_cost, 0,
         "use new state cost heuristics for segmentation state evaluation");
double_VAR(heuristic_segcost_rating_base, 1.25,
           "base factor for adding segmentation cost into word rating."
           "It's a multiplying factor, the larger the value above 1, "
           "the bigger the effect of segmentation cost.");
double_VAR(heuristic_weight_rating, 1,
           "weight associated with char rating in combined cost of state");
double_VAR(heuristic_weight_width, 0,
           "weight associated with width evidence in combined cost of state");
double_VAR(heuristic_weight_seamcut, 0,
           "weight associated with seam cut in combined cost of state");
double_VAR(heuristic_max_char_wh_ratio, MAX_SQUAT,
           "max char width-to-height ratio allowed in segmentation");
63
64 namespace tesseract {
65
66 /*----------------------------------------------------------------------
67 // Some static helpers used only in this file
68 ----------------------------------------------------------------------*/
69
70 // Return a character width record corresponding to the character
71 // width that will be generated in this segmentation state.
// The calling function needs to memfree the returned WIDTH_RECORD when finished.
73 // This is the same as the original function, only cosmetic changes,
74 // except instead of passing chunks back to be freed, it deallocates
75 // internally.
WIDTH_RECORD *state_char_widths(WIDTH_RECORD *chunk_widths,
                                STATE *state,
                                int num_joints) {
  // chunks[0] is the number of chunk counts that follow; the number of
  // characters in this segmentation is one more than the number of splits.
  SEARCH_STATE chunks = bin_to_chunks(state, num_joints);
  int num_chars = chunks[0] + 1;

  // allocate and store (n+1,w0,g0,w1,g1...,wn) in int[2*(n+1)] as a
  // struct { num_chars, widths[2*n+1]; }
  WIDTH_RECORD *char_widths = (WIDTH_RECORD*) memalloc(sizeof(int)*num_chars*2);
  char_widths->num_chars = num_chars;

  int first_blob = 0;
  int last_blob;
  for (int i = 1; i <= num_chars; i++) {
    // The final character takes every remaining chunk up to num_joints.
    last_blob = (i > chunks[0]) ? num_joints : first_blob + chunks[i];

    // widths[2*i-2]: total width of character i (chunks first..last).
    char_widths->widths[2*i-2] = chunks_width(chunk_widths,
                                              first_blob, last_blob);
    // widths[2*i-1]: gap following character i; no gap after the last char.
    if (i <= chunks[0])
      char_widths->widths[2*i-1] = chunks_gap(chunk_widths, last_blob);

    if (segment_adjust_debug > 3)
      // NOTE(review): for the last character, widths[2*i-1] was never set
      // above, and widths[2*last_blob+1] may index past chunk_widths'
      // last gap — debug-only read; confirm bounds against WIDTH_RECORD.
      tprintf("width_record[%d]s%d--s%d(%d) %d %d:%d\n",
              i-1, first_blob, last_blob, chunks[i],
              char_widths->widths[2*i-2], char_widths->widths[2*i-1],
              chunk_widths->widths[2*last_blob+1]);
    first_blob = last_blob + 1;
  }

  // bin_to_chunks allocates; this function owns and releases it.
  memfree(chunks);
  return char_widths;
}
108
109 // Computes the variance of the char widths normalized to the given height
110 // TODO(dsl): Do this in a later stage and use char choice info to skip
111 // punctuations.
get_width_variance(WIDTH_RECORD * wrec,float norm_height)112 FLOAT32 get_width_variance(WIDTH_RECORD *wrec, float norm_height) {
113 MEASUREMENT ws;
114 new_measurement(ws);
115 for (int x = 0; x < wrec->num_chars; x++) {
116 FLOAT32 wh_ratio = wrec->widths[2 * x] * 1.0f / norm_height;
117 if (x == wrec->num_chars - 1 && wh_ratio > 0.3)
118 continue; // exclude trailing punctuation from stats
119 ADD_SAMPLE(ws, wh_ratio);
120 }
121 if (segment_adjust_debug > 2)
122 tprintf("Width Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws));
123 return VARIANCE(ws);
124 }
125
126 // Computes the variance of char positioning (width + spacing) wrt norm_height
get_gap_variance(WIDTH_RECORD * wrec,float norm_height)127 FLOAT32 get_gap_variance(WIDTH_RECORD *wrec, float norm_height) {
128 MEASUREMENT ws;
129 new_measurement(ws);
130 for (int x = 0; x < wrec->num_chars - 1; x++) {
131 FLOAT32 gap_ratio = (wrec->widths[2 * x] + wrec->widths[ 2*x + 1])
132 * 1.0 / norm_height;
133 ADD_SAMPLE(ws, gap_ratio);
134 }
135 if (segment_adjust_debug > 2)
136 tprintf("Gap Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws));
137 return VARIANCE(ws);
138 }
139
140
141 /*----------------------------------------------------------------------
  Below are the new state prioritization functions, which incorporate
143 segmentation cost and width distribution for fixed pitch languages.
144 They are included as methods in class Wordrec to obtain more context.
145 ----------------------------------------------------------------------*/
146
147 /**********************************************************************
148 * Returns the cost associated with the segmentation state.
149 *
150 * The number of states should be the same as the number of seams.
151 * If state[i] is 1, then seams[i] is present; otherwise, it is hidden.
152 * This function returns the sum of priority for active seams.
153 * TODO(dsl): To keep this clean, this function will just return the
154 * sum of raw "priority" as seam cost. The normalization of this score
155 * relative to other costs will be done in bestfirst.cpp evaluate_seg().
156 **********************************************************************/
157
seamcut_priority(SEAMS seams,STATE * state,int num_joints)158 FLOAT32 Wordrec::seamcut_priority(SEAMS seams,
159 STATE *state,
160 int num_joints) {
161 int x;
162 unsigned int mask = (num_joints > 32) ? (1 << (num_joints - 1 - 32))
163 : (1 << (num_joints - 1));
164 float seam_cost = 0.0f;
165 for (x = num_joints - 1; x >= 0; x--) {
166 int i = num_joints - 1 - x;
167 uinT32 value = (x < 32) ? state->part2 : state->part1;
168 bool state_on = value & mask;
169 if (state_on) {
170 SEAM* seam = (SEAM *) array_value(seams, i);
171 seam_cost += seam->priority;
172 }
173 if (mask == 1)
174 mask = 1 << 31;
175 else
176 mask >>= 1;
177 }
178 if (segment_adjust_debug > 2)
179 tprintf("seam_cost: %f\n", seam_cost);
180 return seam_cost;
181 }
182
183 /**********************************************************************
184 * rating_priority
185 *
186 * Assign a segmentation priority based on the ratings of the blobs
187 * (in that segmentation) that have been classified. The average
188 * "goodness" (i.e. rating / weight) for each blob is used to indicate
189 * the segmentation priority. This is the original.
190 **********************************************************************/
rating_priority(CHUNKS_RECORD * chunks_record,STATE * state,int num_joints)191 FLOAT32 Wordrec::rating_priority(CHUNKS_RECORD *chunks_record,
192 STATE *state,
193 int num_joints) {
194 BLOB_CHOICE_LIST *blob_choices;
195 BLOB_CHOICE_IT blob_choice_it;
196 inT16 first_chunk = 0;
197 inT16 last_chunk;
198 inT16 ratings = 0;
199 inT16 weights = 0;
200
201 PIECES_STATE blob_chunks;
202 bin_to_pieces(state, num_joints, blob_chunks);
203
204 for (int x = 0; blob_chunks[x]; x++) {
205 last_chunk = first_chunk + blob_chunks[x];
206
207 blob_choices = chunks_record->ratings->get(first_chunk, last_chunk - 1);
208 if (blob_choices != NOT_CLASSIFIED && blob_choices->length() > 0) {
209 blob_choice_it.set_to_list(blob_choices);
210 ratings += (inT16) blob_choice_it.data()->rating();
211 for (int y = first_chunk; y < last_chunk; y++) {
212 weights += (inT16) (chunks_record->weights[y]);
213 }
214 }
215 first_chunk = last_chunk;
216 }
217 if (weights <= 0)
218 weights = 1;
219 FLOAT32 rating_cost = static_cast<FLOAT32>(ratings) /
220 static_cast<FLOAT32>(weights);
221 if (segment_adjust_debug > 2)
222 tprintf("rating_cost: r%f / w%f = %f\n", ratings, weights, rating_cost);
223 return rating_cost;
224 }
225
226 // Returns the cost, eg. -log(p), of a given value in char width distribution.
fp_width_cost(float norm_width,bool end_pos)227 FLOAT32 fp_width_cost(float norm_width, bool end_pos) {
228 bool use_old_hack = true;
229 if (use_old_hack) {
230 float cost = 0;
231 if (norm_width > heuristic_max_char_wh_ratio)
232 cost += norm_width;
233 if (norm_width > MAX_SQUAT) // extra penalty for merging two CJK chars
234 cost += norm_width * norm_width;
235 // penalize skinny blobs, except for punctuation in the last position
236 if (norm_width < 0.5 && !end_pos)
237 cost += 1 - norm_width;
238 return cost;
239 }
240
241 // otherwise, approximate with our not-so-normal distribution
242 float s = fabs((norm_width - 0.85) / 0.35);
243 if (s < 1) // clip penalty to zero for anything within 1 std
244 return 0.0f;
245 // Allow smaller chars at begin or end position for punctuations
246 if (end_pos && norm_width < 0.3)
247 return 0.0f;
248 if (segment_adjust_debug > 2)
249 tprintf("fp_width_cost(%f) = %f**2 = %f\n", norm_width, s, s*s);
250 return s*s;
251 }
252
fp_gap_cost(float norm_gap,bool end_pos)253 FLOAT32 fp_gap_cost(float norm_gap, bool end_pos) {
254 bool use_old_hack = true;
255 if (use_old_hack) {
256 if (norm_gap < 0.05 && !end_pos)
257 return 5; // penalize vertically overlapping components
258 else
259 return 0;
260 }
261 float s = fabs((norm_gap - 0.1) / 0.02);
262 if (s > -1) return 0.0f; // no penalty for wider gaps
263 if (segment_adjust_debug > 2)
264 tprintf("fp_gap_cost(%f) = %f**2 = %f\n", norm_gap, s, s*s);
265 return s*s;
266 }
267
268 /**********************************************************************
269 * width_priority
270 *
271 * Return a priority value for this word segmentation based on the
272 * character widths present in the new segmentation.
273 * For variable-pitch fonts, this should do the same thing as before:
274 * ie. penalize only on really wide squatting blobs.
275 * For fixed-pitch fonts, this will include a measure of char & gap
276 * width consistency.
277 * TODO(dsl): generalize this to use a PDF estimate for proportional and
278 * fixed pitch mode.
279 **********************************************************************/
FLOAT32 Wordrec::width_priority(CHUNKS_RECORD *chunks_record,
                                STATE *state,
                                int num_joints) {
  FLOAT32 penalty = 0.0;
  // Owned by this function; released via free_widths() below.
  WIDTH_RECORD *width_rec = state_char_widths(chunks_record->chunk_widths,
                                              state, num_joints);
  // When baseline_enable==True, which is the current default for Tesseract,
  // a fixed value of 128 (BASELINE_SCALE) is always used.
  FLOAT32 normalizing_height = BASELINE_SCALE;
  if (!classify_baseline_normalized)   // this doesn't work and is never invoked
    normalizing_height = chunks_record->row->lineheight;
  if (assume_fixed_pitch_char_segment) {
    // For fixed pitch language like CJK, we use the full text height as the
    // normalizing factor so we are not dependent on xheight calculation.
    // In the normalized coord. xheight * scale == BASELINE_SCALE(128),
    // so add proportionally scaled ascender zone to get full text height.
    normalizing_height = tess_denorm->scale() *
        (tess_denorm->row()->x_height() + tess_denorm->row()->ascenders());
    if (segment_adjust_debug > 1)
      tprintf("WidthPriority: %f %f normalizing height = %f\n",
              tess_denorm->row()->x_height(), tess_denorm->row()->ascenders(),
              normalizing_height);
    // Impose additional segmentation penalties if blob widths or gaps
    // distribution don't fit a fixed-pitch model.
    FLOAT32 width_var = get_width_variance(width_rec, normalizing_height);
    FLOAT32 gap_var = get_gap_variance(width_rec, normalizing_height);
    penalty += width_var;
    penalty += gap_var;
  }

  // Per-character penalties from normalized width (squat) and the gap
  // following each character (zero after the last one).
  for (int x = 0; x < width_rec->num_chars; x++) {
    FLOAT32 squat = width_rec->widths[2*x];
    FLOAT32 gap = (x < width_rec->num_chars-1) ? width_rec->widths[2*x+1] : 0;
    squat /= normalizing_height;
    gap /= normalizing_height;
    if (assume_fixed_pitch_char_segment) {
      // First/last positions are treated as possible punctuation.
      penalty += fp_width_cost(squat, x == 0 || x == width_rec->num_chars -1);
      penalty += fp_gap_cost(gap, x == width_rec->num_chars - 1);
      // A single extremely-wide blob is almost certainly an under-split word.
      if (width_rec->num_chars == 1 && squat > MAX_SQUAT)
        penalty += 10;
    } else {
      // original equation when heuristic_max_char_ratio == MAX_SQUAT
      if (squat > heuristic_max_char_wh_ratio)
        penalty += squat - heuristic_max_char_wh_ratio;
    }
  }

  free_widths(width_rec);
  return (penalty);
}
330
331
332 /**********************************************************************
333 * prioritize_state
334 *
335 * Create a priority for this state. It represents the urgency of
336 * checking this state. The larger the priority value, the worse the
337 * state is (lower priority). The "value" of this priority should be
338 * somewhat consistent with the final word rating.
339 * The question is how to normalize the different scores, and adjust
340 * the relative importance among them.
341 **********************************************************************/
FLOAT32 Wordrec::prioritize_state(CHUNKS_RECORD *chunks_record,
                                  SEARCH_RECORD *the_search) {
  FLOAT32 shape_cost;   // avg rating per unit weight of classified blobs
  FLOAT32 width_cost;   // penalty from char width/gap distribution
  FLOAT32 seam_cost;    // summed priority of the active seams

  shape_cost = rating_priority(chunks_record,
                               the_search->this_state,
                               the_search->num_joints);

  width_cost = width_priority(chunks_record,
                              the_search->this_state,
                              the_search->num_joints);

  // The rating_priority is the same as the original, and the width_priority
  // is the same as before if assume_fixed_pitch_char_segment == FALSE.
  // So this would return the original state priority.
  if (!use_new_state_cost)
    return width_cost * 1000 + shape_cost;

  seam_cost = seamcut_priority(chunks_record->splits,
                               the_search->this_state,
                               the_search->num_joints);
  // Side effect: logs the component priorities for this search state.
  record_priorities(the_search, shape_cost, width_cost);

  // TODO(dsl): how do we normalize the scores for these separate evidence?
  // FLOAT32 total_cost = shape_cost + width_cost * 0.01 + seam_cost * 0.001;
  FLOAT32 total_cost = shape_cost * heuristic_weight_rating +
      width_cost * heuristic_weight_width +
      seam_cost * heuristic_weight_seamcut;

  // We don't have an adjustment model for variable pitch segmentation cost
  // into word rating
  if (assume_fixed_pitch_char_segment) {
    // Multiplicative bias: grows with each kind of evidence that the
    // segmentation looks bad; slight reward (0.85) for consistent widths.
    float seg_bias = 1.0;
    if (width_cost < 1) seg_bias *= 0.85;
    if (width_cost > 3)
      seg_bias *= pow(heuristic_segcost_rating_base, width_cost/3.0);
    if (seam_cost > 10)
      seg_bias *= pow(heuristic_segcost_rating_base, log(seam_cost)/log(10.0));
    if (shape_cost > 5)
      seg_bias *= pow(heuristic_segcost_rating_base, shape_cost/5.0);
    if (segment_adjust_debug) {
      tprintf("SegCost: %g Weight: %g rating: %g width: %g seam: %g\n",
              total_cost, seg_bias, shape_cost, width_cost, seam_cost);
    }
    // Side effect: the bias is consumed later when the segmentation cost
    // is folded into the word rating (0 disables the adjustment).
    the_search->segcost_bias = seg_bias;
  } else {
    the_search->segcost_bias = 0;
  }

  return total_cost;
}
395
396
397 /*----------------------------------------------------------------------
398 F u n c t i o n s
399
400 Below are the original state prioritization functions for reference.
401 Since they work well for Latin, we need to keep them around until the
402 new path is verified to do no worse than before.
403
404 // Assign a segmentation priority based on the ratings of the blobs
405 // (in that segmentation) that have been classified. The average
406 // "goodness" (i.e. rating / weight) for each blob is used to indicate
407 // the segmentation priority.
408 FLOAT32 rating_priority(CHUNKS_RECORD *chunks_record,
409 STATE *state,
410 STATE *old_state,
411 int num_joints) {
412 PIECES_STATE blob_chunks;
413 inT16 x;
414 inT16 y;
415 BLOB_CHOICE_LIST *blob_choices;
416 BLOB_CHOICE_IT blob_choice_it;
417 inT16 first_chunk = 0;
418 inT16 last_chunk;
419 inT16 ratings = 0;
420 inT16 weights = 0;
421
422 bin_to_pieces(state, num_joints, blob_chunks);
423
424 for (x = 0; blob_chunks[x]; x++) {
425 // Iterate each blob
426 last_chunk = first_chunk + blob_chunks[x] - 1;
427
428 blob_choices = chunks_record->ratings->get(first_chunk, last_chunk);
429
430 if (blob_choices != NOT_CLASSIFIED) {
431 blob_choice_it.set_to_list(blob_choices);
432 ratings += (inT16) blob_choice_it.data()->rating();
433 for (y = first_chunk; y <= last_chunk; y++) {
434 weights += (inT16) (chunks_record->weights[y]);
435 }
436 }
437 first_chunk += blob_chunks[x];
438 }
439 if (weights <= 0)
440 weights = 1;
441 return ((FLOAT32) ratings / weights);
442 }
443
444 // Return a priority value for this word segmentation based on the
445 // character widths present in the new segmentation.
446 FLOAT32 width_priority(CHUNKS_RECORD *chunks_record,
447 STATE *state,
448 int num_joints) {
449 FLOAT32 result = 0.0;
450 WIDTH_RECORD *width_record;
451 FLOAT32 squat;
452 int x;
453
454 width_record = state_char_widths (chunks_record->chunk_widths,
455 state, num_joints);
456 for (x = 0; x < width_record->num_chars; x++) {
457
458 squat = width_record->widths[2 * x];
459 if (!classify_baseline_normalized) {
460 squat /= chunks_record->row->lineheight;
461 }
462 else {
463 squat /= BASELINE_SCALE;
464 }
465
466 if (squat > MAX_SQUAT)
467 result += squat - MAX_SQUAT;
468
469 }
470
471 free_widths(width_record);
472
473 return (result);
474 }
475
476 // Create a priority for this state. It represents the urgency of
477 // checking this state.
478 FLOAT32 prioritize_state(CHUNKS_RECORD *chunks_record,
479 SEARCH_RECORD *the_search,
480 STATE *old_state) {
481 FLOAT32 width_pri;
482 FLOAT32 match_pri;
483
484 match_pri = rating_priority (chunks_record, the_search->this_state,
485 old_state, the_search->num_joints);
486
487 width_pri = width_priority (chunks_record, the_search->this_state,
488 the_search->num_joints) * 1000.0;
489
490 record_priorities(the_search, old_state, match_pri, width_pri);
491
492 return (width_pri + match_pri);
493 }
494 ------------------ Original Rating Functions -----------------*/
495
496 } // namespace tesseract
497