1 /* -*-C-*-
2 ********************************************************************************
3 *
4 * File: chopper.c (Formerly chopper.c)
5 * Description:
6 * Author: Mark Seaman, OCR Technology
7 * Created: Fri Oct 16 14:37:00 1987
8 * Modified: Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
9 * Language: C
10 * Package: N/A
11 * Status: Reusable Software Component
12 *
13 * (c) Copyright 1987, Hewlett-Packard Company.
14 ** Licensed under the Apache License, Version 2.0 (the "License");
15 ** you may not use this file except in compliance with the License.
16 ** You may obtain a copy of the License at
17 ** http://www.apache.org/licenses/LICENSE-2.0
18 ** Unless required by applicable law or agreed to in writing, software
19 ** distributed under the License is distributed on an "AS IS" BASIS,
20 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 ** See the License for the specific language governing permissions and
22 ** limitations under the License.
23 *
24 **************************************************************************/
25
26 /*----------------------------------------------------------------------
27 I n c l u d e s
28 ----------------------------------------------------------------------*/
29 #include <math.h>
30
31 #include "chopper.h"
32
33 #include "assert.h"
34 #include "associate.h"
35 #include "callcpp.h"
36 #include "choices.h"
37 #include "const.h"
38 #include "findseam.h"
39 #include "freelist.h"
40 #include "globals.h"
41 #include "makechop.h"
42 #include "metrics.h"
43 #include "render.h"
44 #include "permute.h"
45 #include "pieces.h"
46 #include "seam.h"
47 #include "stopper.h"
48 #include "structures.h"
49 #include "tordvars.h"
50 #include "unicharset.h"
51 #include "wordclass.h"
52 #include "wordrec.h"
53
// Tunable parameters for the chopper. Each is declared through one of the
// project's *_VAR macros, which take a name, a default value and a
// description string.

// When non-zero, attempt_blob_chop() tags the outline points of a blob before
// trying a seam and strips the untagged points again if the chop is rejected.
INT_VAR (repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");

//?extern int tessedit_dangambigs_chop;
// select_blob_to_split() only considers a blob whose top choice has a
// certainty below this value.
double_VAR(tessedit_certainty_threshold, -2.25, "Good blob limit");

// When true (and the current best choice reports a fragment mark),
// improve_by_chopping() asks select_blob_to_split() to prefer blobs that sit
// next to character fragments.
BOOL_VAR(fragments_guide_chopper, FALSE,
         "Use information from fragments to guide chopping process");
61
62 /*----------------------------------------------------------------------
63 M a c r o s
64 ----------------------------------------------------------------------*/
/**********************************************************************
 * bounds_inside
 *
 * Evaluates to true when the box described by (inner_tl, inner_br) lies
 * inside, or exactly on, the box described by (outer_tl, outer_br).
 * Every argument must expose .x and .y members. The comparisons treat
 * y as growing upwards: a top-left y is >= the matching bottom-right y.
 *
 * Each argument is parenthesised so that expressions such as *ptr can
 * be passed without changing the meaning of the member access.
 **********************************************************************/
#define bounds_inside(inner_tl,inner_br,outer_tl,outer_br)  \
  (((inner_tl).x >= (outer_tl).x) &&                        \
   ((inner_tl).y <= (outer_tl).y) &&                        \
   ((inner_br).x <= (outer_br).x) &&                        \
   ((inner_br).y >= (outer_br).y))
76
77 /*----------------------------------------------------------------------
78 F u n c t i o n s
79 ----------------------------------------------------------------------*/
80 /**********************************************************************
81 * preserve_outline_tree
82 *
83 * Copy the list of outlines.
84 **********************************************************************/
preserve_outline(EDGEPT * start)85 void preserve_outline(EDGEPT *start) {
86 EDGEPT *srcpt;
87
88 if (start == NULL)
89 return;
90 srcpt = start;
91 do {
92 srcpt->flags[1] = 1;
93 srcpt = srcpt->next;
94 }
95 while (srcpt != start);
96 srcpt->flags[1] = 2;
97 }
98
99
100 /**************************************************************************/
preserve_outline_tree(TESSLINE * srcline)101 void preserve_outline_tree(TESSLINE *srcline) {
102 TESSLINE *outline;
103
104 for (outline = srcline; outline != NULL; outline = outline->next) {
105 preserve_outline (outline->loop);
106 }
107 if (srcline != NULL && srcline->child != NULL)
108 preserve_outline_tree (srcline->child);
109 }
110
111
112 /**********************************************************************
113 * restore_outline_tree
114 *
115 * Copy the list of outlines.
116 **********************************************************************/
restore_outline(EDGEPT * start)117 EDGEPT *restore_outline(EDGEPT *start) {
118 EDGEPT *srcpt;
119 EDGEPT *real_start;
120 EDGEPT *deadpt;
121
122 if (start == NULL)
123 return NULL;
124 srcpt = start;
125 do {
126 if (srcpt->flags[1] == 2)
127 break;
128 srcpt = srcpt->next;
129 }
130 while (srcpt != start);
131 real_start = srcpt;
132 do {
133 if (srcpt->flags[1] == 0) {
134 deadpt = srcpt;
135 srcpt = srcpt->next;
136 srcpt->prev = deadpt->prev;
137 deadpt->prev->next = srcpt;
138 deadpt->prev->vec.x = srcpt->pos.x - deadpt->prev->pos.x;
139 deadpt->prev->vec.y = srcpt->pos.y - deadpt->prev->pos.y;
140 oldedgept(deadpt);
141 }
142 else
143 srcpt = srcpt->next;
144 }
145 while (srcpt != real_start);
146 return real_start;
147 }
148
149
150 /******************************************************************************/
restore_outline_tree(TESSLINE * srcline)151 void restore_outline_tree(TESSLINE *srcline) {
152 TESSLINE *outline;
153
154 for (outline = srcline; outline != NULL; outline = outline->next) {
155 outline->loop = restore_outline (outline->loop);
156 outline->start = outline->loop->pos;
157 }
158 if (srcline != NULL && srcline->child != NULL)
159 restore_outline_tree (srcline->child);
160 }
161
162
163 /**********************************************************************
164 * attempt_blob_chop
165 *
166 * Try to split the this blob after this one. Check to make sure that
167 * it was successful.
168 **********************************************************************/
attempt_blob_chop(TWERD * word,inT32 blob_number,SEAMS seam_list)169 SEAM *attempt_blob_chop(TWERD *word, inT32 blob_number, SEAMS seam_list) {
170 TBLOB *blob;
171 TBLOB *other_blob;
172 SEAM *seam;
173 TBLOB *last_blob;
174 TBLOB *next_blob;
175 inT16 x;
176
177 if (first_pass)
178 chops_attempted1++;
179 else
180 chops_attempted2++;
181
182 last_blob = NULL;
183 blob = word->blobs;
184 for (x = 0; x < blob_number; x++) {
185 last_blob = blob;
186 blob = blob->next;
187 }
188 next_blob = blob->next;
189
190 if (repair_unchopped_blobs)
191 preserve_outline_tree (blob->outlines);
192 other_blob = newblob (); /* Make new blob */
193 other_blob->next = blob->next;
194 other_blob->outlines = NULL;
195 blob->next = other_blob;
196
197 seam = pick_good_seam (blob);
198 if (chop_debug) {
199 if (seam != NULL) {
200 print_seam ("Good seam picked=", seam);
201 }
202 else
203 cprintf ("\n** no seam picked *** \n");
204 }
205 if (seam) {
206 apply_seam(blob, other_blob, seam);
207 }
208
209 if ((seam == NULL) ||
210 (blob->outlines == NULL) ||
211 (other_blob->outlines == NULL) ||
212 total_containment (blob, other_blob) ||
213 check_blob (other_blob) ||
214 !(check_seam_order (blob, seam) &&
215 check_seam_order (other_blob, seam)) ||
216 any_shared_split_points (seam_list, seam) ||
217 !test_insert_seam(seam_list, blob_number, blob, word->blobs)) {
218
219 blob->next = next_blob;
220 if (seam) {
221 undo_seam(blob, other_blob, seam);
222 delete_seam(seam);
223 #ifndef GRAPHICS_DISABLED
224 if (chop_debug) {
225 if (chop_debug >2)
226 display_blob(blob, Red);
227 cprintf ("\n** seam being removed ** \n");
228 }
229 #endif
230 }
231 else {
232 oldblob(other_blob);
233 }
234
235 if (repair_unchopped_blobs)
236 restore_outline_tree (blob->outlines);
237 return (NULL);
238 }
239 return (seam);
240 }
241
242
243 /**********************************************************************
244 * any_shared_split_points
245 *
246 * Return true if any of the splits share a point with this one.
247 **********************************************************************/
any_shared_split_points(SEAMS seam_list,SEAM * seam)248 int any_shared_split_points(SEAMS seam_list, SEAM *seam) {
249 int length;
250 int index;
251
252 length = array_count (seam_list);
253 for (index = 0; index < length; index++)
254 if (shared_split_points ((SEAM *) array_value (seam_list, index), seam))
255 return TRUE;
256 return FALSE;
257 }
258
259
260 /**********************************************************************
261 * check_blob
262 *
263 * Return true if blob has a non whole outline.
264 **********************************************************************/
check_blob(TBLOB * blob)265 int check_blob(TBLOB *blob) {
266 TESSLINE *outline;
267 EDGEPT *edgept;
268
269 for (outline = blob->outlines; outline != NULL; outline = outline->next) {
270 edgept = outline->loop;
271 do {
272 if (edgept == NULL)
273 break;
274 edgept = edgept->next;
275 }
276 while (edgept != outline->loop);
277 if (edgept == NULL)
278 return 1;
279 }
280 return 0;
281 }
282
283
284 /**********************************************************************
285 * improve_one_blob
286 *
 * Start with the current word of blobs and its classification. Find
 * the worst blob and try to divide it up to improve the ratings.
289 *********************************************************************/
290 namespace tesseract {
improve_one_blob(TWERD * word,BLOB_CHOICE_LIST_VECTOR * char_choices,int fx,inT32 * blob_number,SEAMS * seam_list,DANGERR * fixpt,bool split_next_to_fragment)291 bool Wordrec::improve_one_blob(TWERD *word,
292 BLOB_CHOICE_LIST_VECTOR *char_choices,
293 int fx,
294 inT32 *blob_number,
295 SEAMS *seam_list,
296 DANGERR *fixpt,
297 bool split_next_to_fragment) {
298 TBLOB *pblob;
299 TBLOB *blob;
300 inT16 x = 0;
301 float rating_ceiling = MAX_FLOAT32;
302 BLOB_CHOICE_LIST *answer;
303 BLOB_CHOICE_IT answer_it;
304 SEAM *seam;
305
306 do {
307 *blob_number = select_blob_to_split(*char_choices, rating_ceiling,
308 split_next_to_fragment);
309 if (chop_debug)
310 cprintf("blob_number = %d\n", *blob_number);
311 if (*blob_number == -1)
312 return false;
313
314 seam = attempt_blob_chop (word, *blob_number, *seam_list);
315 if (seam != NULL)
316 break;
317 /* Must split null blobs */
318 answer = char_choices->get(*blob_number);
319 if (answer == NULL)
320 return false;
321 answer_it.set_to_list(answer);
322 rating_ceiling = answer_it.data()->rating(); // try a different blob
323 } while (!tord_blob_skip);
324 /* Split OK */
325 for (blob = word->blobs, pblob = NULL; x < *blob_number; x++) {
326 pblob = blob;
327 blob = blob->next;
328 }
329
330 *seam_list =
331 insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);
332
333 delete char_choices->get(*blob_number);
334
335 answer = classify_blob(pblob, blob, blob->next, NULL, "improve 1:", Red);
336 char_choices->insert(answer, *blob_number);
337
338 answer = classify_blob(blob, blob->next, blob->next->next, NULL,
339 "improve 2:", Yellow);
340 char_choices->set(answer, *blob_number + 1);
341
342 return true;
343 }
344
345 /**********************************************************************
346 * modify_blob_choice
347 *
348 * Takes a blob and its chop index, converts that chop index to a
349 * unichar_id, and stores the chop index in place of the blob's
350 * original unichar_id.
351 *********************************************************************/
modify_blob_choice(BLOB_CHOICE_LIST * answer,int chop_index)352 void Wordrec::modify_blob_choice(BLOB_CHOICE_LIST *answer,
353 int chop_index) {
354 char chop_index_string[2];
355 if (chop_index <= 9) {
356 snprintf(chop_index_string, sizeof(chop_index_string), "%d", chop_index);
357 } else {
358 chop_index_string[0] = static_cast<char>('A' - 10 + chop_index);
359 chop_index_string[1] = '\0';
360 }
361 UNICHAR_ID unichar_id = unicharset.unichar_to_id(chop_index_string);
362 ASSERT_HOST(unichar_id!=INVALID_UNICHAR_ID);
363 BLOB_CHOICE_IT answer_it(answer);
364 BLOB_CHOICE *modified_blob = new BLOB_CHOICE(unichar_id,
365 answer_it.data()->rating(),
366 answer_it.data()->certainty(),
367 answer_it.data()->config(),
368 answer_it.data()->script_id());
369 answer->clear();
370 answer_it.set_to_list(answer);
371 answer_it.add_after_then_move(modified_blob);
372 }
373
374 /**********************************************************************
375 * chop_one_blob
376 *
377 * Start with the current one-blob word and its classification. Find
378 * the worst blobs and try to divide it up to improve the ratings.
379 * Used for testing chopper.
380 *********************************************************************/
chop_one_blob(TWERD * word,BLOB_CHOICE_LIST_VECTOR * char_choices,inT32 * blob_number,SEAMS * seam_list,int * right_chop_index)381 bool Wordrec::chop_one_blob(TWERD *word,
382 BLOB_CHOICE_LIST_VECTOR *char_choices,
383 inT32 *blob_number,
384 SEAMS *seam_list,
385 int *right_chop_index) {
386 TBLOB *pblob;
387 TBLOB *blob;
388 inT16 x = 0;
389 float rating_ceiling = MAX_FLOAT32;
390 BLOB_CHOICE_LIST *answer;
391 BLOB_CHOICE_IT answer_it;
392 SEAM *seam;
393 UNICHAR_ID unichar_id = 0;
394 int left_chop_index = 0;
395
396 do {
397 *blob_number = select_blob_to_split(*char_choices, rating_ceiling,
398 false);
399 if (chop_debug)
400 cprintf("blob_number = %d\n", *blob_number);
401 if (*blob_number == -1)
402 return false;
403 seam = attempt_blob_chop(word, *blob_number, *seam_list);
404 if (seam != NULL)
405 break;
406 /* Must split null blobs */
407 answer = char_choices->get(*blob_number);
408 if (answer == NULL)
409 return false;
410 answer_it.set_to_list(answer);
411 rating_ceiling = answer_it.data()->rating(); // try a different blob
412 } while (!tord_blob_skip);
413 /* Split OK */
414 for (blob = word->blobs, pblob = NULL; x < *blob_number; x++) {
415 pblob = blob;
416 blob = blob->next;
417 }
418 *seam_list =
419 insert_seam(*seam_list, *blob_number, seam, blob, word->blobs);
420
421 answer = char_choices->get(*blob_number);
422 answer_it.set_to_list(answer);
423 unichar_id = answer_it.data()->unichar_id();
424 left_chop_index = atoi(unicharset.id_to_unichar(unichar_id));
425
426 delete char_choices->get(*blob_number);
427 // combine confidence w/ serial #
428 answer = classify_blob(pblob, blob, blob->next, NULL, "improve 1:", Red);
429 modify_blob_choice(answer, left_chop_index);
430 char_choices->insert(answer, *blob_number);
431
432 answer = classify_blob(blob, blob->next, blob->next->next, NULL,
433 "improve 2:", Yellow);
434 modify_blob_choice(answer, ++*right_chop_index);
435 char_choices->set(answer, *blob_number + 1);
436 return true;
437 }
438 } // namespace tesseract
439
440 /**********************************************************************
441 * check_seam_order
442 *
443 * Make sure that each of the splits in this seam match to outlines
444 * in this blob. If any of the splits could not correspond to this
445 * blob then there is a problem (and FALSE should be returned to the
446 * caller).
447 **********************************************************************/
check_seam_order(TBLOB * blob,SEAM * seam)448 inT16 check_seam_order(TBLOB *blob, SEAM *seam) {
449 TESSLINE *outline;
450 TESSLINE *last_outline;
451 inT8 found_em[3];
452
453 if (seam->split1 == NULL || seam->split1 == NULL || blob == NULL)
454 return (TRUE);
455
456 found_em[0] = found_em[1] = found_em[2] = FALSE;
457
458 for (outline = blob->outlines; outline; outline = outline->next) {
459 if (!found_em[0] &&
460 ((seam->split1 == NULL) ||
461 is_split_outline (outline, seam->split1))) {
462 found_em[0] = TRUE;
463 }
464 if (!found_em[1] &&
465 ((seam->split2 == NULL) ||
466 is_split_outline (outline, seam->split2))) {
467 found_em[1] = TRUE;
468 }
469 if (!found_em[2] &&
470 ((seam->split3 == NULL) ||
471 is_split_outline (outline, seam->split3))) {
472 found_em[2] = TRUE;
473 }
474 last_outline = outline;
475 }
476
477 if (!found_em[0] || !found_em[1] || !found_em[2])
478 return (FALSE);
479 else
480 return (TRUE);
481 }
482
483 /**********************************************************************
484 * chop_word_main
485 *
486 * Classify the blobs in this word and permute the results. Find the
487 * worst blob in the word and chop it up. Continue this process until
488 * a good answer has been found or all the blobs have been chopped up
489 * enough. Return the word level ratings.
490 **********************************************************************/
491 namespace tesseract {
// Classifies every blob of 'word', permutes the results into a word choice
// and, if that choice is not acceptable, tries to improve it by chopping
// blobs (improve_by_chopping) and then by re-associating the pieces
// (word_associator). The per-blob choices are finally rebuilt to match the
// winning segmentation with rebuild_current_state().
//
// word         - word to recognise; word->blobs may be modified by chopping
// fx           - passed through to the helpers
// best_choice  - out: reset with make_bad(), then filled by the permuter
// raw_choice   - out: reset with make_bad(), then filled by the permuter
// tester,
// trainer      - when either is set, a mismatch between word->correct and the
//                best choice string also triggers chopping/association
//
// Returns the choice lists produced by rebuild_current_state(). The function
// also writes the global bits_in_states and two function-local statics
// (best_state, chop_states).
BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(register TWERD *word,
                                                 int fx,
                                                 WERD_CHOICE *best_choice,
                                                 WERD_CHOICE *raw_choice,
                                                 BOOL8 tester,
                                                 BOOL8 trainer) {
  TBLOB *pblob;
  TBLOB *blob;
  int index;
  int did_chopping;
  float rating_limit = 1000.0;
  STATE state;
  SEAMS seam_list = start_seam_list(word->blobs);
  BLOB_CHOICE_LIST *match_result;
  MATRIX *ratings = NULL;
  DANGERR fixpt;                 /*dangerous ambig */
  inT32 state_count;             //no of states
  inT32 bit_count;               //no of bits
  static STATE best_state;
  // NOTE(review): fixed capacity of 64 entries; improve_by_chopping() appends
  // one entry per successful chop when matcher_fp is set - confirm that its
  // MAX_NUM_CHUNKS limit keeps the count within this bound.
  static STATE chop_states[64];  //in between states

  state_count = 0;
  best_choice->make_bad();
  raw_choice->make_bad();

  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();

  did_chopping = 0;
  // Classify each blob in its original segmentation; pblob trails one blob
  // behind so the classifier sees both neighbours.
  for (blob = word->blobs, pblob = NULL, index = 0;
       blob != NULL; blob = blob->next, index++) {
    match_result = classify_blob(pblob, blob, blob->next, NULL,
                                 "chop_word:", Green);
    // A NULL result is reported but still appended to char_choices.
    if (match_result == NULL)
      cprintf("Null classifier output!\n");
    *char_choices += match_result;
    pblob = blob;
  }
  // One fewer than the number of blobs visited by the loop above.
  bit_count = index - 1;
  getDict().permute_characters(*char_choices, rating_limit,
                               best_choice, raw_choice);
  set_n_ones(&state, char_choices->length() - 1);
  // The initial state is only recorded when matcher_fp is set.
  if (matcher_fp != NULL) {
    bits_in_states = bit_count;
    chop_states[state_count] = state;
    state_count++;
  }
  bool replaced = false;
  // Chop when the first answer is not acceptable, or (in tester/trainer
  // mode) when it differs from the expected text.
  if (!getDict().AcceptableChoice(char_choices, best_choice, *raw_choice,
                                  &fixpt, CHOPPER_CALLER, &replaced) ||
      ((tester || trainer) &&
       strcmp(word->correct, best_choice->unichar_string().string()))) {
    if (replaced) update_blob_classifications(word, *char_choices);
    did_chopping = 1;
    if (first_pass)
      words_chopped1++;
    else
      words_chopped2++;

    if (chop_enable)
      improve_by_chopping(word,
                          char_choices,
                          fx,
                          &state,
                          best_choice,
                          raw_choice,
                          &seam_list,
                          &fixpt,
                          chop_states,
                          &state_count);
    if (chop_debug)
      print_seams ("Final seam list:", seam_list);

    // The force_word_assoc is almost redundant to enable_assoc. However,
    // it is not conditioned on the dict behavior. For CJK, we need to force
    // the associator to be invoked. When we figure out the exact behavior
    // of dict on CJK, we can remove the flag if it turns out to be redundant.
    if ((wordrec_enable_assoc &&
         !getDict().AcceptableChoice(char_choices, best_choice, *raw_choice,
                                     NULL, CHOPPER_CALLER, &replaced)) ||
        force_word_assoc ||
        ((tester || trainer) &&
         strcmp(word->correct, best_choice->unichar_string().string()))) {
      ratings = word_associator (word->blobs, seam_list, &state, fx,
                                 best_choice, raw_choice, word->correct,
                                 /*0, */ &fixpt, &best_state);
    }
    bits_in_states = bit_count + state_count - 1;
  }
  if (replaced) update_blob_classifications(word, *char_choices);

  // Replace the per-chunk choices with choices for the final segmentation.
  char_choices =
    rebuild_current_state(word->blobs, seam_list, &state, char_choices, fx,
                          (did_chopping || tester || trainer), *best_choice,
                          ratings);

  // The ratings matrix only exists when word_associator() ran.
  if (ratings != NULL) {
    ratings->delete_matrix_pointers();
    delete ratings;
  }
  if (seam_list != NULL)
    free_seam_list(seam_list);
  if (matcher_fp != NULL) {
    best_state = state;
  }
  getDict().FilterWordChoices();
  return char_choices;
}
599
600
601
602 /**********************************************************************
603 * improve_by_chopping
604 *
605 * Start with the current word of blobs and its classification. Find
606 * the worst blobs and try to divide them up to improve the ratings.
607 * As long as ratings are produced by the new blob splitting. When
608 * all the splitting has been accomplished all the ratings memory is
609 * reclaimed.
610 **********************************************************************/
improve_by_chopping(register TWERD * word,BLOB_CHOICE_LIST_VECTOR * char_choices,int fx,STATE * best_state,WERD_CHOICE * best_choice,WERD_CHOICE * raw_choice,SEAMS * seam_list,DANGERR * fixpt,STATE * chop_states,inT32 * state_count)611 void Wordrec::improve_by_chopping(register TWERD *word,
612 BLOB_CHOICE_LIST_VECTOR *char_choices,
613 int fx,
614 STATE *best_state,
615 WERD_CHOICE *best_choice,
616 WERD_CHOICE *raw_choice,
617 SEAMS *seam_list,
618 DANGERR *fixpt,
619 STATE *chop_states,
620 inT32 *state_count) {
621 inT32 blob_number;
622 inT32 index; //to states
623 float old_best;
624 int fixpt_valid = 1;
625 static inT32 old_count; //from pass1
626 bool replaced = false;
627
628 do { // improvement loop
629 if (replaced) update_blob_classifications(word, *char_choices);
630 if (!fixpt_valid)
631 fixpt->index = -1;
632 old_best = best_choice->rating();
633 if (improve_one_blob(word, char_choices, fx, &blob_number, seam_list,
634 fixpt, (fragments_guide_chopper &&
635 best_choice->fragment_mark()))) {
636 getDict().LogNewSplit(blob_number);
637 getDict().permute_characters(*char_choices, best_choice->rating(),
638 best_choice, raw_choice);
639
640 if (old_best > best_choice->rating()) {
641 set_n_ones(best_state, char_choices->length() - 1);
642 fixpt_valid = 1;
643 }
644 else {
645 insert_new_chunk(best_state, blob_number, char_choices->length() - 2);
646 fixpt_valid = 0;
647 }
648 if (*state_count > 0) {
649 for (index = 0; index < *state_count; index++) {
650 insert_new_chunk(&chop_states[index], blob_number,
651 char_choices->length() - 2);
652 }
653 set_n_ones(&chop_states[index], char_choices->length() - 1);
654 (*state_count)++;
655 }
656
657 if (chop_debug)
658 print_state ("best state = ",
659 best_state, count_blobs (word->blobs) - 1);
660 if (first_pass)
661 chops_performed1++;
662 else
663 chops_performed2++;
664 } else {
665 break;
666 }
667 } while (!getDict().AcceptableChoice(char_choices, best_choice, *raw_choice,
668 fixpt, CHOPPER_CALLER, &replaced) &&
669 !tord_blob_skip && char_choices->length() < MAX_NUM_CHUNKS);
670 if (replaced) update_blob_classifications(word, *char_choices);
671 old_count = *state_count;
672 if (!fixpt_valid)
673 fixpt->index = -1;
674 }
675
676
677 /**********************************************************************
678 * select_blob_to_split
679 *
680 * These are the results of the last classification. Find a likely
681 * place to apply splits.
682 **********************************************************************/
select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_ceiling,bool split_next_to_fragment)683 inT16 Wordrec::select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices,
684 float rating_ceiling,
685 bool split_next_to_fragment) {
686 BLOB_CHOICE_IT blob_choice_it;
687 BLOB_CHOICE *blob_choice;
688 BLOB_CHOICE_IT temp_it;
689 int x;
690 float worst = -MAX_FLOAT32;
691 int worst_index = -1;
692 float worst_near_fragment = -MAX_FLOAT32;
693 int worst_index_near_fragment = -1;
694 const CHAR_FRAGMENT **fragments = NULL;
695
696 if (chop_debug) {
697 if (rating_ceiling < MAX_FLOAT32)
698 cprintf("rating_ceiling = %8.4f\n", rating_ceiling);
699 else
700 cprintf("rating_ceiling = No Limit\n");
701 }
702
703 if (split_next_to_fragment && char_choices.length() > 0) {
704 fragments = new const CHAR_FRAGMENT *[char_choices.length()];
705 if (char_choices.get(0) != NULL) {
706 temp_it.set_to_list(char_choices.get(0));
707 fragments[0] = getDict().getUnicharset().get_fragment(
708 temp_it.data()->unichar_id());
709 } else {
710 fragments[0] = NULL;
711 }
712 }
713
714 for (x = 0; x < char_choices.length(); ++x) {
715 if (char_choices.get(x) == NULL) {
716 if (fragments != NULL) {
717 delete[] fragments;
718 }
719 return x;
720 } else {
721 blob_choice_it.set_to_list(char_choices.get(x));
722 blob_choice = blob_choice_it.data();
723 // Populate fragments for the following position.
724 if (split_next_to_fragment && x+1 < char_choices.length()) {
725 if (char_choices.get(x+1) != NULL) {
726 temp_it.set_to_list(char_choices.get(x+1));
727 fragments[x+1] = getDict().getUnicharset().get_fragment(
728 temp_it.data()->unichar_id());
729 } else {
730 fragments[x+1] = NULL;
731 }
732 }
733 if (blob_choice->rating() < rating_ceiling &&
734 blob_choice->certainty() < tessedit_certainty_threshold) {
735 // Update worst and worst_index.
736 if (blob_choice->rating() > worst) {
737 worst_index = x;
738 worst = blob_choice->rating();
739 }
740 if (split_next_to_fragment) {
741 // Update worst_near_fragment and worst_index_near_fragment.
742 bool expand_following_fragment =
743 (x + 1 < char_choices.length() &&
744 fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
745 bool expand_preceding_fragment =
746 (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
747 if ((expand_following_fragment || expand_preceding_fragment) &&
748 blob_choice->rating() > worst_near_fragment) {
749 worst_index_near_fragment = x;
750 worst_near_fragment = blob_choice->rating();
751 if (chop_debug) {
752 cprintf("worst_index_near_fragment=%d"
753 " expand_following_fragment=%d"
754 " expand_preceding_fragment=%d\n",
755 worst_index_near_fragment,
756 expand_following_fragment,
757 expand_preceding_fragment);
758 }
759 }
760 }
761 }
762 }
763 }
764 if (fragments != NULL) {
765 delete[] fragments;
766 }
767 // TODO(daria): maybe a threshold of badness for
768 // worst_near_fragment would be useful.
769 return worst_index_near_fragment != -1 ?
770 worst_index_near_fragment : worst_index;
771 }
772 } // namespace tesseract
773
774
775 /**********************************************************************
776 * start_seam_list
777 *
778 * Initialize a list of seams that match the original number of blobs
779 * present in the starting segmentation. Each of the seams created
780 * by this routine have location information only.
781 **********************************************************************/
start_seam_list(TBLOB * blobs)782 SEAMS start_seam_list(TBLOB *blobs) {
783 TBLOB *blob;
784 SEAMS seam_list;
785 TPOINT topleft;
786 TPOINT botright;
787 int location;
788 /* Seam slot per char */
789 seam_list = new_seam_list ();
790
791 for (blob = blobs; blob->next != NULL; blob = blob->next) {
792
793 blob_bounding_box(blob, &topleft, &botright);
794 location = botright.x;
795 blob_bounding_box (blob->next, &topleft, &botright);
796 location += topleft.x;
797 location /= 2;
798
799 seam_list = add_seam (seam_list,
800 new_seam (0.0, location, NULL, NULL, NULL));
801 }
802
803 return (seam_list);
804 }
805
806
807 /**********************************************************************
808 * total_containment
809 *
810 * Check to see if one of these outlines is totally contained within
811 * the bounding box of the other.
812 **********************************************************************/
total_containment(TBLOB * blob1,TBLOB * blob2)813 inT16 total_containment(TBLOB *blob1, TBLOB *blob2) {
814 TPOINT topleft1;
815 TPOINT botright1;
816 TPOINT topleft2;
817 TPOINT botright2;
818
819 blob_bounding_box(blob1, &topleft1, &botright1);
820 blob_bounding_box(blob2, &topleft2, &botright2);
821
822 return (bounds_inside (topleft1, botright1, topleft2, botright2) ||
823 bounds_inside (topleft2, botright2, topleft1, botright1));
824 }
825
826
827 /**********************************************************************
828 * word_associator
829 *
830 * Reassociate and classify the blobs in a word. Continue this process
831 * until a good answer is found or all the possibilities have been tried.
832 **********************************************************************/
833 namespace tesseract {
word_associator(TBLOB * blobs,SEAMS seams,STATE * state,int fxid,WERD_CHOICE * best_choice,WERD_CHOICE * raw_choice,char * correct,DANGERR * fixpt,STATE * best_state)834 MATRIX *Wordrec::word_associator(TBLOB *blobs,
835 SEAMS seams,
836 STATE *state,
837 int fxid,
838 WERD_CHOICE *best_choice,
839 WERD_CHOICE *raw_choice,
840 char *correct,
841 DANGERR *fixpt,
842 STATE *best_state) {
843 CHUNKS_RECORD chunks_record;
844 BLOB_WEIGHTS blob_weights;
845 int x;
846 int num_chunks;
847 BLOB_CHOICE_IT blob_choice_it;
848
849 num_chunks = array_count (seams) + 1;
850
851 chunks_record.chunks = blobs;
852 chunks_record.splits = seams;
853 chunks_record.ratings = record_piece_ratings (blobs);
854 chunks_record.char_widths = blobs_widths (blobs);
855 chunks_record.chunk_widths = blobs_widths (blobs);
856 chunks_record.fx = fxid;
857 /* Save chunk weights */
858 for (x = 0; x < num_chunks; x++) {
859 BLOB_CHOICE_LIST* choices = get_piece_rating(chunks_record.ratings,
860 blobs, seams, x, x);
861 blob_choice_it.set_to_list(choices);
862 //This is done by Jetsoft. Divide by zero is possible.
863 if (blob_choice_it.data()->certainty() == 0) {
864 blob_weights[x]=0;
865 } else {
866 blob_weights[x] =
867 -(inT16) (10 * blob_choice_it.data()->rating() /
868 blob_choice_it.data()->certainty());
869 }
870 }
871 chunks_record.weights = blob_weights;
872
873 if (chop_debug)
874 chunks_record.ratings->print(getDict().getUnicharset());
875
876 best_first_search(&chunks_record,
877 best_choice,
878 raw_choice,
879 state,
880 fixpt,
881 best_state);
882
883 free_widths (chunks_record.chunk_widths);
884 free_widths (chunks_record.char_widths);
885 return chunks_record.ratings;
886 }
887 } // namespace tesseract
888
889