• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File:        chopper.c  (Formerly chopper.c)
5  * Description:
6  * Author:       Mark Seaman, OCR Technology
7  * Created:      Fri Oct 16 14:37:00 1987
8  * Modified:     Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
9  * Language:     C
10  * Package:      N/A
11  * Status:       Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  **************************************************************************/
25 
26 /*----------------------------------------------------------------------
27           I n c l u d e s
28 ----------------------------------------------------------------------*/
29 #include <math.h>
30 
31 #include "chopper.h"
32 
33 #include "assert.h"
34 #include "associate.h"
35 #include "callcpp.h"
36 #include "choices.h"
37 #include "const.h"
38 #include "findseam.h"
39 #include "freelist.h"
40 #include "globals.h"
41 #include "makechop.h"
42 #include "metrics.h"
43 #include "render.h"
44 #include "permute.h"
45 #include "pieces.h"
46 #include "seam.h"
47 #include "stopper.h"
48 #include "structures.h"
49 #include "tordvars.h"
50 #include "unicharset.h"
51 #include "wordclass.h"
52 #include "wordrec.h"
53 
54 INT_VAR (repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");
55 
56 //?extern int tessedit_dangambigs_chop;
57 double_VAR(tessedit_certainty_threshold, -2.25, "Good blob limit");
58 
59 BOOL_VAR(fragments_guide_chopper, FALSE,
60          "Use information from fragments to guide chopping process");
61 
62 /*----------------------------------------------------------------------
63           M a c r o s
64 ----------------------------------------------------------------------*/
/**********************************************************************
 * bounds_inside
 *
 * Evaluate to non-zero when the bounding box described by the corner
 * points (inner_tl, inner_br) lies inside, or coincides with, the box
 * described by (outer_tl, outer_br).  "tl" is a top-left corner and
 * "br" a bottom-right corner; the test treats a larger y as higher up.
 *
 * Each argument must be an expression with x and y members.  Every
 * argument is parenthesized in the expansion so that expressions such
 * as *ptr can be passed safely.  Each argument is evaluated twice, so
 * do not pass expressions with side effects.
 **********************************************************************/
#define bounds_inside(inner_tl,inner_br,outer_tl,outer_br)  \
(((inner_tl).x >= (outer_tl).x) && \
 ((inner_tl).y <= (outer_tl).y) && \
 ((inner_br).x <= (outer_br).x) && \
 ((inner_br).y >= (outer_br).y))
76 
77 /*----------------------------------------------------------------------
78           F u n c t i o n s
79 ----------------------------------------------------------------------*/
80 /**********************************************************************
81  * preserve_outline_tree
82  *
83  * Copy the list of outlines.
84  **********************************************************************/
preserve_outline(EDGEPT * start)85 void preserve_outline(EDGEPT *start) {
86   EDGEPT *srcpt;
87 
88   if (start == NULL)
89     return;
90   srcpt = start;
91   do {
92     srcpt->flags[1] = 1;
93     srcpt = srcpt->next;
94   }
95   while (srcpt != start);
96   srcpt->flags[1] = 2;
97 }
98 
99 
100 /**************************************************************************/
preserve_outline_tree(TESSLINE * srcline)101 void preserve_outline_tree(TESSLINE *srcline) {
102   TESSLINE *outline;
103 
104   for (outline = srcline; outline != NULL; outline = outline->next) {
105     preserve_outline (outline->loop);
106   }
107   if (srcline != NULL && srcline->child != NULL)
108     preserve_outline_tree (srcline->child);
109 }
110 
111 
112 /**********************************************************************
113  * restore_outline_tree
114  *
115  * Copy the list of outlines.
116  **********************************************************************/
restore_outline(EDGEPT * start)117 EDGEPT *restore_outline(EDGEPT *start) {
118   EDGEPT *srcpt;
119   EDGEPT *real_start;
120   EDGEPT *deadpt;
121 
122   if (start == NULL)
123     return NULL;
124   srcpt = start;
125   do {
126     if (srcpt->flags[1] == 2)
127       break;
128     srcpt = srcpt->next;
129   }
130   while (srcpt != start);
131   real_start = srcpt;
132   do {
133     if (srcpt->flags[1] == 0) {
134       deadpt = srcpt;
135       srcpt = srcpt->next;
136       srcpt->prev = deadpt->prev;
137       deadpt->prev->next = srcpt;
138       deadpt->prev->vec.x = srcpt->pos.x - deadpt->prev->pos.x;
139       deadpt->prev->vec.y = srcpt->pos.y - deadpt->prev->pos.y;
140       oldedgept(deadpt);
141     }
142     else
143       srcpt = srcpt->next;
144   }
145   while (srcpt != real_start);
146   return real_start;
147 }
148 
149 
150 /******************************************************************************/
restore_outline_tree(TESSLINE * srcline)151 void restore_outline_tree(TESSLINE *srcline) {
152   TESSLINE *outline;
153 
154   for (outline = srcline; outline != NULL; outline = outline->next) {
155     outline->loop = restore_outline (outline->loop);
156     outline->start = outline->loop->pos;
157   }
158   if (srcline != NULL && srcline->child != NULL)
159     restore_outline_tree (srcline->child);
160 }
161 
162 
163 /**********************************************************************
164  * attempt_blob_chop
165  *
166  * Try to split the this blob after this one.  Check to make sure that
167  * it was successful.
168  **********************************************************************/
attempt_blob_chop(TWERD * word,inT32 blob_number,SEAMS seam_list)169 SEAM *attempt_blob_chop(TWERD *word, inT32 blob_number, SEAMS seam_list) {
170   TBLOB *blob;
171   TBLOB *other_blob;
172   SEAM *seam;
173   TBLOB *last_blob;
174   TBLOB *next_blob;
175   inT16 x;
176 
177   if (first_pass)
178     chops_attempted1++;
179   else
180     chops_attempted2++;
181 
182   last_blob = NULL;
183   blob = word->blobs;
184   for (x = 0; x < blob_number; x++) {
185     last_blob = blob;
186     blob = blob->next;
187   }
188   next_blob = blob->next;
189 
190   if (repair_unchopped_blobs)
191     preserve_outline_tree (blob->outlines);
192   other_blob = newblob ();       /* Make new blob */
193   other_blob->next = blob->next;
194   other_blob->outlines = NULL;
195   blob->next = other_blob;
196 
197   seam = pick_good_seam (blob);
198   if (chop_debug) {
199     if (seam != NULL) {
200       print_seam ("Good seam picked=", seam);
201     }
202     else
203       cprintf ("\n** no seam picked *** \n");
204   }
205   if (seam) {
206     apply_seam(blob, other_blob, seam);
207   }
208 
209   if ((seam == NULL) ||
210     (blob->outlines == NULL) ||
211     (other_blob->outlines == NULL) ||
212     total_containment (blob, other_blob) ||
213     check_blob (other_blob) ||
214     !(check_seam_order (blob, seam) &&
215     check_seam_order (other_blob, seam)) ||
216     any_shared_split_points (seam_list, seam) ||
217     !test_insert_seam(seam_list, blob_number, blob, word->blobs)) {
218 
219     blob->next = next_blob;
220     if (seam) {
221       undo_seam(blob, other_blob, seam);
222       delete_seam(seam);
223 #ifndef GRAPHICS_DISABLED
224       if (chop_debug) {
225         if (chop_debug >2)
226           display_blob(blob, Red);
227         cprintf ("\n** seam being removed ** \n");
228       }
229 #endif
230     }
231     else {
232       oldblob(other_blob);
233     }
234 
235     if (repair_unchopped_blobs)
236       restore_outline_tree (blob->outlines);
237     return (NULL);
238   }
239   return (seam);
240 }
241 
242 
243 /**********************************************************************
244  * any_shared_split_points
245  *
246  * Return true if any of the splits share a point with this one.
247  **********************************************************************/
any_shared_split_points(SEAMS seam_list,SEAM * seam)248 int any_shared_split_points(SEAMS seam_list, SEAM *seam) {
249   int length;
250   int index;
251 
252   length = array_count (seam_list);
253   for (index = 0; index < length; index++)
254     if (shared_split_points ((SEAM *) array_value (seam_list, index), seam))
255       return TRUE;
256   return FALSE;
257 }
258 
259 
260 /**********************************************************************
261  * check_blob
262  *
263  * Return true if blob has a non whole outline.
264  **********************************************************************/
check_blob(TBLOB * blob)265 int check_blob(TBLOB *blob) {
266   TESSLINE *outline;
267   EDGEPT *edgept;
268 
269   for (outline = blob->outlines; outline != NULL; outline = outline->next) {
270     edgept = outline->loop;
271     do {
272       if (edgept == NULL)
273         break;
274       edgept = edgept->next;
275     }
276     while (edgept != outline->loop);
277     if (edgept == NULL)
278       return 1;
279   }
280   return 0;
281 }
282 
283 
284 /**********************************************************************
285  * improve_one_blob
286  *
287  * Start with the current word of blobs and its classification.  Find
288  * the worst blob and try to divide it up to improve the ratings.
289  *********************************************************************/
290 namespace tesseract {
improve_one_blob(TWERD * word,BLOB_CHOICE_LIST_VECTOR * char_choices,int fx,inT32 * blob_number,SEAMS * seam_list,DANGERR * fixpt,bool split_next_to_fragment)291 bool Wordrec::improve_one_blob(TWERD *word,
292                                BLOB_CHOICE_LIST_VECTOR *char_choices,
293                                int fx,
294                                inT32 *blob_number,
295                                SEAMS *seam_list,
296                                DANGERR *fixpt,
297                                bool split_next_to_fragment) {
298   TBLOB *pblob;
299   TBLOB *blob;
300   inT16 x = 0;
301   float rating_ceiling = MAX_FLOAT32;
302   BLOB_CHOICE_LIST *answer;
303   BLOB_CHOICE_IT answer_it;
304   SEAM *seam;
305 
306   do {
307     *blob_number = select_blob_to_split(*char_choices, rating_ceiling,
308                                         split_next_to_fragment);
309     if (chop_debug)
310       cprintf("blob_number = %d\n", *blob_number);
311     if (*blob_number == -1)
312       return false;
313 
314     seam = attempt_blob_chop (word, *blob_number, *seam_list);
315     if (seam != NULL)
316       break;
317     /* Must split null blobs */
318     answer = char_choices->get(*blob_number);
319     if (answer == NULL)
320       return false;
321     answer_it.set_to_list(answer);
322     rating_ceiling = answer_it.data()->rating();  // try a different blob
323   } while (!tord_blob_skip);
324   /* Split OK */
325   for (blob = word->blobs, pblob = NULL; x < *blob_number; x++) {
326     pblob = blob;
327     blob = blob->next;
328   }
329 
330   *seam_list =
331     insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);
332 
333   delete char_choices->get(*blob_number);
334 
335   answer = classify_blob(pblob, blob, blob->next, NULL, "improve 1:", Red);
336   char_choices->insert(answer, *blob_number);
337 
338   answer = classify_blob(blob, blob->next, blob->next->next, NULL,
339                          "improve 2:", Yellow);
340   char_choices->set(answer, *blob_number + 1);
341 
342   return true;
343 }
344 
345 /**********************************************************************
346  * modify_blob_choice
347  *
348  * Takes a blob and its chop index, converts that chop index to a
349  * unichar_id, and stores the chop index in place of the blob's
350  * original unichar_id.
351  *********************************************************************/
modify_blob_choice(BLOB_CHOICE_LIST * answer,int chop_index)352 void Wordrec::modify_blob_choice(BLOB_CHOICE_LIST *answer,
353                         int chop_index) {
354   char chop_index_string[2];
355   if (chop_index <= 9) {
356     snprintf(chop_index_string, sizeof(chop_index_string), "%d", chop_index);
357   } else {
358     chop_index_string[0] = static_cast<char>('A' - 10 + chop_index);
359     chop_index_string[1] = '\0';
360   }
361   UNICHAR_ID unichar_id = unicharset.unichar_to_id(chop_index_string);
362   ASSERT_HOST(unichar_id!=INVALID_UNICHAR_ID);
363   BLOB_CHOICE_IT answer_it(answer);
364   BLOB_CHOICE *modified_blob = new BLOB_CHOICE(unichar_id,
365                                              answer_it.data()->rating(),
366                                              answer_it.data()->certainty(),
367                                              answer_it.data()->config(),
368                                              answer_it.data()->script_id());
369   answer->clear();
370   answer_it.set_to_list(answer);
371   answer_it.add_after_then_move(modified_blob);
372 }
373 
374 /**********************************************************************
375  * chop_one_blob
376  *
377  * Start with the current one-blob word and its classification.  Find
378  * the worst blobs and try to divide it up to improve the ratings.
379  * Used for testing chopper.
380  *********************************************************************/
chop_one_blob(TWERD * word,BLOB_CHOICE_LIST_VECTOR * char_choices,inT32 * blob_number,SEAMS * seam_list,int * right_chop_index)381 bool Wordrec::chop_one_blob(TWERD *word,
382                                BLOB_CHOICE_LIST_VECTOR *char_choices,
383                                inT32 *blob_number,
384                                SEAMS *seam_list,
385                                int *right_chop_index) {
386   TBLOB *pblob;
387   TBLOB *blob;
388   inT16 x = 0;
389   float rating_ceiling = MAX_FLOAT32;
390   BLOB_CHOICE_LIST *answer;
391   BLOB_CHOICE_IT answer_it;
392   SEAM *seam;
393   UNICHAR_ID unichar_id = 0;
394   int left_chop_index = 0;
395 
396   do {
397     *blob_number = select_blob_to_split(*char_choices, rating_ceiling,
398                                         false);
399     if (chop_debug)
400       cprintf("blob_number = %d\n", *blob_number);
401     if (*blob_number == -1)
402       return false;
403     seam = attempt_blob_chop(word, *blob_number, *seam_list);
404     if (seam != NULL)
405       break;
406     /* Must split null blobs */
407     answer = char_choices->get(*blob_number);
408     if (answer == NULL)
409       return false;
410     answer_it.set_to_list(answer);
411     rating_ceiling = answer_it.data()->rating();  // try a different blob
412   } while (!tord_blob_skip);
413   /* Split OK */
414   for (blob = word->blobs, pblob = NULL; x < *blob_number; x++) {
415     pblob = blob;
416     blob = blob->next;
417   }
418   *seam_list =
419     insert_seam(*seam_list, *blob_number, seam, blob, word->blobs);
420 
421   answer = char_choices->get(*blob_number);
422   answer_it.set_to_list(answer);
423   unichar_id = answer_it.data()->unichar_id();
424   left_chop_index = atoi(unicharset.id_to_unichar(unichar_id));
425 
426   delete char_choices->get(*blob_number);
427   // combine confidence w/ serial #
428   answer = classify_blob(pblob, blob, blob->next, NULL, "improve 1:", Red);
429   modify_blob_choice(answer, left_chop_index);
430   char_choices->insert(answer, *blob_number);
431 
432   answer = classify_blob(blob, blob->next, blob->next->next, NULL,
433                          "improve 2:", Yellow);
434   modify_blob_choice(answer, ++*right_chop_index);
435   char_choices->set(answer, *blob_number + 1);
436   return true;
437 }
438 }  // namespace tesseract
439 
440 /**********************************************************************
441  * check_seam_order
442  *
443  * Make sure that each of the splits in this seam match to outlines
444  * in this blob.  If any of the splits could not correspond to this
445  * blob then there is a problem (and FALSE should be returned to the
446  * caller).
447  **********************************************************************/
check_seam_order(TBLOB * blob,SEAM * seam)448 inT16 check_seam_order(TBLOB *blob, SEAM *seam) {
449   TESSLINE *outline;
450   TESSLINE *last_outline;
451   inT8 found_em[3];
452 
453   if (seam->split1 == NULL || seam->split1 == NULL || blob == NULL)
454     return (TRUE);
455 
456   found_em[0] = found_em[1] = found_em[2] = FALSE;
457 
458   for (outline = blob->outlines; outline; outline = outline->next) {
459     if (!found_em[0] &&
460       ((seam->split1 == NULL) ||
461     is_split_outline (outline, seam->split1))) {
462       found_em[0] = TRUE;
463     }
464     if (!found_em[1] &&
465       ((seam->split2 == NULL) ||
466     is_split_outline (outline, seam->split2))) {
467       found_em[1] = TRUE;
468     }
469     if (!found_em[2] &&
470       ((seam->split3 == NULL) ||
471     is_split_outline (outline, seam->split3))) {
472       found_em[2] = TRUE;
473     }
474     last_outline = outline;
475   }
476 
477   if (!found_em[0] || !found_em[1] || !found_em[2])
478     return (FALSE);
479   else
480     return (TRUE);
481 }
482 
483 /**********************************************************************
484  * chop_word_main
485  *
486  * Classify the blobs in this word and permute the results.  Find the
487  * worst blob in the word and chop it up.  Continue this process until
488  * a good answer has been found or all the blobs have been chopped up
489  * enough.  Return the word level ratings.
490  **********************************************************************/
491 namespace tesseract {
chop_word_main(register TWERD * word,int fx,WERD_CHOICE * best_choice,WERD_CHOICE * raw_choice,BOOL8 tester,BOOL8 trainer)492 BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(register TWERD *word,
493                                                  int fx,
494                                                  WERD_CHOICE *best_choice,
495                                                  WERD_CHOICE *raw_choice,
496                                                  BOOL8 tester,
497                                                  BOOL8 trainer) {
498   TBLOB *pblob;
499   TBLOB *blob;
500   int index;
501   int did_chopping;
502   float rating_limit = 1000.0;
503   STATE state;
504   SEAMS seam_list = start_seam_list(word->blobs);
505   BLOB_CHOICE_LIST *match_result;
506   MATRIX *ratings = NULL;
507   DANGERR fixpt;                 /*dangerous ambig */
508   inT32 state_count;             //no of states
509   inT32 bit_count;               //no of bits
510   static STATE best_state;
511   static STATE chop_states[64];  //in between states
512 
513   state_count = 0;
514   best_choice->make_bad();
515   raw_choice->make_bad();
516 
517   BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
518 
519   did_chopping = 0;
520   for (blob = word->blobs, pblob = NULL, index = 0;
521        blob != NULL; blob = blob->next, index++) {
522     match_result = classify_blob(pblob, blob, blob->next, NULL,
523                                  "chop_word:", Green);
524     if (match_result == NULL)
525       cprintf("Null classifier output!\n");
526     *char_choices += match_result;
527     pblob = blob;
528   }
529   bit_count = index - 1;
530   getDict().permute_characters(*char_choices, rating_limit,
531                                best_choice, raw_choice);
532   set_n_ones(&state, char_choices->length() - 1);
533   if (matcher_fp != NULL) {
534     bits_in_states = bit_count;
535     chop_states[state_count] = state;
536     state_count++;
537   }
538   bool replaced = false;
539   if (!getDict().AcceptableChoice(char_choices, best_choice, *raw_choice,
540                                   &fixpt, CHOPPER_CALLER, &replaced) ||
541       ((tester || trainer) &&
542        strcmp(word->correct, best_choice->unichar_string().string()))) {
543     if (replaced) update_blob_classifications(word, *char_choices);
544     did_chopping = 1;
545     if (first_pass)
546       words_chopped1++;
547     else
548       words_chopped2++;
549 
550     if (chop_enable)
551       improve_by_chopping(word,
552                           char_choices,
553                           fx,
554                           &state,
555                           best_choice,
556                           raw_choice,
557                           &seam_list,
558                           &fixpt,
559                           chop_states,
560                           &state_count);
561     if (chop_debug)
562       print_seams ("Final seam list:", seam_list);
563 
564     // The force_word_assoc is almost redundant to enable_assoc.  However,
565     // it is not conditioned on the dict behavior.  For CJK, we need to force
566     // the associator to be invoked.  When we figure out the exact behavior
567     // of dict on CJK, we can remove the flag if it turns out to be redundant.
568     if ((wordrec_enable_assoc &&
569          !getDict().AcceptableChoice(char_choices, best_choice, *raw_choice,
570                                      NULL, CHOPPER_CALLER, &replaced)) ||
571         force_word_assoc ||
572         ((tester || trainer) &&
573          strcmp(word->correct, best_choice->unichar_string().string()))) {
574       ratings = word_associator (word->blobs, seam_list, &state, fx,
575         best_choice, raw_choice, word->correct,
576         /*0, */ &fixpt, &best_state);
577     }
578     bits_in_states = bit_count + state_count - 1;
579   }
580   if (replaced) update_blob_classifications(word, *char_choices);
581 
582   char_choices =
583     rebuild_current_state(word->blobs, seam_list, &state, char_choices, fx,
584                           (did_chopping || tester || trainer), *best_choice,
585                           ratings);
586 
587   if (ratings != NULL) {
588     ratings->delete_matrix_pointers();
589     delete ratings;
590   }
591   if (seam_list != NULL)
592     free_seam_list(seam_list);
593   if (matcher_fp != NULL) {
594     best_state = state;
595   }
596   getDict().FilterWordChoices();
597   return char_choices;
598 }
599 
600 
601 
602 /**********************************************************************
603  * improve_by_chopping
604  *
605  * Start with the current word of blobs and its classification.  Find
606  * the worst blobs and try to divide them up to improve the ratings.
607  * As long as ratings are produced by the new blob splitting.  When
608  * all the splitting has been accomplished all the ratings memory is
609  * reclaimed.
610  **********************************************************************/
improve_by_chopping(register TWERD * word,BLOB_CHOICE_LIST_VECTOR * char_choices,int fx,STATE * best_state,WERD_CHOICE * best_choice,WERD_CHOICE * raw_choice,SEAMS * seam_list,DANGERR * fixpt,STATE * chop_states,inT32 * state_count)611 void Wordrec::improve_by_chopping(register TWERD *word,
612                                   BLOB_CHOICE_LIST_VECTOR *char_choices,
613                                   int fx,
614                                   STATE *best_state,
615                                   WERD_CHOICE *best_choice,
616                                   WERD_CHOICE *raw_choice,
617                                   SEAMS *seam_list,
618                                   DANGERR *fixpt,
619                                   STATE *chop_states,
620                                   inT32 *state_count) {
621   inT32 blob_number;
622   inT32 index;                   //to states
623   float old_best;
624   int fixpt_valid = 1;
625   static inT32 old_count;        //from pass1
626   bool replaced = false;
627 
628   do {  // improvement loop
629     if (replaced) update_blob_classifications(word, *char_choices);
630     if (!fixpt_valid)
631       fixpt->index = -1;
632     old_best = best_choice->rating();
633     if (improve_one_blob(word, char_choices, fx, &blob_number, seam_list,
634                          fixpt, (fragments_guide_chopper &&
635                                  best_choice->fragment_mark()))) {
636       getDict().LogNewSplit(blob_number);
637       getDict().permute_characters(*char_choices, best_choice->rating(),
638                                    best_choice, raw_choice);
639 
640       if (old_best > best_choice->rating()) {
641         set_n_ones(best_state, char_choices->length() - 1);
642         fixpt_valid = 1;
643       }
644       else {
645         insert_new_chunk(best_state, blob_number, char_choices->length() - 2);
646         fixpt_valid = 0;
647       }
648       if (*state_count > 0) {
649         for (index = 0; index < *state_count; index++) {
650           insert_new_chunk(&chop_states[index], blob_number,
651                            char_choices->length() - 2);
652         }
653         set_n_ones(&chop_states[index], char_choices->length() - 1);
654         (*state_count)++;
655       }
656 
657       if (chop_debug)
658         print_state ("best state = ",
659           best_state, count_blobs (word->blobs) - 1);
660       if (first_pass)
661         chops_performed1++;
662       else
663         chops_performed2++;
664     } else {
665       break;
666     }
667   } while (!getDict().AcceptableChoice(char_choices, best_choice, *raw_choice,
668                                        fixpt, CHOPPER_CALLER, &replaced) &&
669            !tord_blob_skip && char_choices->length() < MAX_NUM_CHUNKS);
670   if (replaced) update_blob_classifications(word, *char_choices);
671   old_count = *state_count;
672   if (!fixpt_valid)
673     fixpt->index = -1;
674 }
675 
676 
677 /**********************************************************************
678  * select_blob_to_split
679  *
680  * These are the results of the last classification.  Find a likely
681  * place to apply splits.
682  **********************************************************************/
select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR & char_choices,float rating_ceiling,bool split_next_to_fragment)683 inT16 Wordrec::select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices,
684                                     float rating_ceiling,
685                                     bool split_next_to_fragment) {
686   BLOB_CHOICE_IT blob_choice_it;
687   BLOB_CHOICE *blob_choice;
688   BLOB_CHOICE_IT temp_it;
689   int x;
690   float worst = -MAX_FLOAT32;
691   int worst_index = -1;
692   float worst_near_fragment = -MAX_FLOAT32;
693   int worst_index_near_fragment = -1;
694   const CHAR_FRAGMENT **fragments = NULL;
695 
696   if (chop_debug) {
697     if (rating_ceiling < MAX_FLOAT32)
698       cprintf("rating_ceiling = %8.4f\n", rating_ceiling);
699     else
700       cprintf("rating_ceiling = No Limit\n");
701   }
702 
703   if (split_next_to_fragment && char_choices.length() > 0) {
704     fragments = new const CHAR_FRAGMENT *[char_choices.length()];
705     if (char_choices.get(0) != NULL) {
706       temp_it.set_to_list(char_choices.get(0));
707       fragments[0] = getDict().getUnicharset().get_fragment(
708           temp_it.data()->unichar_id());
709     } else {
710       fragments[0] = NULL;
711     }
712   }
713 
714   for (x = 0; x < char_choices.length(); ++x) {
715     if (char_choices.get(x) == NULL) {
716       if (fragments != NULL) {
717         delete[] fragments;
718       }
719       return x;
720     } else {
721       blob_choice_it.set_to_list(char_choices.get(x));
722       blob_choice = blob_choice_it.data();
723       // Populate fragments for the following position.
724       if (split_next_to_fragment && x+1 < char_choices.length()) {
725         if (char_choices.get(x+1) != NULL) {
726           temp_it.set_to_list(char_choices.get(x+1));
727           fragments[x+1] = getDict().getUnicharset().get_fragment(
728               temp_it.data()->unichar_id());
729         } else {
730           fragments[x+1] = NULL;
731         }
732       }
733       if (blob_choice->rating() < rating_ceiling &&
734           blob_choice->certainty() < tessedit_certainty_threshold) {
735         // Update worst and worst_index.
736         if (blob_choice->rating() > worst) {
737           worst_index = x;
738           worst = blob_choice->rating();
739         }
740         if (split_next_to_fragment) {
741           // Update worst_near_fragment and worst_index_near_fragment.
742           bool expand_following_fragment =
743             (x + 1 < char_choices.length() &&
744              fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
745           bool expand_preceding_fragment =
746             (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
747           if ((expand_following_fragment || expand_preceding_fragment) &&
748               blob_choice->rating() > worst_near_fragment) {
749             worst_index_near_fragment = x;
750             worst_near_fragment = blob_choice->rating();
751             if (chop_debug) {
752               cprintf("worst_index_near_fragment=%d"
753                       " expand_following_fragment=%d"
754                       " expand_preceding_fragment=%d\n",
755                       worst_index_near_fragment,
756                       expand_following_fragment,
757                       expand_preceding_fragment);
758             }
759           }
760         }
761       }
762     }
763   }
764   if (fragments != NULL) {
765     delete[] fragments;
766   }
767   // TODO(daria): maybe a threshold of badness for
768   // worst_near_fragment would be useful.
769   return worst_index_near_fragment != -1 ?
770     worst_index_near_fragment : worst_index;
771 }
772 }  // namespace tesseract
773 
774 
775 /**********************************************************************
776  * start_seam_list
777  *
778  * Initialize a list of seams that match the original number of blobs
779  * present in the starting segmentation.  Each of the seams created
780  * by this routine have location information only.
781  **********************************************************************/
start_seam_list(TBLOB * blobs)782 SEAMS start_seam_list(TBLOB *blobs) {
783   TBLOB *blob;
784   SEAMS seam_list;
785   TPOINT topleft;
786   TPOINT botright;
787   int location;
788   /* Seam slot per char */
789   seam_list = new_seam_list ();
790 
791   for (blob = blobs; blob->next != NULL; blob = blob->next) {
792 
793     blob_bounding_box(blob, &topleft, &botright);
794     location = botright.x;
795     blob_bounding_box (blob->next, &topleft, &botright);
796     location += topleft.x;
797     location /= 2;
798 
799     seam_list = add_seam (seam_list,
800       new_seam (0.0, location, NULL, NULL, NULL));
801   }
802 
803   return (seam_list);
804 }
805 
806 
807 /**********************************************************************
808  * total_containment
809  *
810  * Check to see if one of these outlines is totally contained within
811  * the bounding box of the other.
812  **********************************************************************/
total_containment(TBLOB * blob1,TBLOB * blob2)813 inT16 total_containment(TBLOB *blob1, TBLOB *blob2) {
814   TPOINT topleft1;
815   TPOINT botright1;
816   TPOINT topleft2;
817   TPOINT botright2;
818 
819   blob_bounding_box(blob1, &topleft1, &botright1);
820   blob_bounding_box(blob2, &topleft2, &botright2);
821 
822   return (bounds_inside (topleft1, botright1, topleft2, botright2) ||
823     bounds_inside (topleft2, botright2, topleft1, botright1));
824 }
825 
826 
827 /**********************************************************************
828  * word_associator
829  *
830  * Reassociate and classify the blobs in a word.  Continue this process
831  * until a good answer is found or all the possibilities have been tried.
832  **********************************************************************/
833 namespace tesseract {
word_associator(TBLOB * blobs,SEAMS seams,STATE * state,int fxid,WERD_CHOICE * best_choice,WERD_CHOICE * raw_choice,char * correct,DANGERR * fixpt,STATE * best_state)834 MATRIX *Wordrec::word_associator(TBLOB *blobs,
835                                  SEAMS seams,
836                                  STATE *state,
837                                  int fxid,
838                                  WERD_CHOICE *best_choice,
839                                  WERD_CHOICE *raw_choice,
840                                  char *correct,
841                                  DANGERR *fixpt,
842                                  STATE *best_state) {
843   CHUNKS_RECORD chunks_record;
844   BLOB_WEIGHTS blob_weights;
845   int x;
846   int num_chunks;
847   BLOB_CHOICE_IT blob_choice_it;
848 
849   num_chunks = array_count (seams) + 1;
850 
851   chunks_record.chunks = blobs;
852   chunks_record.splits = seams;
853   chunks_record.ratings = record_piece_ratings (blobs);
854   chunks_record.char_widths = blobs_widths (blobs);
855   chunks_record.chunk_widths = blobs_widths (blobs);
856   chunks_record.fx = fxid;
857   /* Save chunk weights */
858   for (x = 0; x < num_chunks; x++) {
859     BLOB_CHOICE_LIST* choices = get_piece_rating(chunks_record.ratings,
860                                                  blobs, seams, x, x);
861     blob_choice_it.set_to_list(choices);
862     //This is done by Jetsoft. Divide by zero is possible.
863     if (blob_choice_it.data()->certainty() == 0) {
864       blob_weights[x]=0;
865     } else {
866       blob_weights[x] =
867         -(inT16) (10 * blob_choice_it.data()->rating() /
868                   blob_choice_it.data()->certainty());
869     }
870   }
871   chunks_record.weights = blob_weights;
872 
873   if (chop_debug)
874     chunks_record.ratings->print(getDict().getUnicharset());
875 
876   best_first_search(&chunks_record,
877                     best_choice,
878                     raw_choice,
879                     state,
880                     fixpt,
881                     best_state);
882 
883   free_widths (chunks_record.chunk_widths);
884   free_widths (chunks_record.char_widths);
885   return chunks_record.ratings;
886 }
887 }  // namespace tesseract
888 
889