• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**********************************************************************
2  * File:        edgblob.c  (Formerly edgeloop.c)
3  * Description: Functions to clean up an outline before approximation.
4  * Author:		Ray Smith
5  * Created:		Tue Mar 26 16:56:25 GMT 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #include          "scanedg.h"
22 #include          "drawedg.h"
23 #include          "edgloop.h"
24 #include          "edgblob.h"
25 
26 #define EXTERN
27 
28 // Control parameters used in outline_complexity(), which rejects an outline
29 // if any one of the 3 conditions is satisfied:
30 //  - number of children exceeds edges_max_children_per_outline
31 //  - number of nested layers exceeds edges_max_children_layers
32 //  - joint complexity exceeds edges_children_count_limit(as in child_count())
33 EXTERN BOOL_VAR(edges_use_new_outline_complexity, FALSE,
34                 "Use the new outline complexity module");
35 EXTERN INT_VAR(edges_max_children_per_outline, 10,
36                "Max number of children inside a character outline");
37 EXTERN INT_VAR(edges_max_children_layers, 5,
38                "Max layers of nested children inside a character outline");
39 EXTERN BOOL_VAR(edges_debug, FALSE,
40                 "turn on debugging for this module");
41 
42 
43 EXTERN INT_VAR (edges_children_per_grandchild, 10,
44 "Importance ratio for chucking outlines");
45 EXTERN INT_VAR(edges_children_count_limit, 45,
46                "Max holes allowed in blob");
47 EXTERN BOOL_VAR (edges_children_fix, FALSE,
48 "Remove boxy parents of char-like children");
49 EXTERN INT_VAR (edges_min_nonhole, 12,
50 "Min pixels for potential char in box");
51 EXTERN INT_VAR (edges_patharea_ratio, 40,
52 "Max lensq/area for acceptable child outline");
53 EXTERN double_VAR (edges_childarea, 0.5,
54                   "Min area fraction of child outline");
55 EXTERN double_VAR (edges_boxarea, 0.875,
56 "Min area fraction of grandchild for box");
57 
58 /**********************************************************************
59  * OL_BUCKETS::OL_BUCKETS
60  *
61  * Construct an array of buckets for associating outlines into blobs.
62  **********************************************************************/
63 
OL_BUCKETS(ICOORD bleft,ICOORD tright)64 OL_BUCKETS::OL_BUCKETS (
65 ////constructor
66 ICOORD bleft,                    //corners
67 ICOORD tright):         bl (bleft), tr (tright) {
68   bxdim = (tright.x () - bleft.x ()) / BUCKETSIZE + 1;
69   bydim = (tright.y () - bleft.y ()) / BUCKETSIZE + 1;
70                                  //make array
71   buckets = new C_OUTLINE_LIST[bxdim * bydim];
72   index = 0;
73 }
74 
75 
76 /**********************************************************************
77  * OL_BUCKETS::operator(
78  *
79  * Return a pointer to a list of C_OUTLINEs corresponding to the
80  * given pixel coordinates.
81  **********************************************************************/
82 
83 C_OUTLINE_LIST *
operator ()(inT16 x,inT16 y)84 OL_BUCKETS::operator () (        //array access
85 inT16 x,                         //image coords
86 inT16 y) {
87   return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
88 }
89 
90 
91 /**********************************************************************
92  * OL_BUCKETS::outline_complexity
93  *
94  * This is the new version of count_child.
95  *
96  * The goal of this function is to determine if an outline and its
97  * interiors could be part of a character blob.  This is done by
98  * computing a "complexity" index for the outline, which is the return
99  * value of this function, and checking it against a threshold.
100  * The max_count is used for short-circuiting the recursion and forcing
101  * a rejection that guarantees to fail the threshold test.
102  * The complexity F for outline X with N children X[i] is
103  *   F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild
104  * so each layer of nesting increases complexity exponentially.
105  * An outline can be rejected as a text blob candidate if its complexity
106  * is too high, has too many children(likely a container), or has too
107  * many layers of nested inner loops.  This has the side-effect of
108  * flattening out boxed or reversed video text regions.
109  **********************************************************************/
110 
outline_complexity(C_OUTLINE * outline,inT32 max_count,inT16 depth)111 inT32 OL_BUCKETS::outline_complexity(
112                                      C_OUTLINE *outline,   // parent outline
113                                      inT32 max_count,      // max output
114                                      inT16 depth           // recurion depth
115                                     ) {
116   inT16 xmin, xmax;              // coord limits
117   inT16 ymin, ymax;
118   inT16 xindex, yindex;          // current bucket
119   C_OUTLINE *child;              // current child
120   inT32 child_count;             // no of children
121   inT32 grandchild_count;        // no of grandchildren
122   C_OUTLINE_IT child_it;         // search iterator
123 
124   TBOX olbox = outline->bounding_box();
125   xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
126   xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
127   ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
128   ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
129   child_count = 0;
130   grandchild_count = 0;
131   if (++depth > edges_max_children_layers)  // nested loops are too deep
132     return max_count + depth;
133 
134   for (yindex = ymin; yindex <= ymax; yindex++) {
135     for (xindex = xmin; xindex <= xmax; xindex++) {
136       child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
137       if (child_it.empty())
138         continue;
139       for (child_it.mark_cycle_pt(); !child_it.cycled_list();
140            child_it.forward()) {
141         child = child_it.data();
142         if (child == outline || !(*child < *outline))
143           continue;
144         child_count++;
145 
146         if (child_count > edges_max_children_per_outline) {   // too fragmented
147           if (edges_debug)
148             tprintf("Discard outline on child_count=%d > "
149                     "max_children_per_outline=%d\n",
150                     child_count,
151                     static_cast<inT32>(edges_max_children_per_outline));
152           return max_count + child_count;
153         }
154 
155         // Compute the "complexity" of each child recursively
156         inT32 remaining_count = max_count - child_count - grandchild_count;
157         if (remaining_count > 0)
158           grandchild_count += edges_children_per_grandchild *
159                               outline_complexity(child, remaining_count, depth);
160         if (child_count + grandchild_count > max_count) {  // too complex
161           if (edges_debug)
162             tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
163                     "> max_count=%d\n",
164                     child_count, grandchild_count, max_count);
165           return child_count + grandchild_count;
166         }
167       }
168     }
169   }
170   return child_count + grandchild_count;
171 }
172 
173 
174 /**********************************************************************
175  * OL_BUCKETS::count_children
176  *
177  * Find number of descendants of this outline.
178  **********************************************************************/
179 
count_children(C_OUTLINE * outline,inT32 max_count)180 inT32 OL_BUCKETS::count_children(                     //recursive count
181                                  C_OUTLINE *outline,  //parent outline
182                                  inT32 max_count      //max output
183                                 ) {
184   BOOL8 parent_box;              //could it be boxy
185   inT16 xmin, xmax;              //coord limits
186   inT16 ymin, ymax;
187   inT16 xindex, yindex;          //current bucket
188   C_OUTLINE *child;              //current child
189   inT32 child_count;             //no of children
190   inT32 grandchild_count;        //no of grandchildren
191   inT32 parent_area;             //potential box
192   FLOAT32 max_parent_area;       //potential box
193   inT32 child_area;              //current child
194   inT32 child_length;            //current child
195   TBOX olbox;
196   C_OUTLINE_IT child_it;         //search iterator
197 
198   olbox = outline->bounding_box ();
199   xmin = (olbox.left () - bl.x ()) / BUCKETSIZE;
200   xmax = (olbox.right () - bl.x ()) / BUCKETSIZE;
201   ymin = (olbox.bottom () - bl.y ()) / BUCKETSIZE;
202   ymax = (olbox.top () - bl.y ()) / BUCKETSIZE;
203   child_count = 0;
204   grandchild_count = 0;
205   parent_area = 0;
206   max_parent_area = 0;
207   parent_box = TRUE;
208   for (yindex = ymin; yindex <= ymax; yindex++) {
209     for (xindex = xmin; xindex <= xmax; xindex++) {
210       child_it.set_to_list (&buckets[yindex * bxdim + xindex]);
211       if (child_it.empty ())
212         continue;
213       for (child_it.mark_cycle_pt (); !child_it.cycled_list ();
214       child_it.forward ()) {
215         child = child_it.data ();
216         if (child != outline && *child < *outline) {
217           child_count++;
218           if (child_count <= max_count) {
219             int max_grand = (max_count - child_count) /
220                             edges_children_per_grandchild;
221             if (max_grand > 0)
222               grandchild_count += count_children (child, max_grand) *
223                                   edges_children_per_grandchild;
224             else
225               grandchild_count += count_children(child, 1);
226           }
227           if (child_count + grandchild_count > max_count) {
228             if (edges_debug)
229               tprintf("Discarding parent with child count=%d, gc=%d\n",
230                       child_count,grandchild_count);
231             return child_count + grandchild_count;
232           }
233           if (parent_area == 0) {
234             parent_area = outline->outer_area ();
235             if (parent_area < 0)
236               parent_area = -parent_area;
237             max_parent_area = outline->bounding_box().area() * edges_boxarea;
238             if (parent_area < max_parent_area)
239               parent_box = FALSE;
240           }
241           if (parent_box &&
242               (!edges_children_fix ||
243                child->bounding_box().height() > edges_min_nonhole)) {
244             child_area = child->outer_area ();
245             if (child_area < 0)
246               child_area = -child_area;
247             if (edges_children_fix) {
248               if (parent_area - child_area < max_parent_area) {
249                 parent_box = FALSE;
250                 continue;
251               }
252               if (grandchild_count > 0) {
253                 if (edges_debug)
254                   tprintf("Discarding parent of area %d, child area=%d, max%g "
255                           "with gc=%d\n",
256                           parent_area, child_area, max_parent_area,
257                           grandchild_count);
258                 return max_count + 1;
259               }
260               child_length = child->pathlength ();
261               if (child_length * child_length >
262               child_area * edges_patharea_ratio) {
263                 if (edges_debug)
264                   tprintf("Discarding parent of area %d, child area=%d, max%g "
265                           "with child length=%d\n",
266                           parent_area, child_area, max_parent_area,
267                           child_length);
268                 return max_count + 1;
269               }
270             }
271             if (child_area < child->bounding_box().area() * edges_childarea) {
272               if (edges_debug)
273                 tprintf("Discarding parent of area %d, child area=%d, max%g "
274                         "with child rect=%d\n",
275                         parent_area, child_area, max_parent_area,
276                         child->bounding_box().area());
277               return max_count + 1;
278             }
279           }
280         }
281       }
282     }
283   }
284   return child_count + grandchild_count;
285 }
286 
287 
288 
289 
290 /**********************************************************************
291  * OL_BUCKETS::extract_children
292  *
293  * Find number of descendants of this outline.
294  **********************************************************************/
295 
extract_children(C_OUTLINE * outline,C_OUTLINE_IT * it)296 void OL_BUCKETS::extract_children(                     //recursive count
297                                   C_OUTLINE *outline,  //parent outline
298                                   C_OUTLINE_IT *it     //destination iterator
299                                  ) {
300   inT16 xmin, xmax;              //coord limits
301   inT16 ymin, ymax;
302   inT16 xindex, yindex;          //current bucket
303   TBOX olbox;
304   C_OUTLINE_IT child_it;         //search iterator
305 
306   olbox = outline->bounding_box ();
307   xmin = (olbox.left () - bl.x ()) / BUCKETSIZE;
308   xmax = (olbox.right () - bl.x ()) / BUCKETSIZE;
309   ymin = (olbox.bottom () - bl.y ()) / BUCKETSIZE;
310   ymax = (olbox.top () - bl.y ()) / BUCKETSIZE;
311   for (yindex = ymin; yindex <= ymax; yindex++) {
312     for (xindex = xmin; xindex <= xmax; xindex++) {
313       child_it.set_to_list (&buckets[yindex * bxdim + xindex]);
314       for (child_it.mark_cycle_pt (); !child_it.cycled_list ();
315       child_it.forward ()) {
316         if (*child_it.data () < *outline) {
317           it->add_after_then_move (child_it.extract ());
318         }
319       }
320     }
321   }
322 }
323 
324 
325 /**********************************************************************
326  * extract_edges
327  *
328  * Run the edge detector over the block and return a list of blobs.
329  **********************************************************************/
330 
extract_edges(ScrollView * window,IMAGE * image,IMAGE * t_image,ICOORD page_tr,BLOCK * block)331 void extract_edges(                 //find blobs
332 #ifndef GRAPHICS_DISABLED
333                    ScrollView* window,   //window for output
334 #endif
335                    IMAGE *image,    //image to scan
336                    IMAGE *t_image,  //thresholded image
337                    ICOORD page_tr,  //corner of page
338                    BLOCK *block     //block to scan
339                   ) {
340   ICOORD bleft;                  //block box
341   ICOORD tright;
342   C_OUTLINE_LIST outlines;       //outlines in block
343                                  //iterator
344   C_OUTLINE_IT out_it = &outlines;
345 
346 #ifndef GRAPHICS_DISABLED
347   get_outlines (window, image, t_image, page_tr, (PDBLK *) block, &out_it);
348 #else
349   get_outlines (image, t_image, page_tr, (PDBLK *) block, &out_it);
350 #endif
351                                  //block box
352   block->bounding_box (bleft, tright);
353                                  //make blobs
354   outlines_to_blobs(block, bleft, tright, &outlines);
355 }
356 
357 
358 /**********************************************************************
359  * outlines_to_blobs
360  *
361  * Gather together outlines into blobs using the usual bucket sort.
362  **********************************************************************/
363 
outlines_to_blobs(BLOCK * block,ICOORD bleft,ICOORD tright,C_OUTLINE_LIST * outlines)364 void outlines_to_blobs(               //find blobs
365                        BLOCK *block,  //block to scan
366                        ICOORD bleft,
367                        ICOORD tright,
368                        C_OUTLINE_LIST *outlines) {
369                                  //make buckets
370   OL_BUCKETS buckets(bleft, tright);
371 
372   fill_buckets(outlines, &buckets);
373   empty_buckets(block, &buckets);
374 }
375 
376 
377 /**********************************************************************
378  * fill_buckets
379  *
380  * Run the edge detector over the block and return a list of blobs.
381  **********************************************************************/
382 
fill_buckets(C_OUTLINE_LIST * outlines,OL_BUCKETS * buckets)383 void fill_buckets(                           //find blobs
384                   C_OUTLINE_LIST *outlines,  //outlines in block
385                   OL_BUCKETS *buckets        //output buckets
386                  ) {
387   TBOX ol_box;                    //outline box
388   C_OUTLINE_IT out_it = outlines;//iterator
389   C_OUTLINE_IT bucket_it;        //iterator in bucket
390   C_OUTLINE *outline;            //current outline
391 
392   for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) {
393     outline = out_it.extract (); //take off list
394                                  //get box
395     ol_box = outline->bounding_box ();
396     bucket_it.set_to_list ((*buckets) (ol_box.left (), ol_box.bottom ()));
397     bucket_it.add_to_end (outline);
398   }
399 }
400 
401 
402 /**********************************************************************
403  * empty_buckets
404  *
405  * Run the edge detector over the block and return a list of blobs.
406  **********************************************************************/
407 
empty_buckets(BLOCK * block,OL_BUCKETS * buckets)408 void empty_buckets(                     //find blobs
409                    BLOCK *block,        //block to scan
410                    OL_BUCKETS *buckets  //output buckets
411                   ) {
412   BOOL8 good_blob;               //healthy blob
413   C_OUTLINE_LIST outlines;       //outlines in block
414                                  //iterator
415   C_OUTLINE_IT out_it = &outlines;
416   C_OUTLINE_IT bucket_it = buckets->start_scan ();
417   C_OUTLINE_IT parent_it;        //parent outline
418   C_BLOB *blob;                  //new blob
419   C_BLOB_IT good_blobs = block->blob_list ();
420   C_BLOB_IT junk_blobs = block->reject_blobs ();
421 
422   while (!bucket_it.empty ()) {
423     out_it.set_to_list (&outlines);
424     do {
425       parent_it = bucket_it;     //find outermost
426       do
427       bucket_it.forward ();
428       while (!bucket_it.at_first ()
429         && !(*parent_it.data () < *bucket_it.data ()));
430     }
431     while (!bucket_it.at_first ());
432 
433                                  //move to new list
434     out_it.add_after_then_move (parent_it.extract ());
435     good_blob = capture_children (buckets, &junk_blobs, &out_it);
436     blob = new C_BLOB (&outlines);
437     if (good_blob)
438       good_blobs.add_after_then_move (blob);
439     else
440       junk_blobs.add_after_then_move (blob);
441 
442     bucket_it.set_to_list (buckets->scan_next ());
443   }
444 }
445 
446 
447 /**********************************************************************
448  * capture_children
449  *
450  * Find all neighbouring outlines that are children of this outline
451  * and either move them to the output list or declare this outline
452  * illegal and return FALSE.
453  **********************************************************************/
454 
capture_children(OL_BUCKETS * buckets,C_BLOB_IT * reject_it,C_OUTLINE_IT * blob_it)455 BOOL8 capture_children(                       //find children
456                        OL_BUCKETS *buckets,   //bucket sort clanss
457                        C_BLOB_IT *reject_it,  //dead grandchildren
458                        C_OUTLINE_IT *blob_it  //output outlines
459                       ) {
460   C_OUTLINE *outline;            //master outline
461   inT32 child_count;             //no of children
462 
463   outline = blob_it->data ();
464   if (edges_use_new_outline_complexity)
465     child_count = buckets->outline_complexity(outline,
466                                                edges_children_count_limit,
467                                                0);
468   else
469     child_count = buckets->count_children(outline,
470                                            edges_children_count_limit);
471   if (child_count > edges_children_count_limit)
472     return FALSE;
473 
474   if (child_count > 0)
475   buckets->extract_children (outline, blob_it);
476   return TRUE;
477 }
478