1 /**********************************************************************
2 * File: edgblob.c (Formerly edgeloop.c)
3 * Description: Functions to clean up an outline before approximation.
4 * Author: Ray Smith
5 * Created: Tue Mar 26 16:56:25 GMT 1991
6 *
7 * (C) Copyright 1991, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #include "mfcpch.h"
21 #include "scanedg.h"
22 #include "drawedg.h"
23 #include "edgloop.h"
24 #include "edgblob.h"
25
26 #define EXTERN
27
28 // Control parameters used in outline_complexity(), which rejects an outline
29 // if any one of the 3 conditions is satisfied:
30 // - number of children exceeds edges_max_children_per_outline
31 // - number of nested layers exceeds edges_max_children_layers
32 // - joint complexity exceeds edges_children_count_limit(as in child_count())
33 EXTERN BOOL_VAR(edges_use_new_outline_complexity, FALSE,
34 "Use the new outline complexity module");
35 EXTERN INT_VAR(edges_max_children_per_outline, 10,
36 "Max number of children inside a character outline");
37 EXTERN INT_VAR(edges_max_children_layers, 5,
38 "Max layers of nested children inside a character outline");
39 EXTERN BOOL_VAR(edges_debug, FALSE,
40 "turn on debugging for this module");
41
42
43 EXTERN INT_VAR (edges_children_per_grandchild, 10,
44 "Importance ratio for chucking outlines");
45 EXTERN INT_VAR(edges_children_count_limit, 45,
46 "Max holes allowed in blob");
47 EXTERN BOOL_VAR (edges_children_fix, FALSE,
48 "Remove boxy parents of char-like children");
49 EXTERN INT_VAR (edges_min_nonhole, 12,
50 "Min pixels for potential char in box");
51 EXTERN INT_VAR (edges_patharea_ratio, 40,
52 "Max lensq/area for acceptable child outline");
53 EXTERN double_VAR (edges_childarea, 0.5,
54 "Min area fraction of child outline");
55 EXTERN double_VAR (edges_boxarea, 0.875,
56 "Min area fraction of grandchild for box");
57
58 /**********************************************************************
59 * OL_BUCKETS::OL_BUCKETS
60 *
61 * Construct an array of buckets for associating outlines into blobs.
62 **********************************************************************/
63
OL_BUCKETS(ICOORD bleft,ICOORD tright)64 OL_BUCKETS::OL_BUCKETS (
65 ////constructor
66 ICOORD bleft, //corners
67 ICOORD tright): bl (bleft), tr (tright) {
68 bxdim = (tright.x () - bleft.x ()) / BUCKETSIZE + 1;
69 bydim = (tright.y () - bleft.y ()) / BUCKETSIZE + 1;
70 //make array
71 buckets = new C_OUTLINE_LIST[bxdim * bydim];
72 index = 0;
73 }
74
75
76 /**********************************************************************
77 * OL_BUCKETS::operator(
78 *
79 * Return a pointer to a list of C_OUTLINEs corresponding to the
80 * given pixel coordinates.
81 **********************************************************************/
82
83 C_OUTLINE_LIST *
operator ()(inT16 x,inT16 y)84 OL_BUCKETS::operator () ( //array access
85 inT16 x, //image coords
86 inT16 y) {
87 return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
88 }
89
90
91 /**********************************************************************
92 * OL_BUCKETS::outline_complexity
93 *
94 * This is the new version of count_child.
95 *
96 * The goal of this function is to determine if an outline and its
97 * interiors could be part of a character blob. This is done by
98 * computing a "complexity" index for the outline, which is the return
99 * value of this function, and checking it against a threshold.
100 * The max_count is used for short-circuiting the recursion and forcing
101 * a rejection that guarantees to fail the threshold test.
102 * The complexity F for outline X with N children X[i] is
103 * F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild
104 * so each layer of nesting increases complexity exponentially.
105 * An outline can be rejected as a text blob candidate if its complexity
106 * is too high, has too many children(likely a container), or has too
107 * many layers of nested inner loops. This has the side-effect of
108 * flattening out boxed or reversed video text regions.
109 **********************************************************************/
110
outline_complexity(C_OUTLINE * outline,inT32 max_count,inT16 depth)111 inT32 OL_BUCKETS::outline_complexity(
112 C_OUTLINE *outline, // parent outline
113 inT32 max_count, // max output
114 inT16 depth // recurion depth
115 ) {
116 inT16 xmin, xmax; // coord limits
117 inT16 ymin, ymax;
118 inT16 xindex, yindex; // current bucket
119 C_OUTLINE *child; // current child
120 inT32 child_count; // no of children
121 inT32 grandchild_count; // no of grandchildren
122 C_OUTLINE_IT child_it; // search iterator
123
124 TBOX olbox = outline->bounding_box();
125 xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
126 xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
127 ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
128 ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
129 child_count = 0;
130 grandchild_count = 0;
131 if (++depth > edges_max_children_layers) // nested loops are too deep
132 return max_count + depth;
133
134 for (yindex = ymin; yindex <= ymax; yindex++) {
135 for (xindex = xmin; xindex <= xmax; xindex++) {
136 child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
137 if (child_it.empty())
138 continue;
139 for (child_it.mark_cycle_pt(); !child_it.cycled_list();
140 child_it.forward()) {
141 child = child_it.data();
142 if (child == outline || !(*child < *outline))
143 continue;
144 child_count++;
145
146 if (child_count > edges_max_children_per_outline) { // too fragmented
147 if (edges_debug)
148 tprintf("Discard outline on child_count=%d > "
149 "max_children_per_outline=%d\n",
150 child_count,
151 static_cast<inT32>(edges_max_children_per_outline));
152 return max_count + child_count;
153 }
154
155 // Compute the "complexity" of each child recursively
156 inT32 remaining_count = max_count - child_count - grandchild_count;
157 if (remaining_count > 0)
158 grandchild_count += edges_children_per_grandchild *
159 outline_complexity(child, remaining_count, depth);
160 if (child_count + grandchild_count > max_count) { // too complex
161 if (edges_debug)
162 tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
163 "> max_count=%d\n",
164 child_count, grandchild_count, max_count);
165 return child_count + grandchild_count;
166 }
167 }
168 }
169 }
170 return child_count + grandchild_count;
171 }
172
173
174 /**********************************************************************
175 * OL_BUCKETS::count_children
176 *
177 * Find number of descendants of this outline.
178 **********************************************************************/
179
count_children(C_OUTLINE * outline,inT32 max_count)180 inT32 OL_BUCKETS::count_children( //recursive count
181 C_OUTLINE *outline, //parent outline
182 inT32 max_count //max output
183 ) {
184 BOOL8 parent_box; //could it be boxy
185 inT16 xmin, xmax; //coord limits
186 inT16 ymin, ymax;
187 inT16 xindex, yindex; //current bucket
188 C_OUTLINE *child; //current child
189 inT32 child_count; //no of children
190 inT32 grandchild_count; //no of grandchildren
191 inT32 parent_area; //potential box
192 FLOAT32 max_parent_area; //potential box
193 inT32 child_area; //current child
194 inT32 child_length; //current child
195 TBOX olbox;
196 C_OUTLINE_IT child_it; //search iterator
197
198 olbox = outline->bounding_box ();
199 xmin = (olbox.left () - bl.x ()) / BUCKETSIZE;
200 xmax = (olbox.right () - bl.x ()) / BUCKETSIZE;
201 ymin = (olbox.bottom () - bl.y ()) / BUCKETSIZE;
202 ymax = (olbox.top () - bl.y ()) / BUCKETSIZE;
203 child_count = 0;
204 grandchild_count = 0;
205 parent_area = 0;
206 max_parent_area = 0;
207 parent_box = TRUE;
208 for (yindex = ymin; yindex <= ymax; yindex++) {
209 for (xindex = xmin; xindex <= xmax; xindex++) {
210 child_it.set_to_list (&buckets[yindex * bxdim + xindex]);
211 if (child_it.empty ())
212 continue;
213 for (child_it.mark_cycle_pt (); !child_it.cycled_list ();
214 child_it.forward ()) {
215 child = child_it.data ();
216 if (child != outline && *child < *outline) {
217 child_count++;
218 if (child_count <= max_count) {
219 int max_grand = (max_count - child_count) /
220 edges_children_per_grandchild;
221 if (max_grand > 0)
222 grandchild_count += count_children (child, max_grand) *
223 edges_children_per_grandchild;
224 else
225 grandchild_count += count_children(child, 1);
226 }
227 if (child_count + grandchild_count > max_count) {
228 if (edges_debug)
229 tprintf("Discarding parent with child count=%d, gc=%d\n",
230 child_count,grandchild_count);
231 return child_count + grandchild_count;
232 }
233 if (parent_area == 0) {
234 parent_area = outline->outer_area ();
235 if (parent_area < 0)
236 parent_area = -parent_area;
237 max_parent_area = outline->bounding_box().area() * edges_boxarea;
238 if (parent_area < max_parent_area)
239 parent_box = FALSE;
240 }
241 if (parent_box &&
242 (!edges_children_fix ||
243 child->bounding_box().height() > edges_min_nonhole)) {
244 child_area = child->outer_area ();
245 if (child_area < 0)
246 child_area = -child_area;
247 if (edges_children_fix) {
248 if (parent_area - child_area < max_parent_area) {
249 parent_box = FALSE;
250 continue;
251 }
252 if (grandchild_count > 0) {
253 if (edges_debug)
254 tprintf("Discarding parent of area %d, child area=%d, max%g "
255 "with gc=%d\n",
256 parent_area, child_area, max_parent_area,
257 grandchild_count);
258 return max_count + 1;
259 }
260 child_length = child->pathlength ();
261 if (child_length * child_length >
262 child_area * edges_patharea_ratio) {
263 if (edges_debug)
264 tprintf("Discarding parent of area %d, child area=%d, max%g "
265 "with child length=%d\n",
266 parent_area, child_area, max_parent_area,
267 child_length);
268 return max_count + 1;
269 }
270 }
271 if (child_area < child->bounding_box().area() * edges_childarea) {
272 if (edges_debug)
273 tprintf("Discarding parent of area %d, child area=%d, max%g "
274 "with child rect=%d\n",
275 parent_area, child_area, max_parent_area,
276 child->bounding_box().area());
277 return max_count + 1;
278 }
279 }
280 }
281 }
282 }
283 }
284 return child_count + grandchild_count;
285 }
286
287
288
289
290 /**********************************************************************
291 * OL_BUCKETS::extract_children
292 *
293 * Find number of descendants of this outline.
294 **********************************************************************/
295
extract_children(C_OUTLINE * outline,C_OUTLINE_IT * it)296 void OL_BUCKETS::extract_children( //recursive count
297 C_OUTLINE *outline, //parent outline
298 C_OUTLINE_IT *it //destination iterator
299 ) {
300 inT16 xmin, xmax; //coord limits
301 inT16 ymin, ymax;
302 inT16 xindex, yindex; //current bucket
303 TBOX olbox;
304 C_OUTLINE_IT child_it; //search iterator
305
306 olbox = outline->bounding_box ();
307 xmin = (olbox.left () - bl.x ()) / BUCKETSIZE;
308 xmax = (olbox.right () - bl.x ()) / BUCKETSIZE;
309 ymin = (olbox.bottom () - bl.y ()) / BUCKETSIZE;
310 ymax = (olbox.top () - bl.y ()) / BUCKETSIZE;
311 for (yindex = ymin; yindex <= ymax; yindex++) {
312 for (xindex = xmin; xindex <= xmax; xindex++) {
313 child_it.set_to_list (&buckets[yindex * bxdim + xindex]);
314 for (child_it.mark_cycle_pt (); !child_it.cycled_list ();
315 child_it.forward ()) {
316 if (*child_it.data () < *outline) {
317 it->add_after_then_move (child_it.extract ());
318 }
319 }
320 }
321 }
322 }
323
324
325 /**********************************************************************
326 * extract_edges
327 *
328 * Run the edge detector over the block and return a list of blobs.
329 **********************************************************************/
330
extract_edges(ScrollView * window,IMAGE * image,IMAGE * t_image,ICOORD page_tr,BLOCK * block)331 void extract_edges( //find blobs
332 #ifndef GRAPHICS_DISABLED
333 ScrollView* window, //window for output
334 #endif
335 IMAGE *image, //image to scan
336 IMAGE *t_image, //thresholded image
337 ICOORD page_tr, //corner of page
338 BLOCK *block //block to scan
339 ) {
340 ICOORD bleft; //block box
341 ICOORD tright;
342 C_OUTLINE_LIST outlines; //outlines in block
343 //iterator
344 C_OUTLINE_IT out_it = &outlines;
345
346 #ifndef GRAPHICS_DISABLED
347 get_outlines (window, image, t_image, page_tr, (PDBLK *) block, &out_it);
348 #else
349 get_outlines (image, t_image, page_tr, (PDBLK *) block, &out_it);
350 #endif
351 //block box
352 block->bounding_box (bleft, tright);
353 //make blobs
354 outlines_to_blobs(block, bleft, tright, &outlines);
355 }
356
357
358 /**********************************************************************
359 * outlines_to_blobs
360 *
361 * Gather together outlines into blobs using the usual bucket sort.
362 **********************************************************************/
363
outlines_to_blobs(BLOCK * block,ICOORD bleft,ICOORD tright,C_OUTLINE_LIST * outlines)364 void outlines_to_blobs( //find blobs
365 BLOCK *block, //block to scan
366 ICOORD bleft,
367 ICOORD tright,
368 C_OUTLINE_LIST *outlines) {
369 //make buckets
370 OL_BUCKETS buckets(bleft, tright);
371
372 fill_buckets(outlines, &buckets);
373 empty_buckets(block, &buckets);
374 }
375
376
377 /**********************************************************************
378 * fill_buckets
379 *
380 * Run the edge detector over the block and return a list of blobs.
381 **********************************************************************/
382
fill_buckets(C_OUTLINE_LIST * outlines,OL_BUCKETS * buckets)383 void fill_buckets( //find blobs
384 C_OUTLINE_LIST *outlines, //outlines in block
385 OL_BUCKETS *buckets //output buckets
386 ) {
387 TBOX ol_box; //outline box
388 C_OUTLINE_IT out_it = outlines;//iterator
389 C_OUTLINE_IT bucket_it; //iterator in bucket
390 C_OUTLINE *outline; //current outline
391
392 for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) {
393 outline = out_it.extract (); //take off list
394 //get box
395 ol_box = outline->bounding_box ();
396 bucket_it.set_to_list ((*buckets) (ol_box.left (), ol_box.bottom ()));
397 bucket_it.add_to_end (outline);
398 }
399 }
400
401
402 /**********************************************************************
403 * empty_buckets
404 *
405 * Run the edge detector over the block and return a list of blobs.
406 **********************************************************************/
407
empty_buckets(BLOCK * block,OL_BUCKETS * buckets)408 void empty_buckets( //find blobs
409 BLOCK *block, //block to scan
410 OL_BUCKETS *buckets //output buckets
411 ) {
412 BOOL8 good_blob; //healthy blob
413 C_OUTLINE_LIST outlines; //outlines in block
414 //iterator
415 C_OUTLINE_IT out_it = &outlines;
416 C_OUTLINE_IT bucket_it = buckets->start_scan ();
417 C_OUTLINE_IT parent_it; //parent outline
418 C_BLOB *blob; //new blob
419 C_BLOB_IT good_blobs = block->blob_list ();
420 C_BLOB_IT junk_blobs = block->reject_blobs ();
421
422 while (!bucket_it.empty ()) {
423 out_it.set_to_list (&outlines);
424 do {
425 parent_it = bucket_it; //find outermost
426 do
427 bucket_it.forward ();
428 while (!bucket_it.at_first ()
429 && !(*parent_it.data () < *bucket_it.data ()));
430 }
431 while (!bucket_it.at_first ());
432
433 //move to new list
434 out_it.add_after_then_move (parent_it.extract ());
435 good_blob = capture_children (buckets, &junk_blobs, &out_it);
436 blob = new C_BLOB (&outlines);
437 if (good_blob)
438 good_blobs.add_after_then_move (blob);
439 else
440 junk_blobs.add_after_then_move (blob);
441
442 bucket_it.set_to_list (buckets->scan_next ());
443 }
444 }
445
446
447 /**********************************************************************
448 * capture_children
449 *
450 * Find all neighbouring outlines that are children of this outline
451 * and either move them to the output list or declare this outline
452 * illegal and return FALSE.
453 **********************************************************************/
454
capture_children(OL_BUCKETS * buckets,C_BLOB_IT * reject_it,C_OUTLINE_IT * blob_it)455 BOOL8 capture_children( //find children
456 OL_BUCKETS *buckets, //bucket sort clanss
457 C_BLOB_IT *reject_it, //dead grandchildren
458 C_OUTLINE_IT *blob_it //output outlines
459 ) {
460 C_OUTLINE *outline; //master outline
461 inT32 child_count; //no of children
462
463 outline = blob_it->data ();
464 if (edges_use_new_outline_complexity)
465 child_count = buckets->outline_complexity(outline,
466 edges_children_count_limit,
467 0);
468 else
469 child_count = buckets->count_children(outline,
470 edges_children_count_limit);
471 if (child_count > edges_children_count_limit)
472 return FALSE;
473
474 if (child_count > 0)
475 buckets->extract_children (outline, blob_it);
476 return TRUE;
477 }
478