1 ///////////////////////////////////////////////////////////////////////
2 // File: tablefind.cpp
3 // Description: Helper classes to find tables from ColPartitions.
4 // Author: Faisal Shafait (faisal.shafait@dfki.de)
5 // Created: Tue Jan 06 11:13:01 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19
20 #include "colfind.h"
21 #include <math.h>
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 #ifdef HAVE_LIBLEPT
26 #include "allheaders.h"
27 #endif
28
29 namespace tesseract {
30
31 // Maximum vertical spacing between neighbor partitions
32 const int kMaxVerticalSpacing = 500;
33
34 // Minimum number of components in a text partition. A partition having fewer
35 // components than that is more likely a data partition and is a candidate
36 // table cell.
37 const int kMinBoxesInTextPartition = 10;
38
39 // Maximum number of components that a data partition can have
40 const int kMaxBoxesInDataPartition = 20;
41
42 // Maximum allowed gap in a text partitions as a multiple of its median size.
43 const double kMaxGapInTextPartition = 4.0;
44
45 // Minimum value that the maximum gap in a text partition should have as a
46 // factor of its median size.
47 const double kMinMaxGapInTextPartition = 0.5;
48
49 // Maximum x-height a table partition can have as a multiple of global
50 // median x-height
51 const double kMaxTableCellXheight = 2.0;
52
53 // Maximum line spacing between a table column header and column contents
54 // for merging the two
55 const int kMaxColumnHeaderDistance = 100;
56
57 // Minimum ratio of num_table_partitions to num_text_partitions in a column
58 // block to be called it a table column
59 const double kTableColumnThreshold = 3.0;
60
61 // Search for horizontal ruling lines within the vertical margin as a
62 // multiple of grid size
63 const int kRulingVerticalMargin = 3;
64
65 // Minimum overlap that a colpartition must have with a table region
66 // to become part of that table
67 const double kMinOverlapWithTable = 0.6;
68
69 // Maximum side space (distance from column boundary) that a typical
70 // text-line in flowing text should have as a multiple of its x-height
71 // (Median size).
72 const int kSideSpaceMargin = 10;
73
74 // Fraction of the peak of x-projection of a table region to set the
75 // threshold for the x-projection histogram
76 const double kProjectionThreshold = 0.35;
77
78 // Minmimum number of rows in a table
79 const int kMinRowsInTable = 3;
80
81 BOOL_VAR(textord_dump_table_images, false, "Paint table detection output");
82 BOOL_VAR(textord_show_tables, false, "Show table regions");
83
84 ELISTIZE(ColSegment)
CLISTIZE(ColSegment)85 CLISTIZE(ColSegment)
86
87 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
88 // insert dot-like noise into period_grid_
89 void ColumnFinder::GetCleanPartitions(TO_BLOCK* block) {
90 double min_dim = block->line_size/3.0;
91 // Initialize clean partitions list and grid
92 clean_part_grid_.Init(gridsize(), bleft(), tright());
93 period_grid_.Init(gridsize(), bleft(), tright());
94 // Iterate the ColPartitions in the grid.
95 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
96 gsearch(&part_grid_);
97 gsearch.StartFullSearch();
98 ColPartition* part;
99 while ((part = gsearch.NextFullSearch()) != NULL) {
100 ColPartition* clean_part = part->ShallowCopy();
101 // Insert all non-text partitions to clean_parts
102 if (!part->IsTextType()) {
103 clean_part_grid_.InsertBBox(true, true, clean_part);
104 continue;
105 }
106 // Insert text colpartitions after removing noisy components from them
107 BLOBNBOX_CLIST* part_boxes = part->boxes();
108 BLOBNBOX_C_IT pit(part_boxes);
109 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
110 BLOBNBOX *pblob = pit.data();
111 if (!pblob->noise_flag()) {
112 clean_part->AddBox(pblob);
113 } else {
114 TBOX blob_box = pblob->bounding_box();
115 if (blob_box.height() < min_dim && blob_box.width() < 2*min_dim) {
116 period_grid_.InsertBBox(false, false, pblob);
117 }
118 }
119 }
120 if (!clean_part->IsEmpty())
121 clean_part_grid_.InsertBBox(true, true, clean_part);
122 }
123
124 // TODO(rays) This is the previous period blob code. Neither is completely
125 // satisfactory, as a more disciplined approach to noise removal would be
126 // better, so revisit this choice and decide what to keep when the earlier
127 // stages do a better job of noise removal.
128 #if 0
129 BLOBNBOX_IT sit(&block->small_blobs);
130 BLOBNBOX_IT nit(&block->noise_blobs);
131 BLOBNBOX_IT it(&period_blobs_);
132 // Insert dot sized boxes from small_blobs into period_blobs_
133 for (sit.mark_cycle_pt(); !sit.cycled_list(); sit.forward()) {
134 BLOBNBOX * blob = sit.data();
135 TBOX blob_box = blob->bounding_box();
136 if (blob_box.height() < min_dim && blob_box.width() < 2*min_dim) {
137 it.add_after_then_move(sit.extract());
138 }
139 }
140 // Insert dot sized boxes from noise_blobs into period_blobs_
141 for (nit.mark_cycle_pt(); !nit.cycled_list(); nit.forward()) {
142 BLOBNBOX * blob = nit.data();
143 TBOX blob_box = blob->bounding_box();
144 if (blob_box.height() < min_dim && blob_box.width() < 2*min_dim) {
145 it.add_after_then_move(nit.extract());
146 }
147 }
148 InsertBlobList(false, false, false, &period_blobs_, false, &period_grid_);
149 #endif
150 }
151
152 // High level function to perform table detection
LocateTables()153 void ColumnFinder::LocateTables() {
154 // Make single-column blocks from good_columns_ partitions. col_segments are
155 // moved to a grid later which takes the ownership
156 ColSegment_LIST column_blocks;
157 GetColumnBlocks(&column_blocks);
158
159 SetPartitionSpacings();
160
161 // Mark ColPartitions as being candidate table partition depending on
162 // the inter-word spacing
163 GridMarkTablePartitions();
164 FilterFalseAlarms();
165 SmoothTablePartitionRuns();
166
167 // Set the ratio of candidate table partitions in each column
168 SetColumnsType(&column_blocks);
169
170 // Move column segments to col_seg_grid_
171 MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
172
173 // Detect split in column layout that might have occured due to the
174 // presence of a table. In such a case, merge the corresponding columns.
175 GridMergeColumnBlocks();
176
177 // Group horizontally overlapping table partitions into table columns.
178 // table_columns created here get deleted at the end of this method.
179 ColSegment_LIST table_columns;
180 GetTableColumns(&table_columns);
181
182 // Within each column, mark the range table regions occupy based on the
183 // table columns detected. table_regions are moved to a grid later which
184 // takes the ownership
185 ColSegment_LIST table_regions;
186 GetTableRegions(&table_columns, &table_regions);
187
188 // Merge table regions across columns for tables spanning multiple
189 // columns
190 MoveColSegmentsToGrid(&table_regions, &table_grid_);
191 GridMergeTableRegions();
192
193 // Adjust table boundaries by including nearby horizontal lines and left
194 // out column headers
195 AdjustTableBoundaries();
196 GridMergeTableRegions();
197
198 // Remove false alarms consiting of a single column
199 DeleteSingleColumnTables();
200
201 if (textord_show_tables) {
202 ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
203 DisplayColPartitions(table_win, ScrollView::BLUE);
204 DisplayColSegments(&table_columns, table_win, ScrollView::GREEN);
205 table_grid_.DisplayBoxes(table_win);
206 }
207
208 if (textord_dump_table_images)
209 WriteToPix();
210
211 // Merge all colpartitions in table regions to make them a single
212 // colpartition and revert types of isolated table cells not
213 // assigned to any table to their original types.
214 MakeTableBlocks();
215 }
216
217 // Make single-column blocks from good_columns_ partitions.
GetColumnBlocks(ColSegment_LIST * column_blocks)218 void ColumnFinder::GetColumnBlocks(ColSegment_LIST* column_blocks) {
219 for (int i = 0; i < gridheight_; ++i) {
220 ColPartitionSet* columns = best_columns_[i];
221 if (columns != NULL) {
222 ColSegment_LIST new_blocks;
223 // Get boxes from the current vertical position on the grid
224 columns->GetColumnBoxes(i*gridsize_, (i + 1) * gridsize_, &new_blocks);
225 // Merge the new_blocks boxes into column_blocks if they are well-aligned
226 GroupColumnBlocks(&new_blocks, column_blocks);
227 }
228 }
229 }
230
231 // Merge column segments into the current list if they are well aligned.
GroupColumnBlocks(ColSegment_LIST * new_blocks,ColSegment_LIST * column_blocks)232 void ColumnFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks,
233 ColSegment_LIST* column_blocks) {
234 ColSegment_IT src_it(new_blocks);
235 ColSegment_IT dest_it(column_blocks);
236 // iterate through the source list
237 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
238 ColSegment* src_seg = src_it.data();
239 TBOX src_box = src_seg->bounding_box();
240 bool match_found = false;
241 // iterate through the destination list to find a matching column block
242 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
243 ColSegment* dest_seg = dest_it.data();
244 TBOX dest_box = dest_seg->bounding_box();
245 if (ConsecutiveBoxes(src_box, dest_box)) {
246 // If matching block is found, insert the current block into it
247 // and delete the soure block
248 dest_seg->InsertBox(src_box);
249 match_found = true;
250 delete src_it.extract();
251 break;
252 }
253 }
254 // If no match is found, just append the source block to column_blocks
255 if (!match_found) {
256 dest_it.add_after_then_move(src_it.extract());
257 }
258 }
259 }
260
261 // are the two boxes immediate neighbors along the vertical direction
ConsecutiveBoxes(const TBOX & b1,const TBOX & b2)262 bool ColumnFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
263 int x_margin = 20;
264 int y_margin = 5;
265 return (abs(b1.left() - b2.left()) < x_margin) &&
266 (abs(b1.right() - b2.right()) < x_margin) &&
267 (abs(b1.top()-b2.bottom()) < y_margin ||
268 abs(b2.top()-b1.bottom()) < y_margin);
269 }
270
271 // Set left, right and top, bottom spacings of each colpartition.
SetPartitionSpacings()272 void ColumnFinder::SetPartitionSpacings() {
273 // Iterate the ColPartitions in the grid.
274 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
275 gsearch(&clean_part_grid_);
276 gsearch.StartFullSearch();
277 ColPartition* part;
278 while ((part = gsearch.NextFullSearch()) != NULL) {
279 ColPartitionSet* columns = best_columns_[gsearch.GridY()];
280 TBOX box = part->bounding_box();
281 int y = part->MidY();
282 ColPartition* left_column = columns->ColumnContaining(box.left(), y);
283 ColPartition* right_column = columns->ColumnContaining(box.right(), y);
284 // set distance from left column as space to the left
285 if (left_column) {
286 int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
287 part->set_space_to_left(left_space);
288 }
289 // set distance from right column as space to the right
290 if (right_column) {
291 int right_space = MAX(0, right_column->RightAtY(y) - box.right());
292 part->set_space_to_right(right_space);
293 }
294 SetVerticalSpacing(part);
295 }
296 SetGlobalSpacings();
297 }
298
299 // Set spacing and closest neighbors above and below a given colpartition.
SetVerticalSpacing(ColPartition * part)300 void ColumnFinder::SetVerticalSpacing(ColPartition* part) {
301 TBOX box = part->bounding_box();
302 int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
303 int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
304 box.set_top(top_range);
305 box.set_bottom(bottom_range);
306
307 TBOX part_box = part->bounding_box();
308 // Start a rect search
309 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
310 rectsearch(&clean_part_grid_);
311 rectsearch.StartRectSearch(box);
312 ColPartition* neighbor;
313 int min_space_above = kMaxVerticalSpacing;
314 int min_space_below = kMaxVerticalSpacing;
315 ColPartition* above_neighbor = NULL;
316 ColPartition* below_neighbor = NULL;
317 while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
318 if (neighbor == part)
319 continue;
320 TBOX neighbor_box = neighbor->bounding_box();
321 if (neighbor_box.major_x_overlap(part_box)) {
322 int gap = abs(part->median_bottom() - neighbor->median_bottom());
323 // If neighbor is below current partition
324 if (neighbor_box.top() < part_box.bottom() &&
325 gap < min_space_below) {
326 min_space_below = gap;
327 below_neighbor = neighbor;
328 } // If neighbor is above current partition
329 else if (part_box.top() < neighbor_box.bottom() &&
330 gap < min_space_above) {
331 min_space_above = gap;
332 above_neighbor = neighbor;
333 }
334 }
335 }
336 part->set_space_above(min_space_above);
337 part->set_space_below(min_space_below);
338 part->set_nearest_neighbor_above(above_neighbor);
339 part->set_nearest_neighbor_below(below_neighbor);
340 }
341
342 // Set global spacing and x-height estimates
SetGlobalSpacings()343 void ColumnFinder::SetGlobalSpacings() {
344 STATS xheight_stats(0, kMaxVerticalSpacing + 1);
345 STATS ledding_stats(0, kMaxVerticalSpacing + 1);
346 // Iterate the ColPartitions in the grid.
347 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
348 gsearch(&clean_part_grid_);
349 gsearch.StartFullSearch();
350 ColPartition* part;
351 while ((part = gsearch.NextFullSearch()) != NULL) {
352 if (part->IsTextType()) {
353 xheight_stats.add(part->median_size(), 1);
354 ledding_stats.add(part->space_above(), 1);
355 ledding_stats.add(part->space_below(), 1);
356 }
357 }
358 // Set estimates based on median of statistics obtained
359 global_median_xheight_ = static_cast<int>(xheight_stats.median() + 0.5);
360 global_median_ledding_ = static_cast<int>(ledding_stats.median() + 0.5);
361 if (textord_show_tables) {
362 ScrollView* stats_win = MakeWindow(500, 10,
363 "X-height and ledding histograms");
364 xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
365 ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
366 }
367 }
368
369 // Three types of partitions are maked as table partitions:
370 // 1- Partitions that have at lease one large gap between words
371 // 2- Partitions that consist of only one word (no significant gap
372 // between components)
373 // 3- Partitions that vertically overlap with other partitions within the
374 // same column.
GridMarkTablePartitions()375 void ColumnFinder::GridMarkTablePartitions() {
376 // Iterate the ColPartitions in the grid.
377 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
378 gsearch(&clean_part_grid_);
379 gsearch.StartFullSearch();
380 ColPartition* part;
381 while ((part = gsearch.NextFullSearch()) != NULL) {
382 if (!part->IsTextType()) // Only consider text partitions
383 continue;
384 // Only consider partitions in dominant font size or smaller
385 if (part->median_size() > kMaxTableCellXheight * global_median_xheight_)
386 continue;
387 // Mark partitions with a large gap, or no significant gap as
388 // table partitions.
389 // Comments: It produces several false alarms at:
390 // - last line of a paragraph
391 // - single word section headings
392 // - page headers and footers
393 // - numbered equations
394 // - line drawing regions
395 // TODO(faisal): detect and fix above-mentioned cases
396 if (HasWideOrNoInterWordGap(part)) {
397 part->set_table_type();
398 }
399 }
400 }
401
402 // Check if the partition has at lease one large gap between words or no
403 // significant gap at all
HasWideOrNoInterWordGap(ColPartition * part)404 bool ColumnFinder::HasWideOrNoInterWordGap(ColPartition* part) {
405 BLOBNBOX_CLIST* part_boxes = part->boxes();
406 BLOBNBOX_C_IT pit(part_boxes);
407
408 if (part->bounding_box().width() <
409 kMinBoxesInTextPartition * part->median_size() &&
410 pit.length() < kMinBoxesInTextPartition)
411 return true;
412
413 // Make a copy of the components in the current partition and insert periods
414 // into it to compute gaps while taking periods into account.
415 BLOBNBOX_CLIST boxes;
416 BLOBNBOX_C_IT it(&boxes);
417 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
418 BLOBNBOX *pblob = pit.data();
419 it.add_after_then_move(pblob);
420 }
421 // Start rectangular search to find periods in this partition
422 GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> rectsearch(&period_grid_);
423 TBOX part_box = part->bounding_box();
424 rectsearch.StartRectSearch(part_box);
425 BLOBNBOX* period;
426 while ((period = rectsearch.NextRectSearch()) != NULL) {
427 // Insert a period only if it lies in a gap between two consecutive boxes
428 if (LiesInGap(period, &boxes))
429 boxes.add_sorted(SortByBoxLeft<BLOBNBOX>, true, period);
430 }
431
432 int current_x0;
433 int current_x1;
434 int previous_x1 = -1;
435 int max_partition_gap = -1;
436 double max_gap = kMaxGapInTextPartition * part->median_size();
437 double min_gap = kMinMaxGapInTextPartition * part->median_size();
438
439 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
440 BLOBNBOX *blob = it.data();
441 current_x0 = blob->bounding_box().left();
442 current_x1 = blob->bounding_box().right();
443 if (previous_x1 != -1) {
444 int gap = current_x0 - previous_x1;
445 // If a large enough gap is found, mark it as a table cell (return true)
446 if (gap > max_gap)
447 return true;
448 if (gap > max_partition_gap)
449 max_partition_gap = gap;
450 }
451 previous_x1 = current_x1;
452 }
453 // Since no large gap was found, return false if the partition is too
454 // long to be a data cell
455 if (part->bounding_box().width() >
456 kMaxBoxesInDataPartition * part->median_size() ||
457 pit.length() > kMaxBoxesInDataPartition)
458 return false;
459
460 // return true if the maximum gap found is smaller than the minimum allowed
461 // max_gap in a text partition
462 return (max_partition_gap < min_gap);
463 }
464
465 // Check if the period lies in a gap between consecutive boxes
LiesInGap(BLOBNBOX * period,BLOBNBOX_CLIST * boxes)466 bool ColumnFinder::LiesInGap(BLOBNBOX* period, BLOBNBOX_CLIST* boxes) {
467 BLOBNBOX_C_IT it(boxes);
468 TBOX period_box = period->bounding_box();
469 int num_boxes = it.length();
470 // skip the first element since it has no gap to its left.
471 it.forward();
472 for (int i = 1; i < num_boxes; i++) {
473 TBOX box = it.data()->bounding_box();
474 TBOX previous_blob = it.data_relative(-1)->bounding_box();
475 TBOX gap_box = TBOX(previous_blob.botright(), box.topleft());
476 if (gap_box.major_overlap(period_box)) {
477 return true;
478 }
479 it.forward();
480 }
481 return false;
482 }
483
484 // Filter individual text partitions marked as table partitions
485 // consisting of paragraph endings, small section headings, and
486 // headers and footers.
FilterFalseAlarms()487 void ColumnFinder::FilterFalseAlarms() {
488 // Detect last line of paragraph
489 // Iterate the ColPartitions in the grid.
490 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
491 gsearch(&clean_part_grid_);
492 gsearch.StartFullSearch();
493 ColPartition* part;
494 while ((part = gsearch.NextFullSearch()) != NULL) {
495 if (part->type() != PT_TABLE)
496 continue; // Consider only table partitions
497
498 // Paragraph ending should have flowing text above it.
499 ColPartition* upper_part = part->nearest_neighbor_above();
500 if (!upper_part)
501 continue;
502 if (upper_part->type() != PT_FLOWING_TEXT)
503 continue;
504 if (upper_part->bounding_box().width() <
505 2 * part->bounding_box().width())
506 continue;
507 // Paragraph ending should be left-aligned to text line above it.
508 if (abs(part->bounding_box().left() - upper_part->bounding_box().left())
509 > global_median_xheight_)
510 continue;
511 // Ledding above the line should be less than ledding below
512 if (part->space_above() < part->space_below() &&
513 part->space_above() <= 2 * global_median_ledding_)
514 part->clear_table_type();
515 }
516 // Consider top-most text colpartition as header and bottom most as footer
517 ColPartition* header = NULL;
518 ColPartition* footer = NULL;
519 int max_top = -MAX_INT32;
520 int min_bottom = MAX_INT32;
521 gsearch.StartFullSearch();
522 while ((part = gsearch.NextFullSearch()) != NULL) {
523 if (!part->IsTextType())
524 continue; // Consider only text partitions
525 int top = part->bounding_box().top();
526 int bottom = part->bounding_box().bottom();
527 if (top > max_top) {
528 max_top = top;
529 header = part;
530 }
531 if (bottom < min_bottom) {
532 min_bottom = bottom;
533 footer = part;
534 }
535 }
536 if (header)
537 header->clear_table_type();
538 if (footer)
539 footer->clear_table_type();
540 }
541
542 // Mark all ColPartitions as table cells that have a table cell above
543 // and below them
544 // TODO(faisal): This is too aggressive at the moment. The method needs to
545 // consider spacing and alignment as well. Detection of false alarm table cells
546 // should also be done as part of it.
SmoothTablePartitionRuns()547 void ColumnFinder::SmoothTablePartitionRuns() {
548 // Iterate the ColPartitions in the grid.
549 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
550 gsearch(&clean_part_grid_);
551 gsearch.StartFullSearch();
552 ColPartition* part;
553 while ((part = gsearch.NextFullSearch()) != NULL) {
554 if (part->type() >= PT_TABLE)
555 continue; // Consider only text partitions
556 ColPartition* upper_part = part->nearest_neighbor_above();
557 ColPartition* lower_part = part->nearest_neighbor_below();
558 if (!upper_part || !lower_part)
559 continue;
560 if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
561 part->set_table_type();
562 }
563 }
564
565 // Set the type of a column segment based on the ratio of table to text cells
SetColumnsType(ColSegment_LIST * column_blocks)566 void ColumnFinder::SetColumnsType(ColSegment_LIST* column_blocks) {
567 ColSegment_IT it(column_blocks);
568 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
569 ColSegment* seg = it.data();
570 TBOX box = seg->bounding_box();
571 int num_table_cells = 0;
572 int num_text_cells = 0;
573 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
574 rsearch(&clean_part_grid_);
575 rsearch.StartRectSearch(box);
576 ColPartition* part;
577 while ((part = rsearch.NextRectSearch()) != NULL) {
578 if (!rsearch.ReturnedSeedElement())
579 continue; // Consider each partition only once
580 if (part->type() == PT_TABLE) {
581 num_table_cells++;
582 } else if (part->type() == PT_FLOWING_TEXT) {
583 num_text_cells++;
584 }
585 }
586 // If a column block has no text or table partition in it, it is not needed
587 // for table detection.
588 if (!num_table_cells && !num_text_cells) {
589 delete it.extract();
590 } else {
591 seg->set_num_table_cells(num_table_cells);
592 seg->set_num_text_cells(num_text_cells);
593 // set column type based on the ratio of table to text cells
594 seg->set_type();
595 }
596 }
597 }
598
599 // Move column blocks to grid
MoveColSegmentsToGrid(ColSegment_LIST * segments,ColSegmentGrid * col_seg_grid)600 void ColumnFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
601 ColSegmentGrid *col_seg_grid) {
602 col_seg_grid->Init(gridsize(), bleft(), tright());
603 ColSegment_IT it(segments);
604 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
605 ColSegment* seg = it.extract();
606 col_seg_grid->InsertBBox(true, true, seg);
607 }
608 }
609
610 // Merge column blocks if a split is detected due to the presence of a
611 // table. A text block is considered split if it has multiple
612 // neighboring blocks above/below it, and at least one of the
613 // neighboring blocks is of table type (has a high density of table
614 // partitions). In this case neighboring blocks in the direction
615 // (above/below) of the table block are merged with the text block.
616
617 // Comment: This method does not handle split due to a full page table
618 // since table columns in this case do not have a text column on which
619 // split decision can be based.
GridMergeColumnBlocks()620 void ColumnFinder::GridMergeColumnBlocks() {
621 int margin = gridsize_;
622
623 // Iterate the Column Blocks in the grid.
624 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
625 gsearch(&col_seg_grid_);
626 gsearch.StartFullSearch();
627 ColSegment* seg;
628 while ((seg = gsearch.NextFullSearch()) != NULL) {
629 if (seg->type() != COL_TEXT)
630 continue; // only consider text blocks for split detection
631 bool neighbor_found = false;
632 bool modified = false; // Modified at least once
633 // keep expanding current box as long as neighboring table columns
634 // are found above or below it.
635 do {
636 TBOX box = seg->bounding_box();
637 // slightly expand the search region vertically
638 int top_range = MIN(box.top() + margin, tright().y());
639 int bottom_range = MAX(box.bottom() - margin, bleft().y());
640 box.set_top(top_range);
641 box.set_bottom(bottom_range);
642 neighbor_found = false;
643 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
644 rectsearch(&col_seg_grid_);
645 rectsearch.StartRectSearch(box);
646 ColSegment* neighbor;
647 while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
648 if (neighbor == seg)
649 continue;
650 TBOX neighbor_box = neighbor->bounding_box();
651 // If the neighbor box significantly overlaps with the current
652 // box (due to the expansion of the current box in the
653 // previous iteration of this loop), remove the neighbor box
654 // and expand the current box to include it.
655 if (neighbor_box.overlap_fraction(box) >= 0.9) {
656 seg->InsertBox(neighbor_box);
657 modified = true;
658 rectsearch.RemoveBBox();
659 gsearch.RepositionIterator();
660 delete neighbor;
661 continue;
662 }
663 // Only expand if the neighbor box is of table type
664 if (neighbor->type() != COL_TABLE)
665 continue;
666 // Insert the neighbor box into the current column block
667 if (neighbor_box.major_x_overlap(box) &&
668 !box.contains(neighbor_box)) {
669 seg->InsertBox(neighbor_box);
670 neighbor_found = true;
671 modified = true;
672 rectsearch.RemoveBBox();
673 gsearch.RepositionIterator();
674 delete neighbor;
675 }
676 }
677 } while (neighbor_found);
678 if (modified) {
679 // Because the box has changed, it has to be removed first.
680 gsearch.RemoveBBox();
681 col_seg_grid_.InsertBBox(true, true, seg);
682 gsearch.RepositionIterator();
683 }
684 }
685 }
686
687 // Group horizontally overlapping table partitions into table columns.
688 // TODO(faisal): This is too aggressive at the moment. The method should
689 // consider more attributes to group table partitions together. Some common
690 // errors are:
691 // 1- page number is merged with a table column above it even
692 // if there is a large vertical gap between them.
693 // 2- column headers go on to catch one of the columns arbitrarily
694 // 3- an isolated noise blob near page top or bottom merges with the table
695 // column below/above it
696 // 4- cells from two vertically adjacent tables merge together to make a
697 // single column resulting in merging of the two tables
GetTableColumns(ColSegment_LIST * table_columns)698 void ColumnFinder::GetTableColumns(ColSegment_LIST *table_columns) {
699 ColSegment_IT it(table_columns);
700 // Iterate the ColPartitions in the grid.
701 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
702 gsearch(&clean_part_grid_);
703 gsearch.StartFullSearch();
704 ColPartition* part;
705 while ((part = gsearch.NextFullSearch()) != NULL) {
706 if (part->inside_table_column() || part->type() != PT_TABLE)
707 continue; // prevent a partition to be assigned to multiple columns
708 TBOX box = part->bounding_box();
709 ColSegment* col = new ColSegment();
710 col->InsertBox(box);
711 part->set_inside_table_column(true);
712 // Start a search below the current cell to find bottom neighbours
713 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
714 vsearch(&clean_part_grid_);
715 vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
716 ColPartition* neighbor;
717 bool found_neighbours = false;
718 while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) {
719 // only consider neighbors not assigned to any column yet
720 if (neighbor->inside_table_column())
721 continue;
722
723 // presence of a non-table neighbor marks the end of current
724 // table column
725 if (neighbor->type() != PT_TABLE) {
726 // Horizontal lines should not break the flow
727 if (neighbor->IsLineType())
728 continue;
729 else
730 break;
731 }
732 TBOX neighbor_box = neighbor->bounding_box();
733 col->InsertBox(neighbor_box);
734 neighbor->set_inside_table_column(true);
735 found_neighbours = true;
736 }
737 if (found_neighbours) {
738 it.add_after_then_move(col);
739 } else {
740 part->set_inside_table_column(false);
741 delete col;
742 }
743 }
744 }
745
746 // Mark regions in a column that are x-bounded by the column boundaries and
747 // y-bounded by the table columns' projection on the y-axis as table regions
GetTableRegions(ColSegment_LIST * table_columns,ColSegment_LIST * table_regions)748 void ColumnFinder::GetTableRegions(ColSegment_LIST* table_columns,
749 ColSegment_LIST* table_regions) {
750 ColSegment_IT cit(table_columns);
751 ColSegment_IT rit(table_regions);
752 // Iterate through column blocks
753 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
754 gsearch(&col_seg_grid_);
755 gsearch.StartFullSearch();
756 ColSegment* part;
757 int page_height = tright().y() - bleft().y();
758 ASSERT_HOST(page_height > 0);
759 // create a bool array to hold projection on y-axis
760 bool* table_region = new bool[page_height];
761 while ((part = gsearch.NextFullSearch()) != NULL) {
762 TBOX part_box = part->bounding_box();
763 // reset the projection array
764 for (int i = 0; i < page_height; i++) {
765 table_region[i] = false;
766 }
767 // iterate through all table columns to find regions in the current
768 // page column block
769 cit.move_to_first();
770 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
771 TBOX col_box = cit.data()->bounding_box();
772 // find intersection region of table column and page column
773 TBOX intersection_box = col_box.intersection(part_box);
774 // project table column on the y-axis
775 for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
776 table_region[i - bleft().y()] = true;
777 }
778 }
779 // set x-limits of table regions to page column width
780 TBOX current_table_box;
781 current_table_box.set_left(part_box.left());
782 current_table_box.set_right(part_box.right());
783 // go through the y-axis projection to find runs of table
784 // regions. Each run makes one table region.
785 for (int i = 1; i < page_height; i++) {
786 // detect start of a table region
787 if (!table_region[i - 1] && table_region[i]) {
788 current_table_box.set_bottom(i + bleft().y());
789 }
790 // detect end of a table region
791 if (table_region[i - 1] && !table_region[i]) {
792 current_table_box.set_top(i + bleft().y());
793 if (!current_table_box.null_box()) {
794 ColSegment* seg = new ColSegment();
795 seg->InsertBox(current_table_box);
796 rit.add_after_then_move(seg);
797 }
798 }
799 }
800 }
801 delete[] table_region;
802 }
803
804 // Merge table regions corresponding to tables spanning multiple columns if
805 // there is a colpartition (horizontal ruling line or normal text) that
806 // touches both regions.
807 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
808 // tables with aligned ruling lines. In this case, line finder returns a
809 // single line and hence the tables get merged together
GridMergeTableRegions()810 void ColumnFinder::GridMergeTableRegions() {
811 // Iterate the table regions in the grid.
812 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
813 gsearch(&table_grid_);
814 gsearch.StartFullSearch();
815 ColSegment* seg;
816 while ((seg = gsearch.NextFullSearch()) != NULL) {
817 bool neighbor_found = false;
818 bool modified = false; // Modified at least once
819 do {
820 // Start a rectangle search x-bounded by the image and y by the table
821 TBOX box = seg->bounding_box();
822 TBOX search_region(box);
823 search_region.set_left(bleft().x());
824 search_region.set_right(tright().x());
825 neighbor_found = false;
826 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
827 rectsearch(&table_grid_);
828 rectsearch.StartRectSearch(search_region);
829 ColSegment* neighbor;
830 while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
831 if (neighbor == seg)
832 continue;
833 TBOX neighbor_box = neighbor->bounding_box();
834 // Check if a neighbor box has a large overlap with the table
835 // region. This may happen as a result of merging two table
836 // regions in the previous iteration.
837 if (neighbor_box.overlap_fraction(box) >= 0.9) {
838 seg->InsertBox(neighbor_box);
839 rectsearch.RemoveBBox();
840 gsearch.RepositionIterator();
841 delete neighbor;
842 modified = true;
843 continue;
844 }
845 // Check if two table regions belong together based on a common
846 // horizontal ruling line
847 if (BelongToOneTable(box, neighbor_box)) {
848 seg->InsertBox(neighbor_box);
849 neighbor_found = true;
850 modified = true;
851 }
852 }
853 } while (neighbor_found);
854 if (modified) {
855 // Because the box has changed, it has to be removed first.
856 gsearch.RemoveBBox();
857 table_grid_.InsertBBox(true, true, seg);
858 gsearch.RepositionIterator();
859 }
860 }
861 }
862
863 // Decide if two table regions belong to one table based on a common
864 // horizontal ruling line or another colpartition
BelongToOneTable(const TBOX & box1,const TBOX & box2)865 bool ColumnFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
866 // Check for ColPartitions spanning both table regions
867 TBOX bbox = box1.bounding_union(box2);
868 // Start a rect search on bbox
869 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
870 rectsearch(&clean_part_grid_);
871 rectsearch.StartRectSearch(bbox);
872 ColPartition* part;
873 while ((part = rectsearch.NextRectSearch()) != NULL) {
874 TBOX part_box = part->bounding_box();
875 // return true if a colpartition spanning both table regions is found
876 if (part_box.overlap(box1) && part_box.overlap(box2))
877 return true;
878 }
879 return false;
880 }
881
882 // Adjust table boundaries by:
883 // - building a tight bounding box around all ColPartitions contained in it.
884 // - expanding table boundaries to include all colpartitions that overlap the
885 // table by more than half of their area
886 // - expanding table boundaries to include nearby horizontal rule lines
887 // - expanding table vertically to include left out column headers
888 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
889 // makes following errors:
890 // 1- horizontal lines consisting of underlines are included in the table if
891 // they are close enough
892 // 2- horizontal lines originating from noise tend to get merged with a table
893 // near the top of the page
894 // 3- the criteria for including horizontal lines is very generous. Many times
895 // horizontal lines separating headers and footers get merged with a
896 // single-column table in a multi-column page thereby including text
897 // from the neighboring column inside the table
898 // 4- the criteria for including left out column headers also tends to
899 // occasionally include text-lines above the tables, typically from
900 // table caption
AdjustTableBoundaries()901 void ColumnFinder::AdjustTableBoundaries() {
902 // Iterate the table regions in the grid
903 ColSegment_CLIST adjusted_tables;
904 ColSegment_C_IT it(&adjusted_tables);
905 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
906 gsearch(&table_grid_);
907 gsearch.StartFullSearch();
908 ColSegment* table;
909 // search for horizontal ruling lines within the vertical margin
910 int vertical_margin = kRulingVerticalMargin * gridsize_;
911 while ((table = gsearch.NextFullSearch()) != NULL) {
912 TBOX table_box = table->bounding_box();
913 TBOX search_box = table_box;
914 int top = MIN(search_box.top() + vertical_margin, tright().y());
915 int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
916 search_box.set_top(top);
917 search_box.set_bottom(bottom);
918 TBOX box;
919 // Start a rect search on table_box
920 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
921 rectsearch(&clean_part_grid_);
922 rectsearch.StartRectSearch(search_box);
923 ColPartition* part;
924 while ((part = rectsearch.NextRectSearch()) != NULL) {
925 // Do not consider image partitions
926 if (part->IsImageType())
927 continue;
928 TBOX part_box = part->bounding_box();
929 // Include partition in the table if more than half of it
930 // is covered by the table
931 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
932 box = box.bounding_union(part_box);
933 continue;
934 }
935 // Include a partially overlapping horizontal line only if the
936 // extra ColPartitions that will be included due to expansion
937 // have large side spacing w.r.t. columns containing them.
938 if (HLineBelongsToTable(part, table_box)) {
939 box = box.bounding_union(part_box);
940 }
941 }
942 IncludeLeftOutColumnHeaders(box);
943 // To prevent a table from expanding again, do not insert the
944 // modified box back to the grid. Instead move it to a list and
945 // and remove it from the grid. The list is moved later back to the grid.
946 if (!box.null_box()) {
947 ColSegment* col = new ColSegment();
948 col->InsertBox(box);
949 it.add_after_then_move(col);
950 }
951 gsearch.RemoveBBox();
952 delete table;
953 }
954 // clear table grid to move final tables in it
955 table_grid_.Clear();
956 it.move_to_first();
957 // move back final tables to table_grid_
958 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
959 ColSegment* seg = it.extract();
960 table_grid_.InsertBBox(true, true, seg);
961 }
962 }
963
964 // Checks whether the horizontal line belong to the table by looking at the
965 // side spacing of extra ColParitions that will be included in the table
966 // due to expansion
HLineBelongsToTable(ColPartition * part,const TBOX & table_box)967 bool ColumnFinder::HLineBelongsToTable(ColPartition* part,
968 const TBOX& table_box) {
969 TBOX part_box = part->bounding_box();
970 if (!part->IsLineType() || !part_box.major_x_overlap(table_box))
971 return false;
972 // Do not consider top-most horizontal line since it usually
973 // originates from noise
974 if (!part->nearest_neighbor_above())
975 return false;
976 TBOX bbox = part_box.bounding_union(table_box);
977 // Start a rect search on bbox
978 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
979 rectsearch(&clean_part_grid_);
980 rectsearch.StartRectSearch(bbox);
981 ColPartition* extra_part;
982 int num_extra_partitions = 0;
983 int extra_space_to_right = 0;
984 int extra_space_to_left = 0;
985 while ((extra_part = rectsearch.NextRectSearch()) != NULL) {
986 if (!rectsearch.ReturnedSeedElement())
987 continue;
988 TBOX extra_part_box = extra_part->bounding_box();
989 if (extra_part_box.overlap_fraction(table_box) > 0.6)
990 continue; // ColPartition already in table
991 if (extra_part->IsImageType()) // Non-text ColPartitions do not contribute
992 continue;
993 num_extra_partitions++;
994 // presense of a table cell is a strong hint, so just increment the scores
995 // without looking at the spacing.
996 if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
997 extra_space_to_right++;
998 extra_space_to_left++;
999 continue;
1000 }
1001 int space_threshold = kSideSpaceMargin * part->median_size();
1002 if (extra_part->space_to_right() > space_threshold)
1003 extra_space_to_right++;
1004 if (extra_part->space_to_left() > space_threshold)
1005 extra_space_to_left++;
1006 }
1007 // tprintf("%d %d %d\n",
1008 // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1009 return (extra_space_to_right > num_extra_partitions / 2) ||
1010 (extra_space_to_left > num_extra_partitions / 2);
1011 }
1012
1013 // Look for isolated column headers above the given table box and
1014 // include them in the table
IncludeLeftOutColumnHeaders(TBOX & table_box)1015 void ColumnFinder::IncludeLeftOutColumnHeaders(TBOX& table_box) {
1016 // Start a search above the current table to look for column headers
1017 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1018 vsearch(&clean_part_grid_);
1019 vsearch.StartVerticalSearch(table_box.left(), table_box.right(),
1020 table_box.top());
1021 ColPartition* neighbor;
1022 ColPartition* previous_neighbor = NULL;
1023 while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) {
1024 int table_top = table_box.top();
1025 TBOX box = neighbor->bounding_box();
1026 // Do not continue if the next box is way above
1027 // TODO(faisal): make the threshold some factor of line spacing
1028 if (box.bottom() - table_top > kMaxColumnHeaderDistance)
1029 break;
1030 // Unconditionally include partitions of type TABLE or LINE
1031 // TODO(faisal): add some reasonable conditions here
1032 if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1033 table_box.set_top(box.top());
1034 previous_neighbor = NULL;
1035 continue;
1036 }
1037 // If there are two text partitions, one above the other, without a table
1038 // cell on their left or right side, consider them a barrier and quit
1039 if (previous_neighbor == NULL) {
1040 previous_neighbor = neighbor;
1041 } else {
1042 TBOX previous_box = previous_neighbor->bounding_box();
1043 if (!box.major_y_overlap(previous_box))
1044 break;
1045 }
1046 }
1047 }
1048
1049 // Remove false alarms consiting of a single column based on their
1050 // projection on the x-axis. Projection of a real table on the x-axis
1051 // should have at least one zero-valley larger than the global median
1052 // x-height of the page.
DeleteSingleColumnTables()1053 void ColumnFinder::DeleteSingleColumnTables() {
1054 int page_width = tright().x() - bleft().x();
1055 ASSERT_HOST(page_width > 0);
1056 // create an integer array to hold projection on x-axis
1057 int* table_xprojection = new int[page_width];
1058 // Iterate through all tables in the table grid
1059 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1060 table_search(&table_grid_);
1061 table_search.StartFullSearch();
1062 ColSegment* table;
1063 while ((table = table_search.NextFullSearch()) != NULL) {
1064 TBOX table_box = table->bounding_box();
1065 // reset the projection array
1066 for (int i = 0; i < page_width; i++) {
1067 table_xprojection[i] = 0;
1068 }
1069 // Start a rect search on table_box
1070 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1071 rectsearch(&clean_part_grid_);
1072 rectsearch.StartRectSearch(table_box);
1073 ColPartition* part;
1074 while ((part = rectsearch.NextRectSearch()) != NULL) {
1075 if (!rectsearch.ReturnedSeedElement())
1076 continue; // Consider each partition only once
1077 if (!part->IsTextType())
1078 continue; // Do not consider non-text partitions
1079 TBOX part_box = part->bounding_box();
1080 // Do not consider partitions partially covered by the table
1081 if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1082 continue;
1083 BLOBNBOX_CLIST* part_boxes = part->boxes();
1084 BLOBNBOX_C_IT pit(part_boxes);
1085 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1086 BLOBNBOX *pblob = pit.data();
1087 // ignore blob height for the purpose of projection since we
1088 // are only interested in finding valleys
1089 int xstart = pblob->bounding_box().left();
1090 int xend = pblob->bounding_box().right();
1091 for (int i = xstart; i < xend; i++)
1092 table_xprojection[i - bleft().x()]++;
1093 }
1094 }
1095 // Find largest valley between two reasonable peaks in the table
1096 if (!GapInXProjection(table_xprojection, page_width)) {
1097 table_search.RemoveBBox();
1098 delete table;
1099 }
1100 }
1101 delete[] table_xprojection;
1102 }
1103
1104 // Return true if at least one gap larger than the global x-height
1105 // exists in the horizontal projection
GapInXProjection(int * xprojection,int length)1106 bool ColumnFinder::GapInXProjection(int* xprojection, int length) {
1107 // Find peak value of the histogram
1108 int peak_value = 0;
1109 for (int i = 0; i < length; i++) {
1110 if (xprojection[i] > peak_value) {
1111 peak_value = xprojection[i];
1112 }
1113 }
1114 // Peak value represents the maximum number of horizontally
1115 // overlapping colpartitions, so this can be considered as the
1116 // number of rows in the table
1117 if (peak_value < kMinRowsInTable)
1118 return false;
1119 double projection_threshold = kProjectionThreshold * peak_value;
1120 // Threshold the histogram
1121 for (int i = 0; i < length; i++) {
1122 xprojection[i] = (xprojection[i] > projection_threshold) ? 1 : 0;
1123 }
1124 // Find the largest run of zeros between two ones
1125 int largest_gap = 0;
1126 int run_start = -1;
1127 for (int i = 1; i < length; i++) {
1128 // detect start of a run of zeros
1129 if (xprojection[i - 1] && !xprojection[i]) {
1130 run_start = i;
1131 }
1132 // detect end of a run of zeros and update the value of largest gap
1133 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1134 int gap = i - run_start;
1135 if (gap > largest_gap)
1136 largest_gap = gap;
1137 run_start = -1;
1138 }
1139 }
1140 return (largest_gap > global_median_xheight_);
1141 }
1142
1143 // Displays the column segments in some window.
DisplayColSegments(ColSegment_LIST * segments,ScrollView * win,ScrollView::Color color)1144 void ColumnFinder::DisplayColSegments(ColSegment_LIST *segments,
1145 ScrollView* win,
1146 ScrollView::Color color) {
1147 #ifndef GRAPHICS_DISABLED
1148 win->Pen(color);
1149 win->Brush(ScrollView::NONE);
1150 ColSegment_IT it(segments);
1151 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1152 ColSegment *col = it.data();
1153 TBOX box = col->bounding_box();
1154 int left_x = box.left();
1155 int right_x = box.right();
1156 int top_y = box.top();
1157 int bottom_y = box.bottom();
1158 win->Rectangle(left_x, bottom_y, right_x, top_y);
1159 }
1160 win->Update();
1161 #endif
1162 }
1163
1164 // Displays the colpartitions using a new coloring on an existing window.
1165 // Note: This method is only for debug purpose during development and
1166 // would not be part of checked in code
DisplayColPartitions(ScrollView * win,ScrollView::Color default_color)1167 void ColumnFinder::DisplayColPartitions(ScrollView* win,
1168 ScrollView::Color default_color) {
1169 #ifndef GRAPHICS_DISABLED
1170 // Iterate the ColPartitions in the grid.
1171 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1172 gsearch(&clean_part_grid_);
1173 gsearch.StartFullSearch();
1174 ColPartition* part;
1175 win->Brush(ScrollView::NONE);
1176 ScrollView::Color color;
1177 while ((part = gsearch.NextFullSearch()) != NULL) {
1178 color = default_color;
1179 TBOX box = part->bounding_box();
1180 // ColPartition* upper_part = part->nearest_neighbor_above();
1181 // ColPartition* lower_part = part->nearest_neighbor_below();
1182 // if (!upper_part && !lower_part)
1183 // color = ScrollView::ORANGE;
1184 // else if (upper_part && !lower_part)
1185 // color = ScrollView::CYAN;
1186 // else if (!upper_part && lower_part)
1187 // color = ScrollView::MAGENTA;
1188 if (part->type() == PT_TABLE)
1189 color = ScrollView::YELLOW;
1190
1191 int left_x = box.left();
1192 int right_x = box.right();
1193 int top_y = box.top();
1194 int bottom_y = box.bottom();
1195 win->Pen(color);
1196 win->Rectangle(left_x, bottom_y, right_x, top_y);
1197 }
1198 win->Update();
1199 #endif
1200 }
1201
1202 // Write debug image and text file.
1203 // Note: This method is only for debug purpose during development and
1204 // would not be part of checked in code
WriteToPix()1205 void ColumnFinder::WriteToPix() {
1206 #ifdef HAVE_LIBLEPT
1207 // Input file must be named test1.tif
1208 PIX* pix = pixRead("test1.tif");
1209 if (!pix) {
1210 tprintf("Input file test1.tif not found.\n");
1211 return;
1212 }
1213 int img_height = pixGetHeight(pix);
1214 int img_width = pixGetWidth(pix);
1215 // Maximum number of text or table partitions
1216 int num_boxes = 10;
1217 BOXA* text_box_array = boxaCreate(num_boxes);
1218 BOXA* table_box_array = boxaCreate(num_boxes);
1219 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1220 gsearch(&clean_part_grid_);
1221 gsearch.StartFullSearch();
1222 ColPartition* part;
1223 // load colpartitions into text_box_array and table_box_array
1224 while ((part = gsearch.NextFullSearch()) != NULL) {
1225 TBOX box = part->bounding_box();
1226 box.rotate_large(reskew_);
1227 BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
1228 box.right() - box.left(),
1229 box.top() - box.bottom());
1230 if (part->type() == PT_TABLE)
1231 boxaAddBox(table_box_array, lept_box, L_INSERT);
1232 else
1233 boxaAddBox(text_box_array, lept_box, L_INSERT);
1234 }
1235 // draw colpartitions on the output image
1236 PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000);
1237 out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00);
1238
1239 BOXA* table_array = boxaCreate(num_boxes);
1240 // text file containing detected table bounding boxes
1241 FILE* fptr = fopen("tess-table.txt", "w");
1242 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1243 table_search(&table_grid_);
1244 table_search.StartFullSearch();
1245 ColSegment* table;
1246 // load table boxes to table_array and write them to text file as well
1247 while ((table = table_search.NextFullSearch()) != NULL) {
1248 TBOX box = table->bounding_box();
1249 box.rotate_large(reskew_);
1250 // Since deskewing introduces negative coordinates, reskewing
1251 // might not completely recover from that since both steps enlarge
1252 // the actual box. Hence a box that undergoes deskewing/reskewing
1253 // may go out of image boundaries. Crop a table box if needed to
1254 // contain it inside the image dimensions.
1255 box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1));
1256 BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
1257 box.right() - box.left(),
1258 box.top() - box.bottom());
1259 boxaAddBox(table_array, lept_box, L_INSERT);
1260 fprintf(fptr, "%d %d %d %d TABLE\n", box.left(),
1261 img_height - box.top(), box.right(), img_height - box.bottom());
1262 }
1263 fclose(fptr);
1264 // paint table boxes on the debug image
1265 out = pixDrawBoxa(out, table_array, 5, 0x7fff0000);
1266
1267 pixWrite("out.png", out, IFF_PNG);
1268 // memory cleanup
1269 boxaDestroy(&text_box_array);
1270 boxaDestroy(&table_box_array);
1271 boxaDestroy(&table_array);
1272 pixDestroy(&pix);
1273 pixDestroy(&out);
1274 #endif
1275 }
1276
1277 // Merge all colpartitions in table regions to make them a single
1278 // colpartition and revert types of isolated table cells not
1279 // assigned to any table to their original types.
MakeTableBlocks()1280 void ColumnFinder::MakeTableBlocks() {
1281 // Since we have table blocks already, remove table tags from all
1282 // colpartitions
1283 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1284 gsearch(&part_grid_);
1285 gsearch.StartFullSearch();
1286 ColPartition* part;
1287 while ((part = gsearch.NextFullSearch()) != NULL) {
1288 if (part->type() == PT_TABLE) {
1289 part->clear_table_type();
1290 }
1291 }
1292 // Now make a single colpartition out of each table block and remove
1293 // all colpartitions contained within a table
1294 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1295 table_search(&table_grid_);
1296 table_search.StartFullSearch();
1297 ColSegment* table;
1298 while ((table = table_search.NextFullSearch()) != NULL) {
1299 TBOX table_box = table->bounding_box();
1300 // Start a rect search on table_box
1301 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1302 rectsearch(&part_grid_);
1303 rectsearch.StartRectSearch(table_box);
1304 ColPartition* part;
1305 ColPartition* table_partition = NULL;
1306 while ((part = rectsearch.NextRectSearch()) != NULL) {
1307 // Do not consider image partitions
1308 if (!part->IsTextType())
1309 continue;
1310 TBOX part_box = part->bounding_box();
1311 // Include partition in the table if more than half of it
1312 // is covered by the table
1313 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1314 rectsearch.RemoveBBox();
1315 if (table_partition) {
1316 table_partition->Absorb(part, WidthCB());
1317 } else {
1318 table_partition = part;
1319 }
1320 }
1321 }
1322 // Insert table colpartition back to part_grid_
1323 if (table_partition) {
1324 table_partition->SetPartitionType(best_columns_[table_search.GridY()]);
1325 table_partition->set_table_type();
1326 part_grid_.InsertBBox(true, true, table_partition);
1327 }
1328 }
1329 }
1330
1331 // Provides a color for BBGrid to draw the rectangle.
BoxColor() const1332 ScrollView::Color ColSegment::BoxColor() const {
1333 const ScrollView::Color kBoxColors[PT_COUNT] = {
1334 ScrollView::YELLOW,
1335 ScrollView::BLUE,
1336 ScrollView::YELLOW,
1337 ScrollView::MAGENTA,
1338 };
1339 return kBoxColors[type_];
1340 }
1341
1342 // Insert a box into this column segment
InsertBox(const TBOX & other)1343 void ColSegment::InsertBox(const TBOX& other) {
1344 bounding_box_ = bounding_box_.bounding_union(other);
1345 }
1346
1347 // Set column segment type based on the ratio of text and table partitions
1348 // in it.
set_type()1349 void ColSegment::set_type() {
1350 if (num_table_cells_ > kTableColumnThreshold * num_text_cells_)
1351 type_ = COL_TABLE;
1352 else if (num_text_cells_ > num_table_cells_)
1353 type_ = COL_TEXT;
1354 else
1355 type_ = COL_MIXED;
1356 }
1357
1358 } // namespace tesseract.
1359