• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        tablefind.cpp
3 // Description: Helper classes to find tables from ColPartitions.
4 // Author:      Faisal Shafait (faisal.shafait@dfki.de)
5 // Created:     Tue Jan 06 11:13:01 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #include "colfind.h"
21 #include <math.h>
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 #ifdef HAVE_LIBLEPT
26 #include "allheaders.h"
27 #endif
28 
29 namespace tesseract {
30 
31 // Maximum vertical spacing between neighbor partitions
32 const int kMaxVerticalSpacing = 500;
33 
34 // Minimum number of components in a text partition. A partition having fewer
35 // components than that is more likely a data partition and is a candidate
36 // table cell.
37 const int kMinBoxesInTextPartition = 10;
38 
39 // Maximum number of components that a data partition can have
40 const int kMaxBoxesInDataPartition = 20;
41 
42 // Maximum allowed gap in a text partitions as a multiple of its median size.
43 const double kMaxGapInTextPartition = 4.0;
44 
45 // Minimum value that the maximum gap in a text partition should have as a
46 // factor of its median size.
47 const double kMinMaxGapInTextPartition = 0.5;
48 
49 // Maximum x-height a table partition can have as a multiple of global
50 // median x-height
51 const double kMaxTableCellXheight = 2.0;
52 
53 // Maximum line spacing between a table column header and column contents
54 // for merging the two
55 const int kMaxColumnHeaderDistance = 100;
56 
57 // Minimum ratio of num_table_partitions to num_text_partitions in a column
58 // block to be called it a table column
59 const double kTableColumnThreshold = 3.0;
60 
61 // Search for horizontal ruling lines within the vertical margin as a
62 // multiple of grid size
63 const int kRulingVerticalMargin = 3;
64 
65 // Minimum overlap that a colpartition must have with a table region
66 // to become part of that table
67 const double kMinOverlapWithTable = 0.6;
68 
69 // Maximum side space (distance from column boundary) that a typical
70 // text-line in flowing text should have as a multiple of its x-height
71 // (Median size).
72 const int kSideSpaceMargin = 10;
73 
74 // Fraction of the peak of x-projection of a table region to set the
75 // threshold for the x-projection histogram
76 const double kProjectionThreshold = 0.35;
77 
78 // Minmimum number of rows in a table
79 const int kMinRowsInTable = 3;
80 
81 BOOL_VAR(textord_dump_table_images, false, "Paint table detection output");
82 BOOL_VAR(textord_show_tables, false, "Show table regions");
83 
84 ELISTIZE(ColSegment)
CLISTIZE(ColSegment)85 CLISTIZE(ColSegment)
86 
87 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
88 // insert dot-like noise into period_grid_
89 void ColumnFinder::GetCleanPartitions(TO_BLOCK* block) {
90   double min_dim = block->line_size/3.0;
91   // Initialize clean partitions list and grid
92   clean_part_grid_.Init(gridsize(), bleft(), tright());
93   period_grid_.Init(gridsize(), bleft(), tright());
94   // Iterate the ColPartitions in the grid.
95   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
96     gsearch(&part_grid_);
97   gsearch.StartFullSearch();
98   ColPartition* part;
99   while ((part = gsearch.NextFullSearch()) != NULL) {
100     ColPartition* clean_part = part->ShallowCopy();
101     // Insert all non-text partitions to clean_parts
102     if (!part->IsTextType()) {
103       clean_part_grid_.InsertBBox(true, true, clean_part);
104       continue;
105     }
106     // Insert text colpartitions after removing noisy components from them
107     BLOBNBOX_CLIST* part_boxes = part->boxes();
108     BLOBNBOX_C_IT pit(part_boxes);
109     for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
110       BLOBNBOX *pblob = pit.data();
111       if (!pblob->noise_flag()) {
112         clean_part->AddBox(pblob);
113       } else {
114         TBOX blob_box = pblob->bounding_box();
115         if (blob_box.height() < min_dim && blob_box.width() < 2*min_dim) {
116           period_grid_.InsertBBox(false, false, pblob);
117         }
118       }
119     }
120     if (!clean_part->IsEmpty())
121       clean_part_grid_.InsertBBox(true, true, clean_part);
122   }
123 
124 // TODO(rays) This is the previous period blob code. Neither is completely
125 // satisfactory, as a more disciplined approach to noise removal would be
126 // better, so revisit this choice and decide what to keep when the earlier
127 // stages do a better job of noise removal.
128 #if 0
129   BLOBNBOX_IT sit(&block->small_blobs);
130   BLOBNBOX_IT nit(&block->noise_blobs);
131   BLOBNBOX_IT it(&period_blobs_);
132   // Insert dot sized boxes from small_blobs into period_blobs_
133   for (sit.mark_cycle_pt(); !sit.cycled_list(); sit.forward()) {
134     BLOBNBOX * blob = sit.data();
135     TBOX blob_box = blob->bounding_box();
136     if (blob_box.height() < min_dim && blob_box.width() < 2*min_dim) {
137       it.add_after_then_move(sit.extract());
138     }
139   }
140   // Insert dot sized boxes from noise_blobs into period_blobs_
141   for (nit.mark_cycle_pt(); !nit.cycled_list(); nit.forward()) {
142     BLOBNBOX * blob = nit.data();
143     TBOX blob_box = blob->bounding_box();
144     if (blob_box.height() < min_dim && blob_box.width() < 2*min_dim) {
145       it.add_after_then_move(nit.extract());
146     }
147   }
148   InsertBlobList(false, false, false, &period_blobs_, false, &period_grid_);
149 #endif
150 }
151 
152 // High level function to perform table detection
LocateTables()153 void ColumnFinder::LocateTables() {
154   // Make single-column blocks from good_columns_ partitions. col_segments are
155   // moved to a grid later which takes the ownership
156   ColSegment_LIST column_blocks;
157   GetColumnBlocks(&column_blocks);
158 
159   SetPartitionSpacings();
160 
161   // Mark ColPartitions as being candidate table partition depending on
162   // the inter-word spacing
163   GridMarkTablePartitions();
164   FilterFalseAlarms();
165   SmoothTablePartitionRuns();
166 
167   // Set the ratio of candidate table partitions in each column
168   SetColumnsType(&column_blocks);
169 
170   // Move column segments to col_seg_grid_
171   MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
172 
173   // Detect split in column layout that might have occured due to the
174   // presence of a table. In such a case, merge the corresponding columns.
175   GridMergeColumnBlocks();
176 
177   // Group horizontally overlapping table partitions into table columns.
178   // table_columns created here get deleted at the end of this method.
179   ColSegment_LIST table_columns;
180   GetTableColumns(&table_columns);
181 
182   // Within each column, mark the range table regions occupy based on the
183   // table columns detected. table_regions are moved to a grid later which
184   // takes the ownership
185   ColSegment_LIST table_regions;
186   GetTableRegions(&table_columns, &table_regions);
187 
188   // Merge table regions across columns for tables spanning multiple
189   // columns
190   MoveColSegmentsToGrid(&table_regions, &table_grid_);
191   GridMergeTableRegions();
192 
193   // Adjust table boundaries by including nearby horizontal lines and left
194   // out column headers
195   AdjustTableBoundaries();
196   GridMergeTableRegions();
197 
198   // Remove false alarms consiting of a single column
199   DeleteSingleColumnTables();
200 
201   if (textord_show_tables) {
202     ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
203     DisplayColPartitions(table_win, ScrollView::BLUE);
204     DisplayColSegments(&table_columns, table_win, ScrollView::GREEN);
205     table_grid_.DisplayBoxes(table_win);
206   }
207 
208   if (textord_dump_table_images)
209     WriteToPix();
210 
211   // Merge all colpartitions in table regions to make them a single
212   // colpartition and revert types of isolated table cells not
213   // assigned to any table to their original types.
214   MakeTableBlocks();
215 }
216 
217 // Make single-column blocks from good_columns_ partitions.
GetColumnBlocks(ColSegment_LIST * column_blocks)218 void ColumnFinder::GetColumnBlocks(ColSegment_LIST* column_blocks) {
219   for (int i = 0; i < gridheight_; ++i) {
220     ColPartitionSet* columns = best_columns_[i];
221     if (columns != NULL) {
222       ColSegment_LIST new_blocks;
223       // Get boxes from the current vertical position on the grid
224       columns->GetColumnBoxes(i*gridsize_, (i + 1) * gridsize_, &new_blocks);
225       // Merge the new_blocks boxes into column_blocks if they are well-aligned
226       GroupColumnBlocks(&new_blocks, column_blocks);
227     }
228   }
229 }
230 
231 // Merge column segments into the current list if they are well aligned.
GroupColumnBlocks(ColSegment_LIST * new_blocks,ColSegment_LIST * column_blocks)232 void ColumnFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks,
233                                      ColSegment_LIST* column_blocks) {
234   ColSegment_IT src_it(new_blocks);
235   ColSegment_IT dest_it(column_blocks);
236   // iterate through the source list
237   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
238     ColSegment* src_seg = src_it.data();
239     TBOX src_box = src_seg->bounding_box();
240     bool match_found = false;
241     // iterate through the destination list to find a matching column block
242     for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
243       ColSegment* dest_seg = dest_it.data();
244       TBOX dest_box = dest_seg->bounding_box();
245       if (ConsecutiveBoxes(src_box, dest_box)) {
246         // If matching block is found, insert the current block into it
247         // and delete the soure block
248         dest_seg->InsertBox(src_box);
249         match_found = true;
250         delete src_it.extract();
251         break;
252       }
253     }
254     // If no match is found, just append the source block to column_blocks
255     if (!match_found) {
256       dest_it.add_after_then_move(src_it.extract());
257     }
258   }
259 }
260 
261 // are the two boxes immediate neighbors along the vertical direction
ConsecutiveBoxes(const TBOX & b1,const TBOX & b2)262 bool ColumnFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
263   int x_margin = 20;
264   int y_margin = 5;
265   return (abs(b1.left() - b2.left()) < x_margin) &&
266       (abs(b1.right() - b2.right()) < x_margin) &&
267       (abs(b1.top()-b2.bottom()) < y_margin ||
268        abs(b2.top()-b1.bottom()) < y_margin);
269 }
270 
271 // Set left, right and top, bottom spacings of each colpartition.
SetPartitionSpacings()272 void ColumnFinder::SetPartitionSpacings() {
273   // Iterate the ColPartitions in the grid.
274   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
275     gsearch(&clean_part_grid_);
276   gsearch.StartFullSearch();
277   ColPartition* part;
278   while ((part = gsearch.NextFullSearch()) != NULL) {
279     ColPartitionSet* columns = best_columns_[gsearch.GridY()];
280     TBOX box = part->bounding_box();
281     int y = part->MidY();
282     ColPartition* left_column = columns->ColumnContaining(box.left(), y);
283     ColPartition* right_column = columns->ColumnContaining(box.right(), y);
284     // set distance from left column as space to the left
285     if (left_column) {
286       int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
287       part->set_space_to_left(left_space);
288     }
289     // set distance from right column as space to the right
290     if (right_column) {
291       int right_space = MAX(0, right_column->RightAtY(y) - box.right());
292       part->set_space_to_right(right_space);
293     }
294     SetVerticalSpacing(part);
295   }
296   SetGlobalSpacings();
297 }
298 
299 // Set spacing and closest neighbors above and below a given colpartition.
SetVerticalSpacing(ColPartition * part)300 void ColumnFinder::SetVerticalSpacing(ColPartition* part) {
301   TBOX box = part->bounding_box();
302   int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
303   int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
304   box.set_top(top_range);
305   box.set_bottom(bottom_range);
306 
307   TBOX part_box = part->bounding_box();
308   // Start a rect search
309   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
310       rectsearch(&clean_part_grid_);
311   rectsearch.StartRectSearch(box);
312   ColPartition* neighbor;
313   int min_space_above = kMaxVerticalSpacing;
314   int min_space_below = kMaxVerticalSpacing;
315   ColPartition* above_neighbor = NULL;
316   ColPartition* below_neighbor = NULL;
317   while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
318     if (neighbor == part)
319       continue;
320     TBOX neighbor_box = neighbor->bounding_box();
321     if (neighbor_box.major_x_overlap(part_box)) {
322       int gap = abs(part->median_bottom() - neighbor->median_bottom());
323       // If neighbor is below current partition
324       if (neighbor_box.top() < part_box.bottom() &&
325           gap < min_space_below) {
326         min_space_below = gap;
327         below_neighbor = neighbor;
328       }  // If neighbor is above current partition
329       else if (part_box.top() < neighbor_box.bottom() &&
330                gap < min_space_above) {
331         min_space_above = gap;
332         above_neighbor = neighbor;
333        }
334     }
335   }
336   part->set_space_above(min_space_above);
337   part->set_space_below(min_space_below);
338   part->set_nearest_neighbor_above(above_neighbor);
339   part->set_nearest_neighbor_below(below_neighbor);
340 }
341 
342 // Set global spacing and x-height estimates
SetGlobalSpacings()343 void ColumnFinder::SetGlobalSpacings() {
344   STATS xheight_stats(0, kMaxVerticalSpacing + 1);
345   STATS ledding_stats(0, kMaxVerticalSpacing + 1);
346   // Iterate the ColPartitions in the grid.
347   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
348     gsearch(&clean_part_grid_);
349   gsearch.StartFullSearch();
350   ColPartition* part;
351   while ((part = gsearch.NextFullSearch()) != NULL) {
352     if (part->IsTextType()) {
353       xheight_stats.add(part->median_size(), 1);
354       ledding_stats.add(part->space_above(), 1);
355       ledding_stats.add(part->space_below(), 1);
356     }
357   }
358   // Set estimates based on median of statistics obtained
359   global_median_xheight_ = static_cast<int>(xheight_stats.median() + 0.5);
360   global_median_ledding_ = static_cast<int>(ledding_stats.median() + 0.5);
361   if (textord_show_tables) {
362     ScrollView* stats_win = MakeWindow(500, 10,
363                                        "X-height and ledding histograms");
364     xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
365     ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
366   }
367 }
368 
369 // Three types of partitions are maked as table partitions:
370 //  1- Partitions that have at lease one large gap between words
371 //  2- Partitions that consist of only one word (no significant gap
372 //     between components)
373 //  3- Partitions that vertically overlap with other partitions within the
374 //     same column.
GridMarkTablePartitions()375 void ColumnFinder::GridMarkTablePartitions() {
376   // Iterate the ColPartitions in the grid.
377   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
378     gsearch(&clean_part_grid_);
379   gsearch.StartFullSearch();
380   ColPartition* part;
381   while ((part = gsearch.NextFullSearch()) != NULL) {
382     if (!part->IsTextType())  // Only consider text partitions
383       continue;
384     // Only consider partitions in dominant font size or smaller
385     if (part->median_size() > kMaxTableCellXheight * global_median_xheight_)
386       continue;
387     // Mark partitions with a large gap, or no significant gap as
388     // table partitions.
389     // Comments: It produces several false alarms at:
390     //  - last line of a paragraph
391     //  - single word section headings
392     //  - page headers and footers
393     //  - numbered equations
394     //  - line drawing regions
395     // TODO(faisal): detect and fix above-mentioned cases
396     if (HasWideOrNoInterWordGap(part)) {
397       part->set_table_type();
398     }
399   }
400 }
401 
402 // Check if the partition has at lease one large gap between words or no
403 // significant gap at all
HasWideOrNoInterWordGap(ColPartition * part)404 bool ColumnFinder::HasWideOrNoInterWordGap(ColPartition* part) {
405   BLOBNBOX_CLIST* part_boxes = part->boxes();
406   BLOBNBOX_C_IT pit(part_boxes);
407 
408   if (part->bounding_box().width() <
409       kMinBoxesInTextPartition * part->median_size() &&
410       pit.length() < kMinBoxesInTextPartition)
411     return true;
412 
413   // Make a copy of the components in the current partition and insert periods
414   // into it to compute gaps while taking periods into account.
415   BLOBNBOX_CLIST boxes;
416   BLOBNBOX_C_IT it(&boxes);
417   for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
418     BLOBNBOX *pblob = pit.data();
419     it.add_after_then_move(pblob);
420   }
421   // Start rectangular search to find periods in this partition
422   GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> rectsearch(&period_grid_);
423   TBOX part_box = part->bounding_box();
424   rectsearch.StartRectSearch(part_box);
425   BLOBNBOX* period;
426   while ((period = rectsearch.NextRectSearch()) != NULL) {
427     // Insert a period only if it lies in a gap between two consecutive boxes
428     if (LiesInGap(period, &boxes))
429       boxes.add_sorted(SortByBoxLeft<BLOBNBOX>, true, period);
430   }
431 
432   int current_x0;
433   int current_x1;
434   int previous_x1 = -1;
435   int max_partition_gap = -1;
436   double max_gap = kMaxGapInTextPartition * part->median_size();
437   double min_gap = kMinMaxGapInTextPartition * part->median_size();
438 
439   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
440     BLOBNBOX *blob = it.data();
441     current_x0 = blob->bounding_box().left();
442     current_x1 = blob->bounding_box().right();
443     if (previous_x1 != -1) {
444       int gap = current_x0 - previous_x1;
445       // If a large enough gap is found, mark it as a table cell (return true)
446       if (gap > max_gap)
447         return true;
448       if (gap > max_partition_gap)
449         max_partition_gap = gap;
450     }
451     previous_x1 = current_x1;
452   }
453   // Since no large gap was found, return false if the partition is too
454   // long to be a data cell
455   if (part->bounding_box().width() >
456       kMaxBoxesInDataPartition * part->median_size() ||
457       pit.length() > kMaxBoxesInDataPartition)
458     return false;
459 
460   // return true if the maximum gap found is smaller than the minimum allowed
461   // max_gap in a text partition
462   return (max_partition_gap < min_gap);
463 }
464 
465 // Check if the period lies in a gap between consecutive boxes
LiesInGap(BLOBNBOX * period,BLOBNBOX_CLIST * boxes)466 bool ColumnFinder::LiesInGap(BLOBNBOX* period, BLOBNBOX_CLIST* boxes) {
467   BLOBNBOX_C_IT it(boxes);
468   TBOX period_box = period->bounding_box();
469   int num_boxes = it.length();
470   // skip the first element since it has no gap to its left.
471   it.forward();
472   for (int i = 1; i < num_boxes; i++) {
473     TBOX box = it.data()->bounding_box();
474     TBOX previous_blob = it.data_relative(-1)->bounding_box();
475     TBOX gap_box = TBOX(previous_blob.botright(), box.topleft());
476     if (gap_box.major_overlap(period_box)) {
477       return true;
478     }
479     it.forward();
480   }
481   return false;
482 }
483 
484 // Filter individual text partitions marked as table partitions
485 // consisting of paragraph endings, small section headings, and
486 // headers and footers.
FilterFalseAlarms()487 void ColumnFinder::FilterFalseAlarms() {
488   // Detect last line of paragraph
489   // Iterate the ColPartitions in the grid.
490   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
491     gsearch(&clean_part_grid_);
492   gsearch.StartFullSearch();
493   ColPartition* part;
494   while ((part = gsearch.NextFullSearch()) != NULL) {
495     if (part->type() != PT_TABLE)
496       continue;  // Consider only table partitions
497 
498     // Paragraph ending should have flowing text above it.
499     ColPartition* upper_part = part->nearest_neighbor_above();
500     if (!upper_part)
501       continue;
502     if (upper_part->type() != PT_FLOWING_TEXT)
503       continue;
504     if (upper_part->bounding_box().width() <
505         2 * part->bounding_box().width())
506       continue;
507     // Paragraph ending should be left-aligned to text line above it.
508     if (abs(part->bounding_box().left() - upper_part->bounding_box().left())
509         > global_median_xheight_)
510       continue;
511     // Ledding above the line should be less than ledding below
512     if (part->space_above() < part->space_below() &&
513         part->space_above() <= 2 * global_median_ledding_)
514       part->clear_table_type();
515   }
516   // Consider top-most text colpartition as header and bottom most as footer
517   ColPartition* header = NULL;
518   ColPartition* footer = NULL;
519   int max_top = -MAX_INT32;
520   int min_bottom = MAX_INT32;
521   gsearch.StartFullSearch();
522   while ((part = gsearch.NextFullSearch()) != NULL) {
523     if (!part->IsTextType())
524       continue;  // Consider only text partitions
525     int top = part->bounding_box().top();
526     int bottom = part->bounding_box().bottom();
527     if (top > max_top) {
528       max_top = top;
529       header = part;
530     }
531     if (bottom < min_bottom) {
532       min_bottom = bottom;
533       footer = part;
534     }
535   }
536   if (header)
537     header->clear_table_type();
538   if (footer)
539     footer->clear_table_type();
540 }
541 
542 // Mark all ColPartitions as table cells that have a table cell above
543 // and below them
544 // TODO(faisal): This is too aggressive at the moment. The method needs to
545 // consider spacing and alignment as well. Detection of false alarm table cells
546 // should also be done as part of it.
SmoothTablePartitionRuns()547 void ColumnFinder::SmoothTablePartitionRuns() {
548   // Iterate the ColPartitions in the grid.
549   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
550     gsearch(&clean_part_grid_);
551   gsearch.StartFullSearch();
552   ColPartition* part;
553   while ((part = gsearch.NextFullSearch()) != NULL) {
554     if (part->type() >= PT_TABLE)
555       continue;  // Consider only text partitions
556     ColPartition* upper_part = part->nearest_neighbor_above();
557     ColPartition* lower_part = part->nearest_neighbor_below();
558     if (!upper_part || !lower_part)
559       continue;
560     if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
561       part->set_table_type();
562   }
563 }
564 
565 // Set the type of a column segment based on the ratio of table to text cells
SetColumnsType(ColSegment_LIST * column_blocks)566 void ColumnFinder::SetColumnsType(ColSegment_LIST* column_blocks) {
567   ColSegment_IT it(column_blocks);
568   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
569     ColSegment* seg = it.data();
570     TBOX box = seg->bounding_box();
571     int num_table_cells = 0;
572     int num_text_cells = 0;
573     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
574         rsearch(&clean_part_grid_);
575     rsearch.StartRectSearch(box);
576     ColPartition* part;
577     while ((part = rsearch.NextRectSearch()) != NULL) {
578       if (!rsearch.ReturnedSeedElement())
579         continue;  // Consider each partition only once
580       if (part->type() == PT_TABLE) {
581         num_table_cells++;
582       } else if (part->type() == PT_FLOWING_TEXT) {
583         num_text_cells++;
584       }
585     }
586     // If a column block has no text or table partition in it, it is not needed
587     // for table detection.
588     if (!num_table_cells && !num_text_cells) {
589       delete it.extract();
590     } else {
591       seg->set_num_table_cells(num_table_cells);
592       seg->set_num_text_cells(num_text_cells);
593       // set column type based on the ratio of table to text cells
594       seg->set_type();
595     }
596   }
597 }
598 
599 // Move column blocks to grid
MoveColSegmentsToGrid(ColSegment_LIST * segments,ColSegmentGrid * col_seg_grid)600 void ColumnFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
601                                          ColSegmentGrid *col_seg_grid) {
602   col_seg_grid->Init(gridsize(), bleft(), tright());
603   ColSegment_IT it(segments);
604   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
605     ColSegment* seg = it.extract();
606     col_seg_grid->InsertBBox(true, true, seg);
607   }
608 }
609 
610 // Merge column blocks if a split is detected due to the presence of a
611 // table. A text block is considered split if it has multiple
612 // neighboring blocks above/below it, and at least one of the
613 // neighboring blocks is of table type (has a high density of table
614 // partitions). In this case neighboring blocks in the direction
615 // (above/below) of the table block are merged with the text block.
616 
617 // Comment: This method does not handle split due to a full page table
618 // since table columns in this case do not have a text column on which
619 // split decision can be based.
GridMergeColumnBlocks()620 void ColumnFinder::GridMergeColumnBlocks() {
621   int margin = gridsize_;
622 
623   // Iterate the Column Blocks in the grid.
624   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
625     gsearch(&col_seg_grid_);
626   gsearch.StartFullSearch();
627   ColSegment* seg;
628   while ((seg = gsearch.NextFullSearch()) != NULL) {
629     if (seg->type() != COL_TEXT)
630       continue;  // only consider text blocks for split detection
631     bool neighbor_found = false;
632     bool modified = false;  // Modified at least once
633     // keep expanding current box as long as neighboring table columns
634     // are found above or below it.
635     do {
636       TBOX box = seg->bounding_box();
637       // slightly expand the search region vertically
638       int top_range = MIN(box.top() + margin, tright().y());
639       int bottom_range = MAX(box.bottom() - margin, bleft().y());
640       box.set_top(top_range);
641       box.set_bottom(bottom_range);
642       neighbor_found = false;
643       GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
644           rectsearch(&col_seg_grid_);
645       rectsearch.StartRectSearch(box);
646       ColSegment* neighbor;
647       while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
648         if (neighbor == seg)
649           continue;
650         TBOX neighbor_box = neighbor->bounding_box();
651         // If the neighbor box significantly overlaps with the current
652         // box (due to the expansion of the current box in the
653         // previous iteration of this loop), remove the neighbor box
654         // and expand the current box to include it.
655         if (neighbor_box.overlap_fraction(box) >= 0.9) {
656           seg->InsertBox(neighbor_box);
657           modified = true;
658           rectsearch.RemoveBBox();
659           gsearch.RepositionIterator();
660           delete neighbor;
661           continue;
662         }
663         // Only expand if the neighbor box is of table type
664         if (neighbor->type() != COL_TABLE)
665           continue;
666         // Insert the neighbor box into the current column block
667         if (neighbor_box.major_x_overlap(box) &&
668             !box.contains(neighbor_box)) {
669           seg->InsertBox(neighbor_box);
670           neighbor_found = true;
671           modified = true;
672           rectsearch.RemoveBBox();
673           gsearch.RepositionIterator();
674           delete neighbor;
675         }
676       }
677     } while (neighbor_found);
678     if (modified) {
679       // Because the box has changed, it has to be removed first.
680       gsearch.RemoveBBox();
681       col_seg_grid_.InsertBBox(true, true, seg);
682       gsearch.RepositionIterator();
683     }
684   }
685 }
686 
687 // Group horizontally overlapping table partitions into table columns.
688 // TODO(faisal): This is too aggressive at the moment. The method should
689 // consider more attributes to group table partitions together. Some common
690 // errors are:
691 //  1- page number is merged with a table column above it even
692 //      if there is a large vertical gap between them.
693 //  2- column headers go on to catch one of the columns arbitrarily
694 //  3- an isolated noise blob near page top or bottom merges with the table
695 //     column below/above it
696 //  4- cells from two vertically adjacent tables merge together to make a
697 //     single column resulting in merging of the two tables
GetTableColumns(ColSegment_LIST * table_columns)698 void ColumnFinder::GetTableColumns(ColSegment_LIST *table_columns) {
699   ColSegment_IT it(table_columns);
700   // Iterate the ColPartitions in the grid.
701   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
702     gsearch(&clean_part_grid_);
703   gsearch.StartFullSearch();
704   ColPartition* part;
705   while ((part = gsearch.NextFullSearch()) != NULL) {
706     if (part->inside_table_column() || part->type() != PT_TABLE)
707       continue;  // prevent a partition to be assigned to multiple columns
708     TBOX box = part->bounding_box();
709     ColSegment* col = new ColSegment();
710     col->InsertBox(box);
711     part->set_inside_table_column(true);
712     // Start a search below the current cell to find bottom neighbours
713     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
714         vsearch(&clean_part_grid_);
715     vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
716     ColPartition* neighbor;
717     bool found_neighbours = false;
718     while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) {
719       // only consider neighbors not assigned to any column yet
720       if (neighbor->inside_table_column())
721         continue;
722 
723       // presence of a non-table neighbor marks the end of current
724       // table column
725       if (neighbor->type() != PT_TABLE) {
726         // Horizontal lines should not break the flow
727         if (neighbor->IsLineType())
728           continue;
729         else
730           break;
731       }
732       TBOX neighbor_box = neighbor->bounding_box();
733       col->InsertBox(neighbor_box);
734       neighbor->set_inside_table_column(true);
735       found_neighbours = true;
736     }
737     if (found_neighbours) {
738       it.add_after_then_move(col);
739     } else {
740       part->set_inside_table_column(false);
741       delete col;
742     }
743   }
744 }
745 
746 // Mark regions in a column that are x-bounded by the column boundaries and
747 // y-bounded by the table columns' projection on the y-axis as table regions
GetTableRegions(ColSegment_LIST * table_columns,ColSegment_LIST * table_regions)748 void ColumnFinder::GetTableRegions(ColSegment_LIST* table_columns,
749                                    ColSegment_LIST* table_regions) {
750   ColSegment_IT cit(table_columns);
751   ColSegment_IT rit(table_regions);
752   // Iterate through column blocks
753   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
754       gsearch(&col_seg_grid_);
755   gsearch.StartFullSearch();
756   ColSegment* part;
757   int page_height = tright().y() - bleft().y();
758   ASSERT_HOST(page_height > 0);
759   // create a bool array to hold projection on y-axis
760   bool* table_region = new bool[page_height];
761   while ((part = gsearch.NextFullSearch()) != NULL) {
762     TBOX part_box = part->bounding_box();
763     // reset the projection array
764     for (int i = 0; i < page_height; i++) {
765       table_region[i] = false;
766     }
767     // iterate through all table columns to find regions in the current
768     // page column block
769     cit.move_to_first();
770     for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
771       TBOX col_box = cit.data()->bounding_box();
772       // find intersection region of table column and page column
773       TBOX intersection_box = col_box.intersection(part_box);
774       // project table column on the y-axis
775       for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
776         table_region[i - bleft().y()] = true;
777       }
778     }
779     // set x-limits of table regions to page column width
780     TBOX current_table_box;
781     current_table_box.set_left(part_box.left());
782     current_table_box.set_right(part_box.right());
783     // go through the y-axis projection to find runs of table
784     // regions. Each run makes one table region.
785     for (int i = 1; i < page_height; i++) {
786       // detect start of a table region
787       if (!table_region[i - 1] && table_region[i]) {
788         current_table_box.set_bottom(i + bleft().y());
789       }
790       // detect end of a table region
791       if (table_region[i - 1] && !table_region[i]) {
792         current_table_box.set_top(i + bleft().y());
793         if (!current_table_box.null_box()) {
794           ColSegment* seg = new ColSegment();
795           seg->InsertBox(current_table_box);
796           rit.add_after_then_move(seg);
797         }
798       }
799     }
800   }
801   delete[] table_region;
802 }
803 
804 // Merge table regions corresponding to tables spanning multiple columns if
805 // there is a colpartition (horizontal ruling line or normal text) that
806 // touches both regions.
807 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
808 // tables with aligned ruling lines. In this case, line finder returns a
809 // single line and hence the tables get merged together
GridMergeTableRegions()810 void ColumnFinder::GridMergeTableRegions() {
811   // Iterate the table regions in the grid.
812   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
813       gsearch(&table_grid_);
814   gsearch.StartFullSearch();
815   ColSegment* seg;
816   while ((seg = gsearch.NextFullSearch()) != NULL) {
817     bool neighbor_found = false;
818     bool modified = false;  // Modified at least once
819     do {
820       // Start a rectangle search x-bounded by the image and y by the table
821       TBOX box = seg->bounding_box();
822       TBOX search_region(box);
823       search_region.set_left(bleft().x());
824       search_region.set_right(tright().x());
825       neighbor_found = false;
826       GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
827           rectsearch(&table_grid_);
828       rectsearch.StartRectSearch(search_region);
829       ColSegment* neighbor;
830       while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
831         if (neighbor == seg)
832           continue;
833         TBOX neighbor_box = neighbor->bounding_box();
834         // Check if a neighbor box has a large overlap with the table
835         // region.  This may happen as a result of merging two table
836         // regions in the previous iteration.
837         if (neighbor_box.overlap_fraction(box) >= 0.9) {
838           seg->InsertBox(neighbor_box);
839           rectsearch.RemoveBBox();
840           gsearch.RepositionIterator();
841           delete neighbor;
842           modified = true;
843           continue;
844         }
845         // Check if two table regions belong together based on a common
846         // horizontal ruling line
847         if (BelongToOneTable(box, neighbor_box)) {
848           seg->InsertBox(neighbor_box);
849           neighbor_found = true;
850           modified = true;
851         }
852       }
853     } while (neighbor_found);
854     if (modified) {
855       // Because the box has changed, it has to be removed first.
856       gsearch.RemoveBBox();
857       table_grid_.InsertBBox(true, true, seg);
858       gsearch.RepositionIterator();
859     }
860   }
861 }
862 
863 // Decide if two table regions belong to one table based on a common
864 // horizontal ruling line or another colpartition
BelongToOneTable(const TBOX & box1,const TBOX & box2)865 bool ColumnFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
866   // Check for ColPartitions spanning both table regions
867   TBOX bbox = box1.bounding_union(box2);
868   // Start a rect search on bbox
869   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
870       rectsearch(&clean_part_grid_);
871   rectsearch.StartRectSearch(bbox);
872   ColPartition* part;
873   while ((part = rectsearch.NextRectSearch()) != NULL) {
874     TBOX part_box = part->bounding_box();
875     // return true if a colpartition spanning both table regions is found
876     if (part_box.overlap(box1) && part_box.overlap(box2))
877       return true;
878   }
879   return false;
880 }
881 
882 // Adjust table boundaries by:
883 //  - building a tight bounding box around all ColPartitions contained in it.
884 //  - expanding table boundaries to include all colpartitions that overlap the
885 //    table by more than half of their area
886 //  - expanding table boundaries to include nearby horizontal rule lines
887 //  - expanding table vertically to include left out column headers
888 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
889 //               makes following errors:
890 //  1- horizontal lines consisting of underlines are included in the table if
891 //     they are close enough
892 //  2- horizontal lines originating from noise tend to get merged with a table
893 //     near the top of the page
894 //  3- the criteria for including horizontal lines is very generous. Many times
895 //     horizontal lines separating headers and footers get merged with a
896 //     single-column table in a multi-column page thereby including text
897 //     from the neighboring column inside the table
898 //  4- the criteria for including left out column headers also tends to
899 //     occasionally include text-lines above the tables, typically from
900 //     table caption
AdjustTableBoundaries()901 void ColumnFinder::AdjustTableBoundaries() {
902   // Iterate the table regions in the grid
903   ColSegment_CLIST adjusted_tables;
904   ColSegment_C_IT it(&adjusted_tables);
905   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
906       gsearch(&table_grid_);
907   gsearch.StartFullSearch();
908   ColSegment* table;
909   // search for horizontal ruling lines within the vertical margin
910   int vertical_margin = kRulingVerticalMargin * gridsize_;
911   while ((table = gsearch.NextFullSearch()) != NULL) {
912     TBOX table_box = table->bounding_box();
913     TBOX search_box = table_box;
914     int top = MIN(search_box.top() + vertical_margin, tright().y());
915     int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
916     search_box.set_top(top);
917     search_box.set_bottom(bottom);
918     TBOX box;
919     // Start a rect search on table_box
920     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
921         rectsearch(&clean_part_grid_);
922     rectsearch.StartRectSearch(search_box);
923     ColPartition* part;
924     while ((part = rectsearch.NextRectSearch()) != NULL) {
925      // Do not consider image partitions
926       if (part->IsImageType())
927         continue;
928       TBOX part_box = part->bounding_box();
929       // Include partition in the table if more than half of it
930       // is covered by the table
931       if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
932         box = box.bounding_union(part_box);
933         continue;
934       }
935       // Include a partially overlapping horizontal line only if the
936       // extra ColPartitions that will be included due to expansion
937       // have large side spacing w.r.t. columns containing them.
938       if (HLineBelongsToTable(part, table_box)) {
939         box = box.bounding_union(part_box);
940       }
941     }
942     IncludeLeftOutColumnHeaders(box);
943     // To prevent a table from expanding again, do not insert the
944     // modified box back to the grid. Instead move it to a list and
945     // and remove it from the grid. The list is moved later back to the grid.
946     if (!box.null_box()) {
947       ColSegment* col = new ColSegment();
948       col->InsertBox(box);
949       it.add_after_then_move(col);
950     }
951     gsearch.RemoveBBox();
952     delete table;
953   }
954   // clear table grid to move final tables in it
955   table_grid_.Clear();
956   it.move_to_first();
957   // move back final tables to table_grid_
958   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
959     ColSegment* seg = it.extract();
960     table_grid_.InsertBBox(true, true, seg);
961   }
962 }
963 
964 // Checks whether the horizontal line belong to the table by looking at the
965 // side spacing of extra ColParitions that will be included in the table
966 // due to expansion
HLineBelongsToTable(ColPartition * part,const TBOX & table_box)967 bool ColumnFinder::HLineBelongsToTable(ColPartition* part,
968                                        const TBOX& table_box) {
969   TBOX part_box = part->bounding_box();
970   if (!part->IsLineType() || !part_box.major_x_overlap(table_box))
971     return false;
972   // Do not consider top-most horizontal line since it usually
973   // originates from noise
974   if (!part->nearest_neighbor_above())
975     return false;
976   TBOX bbox = part_box.bounding_union(table_box);
977   // Start a rect search on bbox
978   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
979       rectsearch(&clean_part_grid_);
980   rectsearch.StartRectSearch(bbox);
981   ColPartition* extra_part;
982   int num_extra_partitions = 0;
983   int extra_space_to_right = 0;
984   int extra_space_to_left = 0;
985   while ((extra_part = rectsearch.NextRectSearch()) != NULL) {
986     if (!rectsearch.ReturnedSeedElement())
987       continue;
988     TBOX extra_part_box = extra_part->bounding_box();
989     if (extra_part_box.overlap_fraction(table_box) > 0.6)
990       continue;  // ColPartition already in table
991     if (extra_part->IsImageType())  // Non-text ColPartitions do not contribute
992       continue;
993     num_extra_partitions++;
994     // presense of a table cell is a strong hint, so just increment the scores
995     // without looking at the spacing.
996     if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
997       extra_space_to_right++;
998       extra_space_to_left++;
999       continue;
1000     }
1001     int space_threshold = kSideSpaceMargin * part->median_size();
1002     if (extra_part->space_to_right() > space_threshold)
1003       extra_space_to_right++;
1004     if (extra_part->space_to_left() > space_threshold)
1005       extra_space_to_left++;
1006   }
1007   // tprintf("%d %d %d\n",
1008   // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1009   return (extra_space_to_right > num_extra_partitions / 2) ||
1010       (extra_space_to_left > num_extra_partitions / 2);
1011 }
1012 
1013 // Look for isolated column headers above the given table box and
1014 // include them in the table
IncludeLeftOutColumnHeaders(TBOX & table_box)1015 void ColumnFinder::IncludeLeftOutColumnHeaders(TBOX& table_box) {
1016   // Start a search above the current table to look for column headers
1017   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1018       vsearch(&clean_part_grid_);
1019   vsearch.StartVerticalSearch(table_box.left(), table_box.right(),
1020                               table_box.top());
1021   ColPartition* neighbor;
1022   ColPartition* previous_neighbor = NULL;
1023   while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) {
1024     int table_top = table_box.top();
1025     TBOX box = neighbor->bounding_box();
1026     // Do not continue if the next box is way above
1027     // TODO(faisal): make the threshold some factor of line spacing
1028     if (box.bottom() - table_top > kMaxColumnHeaderDistance)
1029       break;
1030     // Unconditionally include partitions of type TABLE or LINE
1031     // TODO(faisal): add some reasonable conditions here
1032     if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1033       table_box.set_top(box.top());
1034       previous_neighbor = NULL;
1035       continue;
1036     }
1037     // If there are two text partitions, one above the other, without a table
1038     // cell on their left or right side, consider them a barrier and quit
1039     if (previous_neighbor == NULL) {
1040       previous_neighbor = neighbor;
1041     } else {
1042       TBOX previous_box = previous_neighbor->bounding_box();
1043       if (!box.major_y_overlap(previous_box))
1044         break;
1045     }
1046   }
1047 }
1048 
1049 // Remove false alarms consiting of a single column based on their
1050 // projection on the x-axis. Projection of a real table on the x-axis
1051 // should have at least one zero-valley larger than the global median
1052 // x-height of the page.
DeleteSingleColumnTables()1053 void ColumnFinder::DeleteSingleColumnTables() {
1054   int page_width = tright().x() - bleft().x();
1055   ASSERT_HOST(page_width > 0);
1056   // create an integer array to hold projection on x-axis
1057   int* table_xprojection = new int[page_width];
1058   // Iterate through all tables in the table grid
1059   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1060       table_search(&table_grid_);
1061   table_search.StartFullSearch();
1062   ColSegment* table;
1063   while ((table = table_search.NextFullSearch()) != NULL) {
1064     TBOX table_box = table->bounding_box();
1065     // reset the projection array
1066     for (int i = 0; i < page_width; i++) {
1067       table_xprojection[i] = 0;
1068     }
1069     // Start a rect search on table_box
1070     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1071         rectsearch(&clean_part_grid_);
1072     rectsearch.StartRectSearch(table_box);
1073     ColPartition* part;
1074     while ((part = rectsearch.NextRectSearch()) != NULL) {
1075       if (!rectsearch.ReturnedSeedElement())
1076         continue;  // Consider each partition only once
1077       if (!part->IsTextType())
1078         continue;  // Do not consider non-text partitions
1079       TBOX part_box = part->bounding_box();
1080       // Do not consider partitions partially covered by the table
1081       if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1082         continue;
1083       BLOBNBOX_CLIST* part_boxes = part->boxes();
1084       BLOBNBOX_C_IT pit(part_boxes);
1085       for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1086         BLOBNBOX *pblob = pit.data();
1087         // ignore blob height for the purpose of projection since we
1088         // are only interested in finding valleys
1089         int xstart = pblob->bounding_box().left();
1090         int xend = pblob->bounding_box().right();
1091         for (int i = xstart; i < xend; i++)
1092           table_xprojection[i - bleft().x()]++;
1093       }
1094     }
1095     // Find largest valley between two reasonable peaks in the table
1096     if (!GapInXProjection(table_xprojection, page_width)) {
1097       table_search.RemoveBBox();
1098       delete table;
1099     }
1100   }
1101   delete[] table_xprojection;
1102 }
1103 
1104 // Return true if at least one gap larger than the global x-height
1105 // exists in the horizontal projection
GapInXProjection(int * xprojection,int length)1106 bool ColumnFinder::GapInXProjection(int* xprojection, int length) {
1107   // Find peak value of the histogram
1108   int peak_value = 0;
1109   for (int i = 0; i < length; i++) {
1110     if (xprojection[i] > peak_value) {
1111       peak_value = xprojection[i];
1112     }
1113   }
1114   // Peak value represents the maximum number of horizontally
1115   // overlapping colpartitions, so this can be considered as the
1116   // number of rows in the table
1117   if (peak_value < kMinRowsInTable)
1118     return false;
1119   double projection_threshold = kProjectionThreshold * peak_value;
1120   // Threshold the histogram
1121   for (int i = 0; i < length; i++) {
1122     xprojection[i] = (xprojection[i] > projection_threshold) ? 1 : 0;
1123   }
1124   // Find the largest run of zeros between two ones
1125   int largest_gap = 0;
1126   int run_start = -1;
1127   for (int i = 1; i < length; i++) {
1128     // detect start of a run of zeros
1129     if (xprojection[i - 1] && !xprojection[i]) {
1130       run_start = i;
1131     }
1132     // detect end of a run of zeros and update the value of largest gap
1133     if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1134       int gap = i - run_start;
1135       if (gap > largest_gap)
1136         largest_gap = gap;
1137       run_start = -1;
1138     }
1139   }
1140   return (largest_gap > global_median_xheight_);
1141 }
1142 
1143 // Displays the column segments in some window.
DisplayColSegments(ColSegment_LIST * segments,ScrollView * win,ScrollView::Color color)1144 void ColumnFinder::DisplayColSegments(ColSegment_LIST *segments,
1145                                       ScrollView* win,
1146                                       ScrollView::Color color) {
1147 #ifndef GRAPHICS_DISABLED
1148   win->Pen(color);
1149   win->Brush(ScrollView::NONE);
1150   ColSegment_IT it(segments);
1151   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1152     ColSegment *col = it.data();
1153     TBOX box = col->bounding_box();
1154     int left_x = box.left();
1155     int right_x = box.right();
1156     int top_y = box.top();
1157     int bottom_y = box.bottom();
1158     win->Rectangle(left_x, bottom_y, right_x, top_y);
1159   }
1160   win->Update();
1161 #endif
1162 }
1163 
1164 // Displays the colpartitions using a new coloring on an existing window.
1165 // Note: This method is only for debug purpose during development and
1166 // would not be part of checked in code
DisplayColPartitions(ScrollView * win,ScrollView::Color default_color)1167 void ColumnFinder::DisplayColPartitions(ScrollView* win,
1168                                         ScrollView::Color default_color) {
1169 #ifndef GRAPHICS_DISABLED
1170   // Iterate the ColPartitions in the grid.
1171   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1172     gsearch(&clean_part_grid_);
1173   gsearch.StartFullSearch();
1174   ColPartition* part;
1175   win->Brush(ScrollView::NONE);
1176   ScrollView::Color color;
1177   while ((part = gsearch.NextFullSearch()) != NULL) {
1178     color = default_color;
1179     TBOX box = part->bounding_box();
1180 //     ColPartition* upper_part = part->nearest_neighbor_above();
1181 //     ColPartition* lower_part = part->nearest_neighbor_below();
1182 //     if (!upper_part && !lower_part)
1183 //       color = ScrollView::ORANGE;
1184 //     else if (upper_part && !lower_part)
1185 //       color = ScrollView::CYAN;
1186 //     else if (!upper_part && lower_part)
1187 //       color = ScrollView::MAGENTA;
1188     if (part->type() == PT_TABLE)
1189       color = ScrollView::YELLOW;
1190 
1191     int left_x = box.left();
1192     int right_x = box.right();
1193     int top_y = box.top();
1194     int bottom_y = box.bottom();
1195     win->Pen(color);
1196     win->Rectangle(left_x, bottom_y, right_x, top_y);
1197   }
1198   win->Update();
1199 #endif
1200 }
1201 
1202 // Write debug image and text file.
1203 // Note: This method is only for debug purpose during development and
1204 // would not be part of checked in code
WriteToPix()1205 void ColumnFinder::WriteToPix() {
1206 #ifdef HAVE_LIBLEPT
1207   // Input file must be named test1.tif
1208   PIX* pix = pixRead("test1.tif");
1209   if (!pix) {
1210     tprintf("Input file test1.tif not found.\n");
1211     return;
1212   }
1213   int img_height = pixGetHeight(pix);
1214   int img_width = pixGetWidth(pix);
1215   // Maximum number of text or table partitions
1216   int num_boxes = 10;
1217   BOXA* text_box_array = boxaCreate(num_boxes);
1218   BOXA* table_box_array = boxaCreate(num_boxes);
1219   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1220     gsearch(&clean_part_grid_);
1221   gsearch.StartFullSearch();
1222   ColPartition* part;
1223   // load colpartitions into text_box_array and table_box_array
1224   while ((part = gsearch.NextFullSearch()) != NULL) {
1225     TBOX box = part->bounding_box();
1226     box.rotate_large(reskew_);
1227     BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
1228                               box.right() - box.left(),
1229                               box.top() - box.bottom());
1230     if (part->type() == PT_TABLE)
1231       boxaAddBox(table_box_array, lept_box, L_INSERT);
1232     else
1233       boxaAddBox(text_box_array, lept_box, L_INSERT);
1234   }
1235   // draw colpartitions on the output image
1236   PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000);
1237   out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00);
1238 
1239   BOXA* table_array = boxaCreate(num_boxes);
1240   // text file containing detected table bounding boxes
1241   FILE* fptr = fopen("tess-table.txt", "w");
1242   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1243       table_search(&table_grid_);
1244   table_search.StartFullSearch();
1245   ColSegment* table;
1246   // load table boxes to table_array and write them to text file as well
1247   while ((table = table_search.NextFullSearch()) != NULL) {
1248     TBOX box = table->bounding_box();
1249     box.rotate_large(reskew_);
1250     // Since deskewing introduces negative coordinates, reskewing
1251     // might not completely recover from that since both steps enlarge
1252     // the actual box. Hence a box that undergoes deskewing/reskewing
1253     // may go out of image boundaries. Crop a table box if needed to
1254     // contain it inside the image dimensions.
1255     box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1));
1256     BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
1257                               box.right() - box.left(),
1258                               box.top() - box.bottom());
1259     boxaAddBox(table_array, lept_box, L_INSERT);
1260     fprintf(fptr, "%d %d %d %d TABLE\n", box.left(),
1261             img_height - box.top(), box.right(), img_height - box.bottom());
1262   }
1263   fclose(fptr);
1264   // paint table boxes on the debug image
1265   out = pixDrawBoxa(out, table_array, 5, 0x7fff0000);
1266 
1267   pixWrite("out.png", out, IFF_PNG);
1268   // memory cleanup
1269   boxaDestroy(&text_box_array);
1270   boxaDestroy(&table_box_array);
1271   boxaDestroy(&table_array);
1272   pixDestroy(&pix);
1273   pixDestroy(&out);
1274 #endif
1275 }
1276 
1277 // Merge all colpartitions in table regions to make them a single
1278 // colpartition and revert types of isolated table cells not
1279 // assigned to any table to their original types.
MakeTableBlocks()1280 void ColumnFinder::MakeTableBlocks() {
1281   // Since we have table blocks already, remove table tags from all
1282   // colpartitions
1283   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1284     gsearch(&part_grid_);
1285   gsearch.StartFullSearch();
1286   ColPartition* part;
1287   while ((part = gsearch.NextFullSearch()) != NULL) {
1288     if (part->type() == PT_TABLE) {
1289       part->clear_table_type();
1290     }
1291   }
1292   // Now make a single colpartition out of each table block and remove
1293   // all colpartitions contained within a table
1294   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1295       table_search(&table_grid_);
1296   table_search.StartFullSearch();
1297   ColSegment* table;
1298   while ((table = table_search.NextFullSearch()) != NULL) {
1299     TBOX table_box = table->bounding_box();
1300     // Start a rect search on table_box
1301     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1302         rectsearch(&part_grid_);
1303     rectsearch.StartRectSearch(table_box);
1304     ColPartition* part;
1305     ColPartition* table_partition = NULL;
1306     while ((part = rectsearch.NextRectSearch()) != NULL) {
1307      // Do not consider image partitions
1308       if (!part->IsTextType())
1309         continue;
1310       TBOX part_box = part->bounding_box();
1311       // Include partition in the table if more than half of it
1312       // is covered by the table
1313       if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1314         rectsearch.RemoveBBox();
1315         if (table_partition) {
1316           table_partition->Absorb(part, WidthCB());
1317         } else {
1318           table_partition = part;
1319         }
1320       }
1321     }
1322     // Insert table colpartition back to part_grid_
1323     if (table_partition) {
1324       table_partition->SetPartitionType(best_columns_[table_search.GridY()]);
1325       table_partition->set_table_type();
1326       part_grid_.InsertBBox(true, true, table_partition);
1327     }
1328   }
1329 }
1330 
1331 // Provides a color for BBGrid to draw the rectangle.
BoxColor() const1332 ScrollView::Color  ColSegment::BoxColor() const {
1333   const ScrollView::Color kBoxColors[PT_COUNT] = {
1334     ScrollView::YELLOW,
1335     ScrollView::BLUE,
1336     ScrollView::YELLOW,
1337     ScrollView::MAGENTA,
1338   };
1339   return kBoxColors[type_];
1340 }
1341 
1342 // Insert a box into this column segment
InsertBox(const TBOX & other)1343 void ColSegment::InsertBox(const TBOX& other) {
1344   bounding_box_ = bounding_box_.bounding_union(other);
1345 }
1346 
1347 // Set column segment type based on the ratio of text and table partitions
1348 // in it.
set_type()1349 void ColSegment::set_type() {
1350   if (num_table_cells_ > kTableColumnThreshold * num_text_cells_)
1351     type_ = COL_TABLE;
1352   else if (num_text_cells_ > num_table_cells_)
1353     type_ = COL_TEXT;
1354   else
1355     type_ = COL_MIXED;
1356 }
1357 
1358 }  // namespace tesseract.
1359