• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        colfind.h
3 // Description: Class to find columns in the grid of BLOBNBOXes.
4 // Author:      Ray Smith
5 // Created:     Thu Feb 21 14:04:01 PST 2008
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #ifndef TESSERACT_TEXTORD_COLFIND_H__
21 #define TESSERACT_TEXTORD_COLFIND_H__
22 
23 #include "tabfind.h"
24 #include "tablefind.h"
25 #include "imagefind.h"
26 #include "colpartition.h"
27 #include "colpartitionset.h"
28 #include "ocrblock.h"
29 
30 class ScrollView;
31 class TO_BLOCK;
32 class STATS;
33 class BLOCK_LIST;
34 struct Boxa;
35 struct Pixa;
36 
37 namespace tesseract {
38 
39 class StrokeWidth;
40 class LineSpacing;
41 class TempColumn_LIST;
42 class ColSegment_LIST;
43 class ColumnGroup_LIST;
44 class ColPartitionSet;
45 class ColPartitionSet_LIST;
46 
47 // The ColumnFinder class finds columns in the grid.
48 class ColumnFinder : public TabFind {
49  public:
50   // Gridsize is an estimate of the text size in the image. A suitable value
51   // is in TO_BLOCK::line_size after find_components has been used to make
52   // the blobs.
53   // bleft and tright are the bounds of the image (rectangle) being processed.
54   // vlines is a (possibly empty) list of TabVector and vertical_x and y are
55   // the sum logical vertical vector produced by LineFinder::FindVerticalLines.
56   ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright,
57                TabVector_LIST* vlines, TabVector_LIST* hlines,
58                int vertical_x, int vertical_y);
59   virtual ~ColumnFinder();
60 
61   // Finds the text and image blocks, returning them in the blocks and to_blocks
62   // lists. (Each TO_BLOCK points to the basic BLOCK and adds more information.)
63   // If boxa and pixa are not NULL, they are assumed to be the output of
64   // ImageFinder::FindImages, and are used to generate image blocks.
65   // The input boxa and pixa are destroyed.
66   // Imageheight and resolution should be the pixel height and resolution in
67   // pixels per inch of the original image.
68   // The input block is the result of a call to find_components, and contains
69   // the blobs found in the image. These blobs will be removed and placed
70   // in the output blocks, while unused ones will be deleted.
71   // If single_column is true, the input is treated as single column, but
72   // it is still divided into blocks of equal line spacing/text size.
73   // Returns -1 if the user requested retry with more debug info.
74   int FindBlocks(int imageheight, int resolution, bool single_column,
75                  TO_BLOCK* block, Boxa* boxa, Pixa* pixa,
76                  BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
77 
78  private:
79   // Displays the blob and block bounding boxes in a window called Blocks.
80   void DisplayBlocks(BLOCK_LIST* blocks);
81   // Displays the column edges at each grid y coordinate defined by
82   // best_columns_.
83   void DisplayColumnBounds(PartSetVector* sets);
84 
85   // Converts the arrays of Box/Pix to a list of C_OUTLINE, and then to blobs.
86   // The output is a list of C_BLOBs for the images, but the C_OUTLINEs
87   // contain no data.
88   void ExtractImageBlobs(int image_height, Boxa* boxa, Pixa* pixa);
89 
90   ////// Functions involved in making the initial ColPartitions. /////
91 
92   // Creates the initial ColPartitions, and puts them in a ColPartitionSet
93   // for each grid y coordinate, storing the ColPartitionSets in part_sets_.
94   // After creating the ColPartitonSets, attempts to merge them where they
95   // overlap and unique the BLOBNBOXes within.
96   // The return value is the number of ColPartitionSets made.
97   int MakeColumnPartitions();
98   // Partition the BLOBNBOXES horizontally at the given grid y, creating a
99   // ColPartitionSet which is returned. NULL is returned if there are no
100   // BLOBNBOXES at the given grid y.
101   ColPartitionSet* PartitionsAtGridY(int grid_y);
102   // Insert the blobs in the given list into the main grid and for
103   // each one also make it a separate unknown partition.
104   // If filter is true, use only the blobs that are above a threshold in
105   // size or a non-isolated.
106   void InsertSmallBlobsAsUnknowns(bool filter, BLOBNBOX_LIST* blobs);
107   // Helper function for PartitionsAtGridY, with a long argument list.
108   // This bbox is of unknown type, so it is added to an unk_partition.
109   // If the edge is past the unk_right_margin then unk_partition has to be
110   // completed and a new one made. See CompletePartition and StartPartition
111   // for the other args.
112   void ProcessUnknownBlob(int page_edge, BLOBNBOX* bbox,
113                           ColPartition** unk_partition,
114                           ColPartition_IT* unk_part_it,
115                           TabVector** unk_right_line,
116                           int* unk_right_margin,
117                           int* unk_prev_margin,
118                           bool* unk_edge_is_left);
119   // Creates and returns a new ColPartition of the given start_type
120   // and adds the given bbox to it.
121   // Also finds the left and right tabvectors that bound the textline, setting
122   // the members of the returned ColPartition appropriately:
123   // If the left tabvector is less constraining than the input left_margin
124   // (assumed to be the right edge of the previous partition), then the
125   // tabvector is ignored and the left_margin used instead.
126   // If the right tabvector is more constraining than the input *right_margin,
127   // (probably the right edge of the page), then the *right_margin is adjusted
128   // to use the tabvector.
129   // *edge_is_left is set to true if the right tabvector is good and used as the
130   // margin, so we can include blobs that overhang the tabvector in this
131   // partition.
132   ColPartition* StartPartition(BlobRegionType start_type, int left_margin,
133                                BLOBNBOX* bbox, TabVector** right_line,
134                                int* right_margin, bool* edge_is_left);
135   // Completes the given partition, and adds it to the given iterator.
136   // The right_margin on input is the left edge of the next blob if there is
137   // one. The right tab vector plus a margin is used as the right margin if
138   // it is more constraining than the next blob, but if there are no more
139   // blobs, we want the right margin to make it to the page edge.
140   // The return value is the next left margin, being the right edge of the
141   // bounding box of blobs.
142   int CompletePartition(bool no_more_blobs, int page_edge,
143                         TabVector* right_line, int* right_margin,
144                         ColPartition** partition, ColPartition_IT* part_it);
145 
146 
147   ////// Functions involved in determining the columns used on the page. /////
148 
149   // Makes an ordered list of candidates to partition the width of the page
150   // into columns using the part_sets_.
151   // See AddToColumnSetsIfUnique for the ordering.
152   // If single_column, then it just makes a single page-wide fake column.
153   void MakeColumnCandidates(bool single_column);
154   // Attempt to improve the column_candidates by expanding the columns
155   // and adding new partitions from the partition sets in src_sets.
156   // Src_sets may be equal to column_candidates, in which case it will
157   // use them as a source to improve themselves.
158   void ImproveColumnCandidates(PartSetVector* src_sets,
159                                PartSetVector* column_sets);
160   // Prints debug information on the column candidates.
161   void PrintColumnCandidates(const char* title);
162   // Finds the optimal set of columns that cover the entire image with as
163   // few changes in column partition as possible.
164   void AssignColumns();
165   // Finds the biggest range in part_sets_ that has no assigned column, but
166   // column assignment is possible.
167   bool BiggestUnassignedRange(const bool* any_columns_possible,
168                               int* start, int* end);
169   // Finds the modal compatible column_set_ index within the given range.
170   int RangeModalColumnSet(bool** possible_column_sets,
171                           int start, int end);
172   // Given that there are many column_set_id compatible columns in the range,
173   // shrinks the range to the longest contiguous run of compatibility, allowing
174   // gaps where no columns are possible, but not where competing columns are
175   // possible.
176   void ShrinkRangeToLongestRun(bool** possible_column_sets,
177                               const bool* any_columns_possible,
178                               int column_set_id,
179                               int* best_start, int* best_end);
180   // Moves start in the direction of step, upto, but not including end while
181   // the only incompatible regions are no more than kMaxIncompatibleColumnCount
182   // in size, and the compatible regions beyond are bigger.
183   void ExtendRangePastSmallGaps(bool** possible_column_sets,
184                                 const bool* any_columns_possible,
185                                 int column_set_id,
186                                 int step, int end, int* start);
187   // Assigns the given column_set_id to the part_sets_ in the given range.
188   void AssignColumnToRange(int column_set_id, int start, int end,
189                            bool** assigned_column_sets);
190 
191   // Computes the mean_column_gap_.
192   void ComputeMeanColumnGap();
193 
194   //////// Functions that manipulate ColPartitions in the part_grid_ /////
195   //////// to split, merge, find margins, and find types.  //////////////
196 
197   // Removes the ColPartitions from part_sets_, the ColPartitionSets that
198   // contain them, and puts them in the part_grid_ after ensuring that no
199   // BLOBNBOX is owned by more than one of them.
200   void MovePartitionsToGrid();
201   // Splits partitions that cross columns where they have nothing in the gap.
202   void GridSplitPartitions();
203   // Merges partitions where there is vertical overlap, within a single column,
204   // and the horizontal gap is small enough.
205   void GridMergePartitions();
206   // Resolves unknown partitions from the unknown_parts_ list by merging them
207   // with a close neighbour, inserting them into the grid with a known type,
208   // or declaring them to be noise.
209   void GridInsertUnknowns();
210   // Add horizontal line separators as partitions.
211   void GridInsertHLinePartitions();
212   // Improves the margins of the ColPartitions in the grid by calling
213   // FindPartitionMargins on each.
214   void GridFindMargins();
215   // Improves the margins of the ColPartitions in the list by calling
216   // FindPartitionMargins on each.
217   void ListFindMargins(ColPartition_LIST* parts);
218   // Improves the margins of the ColPartition by searching for
219   // neighbours that vertically overlap significantly.
220   void FindPartitionMargins(ColPartitionSet* columns, ColPartition* part);
221   // Starting at x, and going in the specified direction, upto x_limit, finds
222   // the margin for the given y range by searching sideways,
223   // and ignoring not_this.
224   int FindMargin(int x, bool right_to_left, int x_limit,
225                  int y_bottom, int y_top, const ColPartition* not_this);
226   // For every ColPartition in the grid, sets its type based on position
227   // in the columns.
228   void SetPartitionTypes();
229 
230   //////// Functions that manipulate ColPartitions in the part_grid_ /////
231   //////// to find chains of partner partitions of the same type.  ///////
232 
233   // For every ColPartition in the grid, finds its upper and lower neighbours.
234   void FindPartitionPartners();
235   // Finds the best partner in the given direction for the given partition.
236   // Stores the result with AddPartner.
237   void FindPartitionPartners(bool upper, ColPartition* part);
238   // For every ColPartition with multiple partners in the grid, reduces the
239   // number of partners to 0 or 1.
240   void RefinePartitionPartners();
241   // Only images remain with multiple types in a run of partners.
242   // Sets the type of all in the group to the maximum of the group.
243   void SmoothPartnerRuns();
244 
245   //////// Functions that manipulate ColPartitions in the part_grid_ /////
246   //////// to find tables.                                          ///////
247 
248   // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
249   // insert dot-like noise into period_grid_
250   void GetCleanPartitions(TO_BLOCK* block);
251 
252   // High level function to perform table detection
253   void LocateTables();
254 
255   // Get Column segments from best_columns_
256   void GetColumnBlocks(ColSegment_LIST *col_segments);
257 
258   // Group Column segments into consecutive single column regions.
259   void GroupColumnBlocks(ColSegment_LIST *current_segments,
260                         ColSegment_LIST *col_segments);
261 
262   // Check if two boxes are consecutive within the same column
263   bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2);
264 
265   // Set left, right and top, bottom spacings of each colpartition.
266   // Left/right spacings are w.r.t the column boundaries
267   // Top/bottom spacings are w.r.t. previous and next colpartitions
268   void SetPartitionSpacings();
269 
270   // Set spacing and closest neighbors above and below a given colpartition.
271   void SetVerticalSpacing(ColPartition* part);
272 
273   // Set global spacing estimates
274   void SetGlobalSpacings();
275 
276   // Mark partitions as table rows/cells.
277   void GridMarkTablePartitions();
278 
279   // Check if the partition has at lease one large gap between words or no
280   // significant gap at all
281   bool HasWideOrNoInterWordGap(ColPartition* part);
282 
283   // Check if a period lies in the inter-wrod gap in the parition boxes
284   bool LiesInGap(BLOBNBOX* period, BLOBNBOX_CLIST* boxes);
285 
286   // Filter individual text partitions marked as table partitions
287   // consisting of paragraph endings, small section headings, and
288   // headers and footers.
289   void FilterFalseAlarms();
290 
291   // Mark all ColPartitions as table cells that have a table cell above
292   // and below them
293   void SmoothTablePartitionRuns();
294 
295   // Set the ratio of candidate table partitions in each column
296   void SetColumnsType(ColSegment_LIST* col_segments);
297 
298   // Move Column Blocks to col_seg_grid_
299   void MoveColSegmentsToGrid(ColSegment_LIST *segments,
300                              ColSegmentGrid *col_seg_grid);
301 
302   // Merge Column Blocks that were split due to the presence of a table
303   void GridMergeColumnBlocks();
304 
305   // Merge table cells into table columns
306   void GetTableColumns(ColSegment_LIST *table_columns);
307 
308   // Get Column segments from best_columns_
309   void GetTableRegions(ColSegment_LIST *table_columns,
310                        ColSegment_LIST *table_regions);
311 
312   // Merge table regions corresponding to tables spanning multiple columns
313   void GridMergeTableRegions();
314   bool BelongToOneTable(const TBOX &box1, const TBOX &box2);
315 
316   // Adjust table boundaries by building a tight bounding box around all
317   // ColPartitions contained in it.
318   void AdjustTableBoundaries();
319 
320   // Checks whether the horizontal line belong to the table by looking at the
321   // side spacing of extra ColParitions that will be included in the table
322   // due to expansion
323   bool HLineBelongsToTable(ColPartition* part, const TBOX& table_box);
324 
325   // Look for isolated column headers above the given table box and
326   // include them in the table
327   void IncludeLeftOutColumnHeaders(TBOX& table_box);
328 
329   // Remove false alarms consiting of a single column
330   void DeleteSingleColumnTables();
331 
332   // Return true if at least one gap larger than the global x-height
333   // exists in the horizontal projection
334   bool GapInXProjection(int* xprojection, int length);
335 
336   // Displays Colpartitions marked as table row. Overlays them on top of
337   // part_grid_.
338   void DisplayColSegments(ColSegment_LIST *cols, ScrollView* win,
339                           ScrollView::Color color);
340 
341   // Displays the colpartitions using a new coloring on an existing window.
342   // Note: This method is only for debug purpose during development and
343   // would not be part of checked in code
344   void DisplayColPartitions(ScrollView* win,
345                             ScrollView::Color color);
346 
347   // Write ColParitions and Tables to a PIX image
348   // Note: This method is only for debug purpose during development and
349   // would not be part of checked in code
350   void WriteToPix();
351 
352   // Merge all colpartitions in table regions to make them a single
353   // colpartition and revert types of isolated table cells not
354   // assigned to any table to their original types.
355   void MakeTableBlocks();
356 
357   //////// Functions that make the final output blocks             ///////
358 
359   // Helper functions for TransformToBlocks.
360   // Add the part to the temp list in the correct order.
361   void AddToTempPartList(ColPartition* part, ColPartition_CLIST* temp_list);
362   // Add everything from the temp list to the work_set assuming correct order.
363   void EmptyTempPartList(ColPartition_CLIST* temp_list,
364                          WorkingPartSet_LIST* work_set);
365 
366   // Transform the grid of partitions to the output blocks.
367   void TransformToBlocks(BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
368 
369   // Reskew the completed blocks to put them back to the original coords.
370   // (Blob outlines are not corrected for skew.)
371   // Rotate blobs and blocks individually so text line direction is
372   // horizontal. Record appropriate inverse transformations and required
373   // classifier transformation in the blocks.
374   void RotateAndReskewBlocks(TO_BLOCK_LIST* to_blocks);
375 
376 
377   // Move all the small and noise blobs into the main blobs list of
378   // the block from the to_blocks list that contains them.
379   void MoveSmallBlobs(BLOBNBOX_LIST* bblobs, TO_BLOCK_LIST* to_blocks);
380 
381   // The mean gap between columns over the page.
382   int mean_column_gap_;
383   // Estimate of median x-height over the page
384   int global_median_xheight_;
385   // Estimate of median ledding on the page
386   int global_median_ledding_;
387   // The rotation vector needed to convert deskewed back to original coords.
388   FCOORD reskew_;
389   // The rotation vector needed to convert the rotated back to original coords.
390   FCOORD rerotate_;
391   // The part_sets_ are the initial text-line-like partition of the grid,
392   // and is a vector of ColPartitionSets.
393   PartSetVector part_sets_;
394   // The column_sets_ contain the ordered candidate ColPartitionSets that
395   // define the possible divisions of the page into columns.
396   PartSetVector column_sets_;
397   // A simple array of pointers to the best assigned column division at
398   // each grid y coordinate.
399   ColPartitionSet** best_columns_;
400   // The grid used to hold ColPartitions after the columns have been determined.
401   ColPartitionGrid part_grid_;
402   // Grid to hold cleaned colpartitions after removing all
403   // colpartitions that consist of only noise blobs, and removing
404   // noise blobs from remaining colpartitions.
405   ColPartitionGrid clean_part_grid_;
406   // Grid to hold periods, commas, i-dots etc.
407   BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> period_grid_;
408   // List of period blobs extracted from small and noise blobs
409   BLOBNBOX_LIST period_blobs_;
410   // Grid of page column blocks
411   ColSegmentGrid col_seg_grid_;
412   // Grid of detected tables
413   ColSegmentGrid table_grid_;
414   // List of ColPartitions that are no longer needed after they have been
415   // turned into regions, but are kept around because they are referenced
416   // by the part_grid_.
417   ColPartition_LIST good_parts_;
418   // List of ColPartitions of unknown type.
419   ColPartition_LIST unknown_parts_;
420   // List of ColPartitions that have been declared noise.
421   ColPartition_LIST noise_parts_;
422   // The fake blobs that are made from the input boxa/pixa pair.
423   BLOBNBOX_LIST image_bblobs_;
424   // Horizontal line separators.
425   TabVector_LIST horizontal_lines_;
426   // Allow a subsequent instance to reuse the blocks window.
427   // Not thread-safe, but multiple threads shouldn't be using windows anyway.
428   static ScrollView* blocks_win_;
429 };
430 
431 }  // namespace tesseract.
432 
433 #endif  // TESSERACT_TEXTORD_COLFIND_H__
434