• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        colfind.cpp
3 // Description: Class to hold BLOBNBOXs in a grid for fast access
4 //              to neighbours.
5 // Author:      Ray Smith
6 // Created:     Wed Jun 06 17:22:01 PDT 2007
7 //
8 // (C) Copyright 2007, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
19 ///////////////////////////////////////////////////////////////////////
20 
21 #include "colfind.h"
22 #include "colpartition.h"
23 #include "colpartitionset.h"
24 #include "linefind.h"
25 #include "strokewidth.h"
26 #include "blobbox.h"
27 #include "scrollview.h"
28 #include "tessvars.h"
29 #include "varable.h"
30 #include "workingpartset.h"
31 
32 namespace tesseract {
33 
34 // Minimum width to be considered when making columns.
35 const int kMinColumnWidth = 100;
36 // When assigning columns, the max number of misfits that can be ignored.
37 const int kMaxIncompatibleColumnCount = 2;
38 // Max vertical distance of neighbouring ColPartition for it to be a partner.
39 const double kMaxPartitionSpacing = 1.75;
40 // Min fraction of ColPartition height to be overlapping for margin purposes.
41 const double kMarginOverlapFraction = 0.25;
42 // Max fraction of mean_column_gap_ for the gap between two partitions within a
43 // column to allow them to merge.
44 const double kHorizontalGapMergeFraction = 0.5;
45 // Min fraction of grid size to not be considered likely noise.
46 const double kMinNonNoiseFraction = 0.5;
47 // Search radius to use for finding large neighbours of smaller blobs.
48 const int kSmallBlobSearchRadius = 2;
49 
50 BOOL_VAR(textord_tabfind_show_strokewidths, false, "Show stroke widths");
51 BOOL_VAR(textord_tabfind_show_initial_partitions,
52          false, "Show partition bounds");
53 INT_VAR(textord_tabfind_show_partitions, 0,
54         "Show partition bounds, waiting if >1");
55 BOOL_VAR(textord_tabfind_show_columns, false, "Show column bounds");
56 BOOL_VAR(textord_tabfind_show_blocks, false, "Show final block bounds");
57 
58 ScrollView* ColumnFinder::blocks_win_ = NULL;
59 
60 // Gridsize is an estimate of the text size in the image. A suitable value
61 // is in TO_BLOCK::line_size after find_components has been used to make
62 // the blobs.
63 // bleft and tright are the bounds of the image (or rectangle) being processed.
64 // vlines is a (possibly empty) list of TabVector and vertical_x and y are
65 // the sum logical vertical vector produced by LineFinder::FindVerticalLines.
ColumnFinder(int gridsize,const ICOORD & bleft,const ICOORD & tright,TabVector_LIST * vlines,TabVector_LIST * hlines,int vertical_x,int vertical_y)66 ColumnFinder::ColumnFinder(int gridsize,
67                            const ICOORD& bleft, const ICOORD& tright,
68                            TabVector_LIST* vlines, TabVector_LIST* hlines,
69                            int vertical_x, int vertical_y)
70   : TabFind(gridsize, bleft, tright, vlines, vertical_x, vertical_y),
71     mean_column_gap_(tright.x() - bleft.x()),
72     global_median_xheight_(0), global_median_ledding_(0),
73     reskew_(1.0f, 0.0f), rerotate_(1.0f, 0.0f),
74     best_columns_(NULL) {
75   TabVector_IT h_it(&horizontal_lines_);
76   h_it.add_list_after(hlines);
77 }
78 
79 // Templated helper function used to create destructor callbacks for the
80 // BBGrid::ClearGridData() method.
DeleteObject(T * object)81 template <typename T> void DeleteObject(T *object) {
82   delete object;
83 }
84 
~ColumnFinder()85 ColumnFinder::~ColumnFinder() {
86   column_sets_.delete_data_pointers();
87   if (best_columns_ != NULL) {
88     delete [] best_columns_;
89   }
90   // ColPartitions and ColSegments created by this class for storage in grids
91   // need to be deleted explicitly.
92   clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
93   col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
94 
95   // The ColPartitions are destroyed automatically, but any boxes in
96   // the noise_parts_ list are owned and need to be deleted explicitly.
97   ColPartition_IT part_it(&noise_parts_);
98   for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
99     ColPartition* part = part_it.data();
100     part->DeleteBoxes();
101   }
102   // Likewise any boxes in the good_parts_ list need to be deleted.
103   // These are just the image parts. Text parts have already given their
104   // boxes on to the TO_BLOCK, and have empty lists.
105   part_it.set_to_list(&good_parts_);
106   for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
107     ColPartition* part = part_it.data();
108     part->DeleteBoxes();
109   }
110   // Also, any blobs on the image_bblobs_ list need to have their cblobs
111   // deleted. This only happens if there has been an early return from
112   // FindColumns, as in a normal return, the blobs go into the grid and
113   // end up in noise_parts_, good_parts_ or the output blocks.
114   BLOBNBOX_IT bb_it(&image_bblobs_);
115   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
116     BLOBNBOX* bblob = bb_it.data();
117     delete bblob->cblob();
118   }
119 }
120 
121 // Finds the text and image blocks, returning them in the blocks and to_blocks
122 // lists. (Each TO_BLOCK points to the basic BLOCK and adds more information.)
123 // If boxa and pixa are not NULL, they are assumed to be the output of
124 // ImageFinder::FindImages, and are used to generate image blocks.
125 // The input boxa and pixa are destroyed.
126 // Imageheight and resolution should be the pixel height and resolution in
127 // pixels per inch of the original image.
128 // The input block is the result of a call to find_components, and contains
129 // the blobs found in the image. These blobs will be removed and placed
130 // in the output blocks, while unused ones will be deleted.
131 // If single_column is true, the input is treated as single column, but
132 // it is still divided into blocks of equal line spacing/text size.
133 // Returns -1 if the user requested retry with more debug info.
FindBlocks(int imageheight,int resolution,bool single_column,TO_BLOCK * block,Boxa * boxa,Pixa * pixa,BLOCK_LIST * blocks,TO_BLOCK_LIST * to_blocks)134 int ColumnFinder::FindBlocks(int imageheight, int resolution,
135                              bool single_column, TO_BLOCK* block,
136                              Boxa* boxa, Pixa* pixa,
137                              BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) {
138 #ifdef HAVE_LIBLEPT
139   if (boxa != NULL) {
140     // Convert the boxa/pixa to fake blobs aligned on the grid.
141     ExtractImageBlobs(imageheight, boxa, pixa);
142     boxaDestroy(&boxa);
143     pixaDestroy(&pixa);
144   }
145 #endif
146   // Decide which large blobs should be included in the grid as potential
147   // characters.
148   // A subsidiary grid used to decide which large blobs to use.
149   StrokeWidth* stroke_width = new StrokeWidth(gridsize(), bleft(), tright());
150   stroke_width->InsertBlobs(block, this);
151   if (textord_tabfind_show_strokewidths) {
152     stroke_width->DisplayGoodBlobs("GoodStrokewidths", NULL);
153   }
154   stroke_width->MoveGoodLargeBlobs(resolution, block);
155   delete stroke_width;
156 
157   if (single_column) {
158     // No tab stops needed. Just the grid that FindTabVectors makes.
159     DontFindTabVectors(resolution, &image_bblobs_, block, &reskew_);
160   } else {
161     // Find the tab stops.
162     FindTabVectors(resolution, &horizontal_lines_, &image_bblobs_, block,
163                    &reskew_, &rerotate_);
164   }
165 
166   // Find the columns.
167   if (MakeColumnPartitions() == 0)
168     return 0;  // This is an empty page.
169   // Make the initial column candidates from the part_sets_.
170   MakeColumnCandidates(single_column);
171   if (textord_debug_tabfind)
172     PrintColumnCandidates("Column candidates");
173   // Improve the column candidates against themselves.
174   ImproveColumnCandidates(&column_sets_, &column_sets_);
175   if (textord_debug_tabfind)
176     PrintColumnCandidates("Improved columns");
177   // Improve the column candidates using the part_sets_.
178   ImproveColumnCandidates(&part_sets_, &column_sets_);
179   if (textord_debug_tabfind)
180     PrintColumnCandidates("Final Columns");
181   // Divide the page into sections of uniform column layout.
182   AssignColumns();
183   if (textord_tabfind_show_columns) {
184     DisplayColumnBounds(&part_sets_);
185   }
186   ComputeMeanColumnGap();
187 
188   // Refill the grid using rectangular spreading, and get the benefit
189   // of the completed tab vectors marking the rule edges of each blob.
190   Clear();
191   InsertBlobList(false, false, false, &image_bblobs_, true, this);
192   InsertBlobList(true, true, false, &block->blobs, true, this);
193 
194   // Insert all the remaining small and noise blobs into the grid and also
195   // make an unknown partition for each. Ownership is taken by the grid.
196   InsertSmallBlobsAsUnknowns(true, &block->small_blobs);
197   InsertSmallBlobsAsUnknowns(true, &block->noise_blobs);
198 
199   // Ownership of the ColPartitions moves from part_sets_ to part_grid_ here,
200   // and ownership of the BLOBNBOXes moves to the ColPartitions.
201   // (They were previously owned by the block or the image_bblobs list,
202   // but they both gave up ownership to the grid at the InsertBlobList above.)
203   MovePartitionsToGrid();
204   // Split and merge the partitions by looking at local neighbours.
205   GridSplitPartitions();
206   // Resolve unknown partitions by adding to an existing partition, fixing
207   // the type, or declaring them noise.
208   GridFindMargins();
209   ListFindMargins(&unknown_parts_);
210   GridInsertUnknowns();
211   GridMergePartitions();
212   // Add horizontal line separators as partitions.
213   GridInsertHLinePartitions();
214   // Recompute margins based on a local neighbourhood search.
215   GridFindMargins();
216   SetPartitionTypes();
217   if (textord_tabfind_show_initial_partitions) {
218     ScrollView* part_win = MakeWindow(100, 300, "InitialPartitions");
219     part_grid_.DisplayBoxes(part_win);
220     DisplayTabVectors(part_win);
221   }
222 
223   // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
224   // insert dot-like noise into period_grid_
225   GetCleanPartitions(block);
226 
227   // Get Table Regions
228   LocateTables();
229 
230   // Build the partitions into chains that belong in the same block and
231   // refine into one-to-one links, then smooth the types within each chain.
232   FindPartitionPartners();
233   RefinePartitionPartners();
234   SmoothPartnerRuns();
235 #ifndef GRAPHICS_DISABLED
236   if (textord_tabfind_show_partitions) {
237     ScrollView* window = MakeWindow(400, 300, "Partitions");
238     if (textord_debug_images)
239       window->Image(AlignedBlob::textord_debug_pix().string(),
240                     image_origin().x(), image_origin().y());
241     part_grid_.DisplayBoxes(window);
242     if (!textord_debug_printable)
243       DisplayTabVectors(window);
244     if (window != NULL && textord_tabfind_show_partitions > 1) {
245       delete window->AwaitEvent(SVET_DESTROY);
246     }
247   }
248 #endif
249   part_grid_.AssertNoDuplicates();
250   // Ownership of the ColPartitions moves from part_grid_ to good_parts_ and
251   // noise_parts_ here. In text blocks, ownership of the BLOBNBOXes moves
252   // from the ColPartitions to the output TO_BLOCK. In non-text, the
253   // BLOBNBOXes stay with the ColPartitions and get deleted in the destructor.
254   TransformToBlocks(blocks, to_blocks);
255   if (textord_debug_tabfind) {
256     tprintf("Found %d blocks, %d to_blocks\n",
257             blocks->length(), to_blocks->length());
258   }
259 
260   DisplayBlocks(blocks);
261 //  MoveSmallBlobs(&block->small_blobs, to_blocks);
262 //  MoveSmallBlobs(&block->noise_blobs, to_blocks);
263 //  MoveSmallBlobs(&period_blobs_, to_blocks);
264   RotateAndReskewBlocks(to_blocks);
265   int result = 0;
266 #ifndef GRAPHICS_DISABLED
267   if (blocks_win_ != NULL) {
268     bool waiting = false;
269     do {
270       waiting = false;
271       SVEvent* event = blocks_win_->AwaitEvent(SVET_ANY);
272       if (event->type == SVET_INPUT && event->parameter != NULL) {
273         if (*event->parameter == 'd')
274           result = -1;
275         else
276           blocks->clear();
277       } else if (event->type == SVET_DESTROY) {
278         blocks_win_ = NULL;
279       } else {
280         waiting = true;
281       }
282       delete event;
283     } while (waiting);
284   }
285 #endif
286   return result;
287 }
288 
289 //////////////// PRIVATE CODE /////////////////////////
290 
291 // Displays the blob and block bounding boxes in a window called Blocks.
DisplayBlocks(BLOCK_LIST * blocks)292 void ColumnFinder::DisplayBlocks(BLOCK_LIST* blocks) {
293 #ifndef GRAPHICS_DISABLED
294   if (textord_tabfind_show_blocks) {
295     if (blocks_win_ == NULL)
296       blocks_win_ = MakeWindow(700, 300, "Blocks");
297     else
298       blocks_win_->Clear();
299     if (textord_debug_images)
300       blocks_win_->Image(AlignedBlob::textord_debug_pix().string(),
301                          image_origin().x(), image_origin().y());
302     else
303       DisplayBoxes(blocks_win_);
304     BLOCK_IT block_it(blocks);
305     int serial = 1;
306     for (block_it.mark_cycle_pt(); !block_it.cycled_list();
307          block_it.forward()) {
308       BLOCK* block = block_it.data();
309       block->plot(blocks_win_, serial++,
310                   textord_debug_printable ? ScrollView::BLUE
311                                           : ScrollView::GREEN);
312     }
313     blocks_win_->Update();
314   }
315 #endif
316 }
317 
318 // Displays the column edges at each grid y coordinate defined by
319 // best_columns_.
DisplayColumnBounds(PartSetVector * sets)320 void ColumnFinder::DisplayColumnBounds(PartSetVector* sets) {
321 #ifndef GRAPHICS_DISABLED
322   ScrollView* col_win = MakeWindow(50, 300, "Columns");
323   if (textord_debug_images)
324     col_win->Image(AlignedBlob::textord_debug_pix().string(),
325                    image_origin().x(), image_origin().y());
326   else
327     DisplayBoxes(col_win);
328   col_win->Pen(textord_debug_printable ? ScrollView::BLUE : ScrollView::GREEN);
329   for (int i = 0; i < gridheight_; ++i) {
330     ColPartitionSet* columns = best_columns_[i];
331     if (columns != NULL)
332       columns->DisplayColumnEdges(i * gridsize_, (i + 1) * gridsize_, col_win);
333   }
334 #endif
335 }
336 
337 // Converts the arrays of Box/Pix to a list of C_OUTLINE, and then to blobs.
338 // The output is a list of C_BLOBs for the images, but the C_OUTLINEs
339 // contain no data.
ExtractImageBlobs(int image_height,Boxa * boxa,Pixa * pixa)340 void ColumnFinder::ExtractImageBlobs(int image_height, Boxa* boxa, Pixa* pixa) {
341 #ifdef HAVE_LIBLEPT
342   BLOBNBOX_IT bb_it(&image_bblobs_);
343   // Iterate the connected components in the image regions mask.
344   int nboxes = boxaGetCount(boxa);
345   for (int i = 0; i < nboxes; ++i) {
346     l_int32 x, y, width, height;
347     boxaGetBoxGeometry(boxa, i, &x, &y, &width, &height);
348     Pix* pix = pixaGetPix(pixa, i, L_CLONE);
349     // Special case set in FindImages:
350     // The image is a rectangle if its width doesn't match the box width.
351     bool rectangle = width != pixGetWidth(pix);
352     // For each grid cell in the pix, find the bounding box of the black
353     // pixels within the cell.
354     int grid_xmin, grid_ymin, grid_xmax, grid_ymax;
355     GridCoords(x, image_height - (y + height), &grid_xmin, &grid_ymin);
356     GridCoords(x + width - 1, image_height - 1 - y, &grid_xmax, &grid_ymax);
357     for (int grid_y = grid_ymin; grid_y <= grid_ymax; ++grid_y) {
358       for (int grid_x = grid_xmin; grid_x <= grid_xmax; ++grid_x) {
359         // Compute bounds of grid cell in sub-image.
360         int x_start = grid_x * gridsize_ + bleft_.x() - x;
361         int y_end = image_height - (grid_y * gridsize_ + bleft_.y()) - y;
362         int x_end = x_start + gridsize_;
363         int y_start = y_end - gridsize_;
364         ImageFinder::BoundsWithinRect(pix, &x_start, &y_start, &x_end, &y_end);
365         // If the box is not degenerate, make a blob.
366         if (x_end > x_start && y_end > y_start) {
367           C_OUTLINE_LIST outlines;
368           C_OUTLINE_IT ol_it = &outlines;
369           // Make a C_OUTLINE from the bounds. This is a bit of a hack,
370           // as there is no outline, just a bounding box, but with some very
371           // small changes to coutln.cpp, it works nicely.
372           ICOORD top_left(x_start + x, image_height - (y_start + y));
373           ICOORD bot_right(x_end + x, image_height - (y_end + y));
374           CRACKEDGE startpt;
375           startpt.pos = top_left;
376           C_OUTLINE* outline = new C_OUTLINE(&startpt, top_left, bot_right, 0);
377           ol_it.add_after_then_move(outline);
378           C_BLOB* blob = new C_BLOB(&outlines);
379           // Although a BLOBNBOX doesn't normally think it owns the blob,
380           // these will all end up in a ColPartition, which deletes the
381           // C_BLOBs of all its BLOBNBOXes in its destructor to match the
382           // fact that the rest get moved to a block later.
383           BLOBNBOX* bblob = new BLOBNBOX(blob);
384           bblob->set_region_type(rectangle ? BRT_RECTIMAGE : BRT_POLYIMAGE);
385           bb_it.add_after_then_move(bblob);
386         }
387       }
388     }
389     pixDestroy(&pix);
390   }
391 #endif  // HAVE_LIBLEPT
392 }
393 
394 ////// Functions involved in making the initial ColPartitions. /////
395 
396 // Creates the initial ColPartitions, and puts them in a ColPartitionSet
397 // for each grid y coordinate, storing the ColPartitionSets in part_sets_.
398 // After creating the ColPartitonSets, attempts to merge them where they
399 // overlap and unique the BLOBNBOXes within.
400 // The return value is the number of ColPartitionSets made.
MakeColumnPartitions()401 int ColumnFinder::MakeColumnPartitions() {
402   part_sets_.reserve(gridheight_);
403   for (int grid_y = 0; grid_y < gridheight_; ++grid_y) {
404     ColPartitionSet* line_set = PartitionsAtGridY(grid_y);
405     part_sets_.push_back(line_set);
406   }
407   // Now merge neighbouring partitions that overlap significantly.
408   int part_set_count = 0;
409   for (int i = 0; i < gridheight_; ++i) {
410     ColPartitionSet* line_set = part_sets_.get(i);
411     if (line_set == NULL)
412       continue;
413     bool merged_any = false;
414     for (int j = i + 1; j < gridheight_; ++j) {
415       ColPartitionSet* line_set2 = part_sets_.get(j);
416       if (line_set2 == NULL)
417         continue;
418       if (line_set->MergeOverlaps(line_set2, WidthCB())) {
419         merged_any = true;
420         if (line_set2->Empty()) {
421           delete line_set2;
422           part_sets_.set(NULL, j);
423         }
424         if (line_set->Empty()) {
425           delete line_set;
426           part_sets_.set(NULL, i);
427           merged_any = false;
428           break;
429         }
430       } else {
431         break;
432       }
433     }
434     if (merged_any)
435       --i;  // Try this one again.
436     else
437       ++part_set_count;
438   }
439   return part_set_count;
440 }
441 
442 // Partition the BLOBNBOXES horizontally at the given grid y, creating a
443 // ColPartitionSet which is returned. NULL is returned if there are no
444 // BLOBNBOXES at the given grid y.
PartitionsAtGridY(int grid_y)445 ColPartitionSet* ColumnFinder::PartitionsAtGridY(int grid_y) {
446   ColPartition_LIST partitions;
447   ColPartition_IT part_it(&partitions);
448   // Setup a search of all the grid cells at the given y.
449   GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> rectsearch(this);
450   int y = grid_y * gridsize_ + bleft_.y();
451   ICOORD botleft(bleft_.x(), y);
452   ICOORD topright(tright_.x(), y + gridsize_ - 1);
453   TBOX line_box(botleft, topright);
454   rectsearch.StartRectSearch(line_box);
455   BLOBNBOX* bbox = rectsearch.NextRectSearch();
456   // Each iteration round this while loop finds a set of boxes between
457   // tabvectors (or a change of aligned_text type) and places them in
458   // a ColPartition.
459   int page_edge = line_box.right() + kColumnWidthFactor;
460   int prev_margin = line_box.left() - kColumnWidthFactor;
461   // Runs of unknown blobs (not certainly text or image) go in a special
462   // unk_part, following the same rules as known blobs, but need a
463   // separate set of variables to hold the margin/edge information.
464   ColPartition_IT unk_part_it(&unknown_parts_);
465   ColPartition* unk_partition = NULL;
466   TabVector* unk_right_line = NULL;
467   int unk_right_margin = page_edge;
468   int unk_prev_margin = prev_margin;
469   bool unk_edge_is_left = false;
470   while (bbox != NULL) {
471     TBOX box = bbox->bounding_box();
472     if (WithinTestRegion(2, box.left(), box.bottom()))
473       tprintf("Starting partition on grid y=%d with box (%d,%d)->(%d,%d)\n",
474               grid_y, box.left(), box.bottom(), box.right(), box.top());
475     if (box.left() < prev_margin + 1 && textord_debug_bugs) {
476       tprintf("Starting box too far left at %d vs %d:",
477               box.left(), prev_margin + 1);
478       part_it.data()->Print();
479     }
480     int right_margin = page_edge;
481     BlobRegionType start_type = bbox->region_type();
482     if (start_type == BRT_NOISE) {
483       // Ignore blobs that have been overruled by image blobs.
484       // TODO(rays) Possible place to fix inverse text.
485       bbox = rectsearch.NextRectSearch();
486       continue;
487     }
488     if (start_type == BRT_UNKNOWN) {
489       // Keep unknown blobs in a special partition.
490       ProcessUnknownBlob(page_edge, bbox, &unk_partition, &unk_part_it,
491                          &unk_right_line, &unk_right_margin,
492                          &unk_prev_margin, &unk_edge_is_left);
493       bbox = rectsearch.NextRectSearch();
494       continue;
495     }
496     if (unk_partition != NULL)
497       unk_prev_margin = CompletePartition(false, page_edge,
498                                           unk_right_line, &unk_right_margin,
499                                           &unk_partition, &unk_part_it);
500     TabVector* right_line = NULL;
501     bool edge_is_left = false;
502     ColPartition* partition = StartPartition(start_type, prev_margin + 1, bbox,
503                                              &right_line, &right_margin,
504                                              &edge_is_left);
505     // Search for the right edge of this partition.
506     while ((bbox = rectsearch.NextRectSearch()) != NULL) {
507       TBOX box = bbox->bounding_box();
508       int left = box.left();
509       int right = box.right();
510       int edge = edge_is_left ? left : right;
511       BlobRegionType next_type = bbox->region_type();
512       if (next_type == BRT_NOISE)
513         continue;
514       if (next_type == BRT_UNKNOWN) {
515         // Keep unknown blobs in a special partition.
516         ProcessUnknownBlob(page_edge, bbox, &unk_partition, &unk_part_it,
517                            &unk_right_line, &unk_right_margin,
518                            &unk_prev_margin, &unk_edge_is_left);
519         continue;  // Deal with them later.
520       }
521       if (unk_partition != NULL)
522         unk_prev_margin = CompletePartition(false, page_edge,
523                                             unk_right_line, &unk_right_margin,
524                                             &unk_partition, &unk_part_it);
525       if (ColPartition::TypesMatch(next_type, start_type) &&
526           edge < right_margin) {
527         // Still within the region and it is still the same type.
528         partition->AddBox(bbox);
529       } else {
530         // This is the first blob in the next set. It gives us the absolute
531         // max right coord of the block. (The right margin.)
532         right_margin = left - 1;
533         if (WithinTestRegion(2, box.left(), box.bottom()))
534           tprintf("Box (%d,%d)->(%d,%d) ended partition at %d\n",
535                   box.left(), box.bottom(), box.right(), box.top(),
536                   right_margin);
537         break;
538       }
539     }
540     prev_margin = CompletePartition(bbox == NULL, page_edge,
541                                     right_line, &right_margin,
542                                     &partition, &part_it);
543   }
544   if (unk_partition != NULL)
545     CompletePartition(true, page_edge, unk_right_line, &unk_right_margin,
546                       &unk_partition, &unk_part_it);
547   return partitions.empty() ? NULL : new ColPartitionSet(&partitions);
548 }
549 
550 // Insert the blobs in the given list into the main grid and for
551 // each one also make it a separate unknown partition.
552 // If filter is true, use only the blobs that are above a threshold in
553 // size or a non-isolated.
InsertSmallBlobsAsUnknowns(bool filter,BLOBNBOX_LIST * blobs)554 void ColumnFinder::InsertSmallBlobsAsUnknowns(bool filter,
555                                               BLOBNBOX_LIST* blobs) {
556   double noise_blob_size = gridsize() * kMinNonNoiseFraction;
557   ColPartition_IT unk_part_it(&unknown_parts_);
558   BLOBNBOX_IT blob_it(blobs);
559   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
560     BLOBNBOX* blob = blob_it.data();
561     TBOX box = blob->bounding_box();
562     bool good_blob = !filter ||
563                      box.width() > noise_blob_size ||
564                      box.height() > noise_blob_size;
565     if (!good_blob) {
566       // Search the vicinity for a bigger blob.
567       GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> radsearch(this);
568       radsearch.StartRadSearch((box.left() + box.right()) / 2,
569                                (box.bottom() + box.top()) / 2,
570                                kSmallBlobSearchRadius);
571       BLOBNBOX* neighbour;
572       while ((neighbour = radsearch.NextRadSearch()) != NULL) {
573         TBOX nbox = neighbour->bounding_box();
574         // Neighbours must be bigger than the noise size limit to prevent
575         // the seed effect of starting with one noise object near a real
576         // object, and it then allowing all its neighbours to be accepted.
577         if (nbox.height() > noise_blob_size || nbox.width() > noise_blob_size)
578           break;
579       }
580       if (neighbour != NULL)
581         good_blob = true;
582     }
583     if (good_blob) {
584       blob_it.extract();
585       blob->set_noise_flag(true);
586       InsertBlob(true, true, false, blob, this);
587       if (WithinTestRegion(2, box.left(), box.bottom()))
588         tprintf("Starting small partition with box (%d,%d)->(%d,%d)\n",
589                 box.left(), box.bottom(), box.right(), box.top());
590 
591       int unk_right_margin = tright().x();
592       TabVector* unk_right_line = NULL;
593       bool unk_edge_is_left = false;
594       ColPartition* unk_partition = StartPartition(BRT_TEXT, bleft().x(), blob,
595                                                    &unk_right_line,
596                                                    &unk_right_margin,
597                                                    &unk_edge_is_left);
598       CompletePartition(false, tright().x(), unk_right_line,
599                         &unk_right_margin, &unk_partition, &unk_part_it);
600     }
601   }
602 }
603 
604 // Helper function for PartitionsAtGridY, with a long argument list.
605 // This bbox is of unknown type, so it is added to an unk_partition.
606 // If the edge is past the unk_right_margin then unk_partition has to be
607 // completed and a new one made. See CompletePartition and StartPartition
608 // for the other args.
ProcessUnknownBlob(int page_edge,BLOBNBOX * bbox,ColPartition ** unk_partition,ColPartition_IT * unk_part_it,TabVector ** unk_right_line,int * unk_right_margin,int * unk_prev_margin,bool * unk_edge_is_left)609 void ColumnFinder::ProcessUnknownBlob(int page_edge,
610                                       BLOBNBOX* bbox,
611                                       ColPartition** unk_partition,
612                                       ColPartition_IT* unk_part_it,
613                                       TabVector** unk_right_line,
614                                       int* unk_right_margin,
615                                       int* unk_prev_margin,
616                                       bool* unk_edge_is_left) {
617   if (*unk_partition != NULL) {
618     const TBOX& box = bbox->bounding_box();
619     int edge = *unk_edge_is_left ? box.left() : box.right();
620     if (edge >= *unk_right_margin)
621       *unk_prev_margin = CompletePartition(false, page_edge,
622                                            *unk_right_line, unk_right_margin,
623                                            unk_partition, unk_part_it);
624   }
625   if (*unk_partition == NULL) {
626     *unk_partition = StartPartition(BRT_TEXT, *unk_prev_margin + 1, bbox,
627                                     unk_right_line,
628                                     unk_right_margin,
629                                     unk_edge_is_left);
630   } else {
631     (*unk_partition)->AddBox(bbox);
632   }
633 }
634 
635 // Creates and returns a new ColPartition of the given start_type
636 // and adds the given bbox to it.
637 // Also finds the left and right tabvectors that bound the textline, setting
638 // the members of the returned ColPartition appropriately:
639 // If the left tabvector is less constraining than the input left_margin
640 // (assumed to be the right edge of the previous partition), then the
641 // tabvector is ignored and the left_margin used instead.
642 // If the right tabvector is more constraining than the input *right_margin,
643 // (probably the right edge of the page), then the *right_margin is adjusted
644 // to use the tabvector.
645 // *edge_is_left is set to true if the right tabvector is good and used as the
646 // margin, so we can include blobs that overhang the tabvector in this
647 // partition.
StartPartition(BlobRegionType start_type,int left_margin,BLOBNBOX * bbox,TabVector ** right_line,int * right_margin,bool * edge_is_left)648 ColPartition* ColumnFinder::StartPartition(BlobRegionType start_type,
649                                            int left_margin, BLOBNBOX* bbox,
650                                            TabVector** right_line,
651                                            int* right_margin,
652                                            bool* edge_is_left) {
653   ColPartition* partition = new ColPartition(start_type, vertical_skew_);
654   partition->AddBox(bbox);
655   // Find the tabs that bound it.
656   TBOX box = bbox->bounding_box();
657   int mid_y = (box.bottom() + box.top()) / 2;
658   TabVector* left_line = LeftTabForBox(box, true, false);
659   // If the overlapping line is not a left tab, try for non-overlapping.
660   if (left_line != NULL && !left_line->IsLeftTab())
661     left_line = LeftTabForBox(box, false, false);
662   if (left_line != NULL) {
663     int left_x = left_line->XAtY(mid_y);
664     left_x += left_line->IsLeftTab() ? -kColumnWidthFactor : 1;
665     // If the left line is not a genuine left or is less constraining
666     // than the previous blob, then don't store it in the partition.
667     if (left_x < left_margin || !left_line->IsLeftTab())
668       left_line = NULL;
669     if (left_x > left_margin)
670       left_margin = left_x;
671     if (WithinTestRegion(2, box.left(), box.bottom()))
672       tprintf("Left x =%d, left margin = %d\n",
673               left_x, left_margin);
674   }
675   partition->set_left_margin(left_margin);
676   *right_line = RightTabForBox(box, true, false);
677   // If the overlapping line is not a right tab, try for non-overlapping.
678   if (*right_line != NULL && !(*right_line)->IsRightTab())
679     *right_line = RightTabForBox(box, false, false);
680   *edge_is_left = false;
681   if (*right_line != NULL) {
682     int right_x = (*right_line)->XAtY(box.bottom());
683     if (right_x < *right_margin) {
684       *right_margin = right_x;
685       if ((*right_line)->IsRightTab())
686         *edge_is_left = true;
687     }
688     if (WithinTestRegion(3, box.left(), box.bottom()))
689       tprintf("Right x =%d, right_max = %d\n",
690               right_x, *right_margin);
691   }
692   partition->set_right_margin(*right_margin);
693   partition->ComputeLimits();
694   partition->SetLeftTab(left_line);
695   partition->SetRightTab(*right_line);
696   return partition;
697 }
698 
699 // Completes the given partition, and adds it to the given iterator.
700 // The right_margin on input is the left edge of the next blob if there is
701 // one. The right tab vector plus a margin is used as the right margin if
702 // it is more constraining than the next blob, but if there are no more
703 // blobs, we want the right margin to make it to the page edge.
704 // The return value is the next left margin, being the right edge of the
705 // bounding box of blobs.
CompletePartition(bool no_more_blobs,int page_edge,TabVector * right_line,int * right_margin,ColPartition ** partition,ColPartition_IT * part_it)706 int ColumnFinder::CompletePartition(bool no_more_blobs,
707                                     int page_edge,
708                                     TabVector* right_line,
709                                     int* right_margin,
710                                     ColPartition** partition,
711                                     ColPartition_IT* part_it) {
712   ASSERT_HOST(partition !=NULL && *partition != NULL);
713   // If we have a right line, it is possible that its edge is more
714   // constraining than the next blob.
715   if (right_line != NULL && right_line->IsRightTab()) {
716     int mid_y = (*partition)->MidY();
717     int right_x = right_line->XAtY(mid_y) + kColumnWidthFactor;
718     if (right_x < *right_margin)
719       *right_margin = right_x;
720     else if (no_more_blobs)
721       *right_margin = MAX(right_x, page_edge);
722     else if (right_line->XAtY(mid_y) > *right_margin)
723       right_line = NULL;
724   } else {
725     right_line = NULL;
726   }
727   // Now we can complete the partition and add it to the list.
728   (*partition)->set_right_margin(*right_margin);
729   (*partition)->ComputeLimits();
730   (*partition)->SetRightTab(right_line);
731   (*partition)->SetColumnGoodness(WidthCB());
732   part_it->add_after_then_move(*partition);
733   // Setup ready to start the next one.
734   *right_margin = page_edge;
735   int next_left_margin = (*partition)->bounding_box().right();
736   *partition = NULL;
737   return next_left_margin;
738 }
739 
740 // Makes an ordered list of candidates to partition the width of the page
741 // into columns using the part_sets_.
742 // See AddToColumnSetsIfUnique for the ordering.
743 // If single_column, then it just makes a single page-wide fake column.
MakeColumnCandidates(bool single_column)744 void ColumnFinder::MakeColumnCandidates(bool single_column) {
745   if (!single_column) {
746     // Try using only the good parts first.
747     bool good_only = true;
748     do {
749       for (int i = 0; i < gridheight_; ++i) {
750         ColPartitionSet* line_set = part_sets_.get(i);
751         if (line_set != NULL && line_set->LegalColumnCandidate()) {
752           ColPartitionSet* column_candidate = line_set->Copy(good_only);
753           if (column_candidate != NULL)
754             column_candidate->AddToColumnSetsIfUnique(&column_sets_, WidthCB());
755         }
756       }
757       good_only = !good_only;
758     } while (column_sets_.empty() && !good_only);
759   }
760   if (column_sets_.empty()) {
761     // The page contains only image or is single column.
762     // Make a fake page-wide column.
763     ColPartition* fake_part = new ColPartition(BRT_TEXT, vertical_skew_);
764     fake_part->set_left_margin(bleft_.x());
765     fake_part->set_right_margin(tright_.x());
766     fake_part->ComputeLimits();
767     fake_part->SetColumnGoodness(WidthCB());
768     ColPartitionSet* column_candidate = new ColPartitionSet(fake_part);
769     column_candidate->AddToColumnSetsIfUnique(&column_sets_, WidthCB());
770   }
771 }
772 
773 // Attempt to improve the column_candidates by expanding the columns
774 // and adding new partitions from the partition sets in src_sets.
775 // Src_sets may be equal to column_candidates, in which case it will
776 // use them as a source to improve themselves.
ImproveColumnCandidates(PartSetVector * src_sets,PartSetVector * column_sets)777 void ColumnFinder::ImproveColumnCandidates(PartSetVector* src_sets,
778                                            PartSetVector* column_sets) {
779   PartSetVector temp_cols;
780   temp_cols.move(column_sets);
781   if (src_sets == column_sets)
782     src_sets = &temp_cols;
783   int set_size = temp_cols.size();
784   // Try using only the good parts first.
785   bool good_only = true;
786   do {
787     for (int i = 0; i < set_size; ++i) {
788       ColPartitionSet* column_candidate = temp_cols.get(i);
789       ASSERT_HOST(column_candidate != NULL);
790       ColPartitionSet* improved = column_candidate->Copy(good_only);
791       if (improved != NULL) {
792         improved->ImproveColumnCandidate(WidthCB(), src_sets);
793         improved->AddToColumnSetsIfUnique(column_sets, WidthCB());
794       }
795     }
796     good_only = !good_only;
797   } while (column_sets->empty() && !good_only);
798   if (column_sets->empty())
799     column_sets->move(&temp_cols);
800   else
801     temp_cols.delete_data_pointers();
802 }
803 
804 // Prints debug information on the column candidates.
PrintColumnCandidates(const char * title)805 void ColumnFinder::PrintColumnCandidates(const char* title) {
806   int set_size =  column_sets_.size();
807   tprintf("Found %d %s:\n", set_size, title);
808   if (textord_debug_tabfind >= 3) {
809     for (int i = 0; i < set_size; ++i) {
810       ColPartitionSet* column_set = column_sets_.get(i);
811       column_set->Print();
812     }
813   }
814 }
815 
816 // Finds the optimal set of columns that cover the entire image with as
817 // few changes in column partition as possible.
818 // NOTE: this could be thought of as an optimization problem, but a simple
819 // greedy algorithm is used instead. The algorithm repeatedly finds the modal
820 // compatible column in an unassigned region and uses that with the extra
821 // tweak of extending the modal region over small breaks in compatibility.
AssignColumns()822 void ColumnFinder::AssignColumns() {
823   int set_count = part_sets_.size();
824   ASSERT_HOST(set_count == gridheight());
825   // Allocate and init the best_columns_.
826   best_columns_ = new ColPartitionSet*[set_count];
827   for (int y = 0; y < set_count; ++y)
828     best_columns_[y] = NULL;
829   int column_count = column_sets_.size();
830   // possible_column_sets[part_sets_ index][column_sets_ index] is
831   // true if the partition set is compatible with the column set.
832   // assigned_column_sets[part_sets_ index][column_sets_ index] is true
833   // if the partition set has been assigned the column. (Multiple bits
834   // true is possible.)
835   // any_columns_possible[part_sets_ index] is true if any of
836   // possible_column_sets[part_sets_ index][*] is true.
837   // On return the best_columns_ member is set.
838   bool* any_columns_possible = new bool[set_count];
839   bool** possible_column_sets = new bool*[set_count];
840   bool** assigned_column_sets = new bool*[set_count];
841   // Set possible column_sets to indicate whether each set is compatible
842   // with each column.
843   for (int part_i = 0; part_i < set_count; ++part_i) {
844     ColPartitionSet* line_set = part_sets_.get(part_i);
845     bool debug = line_set != NULL &&
846                  WithinTestRegion(2, line_set->bounding_box().left(),
847                                   line_set->bounding_box().bottom());
848     possible_column_sets[part_i] = new bool[column_count];
849     assigned_column_sets[part_i] = new bool[column_count];
850     any_columns_possible[part_i] = false;
851     for (int col_i = 0; col_i < column_count; ++col_i) {
852       assigned_column_sets[part_i][col_i] = false;
853       if (line_set != NULL &&
854           column_sets_.get(col_i)->CompatibleColumns(debug, line_set,
855                                                      WidthCB())) {
856         possible_column_sets[part_i][col_i] = true;
857         any_columns_possible[part_i] = true;
858       } else {
859         possible_column_sets[part_i][col_i] = false;
860       }
861     }
862   }
863   // Assign a column set to each vertical grid position.
864   // While there is an unassigned range, find its mode.
865   int start, end;
866   while (BiggestUnassignedRange(any_columns_possible, &start, &end)) {
867     if (textord_debug_tabfind >= 2)
868       tprintf("Biggest unassigned range = %d- %d\n", start, end);
869     // Find the modal column_set_id in the range.
870     int column_set_id = RangeModalColumnSet(possible_column_sets, start, end);
871     if (textord_debug_tabfind >= 2) {
872       tprintf("Range modal column id = %d\n", column_set_id);
873       column_sets_.get(column_set_id)->Print();
874     }
875     // Now find the longest run of the column_set_id in the range.
876     ShrinkRangeToLongestRun(possible_column_sets, any_columns_possible,
877                             column_set_id, &start, &end);
878     if (textord_debug_tabfind >= 2)
879       tprintf("Shrunk range = %d- %d\n", start, end);
880     // Extend the start and end past the longest run, while there are
881     // only small gaps in compatibility that can be overcome by larger
882     // regions of compatibility beyond.
883     ExtendRangePastSmallGaps(possible_column_sets, any_columns_possible,
884                              column_set_id, -1, -1, &start);
885     --end;
886     ExtendRangePastSmallGaps(possible_column_sets, any_columns_possible,
887                              column_set_id, 1, set_count, &end);
888     ++end;
889     if (textord_debug_tabfind)
890       tprintf("Column id %d applies to range = %d - %d\n",
891               column_set_id, start, end);
892     // Assign the column to the range, which now may overlap with other ranges.
893     AssignColumnToRange(column_set_id, start, end,
894                         assigned_column_sets);
895   }
896   // If anything remains unassigned, the whole lot is unassigned, so
897   // arbitrarily assign id 0.
898   if (best_columns_[0] == NULL) {
899     AssignColumnToRange(0, 0, gridheight_, assigned_column_sets);
900   }
901   // Free memory.
902   for (int i = 0; i < set_count; ++i) {
903     delete [] possible_column_sets[i];
904     delete [] assigned_column_sets[i];
905   }
906   delete [] any_columns_possible;
907   delete [] possible_column_sets;
908   delete [] assigned_column_sets;
909   // TODO(rays) Now resolve overlapping assignments.
910 }
911 
912 // Finds the biggest range in part_sets_ that has no assigned column, but
913 // column assignment is possible.
BiggestUnassignedRange(const bool * any_columns_possible,int * best_start,int * best_end)914 bool ColumnFinder::BiggestUnassignedRange(const bool* any_columns_possible,
915                                           int* best_start, int* best_end) {
916   int set_count = part_sets_.size();
917   int best_range_size = 0;
918   *best_start = set_count;
919   *best_end = set_count;
920   int end = set_count;
921   for (int start = 0; start < gridheight_; start = end) {
922     // Find the first unassigned index in start.
923     while (start < set_count) {
924       if (best_columns_[start] == NULL && any_columns_possible[start])
925         break;
926       ++start;
927     }
928     // Find the first past the end and count the good ones in between.
929     int range_size = 1;  // Number of non-null, but unassigned line sets.
930     end = start + 1;
931     while (end < set_count) {
932       if (best_columns_[end] != NULL)
933         break;
934       if (any_columns_possible[end])
935         ++range_size;
936       ++end;
937     }
938     if (start < set_count && range_size > best_range_size) {
939       best_range_size = range_size;
940       *best_start = start;
941       *best_end = end;
942     }
943   }
944   return *best_start < *best_end;
945 }
946 
947 // Finds the modal compatible column_set_ index within the given range.
RangeModalColumnSet(bool ** possible_column_sets,int start,int end)948 int ColumnFinder::RangeModalColumnSet(bool** possible_column_sets,
949                                       int start, int end) {
950   int column_count = column_sets_.size();
951   STATS column_stats(0, column_count);
952   for (int part_i = start; part_i < end; ++part_i) {
953     for (int col_j = 0; col_j < column_count; ++col_j) {
954       if (possible_column_sets[part_i][col_j])
955         column_stats.add(col_j, 1);
956     }
957   }
958   ASSERT_HOST(column_stats.get_total() > 0);
959   return column_stats.mode();
960 }
961 
962 // Given that there are many column_set_id compatible columns in the range,
963 // shrinks the range to the longest contiguous run of compatibility, allowing
964 // gaps where no columns are possible, but not where competing columns are
965 // possible.
ShrinkRangeToLongestRun(bool ** possible_column_sets,const bool * any_columns_possible,int column_set_id,int * best_start,int * best_end)966 void ColumnFinder::ShrinkRangeToLongestRun(bool** possible_column_sets,
967                                            const bool* any_columns_possible,
968                                            int column_set_id,
969                                            int* best_start, int* best_end) {
970   // orig_start and orig_end are the maximum range we will look at.
971   int orig_start = *best_start;
972   int orig_end = *best_end;
973   int best_range_size = 0;
974   *best_start = orig_end;
975   *best_end = orig_end;
976   int end = orig_end;
977   for (int start = orig_start; start < orig_end; start = end) {
978     // Find the first possible
979     while (start < orig_end) {
980       if (possible_column_sets[start][column_set_id] ||
981           !any_columns_possible[start])
982         break;
983       ++start;
984     }
985     // Find the first past the end.
986     end = start + 1;
987     while (end < orig_end) {
988       if (!possible_column_sets[end][column_set_id] &&
989           any_columns_possible[end])
990           break;
991       ++end;
992     }
993     if (start < orig_end && end - start > best_range_size) {
994       best_range_size = end - start;
995       *best_start = start;
996       *best_end = end;
997     }
998   }
999 }
1000 
1001 // Moves start in the direction of step, upto, but not including end while
1002 // the only incompatible regions are no more than kMaxIncompatibleColumnCount
1003 // in size, and the compatible regions beyond are bigger.
ExtendRangePastSmallGaps(bool ** possible_column_sets,const bool * any_columns_possible,int column_set_id,int step,int end,int * start)1004 void ColumnFinder::ExtendRangePastSmallGaps(bool** possible_column_sets,
1005                                             const bool* any_columns_possible,
1006                                             int column_set_id,
1007                                             int step, int end, int* start) {
1008   if (textord_debug_tabfind > 2)
1009     tprintf("Starting expansion at %d, step=%d, limit=%d\n",
1010             *start, step, end);
1011   if (*start == end)
1012     return;  // Cannot be expanded.
1013 
1014   int barrier_size = 0;
1015   int good_size = 0;
1016   do {
1017     // Find the size of the incompatible barrier.
1018     barrier_size = 0;
1019     int i;
1020     for (i = *start + step; i != end; i += step) {
1021       if (possible_column_sets[i][column_set_id])
1022         break;  // We are back on.
1023       // Locations where none are possible don't count.
1024       if (any_columns_possible[i])
1025         ++barrier_size;
1026     }
1027     if (textord_debug_tabfind > 2)
1028       tprintf("At %d, Barrier size=%d\n", i, barrier_size);
1029     if (barrier_size > kMaxIncompatibleColumnCount)
1030       return;  // Barrier too big.
1031     if (i == end) {
1032       // We can't go any further, but the barrier was small, so go to the end.
1033       *start = i - step;
1034       return;
1035     }
1036     // Now find the size of the good region on the other side.
1037     good_size = 1;
1038     for (i += step; i != end; i += step) {
1039       if (possible_column_sets[i][column_set_id])
1040         ++good_size;
1041       else if (any_columns_possible[i])
1042         break;
1043     }
1044     if (textord_debug_tabfind > 2)
1045       tprintf("At %d, good size = %d\n", i, good_size);
1046     // If we had enough good ones we can extend the start and keep looking.
1047     if (good_size > barrier_size)
1048       *start = i - step;
1049   } while (good_size > barrier_size);
1050 }
1051 
1052 // Assigns the given column_set_id to the given range.
AssignColumnToRange(int column_set_id,int start,int end,bool ** assigned_column_sets)1053 void ColumnFinder::AssignColumnToRange(int column_set_id, int start, int end,
1054                                        bool** assigned_column_sets) {
1055   ColPartitionSet* column_set = column_sets_.get(column_set_id);
1056   for (int i = start; i < end; ++i) {
1057     assigned_column_sets[i][column_set_id] = true;
1058     best_columns_[i] = column_set;
1059   }
1060 }
1061 
1062 // Computes the mean_column_gap_.
ComputeMeanColumnGap()1063 void ColumnFinder::ComputeMeanColumnGap() {
1064   int total_gap = 0;
1065   int total_width = 0;
1066   int gap_samples = 0;
1067   int width_samples = 0;
1068   for (int i = 0; i < gridheight_; ++i) {
1069     ASSERT_HOST(best_columns_[i] != NULL);
1070     best_columns_[i]->AccumulateColumnWidthsAndGaps(&total_width,
1071                                                     &width_samples,
1072                                                     &total_gap,
1073                                                     &gap_samples);
1074   }
1075   mean_column_gap_ = gap_samples > 0 ? total_gap / gap_samples
1076                                      : total_width / width_samples;
1077 }
1078 
1079 //////// Functions that manipulate ColPartitions in the part_grid_ /////
1080 //////// to split, merge, find margins, and find types.  //////////////
1081 
1082 // Removes the ColPartitions from part_sets_, the ColPartitionSets that
1083 // contain them, and puts them in the part_grid_ after ensuring that no
1084 // BLOBNBOX is owned by more than one of them.
MovePartitionsToGrid()1085 void ColumnFinder::MovePartitionsToGrid() {
1086   // Remove the parts from the part_sets_ and put them in the parts list.
1087   part_grid_.Init(gridsize(), bleft(), tright());
1088   ColPartition_LIST parts;
1089   for (int i = 0; i < gridheight_; ++i) {
1090     ColPartitionSet* line_set = part_sets_.get(i);
1091     if (line_set != NULL) {
1092       line_set->ReturnParts(&parts);
1093       delete line_set;
1094       part_sets_.set(NULL, i);
1095     }
1096   }
1097   // Make each part claim ownership of its own boxes uniquely.
1098   ColPartition_IT it(&parts);
1099   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1100     ColPartition* part = it.data();
1101     part->ClaimBoxes(WidthCB());
1102   }
1103   // Unknowns must be uniqued too.
1104   it.set_to_list(&unknown_parts_);
1105   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1106     ColPartition* part = it.data();
1107     part->ClaimBoxes(WidthCB());
1108   }
1109   // Put non-empty parts into the grid and delete empty ones.
1110   for (it.set_to_list(&parts); !it.empty(); it.forward()) {
1111     ColPartition* part = it.extract();
1112     if (part->IsEmpty())
1113       delete part;
1114     else
1115       part_grid_.InsertBBox(true, true, part);
1116   }
1117 }
1118 
1119 // Splits partitions that cross columns where they have nothing in the gap.
GridSplitPartitions()1120 void ColumnFinder::GridSplitPartitions() {
1121   // Iterate the ColPartitions in the grid.
1122   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1123     gsearch(&part_grid_);
1124   gsearch.StartFullSearch();
1125   ColPartition* dont_repeat = NULL;
1126   ColPartition* part;
1127   while ((part = gsearch.NextFullSearch()) != NULL) {
1128     if (part->blob_type() < BRT_UNKNOWN || part == dont_repeat)
1129       continue;  // Only applies to text partitions.
1130     ColPartitionSet* column_set = best_columns_[gsearch.GridY()];
1131     int first_col = -1;
1132     int last_col = -1;
1133     // Find which columns the partition spans.
1134     part->ColumnRange(column_set, &first_col, &last_col);
1135     if (first_col > 0)
1136       --first_col;
1137     // Convert output column indices to physical column indices.
1138     first_col /= 2;
1139     last_col /= 2;
1140     // We will only consider cases where a partition spans two columns,
1141     // since a heading that spans more columns than that is most likely
1142     // genuine.
1143     if (last_col != first_col + 1)
1144       continue;
1145     if (textord_debug_tabfind) {
1146       tprintf("Considering partition for GridSplit:");
1147       part->Print();
1148     }
1149     // Set up a rectangle search x-bounded by the column gap and y by the part.
1150     int y = part->MidY();
1151     TBOX margin_box = part->bounding_box();
1152     ColPartition* column = column_set->GetColumnByIndex(first_col);
1153     if (column == NULL)
1154       continue;
1155     margin_box.set_left(column->RightAtY(y) + 2);
1156     column = column_set->GetColumnByIndex(last_col);
1157     if (column == NULL)
1158       continue;
1159     margin_box.set_right(column->LeftAtY(y) - 2);
1160     // TODO(rays) Decide whether to keep rectangular filling or not in the
1161     // main grid and therefore whether we need a fancier search here.
1162     // Now run the rect search on the main blob grid.
1163     GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> rectsearch(this);
1164     if (textord_debug_tabfind) {
1165       tprintf("Searching box (%d,%d)->(%d,%d)\n",
1166               margin_box.left(), margin_box.bottom(),
1167               margin_box.right(), margin_box.top());
1168       part->Print();
1169     }
1170     rectsearch.StartRectSearch(margin_box);
1171     BLOBNBOX* bbox;
1172     while ((bbox = rectsearch.NextRectSearch()) != NULL) {
1173       if (bbox->bounding_box().overlap(margin_box))
1174         break;
1175     }
1176     if (bbox == NULL) {
1177       // There seems to be nothing in the hole, so split the partition.
1178       gsearch.RemoveBBox();
1179       int x_middle = (margin_box.left() + margin_box.right()) / 2;
1180       if (textord_debug_tabfind) {
1181         tprintf("Splitting part at %d:", x_middle);
1182         part->Print();
1183       }
1184       ColPartition* split_part = part->SplitAt(x_middle);
1185       if (split_part != NULL) {
1186         if (textord_debug_tabfind) {
1187           tprintf("Split result:");
1188           part->Print();
1189           split_part->Print();
1190         }
1191         part_grid_.InsertBBox(true, true, split_part);
1192       } else {
1193         // Split had no effect
1194         if (textord_debug_tabfind)
1195           tprintf("Split had no effect\n");
1196         dont_repeat = part;
1197       }
1198       part_grid_.InsertBBox(true, true, part);
1199       gsearch.RepositionIterator();
1200     } else if (textord_debug_tabfind) {
1201       tprintf("Part cannot be split: blob (%d,%d)->(%d,%d) in column gap\n",
1202               bbox->bounding_box().left(), bbox->bounding_box().bottom(),
1203               bbox->bounding_box().right(), bbox->bounding_box().top());
1204     }
1205   }
1206 }
1207 
1208 // Merges partitions where there is vertical overlap, within a single column,
1209 // and the horizontal gap is small enough.
GridMergePartitions()1210 void ColumnFinder::GridMergePartitions() {
1211   // Iterate the ColPartitions in the grid.
1212   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1213     gsearch(&part_grid_);
1214   gsearch.StartFullSearch();
1215   ColPartition* part;
1216   while ((part = gsearch.NextFullSearch()) != NULL) {
1217     // Set up a rectangle search x-bounded by the column and y by the part.
1218     ColPartitionSet* columns = best_columns_[gsearch.GridY()];
1219     TBOX box = part->bounding_box();
1220     int y = part->MidY();
1221     ColPartition* left_column = columns->ColumnContaining(box.left(), y);
1222     ColPartition* right_column = columns->ColumnContaining(box.right(), y);
1223     if (left_column == NULL || right_column != left_column)
1224       continue;
1225     box.set_left(left_column->LeftAtY(y));
1226     box.set_right(right_column->RightAtY(y));
1227     // Now run the rect search.
1228     bool modified_box = false;
1229     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1230       rsearch(&part_grid_);
1231     rsearch.StartRectSearch(box);
1232     ColPartition* neighbour;
1233 
1234     while ((neighbour = rsearch.NextRectSearch()) != NULL) {
1235       if (neighbour == part)
1236         continue;
1237       const TBOX& neighbour_box = neighbour->bounding_box();
1238       if (neighbour_box.right() < box.left() ||
1239           neighbour_box.left() > box.right())
1240         continue;  // Not within the same column.
1241       if (part->VOverlaps(*neighbour) && part->TypesMatch(*neighbour)) {
1242         // There is vertical overlap and the gross types match, but only
1243         // merge if the horizontal gap is small enough, as one of the
1244         // partitions may be a figure caption within a column.
1245         // If there is only one column, then the mean_column_gap_ is large
1246         // enough to allow almost any merge, by being the mean column width.
1247         const TBOX& part_box = part->bounding_box();
1248         int h_gap = MAX(part_box.left(), neighbour_box.left()) -
1249                     MIN(part_box.right(), neighbour_box.right());
1250         if (h_gap < mean_column_gap_ * kHorizontalGapMergeFraction ||
1251             part_box.width() < mean_column_gap_ ||
1252             neighbour_box.width() < mean_column_gap_) {
1253           if (textord_debug_tabfind) {
1254             tprintf("Running grid-based merge between:\n");
1255             part->Print();
1256             neighbour->Print();
1257           }
1258           rsearch.RemoveBBox();
1259           gsearch.RepositionIterator();
1260           part->Absorb(neighbour, WidthCB());
1261           modified_box = true;
1262         }
1263       }
1264     }
1265     if (modified_box) {
1266       // We modified the box of part, so re-insert it into the grid.
1267       // This does no harm in the current cell, as it already exists there,
1268       // but it needs to exist in all the cells covered by its bounding box,
1269       // or it will never be found by a full search.
1270       // Because the box has changed, it has to be removed first, otherwise
1271       // add_sorted may fail to keep a single copy of the pointer.
1272       gsearch.RemoveBBox();
1273       part_grid_.InsertBBox(true, true, part);
1274       gsearch.RepositionIterator();
1275     }
1276   }
1277 }
1278 
1279 // Helper function to compute the total pairwise area overlap of a list of
1280 // Colpartitions. If box_this matches an element in the list, the test_box
1281 // is used in place of its box.
TotalPairwiseOverlap(const ColPartition * box_this,const TBOX & test_box,ColPartition_CLIST * parts)1282 static int TotalPairwiseOverlap(const ColPartition* box_this,
1283                                 const TBOX& test_box,
1284                                 ColPartition_CLIST* parts) {
1285   if (parts->singleton())
1286     return 0;
1287   int total_area = 0;
1288   for (ColPartition_C_IT it(parts); !it.at_last(); it.forward()) {
1289     ColPartition* part = it.data();
1290     TBOX part_box = part == box_this ? test_box : part->bounding_box();
1291     ColPartition_C_IT it2(it);
1292     for (it2.forward(); !it2.at_first(); it2.forward()) {
1293       ColPartition* part2 = it2.data();
1294       TBOX part_box2 = part2 == box_this ? test_box : part2->bounding_box();
1295       total_area += part_box.intersection(part_box2).area();
1296     }
1297   }
1298   return total_area;
1299 }
1300 
1301 // Helper function to compute the total area of a list of Colpartitions.
1302 // If box_this matches an element in the list, the test_box
1303 // is used in place of its box.
TotalArea(const ColPartition * box_this,const TBOX & test_box,ColPartition_CLIST * parts)1304 static int TotalArea(const ColPartition* box_this, const TBOX& test_box,
1305                      ColPartition_CLIST* parts) {
1306   int total_area = 0;
1307   ColPartition_C_IT it(parts);
1308   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1309     ColPartition* part = it.data();
1310     TBOX part_box = part == box_this ? test_box : part->bounding_box();
1311     total_area += part_box.area();
1312   }
1313   return total_area;
1314 }
1315 
1316 // Resolves unknown partitions from the unknown_parts_ list by merging them
1317 // with a close neighbour, inserting them into the grid with a known type,
1318 // or declaring them to be noise.
GridInsertUnknowns()1319 void ColumnFinder::GridInsertUnknowns() {
1320   ColPartition_IT noise_it(&noise_parts_);
1321   for (ColPartition_IT it(&unknown_parts_); !it.empty(); it.forward()) {
1322     ColPartition* part = it.extract();
1323     if (part->IsEmpty()) {
1324       // Claiming ownership left this one empty.
1325       delete part;
1326       continue;
1327     }
1328     const TBOX& part_box = part->bounding_box();
1329     int left_limit = LeftEdgeForBox(part_box, false, false);
1330     int right_limit = RightEdgeForBox(part_box, false, false);
1331     TBOX search_box = part_box;
1332     int y = part->MidY();
1333     int grid_x, grid_y;
1334     GridCoords(part_box.left(), y, &grid_x, &grid_y);
1335     // Set up a rectangle search x-bounded by the column and y by the part.
1336     ColPartitionSet* columns = best_columns_[grid_y];
1337     int first_col = -1;
1338     int last_col = -1;
1339     // Find which columns the partition spans.
1340     part->ColumnRange(columns, &first_col, &last_col);
1341     // Convert output column indices to physical column indices.
1342     // Twiddle with first and last_col to get the desired effect with
1343     // in-between columns:
1344     // As returned by ColumnRange, the indices are even for in-betweens and
1345     // odd for real columns (being 2n+1 the real column index).
1346     // Subtract 1 from first col, so we can use left edge of first_col/2 if it
1347     // is even, and the right edge of first_col/2 if it is odd.
1348     // With last_col unchanged, we can use the right edge of last_col/2 if it
1349     // is odd and the left edge of last_col/2 if it is even.
1350     // with first_col, we have to special-case to pretend that the first
1351     // in-between is actually the first column, and with last_col, we have to
1352     // pretend that the last in-between is actually the last column.
1353     if (first_col > 0)
1354       --first_col;
1355     ColPartition* first_column = columns->GetColumnByIndex(first_col / 2);
1356     ColPartition* last_column = columns->GetColumnByIndex(last_col / 2);
1357     if (last_column == NULL && last_col > first_col + 1)
1358       last_column = columns->GetColumnByIndex(last_col / 2 - 1);
1359     // Do not accept the case of both being in the first or last in-between.
1360     if (last_col > 0 && first_column != NULL && last_column != NULL) {
1361       search_box.set_left((first_col & 1) ? first_column->RightAtY(y)
1362                                           : first_column->LeftAtY(y));
1363       search_box.set_right((last_col & 1) ? last_column->RightAtY(y)
1364                                           : last_column->LeftAtY(y));
1365       // Expand the search vertically.
1366       int height = search_box.height();
1367       search_box.set_top(search_box.top() + height);
1368       search_box.set_bottom(search_box.bottom() - height);
1369       // Keep valid neighbours in a list.
1370       ColPartition_CLIST neighbours;
1371       // Now run the rect search.
1372       GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1373         rsearch(&part_grid_);
1374       rsearch.StartRectSearch(search_box);
1375       ColPartition* neighbour;
1376       while ((neighbour = rsearch.NextRectSearch()) != NULL) {
1377         const TBOX& n_box = neighbour->bounding_box();
1378         if (n_box.left() > right_limit || n_box.right() < left_limit)
1379           continue;  // Other side of a tab vector.
1380         if (neighbour->blob_type() == BRT_RECTIMAGE) {
1381           continue;  // Rectangle images aren't allowed to acquire anything.
1382         }
1383         // We can't merge with a partition where it would go beyond the margin
1384         // of the partition.
1385         if ((part_box.left() < neighbour->left_margin() ||
1386             part_box.right() > neighbour->right_margin()) &&
1387             !n_box.contains(part_box)) {
1388           continue;  // This would create an overlap with another partition.
1389         }
1390         // Candidates must be within a reasonable vertical distance.
1391         int v_dist = -part->VOverlap(*neighbour);
1392         if (v_dist >= MAX(part_box.height(), n_box.height()) / 2)
1393           continue;
1394         // Unique elements as they arrive.
1395         neighbours.add_sorted(SortByBoxLeft<ColPartition>, true, neighbour);
1396       }
1397       // The best neighbour to merge with is the one that causes least
1398       // total pairwise overlap among all the candidates.
1399       // If more than one offers the same total overlap, choose the one
1400       // with the least total area.
1401       ColPartition* best_neighbour = NULL;
1402       ColPartition_C_IT n_it(&neighbours);
1403       if (neighbours.singleton()) {
1404         best_neighbour = n_it.data();
1405       } else if (!neighbours.empty()) {
1406         int best_overlap = MAX_INT32;
1407         int best_area = 0;
1408         for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
1409           neighbour = n_it.data();
1410           TBOX merged_box = neighbour->bounding_box();
1411           merged_box += part_box;
1412           int overlap = TotalPairwiseOverlap(neighbour, merged_box,
1413                                              &neighbours);
1414           if (best_neighbour == NULL || overlap < best_overlap) {
1415             best_neighbour = neighbour;
1416             best_overlap = overlap;
1417             best_area = TotalArea(neighbour, merged_box, &neighbours);
1418           } else if (overlap == best_overlap) {
1419             int area = TotalArea(neighbour, merged_box, &neighbours);
1420             if (area < best_area) {
1421               best_area = area;
1422               best_neighbour = neighbour;
1423             }
1424           }
1425         }
1426       }
1427       if (best_neighbour != NULL) {
1428         // It was close enough to an existing partition to merge it.
1429         if (textord_debug_tabfind) {
1430           tprintf("Merging unknown partition:\n");
1431           part->Print();
1432           best_neighbour->Print();
1433         }
1434         // Because the box is about to be resized, it must be removed and
1435         // then re-inserted to prevent duplicates in the grid lists.
1436         part_grid_.RemoveBBox(best_neighbour);
1437         best_neighbour->Absorb(part, WidthCB());
1438         // We modified the box of best_neighbour, so re-insert it into the grid.
1439         part_grid_.InsertBBox(true, true, best_neighbour);
1440       } else {
1441         // It was inside a column, so just add it to the grid.
1442         if (textord_debug_tabfind)
1443           tprintf("Inserting unknown partition:\n");
1444         part_grid_.InsertBBox(true, true, part);
1445       }
1446     } else {
1447       if (textord_debug_tabfind) {
1448         tprintf("Unknown partition at (%d,%d)->(%d,%d) not in any column\n",
1449                 part_box.left(), part_box.bottom(), part_box.right(),
1450                 part_box.top());
1451         tprintf("first_col = %d->%p, last_col=%d->%p\n",
1452                 first_col, first_column, last_col, last_column);
1453       }
1454       noise_it.add_to_end(part);
1455     }
1456   }
1457 }
1458 
1459 // Add horizontal line separators as partitions.
GridInsertHLinePartitions()1460 void ColumnFinder::GridInsertHLinePartitions() {
1461   TabVector_IT hline_it(&horizontal_lines_);
1462   for (hline_it.mark_cycle_pt(); !hline_it.cycled_list(); hline_it.forward()) {
1463     TabVector* hline = hline_it.data();
1464     int top = MAX(hline->startpt().y(), hline->endpt().y());
1465     int bottom = MIN(hline->startpt().y(), hline->endpt().y());
1466     if (top == bottom) {
1467       if (bottom > 0)
1468         --bottom;
1469       else
1470         ++top;
1471     }
1472     ColPartition* part = new ColPartition(vertical_skew_,
1473                                           hline->startpt().x(), bottom,
1474                                           hline->endpt().x(), top);
1475     part_grid_.InsertBBox(true, true, part);
1476   }
1477 }
1478 
1479 // Improves the margins of the ColPartitions in the grid by calling
1480 // FindPartitionMargins on each.
GridFindMargins()1481 void ColumnFinder::GridFindMargins() {
1482   // Iterate the ColPartitions in the grid.
1483   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1484     gsearch(&part_grid_);
1485   gsearch.StartFullSearch();
1486   ColPartition* part;
1487   while ((part = gsearch.NextFullSearch()) != NULL) {
1488     // Set up a rectangle search x-bounded by the column and y by the part.
1489     ColPartitionSet* columns = best_columns_[gsearch.GridY()];
1490     FindPartitionMargins(columns, part);
1491   }
1492 }
1493 
1494 // Improves the margins of the ColPartitions in the list by calling
1495 // FindPartitionMargins on each.
ListFindMargins(ColPartition_LIST * parts)1496 void ColumnFinder::ListFindMargins(ColPartition_LIST* parts) {
1497   ColPartition_IT part_it(parts);
1498   for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
1499     ColPartition* part = part_it.data();
1500     TBOX part_box = part->bounding_box();
1501     // Get the columns from the y grid coord.
1502     int grid_x, grid_y;
1503     GridCoords(part_box.left(), part_box.bottom(), &grid_x, &grid_y);
1504     ColPartitionSet* columns = best_columns_[grid_y];
1505     FindPartitionMargins(columns, part);
1506   }
1507 }
1508 
1509 // Improves the margins of the ColPartition by searching for
1510 // neighbours that vertically overlap significantly.
FindPartitionMargins(ColPartitionSet * columns,ColPartition * part)1511 void ColumnFinder::FindPartitionMargins(ColPartitionSet* columns,
1512                                         ColPartition* part) {
1513   // Set up a rectangle search x-bounded by the column and y by the part.
1514   ASSERT_HOST(columns != NULL);
1515   TBOX box = part->bounding_box();
1516   int y = part->MidY();
1517   // Initial left margin is based on the column, if there is one.
1518   ColPartition* column = columns->ColumnContaining(box.left(), y);
1519   int left_margin = column != NULL ? column->LeftAtY(y) : bleft_.x();
1520   left_margin -= kColumnWidthFactor;
1521   // Search for ColPartitions that reduce the margin.
1522   left_margin = FindMargin(box.left()+ box.height(), true, left_margin,
1523                            box.bottom(), box.top(), part);
1524   part->set_left_margin(left_margin);
1525   column = columns->ColumnContaining(box.right(), y);
1526   int right_margin = column != NULL ? column->RightAtY(y) : tright_.x();
1527   right_margin += kColumnWidthFactor;
1528   // Search for ColPartitions that reduce the margin.
1529   right_margin = FindMargin(box.right() - box.height(), false, right_margin,
1530                             box.bottom(), box.top(), part);
1531   part->set_right_margin(right_margin);
1532 }
1533 
1534 // Starting at x, and going in the specified direction, upto x_limit, finds
1535 // the margin for the given y range by searching sideways,
1536 // and ignoring not_this.
FindMargin(int x,bool right_to_left,int x_limit,int y_bottom,int y_top,const ColPartition * not_this)1537 int ColumnFinder::FindMargin(int x, bool right_to_left, int x_limit,
1538                              int y_bottom, int y_top,
1539                              const ColPartition* not_this) {
1540   int height = y_top - y_bottom;
1541   int target_overlap = static_cast<int>(height * kMarginOverlapFraction);
1542   // Iterate the ColPartitions in the grid.
1543   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1544     side_search(&part_grid_);
1545   side_search.StartSideSearch(x, y_bottom, y_top);
1546   ColPartition* part;
1547   while ((part = side_search.NextSideSearch(right_to_left)) != NULL) {
1548     // Ignore itself.
1549     if (part == not_this)
1550       continue;
1551     // Must overlap by enough.
1552     TBOX box = part->bounding_box();
1553     int y_overlap = MIN(y_top, box.top()) - MAX(y_bottom, box.bottom());
1554     if (y_overlap < target_overlap)
1555       continue;
1556     // Must be going the right way.
1557     int x_edge = right_to_left ? box.right() : box.left();
1558     if ((x_edge < x) != right_to_left)
1559       continue;
1560     // If we have gone past x_limit, then x_limit will do.
1561     if ((x_edge < x_limit) == right_to_left)
1562       break;
1563     // It reduces x limit, so save the new one.
1564     x_limit = x_edge;
1565   }
1566   return x_limit;
1567 }
1568 
1569 // For every ColPartition in the grid, sets its type based on position
1570 // in the columns.
SetPartitionTypes()1571 void ColumnFinder::SetPartitionTypes() {
1572   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1573     gsearch(&part_grid_);
1574   gsearch.StartFullSearch();
1575   ColPartition* part;
1576   while ((part = gsearch.NextFullSearch()) != NULL) {
1577     part->SetPartitionType(best_columns_[gsearch.GridY()]);
1578   }
1579 }
1580 
1581 //////// Functions that manipulate ColPartitions in the part_grid_ /////
1582 //////// to find chains of partner partitions of the same type.  ///////
1583 
1584 // For every ColPartition in the grid, finds its upper and lower neighbours.
FindPartitionPartners()1585 void ColumnFinder::FindPartitionPartners() {
1586   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1587     gsearch(&part_grid_);
1588   gsearch.StartFullSearch();
1589   ColPartition* part;
1590   while ((part = gsearch.NextFullSearch()) != NULL) {
1591     FindPartitionPartners(true, part);
1592     FindPartitionPartners(false, part);
1593   }
1594 }
1595 
1596 // Finds the best partner in the given direction for the given partition.
1597 // Stores the result with AddPartner.
FindPartitionPartners(bool upper,ColPartition * part)1598 void ColumnFinder::FindPartitionPartners(bool upper, ColPartition* part) {
1599   if (part->type() == PT_NOISE)
1600     return;  // Noise is not allowed to partner anything.
1601   const TBOX& box = part->bounding_box();
1602   int top = part->median_top();
1603   int bottom = part->median_bottom();
1604   int height = top - bottom;
1605   int mid_y = (bottom + top) / 2;
1606   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1607     vsearch(&part_grid_);
1608   // Search down for neighbour below
1609   vsearch.StartVerticalSearch(box.left(), box.right(), part->MidY());
1610   ColPartition* neighbour;
1611   ColPartition* best_neighbour = NULL;
1612   int best_dist = MAX_INT32;
1613   while ((neighbour = vsearch.NextVerticalSearch(!upper)) != NULL) {
1614     if (neighbour == part || neighbour->type() == PT_NOISE)
1615       continue;  // Noise is not allowed to partner anything.
1616     int neighbour_bottom = neighbour->median_bottom();
1617     int neighbour_top = neighbour->median_top();
1618     int neighbour_y = (neighbour_bottom + neighbour_top) / 2;
1619     if (upper != (neighbour_y > mid_y))
1620       continue;
1621     if (!part->HOverlaps(*neighbour) && !part->HCompatible(*neighbour))
1622       continue;
1623     if (!part->TypesMatch(*neighbour)) {
1624       if (best_neighbour == NULL)
1625         best_neighbour = neighbour;
1626       continue;
1627     }
1628     int dist = upper ? neighbour_bottom - top : bottom - neighbour_top;
1629     if (dist <= kMaxPartitionSpacing * height) {
1630       if (dist < best_dist) {
1631         best_dist = dist;
1632         best_neighbour = neighbour;
1633       }
1634     } else {
1635       break;
1636     }
1637   }
1638   if (best_neighbour != NULL)
1639     part->AddPartner(upper, best_neighbour);
1640 }
1641 
1642 // For every ColPartition with multiple partners in the grid, reduces the
1643 // number of partners to 0 or 1.
RefinePartitionPartners()1644 void ColumnFinder::RefinePartitionPartners() {
1645   // Refine in type order so that chasing multple partners can be done
1646   // before eliminating type mis-matching partners.
1647   for (int type = PT_UNKNOWN + 1; type <= PT_COUNT; type++) {
1648     // Iterate the ColPartitions in the grid.
1649     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1650       gsearch(&part_grid_);
1651     gsearch.StartFullSearch();
1652     ColPartition* part;
1653     while ((part = gsearch.NextFullSearch()) != NULL) {
1654       part->RefinePartners(static_cast<PolyBlockType>(type));
1655     }
1656   }
1657 }
1658 
1659 // Only images remain with multiple types in a run of partners.
1660 // Sets the type of all in the group to the maximum of the group.
SmoothPartnerRuns()1661 void ColumnFinder::SmoothPartnerRuns() {
1662   // Iterate the ColPartitions in the grid.
1663   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1664     gsearch(&part_grid_);
1665   gsearch.StartFullSearch();
1666   ColPartition* part;
1667   while ((part = gsearch.NextFullSearch()) != NULL) {
1668     ColPartition* partner = part->SingletonPartner(true);
1669     if (partner != NULL) {
1670       ASSERT_HOST(partner->SingletonPartner(false) == part);
1671     } else if (part->SingletonPartner(false) != NULL) {
1672       ColPartitionSet* column_set = best_columns_[gsearch.GridY()];
1673       int column_count = column_set->ColumnCount();
1674       part->SmoothPartnerRun(column_count * 2 + 1);
1675     }
1676   }
1677 }
1678 
1679 // Helper functions for TransformToBlocks.
1680 // Add the part to the temp list in the correct order.
AddToTempPartList(ColPartition * part,ColPartition_CLIST * temp_list)1681 void ColumnFinder::AddToTempPartList(ColPartition* part,
1682                                      ColPartition_CLIST* temp_list) {
1683   int mid_y = part->MidY();
1684   ColPartition_C_IT it(temp_list);
1685   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1686     ColPartition* test_part = it.data();
1687     if (part->type() == PT_NOISE || test_part->type() == PT_NOISE)
1688       continue;  // Noise stays in sequence.
1689     if (test_part == part->SingletonPartner(false))
1690       break;  // Insert before its lower partner.
1691     int neighbour_bottom = test_part->median_bottom();
1692     int neighbour_top = test_part->median_top();
1693     int neighbour_y = (neighbour_bottom + neighbour_top) / 2;
1694     if (neighbour_y < mid_y)
1695       break;  // part is above test_part so insert it.
1696     if (!part->HOverlaps(*test_part) && !part->HCompatible(*test_part))
1697       continue;  // Incompatibles stay in order
1698   }
1699   if (it.cycled_list()) {
1700     it.add_to_end(part);
1701   } else {
1702     it.add_before_stay_put(part);
1703   }
1704 }
1705 
1706 // Add everything from the temp list to the work_set assuming correct order.
EmptyTempPartList(ColPartition_CLIST * temp_list,WorkingPartSet_LIST * work_set)1707 void ColumnFinder::EmptyTempPartList(ColPartition_CLIST* temp_list,
1708                                      WorkingPartSet_LIST* work_set) {
1709   ColPartition_C_IT it(temp_list);
1710   while (!it.empty()) {
1711     it.extract()->AddToWorkingSet(bleft_, tright_, resolution_,
1712                           &good_parts_, work_set);
1713     it.forward();
1714   }
1715 }
1716 
1717 // Transform the grid of partitions to the output blocks.
TransformToBlocks(BLOCK_LIST * blocks,TO_BLOCK_LIST * to_blocks)1718 void ColumnFinder::TransformToBlocks(BLOCK_LIST* blocks,
1719                                      TO_BLOCK_LIST* to_blocks) {
1720   WorkingPartSet_LIST work_set;
1721   ColPartitionSet* column_set = NULL;
1722   ColPartition_IT noise_it(&noise_parts_);
1723   // The temp_part_list holds a list of parts at the same grid y coord
1724   // so they can be added in the correct order. This prevents thin objects
1725   // like horizontal lines going before the text lines above them.
1726   ColPartition_CLIST temp_part_list;
1727   // Iterate the ColPartitions in the grid. It starts at the top
1728   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1729     gsearch(&part_grid_);
1730   gsearch.StartFullSearch();
1731   int prev_grid_y = -1;
1732   ColPartition* part;
1733   while ((part = gsearch.NextFullSearch()) != NULL) {
1734     int grid_y = gsearch.GridY();
1735     if (grid_y != prev_grid_y) {
1736       EmptyTempPartList(&temp_part_list, &work_set);
1737       prev_grid_y = grid_y;
1738     }
1739     if (best_columns_[grid_y] != column_set) {
1740       column_set = best_columns_[grid_y];
1741       // Every line should have a non-null best column.
1742       ASSERT_HOST(column_set != NULL);
1743       column_set->ChangeWorkColumns(bleft_, tright_, resolution_,
1744                                     &good_parts_, &work_set);
1745       if (textord_debug_tabfind)
1746         tprintf("Changed column groups at grid index %d\n", gsearch.GridY());
1747     }
1748     if (part->type() == PT_NOISE) {
1749       noise_it.add_to_end(part);
1750     } else {
1751       AddToTempPartList(part, &temp_part_list);
1752     }
1753   }
1754   EmptyTempPartList(&temp_part_list, &work_set);
1755   // Now finish all working sets and transfer ColPartitionSets to block_sets.
1756   WorkingPartSet_IT work_it(&work_set);
1757   while (!work_it.empty()) {
1758     WorkingPartSet* working_set = work_it.extract();
1759     working_set->ExtractCompletedBlocks(bleft_, tright_, resolution_,
1760                                         &good_parts_, blocks, to_blocks);
1761     delete working_set;
1762     work_it.forward();
1763   }
1764 }
1765 
1766 // Reskew the completed blocks to put them back to the original coords.
1767 // (Blob outlines are not corrected for skew.)
1768 // Rotate blobs and blocks individually so text line direction is
1769 // horizontal. Record appropriate inverse transformations and required
1770 // classifier transformation in the blocks.
RotateAndReskewBlocks(TO_BLOCK_LIST * blocks)1771 void ColumnFinder::RotateAndReskewBlocks(TO_BLOCK_LIST* blocks) {
1772   TO_BLOCK_IT it(blocks);
1773   int block_index = 1;
1774   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1775     TO_BLOCK* to_block = it.data();
1776     BLOCK* block = to_block->block;
1777     block->set_index(block_index++);
1778     BLOBNBOX_IT blob_it(&to_block->blobs);
1779     // ColPartition::MakeBlock stored the inverse rotation that must be
1780     // applied to small vertical blocks to go back to the original image
1781     // coords at the end of recognition, but did not actually do any rotations,
1782     // so now blocks must actually be rotated to make them horizontal by the
1783     // inverse of that stored inverse rotation. This is of course a no-op
1784     // for normal blocks.
1785     FCOORD block_rotation = block->re_rotation();
1786     block_rotation.set_y(-block_rotation.y());
1787     block->poly_block()->rotate(block_rotation);
1788     // The final stored inverse coordinate rotation (block->re_rotation_)
1789     // is the sum of rerotate_ (for gross vertical pages) and the current
1790     // block->re_rotation_ (for small vertical text regions).
1791     // We will execute the inverse of that on all the blobs.
1792     FCOORD blob_rotation = block->re_rotation();
1793     blob_rotation.rotate(rerotate_);
1794     block->set_re_rotation(blob_rotation);
1795     blob_rotation.set_y(-blob_rotation.y());
1796     // TODO(rays) determine classify rotation by orientation detection.
1797     // In the mean time, it works for Chinese and English photo credits
1798     // to set a classify rotation to the stored block rerotation only if
1799     // the block rotation to do (before skew) is 0.
1800     if (block_rotation.y() == 0.0f) {
1801       block->set_classify_rotation(block->re_rotation());
1802     }
1803     // Blocks must also be rotated back by the skew angle.
1804     block->rotate(reskew_);
1805     // Save the skew in the block.
1806     block->set_skew(reskew_);
1807     // Rotate all the blobs if needed and recompute the bounding boxes.
1808     // Compute the block median blob width and height as we go.
1809     STATS widths(0, block->bounding_box().width());
1810     STATS heights(0, block->bounding_box().height());
1811     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1812       BLOBNBOX* blob = blob_it.data();
1813       if (blob_rotation.y() != 0.0f) {
1814         blob->cblob()->rotate(blob_rotation);
1815       }
1816       blob->compute_bounding_box();
1817       widths.add(blob->bounding_box().width(), 1);
1818       heights.add(blob->bounding_box().height(), 1);
1819     }
1820     block->set_median_size(static_cast<int>(widths.median() + 0.5),
1821                            static_cast<int>(heights.median() + 0.5));
1822     if (textord_debug_tabfind > 1)
1823       tprintf("Block median size = (%d, %d)\n",
1824               block->median_size().x(), block->median_size().y());
1825   }
1826 }
1827 
1828 // TransformToBlocks leaves all the small and noise blobs untouched in the
1829 // source TO_BLOCK. MoveSmallBlobs moves them into the main blobs list of
1830 // the block from the to_blocks list that contains them.
1831 // TODO(rays) This is inefficient with a large number of blocks. A more
1832 // efficient implementation is possible using a BBGrid.
MoveSmallBlobs(BLOBNBOX_LIST * bblobs,TO_BLOCK_LIST * to_blocks)1833 void ColumnFinder::MoveSmallBlobs(BLOBNBOX_LIST* bblobs,
1834                                   TO_BLOCK_LIST* to_blocks) {
1835   for (BLOBNBOX_IT bb_it(bblobs); !bb_it.empty(); bb_it.forward()) {
1836     BLOBNBOX* bblob = bb_it.extract();
1837     const TBOX& bbox = bblob->bounding_box();
1838     // Find the centre of the blob.
1839     ICOORD centre = bbox.botleft();
1840     centre += bbox.topright();
1841     centre /= 2;
1842     // Find the TO_BLOCK that contains the centre and put the blob in
1843     // its main blobs list.
1844     TO_BLOCK_IT to_it(to_blocks);
1845     for (to_it.mark_cycle_pt(); !to_it.cycled_list(); to_it.forward()) {
1846       TO_BLOCK* to_block = to_it.data();
1847       BLOCK* block = to_block->block;
1848       if (block->contains(centre)) {
1849         BLOBNBOX_IT blob_it(&to_block->blobs);
1850         blob_it.add_to_end(bblob);
1851         bblob = NULL;
1852         break;
1853       }
1854     }
1855     if (bblob != NULL) {
1856       delete bblob->cblob();
1857       delete bblob;
1858     }
1859   }
1860 }
1861 
1862 }  // namespace tesseract.
1863