• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        colpartition.cpp
3 // Description: Class to hold partitions of the page that correspond
4 //              roughly to text lines.
5 // Author:      Ray Smith
6 // Created:     Thu Aug 14 10:54:01 PDT 2008
7 //
8 // (C) Copyright 2008, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
19 ///////////////////////////////////////////////////////////////////////
20 
21 #include "colpartition.h"
22 #include "colpartitionset.h"
23 #include "workingpartset.h"
24 
25 namespace tesseract {
26 
27 ELIST2IZE(ColPartition)
28 CLISTIZE(ColPartition)
29 
30 //////////////// ColPartition Implementation ////////////////
31 
32 // If multiple partners survive the partner depth test beyond this level,
33 // then arbitrarily pick one.
34 const int kMaxPartnerDepth = 4;
35 // Maximum change in spacing (in inches) to ignore.
36 const double kMaxSpacingDrift = 1.0 / 72;  // 1/72 is one point.
37 // Maximum fraction of line height used as an additional allowance
38 // for top spacing.
39 const double kMaxTopSpacingFraction = 0.25;
40 // Maximum ratio of sizes for lines to be considered the same size.
41 const double kMaxSizeRatio = 1.5;
42 
43 // blob_type is the blob_region_type_ of the blobs in this partition.
44 // Vertical is the direction of logical vertical on the possibly skewed image.
ColPartition(BlobRegionType blob_type,const ICOORD & vertical)45 ColPartition::ColPartition(BlobRegionType blob_type, const ICOORD& vertical)
46   : left_margin_(MIN_INT32), right_margin_(MAX_INT32),
47     median_bottom_(MAX_INT32), median_top_(MIN_INT32), median_size_(0),
48     blob_type_(blob_type),
49     good_width_(false), good_column_(false),
50     left_key_tab_(false), right_key_tab_(false),
51     left_key_(0), right_key_(0), type_(PT_UNKNOWN), vertical_(vertical),
52     working_set_(NULL), block_owned_(false),
53     first_column_(-1), last_column_(-1), column_set_(NULL),
54     side_step_(0), top_spacing_(0), bottom_spacing_(0),
55     type_before_table_(PT_UNKNOWN), inside_table_column_(false),
56     nearest_neighbor_above_(NULL), nearest_neighbor_below_(NULL),
57     space_above_(0), space_below_(0), space_to_left_(0), space_to_right_(0) {
58 }
59 
60 // Constructs a fake ColPartition with a single fake BLOBNBOX, all made
61 // from a single TBOX.
62 // WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
63 // the ColPartition owns the BLOBNBOX!!!
64 // Call DeleteBoxes before deleting the ColPartition.
FakePartition(const TBOX & box)65 ColPartition* ColPartition::FakePartition(const TBOX& box) {
66   ColPartition* part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
67   part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));
68   part->set_left_margin(box.left());
69   part->set_right_margin(box.right());
70   part->ComputeLimits();
71   return part;
72 }
73 
~ColPartition()74 ColPartition::~ColPartition() {
75   // Remove this as a partner of all partners, as we don't want them
76   // referring to a deleted object.
77   ColPartition_C_IT it(&upper_partners_);
78   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
79     it.data()->RemovePartner(false, this);
80   }
81   it.set_to_list(&lower_partners_);
82   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
83     it.data()->RemovePartner(true, this);
84   }
85 }
86 
87 // Constructs a fake ColPartition with no BLOBNBOXes.
88 // Used for making horizontal line ColPartitions and types it accordingly.
ColPartition(const ICOORD & vertical,int left,int bottom,int right,int top)89 ColPartition::ColPartition(const ICOORD& vertical,
90                            int left, int bottom, int right, int top)
91   : left_margin_(MIN_INT32), right_margin_(MAX_INT32),
92     bounding_box_(left, bottom, right, top),
93     median_bottom_(bottom), median_top_(top), median_size_(top - bottom),
94     blob_type_(BRT_HLINE),
95     good_width_(false), good_column_(false),
96     left_key_tab_(false), right_key_tab_(false),
97     type_(PT_UNKNOWN), vertical_(vertical),
98     working_set_(NULL), block_owned_(false),
99     first_column_(-1), last_column_(-1), column_set_(NULL),
100     side_step_(0), top_spacing_(0), bottom_spacing_(0),
101     type_before_table_(PT_UNKNOWN), inside_table_column_(false),
102     nearest_neighbor_above_(NULL), nearest_neighbor_below_(NULL),
103     space_above_(0), space_below_(0), space_to_left_(0), space_to_right_(0) {
104   left_key_ = BoxLeftKey();
105   right_key_ = BoxRightKey();
106 }
107 
108 
109 // Adds the given box to the partition, updating the partition bounds.
110 // The list of boxes in the partition is updated, ensuring that no box is
111 // recorded twice, and the boxes are kept in increasing left position.
AddBox(BLOBNBOX * bbox)112 void ColPartition::AddBox(BLOBNBOX* bbox) {
113   boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);
114   TBOX box = bbox->bounding_box();
115   // Update the partition limits.
116   bounding_box_ += box;
117   if (!left_key_tab_)
118     left_key_ = BoxLeftKey();
119   if (!right_key_tab_)
120     right_key_ = BoxRightKey();
121   if (TabFind::WithinTestRegion(2, box.left(), box.bottom()))
122     tprintf("Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
123             box.left(), box.bottom(), box.right(), box.top(),
124             bounding_box_.left(), bounding_box_.right());
125 }
126 
127 // Claims the boxes in the boxes_list by marking them with a this owner
128 // pointer. If a box is already owned, then run Unique on it.
ClaimBoxes(WidthCallback * cb)129 void ColPartition::ClaimBoxes(WidthCallback* cb) {
130   bool completed = true;
131   do {
132     completed = true;
133     BLOBNBOX_C_IT bb_it(&boxes_);
134     for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
135       BLOBNBOX* bblob = bb_it.data();
136       ColPartition* other = bblob->owner();
137       if (other == NULL) {
138         // Normal case: ownership is available.
139         bblob->set_owner(this);
140       } else if (other != this) {
141         // bblob already has an owner, so resolve the dispute with Unique.
142         // Null everything owned by this upto, but not including bblob, as
143         // they will all be up for grabs in Unique.
144         for (bb_it.move_to_first(); bb_it.data() != bblob; bb_it.forward()) {
145           ASSERT_HOST(bb_it.data()->owner() == this);
146           bb_it.data()->set_owner(NULL);
147         }
148         // Null the owners of all other's blobs. They should all be
149         // still owned by other.
150         BLOBNBOX_C_IT other_it(&other->boxes_);
151         for (other_it.mark_cycle_pt(); !other_it.cycled_list();
152              other_it.forward()) {
153           ASSERT_HOST(other_it.data()->owner() == other);
154           other_it.data()->set_owner(NULL);
155         }
156         Unique(other, cb);
157         // Now we need to run ClaimBoxes on other, as it may have obtained
158         // a box from this (beyond bbox) that is owned by a third party.
159         other->ClaimBoxes(cb);
160         // Scan our own list for bblob. If bblob is still in it and owned by
161         // other, there is trouble. Otherwise we can just restart to finish
162         // the blob list.
163         bb_it.set_to_list(&boxes_);
164         for (bb_it.mark_cycle_pt();
165              !bb_it.cycled_list() && bb_it.data() != bblob;
166              bb_it.forward());
167         ASSERT_HOST(bb_it.cycled_list() || bblob->owner() == NULL);
168         completed = false;
169         break;
170       }
171     }
172   } while (!completed);
173 }
174 
175 // Delete the boxes that this partition owns.
DeleteBoxes()176 void ColPartition::DeleteBoxes() {
177   // Although the boxes_ list is a C_LIST, in some cases it owns the
178   // BLOBNBOXes, as the ColPartition takes ownership from the grid,
179   // and the BLOBNBOXes own the underlying C_BLOBs.
180   for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
181     BLOBNBOX* bblob = bb_it.extract();
182     delete bblob->cblob();
183     delete bblob;
184   }
185 }
186 
187 // Returns true if this is a legal partition - meaning that the conditions
188 // left_margin <= bounding_box left
189 // left_key <= bounding box left key
190 // bounding box left <= bounding box right
191 // and likewise for right margin and key
192 // are all met.
IsLegal()193 bool ColPartition::IsLegal() {
194   if (bounding_box_.left() > bounding_box_.right()) {
195     if (textord_debug_bugs) {
196       tprintf("Bounding box invalid\n");
197       Print();
198     }
199     return false;  // Bounding box invalid.
200   }
201   if (left_margin_ > bounding_box_.left() ||
202       right_margin_ < bounding_box_.right()) {
203     if (textord_debug_bugs) {
204       tprintf("Margins invalid\n");
205       Print();
206     }
207     return false;  // Margins invalid.
208   }
209   if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {
210     if (textord_debug_bugs) {
211       tprintf("Key inside box: %d v %d or %d v %d\n",
212               left_key_, BoxLeftKey(), right_key_, BoxRightKey());
213       Print();
214     }
215     return false;  // Keys inside the box.
216   }
217   return true;
218 }
219 
220 // Returns true if the left and right edges are approximately equal.
MatchingColumns(const ColPartition & other) const221 bool ColPartition::MatchingColumns(const ColPartition& other) const {
222   int y = (MidY() + other.MidY()) / 2;
223   if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,
224                    LeftAtY(y) / kColumnWidthFactor, 1))
225     return false;
226   if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,
227                    RightAtY(y) / kColumnWidthFactor, 1))
228     return false;
229   return true;
230 }
231 
232 // Sets the sort key using either the tab vector, or the bounding box if
233 // the tab vector is NULL. If the tab_vector lies inside the bounding_box,
234 // use the edge of the box as a key any way.
SetLeftTab(const TabVector * tab_vector)235 void ColPartition::SetLeftTab(const TabVector* tab_vector) {
236   if (tab_vector != NULL) {
237     left_key_ = tab_vector->sort_key();
238     left_key_tab_ = left_key_ <= BoxLeftKey();
239   } else {
240     left_key_tab_ = false;
241   }
242   if (!left_key_tab_)
243     left_key_ = BoxLeftKey();
244 }
245 
246 // As SetLeftTab, but with the right.
SetRightTab(const TabVector * tab_vector)247 void ColPartition::SetRightTab(const TabVector* tab_vector) {
248   if (tab_vector != NULL) {
249     right_key_ = tab_vector->sort_key();
250     right_key_tab_ = right_key_ >= BoxRightKey();
251   } else {
252     right_key_tab_ = false;
253   }
254   if (!right_key_tab_)
255     right_key_ = BoxRightKey();
256 }
257 
258 // Copies the left/right tab from the src partition, but if take_box is
259 // true, copies the box instead and uses that as a key.
CopyLeftTab(const ColPartition & src,bool take_box)260 void ColPartition::CopyLeftTab(const ColPartition& src, bool take_box) {
261   left_key_tab_ = take_box ? false : src.left_key_tab_;
262   if (left_key_tab_) {
263     left_key_ = src.left_key_;
264   } else {
265     bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));
266     left_key_ = BoxLeftKey();
267   }
268   if (left_margin_ > bounding_box_.left())
269     left_margin_ = src.left_margin_;
270 }
271 
272 // As CopyLeftTab, but with the right.
CopyRightTab(const ColPartition & src,bool take_box)273 void ColPartition::CopyRightTab(const ColPartition& src, bool take_box) {
274   right_key_tab_ = take_box ? false : src.right_key_tab_;
275   if (right_key_tab_) {
276     right_key_ = src.right_key_;
277   } else {
278     bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));
279     right_key_ = BoxRightKey();
280   }
281   if (right_margin_ < bounding_box_.right())
282     right_margin_ = src.right_margin_;
283 }
284 
285 // Add a partner above if upper, otherwise below.
286 // Add them uniquely and keep the list sorted by box left.
287 // Partnerships are added symmetrically to partner and this.
AddPartner(bool upper,ColPartition * partner)288 void ColPartition::AddPartner(bool upper, ColPartition* partner) {
289   if (upper) {
290     partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
291                                         true, this);
292     upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
293   } else {
294     partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
295                                         true, this);
296     lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
297   }
298 }
299 
300 // Removes the partner from this, but does not remove this from partner.
301 // This asymmetric removal is so as not to mess up the iterator that is
302 // working on partner's partner list.
RemovePartner(bool upper,ColPartition * partner)303 void ColPartition::RemovePartner(bool upper, ColPartition* partner) {
304   ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
305   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
306     if (it.data() == partner) {
307       it.extract();
308       break;
309     }
310   }
311 }
312 
313 // Returns the partner if the given partner is a singleton, otherwise NULL.
SingletonPartner(bool upper)314 ColPartition* ColPartition::SingletonPartner(bool upper) {
315   ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
316   if (!partners->singleton())
317     return NULL;
318   ColPartition_C_IT it(partners);
319   return it.data();
320 }
321 
322 // Merge with the other partition and delete it.
Absorb(ColPartition * other,WidthCallback * cb)323 void ColPartition::Absorb(ColPartition* other, WidthCallback* cb) {
324   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
325                                 bounding_box_.bottom()) ||
326       TabFind::WithinTestRegion(2, other->bounding_box_.left(),
327                                 other->bounding_box_.bottom())) {
328     tprintf("Merging:");
329     Print();
330     other->Print();
331   }
332   // Merge the two sorted lists.
333   BLOBNBOX_C_IT it(&boxes_);
334   BLOBNBOX_C_IT it2(&other->boxes_);
335   for (; !it2.empty(); it2.forward()) {
336     BLOBNBOX* bbox2 = it2.extract();
337     ColPartition* prev_owner = bbox2->owner();
338     ASSERT_HOST(prev_owner == other || prev_owner == NULL);
339     if (prev_owner == other)
340       bbox2->set_owner(this);
341     bbox2->set_region_type(blob_type_);
342     TBOX box2 = bbox2->bounding_box();
343     int left2 = box2.left();
344     while (!it.at_last() && it.data()->bounding_box().left() <= left2) {
345       if (it.data() == bbox2)
346         break;
347       it.forward();
348     }
349     if (!it.empty() && it.data() == bbox2)
350       continue;
351     if (it.empty() || (it.at_last() &&
352                        it.data()->bounding_box().left() <= left2)) {
353       it.add_after_then_move(bbox2);
354     } else {
355       it.add_before_then_move(bbox2);
356     }
357   }
358   left_margin_ = MIN(left_margin_, other->left_margin_);
359   right_margin_ = MAX(right_margin_, other->right_margin_);
360   if (other->left_key_ < left_key_) {
361     left_key_ = other->left_key_;
362     left_key_tab_ = other->left_key_tab_;
363   }
364   if (other->right_key_ > right_key_) {
365     right_key_ = other->right_key_;
366     right_key_tab_ = other->right_key_tab_;
367   }
368   delete other;
369   ComputeLimits();
370   if (cb != NULL) {
371     SetColumnGoodness(cb);
372   }
373 }
374 
375 // Shares out any common boxes amongst the partitions, ensuring that no
376 // box stays in both. Returns true if anything was done.
Unique(ColPartition * other,WidthCallback * cb)377 bool ColPartition::Unique(ColPartition* other, WidthCallback* cb) {
378   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
379                                          bounding_box_.bottom()) ||
380                TabFind::WithinTestRegion(2, other->bounding_box_.left(),
381                                          other->bounding_box_.bottom());
382   if (debug) {
383     tprintf("Running Unique:");
384     Print();
385     other->Print();
386   }
387   BLOBNBOX_C_IT it(&boxes_);
388   BLOBNBOX_C_IT it2(&other->boxes_);
389   it.mark_cycle_pt();
390   it2.mark_cycle_pt();
391   bool any_moved = false;
392   while (!it.cycled_list() && !it2.cycled_list()) {
393     BLOBNBOX* bbox = it.data();
394     BLOBNBOX* bbox2 = it2.data();
395     TBOX box = bbox->bounding_box();
396     TBOX box2 = bbox2->bounding_box();
397     if (box.left() < box2.left()) {
398       it.forward();
399     } else if (box.left() > box2.left()) {
400       it2.forward();
401     } else if (bbox == bbox2) {
402       // Separate out most frequent case for efficiency.
403       if (debug) {
404         tprintf("Keeping box (%d,%d)->(%d,%d) only in %s\n",
405                 box.left(), box.bottom(), box.right(), box.top(),
406                 ThisPartitionBetter(bbox, *other) ? "This" : "Other");
407       }
408       if (ThisPartitionBetter(bbox, *other))
409         it2.extract();
410       else
411         it.extract();
412       it.forward();
413       it2.forward();
414       any_moved = true;
415     } else {
416       // Lefts are equal, but boxes may be in any order.
417       BLOBNBOX_C_IT search_it(it2);
418       for (search_it.forward(); !search_it.at_first() &&
419            search_it.data() != bbox &&
420            search_it.data()->bounding_box().left() == box.left();
421            search_it.forward());
422       if (search_it.data() == bbox) {
423         // Found a match.
424         if (ThisPartitionBetter(bbox, *other)) {
425           search_it.extract();
426           // We just (potentially) invalidated it2, so reposition at bbox2.
427           it2.move_to_first();
428           for (it2.mark_cycle_pt(); it2.data() != bbox2; it2.forward());
429         } else {
430           it.extract();
431         }
432         it.forward();
433         any_moved = true;
434       } else {
435         // No match to bbox in list2. Just move first it forward.
436         it.forward();
437       }
438     }
439   }
440   // Now check to see if there are any in either list that would be better
441   // off in the other.
442   if (!it.empty()) {
443     it.move_to_first();
444     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
445       BLOBNBOX* bbox = it.data();
446       if (!ThisPartitionBetter(bbox, *other)) {
447         other->AddBox(it.extract());
448         TBOX box = bbox->bounding_box();
449         if (debug) {
450           tprintf("Moved box (%d,%d)->(%d,%d) from this to other:\n",
451                   box.left(), box.bottom(), box.right(), box.top());
452         }
453         any_moved = true;
454       }
455     }
456   }
457   if (!it2.empty()) {
458     it2.move_to_first();
459     for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
460       BLOBNBOX* bbox2 = it2.data();
461       if (ThisPartitionBetter(bbox2, *other)) {
462         AddBox(it2.extract());
463         TBOX box = bbox2->bounding_box();
464         if (debug) {
465           tprintf("Moved box (%d,%d)->(%d,%d) from other to this:\n",
466                   box.left(), box.bottom(), box.right(), box.top());
467         }
468         any_moved = true;
469       }
470     }
471   }
472   if (any_moved) {
473     if (debug)
474       tprintf("Unique did something!\n");
475     ComputeLimits();
476     other->ComputeLimits();
477     if (cb != NULL) {
478       SetColumnGoodness(cb);
479       other->SetColumnGoodness(cb);
480     }
481   }
482   return any_moved;
483 }
484 
485 // Split this partition at the given x coordinate, returning the right
486 // half and keeping the left half in this.
SplitAt(int split_x)487 ColPartition* ColPartition::SplitAt(int split_x) {
488   if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right())
489     return NULL;  // There will be no change.
490   ColPartition* split_part = ShallowCopy();
491   BLOBNBOX_C_IT it(&boxes_);
492   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
493     BLOBNBOX* bbox = it.data();
494     ColPartition* prev_owner = bbox->owner();
495     ASSERT_HOST(prev_owner == this || prev_owner == NULL);
496     const TBOX& box = bbox->bounding_box();
497     if (box.left() >= split_x) {
498       split_part->AddBox(it.extract());
499       if (prev_owner != NULL)
500         bbox->set_owner(split_part);
501     }
502   }
503   ASSERT_HOST(!it.empty());
504   if (split_part->IsEmpty()) {
505     // Split part ended up with nothing. Possible if split_x passes
506     // through the last blob.
507     delete split_part;
508     return NULL;
509   }
510   right_key_tab_ = false;
511   split_part->left_key_tab_ = false;
512   right_margin_ = split_x;
513   split_part->left_margin_ = split_x;
514   ComputeLimits();
515   split_part->ComputeLimits();
516   return split_part;
517 }
518 
519 // Recalculates all the coordinate limits of the partition.
ComputeLimits()520 void ColPartition::ComputeLimits() {
521   bounding_box_ = TBOX();  // Clear it
522   BLOBNBOX_C_IT it(&boxes_);
523   BLOBNBOX* bbox = NULL;
524   if (it.empty()) {
525     bounding_box_.set_left(left_margin_);
526     bounding_box_.set_right(right_margin_);
527     bounding_box_.set_bottom(0);
528     bounding_box_.set_top(0);
529   } else {
530     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
531       bbox = it.data();
532       bounding_box_ += bbox->bounding_box();
533     }
534   }
535   if (!left_key_tab_)
536     left_key_ = BoxLeftKey();
537   if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
538     // TODO(rays) investigate the causes of these error messages, to find
539     // out if they are genuinely harmful, or just indicative of junk input.
540     tprintf("Computed left-illegal partition\n");
541     Print();
542   }
543   if (!right_key_tab_)
544     right_key_ = BoxRightKey();
545   if (right_key_ < BoxRightKey() && textord_debug_bugs) {
546     tprintf("Computed right-illegal partition\n");
547     Print();
548   }
549   if (it.empty())
550     return;
551   STATS top_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
552   STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
553   STATS size_stats(0, bounding_box_.height() + 1);
554   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
555     bbox = it.data();
556     TBOX box = bbox->bounding_box();
557     top_stats.add(box.top(), 1);
558     bottom_stats.add(box.bottom(), 1);
559     size_stats.add(box.height(), 1);
560   }
561   median_top_ = static_cast<int>(top_stats.median() + 0.5);
562   median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
563   median_size_ = static_cast<int>(size_stats.median() + 0.5);
564 
565   if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
566     tprintf("Made partition with bad right coords");
567     Print();
568   }
569   if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
570     tprintf("Made partition with bad left coords");
571     Print();
572   }
573   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
574                                 bounding_box_.bottom())) {
575     tprintf("Recomputed box for partition %p\n", this);
576     Print();
577   }
578 }
579 
580 // Computes and sets the type_ and first_colum_, last_column_ and column_set_.
SetPartitionType(ColPartitionSet * columns)581 void ColPartition::SetPartitionType(ColPartitionSet* columns) {
582   int first_spanned_col = -1;
583   int last_spanned_col = -1;
584   type_ = columns->SpanningType(blob_type_,
585                                 bounding_box_.left(), bounding_box_.right(),
586                                 MidY(), left_margin_, right_margin_,
587                                 &first_column_, &last_column_,
588                                 &first_spanned_col, &last_spanned_col);
589   column_set_ = columns;
590   if (first_column_ != last_column_ &&
591       (type_ == PT_PULLOUT_TEXT || type_ == PT_PULLOUT_IMAGE ||
592        type_ == PT_PULLOUT_LINE)) {
593     // Unequal columns may indicate that the pullout spans one of the columns
594     // it lies in, so force it to be allocated to just that column.
595     if (first_spanned_col >= 0) {
596       first_column_ = first_spanned_col;
597       last_column_ = first_spanned_col;
598     } else {
599       if ((first_column_ & 1) == 0)
600         last_column_ = first_column_;
601       else if ((last_column_ & 1) == 0)
602         first_column_ = last_column_;
603       else
604         first_column_ = last_column_ = (first_column_ + last_column_) / 2;
605     }
606   }
607 }
608 
609 // Returns the first and last column touched by this partition.
ColumnRange(ColPartitionSet * columns,int * first_col,int * last_col)610 void ColPartition::ColumnRange(ColPartitionSet* columns,
611                                int* first_col, int* last_col) {
612   int first_spanned_col = -1;
613   int last_spanned_col = -1;
614   type_ = columns->SpanningType(blob_type_,
615                                 bounding_box_.left(), bounding_box_.right(),
616                                 MidY(), left_margin_, right_margin_,
617                                 first_col, last_col,
618                                 &first_spanned_col, &last_spanned_col);
619 }
620 
621 // Sets the internal flags good_width_ and good_column_.
SetColumnGoodness(WidthCallback * cb)622 void ColPartition::SetColumnGoodness(WidthCallback* cb) {
623   int y = MidY();
624   int width = RightAtY(y) - LeftAtY(y);
625   good_width_ = cb->Run(width);
626   good_column_ = blob_type_ == BRT_TEXT && left_key_tab_ && right_key_tab_;
627 }
628 
629 // Adds this ColPartition to a matching WorkingPartSet if one can be found,
630 // otherwise starts a new one in the appropriate column, ending the previous.
AddToWorkingSet(const ICOORD & bleft,const ICOORD & tright,int resolution,ColPartition_LIST * used_parts,WorkingPartSet_LIST * working_sets)631 void ColPartition::AddToWorkingSet(const ICOORD& bleft, const ICOORD& tright,
632                                    int resolution,
633                                    ColPartition_LIST* used_parts,
634                                    WorkingPartSet_LIST* working_sets) {
635   if (block_owned_)
636     return;  // Done it already.
637   block_owned_ = true;
638   WorkingPartSet_IT it(working_sets);
639   // If there is an upper partner use its working_set_ directly.
640   ColPartition* partner = SingletonPartner(true);
641   if (partner != NULL && partner->working_set_ != NULL) {
642     working_set_ = partner->working_set_;
643     working_set_->AddPartition(this);
644     return;
645   }
646   if (partner != NULL && textord_debug_bugs) {
647     tprintf("Partition with partner has no working set!:");
648     Print();
649     partner->Print();
650   }
651   // Search for the column that the left edge fits in.
652   WorkingPartSet* work_set = NULL;
653   it.move_to_first();
654   int col_index = 0;
655   for (it.mark_cycle_pt(); !it.cycled_list() &&
656        col_index != first_column_;
657         it.forward(), ++col_index);
658   if (textord_debug_tabfind >= 2) {
659     tprintf("Match is %s for:", (col_index & 1) ? "Real" : "Between");
660     Print();
661   }
662   if (it.cycled_list() && textord_debug_bugs) {
663     tprintf("Target column=%d, only had %d\n", first_column_, col_index);
664   }
665   ASSERT_HOST(!it.cycled_list());
666   work_set = it.data();
667   // If last_column_ != first_column, then we need to scoop up all blocks
668   // between here and the last_column_ and put back in work_set.
669   if (!it.cycled_list() && last_column_ != first_column_) {
670     // Find the column that the right edge falls in.
671     BLOCK_LIST completed_blocks;
672     TO_BLOCK_LIST to_blocks;
673     for (; !it.cycled_list() && col_index <= last_column_;
674          it.forward(), ++col_index) {
675       WorkingPartSet* end_set = it.data();
676       end_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,
677                                       &completed_blocks, &to_blocks);
678     }
679     work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
680   }
681   working_set_ = work_set;
682   work_set->AddPartition(this);
683 }
684 
685 // From the given block_parts list, builds one or more BLOCKs and
686 // corresponding TO_BLOCKs, such that the line spacing is uniform in each.
687 // Created blocks are appended to the end of completed_blocks and to_blocks.
688 // The used partitions are put onto used_parts, as they may still be referred
689 // to in the partition grid. bleft, tright and resolution are the bounds
690 // and resolution of the original image.
LineSpacingBlocks(const ICOORD & bleft,const ICOORD & tright,int resolution,ColPartition_LIST * block_parts,ColPartition_LIST * used_parts,BLOCK_LIST * completed_blocks,TO_BLOCK_LIST * to_blocks)691 void ColPartition::LineSpacingBlocks(const ICOORD& bleft, const ICOORD& tright,
692                                      int resolution,
693                                      ColPartition_LIST* block_parts,
694                                      ColPartition_LIST* used_parts,
695                                      BLOCK_LIST* completed_blocks,
696                                      TO_BLOCK_LIST* to_blocks) {
697   int page_height = tright.y() - bleft.y();
698   // Compute the initial spacing stats.
699   ColPartition_IT it(block_parts);
700   int part_count = 0;
701   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
702     ColPartition* part = it.data();
703     ASSERT_HOST(!part->boxes()->empty());
704     STATS side_steps(0, part->bounding_box().height());
705     BLOBNBOX_C_IT blob_it(part->boxes());
706     int prev_bottom = blob_it.data()->bounding_box().bottom();
707     for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
708       BLOBNBOX* blob = blob_it.data();
709       int bottom = blob->bounding_box().bottom();
710       int step = bottom - prev_bottom;
711       if (step < 0)
712         step = -step;
713       side_steps.add(step, 1);
714       prev_bottom = bottom;
715     }
716     part->set_side_step(static_cast<int>(side_steps.median() + 0.5));
717     if (!it.at_last()) {
718       ColPartition* next_part = it.data_relative(1);
719       part->set_bottom_spacing(part->median_bottom() -
720                                next_part->median_bottom());
721       part->set_top_spacing(part->median_top() - next_part->median_top());
722     } else {
723       part->set_bottom_spacing(page_height);
724       part->set_top_spacing(page_height);
725     }
726     if (textord_debug_tabfind) {
727       part->Print();
728       tprintf("side step = %.2f, top spacing = %d, bottom spacing=%d\n",
729               side_steps.median(), part->top_spacing(), part->bottom_spacing());
730     }
731     ++part_count;
732   }
733   if (part_count == 0)
734     return;
735 
736   SmoothSpacings(resolution, page_height, block_parts);
737 
738   // Move the partitions into individual block lists and make the blocks.
739   BLOCK_IT block_it(completed_blocks);
740   TO_BLOCK_IT to_block_it(to_blocks);
741   ColPartition_LIST spacing_parts;
742   ColPartition_IT sp_block_it(&spacing_parts);
743   for (it.mark_cycle_pt(); !it.empty();) {
744     ColPartition* part = it.extract();
745     sp_block_it.add_to_end(part);
746     it.forward();
747     if (it.empty() || !part->SpacingsEqual(*it.data(), resolution)) {
748       // There is a spacing boundary. Check to see if it.data() belongs
749       // better in the current block or the next one.
750       if (!it.empty()) {
751         ColPartition* next_part = it.data();
752         // If there is a size match one-way, then the middle line goes with
753         // its matched size, otherwise it goes with the smallest spacing.
754         ColPartition* third_part = it.at_last() ? NULL : it.data_relative(1);
755         if (textord_debug_tabfind)
756           tprintf("Spacings unequal: upper:%d/%d, lower:%d/%d,"
757                   " sizes %d %d %d\n",
758                   part->top_spacing(), part->bottom_spacing(),
759                   next_part->top_spacing(), next_part->bottom_spacing(),
760                   part->median_size(), next_part->median_size(),
761                   third_part != NULL ? third_part->median_size() : 0);
762         // If spacing_diff ends up positive, then next_part goes in the
763         // current block.
764         int spacing_diff = next_part->bottom_spacing() - part->bottom_spacing();
765         if (part->SizesSimilar(*next_part) &&
766             (third_part == NULL || !next_part->SizesSimilar(*third_part))) {
767           // Sizes overrule.
768           spacing_diff = 1;
769         } else if (!part->SizesSimilar(*next_part) && third_part != NULL &&
770                    next_part->SizesSimilar(*third_part)) {
771           // Sizes overrule.
772           spacing_diff = -1;
773         }
774         if (spacing_diff > 0) {
775           sp_block_it.add_to_end(it.extract());
776           it.forward();
777         }
778       }
779       TO_BLOCK* to_block = MakeBlock(bleft, tright, &spacing_parts, used_parts);
780       if (to_block != NULL) {
781         to_block_it.add_to_end(to_block);
782         block_it.add_to_end(to_block->block);
783       }
784       sp_block_it.set_to_list(&spacing_parts);
785     }
786   }
787 }
788 
789 // Helper function to clip the input pos to the given bleft, tright bounds.
ClipCoord(const ICOORD & bleft,const ICOORD & tright,ICOORD * pos)790 static void ClipCoord(const ICOORD& bleft, const ICOORD& tright, ICOORD* pos) {
791   if (pos->x() < bleft.x())
792     pos->set_x(bleft.x());
793   if (pos->x() > tright.x())
794     pos->set_x(tright.x());
795   if (pos->y() < bleft.y())
796     pos->set_y(bleft.y());
797   if (pos->y() > tright.y())
798     pos->set_y(tright.y());
799 }
800 
801 // Constructs a block from the given list of partitions.
802 // Arguments are as LineSpacingBlocks above.
MakeBlock(const ICOORD & bleft,const ICOORD & tright,ColPartition_LIST * block_parts,ColPartition_LIST * used_parts)803 TO_BLOCK* ColPartition::MakeBlock(const ICOORD& bleft, const ICOORD& tright,
804                                   ColPartition_LIST* block_parts,
805                                   ColPartition_LIST* used_parts) {
806   if (block_parts->empty())
807     return NULL;  // Nothing to do.
808   ColPartition_IT it(block_parts);
809   ColPartition* part = it.data();
810   int line_spacing = part->bottom_spacing();
811   if (line_spacing < part->median_size())
812     line_spacing = part->bounding_box().height();
813   PolyBlockType type = it.data()->type();
814   bool text_type = it.data()->IsTextType();
815   ICOORDELT_LIST vertices;
816   ICOORDELT_IT vert_it(&vertices);
817   ICOORD start, end;
818   int min_x = MAX_INT32;
819   int max_x = MIN_INT32;
820   int min_y = MAX_INT32;
821   int max_y = MIN_INT32;
822   int iteration = 0;
823   do {
824     if (iteration == 0)
825       ColPartition::LeftEdgeRun(&it, &start, &end);
826     else
827       ColPartition::RightEdgeRun(&it, &start, &end);
828     ClipCoord(bleft, tright, &start);
829     ClipCoord(bleft, tright, &end);
830     vert_it.add_after_then_move(new ICOORDELT(start));
831     vert_it.add_after_then_move(new ICOORDELT(end));
832     min_x = MIN(min_x, start.x());
833     min_x = MIN(min_x, end.x());
834     max_x = MAX(max_x, start.x());
835     max_x = MAX(max_x, end.x());
836     min_y = MIN(min_y, start.y());
837     min_y = MIN(min_y, end.y());
838     max_y = MAX(max_y, start.y());
839     max_y = MAX(max_y, end.y());
840     if ((iteration == 0 && it.at_first()) ||
841         (iteration == 1 && it.at_last())) {
842       ++iteration;
843       it.move_to_last();
844     }
845   } while (iteration < 2);
846   if (textord_debug_tabfind)
847     tprintf("Making block at (%d,%d)->(%d,%d)\n",
848             min_x, min_y, max_x, max_y);
849   BLOCK* block = new BLOCK("", true, 0, 0, min_x, min_y, max_x, max_y);
850   block->set_poly_block(new POLY_BLOCK(&vertices, type));
851   // Make a matching TO_BLOCK and put all the BLOBNBOXes from the parts in it.
852   // Move all the parts to a done list as they are no longer needed, except
853   // that have have to continue to exist until the part grid is deleted.
854   // Compute the median blob size as we go, as the block needs to know.
855   STATS heights(0, max_y + 1 - min_y);
856   TO_BLOCK* to_block = new TO_BLOCK(block);
857   BLOBNBOX_IT blob_it(&to_block->blobs);
858   ColPartition_IT used_it(used_parts);
859   for (it.move_to_first(); !it.empty(); it.forward()) {
860     ColPartition* part = it.extract();
861     if (text_type) {
862       // Only transfer blobs from text regions to the output blocks.
863       // The rest stay behind and get deleted with the ColPartitions.
864       for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty();
865            bb_it.forward()) {
866         BLOBNBOX* bblob = bb_it.extract();
867         ASSERT_HOST(bblob->owner() == part);
868         ASSERT_HOST(bblob->region_type() >= BRT_UNKNOWN);
869         C_OUTLINE_IT ol_it(bblob->cblob()->out_list());
870         ASSERT_HOST(ol_it.data()->pathlength() > 0);
871         heights.add(bblob->bounding_box().height(), 1);
872         blob_it.add_after_then_move(bblob);
873       }
874     }
875     used_it.add_to_end(part);
876   }
877   if (text_type && blob_it.empty()) {
878     delete block;
879     delete to_block;
880     return NULL;
881   }
882   to_block->line_size = heights.median();
883   int block_height = block->bounding_box().height();
884   if (block_height < line_spacing)
885     line_spacing = block_height;
886   to_block->line_spacing = line_spacing;
887   to_block->max_blob_size = block_height + 1;
888   if (type == PT_VERTICAL_TEXT) {
889     // This block will get rotated 90 deg clockwise so record the inverse.
890     FCOORD rotation(0.0f, 1.0f);
891     block->set_re_rotation(rotation);
892   }
893   return to_block;
894 }
895 
896 // Returns a copy of everything except the list of boxes. The resulting
897 // ColPartition is only suitable for keeping in a column candidate list.
ShallowCopy() const898 ColPartition* ColPartition::ShallowCopy() const {
899   ColPartition* part = new ColPartition(blob_type_, vertical_);
900   part->left_margin_ = left_margin_;
901   part->right_margin_ = right_margin_;
902   part->bounding_box_ = bounding_box_;
903   part->median_bottom_ = median_bottom_;
904   part->median_top_ = median_top_;
905   part->median_size_ = median_size_;
906   part->good_width_ = good_width_;
907   part->good_column_ = good_column_;
908   part->left_key_tab_ = left_key_tab_;
909   part->right_key_tab_ = right_key_tab_;
910   part->type_ = type_;
911   part->left_key_ = left_key_;
912   part->right_key_ = right_key_;
913   return part;
914 }
915 
916 // Provides a color for BBGrid to draw the rectangle.
917 // Must be kept in sync with PolyBlockType.
BoxColor() const918 ScrollView::Color  ColPartition::BoxColor() const {
919   return POLY_BLOCK::ColorForPolyBlockType(type_);
920 }
921 
922 // Keep in sync with BlobRegionType.
923 static char kBlobTypes[BRT_COUNT + 1] = "NHRIUVT";
924 
925 // Prints debug information on this.
Print()926 void ColPartition::Print() {
927   int y = MidY();
928   tprintf("ColPart:%c(M%d-%c%d-B%d,%d/%d)->(%dB-%d%c-%dM,%d/%d)"
929           " w-ok=%d, v-ok=%d, type=%d%c, fc=%d, lc=%d, boxes=%d"
930           " ts=%d bs=%d ls=%d rs=%d\n",
931           boxes_.empty() ? 'E' : ' ',
932           left_margin_, left_key_tab_ ? 'T' : 'B', LeftAtY(y),
933           bounding_box_.left(), median_bottom_, bounding_box_.bottom(),
934           bounding_box_.right(), RightAtY(y), right_key_tab_ ? 'T' : 'B',
935           right_margin_, median_top_, bounding_box_.top(),
936           good_width_, good_column_, type_,
937           kBlobTypes[blob_type_],
938           first_column_, last_column_, boxes_.length(),
939           space_above_, space_below_, space_to_left_, space_to_right_);
940 }
941 
942 // Sets the types of all partitions in the run to be the max of the types.
SmoothPartnerRun(int working_set_count)943 void ColPartition::SmoothPartnerRun(int working_set_count) {
944   STATS left_stats(0, working_set_count);
945   STATS right_stats(0, working_set_count);
946   PolyBlockType max_type = type_;
947   ColPartition* partner;
948   for (partner = SingletonPartner(false); partner != NULL;
949        partner = partner->SingletonPartner(false)) {
950     if (partner->type_ > max_type)
951       max_type = partner->type_;
952     if (column_set_ == partner->column_set_) {
953       left_stats.add(partner->first_column_, 1);
954       right_stats.add(partner->last_column_, 1);
955     }
956   }
957   type_ = max_type;
958   first_column_ = left_stats.mode();
959   last_column_ = right_stats.mode();
960   if (last_column_ < first_column_)
961     last_column_ = first_column_;
962 
963   for (partner = SingletonPartner(false); partner != NULL;
964        partner = partner->SingletonPartner(false)) {
965     partner->type_ = max_type;
966     if (column_set_ == partner->column_set_) {
967       partner->first_column_ = first_column_;
968       partner->last_column_ = last_column_;
969     }
970   }
971 }
972 
973 // Cleans up the partners of the given type so that there is at most
974 // one partner. This makes block creation simpler.
RefinePartners(PolyBlockType type)975 void ColPartition::RefinePartners(PolyBlockType type) {
976   if (type_ == type) {
977     RefinePartnersInternal(true);
978     RefinePartnersInternal(false);
979   } else if (type == PT_COUNT) {
980     // This is the final pass. Make sure only the correctly typed
981     // partners surivive, however many there are.
982     RefinePartnersByType(true, &upper_partners_);
983     RefinePartnersByType(false, &lower_partners_);
984   }
985 }
986 
987 ////////////////// PRIVATE CODE /////////////////////////////
988 
989 // Cleans up the partners above if upper is true, else below.
RefinePartnersInternal(bool upper)990 void ColPartition::RefinePartnersInternal(bool upper) {
991   ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
992   if (!partners->empty() && !partners->singleton()) {
993     RefinePartnersByType(upper, partners);
994     if (!partners->empty() && !partners->singleton()) {
995       // Check for transitive partnerships and break the cycle.
996       RefinePartnerShortcuts(upper, partners);
997       if (!partners->empty() && !partners->singleton()) {
998         // Types didn't fix it. Flowing text keeps the one with the longest
999         // sequence of singleton matching partners. All others max overlap.
1000         if (type_ == PT_FLOWING_TEXT)
1001           RefineFlowingTextPartners(upper, partners);
1002         else
1003           RefinePartnersByOverlap(upper, partners);
1004       }
1005     }
1006   }
1007 }
1008 
1009 // Restricts the partners to only desirable types. For text and BRT_HLINE this
1010 // means the same type_ , and for image types it means any image type.
RefinePartnersByType(bool upper,ColPartition_CLIST * partners)1011 void ColPartition::RefinePartnersByType(bool upper,
1012                                         ColPartition_CLIST* partners) {
1013   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
1014                                 bounding_box_.bottom())) {
1015     tprintf("Refining %s partners by type for:\n", upper ? "Upper" : "Lower");
1016     Print();
1017   }
1018   ColPartition_C_IT it(partners);
1019   // Purify text by type.
1020   if (blob_type_ > BRT_UNKNOWN || blob_type_ == BRT_HLINE) {
1021     // Keep only partners matching type_.
1022     // Exception: PT_VERTICAL_TEXT is allowed to stay with the other
1023     // text types if it is the only partner.
1024     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1025       ColPartition* partner = it.data();
1026       if (partner->type_ != type_ &&
1027           (!partners->singleton() ||
1028            (type_ != PT_VERTICAL_TEXT && partner->type_ != PT_VERTICAL_TEXT) ||
1029             !IsTextType() || !partner->IsTextType())) {
1030         partner->RemovePartner(!upper, this);
1031         it.extract();
1032       } else if (TabFind::WithinTestRegion(2, bounding_box_.left(),
1033                                            bounding_box_.bottom())) {
1034         tprintf("Keeping partner:");
1035         partner->Print();
1036       }
1037     }
1038   } else {
1039     // Keep only images with images, but not being fussy about type.
1040     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1041       ColPartition* partner = it.data();
1042       if (partner->blob_type_ > BRT_UNKNOWN ||
1043           partner->blob_type_ == BRT_HLINE) {
1044         partner->RemovePartner(!upper, this);
1045         it.extract();
1046       } else if (TabFind::WithinTestRegion(2, bounding_box_.left(),
1047                                            bounding_box_.bottom())) {
1048         tprintf("Keeping partner:");
1049         partner->Print();
1050       }
1051     }
1052   }
1053 }
1054 
1055 // Remove transitive partnerships: this<->a, and a<->b and this<->b.
1056 // Gets rid of this<->b, leaving a clean chain.
1057 // Also if we have this<->a and a<->this, then gets rid of this<->a, as
1058 // this has multiple partners.
RefinePartnerShortcuts(bool upper,ColPartition_CLIST * partners)1059 void ColPartition::RefinePartnerShortcuts(bool upper,
1060                                           ColPartition_CLIST* partners) {
1061   bool done_any = false;
1062   do {
1063     done_any = false;
1064     ColPartition_C_IT it(partners);
1065     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1066       ColPartition* a = it.data();
1067       // Check for a match between all of a's partners (it1/b1) and all
1068       // of this's partners (it2/b2).
1069       ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
1070       for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
1071         ColPartition* b1 = it1.data();
1072         if (b1 == this) {
1073           done_any = true;
1074           it.extract();
1075           a->RemovePartner(!upper, this);
1076           break;
1077         }
1078         ColPartition_C_IT it2(partners);
1079         for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
1080           ColPartition* b2 = it2.data();
1081           if (b1 == b2) {
1082             // Jackpot! b2 should not be a partner of this.
1083             it2.extract();
1084             b2->RemovePartner(!upper, this);
1085             done_any = true;
1086             // That potentially invalidated all the iterators, so break out
1087             // and start again.
1088             break;
1089           }
1090         }
1091         if (done_any)
1092           break;
1093       }
1094       if (done_any)
1095         break;
1096     }
1097   } while (done_any && !partners->empty() && !partners->singleton());
1098 }
1099 
1100 // Keeps the partner with the longest sequence of singleton matching partners.
1101 // Converts all others to pullout.
RefineFlowingTextPartners(bool upper,ColPartition_CLIST * partners)1102 void ColPartition::RefineFlowingTextPartners(bool upper,
1103                                              ColPartition_CLIST* partners) {
1104   ColPartition_C_IT it(partners);
1105   ColPartition* best_partner = it.data();
1106   // Nasty iterative algorithm.
1107   int depth = 1;
1108   int survivors = 0;
1109   do {
1110     survivors = 0;
1111     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1112       ColPartition* partner = it.data();
1113       // See if it survives a chase to depth levels.
1114       for (int i = 0; i < depth && partner != NULL; ++i) {
1115         partner = partner->SingletonPartner(upper);
1116         if (partner != NULL && partner->type_ != PT_FLOWING_TEXT)
1117           partner = NULL;
1118       }
1119       if (partner != NULL) {
1120         ++survivors;
1121         best_partner = it.data();
1122       }
1123     }
1124     ++depth;
1125   } while (survivors > 1 && depth <= kMaxPartnerDepth);
1126   // Keep only the best partner.
1127   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1128     ColPartition* partner = it.data();
1129     if (partner != best_partner) {
1130       partner->RemovePartner(!upper, this);
1131       it.extract();
1132       // Change the types of partner to be PT_PULLOUT_TEXT.
1133       while (partner != NULL && partner->type_ == PT_FLOWING_TEXT) {
1134         partner->type_ = PT_PULLOUT_TEXT;
1135         partner = partner->SingletonPartner(upper);
1136       }
1137     }
1138   }
1139 }
1140 
1141 // Keep the partner with the biggest overlap.
RefinePartnersByOverlap(bool upper,ColPartition_CLIST * partners)1142 void ColPartition::RefinePartnersByOverlap(bool upper,
1143                                            ColPartition_CLIST* partners) {
1144   ColPartition_C_IT it(partners);
1145   ColPartition* best_partner = it.data();
1146   // Find the partner with the best overlap.
1147   int best_overlap = 0;
1148   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1149     ColPartition* partner = it.data();
1150     int overlap = MIN(bounding_box_.right(), partner->bounding_box_.right())
1151                 - MAX(bounding_box_.left(), partner->bounding_box_.left());
1152     if (overlap > best_overlap) {
1153       best_overlap = overlap;
1154       best_partner = partner;
1155     }
1156   }
1157   // Keep only the best partner.
1158   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1159     ColPartition* partner = it.data();
1160     if (partner != best_partner) {
1161       partner->RemovePartner(!upper, this);
1162       it.extract();
1163     }
1164   }
1165 }
1166 
1167 // Return true if bbox belongs better in this than other.
ThisPartitionBetter(BLOBNBOX * bbox,const ColPartition & other)1168 bool ColPartition::ThisPartitionBetter(BLOBNBOX* bbox,
1169                                        const ColPartition& other) {
1170   TBOX box = bbox->bounding_box();
1171   // Margins take priority.
1172   int left = box.left();
1173   int right = box.right();
1174   if (left < left_margin_ || right > right_margin_)
1175     return false;
1176   if (left < other.left_margin_ || right > other.right_margin_)
1177     return true;
1178   int top = box.top();
1179   int bottom = box.bottom();
1180   int this_overlap = MIN(top, median_top_) - MAX(bottom, median_bottom_);
1181   int other_overlap = MIN(top, other.median_top_) -
1182                       MAX(bottom, other.median_bottom_);
1183   int this_miss = median_top_ - median_bottom_ - this_overlap;
1184   int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
1185   if (TabFind::WithinTestRegion(3, box.left(), box.bottom())) {
1186     tprintf("Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
1187             box.left(), box.bottom(), box.right(), box.top(),
1188             this_overlap, other_overlap, this_miss, other_miss,
1189             median_top_, other.median_top_);
1190   }
1191   if (this_miss < other_miss)
1192     return true;
1193   if (this_miss > other_miss)
1194     return false;
1195   if (this_overlap > other_overlap)
1196     return true;
1197   if (this_overlap < other_overlap)
1198     return false;
1199   return median_top_ >= other.median_top_;
1200 }
1201 
1202 // Returns the median line-spacing between the current position and the end
1203 // of the list.
1204 // The iterator is passed by value so the iteration does not modify the
1205 // caller's iterator.
MedianSpacing(int page_height,ColPartition_IT it)1206 static int MedianSpacing(int page_height, ColPartition_IT it) {
1207   STATS stats(0, page_height);
1208   while (!it.cycled_list()) {
1209     ColPartition* part = it.data();
1210     it.forward();
1211     stats.add(part->bottom_spacing(), 1);
1212     stats.add(part->top_spacing(), 1);
1213   }
1214   return static_cast<int>(stats.median() + 0.5);
1215 }
1216 
1217 // Smoothes the spacings in the list into groups of equal linespacing.
1218 // resolution is the resolution of the original image, used as a basis
1219 // for thresholds in change of spacing. page_height is in pixels.
SmoothSpacings(int resolution,int page_height,ColPartition_LIST * parts)1220 void ColPartition::SmoothSpacings(int resolution, int page_height,
1221                                   ColPartition_LIST* parts) {
1222   // The task would be trivial if we didn't have to allow for blips -
1223   // occasional offsets in spacing caused by anomolous text, such as all
1224   // caps, groups of descenders, joined words, Arabic etc.
1225   // The neighbourhood stores a consecutive group of partitions so that
1226   // blips can be detected correctly, yet conservatively enough to not
1227   // mistake genuine spacing changes for blips. See example below.
1228   ColPartition* neighbourhood[PN_COUNT];
1229   ColPartition_IT it(parts);
1230   it.mark_cycle_pt();
1231   // Although we know nothing about the spacings is this list, the median is
1232   // used as an approximation to allow blips.
1233   // If parts of this block aren't spaced to the median, then we can't
1234   // accept blips in those parts, but we'll recalculate it each time we
1235   // split the block, so the median becomes more likely to match all the text.
1236   int median_space = MedianSpacing(page_height, it);
1237   ColPartition_IT start_it(it);
1238   ColPartition_IT end_it(it);
1239   for (int i = 0; i < PN_COUNT; ++i) {
1240     if (i < PN_UPPER || it.cycled_list()) {
1241       neighbourhood[i] = NULL;
1242     } else {
1243       if (i == PN_LOWER)
1244         end_it = it;
1245       neighbourhood[i] = it.data();
1246       it.forward();
1247     }
1248   }
1249   while (neighbourhood[PN_UPPER] != NULL) {
1250     // Test for end of a group. Normally SpacingsEqual is true within a group,
1251     // but in the case of a blip, it will be false. Here is an example:
1252     // Line enum   Spacing below (spacing between tops of lines)
1253     //  1   ABOVE2    20
1254     //  2   ABOVE1    20
1255     //  3   UPPER     15
1256     //  4   LOWER     25
1257     //  5   BELOW1    20
1258     //  6   BELOW2    20
1259     // Line 4 is all in caps (regular caps), so the spacing between line 3
1260     // and line 4 (looking at the tops) is smaller than normal, and the
1261     // spacing between line 4 and line 5 is larger than normal, but the
1262     // two of them add to twice the normal spacing.
1263     // The following if has to accept unequal spacings 3 times to pass the
1264     // blip (20/15, 15/25 and 25/20)
1265     // When the blip is in the middle, OKSpacingBlip tests that one of
1266     // ABOVE1 and BELOW1 matches the median.
1267     // The first time, everything is shifted down 1, so we present
1268     // OKSpacingBlip with neighbourhood+1 and check that PN_UPPER is median.
1269     // The last time, everything is shifted up 1, so we present OKSpacingBlip
1270     // with neighbourhood-1 and check that PN_LOWER matches the median.
1271     if (neighbourhood[PN_LOWER] == NULL ||
1272         (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
1273                                                  resolution) &&
1274          !OKSpacingBlip(resolution, median_space, neighbourhood) &&
1275          (!OKSpacingBlip(resolution, median_space, neighbourhood - 1) ||
1276           !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
1277          (!OKSpacingBlip(resolution, median_space, neighbourhood + 1) ||
1278           !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
1279       // The group has ended. PN_UPPER is the last member.
1280       // Compute the mean spacing over the group.
1281       ColPartition_IT sum_it(start_it);
1282       ColPartition* last_part = neighbourhood[PN_UPPER];
1283       double total_bottom = 0.0;
1284       double total_top = 0.0;
1285       int total_count = 0;
1286       ColPartition* upper = sum_it.data();
1287       // We do not process last_part, as its spacing is different.
1288       while (upper != last_part) {
1289         total_bottom += upper->bottom_spacing();
1290         total_top += upper->top_spacing();
1291         ++total_count;
1292         sum_it.forward();
1293         upper = sum_it.data();
1294       }
1295       if (total_count > 0) {
1296         // There were at least 2 lines, so set them all to the mean.
1297         int top_spacing = static_cast<int>(total_top / total_count + 0.5);
1298         int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);
1299         if (textord_debug_tabfind) {
1300           tprintf("Spacing run ended. Cause:");
1301           if (neighbourhood[PN_LOWER] == NULL) {
1302             tprintf("No more lines\n");
1303           } else {
1304             tprintf("Spacing change. Spacings:\n");
1305             for (int i = 0; i < PN_COUNT; ++i) {
1306               if (neighbourhood[i] == NULL) {
1307                 tprintf("NULL\n");
1308               } else {
1309                 tprintf("Top = %d, bottom = %d\n",
1310                         neighbourhood[i]->top_spacing(),
1311                         neighbourhood[i]->bottom_spacing());
1312               }
1313             }
1314           }
1315           tprintf("Mean spacing = %d/%d\n", top_spacing, bottom_spacing);
1316         }
1317         sum_it = start_it;
1318         upper = sum_it.data();
1319         while (upper != last_part) {
1320           upper->set_top_spacing(top_spacing);
1321           upper->set_bottom_spacing(bottom_spacing);
1322           if (textord_debug_tabfind) {
1323             tprintf("Setting mean on:");
1324             upper->Print();
1325           }
1326           sum_it.forward();
1327           upper = sum_it.data();
1328         }
1329       }
1330       // PN_LOWER starts the next group and end_it is the next start_it.
1331       start_it = end_it;
1332       // Recalculate the median spacing to maximize the chances of detecting
1333       // spacing blips.
1334       median_space = MedianSpacing(page_height, end_it);
1335     }
1336     // Shuffle pointers.
1337     for (int j = 1; j < PN_COUNT; ++j) {
1338       neighbourhood[j - 1] = neighbourhood[j];
1339     }
1340     if (it.cycled_list()) {
1341       neighbourhood[PN_COUNT - 1] = NULL;
1342     } else {
1343       neighbourhood[PN_COUNT - 1] = it.data();
1344       it.forward();
1345     }
1346     end_it.forward();
1347   }
1348 }
1349 
1350 // Returns true if the parts array of pointers to partitions matches the
1351 // condition for a spacing blip. See SmoothSpacings for what this means
1352 // and how it is used.
OKSpacingBlip(int resolution,int median_spacing,ColPartition ** parts)1353 bool ColPartition::OKSpacingBlip(int resolution, int median_spacing,
1354                                  ColPartition** parts) {
1355   if (parts[PN_UPPER] == NULL || parts[PN_LOWER] == NULL)
1356     return false;
1357   // The blip is OK if upper and lower sum to an OK value and at least
1358   // one of above1 and below1 is equal to the median.
1359   return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER],
1360                                           median_spacing, resolution) &&
1361          ((parts[PN_ABOVE1] != NULL &&
1362            parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||
1363           (parts[PN_BELOW1] != NULL &&
1364            parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));
1365 }
1366 
1367 // Returns true if both the top and bottom spacings of this match the given
1368 // spacing to within suitable margins dictated by the image resolution.
SpacingEqual(int spacing,int resolution) const1369 bool ColPartition::SpacingEqual(int spacing, int resolution) const {
1370   int bottom_error = BottomSpacingMargin(resolution);
1371   int top_error = TopSpacingMargin(resolution);
1372   return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&
1373          NearlyEqual(top_spacing_, spacing, top_error);
1374 }
1375 
1376 // Returns true if both the top and bottom spacings of this and other
1377 // match to within suitable margins dictated by the image resolution.
SpacingsEqual(const ColPartition & other,int resolution) const1378 bool ColPartition::SpacingsEqual(const ColPartition& other,
1379                                  int resolution) const {
1380   int bottom_error = MAX(BottomSpacingMargin(resolution),
1381                          other.BottomSpacingMargin(resolution));
1382   int top_error = MAX(TopSpacingMargin(resolution),
1383                       other.TopSpacingMargin(resolution));
1384   return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&
1385          (NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||
1386           NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
1387                       bottom_error));
1388 }
1389 
1390 // Returns true if the sum spacing of this and other match the given
1391 // spacing (or twice the given spacing) to within a suitable margin dictated
1392 // by the image resolution.
SummedSpacingOK(const ColPartition & other,int spacing,int resolution) const1393 bool ColPartition::SummedSpacingOK(const ColPartition& other,
1394                                    int spacing, int resolution) const {
1395   int bottom_error = MAX(BottomSpacingMargin(resolution),
1396                          other.BottomSpacingMargin(resolution));
1397   int top_error = MAX(TopSpacingMargin(resolution),
1398                       other.TopSpacingMargin(resolution));
1399   int bottom_total = bottom_spacing_ + other.bottom_spacing_;
1400   int top_total = top_spacing_ + other.top_spacing_;
1401   return (NearlyEqual(spacing, bottom_total, bottom_error) &&
1402           NearlyEqual(spacing, top_total, top_error)) ||
1403          (NearlyEqual(spacing * 2, bottom_total, bottom_error) &&
1404           NearlyEqual(spacing * 2, top_total, top_error));
1405 }
1406 
1407 // Returns a suitable spacing margin that can be applied to bottoms of
1408 // text lines, based on the resolution and the stored side_step_.
BottomSpacingMargin(int resolution) const1409 int ColPartition::BottomSpacingMargin(int resolution) const {
1410   return static_cast<int>(kMaxSpacingDrift * resolution + 0.5) + side_step_;
1411 }
1412 
1413 // Returns a suitable spacing margin that can be applied to tops of
1414 // text lines, based on the resolution and the stored side_step_.
TopSpacingMargin(int resolution) const1415 int ColPartition::TopSpacingMargin(int resolution) const {
1416   return static_cast<int>(kMaxTopSpacingFraction * median_size_ + 0.5) +
1417          BottomSpacingMargin(resolution);
1418 }
1419 
1420 // Returns true if the median text sizes of this and other agree to within
1421 // a reasonable multiplicative factor.
SizesSimilar(const ColPartition & other) const1422 bool ColPartition::SizesSimilar(const ColPartition& other) const {
1423   return median_size_ <= other.median_size_ * kMaxSizeRatio &&
1424          other.median_size_ <= median_size_ * kMaxSizeRatio;
1425 }
1426 
1427 // Computes and returns in start, end a line segment formed from a
1428 // forwards-iterated group of left edges of partitions that satisfy the
1429 // condition that the rightmost left margin is to the left of the
1430 // leftmost left bounding box edge.
1431 // TODO(rays) Not good enough. Needs improving to tightly wrap text in both
1432 // directions, and to loosely wrap images.
LeftEdgeRun(ColPartition_IT * part_it,ICOORD * start,ICOORD * end)1433 void ColPartition::LeftEdgeRun(ColPartition_IT* part_it,
1434                                ICOORD* start, ICOORD* end) {
1435   ColPartition* part = part_it->data();
1436   int start_y = part->bounding_box_.top();
1437   if (!part_it->at_first() &&
1438       part_it->data_relative(-1)->bounding_box_.bottom() > start_y)
1439     start_y = (start_y + part_it->data_relative(-1)->bounding_box_.bottom())/2;
1440   int end_y = part->bounding_box_.bottom();
1441   int min_right = MAX_INT32;
1442   int max_left = MIN_INT32;
1443   do {
1444     part = part_it->data();
1445     int top = part->bounding_box_.top();
1446     int bottom = part->bounding_box_.bottom();
1447     int tl_key = part->SortKey(part->left_margin_, top);
1448     int tr_key = part->SortKey(part->bounding_box_.left(), top);
1449     int bl_key = part->SortKey(part->left_margin_, bottom);
1450     int br_key = part->SortKey(part->bounding_box_.left(), bottom);
1451     int left_key = MAX(tl_key, bl_key);
1452     int right_key = MIN(tr_key, br_key);
1453     if (left_key <= min_right && right_key >= max_left) {
1454       // This part is good - let's keep it.
1455       min_right = MIN(min_right, right_key);
1456       max_left = MAX(max_left, left_key);
1457       end_y = bottom;
1458       part_it->forward();
1459       if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y)
1460         end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;
1461     } else {
1462       if (textord_debug_tabfind)
1463         tprintf("Sum key %d/%d, new %d/%d\n",
1464                 max_left, min_right, left_key, right_key);
1465       break;
1466     }
1467   } while (!part_it->at_first());
1468   start->set_y(start_y);
1469   start->set_x(part->XAtY(min_right, start_y));
1470   end->set_y(end_y);
1471   end->set_x(part->XAtY(min_right, end_y));
1472   if (textord_debug_tabfind && !part_it->at_first())
1473     tprintf("Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
1474             start_y, end_y, part->XAtY(max_left, end_y),
1475             end->x(), part->left_margin_, part->bounding_box_.left());
1476 }
1477 
1478 // Computes and returns in start, end a line segment formed from a
1479 // backwards-iterated group of right edges of partitions that satisfy the
1480 // condition that the leftmost right margin is to the right of the
1481 // rightmost right bounding box edge.
1482 // TODO(rays) Not good enough. Needs improving to tightly wrap text in both
1483 // directions, and to loosely wrap images.
RightEdgeRun(ColPartition_IT * part_it,ICOORD * start,ICOORD * end)1484 void ColPartition::RightEdgeRun(ColPartition_IT* part_it,
1485                                 ICOORD* start, ICOORD* end) {
1486   ColPartition* part = part_it->data();
1487   int start_y = part->bounding_box_.bottom();
1488   if (!part_it->at_first() &&
1489       part_it->data_relative(1)->bounding_box_.top() < start_y)
1490     start_y = (start_y + part_it->data_relative(1)->bounding_box_.top()) / 2;
1491   int end_y = part->bounding_box_.top();
1492   int min_right = MAX_INT32;
1493   int max_left = MIN_INT32;
1494   do {
1495     part = part_it->data();
1496     int top = part->bounding_box_.top();
1497     int bottom = part->bounding_box_.bottom();
1498     int tl_key = part->SortKey(part->bounding_box_.right(), top);
1499     int tr_key = part->SortKey(part->right_margin_, top);
1500     int bl_key = part->SortKey(part->bounding_box_.right(), bottom);
1501     int br_key = part->SortKey(part->right_margin_, bottom);
1502     int left_key = MAX(tl_key, bl_key);
1503     int right_key = MIN(tr_key, br_key);
1504     if (left_key <= min_right && right_key >= max_left) {
1505       // This part is good - let's keep it.
1506       min_right = MIN(min_right, right_key);
1507       max_left = MAX(max_left, left_key);
1508       end_y = top;
1509       part_it->backward();
1510       if (!part_it->at_last() &&
1511           part_it->data()->bounding_box_.bottom() > end_y)
1512         end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;
1513     } else {
1514       if (textord_debug_tabfind)
1515         tprintf("Sum cross %d/%d, new %d/%d\n",
1516                 max_left, min_right, left_key, right_key);
1517       break;
1518     }
1519   } while (!part_it->at_last());
1520   start->set_y(start_y);
1521   start->set_x(part->XAtY(max_left, start_y));
1522   end->set_y(end_y);
1523   end->set_x(part->XAtY(max_left, end_y));
1524   if (textord_debug_tabfind && !part_it->at_last())
1525     tprintf("Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
1526             start_y, end_y, end->x(), part->XAtY(min_right, end_y),
1527             part->bounding_box_.right(), part->right_margin_);
1528 }
1529 
1530 }  // namespace tesseract.
1531 
1532