tesseract 3.04.01

textord/colpartition.cpp

Go to the documentation of this file.
00001 
00002 // File:        colpartition.cpp
00003 // Description: Class to hold partitions of the page that correspond
00004 //              roughly to text lines.
00005 // Author:      Ray Smith
00006 // Created:     Thu Aug 14 10:54:01 PDT 2008
00007 //
00008 // (C) Copyright 2008, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifdef _MSC_VER
00022 #pragma warning(disable:4244)  // Conversion warnings
00023 #endif
00024 
00025 #ifdef HAVE_CONFIG_H
00026 #include "config_auto.h"
00027 #endif
00028 
00029 #include "colpartition.h"
00030 #include "colpartitiongrid.h"
00031 #include "colpartitionset.h"
00032 #include "detlinefit.h"
00033 #include "dppoint.h"
00034 #include "imagefind.h"
00035 #include "workingpartset.h"
00036 
00037 namespace tesseract {
00038 
00039 ELIST2IZE(ColPartition)
00040 CLISTIZE(ColPartition)
00041 
00043 
00044 // Maximum change in spacing (in inches) to ignore.
00045 const double kMaxSpacingDrift = 1.0 / 72;  // 1/72 is one point.
00046 // Maximum fraction of line height used as an additional allowance
00047 // for top spacing.
00048 const double kMaxTopSpacingFraction = 0.25;
00049 // What multiple of the largest line height should be used as an upper bound
00050 // for whether lines are in the same text block?
00051 const double kMaxSameBlockLineSpacing = 3;
00052 // Maximum ratio of sizes for lines to be considered the same size.
00053 const double kMaxSizeRatio = 1.5;
00054 // Fraction of max of leader width and gap for max IQR of gaps.
00055 const double kMaxLeaderGapFractionOfMax = 0.25;
00056 // Fraction of min of leader width and gap for max IQR of gaps.
00057 const double kMaxLeaderGapFractionOfMin = 0.5;
00058 // Minimum number of blobs to be considered a leader.
00059 const int kMinLeaderCount = 5;
00060 // Minimum score for a STRONG_CHAIN textline.
00061 const int kMinStrongTextValue = 6;
00062 // Minimum score for a CHAIN textline.
00063 const int kMinChainTextValue = 3;
00064 // Minimum number of blobs for strong horizontal text lines.
00065 const int kHorzStrongTextlineCount = 8;
00066 // Minimum height (in image pixels) for strong horizontal text lines.
00067 const int kHorzStrongTextlineHeight = 10;
00068 // Minimum aspect ratio for strong horizontal text lines.
00069 const int kHorzStrongTextlineAspect = 5;
00070 // Maximum upper quartile error allowed on a baseline fit as a fraction
00071 // of height.
00072 const double kMaxBaselineError = 0.4375;
00073 // Min coverage for a good baseline between vectors
00074 const double kMinBaselineCoverage = 0.5;
00075 // Max RMS color noise to compare colors.
00076 const int kMaxRMSColorNoise = 128;
00077 // Maximum distance to allow a partition color to be to use that partition
00078 // in smoothing neighbouring types. This is a squared distance.
00079 const int kMaxColorDistance = 900;
00080 
00081 // blob_type is the blob_region_type_ of the blobs in this partition.
00082 // Vertical is the direction of logical vertical on the possibly skewed image.
00083 ColPartition::ColPartition(BlobRegionType blob_type, const ICOORD& vertical)
00084   : left_margin_(-MAX_INT32), right_margin_(MAX_INT32),
00085     median_bottom_(MAX_INT32), median_top_(-MAX_INT32), median_size_(0),
00086     median_left_(MAX_INT32), median_right_(-MAX_INT32), median_width_(0),
00087     blob_type_(blob_type), flow_(BTFT_NONE), good_blob_score_(0),
00088     good_width_(false), good_column_(false),
00089     left_key_tab_(false), right_key_tab_(false),
00090     left_key_(0), right_key_(0), type_(PT_UNKNOWN), vertical_(vertical),
00091     working_set_(NULL), last_add_was_vertical_(false), block_owned_(false),
00092     desperately_merged_(false),
00093     first_column_(-1), last_column_(-1), column_set_(NULL),
00094     side_step_(0), top_spacing_(0), bottom_spacing_(0),
00095     type_before_table_(PT_UNKNOWN), inside_table_column_(false),
00096     nearest_neighbor_above_(NULL), nearest_neighbor_below_(NULL),
00097     space_above_(0), space_below_(0), space_to_left_(0), space_to_right_(0),
00098     owns_blobs_(true) {
00099   memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
00100 }
00101 
00102 // Constructs a fake ColPartition with a single fake BLOBNBOX, all made
00103 // from a single TBOX.
00104 // WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
00105 // the ColPartition owns the BLOBNBOX!!!
00106 // Call DeleteBoxes before deleting the ColPartition.
00107 ColPartition* ColPartition::FakePartition(const TBOX& box,
00108                                           PolyBlockType block_type,
00109                                           BlobRegionType blob_type,
00110                                           BlobTextFlowType flow) {
00111   ColPartition* part = new ColPartition(blob_type, ICOORD(0, 1));
00112   part->set_type(block_type);
00113   part->set_flow(flow);
00114   part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));
00115   part->set_left_margin(box.left());
00116   part->set_right_margin(box.right());
00117   part->SetBlobTypes();
00118   part->ComputeLimits();
00119   part->ClaimBoxes();
00120   return part;
00121 }
00122 
00123 // Constructs and returns a ColPartition with the given real BLOBNBOX,
00124 // and sets it up to be a "big" partition (single-blob partition bigger
00125 // than the surrounding text that may be a dropcap, two or more vertically
00126 // touching characters, or some graphic element.
00127 // If the given list is not NULL, the partition is also added to the list.
00128 ColPartition* ColPartition::MakeBigPartition(BLOBNBOX* box,
00129                                              ColPartition_LIST* big_part_list) {
00130   box->set_owner(NULL);
00131   ColPartition* single = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
00132   single->set_flow(BTFT_NONE);
00133   single->AddBox(box);
00134   single->ComputeLimits();
00135   single->ClaimBoxes();
00136   single->SetBlobTypes();
00137   single->set_block_owned(true);
00138   if (big_part_list != NULL) {
00139     ColPartition_IT part_it(big_part_list);
00140     part_it.add_to_end(single);
00141   }
00142   return single;
00143 }
00144 
00145 ColPartition::~ColPartition() {
00146   // Remove this as a partner of all partners, as we don't want them
00147   // referring to a deleted object.
00148   ColPartition_C_IT it(&upper_partners_);
00149   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00150     it.data()->RemovePartner(false, this);
00151   }
00152   it.set_to_list(&lower_partners_);
00153   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00154     it.data()->RemovePartner(true, this);
00155   }
00156 }
00157 
00158 // Constructs a fake ColPartition with no BLOBNBOXes to represent a
00159 // horizontal or vertical line, given a type and a bounding box.
00160 ColPartition* ColPartition::MakeLinePartition(BlobRegionType blob_type,
00161                                               const ICOORD& vertical,
00162                                               int left, int bottom,
00163                                               int right, int top) {
00164   ColPartition* part = new ColPartition(blob_type, vertical);
00165   part->bounding_box_ = TBOX(left, bottom, right, top);
00166   part->median_bottom_ = bottom;
00167   part->median_top_ = top;
00168   part->median_size_ = top - bottom;
00169   part->median_width_ = right - left;
00170   part->left_key_ = part->BoxLeftKey();
00171   part->right_key_ = part->BoxRightKey();
00172   return part;
00173 }
00174 
00175 
00176 // Adds the given box to the partition, updating the partition bounds.
00177 // The list of boxes in the partition is updated, ensuring that no box is
00178 // recorded twice, and the boxes are kept in increasing left position.
00179 void ColPartition::AddBox(BLOBNBOX* bbox) {
00180   TBOX box = bbox->bounding_box();
00181   // Update the partition limits.
00182   if (boxes_.length() == 0) {
00183     bounding_box_ = box;
00184   } else {
00185     bounding_box_ += box;
00186   }
00187 
00188   if (IsVerticalType()) {
00189     if (!last_add_was_vertical_) {
00190       boxes_.sort(SortByBoxBottom<BLOBNBOX>);
00191       last_add_was_vertical_ = true;
00192     }
00193     boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>, true, bbox);
00194   } else {
00195     if (last_add_was_vertical_) {
00196       boxes_.sort(SortByBoxLeft<BLOBNBOX>);
00197       last_add_was_vertical_ = false;
00198     }
00199     boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);
00200   }
00201   if (!left_key_tab_)
00202     left_key_ = BoxLeftKey();
00203   if (!right_key_tab_)
00204     right_key_ = BoxRightKey();
00205   if (TabFind::WithinTestRegion(2, box.left(), box.bottom()))
00206     tprintf("Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
00207             box.left(), box.bottom(), box.right(), box.top(),
00208             bounding_box_.left(), bounding_box_.right());
00209 }
00210 
00211 // Removes the given box from the partition, updating the bounds.
00212 void ColPartition::RemoveBox(BLOBNBOX* box) {
00213   BLOBNBOX_C_IT bb_it(&boxes_);
00214   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00215     if (box == bb_it.data()) {
00216       bb_it.extract();
00217       ComputeLimits();
00218       return;
00219     }
00220   }
00221 }
00222 
00223 // Returns the tallest box in the partition, as measured perpendicular to the
00224 // presumed flow of text.
00225 BLOBNBOX* ColPartition::BiggestBox() {
00226   BLOBNBOX* biggest = NULL;
00227   BLOBNBOX_C_IT bb_it(&boxes_);
00228   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00229     BLOBNBOX* bbox = bb_it.data();
00230     if (IsVerticalType()) {
00231       if (biggest == NULL ||
00232           bbox->bounding_box().width() > biggest->bounding_box().width())
00233         biggest = bbox;
00234     } else {
00235       if (biggest == NULL ||
00236           bbox->bounding_box().height() > biggest->bounding_box().height())
00237         biggest = bbox;
00238     }
00239   }
00240   return biggest;
00241 }
00242 
00243 // Returns the bounding box excluding the given box.
00244 TBOX ColPartition::BoundsWithoutBox(BLOBNBOX* box) {
00245   TBOX result;
00246   BLOBNBOX_C_IT bb_it(&boxes_);
00247   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00248     if (box != bb_it.data()) {
00249       result += bb_it.data()->bounding_box();
00250     }
00251   }
00252   return result;
00253 }
00254 
00255 // Claims the boxes in the boxes_list by marking them with a this owner
00256 // pointer. If a box is already owned, then it must be owned by this.
00257 void ColPartition::ClaimBoxes() {
00258   BLOBNBOX_C_IT bb_it(&boxes_);
00259   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00260     BLOBNBOX* bblob = bb_it.data();
00261     ColPartition* other = bblob->owner();
00262     if (other == NULL) {
00263       // Normal case: ownership is available.
00264       bblob->set_owner(this);
00265     } else {
00266       ASSERT_HOST(other == this);
00267     }
00268   }
00269 }
00270 
00271 // NULL the owner of the blobs in this partition, so they can be deleted
00272 // independently of the ColPartition.
00273 void ColPartition::DisownBoxes() {
00274   BLOBNBOX_C_IT bb_it(&boxes_);
00275   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00276     BLOBNBOX* bblob = bb_it.data();
00277     ASSERT_HOST(bblob->owner() == this || bblob->owner() == NULL);
00278     bblob->set_owner(NULL);
00279   }
00280 }
00281 
00282 // NULL the owner of the blobs in this partition that are owned by this
00283 // partition, so they can be deleted independently of the ColPartition.
00284 // Any blobs that are not owned by this partition get to keep their owner
00285 // without an assert failure.
00286 void ColPartition::DisownBoxesNoAssert() {
00287   BLOBNBOX_C_IT bb_it(&boxes_);
00288   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00289     BLOBNBOX* bblob = bb_it.data();
00290     if (bblob->owner() == this)
00291       bblob->set_owner(NULL);
00292   }
00293 }
00294 
00295 // NULLs the owner of the blobs in this partition that are owned by this
00296 // partition and not leader blobs, removing them from the boxes_ list, thus
00297 // turning this partition back to a leader partition if it contains a leader,
00298 // or otherwise leaving it empty. Returns true if any boxes remain.
00299 bool ColPartition::ReleaseNonLeaderBoxes() {
00300   BLOBNBOX_C_IT bb_it(&boxes_);
00301   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00302     BLOBNBOX* bblob = bb_it.data();
00303     if (bblob->flow() != BTFT_LEADER) {
00304       if (bblob->owner() == this) bblob->set_owner(NULL);
00305       bb_it.extract();
00306     }
00307   }
00308   if (bb_it.empty()) return false;
00309   flow_ = BTFT_LEADER;
00310   ComputeLimits();
00311   return true;
00312 }
00313 
00314 // Delete the boxes that this partition owns.
00315 void ColPartition::DeleteBoxes() {
00316   // Although the boxes_ list is a C_LIST, in some cases it owns the
00317   // BLOBNBOXes, as the ColPartition takes ownership from the grid,
00318   // and the BLOBNBOXes own the underlying C_BLOBs.
00319   for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
00320     BLOBNBOX* bblob = bb_it.extract();
00321     delete bblob->cblob();
00322     delete bblob;
00323   }
00324 }
00325 
00326 // Reflects the partition in the y-axis, assuming that its blobs have
00327 // already been done. Corrects only a limited part of the members, since
00328 // this function is assumed to be used shortly after initial creation, which
00329 // is before a lot of the members are used.
00330 void ColPartition::ReflectInYAxis() {
00331   BLOBNBOX_CLIST reversed_boxes;
00332   BLOBNBOX_C_IT reversed_it(&reversed_boxes);
00333   // Reverse the order of the boxes_.
00334   BLOBNBOX_C_IT bb_it(&boxes_);
00335   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00336     reversed_it.add_before_then_move(bb_it.extract());
00337   }
00338   bb_it.add_list_after(&reversed_boxes);
00339   ASSERT_HOST(!left_key_tab_ && !right_key_tab_);
00340   int tmp = left_margin_;
00341   left_margin_ = -right_margin_;
00342   right_margin_ = -tmp;
00343   ComputeLimits();
00344 }
00345 
00346 // Returns true if this is a legal partition - meaning that the conditions
00347 // left_margin <= bounding_box left
00348 // left_key <= bounding box left key
00349 // bounding box left <= bounding box right
00350 // and likewise for right margin and key
00351 // are all met.
00352 bool ColPartition::IsLegal() {
00353   if (bounding_box_.left() > bounding_box_.right()) {
00354     if (textord_debug_bugs) {
00355       tprintf("Bounding box invalid\n");
00356       Print();
00357     }
00358     return false;  // Bounding box invalid.
00359   }
00360   if (left_margin_ > bounding_box_.left() ||
00361       right_margin_ < bounding_box_.right()) {
00362     if (textord_debug_bugs) {
00363       tprintf("Margins invalid\n");
00364       Print();
00365     }
00366     return false;  // Margins invalid.
00367   }
00368   if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {
00369     if (textord_debug_bugs) {
00370       tprintf("Key inside box: %d v %d or %d v %d\n",
00371               left_key_, BoxLeftKey(), right_key_, BoxRightKey());
00372       Print();
00373     }
00374     return false;  // Keys inside the box.
00375   }
00376   return true;
00377 }
00378 
00379 // Returns true if the left and right edges are approximately equal.
00380 bool ColPartition::MatchingColumns(const ColPartition& other) const {
00381   int y = (MidY() + other.MidY()) / 2;
00382   if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,
00383                    LeftAtY(y) / kColumnWidthFactor, 1))
00384     return false;
00385   if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,
00386                    RightAtY(y) / kColumnWidthFactor, 1))
00387     return false;
00388   return true;
00389 }
00390 
00391 // Returns true if the colors match for two text partitions.
00392 bool ColPartition::MatchingTextColor(const ColPartition& other) const {
00393   if (color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise &&
00394       other.color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise)
00395     return false;  // Too noisy.
00396 
00397   // Colors must match for other to count.
00398   double d_this1_o = ImageFind::ColorDistanceFromLine(other.color1_,
00399                                                       other.color2_,
00400                                                       color1_);
00401   double d_this2_o = ImageFind::ColorDistanceFromLine(other.color1_,
00402                                                       other.color2_,
00403                                                       color2_);
00404   double d_o1_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
00405                                                       other.color1_);
00406   double d_o2_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
00407                                                       other.color2_);
00408 // All 4 distances must be small enough.
00409   return d_this1_o < kMaxColorDistance && d_this2_o < kMaxColorDistance &&
00410          d_o1_this < kMaxColorDistance && d_o2_this < kMaxColorDistance;
00411 }
00412 
00413 // Returns true if the sizes match for two text partitions,
00414 // taking orientation into account. See also SizesSimilar.
00415 bool ColPartition::MatchingSizes(const ColPartition& other) const {
00416   if (blob_type_ == BRT_VERT_TEXT || other.blob_type_ == BRT_VERT_TEXT)
00417     return !TabFind::DifferentSizes(median_width_, other.median_width_);
00418   else
00419     return !TabFind::DifferentSizes(median_size_, other.median_size_);
00420 }
00421 
00422 // Returns true if there is no tabstop violation in merging this and other.
00423 bool ColPartition::ConfirmNoTabViolation(const ColPartition& other) const {
00424   if (bounding_box_.right() < other.bounding_box_.left() &&
00425       bounding_box_.right() < other.LeftBlobRule())
00426     return false;
00427   if (other.bounding_box_.right() < bounding_box_.left() &&
00428       other.bounding_box_.right() < LeftBlobRule())
00429     return false;
00430   if (bounding_box_.left() > other.bounding_box_.right() &&
00431       bounding_box_.left() > other.RightBlobRule())
00432     return false;
00433   if (other.bounding_box_.left() > bounding_box_.right() &&
00434       other.bounding_box_.left() > RightBlobRule())
00435     return false;
00436   return true;
00437 }
00438 
00439 // Returns true if other has a similar stroke width to this.
00440 bool ColPartition::MatchingStrokeWidth(const ColPartition& other,
00441                                        double fractional_tolerance,
00442                                        double constant_tolerance) const {
00443   int match_count = 0;
00444   int nonmatch_count = 0;
00445   BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00446   BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST*>(&other.boxes_));
00447   box_it.mark_cycle_pt();
00448   other_it.mark_cycle_pt();
00449   while (!box_it.cycled_list() && !other_it.cycled_list()) {
00450     if (box_it.data()->MatchingStrokeWidth(*other_it.data(),
00451                                            fractional_tolerance,
00452                                            constant_tolerance))
00453       ++match_count;
00454     else
00455       ++nonmatch_count;
00456     box_it.forward();
00457     other_it.forward();
00458   }
00459   return match_count > nonmatch_count;
00460 }
00461 
00462 // Returns true if base is an acceptable diacritic base char merge
00463 // with this as the diacritic.
00464 // Returns true if:
00465 // (1) this is a ColPartition containing only diacritics, and
00466 // (2) the base characters indicated on the diacritics all believably lie
00467 // within the text line of the candidate ColPartition.
00468 bool ColPartition::OKDiacriticMerge(const ColPartition& candidate,
00469                                     bool debug) const {
00470   BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00471   int min_top = MAX_INT32;
00472   int max_bottom = -MAX_INT32;
00473   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00474     BLOBNBOX* blob = it.data();
00475     if (!blob->IsDiacritic()) {
00476       if (debug) {
00477         tprintf("Blob is not a diacritic:");
00478         blob->bounding_box().print();
00479       }
00480       return false;  // All blobs must have diacritic bases.
00481     }
00482     if (blob->base_char_top() < min_top)
00483       min_top = blob->base_char_top();
00484     if (blob->base_char_bottom() > max_bottom)
00485       max_bottom = blob->base_char_bottom();
00486   }
00487   // If the intersection of all vertical ranges of all base characters
00488   // overlaps the median range of this, then it is OK.
00489   bool result = min_top > candidate.median_bottom_ &&
00490                 max_bottom < candidate.median_top_;
00491   if (debug) {
00492     if (result)
00493       tprintf("OKDiacritic!\n");
00494     else
00495       tprintf("y ranges don\'t overlap: %d-%d / %d-%d\n",
00496               max_bottom, min_top, median_bottom_, median_top_);
00497   }
00498   return result;
00499 }
00500 
00501 // Sets the sort key using either the tab vector, or the bounding box if
00502 // the tab vector is NULL. If the tab_vector lies inside the bounding_box,
00503 // use the edge of the box as a key any way.
00504 void ColPartition::SetLeftTab(const TabVector* tab_vector) {
00505   if (tab_vector != NULL) {
00506     left_key_ = tab_vector->sort_key();
00507     left_key_tab_ = left_key_ <= BoxLeftKey();
00508   } else {
00509     left_key_tab_ = false;
00510   }
00511   if (!left_key_tab_)
00512     left_key_ = BoxLeftKey();
00513 }
00514 
00515 // As SetLeftTab, but with the right.
00516 void ColPartition::SetRightTab(const TabVector* tab_vector) {
00517   if (tab_vector != NULL) {
00518     right_key_ = tab_vector->sort_key();
00519     right_key_tab_ = right_key_ >= BoxRightKey();
00520   } else {
00521     right_key_tab_ = false;
00522   }
00523   if (!right_key_tab_)
00524     right_key_ = BoxRightKey();
00525 }
00526 
00527 // Copies the left/right tab from the src partition, but if take_box is
00528 // true, copies the box instead and uses that as a key.
00529 void ColPartition::CopyLeftTab(const ColPartition& src, bool take_box) {
00530   left_key_tab_ = take_box ? false : src.left_key_tab_;
00531   if (left_key_tab_) {
00532     left_key_ = src.left_key_;
00533   } else {
00534     bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));
00535     left_key_ = BoxLeftKey();
00536   }
00537   if (left_margin_ > bounding_box_.left())
00538     left_margin_ = src.left_margin_;
00539 }
00540 
00541 // As CopyLeftTab, but with the right.
00542 void ColPartition::CopyRightTab(const ColPartition& src, bool take_box) {
00543   right_key_tab_ = take_box ? false : src.right_key_tab_;
00544   if (right_key_tab_) {
00545     right_key_ = src.right_key_;
00546   } else {
00547     bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));
00548     right_key_ = BoxRightKey();
00549   }
00550   if (right_margin_ < bounding_box_.right())
00551     right_margin_ = src.right_margin_;
00552 }
00553 
00554 // Returns the left rule line x coord of the leftmost blob.
00555 int ColPartition::LeftBlobRule() const {
00556   BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00557   return it.data()->left_rule();
00558 }
00559 // Returns the right rule line x coord of the rightmost blob.
00560 int ColPartition::RightBlobRule() const {
00561   BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00562   it.move_to_last();
00563   return it.data()->right_rule();
00564 }
00565 
00566 float ColPartition::SpecialBlobsDensity(const BlobSpecialTextType type) const {
00567   ASSERT_HOST(type < BSTT_COUNT);
00568   return special_blobs_densities_[type];
00569 }
00570 
00571 int ColPartition::SpecialBlobsCount(const BlobSpecialTextType type) {
00572   ASSERT_HOST(type < BSTT_COUNT);
00573   BLOBNBOX_C_IT blob_it(&boxes_);
00574   int count = 0;
00575   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00576     BLOBNBOX* blob = blob_it.data();
00577     BlobSpecialTextType blob_type = blob->special_text_type();
00578     if (blob_type == type) {
00579       count++;
00580     }
00581   }
00582 
00583   return count;
00584 }
00585 
00586 void ColPartition::SetSpecialBlobsDensity(
00587     const BlobSpecialTextType type, const float density) {
00588   ASSERT_HOST(type < BSTT_COUNT);
00589   special_blobs_densities_[type] = density;
00590 }
00591 
00592 void ColPartition::ComputeSpecialBlobsDensity() {
00593   memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
00594   if (boxes_.empty()) {
00595     return;
00596   }
00597 
00598   BLOBNBOX_C_IT blob_it(&boxes_);
00599   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00600     BLOBNBOX* blob = blob_it.data();
00601     BlobSpecialTextType type = blob->special_text_type();
00602     special_blobs_densities_[type]++;
00603   }
00604 
00605   for (int type = 0; type < BSTT_COUNT; ++type) {
00606     special_blobs_densities_[type] /= boxes_.length();
00607   }
00608 }
00609 
00610 // Add a partner above if upper, otherwise below.
00611 // Add them uniquely and keep the list sorted by box left.
00612 // Partnerships are added symmetrically to partner and this.
00613 void ColPartition::AddPartner(bool upper, ColPartition* partner) {
00614   if (upper) {
00615     partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
00616                                         true, this);
00617     upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
00618   } else {
00619     partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
00620                                         true, this);
00621     lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
00622   }
00623 }
00624 
00625 // Removes the partner from this, but does not remove this from partner.
00626 // This asymmetric removal is so as not to mess up the iterator that is
00627 // working on partner's partner list.
00628 void ColPartition::RemovePartner(bool upper, ColPartition* partner) {
00629   ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
00630   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00631     if (it.data() == partner) {
00632       it.extract();
00633       break;
00634     }
00635   }
00636 }
00637 
00638 // Returns the partner if the given partner is a singleton, otherwise NULL.
00639 ColPartition* ColPartition::SingletonPartner(bool upper) {
00640   ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
00641   if (!partners->singleton())
00642     return NULL;
00643   ColPartition_C_IT it(partners);
00644   return it.data();
00645 }
00646 
00647 // Merge with the other partition and delete it.
00648 void ColPartition::Absorb(ColPartition* other, WidthCallback* cb) {
00649   // The result has to either own all of the blobs or none of them.
00650   // Verify the flag is consisent.
00651   ASSERT_HOST(owns_blobs() == other->owns_blobs());
00652   // TODO(nbeato): check owns_blobs better. Right now owns_blobs
00653   // should always be true when this is called. So there is no issues.
00654   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
00655                                 bounding_box_.bottom()) ||
00656       TabFind::WithinTestRegion(2, other->bounding_box_.left(),
00657                                 other->bounding_box_.bottom())) {
00658     tprintf("Merging:");
00659     Print();
00660     other->Print();
00661   }
00662 
00663   // Update the special_blobs_densities_.
00664   memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
00665   for (int type = 0; type < BSTT_COUNT; ++type) {
00666     int w1 = boxes_.length(), w2 = other->boxes_.length();
00667     float new_val = special_blobs_densities_[type] * w1 +
00668         other->special_blobs_densities_[type] * w2;
00669     if (!w1 || !w2) {
00670       special_blobs_densities_[type] = new_val / (w1 + w2);
00671     }
00672   }
00673 
00674   // Merge the two sorted lists.
00675   BLOBNBOX_C_IT it(&boxes_);
00676   BLOBNBOX_C_IT it2(&other->boxes_);
00677   for (; !it2.empty(); it2.forward()) {
00678     BLOBNBOX* bbox2 = it2.extract();
00679     ColPartition* prev_owner = bbox2->owner();
00680     if (prev_owner != other && prev_owner != NULL) {
00681       // A blob on other's list is owned by someone else; let them have it.
00682       continue;
00683     }
00684     ASSERT_HOST(prev_owner == other || prev_owner == NULL);
00685     if (prev_owner == other)
00686       bbox2->set_owner(this);
00687     it.add_to_end(bbox2);
00688   }
00689   left_margin_ = MIN(left_margin_, other->left_margin_);
00690   right_margin_ = MAX(right_margin_, other->right_margin_);
00691   if (other->left_key_ < left_key_) {
00692     left_key_ = other->left_key_;
00693     left_key_tab_ = other->left_key_tab_;
00694   }
00695   if (other->right_key_ > right_key_) {
00696     right_key_ = other->right_key_;
00697     right_key_tab_ = other->right_key_tab_;
00698   }
00699   // Combine the flow and blob_type in a sensible way.
00700   // Dominant flows stay.
00701   if (!DominatesInMerge(flow_, other->flow_)) {
00702     flow_ = other->flow_;
00703     blob_type_ = other->blob_type_;
00704   }
00705   SetBlobTypes();
00706   if (IsVerticalType()) {
00707     boxes_.sort(SortByBoxBottom<BLOBNBOX>);
00708     last_add_was_vertical_ = true;
00709   } else {
00710     boxes_.sort(SortByBoxLeft<BLOBNBOX>);
00711     last_add_was_vertical_ = false;
00712   }
00713   ComputeLimits();
00714   // Fix partner lists. other is going away, so remove it as a
00715   // partner of all its partners and add this in its place.
00716   for (int upper = 0; upper < 2; ++upper) {
00717     ColPartition_CLIST partners;
00718     ColPartition_C_IT part_it(&partners);
00719     part_it.add_list_after(upper ? &other->upper_partners_
00720                                  : &other->lower_partners_);
00721     for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
00722       ColPartition* partner = part_it.extract();
00723       partner->RemovePartner(!upper, other);
00724       partner->RemovePartner(!upper, this);
00725       partner->AddPartner(!upper, this);
00726     }
00727   }
00728   delete other;
00729   if (cb != NULL) {
00730     SetColumnGoodness(cb);
00731   }
00732 }
00733 
00734 // Merge1 and merge2 are candidates to be merged, yet their combined box
00735 // overlaps this. Is that allowed?
00736 // Returns true if the overlap between this and the merged pair of
00737 // merge candidates is sufficiently trivial to be allowed.
00738 // The merged box can graze the edge of this by the ok_box_overlap
00739 // if that exceeds the margin to the median top and bottom.
00740 // ok_box_overlap should be set by the caller appropriate to the sizes of
00741 // the text involved, and is usually a fraction of the median size of merge1
00742 // and/or merge2, or this.
00743 // TODO(rays) Determine whether vertical text needs to be considered.
00744 bool ColPartition::OKMergeOverlap(const ColPartition& merge1,
00745                                   const ColPartition& merge2,
00746                                   int ok_box_overlap, bool debug) {
00747   // Vertical partitions are not allowed to be involved.
00748   if (IsVerticalType() || merge1.IsVerticalType() || merge2.IsVerticalType()) {
00749     if (debug)
00750       tprintf("Vertical partition\n");
00751     return false;
00752   }
00753   // The merging partitions must strongly overlap each other.
00754   if (!merge1.VSignificantCoreOverlap(merge2)) {
00755     if (debug)
00756       tprintf("Voverlap %d (%d)\n",
00757               merge1.VCoreOverlap(merge2),
00758               merge1.VSignificantCoreOverlap(merge2));
00759     return false;
00760   }
00761   // The merged box must not overlap the median bounds of this.
00762   TBOX merged_box(merge1.bounding_box());
00763   merged_box += merge2.bounding_box();
00764   if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&
00765       merged_box.bottom() < bounding_box_.top() - ok_box_overlap &&
00766       merged_box.top() > bounding_box_.bottom() + ok_box_overlap) {
00767     if (debug)
00768       tprintf("Excessive box overlap\n");
00769     return false;
00770   }
00771   // Looks OK!
00772   return true;
00773 }
00774 
00775 // Find the blob at which to split this to minimize the overlap with the
00776 // given box. Returns the first blob to go in the second partition.
00777 BLOBNBOX* ColPartition::OverlapSplitBlob(const TBOX& box) {
00778   if (boxes_.empty() || boxes_.singleton())
00779     return NULL;
00780   BLOBNBOX_C_IT it(&boxes_);
00781   TBOX left_box(it.data()->bounding_box());
00782   for (it.forward(); !it.at_first(); it.forward()) {
00783     BLOBNBOX* bbox = it.data();
00784     left_box += bbox->bounding_box();
00785     if (left_box.overlap(box))
00786       return bbox;
00787   }
00788   return NULL;
00789 }
00790 
00791 // Split this partition keeping the first half in this and returning
00792 // the second half.
00793 // Splits by putting the split_blob and the blobs that follow
00794 // in the second half, and the rest in the first half.
00795 ColPartition* ColPartition::SplitAtBlob(BLOBNBOX* split_blob) {
00796   ColPartition* split_part = ShallowCopy();
00797   split_part->set_owns_blobs(owns_blobs());
00798   BLOBNBOX_C_IT it(&boxes_);
00799   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00800     BLOBNBOX* bbox = it.data();
00801     ColPartition* prev_owner = bbox->owner();
00802     ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == NULL);
00803     if (bbox == split_blob || !split_part->boxes_.empty()) {
00804       split_part->AddBox(it.extract());
00805       if (owns_blobs() && prev_owner != NULL)
00806         bbox->set_owner(split_part);
00807     }
00808   }
00809   ASSERT_HOST(!it.empty());
00810   if (split_part->IsEmpty()) {
00811     // Split part ended up with nothing. Possible if split_blob is not
00812     // in the list of blobs.
00813     delete split_part;
00814     return NULL;
00815   }
00816   right_key_tab_ = false;
00817   split_part->left_key_tab_ = false;
00818   ComputeLimits();
00819   // TODO(nbeato) Merge Ray's CL like this:
00820   // if (owns_blobs())
00821   //  SetBlobTextlineGoodness();
00822   split_part->ComputeLimits();
00823   // TODO(nbeato) Merge Ray's CL like this:
00824   // if (split_part->owns_blobs())
00825   //   split_part->SetBlobTextlineGoodness();
00826   return split_part;
00827 }
00828 
00829 // Split this partition at the given x coordinate, returning the right
00830 // half and keeping the left half in this.
00831 ColPartition* ColPartition::SplitAt(int split_x) {
00832   if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right())
00833     return NULL;  // There will be no change.
00834   ColPartition* split_part = ShallowCopy();
00835   split_part->set_owns_blobs(owns_blobs());
00836   BLOBNBOX_C_IT it(&boxes_);
00837   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00838     BLOBNBOX* bbox = it.data();
00839     ColPartition* prev_owner = bbox->owner();
00840     ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == NULL);
00841     const TBOX& box = bbox->bounding_box();
00842     if (box.left() >= split_x) {
00843       split_part->AddBox(it.extract());
00844       if (owns_blobs() && prev_owner != NULL)
00845         bbox->set_owner(split_part);
00846     }
00847   }
00848   if (it.empty()) {
00849     // Possible if split-x passes through the first blob.
00850     it.add_list_after(&split_part->boxes_);
00851   }
00852   ASSERT_HOST(!it.empty());
00853   if (split_part->IsEmpty()) {
00854     // Split part ended up with nothing. Possible if split_x passes
00855     // through the last blob.
00856     delete split_part;
00857     return NULL;
00858   }
00859   right_key_tab_ = false;
00860   split_part->left_key_tab_ = false;
00861   right_margin_ = split_x;
00862   split_part->left_margin_ = split_x;
00863   ComputeLimits();
00864   split_part->ComputeLimits();
00865   return split_part;
00866 }
00867 
00868 // Recalculates all the coordinate limits of the partition.
00869 void ColPartition::ComputeLimits() {
00870   bounding_box_ = TBOX();  // Clear it
00871   BLOBNBOX_C_IT it(&boxes_);
00872   BLOBNBOX* bbox = NULL;
00873   int non_leader_count = 0;
00874   if (it.empty()) {
00875     bounding_box_.set_left(left_margin_);
00876     bounding_box_.set_right(right_margin_);
00877     bounding_box_.set_bottom(0);
00878     bounding_box_.set_top(0);
00879   } else {
00880     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00881       bbox = it.data();
00882       bounding_box_ += bbox->bounding_box();
00883       if (bbox->flow() != BTFT_LEADER)
00884         ++non_leader_count;
00885     }
00886   }
00887   if (!left_key_tab_)
00888     left_key_ = BoxLeftKey();
00889   if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
00890     // TODO(rays) investigate the causes of these error messages, to find
00891     // out if they are genuinely harmful, or just indicative of junk input.
00892     tprintf("Computed left-illegal partition\n");
00893     Print();
00894   }
00895   if (!right_key_tab_)
00896     right_key_ = BoxRightKey();
00897   if (right_key_ < BoxRightKey() && textord_debug_bugs) {
00898     tprintf("Computed right-illegal partition\n");
00899     Print();
00900   }
00901   if (it.empty())
00902     return;
00903   if (IsImageType() || blob_type() == BRT_RECTIMAGE ||
00904       blob_type() == BRT_POLYIMAGE) {
00905     median_top_ = bounding_box_.top();
00906     median_bottom_ = bounding_box_.bottom();
00907     median_size_ = bounding_box_.height();
00908     median_left_ = bounding_box_.left();
00909     median_right_ = bounding_box_.right();
00910     median_width_ = bounding_box_.width();
00911   } else {
00912     STATS top_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
00913     STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
00914     STATS size_stats(0, bounding_box_.height() + 1);
00915     STATS left_stats(bounding_box_.left(), bounding_box_.right() + 1);
00916     STATS right_stats(bounding_box_.left(), bounding_box_.right() + 1);
00917     STATS width_stats(0, bounding_box_.width() + 1);
00918     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00919       bbox = it.data();
00920       if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) {
00921         TBOX box = bbox->bounding_box();
00922         int area = box.area();
00923         top_stats.add(box.top(), area);
00924         bottom_stats.add(box.bottom(), area);
00925         size_stats.add(box.height(), area);
00926         left_stats.add(box.left(), area);
00927         right_stats.add(box.right(), area);
00928         width_stats.add(box.width(), area);
00929       }
00930     }
00931     median_top_ = static_cast<int>(top_stats.median() + 0.5);
00932     median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
00933     median_size_ = static_cast<int>(size_stats.median() + 0.5);
00934     median_left_ = static_cast<int>(left_stats.median() + 0.5);
00935     median_right_ = static_cast<int>(right_stats.median() + 0.5);
00936     median_width_ = static_cast<int>(width_stats.median() + 0.5);
00937   }
00938 
00939   if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
00940     tprintf("Made partition with bad right coords");
00941     Print();
00942   }
00943   if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
00944     tprintf("Made partition with bad left coords");
00945     Print();
00946   }
00947   // Fix partner lists. The bounding box has changed and partners are stored
00948   // in bounding box order, so remove and reinsert this as a partner
00949   // of all its partners.
00950   for (int upper = 0; upper < 2; ++upper) {
00951     ColPartition_CLIST partners;
00952     ColPartition_C_IT part_it(&partners);
00953     part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
00954     for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
00955       ColPartition* partner = part_it.extract();
00956       partner->RemovePartner(!upper, this);
00957       partner->AddPartner(!upper, this);
00958     }
00959   }
00960   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
00961                                 bounding_box_.bottom())) {
00962     tprintf("Recomputed box for partition %p\n", this);
00963     Print();
00964   }
00965 }
00966 
00967 // Returns the number of boxes that overlap the given box.
00968 int ColPartition::CountOverlappingBoxes(const TBOX& box) {
00969   BLOBNBOX_C_IT it(&boxes_);
00970   int overlap_count = 0;
00971   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00972     BLOBNBOX* bbox = it.data();
00973     if (box.overlap(bbox->bounding_box()))
00974       ++overlap_count;
00975   }
00976   return overlap_count;
00977 }
00978 
00979 // Computes and sets the type_ and first_column_, last_column_ and column_set_.
00980 // resolution refers to the ppi resolution of the image.
00981 void ColPartition::SetPartitionType(int resolution, ColPartitionSet* columns) {
00982   int first_spanned_col = -1;
00983   ColumnSpanningType span_type =
00984       columns->SpanningType(resolution,
00985                             bounding_box_.left(), bounding_box_.right(),
00986                             MIN(bounding_box_.height(), bounding_box_.width()),
00987                             MidY(), left_margin_, right_margin_,
00988                             &first_column_, &last_column_,
00989                             &first_spanned_col);
00990   column_set_ = columns;
00991   if (first_column_ < last_column_ && span_type == CST_PULLOUT &&
00992       !IsLineType()) {
00993     // Unequal columns may indicate that the pullout spans one of the columns
00994     // it lies in, so force it to be allocated to just that column.
00995     if (first_spanned_col >= 0) {
00996       first_column_ = first_spanned_col;
00997       last_column_ = first_spanned_col;
00998     } else {
00999       if ((first_column_ & 1) == 0)
01000         last_column_ = first_column_;
01001       else if ((last_column_ & 1) == 0)
01002         first_column_ = last_column_;
01003       else
01004         first_column_ = last_column_ = (first_column_ + last_column_) / 2;
01005     }
01006   }
01007   type_ = PartitionType(span_type);
01008 }
01009 
01010 // Returns the PartitionType from the current BlobRegionType and a column
01011 // flow spanning type ColumnSpanningType, generated by
01012 // ColPartitionSet::SpanningType, that indicates how the partition sits
01013 // in the columns.
01014 PolyBlockType ColPartition::PartitionType(ColumnSpanningType flow) const {
01015   if (flow == CST_NOISE) {
01016     if (blob_type_ != BRT_HLINE && blob_type_ != BRT_VLINE &&
01017         blob_type_ != BRT_RECTIMAGE && blob_type_ != BRT_VERT_TEXT)
01018       return PT_NOISE;
01019     flow = CST_FLOWING;
01020   }
01021 
01022   switch (blob_type_) {
01023     case BRT_NOISE:
01024       return PT_NOISE;
01025     case BRT_HLINE:
01026       return PT_HORZ_LINE;
01027     case BRT_VLINE:
01028       return PT_VERT_LINE;
01029     case BRT_RECTIMAGE:
01030     case BRT_POLYIMAGE:
01031       switch (flow) {
01032         case CST_FLOWING:
01033           return PT_FLOWING_IMAGE;
01034         case CST_HEADING:
01035           return PT_HEADING_IMAGE;
01036         case CST_PULLOUT:
01037           return PT_PULLOUT_IMAGE;
01038         default:
01039           ASSERT_HOST(!"Undefined flow type for image!");
01040       }
01041       break;
01042     case BRT_VERT_TEXT:
01043       return PT_VERTICAL_TEXT;
01044     case BRT_TEXT:
01045     case BRT_UNKNOWN:
01046     default:
01047       switch (flow) {
01048         case CST_FLOWING:
01049           return PT_FLOWING_TEXT;
01050         case CST_HEADING:
01051           return PT_HEADING_TEXT;
01052         case CST_PULLOUT:
01053           return PT_PULLOUT_TEXT;
01054         default:
01055           ASSERT_HOST(!"Undefined flow type for text!");
01056       }
01057   }
01058   ASSERT_HOST(!"Should never get here!");
01059   return PT_NOISE;
01060 }
01061 
01062 // Returns the first and last column touched by this partition.
01063 // resolution refers to the ppi resolution of the image.
01064 void ColPartition::ColumnRange(int resolution, ColPartitionSet* columns,
01065                                int* first_col, int* last_col) {
01066   int first_spanned_col = -1;
01067   ColumnSpanningType span_type =
01068       columns->SpanningType(resolution,
01069                             bounding_box_.left(), bounding_box_.right(),
01070                             MIN(bounding_box_.height(), bounding_box_.width()),
01071                             MidY(), left_margin_, right_margin_,
01072                             first_col, last_col,
01073                             &first_spanned_col);
01074   type_ = PartitionType(span_type);
01075 }
01076 
01077 // Sets the internal flags good_width_ and good_column_.
01078 void ColPartition::SetColumnGoodness(WidthCallback* cb) {
01079   int y = MidY();
01080   int width = RightAtY(y) - LeftAtY(y);
01081   good_width_ = cb->Run(width);
01082   good_column_ = blob_type_ == BRT_TEXT && left_key_tab_ && right_key_tab_;
01083 }
01084 
01085 // Determines whether the blobs in this partition mostly represent
01086 // a leader (fixed pitch sequence) and sets the member blobs accordingly.
01087 // Note that height is assumed to have been tested elsewhere, and that this
01088 // function will find most fixed-pitch text as leader without a height filter.
01089 // Leader detection is limited to sequences of identical width objects,
01090 // such as .... or ----, so patterns, such as .-.-.-.-. will not be found.
01091 bool ColPartition::MarkAsLeaderIfMonospaced() {
01092   bool result = false;
01093   // Gather statistics on the gaps between blobs and the widths of the blobs.
01094   int part_width = bounding_box_.width();
01095   STATS gap_stats(0, part_width);
01096   STATS width_stats(0, part_width);
01097   BLOBNBOX_C_IT it(&boxes_);
01098   BLOBNBOX* prev_blob = it.data();
01099   prev_blob->set_flow(BTFT_NEIGHBOURS);
01100   width_stats.add(prev_blob->bounding_box().width(), 1);
01101   int blob_count = 1;
01102   for (it.forward(); !it.at_first(); it.forward()) {
01103     BLOBNBOX* blob = it.data();
01104     int left = blob->bounding_box().left();
01105     int right = blob->bounding_box().right();
01106     gap_stats.add(left - prev_blob->bounding_box().right(), 1);
01107     width_stats.add(right - left, 1);
01108     blob->set_flow(BTFT_NEIGHBOURS);
01109     prev_blob = blob;
01110     ++blob_count;
01111   }
01112   double median_gap = gap_stats.median();
01113   double median_width = width_stats.median();
01114   double max_width = MAX(median_gap, median_width);
01115   double min_width = MIN(median_gap, median_width);
01116   double gap_iqr = gap_stats.ile(0.75f) - gap_stats.ile(0.25f);
01117   if (textord_debug_tabfind >= 4) {
01118     tprintf("gap iqr = %g, blob_count=%d, limits=%g,%g\n",
01119             gap_iqr, blob_count, max_width * kMaxLeaderGapFractionOfMax,
01120             min_width * kMaxLeaderGapFractionOfMin);
01121   }
01122   if (gap_iqr < max_width * kMaxLeaderGapFractionOfMax &&
01123       gap_iqr < min_width * kMaxLeaderGapFractionOfMin &&
01124       blob_count >= kMinLeaderCount) {
01125     // This is stable enough to be called a leader, so check the widths.
01126     // Since leader dashes can join, run a dp cutting algorithm and go
01127     // on the cost.
01128     int offset = static_cast<int>(ceil(gap_iqr * 2));
01129     int min_step = static_cast<int>(median_gap + median_width + 0.5);
01130     int max_step = min_step + offset;
01131     min_step -= offset;
01132     // Pad the buffer with min_step/2 on each end.
01133     int part_left = bounding_box_.left() - min_step / 2;
01134     part_width += min_step;
01135     DPPoint* projection = new DPPoint[part_width];
01136     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01137       BLOBNBOX* blob = it.data();
01138       int left = blob->bounding_box().left();
01139       int right = blob->bounding_box().right();
01140       int height = blob->bounding_box().height();
01141       for (int x = left; x < right; ++x) {
01142         projection[left - part_left].AddLocalCost(height);
01143       }
01144     }
01145     DPPoint* best_end = DPPoint::Solve(min_step, max_step, false,
01146                                        &DPPoint::CostWithVariance,
01147                                        part_width, projection);
01148     if (best_end != NULL && best_end->total_cost() < blob_count) {
01149       // Good enough. Call it a leader.
01150       result = true;
01151       bool modified_blob_list = false;
01152       for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01153         BLOBNBOX* blob = it.data();
01154         TBOX box = blob->bounding_box();
01155         // If the first or last blob is spaced too much, don't mark it.
01156         if (it.at_first()) {
01157           int gap = it.data_relative(1)->bounding_box().left() -
01158                      blob->bounding_box().right();
01159           if (blob->bounding_box().width() + gap > max_step) {
01160             it.extract();
01161             modified_blob_list = true;
01162             continue;
01163           }
01164         }
01165         if (it.at_last()) {
01166           int gap = blob->bounding_box().left() -
01167                      it.data_relative(-1)->bounding_box().right();
01168           if (blob->bounding_box().width() + gap > max_step) {
01169             it.extract();
01170             modified_blob_list = true;
01171             break;
01172           }
01173         }
01174         blob->set_region_type(BRT_TEXT);
01175         blob->set_flow(BTFT_LEADER);
01176       }
01177       if (modified_blob_list) ComputeLimits();
01178       blob_type_ = BRT_TEXT;
01179       flow_ = BTFT_LEADER;
01180     } else if (textord_debug_tabfind) {
01181       if (best_end == NULL) {
01182         tprintf("No path\n");
01183       } else {
01184         tprintf("Total cost = %d vs allowed %d\n",
01185                 best_end->total_cost() < blob_count);
01186       }
01187     }
01188     delete [] projection;
01189   }
01190   return result;
01191 }
01192 
01193 // Given the result of TextlineProjection::EvaluateColPartition, (positive for
01194 // horizontal text, negative for vertical text, and near zero for non-text),
01195 // sets the blob_type_ and flow_ for this partition to indicate whether it
01196 // is strongly or weakly vertical or horizontal text, or non-text.
01197 // The function assumes that the blob neighbours are valid (from
01198 // StrokeWidth::SetNeighbours) and that those neighbours have their
01199 // region_type() set.
01200 void ColPartition::SetRegionAndFlowTypesFromProjectionValue(int value) {
01201   int blob_count = 0;        // Total # blobs.
01202   int good_blob_score_ = 0;  // Total # good strokewidth neighbours.
01203   int noisy_count = 0;       // Total # neighbours marked as noise.
01204   int hline_count = 0;
01205   int vline_count = 0;
01206   BLOBNBOX_C_IT it(&boxes_);
01207   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01208     BLOBNBOX* blob = it.data();
01209     ++blob_count;
01210     noisy_count += blob->NoisyNeighbours();
01211     good_blob_score_ += blob->GoodTextBlob();
01212     if (blob->region_type() == BRT_HLINE) ++hline_count;
01213     if (blob->region_type() == BRT_VLINE) ++vline_count;
01214   }
01215   flow_ = BTFT_NEIGHBOURS;
01216   blob_type_ = BRT_UNKNOWN;
01217   if (hline_count > vline_count) {
01218     flow_ = BTFT_NONE;
01219     blob_type_ = BRT_HLINE;
01220   } else if (vline_count > hline_count) {
01221     flow_ = BTFT_NONE;
01222     blob_type_ = BRT_VLINE;
01223   } else if (value < -1 || 1 < value) {
01224     int long_side;
01225     int short_side;
01226     if (value > 0) {
01227       long_side = bounding_box_.width();
01228       short_side = bounding_box_.height();
01229       blob_type_ = BRT_TEXT;
01230     } else {
01231       long_side = bounding_box_.height();
01232       short_side = bounding_box_.width();
01233       blob_type_ = BRT_VERT_TEXT;
01234     }
01235     // We will combine the old metrics using aspect ratio and blob counts
01236     // with the input value by allowing a strong indication to flip the
01237     // STRONG_CHAIN/CHAIN flow values.
01238     int strong_score = blob_count >= kHorzStrongTextlineCount ? 1 : 0;
01239     if (short_side > kHorzStrongTextlineHeight) ++strong_score;
01240     if (short_side * kHorzStrongTextlineAspect < long_side) ++strong_score;
01241     if (abs(value) >= kMinStrongTextValue)
01242       flow_ = BTFT_STRONG_CHAIN;
01243     else if (abs(value) >= kMinChainTextValue)
01244       flow_ = BTFT_CHAIN;
01245     else
01246       flow_ = BTFT_NEIGHBOURS;
01247     // Upgrade chain to strong chain if the other indicators are good
01248     if (flow_ == BTFT_CHAIN && strong_score == 3)
01249       flow_ = BTFT_STRONG_CHAIN;
01250     // Downgrade strong vertical text to chain if the indicators are bad.
01251     if (flow_ == BTFT_STRONG_CHAIN && value < 0 && strong_score < 2)
01252       flow_ = BTFT_CHAIN;
01253   }
01254   if (flow_ == BTFT_NEIGHBOURS) {
01255     // Check for noisy neighbours.
01256     if (noisy_count >= blob_count) {
01257       flow_ = BTFT_NONTEXT;
01258       blob_type_= BRT_NOISE;
01259     }
01260   }
01261   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
01262                                 bounding_box_.bottom())) {
01263     tprintf("RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,",
01264             blob_count, noisy_count, good_blob_score_);
01265     tprintf(" Projection value=%d, flow=%d, blob_type=%d\n",
01266             value, flow_, blob_type_);
01267     Print();
01268   }
01269   SetBlobTypes();
01270 }
01271 
01272 // Sets all blobs with the partition blob type and flow, but never overwrite
01273 // leader blobs, as we need to be able to identify them later.
01274 void ColPartition::SetBlobTypes() {
01275   if (!owns_blobs())
01276     return;
01277   BLOBNBOX_C_IT it(&boxes_);
01278   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01279     BLOBNBOX* blob = it.data();
01280     if (blob->flow() != BTFT_LEADER)
01281       blob->set_flow(flow_);
01282     blob->set_region_type(blob_type_);
01283     ASSERT_HOST(blob->owner() == NULL || blob->owner() == this);
01284   }
01285 }
01286 
01287 // Returns true if a decent baseline can be fitted through the blobs.
01288 // Works for both horizontal and vertical text.
01289 bool ColPartition::HasGoodBaseline() {
01290   // Approximation of the baseline.
01291   DetLineFit linepoints;
01292   // Calculation of the mean height on this line segment. Note that these
01293   // variable names apply to the context of a horizontal line, and work
01294   // analogously, rather than literally in the case of a vertical line.
01295   int total_height = 0;
01296   int coverage = 0;
01297   int height_count = 0;
01298   int width = 0;
01299   BLOBNBOX_C_IT it(&boxes_);
01300   TBOX box(it.data()->bounding_box());
01301   // Accumulate points representing the baseline at the middle of each blob,
01302   // but add an additional point for each end of the line. This makes it
01303   // harder to fit a severe skew angle, as it is most likely not right.
01304   if (IsVerticalType()) {
01305     // For a vertical line, use the right side as the baseline.
01306     ICOORD first_pt(box.right(), box.bottom());
01307     // Use the bottom-right of the first (bottom) box, the top-right of the
01308     // last, and the middle-right of all others.
01309     linepoints.Add(first_pt);
01310     for (it.forward(); !it.at_last(); it.forward()) {
01311       BLOBNBOX* blob = it.data();
01312       box = blob->bounding_box();
01313       ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
01314       linepoints.Add(box_pt);
01315       total_height += box.width();
01316       coverage += box.height();
01317       ++height_count;
01318     }
01319     box = it.data()->bounding_box();
01320     ICOORD last_pt(box.right(), box.top());
01321     linepoints.Add(last_pt);
01322     width = last_pt.y() - first_pt.y();
01323 
01324   } else {
01325     // Horizontal lines use the bottom as the baseline.
01326     TBOX box(it.data()->bounding_box());
01327     // Use the bottom-left of the first box, the the bottom-right of the last,
01328     // and the middle of all others.
01329     ICOORD first_pt(box.left(), box.bottom());
01330     linepoints.Add(first_pt);
01331     for (it.forward(); !it.at_last(); it.forward()) {
01332       BLOBNBOX* blob = it.data();
01333       box = blob->bounding_box();
01334       ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
01335       linepoints.Add(box_pt);
01336       total_height += box.height();
01337       coverage += box.width();
01338       ++height_count;
01339     }
01340     box = it.data()->bounding_box();
01341     ICOORD last_pt(box.right(), box.bottom());
01342     linepoints.Add(last_pt);
01343     width = last_pt.x() - first_pt.x();
01344   }
01345   // Maximum median error allowed to be a good text line.
01346   double max_error = kMaxBaselineError * total_height / height_count;
01347   ICOORD start_pt, end_pt;
01348   double error = linepoints.Fit(&start_pt, &end_pt);
01349   return error < max_error && coverage >= kMinBaselineCoverage * width;
01350 }
01351 
01352 // Adds this ColPartition to a matching WorkingPartSet if one can be found,
01353 // otherwise starts a new one in the appropriate column, ending the previous.
01354 void ColPartition::AddToWorkingSet(const ICOORD& bleft, const ICOORD& tright,
01355                                    int resolution,
01356                                    ColPartition_LIST* used_parts,
01357                                    WorkingPartSet_LIST* working_sets) {
01358   if (block_owned_)
01359     return;  // Done it already.
01360   block_owned_ = true;
01361   WorkingPartSet_IT it(working_sets);
01362   // If there is an upper partner use its working_set_ directly.
01363   ColPartition* partner = SingletonPartner(true);
01364   if (partner != NULL && partner->working_set_ != NULL) {
01365     working_set_ = partner->working_set_;
01366     working_set_->AddPartition(this);
01367     return;
01368   }
01369   if (partner != NULL && textord_debug_bugs) {
01370     tprintf("Partition with partner has no working set!:");
01371     Print();
01372     partner->Print();
01373   }
01374   // Search for the column that the left edge fits in.
01375   WorkingPartSet* work_set = NULL;
01376   it.move_to_first();
01377   int col_index = 0;
01378   for (it.mark_cycle_pt(); !it.cycled_list() &&
01379        col_index != first_column_;
01380         it.forward(), ++col_index);
01381   if (textord_debug_tabfind >= 2) {
01382     tprintf("Match is %s for:", (col_index & 1) ? "Real" : "Between");
01383     Print();
01384   }
01385   if (it.cycled_list() && textord_debug_bugs) {
01386     tprintf("Target column=%d, only had %d\n", first_column_, col_index);
01387   }
01388   ASSERT_HOST(!it.cycled_list());
01389   work_set = it.data();
01390   // If last_column_ != first_column, then we need to scoop up all blocks
01391   // between here and the last_column_ and put back in work_set.
01392   if (!it.cycled_list() && last_column_ != first_column_ && !IsPulloutType()) {
01393     // Find the column that the right edge falls in.
01394     BLOCK_LIST completed_blocks;
01395     TO_BLOCK_LIST to_blocks;
01396     for (; !it.cycled_list() && col_index <= last_column_;
01397          it.forward(), ++col_index) {
01398       WorkingPartSet* end_set = it.data();
01399       end_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,
01400                                       &completed_blocks, &to_blocks);
01401     }
01402     work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
01403   }
01404   working_set_ = work_set;
01405   work_set->AddPartition(this);
01406 }
01407 
01408 // From the given block_parts list, builds one or more BLOCKs and
01409 // corresponding TO_BLOCKs, such that the line spacing is uniform in each.
01410 // Created blocks are appended to the end of completed_blocks and to_blocks.
01411 // The used partitions are put onto used_parts, as they may still be referred
01412 // to in the partition grid. bleft, tright and resolution are the bounds
01413 // and resolution of the original image.
01414 void ColPartition::LineSpacingBlocks(const ICOORD& bleft, const ICOORD& tright,
01415                                      int resolution,
01416                                      ColPartition_LIST* block_parts,
01417                                      ColPartition_LIST* used_parts,
01418                                      BLOCK_LIST* completed_blocks,
01419                                      TO_BLOCK_LIST* to_blocks) {
01420   int page_height = tright.y() - bleft.y();
01421   // Compute the initial spacing stats.
01422   ColPartition_IT it(block_parts);
01423   int part_count = 0;
01424   int max_line_height = 0;
01425 
01426   // TODO(joeliu): We should add some special logic for PT_INLINE_EQUATION type
01427   // because their line spacing with their neighbors maybe smaller and their
01428   // height may be slightly larger.
01429 
01430   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01431     ColPartition* part = it.data();
01432     ASSERT_HOST(!part->boxes()->empty());
01433     STATS side_steps(0, part->bounding_box().height());
01434     if (part->bounding_box().height() > max_line_height)
01435       max_line_height = part->bounding_box().height();
01436     BLOBNBOX_C_IT blob_it(part->boxes());
01437     int prev_bottom = blob_it.data()->bounding_box().bottom();
01438     for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
01439       BLOBNBOX* blob = blob_it.data();
01440       int bottom = blob->bounding_box().bottom();
01441       int step = bottom - prev_bottom;
01442       if (step < 0)
01443         step = -step;
01444       side_steps.add(step, 1);
01445       prev_bottom = bottom;
01446     }
01447     part->set_side_step(static_cast<int>(side_steps.median() + 0.5));
01448     if (!it.at_last()) {
01449       ColPartition* next_part = it.data_relative(1);
01450       part->set_bottom_spacing(part->median_bottom() -
01451                                next_part->median_bottom());
01452       part->set_top_spacing(part->median_top() - next_part->median_top());
01453     } else {
01454       part->set_bottom_spacing(page_height);
01455       part->set_top_spacing(page_height);
01456     }
01457     if (textord_debug_tabfind) {
01458       part->Print();
01459       tprintf("side step = %.2f, top spacing = %d, bottom spacing=%d\n",
01460               side_steps.median(), part->top_spacing(), part->bottom_spacing());
01461     }
01462     ++part_count;
01463   }
01464   if (part_count == 0)
01465     return;
01466 
01467   SmoothSpacings(resolution, page_height, block_parts);
01468 
01469   // Move the partitions into individual block lists and make the blocks.
01470   BLOCK_IT block_it(completed_blocks);
01471   TO_BLOCK_IT to_block_it(to_blocks);
01472   ColPartition_LIST spacing_parts;
01473   ColPartition_IT sp_block_it(&spacing_parts);
01474   int same_block_threshold = max_line_height * kMaxSameBlockLineSpacing;
01475   for (it.mark_cycle_pt(); !it.empty();) {
01476     ColPartition* part = it.extract();
01477     sp_block_it.add_to_end(part);
01478     it.forward();
01479     if (it.empty() || part->bottom_spacing() > same_block_threshold ||
01480         !part->SpacingsEqual(*it.data(), resolution)) {
01481       // There is a spacing boundary. Check to see if it.data() belongs
01482       // better in the current block or the next one.
01483       if (!it.empty() && part->bottom_spacing() <= same_block_threshold) {
01484         ColPartition* next_part = it.data();
01485         // If there is a size match one-way, then the middle line goes with
01486         // its matched size, otherwise it goes with the smallest spacing.
01487         ColPartition* third_part = it.at_last() ? NULL : it.data_relative(1);
01488         if (textord_debug_tabfind) {
01489           tprintf("Spacings unequal: upper:%d/%d, lower:%d/%d,"
01490                   " sizes %d %d %d\n",
01491                   part->top_spacing(), part->bottom_spacing(),
01492                   next_part->top_spacing(), next_part->bottom_spacing(),
01493                   part->median_size(), next_part->median_size(),
01494                   third_part != NULL ? third_part->median_size() : 0);
01495         }
01496         // We can only consider adding the next line to the block if the sizes
01497         // match and the lines are close enough for their size.
01498         if (part->SizesSimilar(*next_part) &&
01499             next_part->median_size() * kMaxSameBlockLineSpacing >
01500                 part->bottom_spacing() &&
01501             part->median_size() * kMaxSameBlockLineSpacing >
01502                 part->top_spacing()) {
01503           // Even now, we can only add it as long as the third line doesn't
01504           // match in the same way and have a smaller bottom spacing.
01505           if (third_part == NULL ||
01506               !next_part->SizesSimilar(*third_part) ||
01507               third_part->median_size() * kMaxSameBlockLineSpacing <=
01508                   next_part->bottom_spacing() ||
01509               next_part->median_size() * kMaxSameBlockLineSpacing <=
01510                   next_part->top_spacing() ||
01511                   next_part->bottom_spacing() > part->bottom_spacing()) {
01512             // Add to the current block.
01513             sp_block_it.add_to_end(it.extract());
01514             it.forward();
01515             if (textord_debug_tabfind) {
01516               tprintf("Added line to current block.\n");
01517             }
01518           }
01519         }
01520       }
01521       TO_BLOCK* to_block = MakeBlock(bleft, tright, &spacing_parts, used_parts);
01522       if (to_block != NULL) {
01523         to_block_it.add_to_end(to_block);
01524         block_it.add_to_end(to_block->block);
01525       }
01526       sp_block_it.set_to_list(&spacing_parts);
01527     } else {
01528       if (textord_debug_tabfind && !it.empty()) {
01529         ColPartition* next_part = it.data();
01530         tprintf("Spacings equal: upper:%d/%d, lower:%d/%d\n",
01531                 part->top_spacing(), part->bottom_spacing(),
01532                 next_part->top_spacing(), next_part->bottom_spacing(),
01533                 part->median_size(), next_part->median_size());
01534       }
01535     }
01536   }
01537 }
01538 
01539 // Helper function to clip the input pos to the given bleft, tright bounds.
01540 static void ClipCoord(const ICOORD& bleft, const ICOORD& tright, ICOORD* pos) {
01541   if (pos->x() < bleft.x())
01542     pos->set_x(bleft.x());
01543   if (pos->x() > tright.x())
01544     pos->set_x(tright.x());
01545   if (pos->y() < bleft.y())
01546     pos->set_y(bleft.y());
01547   if (pos->y() > tright.y())
01548     pos->set_y(tright.y());
01549 }
01550 
01551 // Helper moves the blobs from the given list of block_parts into the block
01552 // itself. Sets up the block for (old) textline formation correctly for
01553 // vertical and horizontal text. The partitions are moved to used_parts
01554 // afterwards, as they cannot be deleted yet.
01555 static TO_BLOCK* MoveBlobsToBlock(bool vertical_text, int line_spacing,
01556                                   BLOCK* block,
01557                                   ColPartition_LIST* block_parts,
01558                                   ColPartition_LIST* used_parts) {
01559   // Make a matching TO_BLOCK and put all the BLOBNBOXes from the parts in it.
01560   // Move all the parts to a done list as they are no longer needed, except
01561   // that have have to continue to exist until the part grid is deleted.
01562   // Compute the median blob size as we go, as the block needs to know.
01563   TBOX block_box(block->bounding_box());
01564   STATS sizes(0, MAX(block_box.width(), block_box.height()));
01565   bool text_type = block->poly_block()->IsText();
01566   ColPartition_IT it(block_parts);
01567   TO_BLOCK* to_block = new TO_BLOCK(block);
01568   BLOBNBOX_IT blob_it(&to_block->blobs);
01569   ColPartition_IT used_it(used_parts);
01570   for (it.move_to_first(); !it.empty(); it.forward()) {
01571     ColPartition* part = it.extract();
01572     // Transfer blobs from all regions to the output blocks.
01573     // Blobs for non-text regions will be used to define the polygonal
01574     // bounds of the region.
01575     for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty();
01576          bb_it.forward()) {
01577       BLOBNBOX* bblob = bb_it.extract();
01578       if (bblob->owner() != part) {
01579         tprintf("Ownership incorrect for blob:");
01580         bblob->bounding_box().print();
01581         tprintf("Part=");
01582         part->Print();
01583         if (bblob->owner() == NULL) {
01584           tprintf("Not owned\n");
01585         } else {
01586           tprintf("Owner part:");
01587           bblob->owner()->Print();
01588         }
01589       }
01590       ASSERT_HOST(bblob->owner() == part);
01591       // Assert failure here is caused by arbitrarily changing the partition
01592       // type without also changing the blob type, such as in
01593       // InsertSmallBlobsAsUnknowns.
01594       ASSERT_HOST(!text_type || bblob->region_type() >= BRT_UNKNOWN);
01595       C_OUTLINE_LIST* outlines = bblob->cblob()->out_list();
01596       C_OUTLINE_IT ol_it(outlines);
01597       ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);
01598       if (vertical_text)
01599         sizes.add(bblob->bounding_box().width(), 1);
01600       else
01601         sizes.add(bblob->bounding_box().height(), 1);
01602       blob_it.add_after_then_move(bblob);
01603     }
01604     used_it.add_to_end(part);
01605   }
01606   if (text_type && blob_it.empty()) {
01607     delete block;
01608     delete to_block;
01609     return NULL;
01610   }
01611   to_block->line_size = sizes.median();
01612   if (vertical_text) {
01613     int block_width = block->bounding_box().width();
01614     if (block_width < line_spacing)
01615       line_spacing = block_width;
01616     to_block->line_spacing = static_cast<float>(line_spacing);
01617     to_block->max_blob_size = static_cast<float>(block_width + 1);
01618   } else {
01619     int block_height = block->bounding_box().height();
01620     if (block_height < line_spacing)
01621       line_spacing = block_height;
01622     to_block->line_spacing = static_cast<float>(line_spacing);
01623     to_block->max_blob_size = static_cast<float>(block_height + 1);
01624   }
01625   return to_block;
01626 }
01627 
01628 // Constructs a block from the given list of partitions.
01629 // Arguments are as LineSpacingBlocks above.
01630 TO_BLOCK* ColPartition::MakeBlock(const ICOORD& bleft, const ICOORD& tright,
01631                                   ColPartition_LIST* block_parts,
01632                                   ColPartition_LIST* used_parts) {
01633   if (block_parts->empty())
01634     return NULL;  // Nothing to do.
01635   ColPartition_IT it(block_parts);
01636   ColPartition* part = it.data();
01637   PolyBlockType type = part->type();
01638   if (type == PT_VERTICAL_TEXT)
01639     return MakeVerticalTextBlock(bleft, tright, block_parts, used_parts);
01640   // LineSpacingBlocks has handed us a collection of evenly spaced lines and
01641   // put the average spacing in each partition, so we can just take the
01642   // linespacing from the first partition.
01643   int line_spacing = part->bottom_spacing();
01644   if (line_spacing < part->median_size())
01645     line_spacing = part->bounding_box().height();
01646   ICOORDELT_LIST vertices;
01647   ICOORDELT_IT vert_it(&vertices);
01648   ICOORD start, end;
01649   int min_x = MAX_INT32;
01650   int max_x = -MAX_INT32;
01651   int min_y = MAX_INT32;
01652   int max_y = -MAX_INT32;
01653   int iteration = 0;
01654   do {
01655     if (iteration == 0)
01656       ColPartition::LeftEdgeRun(&it, &start, &end);
01657     else
01658       ColPartition::RightEdgeRun(&it, &start, &end);
01659     ClipCoord(bleft, tright, &start);
01660     ClipCoord(bleft, tright, &end);
01661     vert_it.add_after_then_move(new ICOORDELT(start));
01662     vert_it.add_after_then_move(new ICOORDELT(end));
01663     UpdateRange(start.x(), &min_x, &max_x);
01664     UpdateRange(end.x(), &min_x, &max_x);
01665     UpdateRange(start.y(), &min_y, &max_y);
01666     UpdateRange(end.y(), &min_y, &max_y);
01667     if ((iteration == 0 && it.at_first()) ||
01668         (iteration == 1 && it.at_last())) {
01669       ++iteration;
01670       it.move_to_last();
01671     }
01672   } while (iteration < 2);
01673   if (textord_debug_tabfind)
01674     tprintf("Making block at (%d,%d)->(%d,%d)\n",
01675             min_x, min_y, max_x, max_y);
01676   BLOCK* block = new BLOCK("", true, 0, 0, min_x, min_y, max_x, max_y);
01677   block->set_poly_block(new POLY_BLOCK(&vertices, type));
01678   return MoveBlobsToBlock(false, line_spacing, block, block_parts, used_parts);
01679 }
01680 
01681 // Constructs a block from the given list of vertical text partitions.
01682 // Currently only creates rectangular blocks.
01683 TO_BLOCK* ColPartition::MakeVerticalTextBlock(const ICOORD& bleft,
01684                                               const ICOORD& tright,
01685                                               ColPartition_LIST* block_parts,
01686                                               ColPartition_LIST* used_parts) {
01687   if (block_parts->empty())
01688     return NULL;  // Nothing to do.
01689   ColPartition_IT it(block_parts);
01690   ColPartition* part = it.data();
01691   TBOX block_box = part->bounding_box();
01692   int line_spacing = block_box.width();
01693   PolyBlockType type = it.data()->type();
01694   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01695     block_box += it.data()->bounding_box();
01696   }
01697   if (textord_debug_tabfind) {
01698     tprintf("Making block at:");
01699     block_box.print();
01700   }
01701   BLOCK* block = new BLOCK("", true, 0, 0, block_box.left(), block_box.bottom(),
01702                            block_box.right(), block_box.top());
01703   block->set_poly_block(new POLY_BLOCK(block_box, type));
01704   return MoveBlobsToBlock(true, line_spacing, block, block_parts, used_parts);
01705 }
01706 
01707 // Makes a TO_ROW matching this and moves all the blobs to it, transferring
01708 // ownership to to returned TO_ROW.
01709 TO_ROW* ColPartition::MakeToRow() {
01710   BLOBNBOX_C_IT blob_it(&boxes_);
01711   TO_ROW* row = NULL;
01712   int line_size = IsVerticalType() ? median_width_ : median_size_;
01713   // Add all the blobs to a single TO_ROW.
01714   for (; !blob_it.empty(); blob_it.forward()) {
01715     BLOBNBOX* blob = blob_it.extract();
01716 //    blob->compute_bounding_box();
01717     int top = blob->bounding_box().top();
01718     int bottom = blob->bounding_box().bottom();
01719     if (row == NULL) {
01720       row = new TO_ROW(blob, static_cast<float>(top),
01721                        static_cast<float>(bottom),
01722                        static_cast<float>(line_size));
01723     } else {
01724       row->add_blob(blob, static_cast<float>(top),
01725                     static_cast<float>(bottom),
01726                     static_cast<float>(line_size));
01727     }
01728   }
01729   return row;
01730 }
01731 
01732 // Returns a copy of everything except the list of boxes. The resulting
01733 // ColPartition is only suitable for keeping in a column candidate list.
01734 ColPartition* ColPartition::ShallowCopy() const {
01735   ColPartition* part = new ColPartition(blob_type_, vertical_);
01736   part->left_margin_ = left_margin_;
01737   part->right_margin_ = right_margin_;
01738   part->bounding_box_ = bounding_box_;
01739   memcpy(part->special_blobs_densities_, special_blobs_densities_,
01740          sizeof(special_blobs_densities_));
01741   part->median_bottom_ = median_bottom_;
01742   part->median_top_ = median_top_;
01743   part->median_size_ = median_size_;
01744   part->median_left_ = median_left_;
01745   part->median_right_ = median_right_;
01746   part->median_width_ = median_width_;
01747   part->good_width_ = good_width_;
01748   part->good_column_ = good_column_;
01749   part->left_key_tab_ = left_key_tab_;
01750   part->right_key_tab_ = right_key_tab_;
01751   part->type_ = type_;
01752   part->flow_ = flow_;
01753   part->left_key_ = left_key_;
01754   part->right_key_ = right_key_;
01755   part->first_column_ = first_column_;
01756   part->last_column_ = last_column_;
01757   part->owns_blobs_ = false;
01758   return part;
01759 }
01760 
01761 ColPartition* ColPartition::CopyButDontOwnBlobs() {
01762   ColPartition* copy = ShallowCopy();
01763   copy->set_owns_blobs(false);
01764   BLOBNBOX_C_IT inserter(copy->boxes());
01765   BLOBNBOX_C_IT traverser(boxes());
01766   for (traverser.mark_cycle_pt(); !traverser.cycled_list(); traverser.forward())
01767     inserter.add_after_then_move(traverser.data());
01768   return copy;
01769 }
01770 
01771 #ifndef GRAPHICS_DISABLED
01772 // Provides a color for BBGrid to draw the rectangle.
01773 // Must be kept in sync with PolyBlockType.
01774 ScrollView::Color  ColPartition::BoxColor() const {
01775   if (type_ == PT_UNKNOWN)
01776     return BLOBNBOX::TextlineColor(blob_type_, flow_);
01777   return POLY_BLOCK::ColorForPolyBlockType(type_);
01778 }
01779 #endif  // GRAPHICS_DISABLED
01780 
01781 // Keep in sync with BlobRegionType.
01782 static char kBlobTypes[BRT_COUNT + 1] = "NHSRIUVT";
01783 
01784 // Prints debug information on this.
01785 void ColPartition::Print() const {
01786   int y = MidY();
01787   tprintf("ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)"
01788           " w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d"
01789           " ts=%d bs=%d ls=%d rs=%d\n",
01790           boxes_.empty() ? 'E' : ' ',
01791           left_margin_, left_key_tab_ ? 'T' : 'B', LeftAtY(y),
01792           bounding_box_.left(), median_left_,
01793           bounding_box_.bottom(), median_bottom_,
01794           bounding_box_.right(), RightAtY(y), right_key_tab_ ? 'T' : 'B',
01795           right_margin_, median_right_, bounding_box_.top(), median_top_,
01796           good_width_, good_column_, type_,
01797           kBlobTypes[blob_type_], flow_,
01798           first_column_, last_column_, boxes_.length(),
01799           space_above_, space_below_, space_to_left_, space_to_right_);
01800 }
01801 
01802 // Prints debug information on the colors.
01803 void ColPartition::PrintColors() {
01804   tprintf("Colors:(%d, %d, %d)%d -> (%d, %d, %d)\n",
01805           color1_[COLOR_RED], color1_[COLOR_GREEN], color1_[COLOR_BLUE],
01806           color1_[L_ALPHA_CHANNEL],
01807           color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);
01808 }
01809 
01810 // Sets the types of all partitions in the run to be the max of the types.
01811 void ColPartition::SmoothPartnerRun(int working_set_count) {
01812   STATS left_stats(0, working_set_count);
01813   STATS right_stats(0, working_set_count);
01814   PolyBlockType max_type = type_;
01815   ColPartition* partner;
01816   for (partner = SingletonPartner(false); partner != NULL;
01817        partner = partner->SingletonPartner(false)) {
01818     if (partner->type_ > max_type)
01819       max_type = partner->type_;
01820     if (column_set_ == partner->column_set_) {
01821       left_stats.add(partner->first_column_, 1);
01822       right_stats.add(partner->last_column_, 1);
01823     }
01824   }
01825   type_ = max_type;
01826   // TODO(rays) Either establish that it isn't necessary to set the columns,
01827   // or find a way to do it that does not cause an assert failure in
01828   // AddToWorkingSet.
01829 #if 0
01830   first_column_ = left_stats.mode();
01831   last_column_ = right_stats.mode();
01832   if (last_column_ < first_column_)
01833     last_column_ = first_column_;
01834 #endif
01835 
01836   for (partner = SingletonPartner(false); partner != NULL;
01837        partner = partner->SingletonPartner(false)) {
01838     partner->type_ = max_type;
01839 #if 0  // See TODO above
01840     if (column_set_ == partner->column_set_) {
01841       partner->first_column_ = first_column_;
01842       partner->last_column_ = last_column_;
01843     }
01844 #endif
01845   }
01846 }
01847 
01848 // ======= Scenario common to all Refine*Partners* functions =======
01849 // ColPartitions are aiming to represent textlines, or horizontal slices
01850 // of images, and we are trying to form bi-directional (upper/lower) chains
01851 // of UNIQUE partner ColPartitions that can be made into blocks.
01852 // The ColPartitions have previously been typed (see SetPartitionType)
01853 // according to a combination of the content type and
01854 // how they lie on the columns. We want to chain text into
01855 // groups of a single type, but image ColPartitions may have been typed
01856 // differently in different parts of the image, due to being non-rectangular.
01857 //
01858 // We previously ran a search for upper and lower partners, but there may
01859 // be more than one, and they may be of mixed types, so now we wish to
01860 // refine the partners down to at most one.
01861 // A heading may have multiple partners:
01862 // ===============================
01863 // ========  ==========  =========
01864 // ========  ==========  =========
01865 // but it should be a different type.
01866 // A regular flowing text line may have multiple partners:
01867 // ==================   ===================
01868 // =======   =================  ===========
01869 // This could be the start of a pull-out, or it might all be in a single
01870 // column and might be caused by tightly spaced text, bold words, bullets,
01871 // funny punctuation etc, all of which can cause textlines to be split into
01872 // multiple ColPartitions. Pullouts and figure captions should now be different
01873 // types so we can more aggressively merge groups of partners that all sit
01874 // in a single column.
01875 //
01876 // Cleans up the partners of the given type so that there is at most
01877 // one partner. This makes block creation simpler.
01878 // If get_desperate is true, goes to more desperate merge methods
01879 // to merge flowing text before breaking partnerships.
01880 void ColPartition::RefinePartners(PolyBlockType type, bool get_desperate,
01881                                   ColPartitionGrid* grid) {
01882   if (TypesSimilar(type_, type)) {
01883     RefinePartnersInternal(true, get_desperate, grid);
01884     RefinePartnersInternal(false, get_desperate, grid);
01885   } else if (type == PT_COUNT) {
01886     // This is the final pass. Make sure only the correctly typed
01887     // partners surivive, however many there are.
01888     RefinePartnersByType(true, &upper_partners_);
01889     RefinePartnersByType(false, &lower_partners_);
01890     // It is possible for a merge to have given a partition multiple
01891     // partners again, so the last resort is to use overlap which is
01892     // guaranteed to leave at most one partner left.
01893     if (!upper_partners_.empty() && !upper_partners_.singleton())
01894       RefinePartnersByOverlap(true, &upper_partners_);
01895     if (!lower_partners_.empty() && !lower_partners_.singleton())
01896       RefinePartnersByOverlap(false, &lower_partners_);
01897   }
01898 }
01899 
01901 
01902 // Cleans up the partners above if upper is true, else below.
01903 // If get_desperate is true, goes to more desperate merge methods
01904 // to merge flowing text before breaking partnerships.
01905 void ColPartition::RefinePartnersInternal(bool upper, bool get_desperate,
01906                                           ColPartitionGrid* grid) {
01907   ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
01908   if (!partners->empty() && !partners->singleton()) {
01909     RefinePartnersByType(upper, partners);
01910     if (!partners->empty() && !partners->singleton()) {
01911       // Check for transitive partnerships and break the cycle.
01912       RefinePartnerShortcuts(upper, partners);
01913       if (!partners->empty() && !partners->singleton()) {
01914         // Types didn't fix it. Flowing text keeps the one with the longest
01915         // sequence of singleton matching partners. All others max overlap.
01916         if (TypesSimilar(type_, PT_FLOWING_TEXT) && get_desperate) {
01917           RefineTextPartnersByMerge(upper, false, partners, grid);
01918           if (!partners->empty() && !partners->singleton())
01919             RefineTextPartnersByMerge(upper, true, partners, grid);
01920         }
01921         // The last resort is to use overlap.
01922         if (!partners->empty() && !partners->singleton())
01923           RefinePartnersByOverlap(upper, partners);
01924       }
01925     }
01926   }
01927 }
01928 
01929 // Cleans up the partners above if upper is true, else below.
01930 // Restricts the partners to only desirable types. For text and BRT_HLINE this
01931 // means the same type_ , and for image types it means any image type.
01932 void ColPartition::RefinePartnersByType(bool upper,
01933                                         ColPartition_CLIST* partners) {
01934   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
01935                                          bounding_box_.bottom());
01936   if (debug) {
01937     tprintf("Refining %d %s partners by type for:\n",
01938             partners->length(), upper ? "Upper" : "Lower");
01939     Print();
01940   }
01941   ColPartition_C_IT it(partners);
01942   // Purify text by type.
01943   if (!IsImageType() && !IsLineType() && type() != PT_TABLE) {
01944     // Keep only partners matching type_.
01945     // Exception: PT_VERTICAL_TEXT is allowed to stay with the other
01946     // text types if it is the only partner.
01947     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01948       ColPartition* partner = it.data();
01949       if (!TypesSimilar(type_, partner->type_)) {
01950         if (debug) {
01951           tprintf("Removing partner:");
01952           partner->Print();
01953         }
01954         partner->RemovePartner(!upper, this);
01955         it.extract();
01956       } else if (debug) {
01957         tprintf("Keeping partner:");
01958         partner->Print();
01959       }
01960     }
01961   } else {
01962     // Only polyimages are allowed to have partners of any kind!
01963     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01964       ColPartition* partner = it.data();
01965       if (partner->blob_type() != BRT_POLYIMAGE ||
01966           blob_type() != BRT_POLYIMAGE) {
01967         if (debug) {
01968           tprintf("Removing partner:");
01969           partner->Print();
01970         }
01971         partner->RemovePartner(!upper, this);
01972         it.extract();
01973       } else if (debug) {
01974         tprintf("Keeping partner:");
01975         partner->Print();
01976       }
01977     }
01978   }
01979 }
01980 
01981 // Cleans up the partners above if upper is true, else below.
01982 // Remove transitive partnerships: this<->a, and a<->b and this<->b.
01983 // Gets rid of this<->b, leaving a clean chain.
01984 // Also if we have this<->a and a<->this, then gets rid of this<->a, as
01985 // this has multiple partners.
01986 void ColPartition::RefinePartnerShortcuts(bool upper,
01987                                           ColPartition_CLIST* partners) {
01988   bool done_any = false;
01989   do {
01990     done_any = false;
01991     ColPartition_C_IT it(partners);
01992     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01993       ColPartition* a = it.data();
01994       // Check for a match between all of a's partners (it1/b1) and all
01995       // of this's partners (it2/b2).
01996       ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
01997       for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
01998         ColPartition* b1 = it1.data();
01999         if (b1 == this) {
02000           done_any = true;
02001           it.extract();
02002           a->RemovePartner(!upper, this);
02003           break;
02004         }
02005         ColPartition_C_IT it2(partners);
02006         for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
02007           ColPartition* b2 = it2.data();
02008           if (b1 == b2) {
02009             // Jackpot! b2 should not be a partner of this.
02010             it2.extract();
02011             b2->RemovePartner(!upper, this);
02012             done_any = true;
02013             // That potentially invalidated all the iterators, so break out
02014             // and start again.
02015             break;
02016           }
02017         }
02018         if (done_any)
02019           break;
02020       }
02021       if (done_any)
02022         break;
02023     }
02024   } while (done_any && !partners->empty() && !partners->singleton());
02025 }
02026 
02027 // Cleans up the partners above if upper is true, else below.
02028 // If multiple text partners can be merged, (with each other, NOT with this),
02029 // then do so.
02030 // If desperate is true, then an increase in overlap with the merge is
02031 // allowed. If the overlap increases, then the desperately_merged_ flag
02032 // is set, indicating that the textlines probably need to be regenerated
02033 // by aggressive line fitting/splitting, as there are probably vertically
02034 // joined blobs that cross textlines.
02035 void ColPartition::RefineTextPartnersByMerge(bool upper, bool desperate,
02036                                              ColPartition_CLIST* partners,
02037                                              ColPartitionGrid* grid) {
02038   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
02039                                          bounding_box_.bottom());
02040   if (debug) {
02041     tprintf("Refining %d %s partners by merge for:\n",
02042             partners->length(), upper ? "Upper" : "Lower");
02043     Print();
02044   }
02045   while (!partners->empty() && !partners->singleton()) {
02046     // Absorb will mess up the iterators, so we have to merge one partition
02047     // at a time and rebuild the iterators each time.
02048     ColPartition_C_IT it(partners);
02049     ColPartition* part = it.data();
02050     // Gather a list of merge candidates, from the list of partners, that
02051     // are all in the same single column. See general scenario comment above.
02052     ColPartition_CLIST candidates;
02053     ColPartition_C_IT cand_it(&candidates);
02054     for (it.forward(); !it.at_first(); it.forward()) {
02055       ColPartition* candidate = it.data();
02056       if (part->first_column_ == candidate->last_column_ &&
02057           part->last_column_ == candidate->first_column_)
02058         cand_it.add_after_then_move(it.data());
02059     }
02060     int overlap_increase;
02061     ColPartition* candidate = grid->BestMergeCandidate(part, &candidates, debug,
02062                                                        NULL, &overlap_increase);
02063     if (candidate != NULL && (overlap_increase <= 0 || desperate)) {
02064       if (debug) {
02065         tprintf("Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
02066                 part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),
02067                 overlap_increase);
02068       }
02069       // Remove before merge and re-insert to keep the integrity of the grid.
02070       grid->RemoveBBox(candidate);
02071       grid->RemoveBBox(part);
02072       part->Absorb(candidate, NULL);
02073       // We modified the box of part, so re-insert it into the grid.
02074       grid->InsertBBox(true, true, part);
02075       if (overlap_increase > 0)
02076         part->desperately_merged_ = true;
02077     } else {
02078       break;  // Can't merge.
02079     }
02080   }
02081 }
02082 
02083 // Cleans up the partners above if upper is true, else below.
02084 // Keep the partner with the biggest overlap.
02085 void ColPartition::RefinePartnersByOverlap(bool upper,
02086                                            ColPartition_CLIST* partners) {
02087   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
02088                                          bounding_box_.bottom());
02089   if (debug) {
02090     tprintf("Refining %d %s partners by overlap for:\n",
02091             partners->length(), upper ? "Upper" : "Lower");
02092     Print();
02093   }
02094   ColPartition_C_IT it(partners);
02095   ColPartition* best_partner = it.data();
02096   // Find the partner with the best overlap.
02097   int best_overlap = 0;
02098   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
02099     ColPartition* partner = it.data();
02100     int overlap = MIN(bounding_box_.right(), partner->bounding_box_.right())
02101                 - MAX(bounding_box_.left(), partner->bounding_box_.left());
02102     if (overlap > best_overlap) {
02103       best_overlap = overlap;
02104       best_partner = partner;
02105     }
02106   }
02107   // Keep only the best partner.
02108   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
02109     ColPartition* partner = it.data();
02110     if (partner != best_partner) {
02111       if (debug) {
02112         tprintf("Removing partner:");
02113         partner->Print();
02114       }
02115       partner->RemovePartner(!upper, this);
02116       it.extract();
02117     }
02118   }
02119 }
02120 
02121 // Return true if bbox belongs better in this than other.
02122 bool ColPartition::ThisPartitionBetter(BLOBNBOX* bbox,
02123                                        const ColPartition& other) {
02124   TBOX box = bbox->bounding_box();
02125   // Margins take priority.
02126   int left = box.left();
02127   int right = box.right();
02128   if (left < left_margin_ || right > right_margin_)
02129     return false;
02130   if (left < other.left_margin_ || right > other.right_margin_)
02131     return true;
02132   int top = box.top();
02133   int bottom = box.bottom();
02134   int this_overlap = MIN(top, median_top_) - MAX(bottom, median_bottom_);
02135   int other_overlap = MIN(top, other.median_top_) -
02136                       MAX(bottom, other.median_bottom_);
02137   int this_miss = median_top_ - median_bottom_ - this_overlap;
02138   int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
02139   if (TabFind::WithinTestRegion(3, box.left(), box.bottom())) {
02140     tprintf("Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
02141             box.left(), box.bottom(), box.right(), box.top(),
02142             this_overlap, other_overlap, this_miss, other_miss,
02143             median_top_, other.median_top_);
02144   }
02145   if (this_miss < other_miss)
02146     return true;
02147   if (this_miss > other_miss)
02148     return false;
02149   if (this_overlap > other_overlap)
02150     return true;
02151   if (this_overlap < other_overlap)
02152     return false;
02153   return median_top_ >= other.median_top_;
02154 }
02155 
02156 // Returns the median line-spacing between the current position and the end
02157 // of the list.
02158 // The iterator is passed by value so the iteration does not modify the
02159 // caller's iterator.
02160 static int MedianSpacing(int page_height, ColPartition_IT it) {
02161   STATS stats(0, page_height);
02162   while (!it.cycled_list()) {
02163     ColPartition* part = it.data();
02164     it.forward();
02165     stats.add(part->bottom_spacing(), 1);
02166     stats.add(part->top_spacing(), 1);
02167   }
02168   return static_cast<int>(stats.median() + 0.5);
02169 }
02170 
02171 // Returns true if this column partition is in the same column as
02172 // part. This function will only work after the SetPartitionType function
02173 // has been called on both column partitions. This is useful for
02174 // doing a SideSearch when you want things in the same page column.
02175 //
02176 // Currently called by the table detection code to identify if potential table
02177 // partitions exist in the same column.
02178 bool ColPartition::IsInSameColumnAs(const ColPartition& part) const {
02179   // Overlap does not occur when last < part.first or first > part.last.
02180   // In other words, one is completely to the side of the other.
02181   // This is just DeMorgan's law applied to that so the function returns true.
02182   return (last_column_ >= part.first_column_) &&
02183          (first_column_ <= part.last_column_);
02184 }
02185 
02186 // Smoothes the spacings in the list into groups of equal linespacing.
02187 // resolution is the resolution of the original image, used as a basis
02188 // for thresholds in change of spacing. page_height is in pixels.
02189 void ColPartition::SmoothSpacings(int resolution, int page_height,
02190                                   ColPartition_LIST* parts) {
02191   // The task would be trivial if we didn't have to allow for blips -
02192   // occasional offsets in spacing caused by anomalous text, such as all
02193   // caps, groups of descenders, joined words, Arabic etc.
02194   // The neighbourhood stores a consecutive group of partitions so that
02195   // blips can be detected correctly, yet conservatively enough to not
02196   // mistake genuine spacing changes for blips. See example below.
02197   ColPartition* neighbourhood[PN_COUNT];
02198   ColPartition_IT it(parts);
02199   it.mark_cycle_pt();
02200   // Although we know nothing about the spacings is this list, the median is
02201   // used as an approximation to allow blips.
02202   // If parts of this block aren't spaced to the median, then we can't
02203   // accept blips in those parts, but we'll recalculate it each time we
02204   // split the block, so the median becomes more likely to match all the text.
02205   int median_space = MedianSpacing(page_height, it);
02206   ColPartition_IT start_it(it);
02207   ColPartition_IT end_it(it);
02208   for (int i = 0; i < PN_COUNT; ++i) {
02209     if (i < PN_UPPER || it.cycled_list()) {
02210       neighbourhood[i] = NULL;
02211     } else {
02212       if (i == PN_LOWER)
02213         end_it = it;
02214       neighbourhood[i] = it.data();
02215       it.forward();
02216     }
02217   }
02218   while (neighbourhood[PN_UPPER] != NULL) {
02219     // Test for end of a group. Normally SpacingsEqual is true within a group,
02220     // but in the case of a blip, it will be false. Here is an example:
02221     // Line enum   Spacing below (spacing between tops of lines)
02222     //  1   ABOVE2    20
02223     //  2   ABOVE1    20
02224     //  3   UPPER     15
02225     //  4   LOWER     25
02226     //  5   BELOW1    20
02227     //  6   BELOW2    20
02228     // Line 4 is all in caps (regular caps), so the spacing between line 3
02229     // and line 4 (looking at the tops) is smaller than normal, and the
02230     // spacing between line 4 and line 5 is larger than normal, but the
02231     // two of them add to twice the normal spacing.
02232     // The following if has to accept unequal spacings 3 times to pass the
02233     // blip (20/15, 15/25 and 25/20)
02234     // When the blip is in the middle, OKSpacingBlip tests that one of
02235     // ABOVE1 and BELOW1 matches the median.
02236     // The first time, everything is shifted down 1, so we present
02237     // OKSpacingBlip with neighbourhood+1 and check that PN_UPPER is median.
02238     // The last time, everything is shifted up 1, so we present OKSpacingBlip
02239     // with neighbourhood-1 and check that PN_LOWER matches the median.
02240     if (neighbourhood[PN_LOWER] == NULL ||
02241         (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
02242                                                  resolution) &&
02243          !OKSpacingBlip(resolution, median_space, neighbourhood) &&
02244          (!OKSpacingBlip(resolution, median_space, neighbourhood - 1) ||
02245           !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
02246          (!OKSpacingBlip(resolution, median_space, neighbourhood + 1) ||
02247           !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
02248       // The group has ended. PN_UPPER is the last member.
02249       // Compute the mean spacing over the group.
02250       ColPartition_IT sum_it(start_it);
02251       ColPartition* last_part = neighbourhood[PN_UPPER];
02252       double total_bottom = 0.0;
02253       double total_top = 0.0;
02254       int total_count = 0;
02255       ColPartition* upper = sum_it.data();
02256       // We do not process last_part, as its spacing is different.
02257       while (upper != last_part) {
02258         total_bottom += upper->bottom_spacing();
02259         total_top += upper->top_spacing();
02260         ++total_count;
02261         sum_it.forward();
02262         upper = sum_it.data();
02263       }
02264       if (total_count > 0) {
02265         // There were at least 2 lines, so set them all to the mean.
02266         int top_spacing = static_cast<int>(total_top / total_count + 0.5);
02267         int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);
02268         if (textord_debug_tabfind) {
02269           tprintf("Spacing run ended. Cause:");
02270           if (neighbourhood[PN_LOWER] == NULL) {
02271             tprintf("No more lines\n");
02272           } else {
02273             tprintf("Spacing change. Spacings:\n");
02274             for (int i = 0; i < PN_COUNT; ++i) {
02275               if (neighbourhood[i] == NULL) {
02276                 tprintf("NULL");
02277                 if (i > 0 && neighbourhood[i - 1] != NULL) {
02278                   if (neighbourhood[i - 1]->SingletonPartner(false) != NULL) {
02279                     tprintf(" Lower partner:");
02280                     neighbourhood[i - 1]->SingletonPartner(false)->Print();
02281                   } else {
02282                     tprintf(" NULL lower partner:\n");
02283                   }
02284                 } else {
02285                   tprintf("\n");
02286                 }
02287               } else {
02288                 tprintf("Top = %d, bottom = %d\n",
02289                         neighbourhood[i]->top_spacing(),
02290                         neighbourhood[i]->bottom_spacing());
02291               }
02292             }
02293           }
02294           tprintf("Mean spacing = %d/%d\n", top_spacing, bottom_spacing);
02295         }
02296         sum_it = start_it;
02297         upper = sum_it.data();
02298         while (upper != last_part) {
02299           upper->set_top_spacing(top_spacing);
02300           upper->set_bottom_spacing(bottom_spacing);
02301           if (textord_debug_tabfind) {
02302             tprintf("Setting mean on:");
02303             upper->Print();
02304           }
02305           sum_it.forward();
02306           upper = sum_it.data();
02307         }
02308       }
02309       // PN_LOWER starts the next group and end_it is the next start_it.
02310       start_it = end_it;
02311       // Recalculate the median spacing to maximize the chances of detecting
02312       // spacing blips.
02313       median_space = MedianSpacing(page_height, end_it);
02314     }
02315     // Shuffle pointers.
02316     for (int j = 1; j < PN_COUNT; ++j) {
02317       neighbourhood[j - 1] = neighbourhood[j];
02318     }
02319     if (it.cycled_list()) {
02320       neighbourhood[PN_COUNT - 1] = NULL;
02321     } else {
02322       neighbourhood[PN_COUNT - 1] = it.data();
02323       it.forward();
02324     }
02325     end_it.forward();
02326   }
02327 }
02328 
02329 // Returns true if the parts array of pointers to partitions matches the
02330 // condition for a spacing blip. See SmoothSpacings for what this means
02331 // and how it is used.
02332 bool ColPartition::OKSpacingBlip(int resolution, int median_spacing,
02333                                  ColPartition** parts) {
02334   if (parts[PN_UPPER] == NULL || parts[PN_LOWER] == NULL)
02335     return false;
02336   // The blip is OK if upper and lower sum to an OK value and at least
02337   // one of above1 and below1 is equal to the median.
02338   return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER],
02339                                           median_spacing, resolution) &&
02340          ((parts[PN_ABOVE1] != NULL &&
02341            parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||
02342           (parts[PN_BELOW1] != NULL &&
02343            parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));
02344 }
02345 
02346 // Returns true if both the top and bottom spacings of this match the given
02347 // spacing to within suitable margins dictated by the image resolution.
02348 bool ColPartition::SpacingEqual(int spacing, int resolution) const {
02349   int bottom_error = BottomSpacingMargin(resolution);
02350   int top_error = TopSpacingMargin(resolution);
02351   return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&
02352          NearlyEqual(top_spacing_, spacing, top_error);
02353 }
02354 
02355 // Returns true if both the top and bottom spacings of this and other
02356 // match to within suitable margins dictated by the image resolution.
02357 bool ColPartition::SpacingsEqual(const ColPartition& other,
02358                                  int resolution) const {
02359   int bottom_error = MAX(BottomSpacingMargin(resolution),
02360                          other.BottomSpacingMargin(resolution));
02361   int top_error = MAX(TopSpacingMargin(resolution),
02362                       other.TopSpacingMargin(resolution));
02363   return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&
02364          (NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||
02365           NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
02366                       bottom_error));
02367 }
02368 
02369 // Returns true if the sum spacing of this and other match the given
02370 // spacing (or twice the given spacing) to within a suitable margin dictated
02371 // by the image resolution.
02372 bool ColPartition::SummedSpacingOK(const ColPartition& other,
02373                                    int spacing, int resolution) const {
02374   int bottom_error = MAX(BottomSpacingMargin(resolution),
02375                          other.BottomSpacingMargin(resolution));
02376   int top_error = MAX(TopSpacingMargin(resolution),
02377                       other.TopSpacingMargin(resolution));
02378   int bottom_total = bottom_spacing_ + other.bottom_spacing_;
02379   int top_total = top_spacing_ + other.top_spacing_;
02380   return (NearlyEqual(spacing, bottom_total, bottom_error) &&
02381           NearlyEqual(spacing, top_total, top_error)) ||
02382          (NearlyEqual(spacing * 2, bottom_total, bottom_error) &&
02383           NearlyEqual(spacing * 2, top_total, top_error));
02384 }
02385 
02386 // Returns a suitable spacing margin that can be applied to bottoms of
02387 // text lines, based on the resolution and the stored side_step_.
02388 int ColPartition::BottomSpacingMargin(int resolution) const {
02389   return static_cast<int>(kMaxSpacingDrift * resolution + 0.5) + side_step_;
02390 }
02391 
02392 // Returns a suitable spacing margin that can be applied to tops of
02393 // text lines, based on the resolution and the stored side_step_.
02394 int ColPartition::TopSpacingMargin(int resolution) const {
02395   return static_cast<int>(kMaxTopSpacingFraction * median_size_ + 0.5) +
02396          BottomSpacingMargin(resolution);
02397 }
02398 
02399 // Returns true if the median text sizes of this and other agree to within
02400 // a reasonable multiplicative factor.
02401 bool ColPartition::SizesSimilar(const ColPartition& other) const {
02402   return median_size_ <= other.median_size_ * kMaxSizeRatio &&
02403          other.median_size_ <= median_size_ * kMaxSizeRatio;
02404 }
02405 
02406 // Helper updates margin_left and margin_right, being the bounds of the left
02407 // margin of part of a block. Returns false and does not update the bounds if
02408 // this partition has a disjoint margin with the established margin.
02409 static bool UpdateLeftMargin(const ColPartition& part,
02410                              int* margin_left, int* margin_right) {
02411   const TBOX& part_box = part.bounding_box();
02412   int top = part_box.top();
02413   int bottom = part_box.bottom();
02414   int tl_key = part.SortKey(part.left_margin(), top);
02415   int tr_key = part.SortKey(part_box.left(), top);
02416   int bl_key = part.SortKey(part.left_margin(), bottom);
02417   int br_key = part.SortKey(part_box.left(), bottom);
02418   int left_key = MAX(tl_key, bl_key);
02419   int right_key = MIN(tr_key, br_key);
02420   if (left_key <= *margin_right && right_key >= *margin_left) {
02421     // This part is good - let's keep it.
02422     *margin_right = MIN(*margin_right, right_key);
02423     *margin_left = MAX(*margin_left, left_key);
02424     return true;
02425   }
02426   return false;
02427 }
02428 
02429 // Computes and returns in start, end a line segment formed from a
02430 // forwards-iterated group of left edges of partitions that satisfy the
02431 // condition that the intersection of the left margins is non-empty, ie the
02432 // rightmost left margin is to the left of the leftmost left bounding box edge.
02433 // On return the iterator is set to the start of the next run.
02434 void ColPartition::LeftEdgeRun(ColPartition_IT* part_it,
02435                                ICOORD* start, ICOORD* end) {
02436   ColPartition* part = part_it->data();
02437   ColPartition* start_part = part;
02438   int start_y = part->bounding_box_.top();
02439   if (!part_it->at_first()) {
02440     int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();
02441     if (prev_bottom < start_y)
02442       start_y = prev_bottom;
02443     else if (prev_bottom > start_y)
02444       start_y = (start_y + prev_bottom) / 2;
02445   }
02446   int end_y = part->bounding_box_.bottom();
02447   int margin_right = MAX_INT32;
02448   int margin_left = -MAX_INT32;
02449   UpdateLeftMargin(*part, &margin_left, &margin_right);
02450   do {
02451     part_it->forward();
02452     part = part_it->data();
02453   } while (!part_it->at_first() &&
02454            UpdateLeftMargin(*part, &margin_left, &margin_right));
02455   // The run ended. If we were pushed inwards, compute the next run and
02456   // extend it backwards into the run we just calculated to find the end of
02457   // this run that provides a tight box.
02458   int next_margin_right = MAX_INT32;
02459   int next_margin_left = -MAX_INT32;
02460   UpdateLeftMargin(*part, &next_margin_left, &next_margin_right);
02461   if (next_margin_left > margin_right) {
02462     ColPartition_IT next_it(*part_it);
02463     do {
02464       next_it.forward();
02465       part = next_it.data();
02466     } while (!next_it.at_first() &&
02467              UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
02468     // Now extend the next run backwards into the original run to get the
02469     // tightest fit.
02470     do {
02471       part_it->backward();
02472       part = part_it->data();
02473     } while (part != start_part &&
02474              UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
02475     part_it->forward();
02476   }
02477   // Now calculate the end_y.
02478   part = part_it->data_relative(-1);
02479   end_y = part->bounding_box_.bottom();
02480   if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y)
02481     end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;
02482   start->set_y(start_y);
02483   start->set_x(part->XAtY(margin_right, start_y));
02484   end->set_y(end_y);
02485   end->set_x(part->XAtY(margin_right, end_y));
02486   if (textord_debug_tabfind && !part_it->at_first())
02487     tprintf("Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
02488             start_y, end_y, part->XAtY(margin_left, end_y),
02489             end->x(), part->left_margin_, part->bounding_box_.left());
02490 }
02491 
02492 // Helper updates margin_left and margin_right, being the bounds of the right
02493 // margin of part of a block. Returns false and does not update the bounds if
02494 // this partition has a disjoint margin with the established margin.
02495 static bool UpdateRightMargin(const ColPartition& part,
02496                               int* margin_left, int* margin_right) {
02497   const TBOX& part_box = part.bounding_box();
02498   int top = part_box.top();
02499   int bottom = part_box.bottom();
02500   int tl_key = part.SortKey(part_box.right(), top);
02501   int tr_key = part.SortKey(part.right_margin(), top);
02502   int bl_key = part.SortKey(part_box.right(), bottom);
02503   int br_key = part.SortKey(part.right_margin(), bottom);
02504   int left_key = MAX(tl_key, bl_key);
02505   int right_key = MIN(tr_key, br_key);
02506   if (left_key <= *margin_right && right_key >= *margin_left) {
02507     // This part is good - let's keep it.
02508     *margin_right = MIN(*margin_right, right_key);
02509     *margin_left = MAX(*margin_left, left_key);
02510     return true;
02511   }
02512   return false;
02513 }
02514 
02515 // Computes and returns in start, end a line segment formed from a
02516 // backwards-iterated group of right edges of partitions that satisfy the
02517 // condition that the intersection of the right margins is non-empty, ie the
02518 // leftmost right margin is to the right of the rightmost right bounding box
02519 // edge.
02520 // On return the iterator is set to the start of the next run.
02521 void ColPartition::RightEdgeRun(ColPartition_IT* part_it,
02522                                 ICOORD* start, ICOORD* end) {
02523   ColPartition* part = part_it->data();
02524   ColPartition* start_part = part;
02525   int start_y = part->bounding_box_.bottom();
02526   if (!part_it->at_last()) {
02527     int next_y = part_it->data_relative(1)->bounding_box_.top();
02528     if (next_y > start_y)
02529       start_y = next_y;
02530     else if (next_y < start_y)
02531       start_y = (start_y + next_y) / 2;
02532   }
02533   int end_y = part->bounding_box_.top();
02534   int margin_right = MAX_INT32;
02535   int margin_left = -MAX_INT32;
02536   UpdateRightMargin(*part, &margin_left, &margin_right);
02537   do {
02538     part_it->backward();
02539     part = part_it->data();
02540   } while (!part_it->at_last() &&
02541            UpdateRightMargin(*part, &margin_left, &margin_right));
02542   // The run ended. If we were pushed inwards, compute the next run and
02543   // extend it backwards to find the end of this run for a tight box.
02544   int next_margin_right = MAX_INT32;
02545   int next_margin_left = -MAX_INT32;
02546   UpdateRightMargin(*part, &next_margin_left, &next_margin_right);
02547   if (next_margin_right < margin_left) {
02548     ColPartition_IT next_it(*part_it);
02549     do {
02550       next_it.backward();
02551       part = next_it.data();
02552     } while (!next_it.at_last() &&
02553              UpdateRightMargin(*part, &next_margin_left,
02554                                &next_margin_right));
02555     // Now extend the next run forwards into the original run to get the
02556     // tightest fit.
02557     do {
02558       part_it->forward();
02559       part = part_it->data();
02560     } while (part != start_part &&
02561              UpdateRightMargin(*part, &next_margin_left,
02562                                &next_margin_right));
02563     part_it->backward();
02564   }
02565   // Now calculate the end_y.
02566   part = part_it->data_relative(1);
02567   end_y = part->bounding_box().top();
02568   if (!part_it->at_last() &&
02569       part_it->data()->bounding_box_.bottom() > end_y)
02570     end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;
02571   start->set_y(start_y);
02572   start->set_x(part->XAtY(margin_left, start_y));
02573   end->set_y(end_y);
02574   end->set_x(part->XAtY(margin_left, end_y));
02575   if (textord_debug_tabfind && !part_it->at_last())
02576     tprintf("Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
02577             start_y, end_y, end->x(), part->XAtY(margin_right, end_y),
02578             part->bounding_box_.right(), part->right_margin_);
02579 }
02580 
02581 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines