tesseract  4.1.0
colpartition.cpp
Go to the documentation of this file.
1 // File: colpartition.cpp
3 // Description: Class to hold partitions of the page that correspond
4 // roughly to text lines.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "colpartition.h"
25 #include "colpartitiongrid.h"
26 #include "colpartitionset.h"
27 #include "detlinefit.h"
28 #include "dppoint.h"
29 #include "imagefind.h"
30 #include "workingpartset.h"
31 #include "host.h" // for NearlyEqual
32 
33 #include <algorithm>
34 
35 namespace tesseract {
36 
37 ELIST2IZE(ColPartition)
38 CLISTIZE(ColPartition)
39 
41 
42 // Maximum change in spacing (in inches) to ignore.
43 const double kMaxSpacingDrift = 1.0 / 72; // 1/72 is one point.
44 // Maximum fraction of line height used as an additional allowance
45 // for top spacing.
46 const double kMaxTopSpacingFraction = 0.25;
47 // What multiple of the largest line height should be used as an upper bound
48 // for whether lines are in the same text block?
49 const double kMaxSameBlockLineSpacing = 3;
50 // Maximum ratio of sizes for lines to be considered the same size.
51 const double kMaxSizeRatio = 1.5;
52 // Fraction of max of leader width and gap for max IQR of gaps.
53 const double kMaxLeaderGapFractionOfMax = 0.25;
54 // Fraction of min of leader width and gap for max IQR of gaps.
55 const double kMaxLeaderGapFractionOfMin = 0.5;
56 // Minimum number of blobs to be considered a leader.
57 const int kMinLeaderCount = 5;
58 // Minimum score for a STRONG_CHAIN textline.
59 const int kMinStrongTextValue = 6;
60 // Minimum score for a CHAIN textline.
61 const int kMinChainTextValue = 3;
62 // Minimum number of blobs for strong horizontal text lines.
64 // Minimum height (in image pixels) for strong horizontal text lines.
66 // Minimum aspect ratio for strong horizontal text lines.
68 // Maximum upper quartile error allowed on a baseline fit as a fraction
69 // of height.
70 const double kMaxBaselineError = 0.4375;
71 // Min coverage for a good baseline between vectors
72 const double kMinBaselineCoverage = 0.5;
73 // Max RMS color noise to compare colors.
74 const int kMaxRMSColorNoise = 128;
75 // Maximum distance to allow a partition color to be to use that partition
76 // in smoothing neighbouring types. This is a squared distance.
77 const int kMaxColorDistance = 900;
78 
79 // blob_type is the blob_region_type_ of the blobs in this partition.
80 // Vertical is the direction of logical vertical on the possibly skewed image.
81 ColPartition::ColPartition(BlobRegionType blob_type, const ICOORD& vertical)
82  : left_margin_(-INT32_MAX), right_margin_(INT32_MAX),
83  median_bottom_(INT32_MAX), median_top_(-INT32_MAX), median_height_(0),
84  median_left_(INT32_MAX), median_right_(-INT32_MAX), median_width_(0),
85  blob_type_(blob_type), flow_(BTFT_NONE), good_blob_score_(0),
86  good_width_(false), good_column_(false),
87  left_key_tab_(false), right_key_tab_(false),
88  left_key_(0), right_key_(0), type_(PT_UNKNOWN), vertical_(vertical),
89  working_set_(nullptr), last_add_was_vertical_(false), block_owned_(false),
90  desperately_merged_(false),
91  first_column_(-1), last_column_(-1), column_set_(nullptr),
92  side_step_(0), top_spacing_(0), bottom_spacing_(0),
93  type_before_table_(PT_UNKNOWN), inside_table_column_(false),
94  nearest_neighbor_above_(nullptr), nearest_neighbor_below_(nullptr),
95  space_above_(0), space_below_(0), space_to_left_(0), space_to_right_(0),
96  owns_blobs_(true) {
97  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
98 }
99 
100 // Constructs a fake ColPartition with a single fake BLOBNBOX, all made
101 // from a single TBOX.
102 // WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
103 // the ColPartition owns the BLOBNBOX!!!
104 // Call DeleteBoxes before deleting the ColPartition.
106  PolyBlockType block_type,
107  BlobRegionType blob_type,
108  BlobTextFlowType flow) {
109  ColPartition* part = new ColPartition(blob_type, ICOORD(0, 1));
110  part->set_type(block_type);
111  part->set_flow(flow);
112  part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));
113  part->set_left_margin(box.left());
114  part->set_right_margin(box.right());
115  part->SetBlobTypes();
116  part->ComputeLimits();
117  part->ClaimBoxes();
118  return part;
119 }
120 
121 // Constructs and returns a ColPartition with the given real BLOBNBOX,
122 // and sets it up to be a "big" partition (single-blob partition bigger
123 // than the surrounding text that may be a dropcap, two or more vertically
124 // touching characters, or some graphic element.
125 // If the given list is not nullptr, the partition is also added to the list.
127  ColPartition_LIST* big_part_list) {
128  box->set_owner(nullptr);
129  ColPartition* single = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
130  single->set_flow(BTFT_NONE);
131  single->AddBox(box);
132  single->ComputeLimits();
133  single->ClaimBoxes();
134  single->SetBlobTypes();
135  single->set_block_owned(true);
136  if (big_part_list != nullptr) {
137  ColPartition_IT part_it(big_part_list);
138  part_it.add_to_end(single);
139  }
140  return single;
141 }
142 
144  // Remove this as a partner of all partners, as we don't want them
145  // referring to a deleted object.
146  ColPartition_C_IT it(&upper_partners_);
147  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
148  it.data()->RemovePartner(false, this);
149  }
150  it.set_to_list(&lower_partners_);
151  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
152  it.data()->RemovePartner(true, this);
153  }
154 }
155 
156 // Constructs a fake ColPartition with no BLOBNBOXes to represent a
157 // horizontal or vertical line, given a type and a bounding box.
159  const ICOORD& vertical,
160  int left, int bottom,
161  int right, int top) {
162  auto* part = new ColPartition(blob_type, vertical);
163  part->bounding_box_ = TBOX(left, bottom, right, top);
164  part->median_bottom_ = bottom;
165  part->median_top_ = top;
166  part->median_height_ = top - bottom;
167  part->median_left_ = left;
168  part->median_right_ = right;
169  part->median_width_ = right - left;
170  part->left_key_ = part->BoxLeftKey();
171  part->right_key_ = part->BoxRightKey();
172  return part;
173 }
174 
175 
176 // Adds the given box to the partition, updating the partition bounds.
177 // The list of boxes in the partition is updated, ensuring that no box is
178 // recorded twice, and the boxes are kept in increasing left position.
180  TBOX box = bbox->bounding_box();
181  // Update the partition limits.
182  if (boxes_.length() == 0) {
183  bounding_box_ = box;
184  } else {
185  bounding_box_ += box;
186  }
187 
188  if (IsVerticalType()) {
189  if (!last_add_was_vertical_) {
190  boxes_.sort(SortByBoxBottom<BLOBNBOX>);
191  last_add_was_vertical_ = true;
192  }
193  boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>, true, bbox);
194  } else {
195  if (last_add_was_vertical_) {
196  boxes_.sort(SortByBoxLeft<BLOBNBOX>);
197  last_add_was_vertical_ = false;
198  }
199  boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);
200  }
201  if (!left_key_tab_)
202  left_key_ = BoxLeftKey();
203  if (!right_key_tab_)
204  right_key_ = BoxRightKey();
205  if (TabFind::WithinTestRegion(2, box.left(), box.bottom()))
206  tprintf("Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
207  box.left(), box.bottom(), box.right(), box.top(),
208  bounding_box_.left(), bounding_box_.right());
209 }
210 
211 // Removes the given box from the partition, updating the bounds.
213  BLOBNBOX_C_IT bb_it(&boxes_);
214  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
215  if (box == bb_it.data()) {
216  bb_it.extract();
217  ComputeLimits();
218  return;
219  }
220  }
221 }
222 
223 // Returns the tallest box in the partition, as measured perpendicular to the
224 // presumed flow of text.
226  BLOBNBOX* biggest = nullptr;
227  BLOBNBOX_C_IT bb_it(&boxes_);
228  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
229  BLOBNBOX* bbox = bb_it.data();
230  if (IsVerticalType()) {
231  if (biggest == nullptr ||
232  bbox->bounding_box().width() > biggest->bounding_box().width())
233  biggest = bbox;
234  } else {
235  if (biggest == nullptr ||
236  bbox->bounding_box().height() > biggest->bounding_box().height())
237  biggest = bbox;
238  }
239  }
240  return biggest;
241 }
242 
243 // Returns the bounding box excluding the given box.
245  TBOX result;
246  BLOBNBOX_C_IT bb_it(&boxes_);
247  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
248  if (box != bb_it.data()) {
249  result += bb_it.data()->bounding_box();
250  }
251  }
252  return result;
253 }
254 
255 // Claims the boxes in the boxes_list by marking them with a this owner
256 // pointer. If a box is already owned, then it must be owned by this.
258  BLOBNBOX_C_IT bb_it(&boxes_);
259  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
260  BLOBNBOX* bblob = bb_it.data();
261  ColPartition* other = bblob->owner();
262  if (other == nullptr) {
263  // Normal case: ownership is available.
264  bblob->set_owner(this);
265  } else {
266  ASSERT_HOST(other == this);
267  }
268  }
269 }
270 
271 // nullptr the owner of the blobs in this partition, so they can be deleted
272 // independently of the ColPartition.
274  BLOBNBOX_C_IT bb_it(&boxes_);
275  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
276  BLOBNBOX* bblob = bb_it.data();
277  ASSERT_HOST(bblob->owner() == this || bblob->owner() == nullptr);
278  bblob->set_owner(nullptr);
279  }
280 }
281 
282 // nullptr the owner of the blobs in this partition that are owned by this
283 // partition, so they can be deleted independently of the ColPartition.
284 // Any blobs that are not owned by this partition get to keep their owner
285 // without an assert failure.
287  BLOBNBOX_C_IT bb_it(&boxes_);
288  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
289  BLOBNBOX* bblob = bb_it.data();
290  if (bblob->owner() == this)
291  bblob->set_owner(nullptr);
292  }
293 }
294 
295 // Nulls the owner of the blobs in this partition that are owned by this
296 // partition and not leader blobs, removing them from the boxes_ list, thus
297 // turning this partition back to a leader partition if it contains a leader,
298 // or otherwise leaving it empty. Returns true if any boxes remain.
300  BLOBNBOX_C_IT bb_it(&boxes_);
301  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
302  BLOBNBOX* bblob = bb_it.data();
303  if (bblob->flow() != BTFT_LEADER) {
304  if (bblob->owner() == this) bblob->set_owner(nullptr);
305  bb_it.extract();
306  }
307  }
308  if (bb_it.empty()) return false;
309  flow_ = BTFT_LEADER;
310  ComputeLimits();
311  return true;
312 }
313 
314 // Delete the boxes that this partition owns.
316  // Although the boxes_ list is a C_LIST, in some cases it owns the
317  // BLOBNBOXes, as the ColPartition takes ownership from the grid,
318  // and the BLOBNBOXes own the underlying C_BLOBs.
319  for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
320  BLOBNBOX* bblob = bb_it.extract();
321  delete bblob->cblob();
322  delete bblob;
323  }
324 }
325 
326 // Reflects the partition in the y-axis, assuming that its blobs have
327 // already been done. Corrects only a limited part of the members, since
328 // this function is assumed to be used shortly after initial creation, which
329 // is before a lot of the members are used.
331  BLOBNBOX_CLIST reversed_boxes;
332  BLOBNBOX_C_IT reversed_it(&reversed_boxes);
333  // Reverse the order of the boxes_.
334  BLOBNBOX_C_IT bb_it(&boxes_);
335  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
336  reversed_it.add_before_then_move(bb_it.extract());
337  }
338  bb_it.add_list_after(&reversed_boxes);
339  ASSERT_HOST(!left_key_tab_ && !right_key_tab_);
340  int tmp = left_margin_;
341  left_margin_ = -right_margin_;
342  right_margin_ = -tmp;
343  ComputeLimits();
344 }
345 
346 // Returns true if this is a legal partition - meaning that the conditions
347 // left_margin <= bounding_box left
348 // left_key <= bounding box left key
349 // bounding box left <= bounding box right
350 // and likewise for right margin and key
351 // are all met.
353  if (bounding_box_.left() > bounding_box_.right()) {
354  if (textord_debug_bugs) {
355  tprintf("Bounding box invalid\n");
356  Print();
357  }
358  return false; // Bounding box invalid.
359  }
360  if (left_margin_ > bounding_box_.left() ||
361  right_margin_ < bounding_box_.right()) {
362  if (textord_debug_bugs) {
363  tprintf("Margins invalid\n");
364  Print();
365  }
366  return false; // Margins invalid.
367  }
368  if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {
369  if (textord_debug_bugs) {
370  tprintf("Key inside box: %d v %d or %d v %d\n",
371  left_key_, BoxLeftKey(), right_key_, BoxRightKey());
372  Print();
373  }
374  return false; // Keys inside the box.
375  }
376  return true;
377 }
378 
379 // Returns true if the left and right edges are approximately equal.
380 bool ColPartition::MatchingColumns(const ColPartition& other) const {
381  int y = (MidY() + other.MidY()) / 2;
382  if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,
383  LeftAtY(y) / kColumnWidthFactor, 1))
384  return false;
385  if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,
386  RightAtY(y) / kColumnWidthFactor, 1))
387  return false;
388  return true;
389 }
390 
391 // Returns true if the colors match for two text partitions.
393  if (color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise &&
394  other.color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise)
395  return false; // Too noisy.
396 
397  // Colors must match for other to count.
398  double d_this1_o = ImageFind::ColorDistanceFromLine(other.color1_,
399  other.color2_,
400  color1_);
401  double d_this2_o = ImageFind::ColorDistanceFromLine(other.color1_,
402  other.color2_,
403  color2_);
404  double d_o1_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
405  other.color1_);
406  double d_o2_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
407  other.color2_);
408 // All 4 distances must be small enough.
409  return d_this1_o < kMaxColorDistance && d_this2_o < kMaxColorDistance &&
410  d_o1_this < kMaxColorDistance && d_o2_this < kMaxColorDistance;
411 }
412 
413 // Returns true if the sizes match for two text partitions,
414 // taking orientation into account. See also SizesSimilar.
415 bool ColPartition::MatchingSizes(const ColPartition& other) const {
416  if (blob_type_ == BRT_VERT_TEXT || other.blob_type_ == BRT_VERT_TEXT)
417  return !TabFind::DifferentSizes(median_width_, other.median_width_);
418  else
419  return !TabFind::DifferentSizes(median_height_, other.median_height_);
420 }
421 
422 // Returns true if there is no tabstop violation in merging this and other.
424  if (bounding_box_.right() < other.bounding_box_.left() &&
425  bounding_box_.right() < other.LeftBlobRule())
426  return false;
427  if (other.bounding_box_.right() < bounding_box_.left() &&
428  other.bounding_box_.right() < LeftBlobRule())
429  return false;
430  if (bounding_box_.left() > other.bounding_box_.right() &&
431  bounding_box_.left() > other.RightBlobRule())
432  return false;
433  if (other.bounding_box_.left() > bounding_box_.right() &&
434  other.bounding_box_.left() > RightBlobRule())
435  return false;
436  return true;
437 }
438 
439 // Returns true if other has a similar stroke width to this.
441  double fractional_tolerance,
442  double constant_tolerance) const {
443  int match_count = 0;
444  int nonmatch_count = 0;
445  BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
446  BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST*>(&other.boxes_));
447  box_it.mark_cycle_pt();
448  other_it.mark_cycle_pt();
449  while (!box_it.cycled_list() && !other_it.cycled_list()) {
450  if (box_it.data()->MatchingStrokeWidth(*other_it.data(),
451  fractional_tolerance,
452  constant_tolerance))
453  ++match_count;
454  else
455  ++nonmatch_count;
456  box_it.forward();
457  other_it.forward();
458  }
459  return match_count > nonmatch_count;
460 }
461 
462 // Returns true if base is an acceptable diacritic base char merge
463 // with this as the diacritic.
464 // Returns true if:
465 // (1) this is a ColPartition containing only diacritics, and
466 // (2) the base characters indicated on the diacritics all believably lie
467 // within the text line of the candidate ColPartition.
469  bool debug) const {
470  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
471  int min_top = INT32_MAX;
472  int max_bottom = -INT32_MAX;
473  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
474  BLOBNBOX* blob = it.data();
475  if (!blob->IsDiacritic()) {
476  if (debug) {
477  tprintf("Blob is not a diacritic:");
478  blob->bounding_box().print();
479  }
480  return false; // All blobs must have diacritic bases.
481  }
482  if (blob->base_char_top() < min_top)
483  min_top = blob->base_char_top();
484  if (blob->base_char_bottom() > max_bottom)
485  max_bottom = blob->base_char_bottom();
486  }
487  // If the intersection of all vertical ranges of all base characters
488  // overlaps the median range of this, then it is OK.
489  bool result = min_top > candidate.median_bottom_ &&
490  max_bottom < candidate.median_top_;
491  if (debug) {
492  if (result)
493  tprintf("OKDiacritic!\n");
494  else
495  tprintf("y ranges don\'t overlap: %d-%d / %d-%d\n",
496  max_bottom, min_top, median_bottom_, median_top_);
497  }
498  return result;
499 }
500 
501 // Sets the sort key using either the tab vector, or the bounding box if
502 // the tab vector is nullptr. If the tab_vector lies inside the bounding_box,
503 // use the edge of the box as a key any way.
504 void ColPartition::SetLeftTab(const TabVector* tab_vector) {
505  if (tab_vector != nullptr) {
506  left_key_ = tab_vector->sort_key();
507  left_key_tab_ = left_key_ <= BoxLeftKey();
508  } else {
509  left_key_tab_ = false;
510  }
511  if (!left_key_tab_)
512  left_key_ = BoxLeftKey();
513 }
514 
515 // As SetLeftTab, but with the right.
516 void ColPartition::SetRightTab(const TabVector* tab_vector) {
517  if (tab_vector != nullptr) {
518  right_key_ = tab_vector->sort_key();
519  right_key_tab_ = right_key_ >= BoxRightKey();
520  } else {
521  right_key_tab_ = false;
522  }
523  if (!right_key_tab_)
524  right_key_ = BoxRightKey();
525 }
526 
527 // Copies the left/right tab from the src partition, but if take_box is
528 // true, copies the box instead and uses that as a key.
529 void ColPartition::CopyLeftTab(const ColPartition& src, bool take_box) {
530  left_key_tab_ = take_box ? false : src.left_key_tab_;
531  if (left_key_tab_) {
532  left_key_ = src.left_key_;
533  } else {
534  bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));
535  left_key_ = BoxLeftKey();
536  }
537  if (left_margin_ > bounding_box_.left())
538  left_margin_ = src.left_margin_;
539 }
540 
541 // As CopyLeftTab, but with the right.
542 void ColPartition::CopyRightTab(const ColPartition& src, bool take_box) {
543  right_key_tab_ = take_box ? false : src.right_key_tab_;
544  if (right_key_tab_) {
545  right_key_ = src.right_key_;
546  } else {
547  bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));
548  right_key_ = BoxRightKey();
549  }
550  if (right_margin_ < bounding_box_.right())
551  right_margin_ = src.right_margin_;
552 }
553 
554 // Returns the left rule line x coord of the leftmost blob.
556  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
557  return it.data()->left_rule();
558 }
559 // Returns the right rule line x coord of the rightmost blob.
561  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
562  it.move_to_last();
563  return it.data()->right_rule();
564 }
565 
567  ASSERT_HOST(type < BSTT_COUNT);
568  return special_blobs_densities_[type];
569 }
570 
572  ASSERT_HOST(type < BSTT_COUNT);
573  BLOBNBOX_C_IT blob_it(&boxes_);
574  int count = 0;
575  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
576  BLOBNBOX* blob = blob_it.data();
578  if (blob_type == type) {
579  count++;
580  }
581  }
582 
583  return count;
584 }
585 
587  const BlobSpecialTextType type, const float density) {
588  ASSERT_HOST(type < BSTT_COUNT);
589  special_blobs_densities_[type] = density;
590 }
591 
593  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
594  if (boxes_.empty()) {
595  return;
596  }
597 
598  BLOBNBOX_C_IT blob_it(&boxes_);
599  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
600  BLOBNBOX* blob = blob_it.data();
602  special_blobs_densities_[type]++;
603  }
604 
605  for (float& special_blobs_density : special_blobs_densities_) {
606  special_blobs_density /= boxes_.length();
607  }
608 }
609 
610 // Add a partner above if upper, otherwise below.
611 // Add them uniquely and keep the list sorted by box left.
612 // Partnerships are added symmetrically to partner and this.
613 void ColPartition::AddPartner(bool upper, ColPartition* partner) {
614  if (upper) {
615  partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
616  true, this);
617  upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
618  } else {
619  partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
620  true, this);
621  lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
622  }
623 }
624 
625 // Removes the partner from this, but does not remove this from partner.
626 // This asymmetric removal is so as not to mess up the iterator that is
627 // working on partner's partner list.
628 void ColPartition::RemovePartner(bool upper, ColPartition* partner) {
629  ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
630  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
631  if (it.data() == partner) {
632  it.extract();
633  break;
634  }
635  }
636 }
637 
638 // Returns the partner if the given partner is a singleton, otherwise nullptr.
640  ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
641  if (!partners->singleton())
642  return nullptr;
643  ColPartition_C_IT it(partners);
644  return it.data();
645 }
646 
647 // Merge with the other partition and delete it.
649  // The result has to either own all of the blobs or none of them.
650  // Verify the flag is consistent.
651  ASSERT_HOST(owns_blobs() == other->owns_blobs());
652  // TODO(nbeato): check owns_blobs better. Right now owns_blobs
653  // should always be true when this is called. So there is no issues.
654  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
655  bounding_box_.bottom()) ||
656  TabFind::WithinTestRegion(2, other->bounding_box_.left(),
657  other->bounding_box_.bottom())) {
658  tprintf("Merging:");
659  Print();
660  other->Print();
661  }
662 
663  // Update the special_blobs_densities_.
664  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
665  for (int type = 0; type < BSTT_COUNT; ++type) {
666  unsigned w1 = boxes_.length();
667  unsigned w2 = other->boxes_.length();
668  float new_val = special_blobs_densities_[type] * w1 +
669  other->special_blobs_densities_[type] * w2;
670  if (!w1 || !w2) {
671  ASSERT_HOST((w1 + w2) > 0);
672  special_blobs_densities_[type] = new_val / (w1 + w2);
673  }
674  }
675 
676  // Merge the two sorted lists.
677  BLOBNBOX_C_IT it(&boxes_);
678  BLOBNBOX_C_IT it2(&other->boxes_);
679  for (; !it2.empty(); it2.forward()) {
680  BLOBNBOX* bbox2 = it2.extract();
681  ColPartition* prev_owner = bbox2->owner();
682  if (prev_owner != other && prev_owner != nullptr) {
683  // A blob on other's list is owned by someone else; let them have it.
684  continue;
685  }
686  ASSERT_HOST(prev_owner == other || prev_owner == nullptr);
687  if (prev_owner == other)
688  bbox2->set_owner(this);
689  it.add_to_end(bbox2);
690  }
691  left_margin_ = std::min(left_margin_, other->left_margin_);
692  right_margin_ = std::max(right_margin_, other->right_margin_);
693  if (other->left_key_ < left_key_) {
694  left_key_ = other->left_key_;
695  left_key_tab_ = other->left_key_tab_;
696  }
697  if (other->right_key_ > right_key_) {
698  right_key_ = other->right_key_;
699  right_key_tab_ = other->right_key_tab_;
700  }
701  // Combine the flow and blob_type in a sensible way.
702  // Dominant flows stay.
703  if (!DominatesInMerge(flow_, other->flow_)) {
704  flow_ = other->flow_;
705  blob_type_ = other->blob_type_;
706  }
707  SetBlobTypes();
708  if (IsVerticalType()) {
709  boxes_.sort(SortByBoxBottom<BLOBNBOX>);
710  last_add_was_vertical_ = true;
711  } else {
712  boxes_.sort(SortByBoxLeft<BLOBNBOX>);
713  last_add_was_vertical_ = false;
714  }
715  ComputeLimits();
716  // Fix partner lists. other is going away, so remove it as a
717  // partner of all its partners and add this in its place.
718  for (int upper = 0; upper < 2; ++upper) {
719  ColPartition_CLIST partners;
720  ColPartition_C_IT part_it(&partners);
721  part_it.add_list_after(upper ? &other->upper_partners_
722  : &other->lower_partners_);
723  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
724  ColPartition* partner = part_it.extract();
725  partner->RemovePartner(!upper, other);
726  partner->RemovePartner(!upper, this);
727  partner->AddPartner(!upper, this);
728  }
729  }
730  delete other;
731  if (cb != nullptr) {
732  SetColumnGoodness(cb);
733  }
734 }
735 
736 // Merge1 and merge2 are candidates to be merged, yet their combined box
737 // overlaps this. Is that allowed?
738 // Returns true if the overlap between this and the merged pair of
739 // merge candidates is sufficiently trivial to be allowed.
740 // The merged box can graze the edge of this by the ok_box_overlap
741 // if that exceeds the margin to the median top and bottom.
742 // ok_box_overlap should be set by the caller appropriate to the sizes of
743 // the text involved, and is usually a fraction of the median size of merge1
744 // and/or merge2, or this.
745 // TODO(rays) Determine whether vertical text needs to be considered.
747  const ColPartition& merge2,
748  int ok_box_overlap, bool debug) {
749  // Vertical partitions are not allowed to be involved.
750  if (IsVerticalType() || merge1.IsVerticalType() || merge2.IsVerticalType()) {
751  if (debug)
752  tprintf("Vertical partition\n");
753  return false;
754  }
755  // The merging partitions must strongly overlap each other.
756  if (!merge1.VSignificantCoreOverlap(merge2)) {
757  if (debug)
758  tprintf("Voverlap %d (%d)\n",
759  merge1.VCoreOverlap(merge2),
760  merge1.VSignificantCoreOverlap(merge2));
761  return false;
762  }
763  // The merged box must not overlap the median bounds of this.
764  TBOX merged_box(merge1.bounding_box());
765  merged_box += merge2.bounding_box();
766  if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&
767  merged_box.bottom() < bounding_box_.top() - ok_box_overlap &&
768  merged_box.top() > bounding_box_.bottom() + ok_box_overlap) {
769  if (debug)
770  tprintf("Excessive box overlap\n");
771  return false;
772  }
773  // Looks OK!
774  return true;
775 }
776 
777 // Find the blob at which to split this to minimize the overlap with the
778 // given box. Returns the first blob to go in the second partition.
780  if (boxes_.empty() || boxes_.singleton())
781  return nullptr;
782  BLOBNBOX_C_IT it(&boxes_);
783  TBOX left_box(it.data()->bounding_box());
784  for (it.forward(); !it.at_first(); it.forward()) {
785  BLOBNBOX* bbox = it.data();
786  left_box += bbox->bounding_box();
787  if (left_box.overlap(box))
788  return bbox;
789  }
790  return nullptr;
791 }
792 
793 // Split this partition keeping the first half in this and returning
794 // the second half.
795 // Splits by putting the split_blob and the blobs that follow
796 // in the second half, and the rest in the first half.
798  ColPartition* split_part = ShallowCopy();
799  split_part->set_owns_blobs(owns_blobs());
800  BLOBNBOX_C_IT it(&boxes_);
801  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
802  BLOBNBOX* bbox = it.data();
803  ColPartition* prev_owner = bbox->owner();
804  ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == nullptr);
805  if (bbox == split_blob || !split_part->boxes_.empty()) {
806  split_part->AddBox(it.extract());
807  if (owns_blobs() && prev_owner != nullptr)
808  bbox->set_owner(split_part);
809  }
810  }
811  ASSERT_HOST(!it.empty());
812  if (split_part->IsEmpty()) {
813  // Split part ended up with nothing. Possible if split_blob is not
814  // in the list of blobs.
815  delete split_part;
816  return nullptr;
817  }
818  right_key_tab_ = false;
819  split_part->left_key_tab_ = false;
820  ComputeLimits();
821  // TODO(nbeato) Merge Ray's CL like this:
822  // if (owns_blobs())
823  // SetBlobTextlineGoodness();
824  split_part->ComputeLimits();
825  // TODO(nbeato) Merge Ray's CL like this:
826  // if (split_part->owns_blobs())
827  // split_part->SetBlobTextlineGoodness();
828  return split_part;
829 }
830 
831 // Split this partition at the given x coordinate, returning the right
832 // half and keeping the left half in this.
834  if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right())
835  return nullptr; // There will be no change.
836  ColPartition* split_part = ShallowCopy();
837  split_part->set_owns_blobs(owns_blobs());
838  BLOBNBOX_C_IT it(&boxes_);
839  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
840  BLOBNBOX* bbox = it.data();
841  ColPartition* prev_owner = bbox->owner();
842  ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == nullptr);
843  const TBOX& box = bbox->bounding_box();
844  if (box.left() >= split_x) {
845  split_part->AddBox(it.extract());
846  if (owns_blobs() && prev_owner != nullptr)
847  bbox->set_owner(split_part);
848  }
849  }
850  if (it.empty()) {
851  // Possible if split-x passes through the first blob.
852  it.add_list_after(&split_part->boxes_);
853  }
854  ASSERT_HOST(!it.empty());
855  if (split_part->IsEmpty()) {
856  // Split part ended up with nothing. Possible if split_x passes
857  // through the last blob.
858  delete split_part;
859  return nullptr;
860  }
861  right_key_tab_ = false;
862  split_part->left_key_tab_ = false;
863  right_margin_ = split_x;
864  split_part->left_margin_ = split_x;
865  ComputeLimits();
866  split_part->ComputeLimits();
867  return split_part;
868 }
869 
870 // Recalculates all the coordinate limits of the partition.
872  bounding_box_ = TBOX(); // Clear it
873  BLOBNBOX_C_IT it(&boxes_);
874  BLOBNBOX* bbox = nullptr;
875  int non_leader_count = 0;
876  if (it.empty()) {
877  bounding_box_.set_left(left_margin_);
878  bounding_box_.set_right(right_margin_);
879  bounding_box_.set_bottom(0);
880  bounding_box_.set_top(0);
881  } else {
882  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
883  bbox = it.data();
884  bounding_box_ += bbox->bounding_box();
885  if (bbox->flow() != BTFT_LEADER)
886  ++non_leader_count;
887  }
888  }
889  if (!left_key_tab_)
890  left_key_ = BoxLeftKey();
891  if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
892  // TODO(rays) investigate the causes of these error messages, to find
893  // out if they are genuinely harmful, or just indicative of junk input.
894  tprintf("Computed left-illegal partition\n");
895  Print();
896  }
897  if (!right_key_tab_)
898  right_key_ = BoxRightKey();
899  if (right_key_ < BoxRightKey() && textord_debug_bugs) {
900  tprintf("Computed right-illegal partition\n");
901  Print();
902  }
903  if (it.empty())
904  return;
905  if (IsImageType() || blob_type() == BRT_RECTIMAGE ||
906  blob_type() == BRT_POLYIMAGE) {
907  median_top_ = bounding_box_.top();
908  median_bottom_ = bounding_box_.bottom();
909  median_height_ = bounding_box_.height();
910  median_left_ = bounding_box_.left();
911  median_right_ = bounding_box_.right();
912  median_width_ = bounding_box_.width();
913  } else {
914  STATS top_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
915  STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
916  STATS height_stats(0, bounding_box_.height() + 1);
917  STATS left_stats(bounding_box_.left(), bounding_box_.right() + 1);
918  STATS right_stats(bounding_box_.left(), bounding_box_.right() + 1);
919  STATS width_stats(0, bounding_box_.width() + 1);
920  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
921  bbox = it.data();
922  if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) {
923  const TBOX& box = bbox->bounding_box();
924  int area = box.area();
925  top_stats.add(box.top(), area);
926  bottom_stats.add(box.bottom(), area);
927  height_stats.add(box.height(), area);
928  left_stats.add(box.left(), area);
929  right_stats.add(box.right(), area);
930  width_stats.add(box.width(), area);
931  }
932  }
933  median_top_ = static_cast<int>(top_stats.median() + 0.5);
934  median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
935  median_height_ = static_cast<int>(height_stats.median() + 0.5);
936  median_left_ = static_cast<int>(left_stats.median() + 0.5);
937  median_right_ = static_cast<int>(right_stats.median() + 0.5);
938  median_width_ = static_cast<int>(width_stats.median() + 0.5);
939  }
940 
941  if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
942  tprintf("Made partition with bad right coords");
943  Print();
944  }
945  if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
946  tprintf("Made partition with bad left coords");
947  Print();
948  }
949  // Fix partner lists. The bounding box has changed and partners are stored
950  // in bounding box order, so remove and reinsert this as a partner
951  // of all its partners.
952  for (int upper = 0; upper < 2; ++upper) {
953  ColPartition_CLIST partners;
954  ColPartition_C_IT part_it(&partners);
955  part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
956  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
957  ColPartition* partner = part_it.extract();
958  partner->RemovePartner(!upper, this);
959  partner->AddPartner(!upper, this);
960  }
961  }
962  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
963  bounding_box_.bottom())) {
964  tprintf("Recomputed box for partition %p\n", this);
965  Print();
966  }
967 }
968 
969 // Returns the number of boxes that overlap the given box.
971  BLOBNBOX_C_IT it(&boxes_);
972  int overlap_count = 0;
973  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
974  BLOBNBOX* bbox = it.data();
975  if (box.overlap(bbox->bounding_box()))
976  ++overlap_count;
977  }
978  return overlap_count;
979 }
980 
981 // Computes and sets the type_ and first_column_, last_column_ and column_set_.
982 // resolution refers to the ppi resolution of the image.
983 void ColPartition::SetPartitionType(int resolution, ColPartitionSet* columns) {
984  int first_spanned_col = -1;
985  ColumnSpanningType span_type =
986  columns->SpanningType(resolution,
987  bounding_box_.left(), bounding_box_.right(),
988  std::min(bounding_box_.height(), bounding_box_.width()),
989  MidY(), left_margin_, right_margin_,
990  &first_column_, &last_column_,
991  &first_spanned_col);
992  column_set_ = columns;
993  if (first_column_ < last_column_ && span_type == CST_PULLOUT &&
994  !IsLineType()) {
995  // Unequal columns may indicate that the pullout spans one of the columns
996  // it lies in, so force it to be allocated to just that column.
997  if (first_spanned_col >= 0) {
998  first_column_ = first_spanned_col;
999  last_column_ = first_spanned_col;
1000  } else {
1001  if ((first_column_ & 1) == 0)
1002  last_column_ = first_column_;
1003  else if ((last_column_ & 1) == 0)
1004  first_column_ = last_column_;
1005  else
1006  first_column_ = last_column_ = (first_column_ + last_column_) / 2;
1007  }
1008  }
1009  type_ = PartitionType(span_type);
1010 }
1011 
1012 // Returns the PartitionType from the current BlobRegionType and a column
1013 // flow spanning type ColumnSpanningType, generated by
1014 // ColPartitionSet::SpanningType, that indicates how the partition sits
1015 // in the columns.
1017  if (flow == CST_NOISE) {
1018  if (blob_type_ != BRT_HLINE && blob_type_ != BRT_VLINE &&
1019  blob_type_ != BRT_RECTIMAGE && blob_type_ != BRT_VERT_TEXT)
1020  return PT_NOISE;
1021  flow = CST_FLOWING;
1022  }
1023 
1024  switch (blob_type_) {
1025  case BRT_NOISE:
1026  return PT_NOISE;
1027  case BRT_HLINE:
1028  return PT_HORZ_LINE;
1029  case BRT_VLINE:
1030  return PT_VERT_LINE;
1031  case BRT_RECTIMAGE:
1032  case BRT_POLYIMAGE:
1033  switch (flow) {
1034  case CST_FLOWING:
1035  return PT_FLOWING_IMAGE;
1036  case CST_HEADING:
1037  return PT_HEADING_IMAGE;
1038  case CST_PULLOUT:
1039  return PT_PULLOUT_IMAGE;
1040  default:
1041  ASSERT_HOST(!"Undefined flow type for image!");
1042  }
1043  break;
1044  case BRT_VERT_TEXT:
1045  return PT_VERTICAL_TEXT;
1046  case BRT_TEXT:
1047  case BRT_UNKNOWN:
1048  default:
1049  switch (flow) {
1050  case CST_FLOWING:
1051  return PT_FLOWING_TEXT;
1052  case CST_HEADING:
1053  return PT_HEADING_TEXT;
1054  case CST_PULLOUT:
1055  return PT_PULLOUT_TEXT;
1056  default:
1057  ASSERT_HOST(!"Undefined flow type for text!");
1058  }
1059  }
1060  ASSERT_HOST(!"Should never get here!");
1061  return PT_NOISE;
1062 }
1063 
1064 // Returns the first and last column touched by this partition.
1065 // resolution refers to the ppi resolution of the image.
1066 void ColPartition::ColumnRange(int resolution, ColPartitionSet* columns,
1067  int* first_col, int* last_col) {
1068  int first_spanned_col = -1;
1069  ColumnSpanningType span_type =
1070  columns->SpanningType(resolution,
1071  bounding_box_.left(), bounding_box_.right(),
1072  std::min(bounding_box_.height(), bounding_box_.width()),
1073  MidY(), left_margin_, right_margin_,
1074  first_col, last_col,
1075  &first_spanned_col);
1076  type_ = PartitionType(span_type);
1077 }
1078 
1079 // Sets the internal flags good_width_ and good_column_.
1081  int y = MidY();
1082  int width = RightAtY(y) - LeftAtY(y);
1083  good_width_ = cb->Run(width);
1084  good_column_ = blob_type_ == BRT_TEXT && left_key_tab_ && right_key_tab_;
1085 }
1086 
1087 // Determines whether the blobs in this partition mostly represent
1088 // a leader (fixed pitch sequence) and sets the member blobs accordingly.
1089 // Note that height is assumed to have been tested elsewhere, and that this
1090 // function will find most fixed-pitch text as leader without a height filter.
1091 // Leader detection is limited to sequences of identical width objects,
1092 // such as .... or ----, so patterns, such as .-.-.-.-. will not be found.
1094  bool result = false;
1095  // Gather statistics on the gaps between blobs and the widths of the blobs.
1096  int part_width = bounding_box_.width();
1097  STATS gap_stats(0, part_width);
1098  STATS width_stats(0, part_width);
1099  BLOBNBOX_C_IT it(&boxes_);
1100  BLOBNBOX* prev_blob = it.data();
1101  prev_blob->set_flow(BTFT_NEIGHBOURS);
1102  width_stats.add(prev_blob->bounding_box().width(), 1);
1103  int blob_count = 1;
1104  for (it.forward(); !it.at_first(); it.forward()) {
1105  BLOBNBOX* blob = it.data();
1106  int left = blob->bounding_box().left();
1107  int right = blob->bounding_box().right();
1108  gap_stats.add(left - prev_blob->bounding_box().right(), 1);
1109  width_stats.add(right - left, 1);
1110  blob->set_flow(BTFT_NEIGHBOURS);
1111  prev_blob = blob;
1112  ++blob_count;
1113  }
1114  double median_gap = gap_stats.median();
1115  double median_width = width_stats.median();
1116  double max_width = std::max(median_gap, median_width);
1117  double min_width = std::min(median_gap, median_width);
1118  double gap_iqr = gap_stats.ile(0.75f) - gap_stats.ile(0.25f);
1119  if (textord_debug_tabfind >= 4) {
1120  tprintf("gap iqr = %g, blob_count=%d, limits=%g,%g\n",
1121  gap_iqr, blob_count, max_width * kMaxLeaderGapFractionOfMax,
1122  min_width * kMaxLeaderGapFractionOfMin);
1123  }
1124  if (gap_iqr < max_width * kMaxLeaderGapFractionOfMax &&
1125  gap_iqr < min_width * kMaxLeaderGapFractionOfMin &&
1126  blob_count >= kMinLeaderCount) {
1127  // This is stable enough to be called a leader, so check the widths.
1128  // Since leader dashes can join, run a dp cutting algorithm and go
1129  // on the cost.
1130  int offset = static_cast<int>(ceil(gap_iqr * 2));
1131  int min_step = static_cast<int>(median_gap + median_width + 0.5);
1132  int max_step = min_step + offset;
1133  min_step -= offset;
1134  // Pad the buffer with min_step/2 on each end.
1135  int part_left = bounding_box_.left() - min_step / 2;
1136  part_width += min_step;
1137  auto* projection = new DPPoint[part_width];
1138  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1139  BLOBNBOX* blob = it.data();
1140  int left = blob->bounding_box().left();
1141  int right = blob->bounding_box().right();
1142  int height = blob->bounding_box().height();
1143  for (int x = left; x < right; ++x) {
1144  projection[left - part_left].AddLocalCost(height);
1145  }
1146  }
1147  DPPoint* best_end = DPPoint::Solve(min_step, max_step, false,
1149  part_width, projection);
1150  if (best_end != nullptr && best_end->total_cost() < blob_count) {
1151  // Good enough. Call it a leader.
1152  result = true;
1153  bool modified_blob_list = false;
1154  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1155  BLOBNBOX* blob = it.data();
1156  // If the first or last blob is spaced too much, don't mark it.
1157  if (it.at_first()) {
1158  int gap = it.data_relative(1)->bounding_box().left() -
1159  blob->bounding_box().right();
1160  if (blob->bounding_box().width() + gap > max_step) {
1161  it.extract();
1162  modified_blob_list = true;
1163  continue;
1164  }
1165  }
1166  if (it.at_last()) {
1167  int gap = blob->bounding_box().left() -
1168  it.data_relative(-1)->bounding_box().right();
1169  if (blob->bounding_box().width() + gap > max_step) {
1170  it.extract();
1171  modified_blob_list = true;
1172  break;
1173  }
1174  }
1175  blob->set_region_type(BRT_TEXT);
1176  blob->set_flow(BTFT_LEADER);
1177  }
1178  if (modified_blob_list) ComputeLimits();
1179  blob_type_ = BRT_TEXT;
1180  flow_ = BTFT_LEADER;
1181  } else if (textord_debug_tabfind) {
1182  if (best_end == nullptr) {
1183  tprintf("No path\n");
1184  } else {
1185  tprintf("Total cost = %d vs allowed %d\n", best_end->total_cost(),
1186  blob_count);
1187  }
1188  }
1189  delete [] projection;
1190  }
1191  return result;
1192 }
1193 
1194 // Given the result of TextlineProjection::EvaluateColPartition, (positive for
1195 // horizontal text, negative for vertical text, and near zero for non-text),
1196 // sets the blob_type_ and flow_ for this partition to indicate whether it
1197 // is strongly or weakly vertical or horizontal text, or non-text.
1198 // The function assumes that the blob neighbours are valid (from
1199 // StrokeWidth::SetNeighbours) and that those neighbours have their
1200 // region_type() set.
1202  int blob_count = 0; // Total # blobs.
1203  int good_blob_score_ = 0; // Total # good strokewidth neighbours.
1204  int noisy_count = 0; // Total # neighbours marked as noise.
1205  int hline_count = 0;
1206  int vline_count = 0;
1207  BLOBNBOX_C_IT it(&boxes_);
1208  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1209  BLOBNBOX* blob = it.data();
1210  ++blob_count;
1211  noisy_count += blob->NoisyNeighbours();
1212  good_blob_score_ += blob->GoodTextBlob();
1213  if (blob->region_type() == BRT_HLINE) ++hline_count;
1214  if (blob->region_type() == BRT_VLINE) ++vline_count;
1215  }
1216  flow_ = BTFT_NEIGHBOURS;
1217  blob_type_ = BRT_UNKNOWN;
1218  if (hline_count > vline_count) {
1219  flow_ = BTFT_NONE;
1220  blob_type_ = BRT_HLINE;
1221  } else if (vline_count > hline_count) {
1222  flow_ = BTFT_NONE;
1223  blob_type_ = BRT_VLINE;
1224  } else if (value < -1 || 1 < value) {
1225  int long_side;
1226  int short_side;
1227  if (value > 0) {
1228  long_side = bounding_box_.width();
1229  short_side = bounding_box_.height();
1230  blob_type_ = BRT_TEXT;
1231  } else {
1232  long_side = bounding_box_.height();
1233  short_side = bounding_box_.width();
1234  blob_type_ = BRT_VERT_TEXT;
1235  }
1236  // We will combine the old metrics using aspect ratio and blob counts
1237  // with the input value by allowing a strong indication to flip the
1238  // STRONG_CHAIN/CHAIN flow values.
1239  int strong_score = blob_count >= kHorzStrongTextlineCount ? 1 : 0;
1240  if (short_side > kHorzStrongTextlineHeight) ++strong_score;
1241  if (short_side * kHorzStrongTextlineAspect < long_side) ++strong_score;
1242  if (abs(value) >= kMinStrongTextValue)
1243  flow_ = BTFT_STRONG_CHAIN;
1244  else if (abs(value) >= kMinChainTextValue)
1245  flow_ = BTFT_CHAIN;
1246  else
1247  flow_ = BTFT_NEIGHBOURS;
1248  // Upgrade chain to strong chain if the other indicators are good
1249  if (flow_ == BTFT_CHAIN && strong_score == 3)
1250  flow_ = BTFT_STRONG_CHAIN;
1251  // Downgrade strong vertical text to chain if the indicators are bad.
1252  if (flow_ == BTFT_STRONG_CHAIN && value < 0 && strong_score < 2)
1253  flow_ = BTFT_CHAIN;
1254  }
1255  if (flow_ == BTFT_NEIGHBOURS) {
1256  // Check for noisy neighbours.
1257  if (noisy_count >= blob_count) {
1258  flow_ = BTFT_NONTEXT;
1259  blob_type_= BRT_NOISE;
1260  }
1261  }
1262  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
1263  bounding_box_.bottom())) {
1264  tprintf("RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,",
1265  blob_count, noisy_count, good_blob_score_);
1266  tprintf(" Projection value=%d, flow=%d, blob_type=%d\n",
1267  value, flow_, blob_type_);
1268  Print();
1269  }
1270  SetBlobTypes();
1271 }
1272 
1273 // Sets all blobs with the partition blob type and flow, but never overwrite
1274 // leader blobs, as we need to be able to identify them later.
1276  if (!owns_blobs())
1277  return;
1278  BLOBNBOX_C_IT it(&boxes_);
1279  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1280  BLOBNBOX* blob = it.data();
1281  if (blob->flow() != BTFT_LEADER)
1282  blob->set_flow(flow_);
1283  blob->set_region_type(blob_type_);
1284  ASSERT_HOST(blob->owner() == nullptr || blob->owner() == this);
1285  }
1286 }
1287 
1288 // Returns true if a decent baseline can be fitted through the blobs.
1289 // Works for both horizontal and vertical text.
1291  // Approximation of the baseline.
1292  DetLineFit linepoints;
1293  // Calculation of the mean height on this line segment. Note that these
1294  // variable names apply to the context of a horizontal line, and work
1295  // analogously, rather than literally in the case of a vertical line.
1296  int total_height = 0;
1297  int coverage = 0;
1298  int height_count = 0;
1299  int width = 0;
1300  BLOBNBOX_C_IT it(&boxes_);
1301  TBOX box(it.data()->bounding_box());
1302  // Accumulate points representing the baseline at the middle of each blob,
1303  // but add an additional point for each end of the line. This makes it
1304  // harder to fit a severe skew angle, as it is most likely not right.
1305  if (IsVerticalType()) {
1306  // For a vertical line, use the right side as the baseline.
1307  ICOORD first_pt(box.right(), box.bottom());
1308  // Use the bottom-right of the first (bottom) box, the top-right of the
1309  // last, and the middle-right of all others.
1310  linepoints.Add(first_pt);
1311  for (it.forward(); !it.at_last(); it.forward()) {
1312  BLOBNBOX* blob = it.data();
1313  box = blob->bounding_box();
1314  ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
1315  linepoints.Add(box_pt);
1316  total_height += box.width();
1317  coverage += box.height();
1318  ++height_count;
1319  }
1320  box = it.data()->bounding_box();
1321  ICOORD last_pt(box.right(), box.top());
1322  linepoints.Add(last_pt);
1323  width = last_pt.y() - first_pt.y();
1324 
1325  } else {
1326  // Horizontal lines use the bottom as the baseline.
1327  TBOX box(it.data()->bounding_box());
1328  // Use the bottom-left of the first box, the the bottom-right of the last,
1329  // and the middle of all others.
1330  ICOORD first_pt(box.left(), box.bottom());
1331  linepoints.Add(first_pt);
1332  for (it.forward(); !it.at_last(); it.forward()) {
1333  BLOBNBOX* blob = it.data();
1334  box = blob->bounding_box();
1335  ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
1336  linepoints.Add(box_pt);
1337  total_height += box.height();
1338  coverage += box.width();
1339  ++height_count;
1340  }
1341  box = it.data()->bounding_box();
1342  ICOORD last_pt(box.right(), box.bottom());
1343  linepoints.Add(last_pt);
1344  width = last_pt.x() - first_pt.x();
1345  }
1346  // Maximum median error allowed to be a good text line.
1347  if (height_count == 0)
1348  return false;
1349  double max_error = kMaxBaselineError * total_height / height_count;
1350  ICOORD start_pt, end_pt;
1351  double error = linepoints.Fit(&start_pt, &end_pt);
1352  return error < max_error && coverage >= kMinBaselineCoverage * width;
1353 }
1354 
1355 // Adds this ColPartition to a matching WorkingPartSet if one can be found,
1356 // otherwise starts a new one in the appropriate column, ending the previous.
1357 void ColPartition::AddToWorkingSet(const ICOORD& bleft, const ICOORD& tright,
1358  int resolution,
1359  ColPartition_LIST* used_parts,
1360  WorkingPartSet_LIST* working_sets) {
1361  if (block_owned_)
1362  return; // Done it already.
1363  block_owned_ = true;
1364  WorkingPartSet_IT it(working_sets);
1365  // If there is an upper partner use its working_set_ directly.
1366  ColPartition* partner = SingletonPartner(true);
1367  if (partner != nullptr && partner->working_set_ != nullptr) {
1368  working_set_ = partner->working_set_;
1369  working_set_->AddPartition(this);
1370  return;
1371  }
1372  if (partner != nullptr && textord_debug_bugs) {
1373  tprintf("Partition with partner has no working set!:");
1374  Print();
1375  partner->Print();
1376  }
1377  // Search for the column that the left edge fits in.
1378  WorkingPartSet* work_set = nullptr;
1379  it.move_to_first();
1380  int col_index = 0;
1381  for (it.mark_cycle_pt(); !it.cycled_list() &&
1382  col_index != first_column_;
1383  it.forward(), ++col_index);
1384  if (textord_debug_tabfind >= 2) {
1385  tprintf("Match is %s for:", (col_index & 1) ? "Real" : "Between");
1386  Print();
1387  }
1388  if (it.cycled_list() && textord_debug_bugs) {
1389  tprintf("Target column=%d, only had %d\n", first_column_, col_index);
1390  }
1391  ASSERT_HOST(!it.cycled_list());
1392  work_set = it.data();
1393  // If last_column_ != first_column, then we need to scoop up all blocks
1394  // between here and the last_column_ and put back in work_set.
1395  if (!it.cycled_list() && last_column_ != first_column_ && !IsPulloutType()) {
1396  // Find the column that the right edge falls in.
1397  BLOCK_LIST completed_blocks;
1398  TO_BLOCK_LIST to_blocks;
1399  for (; !it.cycled_list() && col_index <= last_column_;
1400  it.forward(), ++col_index) {
1401  WorkingPartSet* end_set = it.data();
1402  end_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,
1403  &completed_blocks, &to_blocks);
1404  }
1405  work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
1406  }
1407  working_set_ = work_set;
1408  work_set->AddPartition(this);
1409 }
1410 
1411 // From the given block_parts list, builds one or more BLOCKs and
1412 // corresponding TO_BLOCKs, such that the line spacing is uniform in each.
1413 // Created blocks are appended to the end of completed_blocks and to_blocks.
1414 // The used partitions are put onto used_parts, as they may still be referred
1415 // to in the partition grid. bleft, tright and resolution are the bounds
1416 // and resolution of the original image.
1417 void ColPartition::LineSpacingBlocks(const ICOORD& bleft, const ICOORD& tright,
1418  int resolution,
1419  ColPartition_LIST* block_parts,
1420  ColPartition_LIST* used_parts,
1421  BLOCK_LIST* completed_blocks,
1422  TO_BLOCK_LIST* to_blocks) {
1423  int page_height = tright.y() - bleft.y();
1424  // Compute the initial spacing stats.
1425  ColPartition_IT it(block_parts);
1426  int part_count = 0;
1427  int max_line_height = 0;
1428 
1429  // TODO(joeliu): We should add some special logic for PT_INLINE_EQUATION type
1430  // because their line spacing with their neighbors maybe smaller and their
1431  // height may be slightly larger.
1432 
1433  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1434  ColPartition* part = it.data();
1435  ASSERT_HOST(!part->boxes()->empty());
1436  STATS side_steps(0, part->bounding_box().height());
1437  if (part->bounding_box().height() > max_line_height)
1438  max_line_height = part->bounding_box().height();
1439  BLOBNBOX_C_IT blob_it(part->boxes());
1440  int prev_bottom = blob_it.data()->bounding_box().bottom();
1441  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1442  BLOBNBOX* blob = blob_it.data();
1443  int bottom = blob->bounding_box().bottom();
1444  int step = bottom - prev_bottom;
1445  if (step < 0)
1446  step = -step;
1447  side_steps.add(step, 1);
1448  prev_bottom = bottom;
1449  }
1450  part->set_side_step(static_cast<int>(side_steps.median() + 0.5));
1451  if (!it.at_last()) {
1452  ColPartition* next_part = it.data_relative(1);
1453  part->set_bottom_spacing(part->median_bottom() -
1454  next_part->median_bottom());
1455  part->set_top_spacing(part->median_top() - next_part->median_top());
1456  } else {
1457  part->set_bottom_spacing(page_height);
1458  part->set_top_spacing(page_height);
1459  }
1460  if (textord_debug_tabfind) {
1461  part->Print();
1462  tprintf("side step = %.2f, top spacing = %d, bottom spacing=%d\n",
1463  side_steps.median(), part->top_spacing(), part->bottom_spacing());
1464  }
1465  ++part_count;
1466  }
1467  if (part_count == 0)
1468  return;
1469 
1470  SmoothSpacings(resolution, page_height, block_parts);
1471 
1472  // Move the partitions into individual block lists and make the blocks.
1473  BLOCK_IT block_it(completed_blocks);
1474  TO_BLOCK_IT to_block_it(to_blocks);
1475  ColPartition_LIST spacing_parts;
1476  ColPartition_IT sp_block_it(&spacing_parts);
1477  int same_block_threshold = max_line_height * kMaxSameBlockLineSpacing;
1478  for (it.mark_cycle_pt(); !it.empty();) {
1479  ColPartition* part = it.extract();
1480  sp_block_it.add_to_end(part);
1481  it.forward();
1482  if (it.empty() || part->bottom_spacing() > same_block_threshold ||
1483  !part->SpacingsEqual(*it.data(), resolution)) {
1484  // There is a spacing boundary. Check to see if it.data() belongs
1485  // better in the current block or the next one.
1486  if (!it.empty() && part->bottom_spacing() <= same_block_threshold) {
1487  ColPartition* next_part = it.data();
1488  // If there is a size match one-way, then the middle line goes with
1489  // its matched size, otherwise it goes with the smallest spacing.
1490  ColPartition* third_part = it.at_last() ? nullptr : it.data_relative(1);
1491  if (textord_debug_tabfind) {
1492  tprintf("Spacings unequal: upper:%d/%d, lower:%d/%d,"
1493  " sizes %d %d %d\n",
1494  part->top_spacing(), part->bottom_spacing(),
1495  next_part->top_spacing(), next_part->bottom_spacing(),
1496  part->median_height(), next_part->median_height(),
1497  third_part != nullptr ? third_part->median_height() : 0);
1498  }
1499  // We can only consider adding the next line to the block if the sizes
1500  // match and the lines are close enough for their size.
1501  if (part->SizesSimilar(*next_part) &&
1502  next_part->median_height() * kMaxSameBlockLineSpacing >
1503  part->bottom_spacing() &&
1504  part->median_height() * kMaxSameBlockLineSpacing >
1505  part->top_spacing()) {
1506  // Even now, we can only add it as long as the third line doesn't
1507  // match in the same way and have a smaller bottom spacing.
1508  if (third_part == nullptr ||
1509  !next_part->SizesSimilar(*third_part) ||
1510  third_part->median_height() * kMaxSameBlockLineSpacing <=
1511  next_part->bottom_spacing() ||
1512  next_part->median_height() * kMaxSameBlockLineSpacing <=
1513  next_part->top_spacing() ||
1514  next_part->bottom_spacing() > part->bottom_spacing()) {
1515  // Add to the current block.
1516  sp_block_it.add_to_end(it.extract());
1517  it.forward();
1518  if (textord_debug_tabfind) {
1519  tprintf("Added line to current block.\n");
1520  }
1521  }
1522  }
1523  }
1524  TO_BLOCK* to_block = MakeBlock(bleft, tright, &spacing_parts, used_parts);
1525  if (to_block != nullptr) {
1526  to_block_it.add_to_end(to_block);
1527  block_it.add_to_end(to_block->block);
1528  }
1529  sp_block_it.set_to_list(&spacing_parts);
1530  } else {
1531  if (textord_debug_tabfind && !it.empty()) {
1532  ColPartition* next_part = it.data();
1533  tprintf("Spacings equal: upper:%d/%d, lower:%d/%d, median:%d/%d\n",
1534  part->top_spacing(), part->bottom_spacing(),
1535  next_part->top_spacing(), next_part->bottom_spacing(),
1536  part->median_height(), next_part->median_height());
1537  }
1538  }
1539  }
1540 }
1541 
1542 // Helper function to clip the input pos to the given bleft, tright bounds.
1543 static void ClipCoord(const ICOORD& bleft, const ICOORD& tright, ICOORD* pos) {
1544  if (pos->x() < bleft.x())
1545  pos->set_x(bleft.x());
1546  if (pos->x() > tright.x())
1547  pos->set_x(tright.x());
1548  if (pos->y() < bleft.y())
1549  pos->set_y(bleft.y());
1550  if (pos->y() > tright.y())
1551  pos->set_y(tright.y());
1552 }
1553 
1554 // Helper moves the blobs from the given list of block_parts into the block
1555 // itself. Sets up the block for (old) textline formation correctly for
1556 // vertical and horizontal text. The partitions are moved to used_parts
1557 // afterwards, as they cannot be deleted yet.
1558 static TO_BLOCK* MoveBlobsToBlock(bool vertical_text, int line_spacing,
1559  BLOCK* block,
1560  ColPartition_LIST* block_parts,
1561  ColPartition_LIST* used_parts) {
1562  // Make a matching TO_BLOCK and put all the BLOBNBOXes from the parts in it.
1563  // Move all the parts to a done list as they are no longer needed, except
1564  // that have have to continue to exist until the part grid is deleted.
1565  // Compute the median blob size as we go, as the block needs to know.
1566  TBOX block_box(block->pdblk.bounding_box());
1567  STATS sizes(0, std::max(block_box.width(), block_box.height()));
1568  bool text_type = block->pdblk.poly_block()->IsText();
1569  ColPartition_IT it(block_parts);
1570  auto* to_block = new TO_BLOCK(block);
1571  BLOBNBOX_IT blob_it(&to_block->blobs);
1572  ColPartition_IT used_it(used_parts);
1573  for (it.move_to_first(); !it.empty(); it.forward()) {
1574  ColPartition* part = it.extract();
1575  // Transfer blobs from all regions to the output blocks.
1576  // Blobs for non-text regions will be used to define the polygonal
1577  // bounds of the region.
1578  for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty();
1579  bb_it.forward()) {
1580  BLOBNBOX* bblob = bb_it.extract();
1581  if (bblob->owner() != part) {
1582  tprintf("Ownership incorrect for blob:");
1583  bblob->bounding_box().print();
1584  tprintf("Part=");
1585  part->Print();
1586  if (bblob->owner() == nullptr) {
1587  tprintf("Not owned\n");
1588  } else {
1589  tprintf("Owner part:");
1590  bblob->owner()->Print();
1591  }
1592  }
1593  ASSERT_HOST(bblob->owner() == part);
1594  // Assert failure here is caused by arbitrarily changing the partition
1595  // type without also changing the blob type, such as in
1596  // InsertSmallBlobsAsUnknowns.
1597  ASSERT_HOST(!text_type || bblob->region_type() >= BRT_UNKNOWN);
1598  C_OUTLINE_LIST* outlines = bblob->cblob()->out_list();
1599  C_OUTLINE_IT ol_it(outlines);
1600  ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);
1601  if (vertical_text)
1602  sizes.add(bblob->bounding_box().width(), 1);
1603  else
1604  sizes.add(bblob->bounding_box().height(), 1);
1605  blob_it.add_after_then_move(bblob);
1606  }
1607  used_it.add_to_end(part);
1608  }
1609  if (text_type && blob_it.empty()) {
1610  delete block;
1611  delete to_block;
1612  return nullptr;
1613  }
1614  to_block->line_size = sizes.median();
1615  if (vertical_text) {
1616  int block_width = block->pdblk.bounding_box().width();
1617  if (block_width < line_spacing)
1618  line_spacing = block_width;
1619  to_block->line_spacing = static_cast<float>(line_spacing);
1620  to_block->max_blob_size = static_cast<float>(block_width + 1);
1621  } else {
1622  int block_height = block->pdblk.bounding_box().height();
1623  if (block_height < line_spacing)
1624  line_spacing = block_height;
1625  to_block->line_spacing = static_cast<float>(line_spacing);
1626  to_block->max_blob_size = static_cast<float>(block_height + 1);
1627  }
1628  return to_block;
1629 }
1630 
1631 // Constructs a block from the given list of partitions.
1632 // Arguments are as LineSpacingBlocks above.
1633 TO_BLOCK* ColPartition::MakeBlock(const ICOORD& bleft, const ICOORD& tright,
1634  ColPartition_LIST* block_parts,
1635  ColPartition_LIST* used_parts) {
1636  if (block_parts->empty())
1637  return nullptr; // Nothing to do.
1638  // If the block_parts are not in reading order, then it will make an invalid
1639  // block polygon and bounding_box, so sort by bounding box now just to make
1640  // sure.
1641  block_parts->sort(&ColPartition::SortByBBox);
1642  ColPartition_IT it(block_parts);
1643  ColPartition* part = it.data();
1644  PolyBlockType type = part->type();
1645  if (type == PT_VERTICAL_TEXT)
1646  return MakeVerticalTextBlock(bleft, tright, block_parts, used_parts);
1647  // LineSpacingBlocks has handed us a collection of evenly spaced lines and
1648  // put the average spacing in each partition, so we can just take the
1649  // linespacing from the first partition.
1650  int line_spacing = part->bottom_spacing();
1651  if (line_spacing < part->median_height())
1652  line_spacing = part->bounding_box().height();
1653  ICOORDELT_LIST vertices;
1654  ICOORDELT_IT vert_it(&vertices);
1655  ICOORD start, end;
1656  int min_x = INT32_MAX;
1657  int max_x = -INT32_MAX;
1658  int min_y = INT32_MAX;
1659  int max_y = -INT32_MAX;
1660  int iteration = 0;
1661  do {
1662  if (iteration == 0)
1663  ColPartition::LeftEdgeRun(&it, &start, &end);
1664  else
1665  ColPartition::RightEdgeRun(&it, &start, &end);
1666  ClipCoord(bleft, tright, &start);
1667  ClipCoord(bleft, tright, &end);
1668  vert_it.add_after_then_move(new ICOORDELT(start));
1669  vert_it.add_after_then_move(new ICOORDELT(end));
1670  UpdateRange(start.x(), &min_x, &max_x);
1671  UpdateRange(end.x(), &min_x, &max_x);
1672  UpdateRange(start.y(), &min_y, &max_y);
1673  UpdateRange(end.y(), &min_y, &max_y);
1674  if ((iteration == 0 && it.at_first()) ||
1675  (iteration == 1 && it.at_last())) {
1676  ++iteration;
1677  it.move_to_last();
1678  }
1679  } while (iteration < 2);
1681  tprintf("Making block at (%d,%d)->(%d,%d)\n",
1682  min_x, min_y, max_x, max_y);
1683  auto* block = new BLOCK("", true, 0, 0, min_x, min_y, max_x, max_y);
1684  block->pdblk.set_poly_block(new POLY_BLOCK(&vertices, type));
1685  return MoveBlobsToBlock(false, line_spacing, block, block_parts, used_parts);
1686 }
1687 
1688 // Constructs a block from the given list of vertical text partitions.
1689 // Currently only creates rectangular blocks.
1691  const ICOORD& tright,
1692  ColPartition_LIST* block_parts,
1693  ColPartition_LIST* used_parts) {
1694  if (block_parts->empty())
1695  return nullptr; // Nothing to do.
1696  ColPartition_IT it(block_parts);
1697  ColPartition* part = it.data();
1698  TBOX block_box = part->bounding_box();
1699  int line_spacing = block_box.width();
1700  PolyBlockType type = it.data()->type();
1701  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1702  block_box += it.data()->bounding_box();
1703  }
1704  if (textord_debug_tabfind) {
1705  tprintf("Making block at:");
1706  block_box.print();
1707  }
1708  auto* block = new BLOCK("", true, 0, 0, block_box.left(), block_box.bottom(),
1709  block_box.right(), block_box.top());
1710  block->pdblk.set_poly_block(new POLY_BLOCK(block_box, type));
1711  return MoveBlobsToBlock(true, line_spacing, block, block_parts, used_parts);
1712 }
1713 
1714 // Makes a TO_ROW matching this and moves all the blobs to it, transferring
1715 // ownership to to returned TO_ROW.
1717  BLOBNBOX_C_IT blob_it(&boxes_);
1718  TO_ROW* row = nullptr;
1719  int line_size = IsVerticalType() ? median_width_ : median_height_;
1720  // Add all the blobs to a single TO_ROW.
1721  for (; !blob_it.empty(); blob_it.forward()) {
1722  BLOBNBOX* blob = blob_it.extract();
1723 // blob->compute_bounding_box();
1724  int top = blob->bounding_box().top();
1725  int bottom = blob->bounding_box().bottom();
1726  if (row == nullptr) {
1727  row = new TO_ROW(blob, static_cast<float>(top),
1728  static_cast<float>(bottom),
1729  static_cast<float>(line_size));
1730  } else {
1731  row->add_blob(blob, static_cast<float>(top),
1732  static_cast<float>(bottom),
1733  static_cast<float>(line_size));
1734  }
1735  }
1736  return row;
1737 }
1738 
1739 // Returns a copy of everything except the list of boxes. The resulting
1740 // ColPartition is only suitable for keeping in a column candidate list.
1742  auto* part = new ColPartition(blob_type_, vertical_);
1743  part->left_margin_ = left_margin_;
1744  part->right_margin_ = right_margin_;
1745  part->bounding_box_ = bounding_box_;
1746  memcpy(part->special_blobs_densities_, special_blobs_densities_,
1747  sizeof(special_blobs_densities_));
1748  part->median_bottom_ = median_bottom_;
1749  part->median_top_ = median_top_;
1750  part->median_height_ = median_height_;
1751  part->median_left_ = median_left_;
1752  part->median_right_ = median_right_;
1753  part->median_width_ = median_width_;
1754  part->good_width_ = good_width_;
1755  part->good_column_ = good_column_;
1756  part->left_key_tab_ = left_key_tab_;
1757  part->right_key_tab_ = right_key_tab_;
1758  part->type_ = type_;
1759  part->flow_ = flow_;
1760  part->left_key_ = left_key_;
1761  part->right_key_ = right_key_;
1762  part->first_column_ = first_column_;
1763  part->last_column_ = last_column_;
1764  part->owns_blobs_ = false;
1765  return part;
1766 }
1767 
1769  ColPartition* copy = ShallowCopy();
1770  copy->set_owns_blobs(false);
1771  BLOBNBOX_C_IT inserter(copy->boxes());
1772  BLOBNBOX_C_IT traverser(boxes());
1773  for (traverser.mark_cycle_pt(); !traverser.cycled_list(); traverser.forward())
1774  inserter.add_after_then_move(traverser.data());
1775  return copy;
1776 }
1777 
1778 #ifndef GRAPHICS_DISABLED
1779 // Provides a color for BBGrid to draw the rectangle.
1780 // Must be kept in sync with PolyBlockType.
1782  if (type_ == PT_UNKNOWN)
1783  return BLOBNBOX::TextlineColor(blob_type_, flow_);
1784  return POLY_BLOCK::ColorForPolyBlockType(type_);
1785 }
1786 #endif // GRAPHICS_DISABLED
1787 
1788 // Keep in sync with BlobRegionType.
1789 static char kBlobTypes[BRT_COUNT + 1] = "NHSRIUVT";
1790 
1791 // Prints debug information on this.
1792 void ColPartition::Print() const {
1793  int y = MidY();
1794  tprintf("ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)"
1795  " w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d"
1796  " ts=%d bs=%d ls=%d rs=%d\n",
1797  boxes_.empty() ? 'E' : ' ',
1798  left_margin_, left_key_tab_ ? 'T' : 'B', LeftAtY(y),
1799  bounding_box_.left(), median_left_,
1800  bounding_box_.bottom(), median_bottom_,
1801  bounding_box_.right(), RightAtY(y), right_key_tab_ ? 'T' : 'B',
1802  right_margin_, median_right_, bounding_box_.top(), median_top_,
1803  good_width_, good_column_, type_,
1804  kBlobTypes[blob_type_], flow_,
1805  first_column_, last_column_, boxes_.length(),
1806  space_above_, space_below_, space_to_left_, space_to_right_);
1807 }
1808 
1809 // Prints debug information on the colors.
1811  tprintf("Colors:(%d, %d, %d)%d -> (%d, %d, %d)\n",
1812  color1_[COLOR_RED], color1_[COLOR_GREEN], color1_[COLOR_BLUE],
1813  color1_[L_ALPHA_CHANNEL],
1814  color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);
1815 }
1816 
1817 // Sets the types of all partitions in the run to be the max of the types.
1818 void ColPartition::SmoothPartnerRun(int working_set_count) {
1819  STATS left_stats(0, working_set_count);
1820  STATS right_stats(0, working_set_count);
1821  PolyBlockType max_type = type_;
1822  ColPartition* partner;
1823  for (partner = SingletonPartner(false); partner != nullptr;
1824  partner = partner->SingletonPartner(false)) {
1825  if (partner->type_ > max_type)
1826  max_type = partner->type_;
1827  if (column_set_ == partner->column_set_) {
1828  left_stats.add(partner->first_column_, 1);
1829  right_stats.add(partner->last_column_, 1);
1830  }
1831  }
1832  type_ = max_type;
1833  // TODO(rays) Either establish that it isn't necessary to set the columns,
1834  // or find a way to do it that does not cause an assert failure in
1835  // AddToWorkingSet.
1836 #if 0
1837  first_column_ = left_stats.mode();
1838  last_column_ = right_stats.mode();
1839  if (last_column_ < first_column_)
1840  last_column_ = first_column_;
1841 #endif
1842 
1843  for (partner = SingletonPartner(false); partner != nullptr;
1844  partner = partner->SingletonPartner(false)) {
1845  partner->type_ = max_type;
1846 #if 0 // See TODO above
1847  if (column_set_ == partner->column_set_) {
1848  partner->first_column_ = first_column_;
1849  partner->last_column_ = last_column_;
1850  }
1851 #endif
1852  }
1853 }
1854 
1855 // ======= Scenario common to all Refine*Partners* functions =======
1856 // ColPartitions are aiming to represent textlines, or horizontal slices
1857 // of images, and we are trying to form bi-directional (upper/lower) chains
1858 // of UNIQUE partner ColPartitions that can be made into blocks.
1859 // The ColPartitions have previously been typed (see SetPartitionType)
1860 // according to a combination of the content type and
1861 // how they lie on the columns. We want to chain text into
1862 // groups of a single type, but image ColPartitions may have been typed
1863 // differently in different parts of the image, due to being non-rectangular.
1864 //
1865 // We previously ran a search for upper and lower partners, but there may
1866 // be more than one, and they may be of mixed types, so now we wish to
1867 // refine the partners down to at most one.
1868 // A heading may have multiple partners:
1869 // ===============================
1870 // ======== ========== =========
1871 // ======== ========== =========
1872 // but it should be a different type.
1873 // A regular flowing text line may have multiple partners:
1874 // ================== ===================
1875 // ======= ================= ===========
1876 // This could be the start of a pull-out, or it might all be in a single
1877 // column and might be caused by tightly spaced text, bold words, bullets,
1878 // funny punctuation etc, all of which can cause textlines to be split into
1879 // multiple ColPartitions. Pullouts and figure captions should now be different
1880 // types so we can more aggressively merge groups of partners that all sit
1881 // in a single column.
1882 //
1883 // Cleans up the partners of the given type so that there is at most
1884 // one partner. This makes block creation simpler.
1885 // If get_desperate is true, goes to more desperate merge methods
1886 // to merge flowing text before breaking partnerships.
1888  ColPartitionGrid* grid) {
1889  if (TypesSimilar(type_, type)) {
1890  RefinePartnersInternal(true, get_desperate, grid);
1891  RefinePartnersInternal(false, get_desperate, grid);
1892  } else if (type == PT_COUNT) {
1893  // This is the final pass. Make sure only the correctly typed
1894  // partners surivive, however many there are.
1895  RefinePartnersByType(true, &upper_partners_);
1896  RefinePartnersByType(false, &lower_partners_);
1897  // It is possible for a merge to have given a partition multiple
1898  // partners again, so the last resort is to use overlap which is
1899  // guaranteed to leave at most one partner left.
1900  if (!upper_partners_.empty() && !upper_partners_.singleton())
1901  RefinePartnersByOverlap(true, &upper_partners_);
1902  if (!lower_partners_.empty() && !lower_partners_.singleton())
1903  RefinePartnersByOverlap(false, &lower_partners_);
1904  }
1905 }
1906 
1908 
1909 // Cleans up the partners above if upper is true, else below.
1910 // If get_desperate is true, goes to more desperate merge methods
1911 // to merge flowing text before breaking partnerships.
1912 void ColPartition::RefinePartnersInternal(bool upper, bool get_desperate,
1913  ColPartitionGrid* grid) {
1914  ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
1915  if (!partners->empty() && !partners->singleton()) {
1916  RefinePartnersByType(upper, partners);
1917  if (!partners->empty() && !partners->singleton()) {
1918  // Check for transitive partnerships and break the cycle.
1919  RefinePartnerShortcuts(upper, partners);
1920  if (!partners->empty() && !partners->singleton()) {
1921  // Types didn't fix it. Flowing text keeps the one with the longest
1922  // sequence of singleton matching partners. All others max overlap.
1923  if (TypesSimilar(type_, PT_FLOWING_TEXT) && get_desperate) {
1924  RefineTextPartnersByMerge(upper, false, partners, grid);
1925  if (!partners->empty() && !partners->singleton())
1926  RefineTextPartnersByMerge(upper, true, partners, grid);
1927  }
1928  // The last resort is to use overlap.
1929  if (!partners->empty() && !partners->singleton())
1930  RefinePartnersByOverlap(upper, partners);
1931  }
1932  }
1933  }
1934 }
1935 
1936 // Cleans up the partners above if upper is true, else below.
1937 // Restricts the partners to only desirable types. For text and BRT_HLINE this
1938 // means the same type_ , and for image types it means any image type.
1939 void ColPartition::RefinePartnersByType(bool upper,
1940  ColPartition_CLIST* partners) {
1941  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
1942  bounding_box_.bottom());
1943  if (debug) {
1944  tprintf("Refining %d %s partners by type for:\n",
1945  partners->length(), upper ? "Upper" : "Lower");
1946  Print();
1947  }
1948  ColPartition_C_IT it(partners);
1949  // Purify text by type.
1950  if (!IsImageType() && !IsLineType() && type() != PT_TABLE) {
1951  // Keep only partners matching type_.
1952  // Exception: PT_VERTICAL_TEXT is allowed to stay with the other
1953  // text types if it is the only partner.
1954  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1955  ColPartition* partner = it.data();
1956  if (!TypesSimilar(type_, partner->type_)) {
1957  if (debug) {
1958  tprintf("Removing partner:");
1959  partner->Print();
1960  }
1961  partner->RemovePartner(!upper, this);
1962  it.extract();
1963  } else if (debug) {
1964  tprintf("Keeping partner:");
1965  partner->Print();
1966  }
1967  }
1968  } else {
1969  // Only polyimages are allowed to have partners of any kind!
1970  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1971  ColPartition* partner = it.data();
1972  if (partner->blob_type() != BRT_POLYIMAGE ||
1973  blob_type() != BRT_POLYIMAGE) {
1974  if (debug) {
1975  tprintf("Removing partner:");
1976  partner->Print();
1977  }
1978  partner->RemovePartner(!upper, this);
1979  it.extract();
1980  } else if (debug) {
1981  tprintf("Keeping partner:");
1982  partner->Print();
1983  }
1984  }
1985  }
1986 }
1987 
1988 // Cleans up the partners above if upper is true, else below.
1989 // Remove transitive partnerships: this<->a, and a<->b and this<->b.
1990 // Gets rid of this<->b, leaving a clean chain.
1991 // Also if we have this<->a and a<->this, then gets rid of this<->a, as
1992 // this has multiple partners.
1993 void ColPartition::RefinePartnerShortcuts(bool upper,
1994  ColPartition_CLIST* partners) {
1995  bool done_any = false;
1996  do {
1997  done_any = false;
1998  ColPartition_C_IT it(partners);
1999  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2000  ColPartition* a = it.data();
2001  // Check for a match between all of a's partners (it1/b1) and all
2002  // of this's partners (it2/b2).
2003  ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
2004  for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
2005  ColPartition* b1 = it1.data();
2006  if (b1 == this) {
2007  done_any = true;
2008  it.extract();
2009  a->RemovePartner(!upper, this);
2010  break;
2011  }
2012  ColPartition_C_IT it2(partners);
2013  for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
2014  ColPartition* b2 = it2.data();
2015  if (b1 == b2) {
2016  // Jackpot! b2 should not be a partner of this.
2017  it2.extract();
2018  b2->RemovePartner(!upper, this);
2019  done_any = true;
2020  // That potentially invalidated all the iterators, so break out
2021  // and start again.
2022  break;
2023  }
2024  }
2025  if (done_any)
2026  break;
2027  }
2028  if (done_any)
2029  break;
2030  }
2031  } while (done_any && !partners->empty() && !partners->singleton());
2032 }
2033 
2034 // Cleans up the partners above if upper is true, else below.
2035 // If multiple text partners can be merged, (with each other, NOT with this),
2036 // then do so.
2037 // If desperate is true, then an increase in overlap with the merge is
2038 // allowed. If the overlap increases, then the desperately_merged_ flag
2039 // is set, indicating that the textlines probably need to be regenerated
2040 // by aggressive line fitting/splitting, as there are probably vertically
2041 // joined blobs that cross textlines.
2042 void ColPartition::RefineTextPartnersByMerge(bool upper, bool desperate,
2043  ColPartition_CLIST* partners,
2044  ColPartitionGrid* grid) {
2045  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2046  bounding_box_.bottom());
2047  if (debug) {
2048  tprintf("Refining %d %s partners by merge for:\n",
2049  partners->length(), upper ? "Upper" : "Lower");
2050  Print();
2051  }
2052  while (!partners->empty() && !partners->singleton()) {
2053  // Absorb will mess up the iterators, so we have to merge one partition
2054  // at a time and rebuild the iterators each time.
2055  ColPartition_C_IT it(partners);
2056  ColPartition* part = it.data();
2057  // Gather a list of merge candidates, from the list of partners, that
2058  // are all in the same single column. See general scenario comment above.
2059  ColPartition_CLIST candidates;
2060  ColPartition_C_IT cand_it(&candidates);
2061  for (it.forward(); !it.at_first(); it.forward()) {
2062  ColPartition* candidate = it.data();
2063  if (part->first_column_ == candidate->last_column_ &&
2064  part->last_column_ == candidate->first_column_)
2065  cand_it.add_after_then_move(it.data());
2066  }
2067  int overlap_increase;
2068  ColPartition* candidate = grid->BestMergeCandidate(part, &candidates, debug,
2069  nullptr, &overlap_increase);
2070  if (candidate != nullptr && (overlap_increase <= 0 || desperate)) {
2071  if (debug) {
2072  tprintf("Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
2073  part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),
2074  overlap_increase);
2075  }
2076  // Remove before merge and re-insert to keep the integrity of the grid.
2077  grid->RemoveBBox(candidate);
2078  grid->RemoveBBox(part);
2079  part->Absorb(candidate, nullptr);
2080  // We modified the box of part, so re-insert it into the grid.
2081  grid->InsertBBox(true, true, part);
2082  if (overlap_increase > 0)
2083  part->desperately_merged_ = true;
2084  } else {
2085  break; // Can't merge.
2086  }
2087  }
2088 }
2089 
2090 // Cleans up the partners above if upper is true, else below.
2091 // Keep the partner with the biggest overlap.
2092 void ColPartition::RefinePartnersByOverlap(bool upper,
2093  ColPartition_CLIST* partners) {
2094  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2095  bounding_box_.bottom());
2096  if (debug) {
2097  tprintf("Refining %d %s partners by overlap for:\n",
2098  partners->length(), upper ? "Upper" : "Lower");
2099  Print();
2100  }
2101  ColPartition_C_IT it(partners);
2102  ColPartition* best_partner = it.data();
2103  // Find the partner with the best overlap.
2104  int best_overlap = 0;
2105  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2106  ColPartition* partner = it.data();
2107  int overlap = std::min(bounding_box_.right(), partner->bounding_box_.right())
2108  - std::max(bounding_box_.left(), partner->bounding_box_.left());
2109  if (overlap > best_overlap) {
2110  best_overlap = overlap;
2111  best_partner = partner;
2112  }
2113  }
2114  // Keep only the best partner.
2115  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2116  ColPartition* partner = it.data();
2117  if (partner != best_partner) {
2118  if (debug) {
2119  tprintf("Removing partner:");
2120  partner->Print();
2121  }
2122  partner->RemovePartner(!upper, this);
2123  it.extract();
2124  }
2125  }
2126 }
2127 
2128 // Return true if bbox belongs better in this than other.
2129 bool ColPartition::ThisPartitionBetter(BLOBNBOX* bbox,
2130  const ColPartition& other) {
2131  const TBOX& box = bbox->bounding_box();
2132  // Margins take priority.
2133  int left = box.left();
2134  int right = box.right();
2135  if (left < left_margin_ || right > right_margin_)
2136  return false;
2137  if (left < other.left_margin_ || right > other.right_margin_)
2138  return true;
2139  int top = box.top();
2140  int bottom = box.bottom();
2141  int this_overlap = std::min(top, median_top_) - std::max(bottom, median_bottom_);
2142  int other_overlap = std::min(top, other.median_top_) -
2143  std::max(bottom, other.median_bottom_);
2144  int this_miss = median_top_ - median_bottom_ - this_overlap;
2145  int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
2146  if (TabFind::WithinTestRegion(3, box.left(), box.bottom())) {
2147  tprintf("Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
2148  box.left(), box.bottom(), box.right(), box.top(),
2149  this_overlap, other_overlap, this_miss, other_miss,
2150  median_top_, other.median_top_);
2151  }
2152  if (this_miss < other_miss)
2153  return true;
2154  if (this_miss > other_miss)
2155  return false;
2156  if (this_overlap > other_overlap)
2157  return true;
2158  if (this_overlap < other_overlap)
2159  return false;
2160  return median_top_ >= other.median_top_;
2161 }
2162 
2163 // Returns the median line-spacing between the current position and the end
2164 // of the list.
2165 // The iterator is passed by value so the iteration does not modify the
2166 // caller's iterator.
2167 static int MedianSpacing(int page_height, ColPartition_IT it) {
2168  STATS stats(0, page_height);
2169  while (!it.cycled_list()) {
2170  ColPartition* part = it.data();
2171  it.forward();
2172  stats.add(part->bottom_spacing(), 1);
2173  stats.add(part->top_spacing(), 1);
2174  }
2175  return static_cast<int>(stats.median() + 0.5);
2176 }
2177 
2178 // Returns true if this column partition is in the same column as
2179 // part. This function will only work after the SetPartitionType function
2180 // has been called on both column partitions. This is useful for
2181 // doing a SideSearch when you want things in the same page column.
2182 //
2183 // Currently called by the table detection code to identify if potential table
2184 // partitions exist in the same column.
2186  // Overlap does not occur when last < part.first or first > part.last.
2187  // In other words, one is completely to the side of the other.
2188  // This is just DeMorgan's law applied to that so the function returns true.
2189  return (last_column_ >= part.first_column_) &&
2190  (first_column_ <= part.last_column_);
2191 }
2192 
2193 // Smoothes the spacings in the list into groups of equal linespacing.
2194 // resolution is the resolution of the original image, used as a basis
2195 // for thresholds in change of spacing. page_height is in pixels.
2196 void ColPartition::SmoothSpacings(int resolution, int page_height,
2197  ColPartition_LIST* parts) {
2198  // The task would be trivial if we didn't have to allow for blips -
2199  // occasional offsets in spacing caused by anomalous text, such as all
2200  // caps, groups of descenders, joined words, Arabic etc.
2201  // The neighbourhood stores a consecutive group of partitions so that
2202  // blips can be detected correctly, yet conservatively enough to not
2203  // mistake genuine spacing changes for blips. See example below.
2204  ColPartition* neighbourhood[PN_COUNT];
2205  ColPartition_IT it(parts);
2206  it.mark_cycle_pt();
2207  // Although we know nothing about the spacings is this list, the median is
2208  // used as an approximation to allow blips.
2209  // If parts of this block aren't spaced to the median, then we can't
2210  // accept blips in those parts, but we'll recalculate it each time we
2211  // split the block, so the median becomes more likely to match all the text.
2212  int median_space = MedianSpacing(page_height, it);
2213  ColPartition_IT start_it(it);
2214  ColPartition_IT end_it(it);
2215  for (int i = 0; i < PN_COUNT; ++i) {
2216  if (i < PN_UPPER || it.cycled_list()) {
2217  neighbourhood[i] = nullptr;
2218  } else {
2219  if (i == PN_LOWER)
2220  end_it = it;
2221  neighbourhood[i] = it.data();
2222  it.forward();
2223  }
2224  }
2225  while (neighbourhood[PN_UPPER] != nullptr) {
2226  // Test for end of a group. Normally SpacingsEqual is true within a group,
2227  // but in the case of a blip, it will be false. Here is an example:
2228  // Line enum Spacing below (spacing between tops of lines)
2229  // 1 ABOVE2 20
2230  // 2 ABOVE1 20
2231  // 3 UPPER 15
2232  // 4 LOWER 25
2233  // 5 BELOW1 20
2234  // 6 BELOW2 20
2235  // Line 4 is all in caps (regular caps), so the spacing between line 3
2236  // and line 4 (looking at the tops) is smaller than normal, and the
2237  // spacing between line 4 and line 5 is larger than normal, but the
2238  // two of them add to twice the normal spacing.
2239  // The following if has to accept unequal spacings 3 times to pass the
2240  // blip (20/15, 15/25 and 25/20)
2241  // When the blip is in the middle, OKSpacingBlip tests that one of
2242  // ABOVE1 and BELOW1 matches the median.
2243  // The first time, everything is shifted down 1, so we present
2244  // OKSpacingBlip with neighbourhood+1 and check that PN_UPPER is median.
2245  // The last time, everything is shifted up 1, so we present OKSpacingBlip
2246  // with neighbourhood-1 and check that PN_LOWER matches the median.
2247  if (neighbourhood[PN_LOWER] == nullptr ||
2248  (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
2249  resolution) &&
2250  !OKSpacingBlip(resolution, median_space, neighbourhood) &&
2251  (!OKSpacingBlip(resolution, median_space, neighbourhood - 1) ||
2252  !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
2253  (!OKSpacingBlip(resolution, median_space, neighbourhood + 1) ||
2254  !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
2255  // The group has ended. PN_UPPER is the last member.
2256  // Compute the mean spacing over the group.
2257  ColPartition_IT sum_it(start_it);
2258  ColPartition* last_part = neighbourhood[PN_UPPER];
2259  double total_bottom = 0.0;
2260  double total_top = 0.0;
2261  int total_count = 0;
2262  ColPartition* upper = sum_it.data();
2263  // We do not process last_part, as its spacing is different.
2264  while (upper != last_part) {
2265  total_bottom += upper->bottom_spacing();
2266  total_top += upper->top_spacing();
2267  ++total_count;
2268  sum_it.forward();
2269  upper = sum_it.data();
2270  }
2271  if (total_count > 0) {
2272  // There were at least 2 lines, so set them all to the mean.
2273  int top_spacing = static_cast<int>(total_top / total_count + 0.5);
2274  int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);
2275  if (textord_debug_tabfind) {
2276  tprintf("Spacing run ended. Cause:");
2277  if (neighbourhood[PN_LOWER] == nullptr) {
2278  tprintf("No more lines\n");
2279  } else {
2280  tprintf("Spacing change. Spacings:\n");
2281  for (int i = 0; i < PN_COUNT; ++i) {
2282  if (neighbourhood[i] == nullptr) {
2283  tprintf("NULL");
2284  if (i > 0 && neighbourhood[i - 1] != nullptr) {
2285  if (neighbourhood[i - 1]->SingletonPartner(false) != nullptr) {
2286  tprintf(" Lower partner:");
2287  neighbourhood[i - 1]->SingletonPartner(false)->Print();
2288  } else {
2289  tprintf(" nullptr lower partner:\n");
2290  }
2291  } else {
2292  tprintf("\n");
2293  }
2294  } else {
2295  tprintf("Top = %d, bottom = %d\n",
2296  neighbourhood[i]->top_spacing(),
2297  neighbourhood[i]->bottom_spacing());
2298  }
2299  }
2300  }
2301  tprintf("Mean spacing = %d/%d\n", top_spacing, bottom_spacing);
2302  }
2303  sum_it = start_it;
2304  upper = sum_it.data();
2305  while (upper != last_part) {
2306  upper->set_top_spacing(top_spacing);
2307  upper->set_bottom_spacing(bottom_spacing);
2308  if (textord_debug_tabfind) {
2309  tprintf("Setting mean on:");
2310  upper->Print();
2311  }
2312  sum_it.forward();
2313  upper = sum_it.data();
2314  }
2315  }
2316  // PN_LOWER starts the next group and end_it is the next start_it.
2317  start_it = end_it;
2318  // Recalculate the median spacing to maximize the chances of detecting
2319  // spacing blips.
2320  median_space = MedianSpacing(page_height, end_it);
2321  }
2322  // Shuffle pointers.
2323  for (int j = 1; j < PN_COUNT; ++j) {
2324  neighbourhood[j - 1] = neighbourhood[j];
2325  }
2326  if (it.cycled_list()) {
2327  neighbourhood[PN_COUNT - 1] = nullptr;
2328  } else {
2329  neighbourhood[PN_COUNT - 1] = it.data();
2330  it.forward();
2331  }
2332  end_it.forward();
2333  }
2334 }
2335 
2336 // Returns true if the parts array of pointers to partitions matches the
2337 // condition for a spacing blip. See SmoothSpacings for what this means
2338 // and how it is used.
2339 bool ColPartition::OKSpacingBlip(int resolution, int median_spacing,
2340  ColPartition** parts) {
2341  if (parts[PN_UPPER] == nullptr || parts[PN_LOWER] == nullptr)
2342  return false;
2343  // The blip is OK if upper and lower sum to an OK value and at least
2344  // one of above1 and below1 is equal to the median.
2345  return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER],
2346  median_spacing, resolution) &&
2347  ((parts[PN_ABOVE1] != nullptr &&
2348  parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||
2349  (parts[PN_BELOW1] != nullptr &&
2350  parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));
2351 }
2352 
2353 // Returns true if both the top and bottom spacings of this match the given
2354 // spacing to within suitable margins dictated by the image resolution.
2355 bool ColPartition::SpacingEqual(int spacing, int resolution) const {
2356  int bottom_error = BottomSpacingMargin(resolution);
2357  int top_error = TopSpacingMargin(resolution);
2358  return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&
2359  NearlyEqual(top_spacing_, spacing, top_error);
2360 }
2361 
2362 // Returns true if both the top and bottom spacings of this and other
2363 // match to within suitable margins dictated by the image resolution.
2364 bool ColPartition::SpacingsEqual(const ColPartition& other,
2365  int resolution) const {
2366  int bottom_error = std::max(BottomSpacingMargin(resolution),
2367  other.BottomSpacingMargin(resolution));
2368  int top_error = std::max(TopSpacingMargin(resolution),
2369  other.TopSpacingMargin(resolution));
2370  return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&
2371  (NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||
2372  NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
2373  bottom_error));
2374 }
2375 
2376 // Returns true if the sum spacing of this and other match the given
2377 // spacing (or twice the given spacing) to within a suitable margin dictated
2378 // by the image resolution.
2379 bool ColPartition::SummedSpacingOK(const ColPartition& other,
2380  int spacing, int resolution) const {
2381  int bottom_error = std::max(BottomSpacingMargin(resolution),
2382  other.BottomSpacingMargin(resolution));
2383  int top_error = std::max(TopSpacingMargin(resolution),
2384  other.TopSpacingMargin(resolution));
2385  int bottom_total = bottom_spacing_ + other.bottom_spacing_;
2386  int top_total = top_spacing_ + other.top_spacing_;
2387  return (NearlyEqual(spacing, bottom_total, bottom_error) &&
2388  NearlyEqual(spacing, top_total, top_error)) ||
2389  (NearlyEqual(spacing * 2, bottom_total, bottom_error) &&
2390  NearlyEqual(spacing * 2, top_total, top_error));
2391 }
2392 
2393 // Returns a suitable spacing margin that can be applied to bottoms of
2394 // text lines, based on the resolution and the stored side_step_.
2395 int ColPartition::BottomSpacingMargin(int resolution) const {
2396  return static_cast<int>(kMaxSpacingDrift * resolution + 0.5) + side_step_;
2397 }
2398 
2399 // Returns a suitable spacing margin that can be applied to tops of
2400 // text lines, based on the resolution and the stored side_step_.
2401 int ColPartition::TopSpacingMargin(int resolution) const {
2402  return static_cast<int>(kMaxTopSpacingFraction * median_height_ + 0.5) +
2403  BottomSpacingMargin(resolution);
2404 }
2405 
2406 // Returns true if the median text sizes of this and other agree to within
2407 // a reasonable multiplicative factor.
2408 bool ColPartition::SizesSimilar(const ColPartition& other) const {
2409  return median_height_ <= other.median_height_ * kMaxSizeRatio &&
2410  other.median_height_ <= median_height_ * kMaxSizeRatio;
2411 }
2412 
2413 // Helper updates margin_left and margin_right, being the bounds of the left
2414 // margin of part of a block. Returns false and does not update the bounds if
2415 // this partition has a disjoint margin with the established margin.
2416 static bool UpdateLeftMargin(const ColPartition& part,
2417  int* margin_left, int* margin_right) {
2418  const TBOX& part_box = part.bounding_box();
2419  int top = part_box.top();
2420  int bottom = part_box.bottom();
2421  int tl_key = part.SortKey(part.left_margin(), top);
2422  int tr_key = part.SortKey(part_box.left(), top);
2423  int bl_key = part.SortKey(part.left_margin(), bottom);
2424  int br_key = part.SortKey(part_box.left(), bottom);
2425  int left_key = std::max(tl_key, bl_key);
2426  int right_key = std::min(tr_key, br_key);
2427  if (left_key <= *margin_right && right_key >= *margin_left) {
2428  // This part is good - let's keep it.
2429  *margin_right = std::min(*margin_right, right_key);
2430  *margin_left = std::max(*margin_left, left_key);
2431  return true;
2432  }
2433  return false;
2434 }
2435 
2436 // Computes and returns in start, end a line segment formed from a
2437 // forwards-iterated group of left edges of partitions that satisfy the
2438 // condition that the intersection of the left margins is non-empty, ie the
2439 // rightmost left margin is to the left of the leftmost left bounding box edge.
2440 // On return the iterator is set to the start of the next run.
2441 void ColPartition::LeftEdgeRun(ColPartition_IT* part_it,
2442  ICOORD* start, ICOORD* end) {
2443  ColPartition* part = part_it->data();
2444  ColPartition* start_part = part;
2445  int start_y = part->bounding_box_.top();
2446  if (!part_it->at_first()) {
2447  int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();
2448  if (prev_bottom < start_y)
2449  start_y = prev_bottom;
2450  else if (prev_bottom > start_y)
2451  start_y = (start_y + prev_bottom) / 2;
2452  }
2453  int end_y = part->bounding_box_.bottom();
2454  int margin_right = INT32_MAX;
2455  int margin_left = -INT32_MAX;
2456  UpdateLeftMargin(*part, &margin_left, &margin_right);
2457  do {
2458  part_it->forward();
2459  part = part_it->data();
2460  } while (!part_it->at_first() &&
2461  UpdateLeftMargin(*part, &margin_left, &margin_right));
2462  // The run ended. If we were pushed inwards, compute the next run and
2463  // extend it backwards into the run we just calculated to find the end of
2464  // this run that provides a tight box.
2465  int next_margin_right = INT32_MAX;
2466  int next_margin_left = -INT32_MAX;
2467  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right);
2468  if (next_margin_left > margin_right) {
2469  ColPartition_IT next_it(*part_it);
2470  do {
2471  next_it.forward();
2472  part = next_it.data();
2473  } while (!next_it.at_first() &&
2474  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2475  // Now extend the next run backwards into the original run to get the
2476  // tightest fit.
2477  do {
2478  part_it->backward();
2479  part = part_it->data();
2480  } while (part != start_part &&
2481  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2482  part_it->forward();
2483  }
2484  // Now calculate the end_y.
2485  part = part_it->data_relative(-1);
2486  end_y = part->bounding_box_.bottom();
2487  if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y)
2488  end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;
2489  start->set_y(start_y);
2490  start->set_x(part->XAtY(margin_right, start_y));
2491  end->set_y(end_y);
2492  end->set_x(part->XAtY(margin_right, end_y));
2493  if (textord_debug_tabfind && !part_it->at_first())
2494  tprintf("Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2495  start_y, end_y, part->XAtY(margin_left, end_y),
2496  end->x(), part->left_margin_, part->bounding_box_.left());
2497 }
2498 
2499 // Helper updates margin_left and margin_right, being the bounds of the right
2500 // margin of part of a block. Returns false and does not update the bounds if
2501 // this partition has a disjoint margin with the established margin.
2502 static bool UpdateRightMargin(const ColPartition& part,
2503  int* margin_left, int* margin_right) {
2504  const TBOX& part_box = part.bounding_box();
2505  int top = part_box.top();
2506  int bottom = part_box.bottom();
2507  int tl_key = part.SortKey(part_box.right(), top);
2508  int tr_key = part.SortKey(part.right_margin(), top);
2509  int bl_key = part.SortKey(part_box.right(), bottom);
2510  int br_key = part.SortKey(part.right_margin(), bottom);
2511  int left_key = std::max(tl_key, bl_key);
2512  int right_key = std::min(tr_key, br_key);
2513  if (left_key <= *margin_right && right_key >= *margin_left) {
2514  // This part is good - let's keep it.
2515  *margin_right = std::min(*margin_right, right_key);
2516  *margin_left = std::max(*margin_left, left_key);
2517  return true;
2518  }
2519  return false;
2520 }
2521 
2522 // Computes and returns in start, end a line segment formed from a
2523 // backwards-iterated group of right edges of partitions that satisfy the
2524 // condition that the intersection of the right margins is non-empty, ie the
2525 // leftmost right margin is to the right of the rightmost right bounding box
2526 // edge.
2527 // On return the iterator is set to the start of the next run.
2528 void ColPartition::RightEdgeRun(ColPartition_IT* part_it,
2529  ICOORD* start, ICOORD* end) {
2530  ColPartition* part = part_it->data();
2531  ColPartition* start_part = part;
2532  int start_y = part->bounding_box_.bottom();
2533  if (!part_it->at_last()) {
2534  int next_y = part_it->data_relative(1)->bounding_box_.top();
2535  if (next_y > start_y)
2536  start_y = next_y;
2537  else if (next_y < start_y)
2538  start_y = (start_y + next_y) / 2;
2539  }
2540  int end_y = part->bounding_box_.top();
2541  int margin_right = INT32_MAX;
2542  int margin_left = -INT32_MAX;
2543  UpdateRightMargin(*part, &margin_left, &margin_right);
2544  do {
2545  part_it->backward();
2546  part = part_it->data();
2547  } while (!part_it->at_last() &&
2548  UpdateRightMargin(*part, &margin_left, &margin_right));
2549  // The run ended. If we were pushed inwards, compute the next run and
2550  // extend it backwards to find the end of this run for a tight box.
2551  int next_margin_right = INT32_MAX;
2552  int next_margin_left = -INT32_MAX;
2553  UpdateRightMargin(*part, &next_margin_left, &next_margin_right);
2554  if (next_margin_right < margin_left) {
2555  ColPartition_IT next_it(*part_it);
2556  do {
2557  next_it.backward();
2558  part = next_it.data();
2559  } while (!next_it.at_last() &&
2560  UpdateRightMargin(*part, &next_margin_left,
2561  &next_margin_right));
2562  // Now extend the next run forwards into the original run to get the
2563  // tightest fit.
2564  do {
2565  part_it->forward();
2566  part = part_it->data();
2567  } while (part != start_part &&
2568  UpdateRightMargin(*part, &next_margin_left,
2569  &next_margin_right));
2570  part_it->backward();
2571  }
2572  // Now calculate the end_y.
2573  part = part_it->data_relative(1);
2574  end_y = part->bounding_box().top();
2575  if (!part_it->at_last() &&
2576  part_it->data()->bounding_box_.bottom() > end_y)
2577  end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;
2578  start->set_y(start_y);
2579  start->set_x(part->XAtY(margin_left, start_y));
2580  end->set_y(end_y);
2581  end->set_x(part->XAtY(margin_left, end_y));
2582  if (textord_debug_tabfind && !part_it->at_last())
2583  tprintf("Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2584  start_y, end_y, end->x(), part->XAtY(margin_right, end_y),
2585  part->bounding_box_.right(), part->right_margin_);
2586 }
2587 
2588 } // namespace tesseract.
void CopyRightTab(const ColPartition &src, bool take_box)
bool IsVerticalType() const
Definition: colpartition.h:442
ColPartition * SplitAt(int split_x)
const int kMaxColorDistance
bool IsLineType() const
Definition: colpartition.h:426
static ColPartition * FakePartition(const TBOX &box, PolyBlockType block_type, BlobRegionType blob_type, BlobTextFlowType flow)
bool IsDiacritic() const
Definition: blobbox.h:380
int median_height() const
Definition: colpartition.h:137
int32_t area() const
Definition: rect.h:122
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:120
void SetPartitionType(int resolution, ColPartitionSet *columns)
int16_t top() const
Definition: rect.h:58
void SetRegionAndFlowTypesFromProjectionValue(int value)
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
const int kHorzStrongTextlineHeight
const int kMinStrongTextValue
void set_top(int y)
Definition: rect.h:61
Definition: rect.h:34
void set_right(int x)
Definition: rect.h:82
void print() const
Definition: rect.h:278
void set_owns_blobs(bool owns_blobs)
Definition: colpartition.h:295
void set_type(PolyBlockType t)
Definition: colpartition.h:185
static TO_BLOCK * MakeVerticalTextBlock(const ICOORD &bleft, const ICOORD &tright, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts)
const double kMaxBaselineError
int total_cost() const
Definition: dppoint.h:72
double ile(double frac) const
Definition: statistc.cpp:172
void set_x(int16_t xin)
rewrite function
Definition: points.h:61
bool IsPulloutType() const
Definition: colpartition.h:438
int textord_debug_bugs
Definition: alignedblob.cpp:28
C_BLOB * cblob() const
Definition: blobbox.h:268
float SpecialBlobsDensity(const BlobSpecialTextType type) const
void ExtractCompletedBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *used_parts, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
ScrollView::Color BoxColor() const
ColPartition * SplitAtBlob(BLOBNBOX *split_blob)
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
tesseract::ColPartition * owner() const
Definition: blobbox.h:352
bool ConfirmNoTabViolation(const ColPartition &other) const
void SetLeftTab(const TabVector *tab_vector)
void AddBox(BLOBNBOX *box)
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:444
int LeftAtY(int y) const
Definition: colpartition.h:341
const int kColumnWidthFactor
Definition: tabfind.h:42
int GoodTextBlob() const
Definition: blobbox.cpp:226
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:37
void set_block_owned(bool owned)
Definition: colpartition.h:209
int HCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:385
void set_bottom_spacing(int spacing)
Definition: colpartition.h:224
void SmoothPartnerRun(int working_set_count)
integer coordinate
Definition: points.h:31
void add_blob(BLOBNBOX *blob, float top, float bottom, float row_size)
Definition: blobbox.cpp:733
const TBOX & bounding_box() const
Definition: blobbox.h:230
const double kMaxSameBlockLineSpacing
int16_t y() const
access_function
Definition: points.h:56
void set_y(int16_t yin)
rewrite function
Definition: points.h:65
int base_char_bottom() const
Definition: blobbox.h:386
BlobRegionType blob_type() const
Definition: colpartition.h:149
BlobTextFlowType flow() const
Definition: blobbox.h:295
bool IsInSameColumnAs(const ColPartition &part) const
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
void SetSpecialBlobsDensity(const BlobSpecialTextType type, const float density)
void AddPartner(bool upper, ColPartition *partner)
void set_left(int x)
Definition: rect.h:75
int bottom_spacing() const
Definition: colpartition.h:221
void AddToWorkingSet(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *used_parts, WorkingPartSet_LIST *working_set)
static DPPoint * Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size, DPPoint *points)
Definition: dppoint.cpp:31
int right_margin() const
Definition: colpartition.h:119
BLOCK * block
Definition: blobbox.h:788
void set_right_margin(int margin)
Definition: colpartition.h:122
ColPartition * CopyButDontOwnBlobs()
int NoisyNeighbours() const
Definition: blobbox.cpp:237
const double kMinBaselineCoverage
Definition: capi.h:134
int median_bottom() const
Definition: colpartition.h:128
int16_t height() const
Definition: rect.h:108
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
const double kMaxLeaderGapFractionOfMax
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:51
bool OKDiacriticMerge(const ColPartition &candidate, bool debug) const
PolyBlockType type() const
Definition: colpartition.h:182
PolyBlockType
Definition: publictypes.h:53
bool MatchingTextColor(const ColPartition &other) const
const int kMinLeaderCount
TBOX BoundsWithoutBox(BLOBNBOX *box)
const TBOX & bounding_box() const
Definition: colpartition.h:110
bool OKMergeOverlap(const ColPartition &merge1, const ColPartition &merge2, int ok_box_overlap, bool debug)
void AddPartition(ColPartition *part)
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:355
ColPartition * ShallowCopy() const
int BoxRightKey() const
Definition: colpartition.h:337
int median_width() const
Definition: colpartition.h:143
Definition: capi.h:143
void SetRightTab(const TabVector *tab_vector)
int16_t x() const
access function
Definition: points.h:52
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
static double ColorDistanceFromLine(const uint8_t *line1, const uint8_t *line2, const uint8_t *point)
Definition: imagefind.cpp:355
bool overlap(const TBOX &box) const
Definition: rect.h:355
BlobRegionType
Definition: blobbox.h:72
const int kHorzStrongTextlineCount
int textord_debug_tabfind
Definition: alignedblob.cpp:27
int VCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:376
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:407
int16_t width() const
Definition: rect.h:115
ColPartition * BestMergeCandidate(const ColPartition *part, ColPartition_CLIST *candidates, bool debug, TessResultCallback2< bool, const ColPartition *, const ColPartition * > *confirm_cb, int *overlap_increase)
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:191
static ScrollView::Color ColorForPolyBlockType(PolyBlockType type)
Returns a color to draw the given type.
Definition: polyblk.cpp:393
virtual R Run(A1)=0
const double kMaxSizeRatio
int64_t CostWithVariance(const DPPoint *prev)
Definition: dppoint.cpp:69
const int kMinChainTextValue
int16_t right() const
Definition: rect.h:79
bool VSignificantCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:391
static TO_BLOCK * MakeBlock(const ICOORD &bleft, const ICOORD &tright, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts)
int16_t bottom() const
Definition: rect.h:65
bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2)
Definition: blobbox.h:129
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:158
void RefinePartners(PolyBlockType type, bool get_desperate, ColPartitionGrid *grid)
BlobRegionType region_type() const
Definition: blobbox.h:283
int SpecialBlobsCount(const BlobSpecialTextType type)
BlobTextFlowType
Definition: blobbox.h:114
bool MatchingColumns(const ColPartition &other) const
PolyBlockType PartitionType(ColumnSpanningType flow) const
static void LineSpacingBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts, BLOCK_LIST *completed_blocks, TO_BLOCK_LIST *to_blocks)
const int kHorzStrongTextlineAspect
BlobTextFlowType flow() const
Definition: colpartition.h:155
#define ASSERT_HOST(x)
Definition: errcode.h:88
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:286
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:289
int16_t left() const
Definition: rect.h:72
ColPartition * SingletonPartner(bool upper)
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:298
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:535
void RemoveBox(BLOBNBOX *box)
void set_top_spacing(int spacing)
Definition: colpartition.h:230
static bool TypesSimilar(PolyBlockType type1, PolyBlockType type2)
Definition: colpartition.h:419
int left_margin() const
Definition: colpartition.h:113
const double kMaxTopSpacingFraction
BLOBNBOX * OverlapSplitBlob(const TBOX &box)
bool MatchingSizes(const ColPartition &other) const
void CopyLeftTab(const ColPartition &src, bool take_box)
int SortKey(int x, int y) const
Definition: colpartition.h:317
int32_t mode() const
Definition: statistc.cpp:113
void set_poly_block(POLY_BLOCK *blk)
set the poly block
Definition: pdblock.h:58
bool IsText() const
Definition: polyblk.h:49
double median() const
Definition: statistc.cpp:237
bool IsImageType() const
Definition: colpartition.h:430
void SetColumnGoodness(WidthCallback *cb)
const double kMaxLeaderGapFractionOfMin
int top_spacing() const
Definition: colpartition.h:227
#define ELIST2IZE(CLASSNAME)
Definition: elst2.h:959
Definition: statistc.h:31
void RemovePartner(bool upper, ColPartition *partner)
int CountOverlappingBoxes(const TBOX &box)
void set_left_margin(int margin)
Definition: colpartition.h:116
ColumnSpanningType SpanningType(int resolution, int left, int right, int height, int y, int left_margin, int right_margin, int *first_col, int *last_col, int *first_spanned_col)
static bool WithinTestRegion(int detail_level, int x, int y)
void set_bottom(int y)
Definition: rect.h:68
void Absorb(ColPartition *other, WidthCallback *cb)
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:188
bool owns_blobs() const
Definition: colpartition.h:292
BlobSpecialTextType
Definition: blobbox.h:96
void set_side_step(int step)
Definition: colpartition.h:218
static int SortByBBox(const void *p1, const void *p2)
Definition: colpartition.h:715
int count(LIST var_list)
Definition: oldlist.cpp:96
int base_char_top() const
Definition: blobbox.h:383
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:241
Definition: ocrblock.h:29
void ColumnRange(int resolution, ColPartitionSet *columns, int *first_col, int *last_col)
int RightAtY(int y) const
Definition: colpartition.h:345
static ColPartition * MakeLinePartition(BlobRegionType blob_type, const ICOORD &vertical, int left, int bottom, int right, int top)
const double kMaxSpacingDrift
int sort_key() const
Definition: tabvector.h:158
Definition: capi.h:142
int XAtY(int sort_key, int y) const
Definition: colpartition.h:321
const int kMaxRMSColorNoise