tesseract  4.1.0
tabvector.cpp
Go to the documentation of this file.
1 // File: tabvector.cpp
3 // Description: Class to hold a near-vertical vector representing a tab-stop.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2008, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifdef HAVE_CONFIG_H
20 #include "config_auto.h"
21 #endif
22 
23 #include "tabvector.h"
24 #include "blobbox.h"
25 #include "colfind.h"
26 #include "colpartitionset.h"
27 #include "detlinefit.h"
28 #include "statistc.h"
29 
30 #include <algorithm>
31 
32 namespace tesseract {
33 
34 // Multiple of height used as a gutter for evaluation search.
35 const int kGutterMultiple = 4;
36 // Multiple of neighbour gap that we expect the gutter gap to be at minimum.
38 // Pixel distance for tab vectors to be considered the same.
39 const int kSimilarVectorDist = 10;
40 // Pixel distance for ragged tab vectors to be considered the same if there
41 // is nothing in the overlap box
42 const int kSimilarRaggedDist = 50;
43 // Max multiple of height to allow filling in between blobs when evaluating.
44 const int kMaxFillinMultiple = 11;
45 // Min fraction of mean gutter size to allow a gutter on a good tab blob.
46 const double kMinGutterFraction = 0.5;
47 // Multiple of 1/n lines as a minimum gutter in evaluation.
48 const double kLineCountReciprocal = 4.0;
49 // Constant add-on for minimum gutter for aligned tabs.
50 const double kMinAlignedGutter = 0.25;
51 // Constant add-on for minimum gutter for ragged tabs.
52 const double kMinRaggedGutter = 1.5;
53 
55  "max fraction of mean blob width allowed for vertical gaps in vertical text");
56 
58  "Fraction of box matches required to declare a line vertical");
59 
61 
62 // Create a constraint for the top or bottom of this TabVector.
63 void TabConstraint::CreateConstraint(TabVector* vector, bool is_top) {
64  auto* constraint = new TabConstraint(vector, is_top);
65  auto* constraints = new TabConstraint_LIST;
66  TabConstraint_IT it(constraints);
67  it.add_to_end(constraint);
68  if (is_top)
69  vector->set_top_constraints(constraints);
70  else
71  vector->set_bottom_constraints(constraints);
72 }
73 
74 // Test to see if the constraints are compatible enough to merge.
75 bool TabConstraint::CompatibleConstraints(TabConstraint_LIST* list1,
76  TabConstraint_LIST* list2) {
77  if (list1 == list2)
78  return false;
79  int y_min = -INT32_MAX;
80  int y_max = INT32_MAX;
81  if (textord_debug_tabfind > 3)
82  tprintf("Testing constraint compatibility\n");
83  GetConstraints(list1, &y_min, &y_max);
84  GetConstraints(list2, &y_min, &y_max);
85  if (textord_debug_tabfind > 3)
86  tprintf("Resulting range = [%d,%d]\n", y_min, y_max);
87  return y_max >= y_min;
88 }
89 
90 // Merge the lists of constraints and update the TabVector pointers.
91 // The second list is deleted.
92 void TabConstraint::MergeConstraints(TabConstraint_LIST* list1,
93  TabConstraint_LIST* list2) {
94  if (list1 == list2)
95  return;
96  TabConstraint_IT it(list2);
97  if (textord_debug_tabfind > 3)
98  tprintf("Merging constraints\n");
99  // The vectors of all constraints on list2 are now going to be on list1.
100  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
101  TabConstraint* constraint = it.data();
102  if (textord_debug_tabfind> 3)
103  constraint->vector_->Print("Merge");
104  if (constraint->is_top_)
105  constraint->vector_->set_top_constraints(list1);
106  else
107  constraint->vector_->set_bottom_constraints(list1);
108  }
109  it = list1;
110  it.add_list_before(list2);
111  delete list2;
112 }
113 
114 // Set all the tops and bottoms as appropriate to a mean of the
115 // constrained range. Delete all the constraints and list.
116 void TabConstraint::ApplyConstraints(TabConstraint_LIST* constraints) {
117  int y_min = -INT32_MAX;
118  int y_max = INT32_MAX;
119  GetConstraints(constraints, &y_min, &y_max);
120  int y = (y_min + y_max) / 2;
121  TabConstraint_IT it(constraints);
122  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
123  TabConstraint* constraint = it.data();
124  TabVector* v = constraint->vector_;
125  if (constraint->is_top_) {
126  v->SetYEnd(y);
127  v->set_top_constraints(nullptr);
128  } else {
129  v->SetYStart(y);
130  v->set_bottom_constraints(nullptr);
131  }
132  }
133  delete constraints;
134 }
135 
136 TabConstraint::TabConstraint(TabVector* vector, bool is_top)
137  : vector_(vector), is_top_(is_top) {
138  if (is_top) {
139  y_min_ = vector->endpt().y();
140  y_max_ = vector->extended_ymax();
141  } else {
142  y_max_ = vector->startpt().y();
143  y_min_ = vector->extended_ymin();
144  }
145 }
146 
147 // Get the max of the mins and the min of the maxes.
148 void TabConstraint::GetConstraints(TabConstraint_LIST* constraints,
149  int* y_min, int* y_max) {
150  TabConstraint_IT it(constraints);
151  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
152  TabConstraint* constraint = it.data();
153  if (textord_debug_tabfind > 3) {
154  tprintf("Constraint is [%d,%d]", constraint->y_min_, constraint->y_max_);
155  constraint->vector_->Print(" for");
156  }
157  *y_min = std::max(*y_min, constraint->y_min_);
158  *y_max = std::min(*y_max, constraint->y_max_);
159  }
160 }
161 
164 
165 // The constructor is private. See the bottom of the file...
166 
167 
168 // Public factory to build a TabVector from a list of boxes.
169 // The TabVector will be of the given alignment type.
170 // The input vertical vector is used in fitting, and the output
171 // vertical_x, vertical_y have the resulting line vector added to them
172 // if the alignment is not ragged.
173 // The extended_start_y and extended_end_y are the maximum possible
174 // extension to the line segment that can be used to align with others.
175 // The input CLIST of BLOBNBOX good_points is consumed and taken over.
176 TabVector* TabVector::FitVector(TabAlignment alignment, ICOORD vertical,
177  int extended_start_y, int extended_end_y,
178  BLOBNBOX_CLIST* good_points,
179  int* vertical_x, int* vertical_y) {
180  auto* vector = new TabVector(extended_start_y, extended_end_y,
181  alignment, good_points);
182  if (!vector->Fit(vertical, false)) {
183  delete vector;
184  return nullptr;
185  }
186  if (!vector->IsRagged()) {
187  vertical = vector->endpt_ - vector->startpt_;
188  int weight = vector->BoxCount();
189  *vertical_x += vertical.x() * weight;
190  *vertical_y += vertical.y() * weight;
191  }
192  return vector;
193 }
194 
195 // Build a ragged TabVector by copying another's direction, shifting it
196 // to match the given blob, and making its initial extent the height
197 // of the blob, but its extended bounds from the bounds of the original.
199  const ICOORD& vertical_skew, BLOBNBOX* blob)
200  : extended_ymin_(src.extended_ymin_), extended_ymax_(src.extended_ymax_),
201  sort_key_(0), percent_score_(0), mean_width_(0),
202  needs_refit_(true), needs_evaluation_(true), intersects_other_lines_(false),
203  alignment_(alignment),
204  top_constraints_(nullptr), bottom_constraints_(nullptr) {
205  BLOBNBOX_C_IT it(&boxes_);
206  it.add_to_end(blob);
207  TBOX box = blob->bounding_box();
208  if (IsLeftTab()) {
209  startpt_ = box.botleft();
210  endpt_ = box.topleft();
211  } else {
212  startpt_ = box.botright();
213  endpt_ = box.topright();
214  }
215  sort_key_ = SortKey(vertical_skew,
216  (startpt_.x() + endpt_.x()) / 2,
217  (startpt_.y() + endpt_.y()) / 2);
218  if (textord_debug_tabfind > 3)
219  Print("Constructed a new tab vector:");
220 }
221 
222 // Copies basic attributes of a tab vector for simple operations.
223 // Copies things such as startpt, endpt, range.
224 // Does not copy things such as partners, boxes, or constraints.
225 // This is useful if you only need vector information for processing, such
226 // as in the table detection code.
228  auto* copy = new TabVector();
229  copy->startpt_ = startpt_;
230  copy->endpt_ = endpt_;
231  copy->alignment_ = alignment_;
232  copy->extended_ymax_ = extended_ymax_;
233  copy->extended_ymin_ = extended_ymin_;
234  copy->intersects_other_lines_ = intersects_other_lines_;
235  return copy;
236 }
237 
238 // Extend this vector to include the supplied blob if it doesn't
239 // already have it.
241  TBOX new_box = new_blob->bounding_box();
242  BLOBNBOX_C_IT it(&boxes_);
243  if (!it.empty()) {
244  BLOBNBOX* blob = it.data();
245  TBOX box = blob->bounding_box();
246  while (!it.at_last() && box.top() <= new_box.top()) {
247  if (blob == new_blob)
248  return; // We have it already.
249  it.forward();
250  blob = it.data();
251  box = blob->bounding_box();
252  }
253  if (box.top() >= new_box.top()) {
254  it.add_before_stay_put(new_blob);
255  needs_refit_ = true;
256  return;
257  }
258  }
259  needs_refit_ = true;
260  it.add_after_stay_put(new_blob);
261 }
262 
263 // Set the ycoord of the start and move the xcoord to match.
264 void TabVector::SetYStart(int start_y) {
265  startpt_.set_x(XAtY(start_y));
266  startpt_.set_y(start_y);
267 }
268 // Set the ycoord of the end and move the xcoord to match.
269 void TabVector::SetYEnd(int end_y) {
270  endpt_.set_x(XAtY(end_y));
271  endpt_.set_y(end_y);
272 }
273 
274 // Rotate the ends by the given vector. Auto flip start and end if needed.
275 void TabVector::Rotate(const FCOORD& rotation) {
276  startpt_.rotate(rotation);
277  endpt_.rotate(rotation);
278  int dx = endpt_.x() - startpt_.x();
279  int dy = endpt_.y() - startpt_.y();
280  if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) {
281  // Need to flip start/end.
282  ICOORD tmp = startpt_;
283  startpt_ = endpt_;
284  endpt_ = tmp;
285  }
286 }
287 
288 // Setup the initial constraints, being the limits of
289 // the vector and the extended ends.
291  TabConstraint::CreateConstraint(this, false);
293 }
294 
295 // Setup the constraints between the partners of this TabVector.
297  // With the first and last partner, we want a common bottom and top,
298  // respectively, and for each change of partner, we want a common
299  // top of first with bottom of next.
300  TabVector_C_IT it(&partners_);
301  TabVector* prev_partner = nullptr;
302  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
303  TabVector* partner = it.data();
304  if (partner->top_constraints_ == nullptr ||
305  partner->bottom_constraints_ == nullptr) {
306  partner->Print("Impossible: has no constraints");
307  Print("This vector has it as a partner");
308  continue;
309  }
310  if (prev_partner == nullptr) {
311  // This is the first partner, so common bottom.
312  if (TabConstraint::CompatibleConstraints(bottom_constraints_,
313  partner->bottom_constraints_))
314  TabConstraint::MergeConstraints(bottom_constraints_,
315  partner->bottom_constraints_);
316  } else {
317  // We need prev top to be common with partner bottom.
318  if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_,
319  partner->bottom_constraints_))
320  TabConstraint::MergeConstraints(prev_partner->top_constraints_,
321  partner->bottom_constraints_);
322  }
323  prev_partner = partner;
324  if (it.at_last()) {
325  // This is the last partner, so common top.
326  if (TabConstraint::CompatibleConstraints(top_constraints_,
327  partner->top_constraints_))
328  TabConstraint::MergeConstraints(top_constraints_,
329  partner->top_constraints_);
330  }
331  }
332 }
333 
334 // Setup the constraints between this and its partner.
336  if (TabConstraint::CompatibleConstraints(bottom_constraints_,
337  partner->bottom_constraints_))
338  TabConstraint::MergeConstraints(bottom_constraints_,
339  partner->bottom_constraints_);
340  if (TabConstraint::CompatibleConstraints(top_constraints_,
341  partner->top_constraints_))
342  TabConstraint::MergeConstraints(top_constraints_,
343  partner->top_constraints_);
344 }
345 
346 // Use the constraints to modify the top and bottom.
348  if (top_constraints_ != nullptr)
349  TabConstraint::ApplyConstraints(top_constraints_);
350  if (bottom_constraints_ != nullptr)
351  TabConstraint::ApplyConstraints(bottom_constraints_);
352 }
353 
354 // Merge close tab vectors of the same side that overlap.
356  TabVector_LIST* vectors,
357  BlobGrid* grid) {
358  TabVector_IT it1(vectors);
359  for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
360  TabVector* v1 = it1.data();
361  TabVector_IT it2(it1);
362  for (it2.forward(); !it2.at_first(); it2.forward()) {
363  TabVector* v2 = it2.data();
364  if (v2->SimilarTo(vertical, *v1, grid)) {
365  // Merge into the forward one, in case the combined vector now
366  // overlaps one in between.
367  if (textord_debug_tabfind) {
368  v2->Print("Merging");
369  v1->Print("by deleting");
370  }
371  v2->MergeWith(vertical, it1.extract());
372  if (textord_debug_tabfind) {
373  v2->Print("Producing");
374  }
375  ICOORD merged_vector = v2->endpt();
376  merged_vector -= v2->startpt();
377  if (textord_debug_tabfind && abs(merged_vector.x()) > 100) {
378  v2->Print("Garbage result of merge?");
379  }
380  break;
381  }
382  }
383  }
384 }
385 
386 // Return true if this vector is the same side, overlaps, and close
387 // enough to the other to be merged.
388 bool TabVector::SimilarTo(const ICOORD& vertical,
389  const TabVector& other, BlobGrid* grid) const {
390  if ((IsRightTab() && other.IsRightTab()) ||
391  (IsLeftTab() && other.IsLeftTab())) {
392  // If they don't overlap, at least in extensions, then there is no chance.
393  if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0)
394  return false;
395  // A fast approximation to the scale factor of the sort_key_.
396  int v_scale = abs(vertical.y());
397  if (v_scale == 0)
398  v_scale = 1;
399  // If they are close enough, then OK.
400  if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ &&
401  sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_)
402  return true;
403  // Ragged tabs get a bigger threshold.
404  if (!IsRagged() || !other.IsRagged() ||
405  sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ ||
406  sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_)
407  return false;
408  if (grid == nullptr) {
409  // There is nothing else to test!
410  return true;
411  }
412  // If there is nothing in the rectangle between the vector that is going to
413  // move, and the place it is moving to, then they can be merged.
414  // Setup a vertical search for any blob.
415  const TabVector* mover = (IsRightTab() &&
416  sort_key_ < other.sort_key_) ? this : &other;
417  int top_y = mover->endpt_.y();
418  int bottom_y = mover->startpt_.y();
419  int left = std::min(mover->XAtY(top_y), mover->XAtY(bottom_y));
420  int right = std::max(mover->XAtY(top_y), mover->XAtY(bottom_y));
421  int shift = abs(sort_key_ - other.sort_key_) / v_scale;
422  if (IsRightTab()) {
423  right += shift;
424  } else {
425  left -= shift;
426  }
427 
429  vsearch.StartVerticalSearch(left, right, top_y);
430  BLOBNBOX* blob;
431  while ((blob = vsearch.NextVerticalSearch(true)) != nullptr) {
432  const TBOX& box = blob->bounding_box();
433  if (box.top() > bottom_y)
434  return true; // Nothing found.
435  if (box.bottom() < top_y)
436  continue; // Doesn't overlap.
437  int left_at_box = XAtY(box.bottom());
438  int right_at_box = left_at_box;
439  if (IsRightTab())
440  right_at_box += shift;
441  else
442  left_at_box -= shift;
443  if (std::min(right_at_box, static_cast<int>(box.right())) > std::max(left_at_box, static_cast<int>(box.left())))
444  return false;
445  }
446  return true; // Nothing found.
447  }
448  return false;
449 }
450 
451 // Eat the other TabVector into this and delete it.
452 void TabVector::MergeWith(const ICOORD& vertical, TabVector* other) {
453  extended_ymin_ = std::min(extended_ymin_, other->extended_ymin_);
454  extended_ymax_ = std::max(extended_ymax_, other->extended_ymax_);
455  if (other->IsRagged()) {
456  alignment_ = other->alignment_;
457  }
458  // Merge sort the two lists of boxes.
459  BLOBNBOX_C_IT it1(&boxes_);
460  BLOBNBOX_C_IT it2(&other->boxes_);
461  while (!it2.empty()) {
462  BLOBNBOX* bbox2 = it2.extract();
463  it2.forward();
464  TBOX box2 = bbox2->bounding_box();
465  BLOBNBOX* bbox1 = it1.data();
466  TBOX box1 = bbox1->bounding_box();
467  while (box1.bottom() < box2.bottom() && !it1.at_last()) {
468  it1.forward();
469  bbox1 = it1.data();
470  box1 = bbox1->bounding_box();
471  }
472  if (box1.bottom() < box2.bottom()) {
473  it1.add_to_end(bbox2);
474  } else if (bbox1 != bbox2) {
475  it1.add_before_stay_put(bbox2);
476  }
477  }
478  Fit(vertical, true);
479  other->Delete(this);
480 }
481 
482 // Add a new element to the list of partner TabVectors.
483 // Partners must be added in order of increasing y coordinate of the text line
484 // that makes them partners.
485 // Groups of identical partners are merged into one.
487  if (IsSeparator() || partner->IsSeparator())
488  return;
489  TabVector_C_IT it(&partners_);
490  if (!it.empty()) {
491  it.move_to_last();
492  if (it.data() == partner)
493  return;
494  }
495  it.add_after_then_move(partner);
496 }
497 
498 // Return true if other is a partner of this.
499 bool TabVector::IsAPartner(const TabVector* other) {
500  TabVector_C_IT it(&partners_);
501  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
502  if (it.data() == other)
503  return true;
504  }
505  return false;
506 }
507 
// Display names for tab alignments, indexed by alignment value.
// These names must be synced with the TabAlignment enum in tabvector.h.
static const char* const kAlignmentNames[] = {
    "Left Aligned", "Left Ragged",  "Center",
    "Right Aligned", "Right Ragged", "Separator"};
517 
518 // Print basic information about this tab vector.
519 void TabVector::Print(const char* prefix) {
520  tprintf(
521  "%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d,"
522  " partners=%d\n",
523  prefix, kAlignmentNames[alignment_], startpt_.x(), startpt_.y(),
524  endpt_.x(), endpt_.y(), mean_width_, percent_score_, sort_key_,
525  boxes_.length(), partners_.length());
526 }
527 
528 // Print basic information about this tab vector and every box in it.
529 void TabVector::Debug(const char* prefix) {
530  Print(prefix);
531  BLOBNBOX_C_IT it(&boxes_);
532  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
533  BLOBNBOX* bbox = it.data();
534  const TBOX& box = bbox->bounding_box();
535  tprintf("Box at (%d,%d)->(%d,%d)\n",
536  box.left(), box.bottom(), box.right(), box.top());
537  }
538 }
539 
540 // Draw this tabvector in place in the given window.
542 #ifndef GRAPHICS_DISABLED
544  tab_win->Pen(ScrollView::BLUE);
545  else if (alignment_ == TA_LEFT_ALIGNED)
546  tab_win->Pen(ScrollView::LIME_GREEN);
547  else if (alignment_ == TA_LEFT_RAGGED)
548  tab_win->Pen(ScrollView::DARK_GREEN);
549  else if (alignment_ == TA_RIGHT_ALIGNED)
550  tab_win->Pen(ScrollView::PINK);
551  else if (alignment_ == TA_RIGHT_RAGGED)
552  tab_win->Pen(ScrollView::CORAL);
553  else
554  tab_win->Pen(ScrollView::WHITE);
555  tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y());
556  tab_win->Pen(ScrollView::GREY);
557  tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_);
558  tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y());
559  char score_buf[64];
560  snprintf(score_buf, sizeof(score_buf), "%d", percent_score_);
561  tab_win->TextAttributes("Times", 50, false, false, false);
562  tab_win->Text(startpt_.x(), startpt_.y(), score_buf);
563 #endif
564 }
565 
566 // Refit the line and/or re-evaluate the vector if the dirty flags are set.
568  TabFind* finder) {
569  if (needs_refit_)
570  Fit(vertical, true);
571  if (needs_evaluation_)
572  Evaluate(vertical, finder);
573 }
574 
575 // Evaluate the vector in terms of coverage of its length by good-looking
576 // box edges. A good looking box is one where its nearest neighbour on the
577 // inside is nearer than half the distance its nearest neighbour on the
578 // outside of the putative column. Bad boxes are removed from the line.
579 // A second pass then further filters boxes by requiring that the gutter
580 // width be a minimum fraction of the mean gutter along the line.
581 void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
582  bool debug = false;
583  needs_evaluation_ = false;
584  int length = endpt_.y() - startpt_.y();
585  if (length == 0 || boxes_.empty()) {
586  percent_score_ = 0;
587  Print("Zero length in evaluate");
588  return;
589  }
590  // Compute the mean box height.
591  BLOBNBOX_C_IT it(&boxes_);
592  int mean_height = 0;
593  int height_count = 0;
594  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
595  BLOBNBOX* bbox = it.data();
596  const TBOX& box = bbox->bounding_box();
597  int height = box.height();
598  mean_height += height;
599  ++height_count;
600  }
601  if (height_count > 0) mean_height /= height_count;
602  int max_gutter = kGutterMultiple * mean_height;
603  if (IsRagged()) {
604  // Ragged edges face a tougher test in that the gap must always be within
605  // the height of the blob.
606  max_gutter = kGutterToNeighbourRatio * mean_height;
607  }
608 
609  STATS gutters(0, max_gutter + 1);
610  // Evaluate the boxes for their goodness, calculating the coverage as we go.
611  // Remove boxes that are not good and shorten the list to the first and
612  // last good boxes.
613  int num_deleted_boxes = 0;
614  bool text_on_image = false;
615  int good_length = 0;
616  const TBOX* prev_good_box = nullptr;
617  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
618  BLOBNBOX* bbox = it.data();
619  const TBOX& box = bbox->bounding_box();
620  int mid_y = (box.top() + box.bottom()) / 2;
621  if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) {
622  if (!debug) {
623  tprintf("After already deleting %d boxes, ", num_deleted_boxes);
624  Print("Starting evaluation");
625  }
626  debug = true;
627  }
628  // A good box is one where the nearest neighbour on the inside is closer
629  // than half the distance to the nearest neighbour on the outside
630  // (of the putative column).
631  bool left = IsLeftTab();
632  int tab_x = XAtY(mid_y);
633  int gutter_width;
634  int neighbour_gap;
635  finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
636  bbox, &gutter_width, &neighbour_gap);
637  if (debug) {
638  tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n",
639  box.left(), box.bottom(), box.right(), box.top(),
640  gutter_width, neighbour_gap);
641  }
642  // Now we can make the test.
643  if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) {
644  // A good box contributes its height to the good_length.
645  good_length += box.top() - box.bottom();
646  gutters.add(gutter_width, 1);
647  // Two good boxes together contribute the gap between them
648  // to the good_length as well, as long as the gap is not
649  // too big.
650  if (prev_good_box != nullptr) {
651  int vertical_gap = box.bottom() - prev_good_box->top();
652  double size1 = sqrt(static_cast<double>(prev_good_box->area()));
653  double size2 = sqrt(static_cast<double>(box.area()));
654  if (vertical_gap < kMaxFillinMultiple * std::min(size1, size2))
655  good_length += vertical_gap;
656  if (debug) {
657  tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n",
658  vertical_gap, kMaxFillinMultiple * std::min(size1, size2),
659  good_length);
660  }
661  } else {
662  // Adjust the start to the first good box.
663  SetYStart(box.bottom());
664  }
665  prev_good_box = &box;
666  if (bbox->flow() == BTFT_TEXT_ON_IMAGE)
667  text_on_image = true;
668  } else {
669  // Get rid of boxes that are not good.
670  if (debug) {
671  tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n",
672  box.left(), box.bottom(), box.right(), box.top(),
673  gutter_width, neighbour_gap);
674  }
675  it.extract();
676  ++num_deleted_boxes;
677  }
678  }
679  if (debug) {
680  Print("Evaluating:");
681  }
682  // If there are any good boxes, do it again, except this time get rid of
683  // boxes that have a gutter that is a small fraction of the mean gutter.
684  // This filters out ends that run into a coincidental gap in the text.
685  int search_top = endpt_.y();
686  int search_bottom = startpt_.y();
687  int median_gutter = IntCastRounded(gutters.median());
688  if (gutters.get_total() > 0) {
689  prev_good_box = nullptr;
690  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
691  BLOBNBOX* bbox = it.data();
692  const TBOX& box = bbox->bounding_box();
693  int mid_y = (box.top() + box.bottom()) / 2;
694  // A good box is one where the gutter width is at least some constant
695  // fraction of the mean gutter width.
696  bool left = IsLeftTab();
697  int tab_x = XAtY(mid_y);
698  int max_gutter = kGutterMultiple * mean_height;
699  if (IsRagged()) {
700  // Ragged edges face a tougher test in that the gap must always be
701  // within the height of the blob.
702  max_gutter = kGutterToNeighbourRatio * mean_height;
703  }
704  int gutter_width;
705  int neighbour_gap;
706  finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
707  bbox, &gutter_width, &neighbour_gap);
708  // Now we can make the test.
709  if (gutter_width >= median_gutter * kMinGutterFraction) {
710  if (prev_good_box == nullptr) {
711  // Adjust the start to the first good box.
712  SetYStart(box.bottom());
713  search_bottom = box.top();
714  }
715  prev_good_box = &box;
716  search_top = box.bottom();
717  } else {
718  // Get rid of boxes that are not good.
719  if (debug) {
720  tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n",
721  box.left(), box.bottom(), box.right(), box.top(),
722  gutter_width, median_gutter);
723  }
724  it.extract();
725  ++num_deleted_boxes;
726  }
727  }
728  }
729  // If there has been a good box, adjust the end.
730  if (prev_good_box != nullptr) {
731  SetYEnd(prev_good_box->top());
732  // Compute the percentage of the vector that is occupied by good boxes.
733  int length = endpt_.y() - startpt_.y();
734  percent_score_ = 100 * good_length / length;
735  if (num_deleted_boxes > 0) {
736  needs_refit_ = true;
737  FitAndEvaluateIfNeeded(vertical, finder);
738  if (boxes_.empty())
739  return;
740  }
741  // Test the gutter over the whole vector, instead of just at the boxes.
742  int required_shift;
743  if (search_bottom > search_top) {
744  search_bottom = startpt_.y();
745  search_top = endpt_.y();
746  }
747  double min_gutter_width = kLineCountReciprocal / boxes_.length();
748  min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter;
749  min_gutter_width *= mean_height;
750  int max_gutter_width = IntCastRounded(min_gutter_width) + 1;
751  if (median_gutter > max_gutter_width)
752  max_gutter_width = median_gutter;
753  int gutter_width = finder->GutterWidth(search_bottom, search_top, *this,
754  text_on_image, max_gutter_width,
755  &required_shift);
756  if (gutter_width < min_gutter_width) {
757  if (debug) {
758  tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n",
759  gutter_width, min_gutter_width);
760  }
761  boxes_.shallow_clear();
762  percent_score_ = 0;
763  } else if (debug) {
764  tprintf("Final gutter %d, vs limit of %g, required shift = %d\n",
765  gutter_width, min_gutter_width, required_shift);
766  }
767  } else {
768  // There are no good boxes left, so score is 0.
769  percent_score_ = 0;
770  }
771 
772  if (debug) {
773  Print("Evaluation complete:");
774  }
775 }
776 
777 // (Re)Fit a line to the stored points. Returns false if the line
778 // is degenerate. Althougth the TabVector code mostly doesn't care about the
779 // direction of lines, XAtY would give silly results for a horizontal line.
780 // The class is mostly aimed at use for vertical lines representing
781 // horizontal tab stops.
782 bool TabVector::Fit(ICOORD vertical, bool force_parallel) {
783  needs_refit_ = false;
784  if (boxes_.empty()) {
785  // Don't refit something with no boxes, as that only happens
786  // in Evaluate, and we don't want to end up with a zero vector.
787  if (!force_parallel)
788  return false;
789  // If we are forcing parallel, then we just need to set the sort_key_.
790  ICOORD midpt = startpt_;
791  midpt += endpt_;
792  midpt /= 2;
793  sort_key_ = SortKey(vertical, midpt.x(), midpt.y());
794  return startpt_.y() != endpt_.y();
795  }
796  if (!force_parallel && !IsRagged()) {
797  // Use a fitted line as the vertical.
798  DetLineFit linepoints;
799  BLOBNBOX_C_IT it(&boxes_);
800  // Fit a line to all the boxes in the list.
801  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
802  BLOBNBOX* bbox = it.data();
803  const TBOX& box = bbox->bounding_box();
804  int x1 = IsRightTab() ? box.right() : box.left();
805  ICOORD boxpt(x1, box.bottom());
806  linepoints.Add(boxpt);
807  if (it.at_last()) {
808  ICOORD top_pt(x1, box.top());
809  linepoints.Add(top_pt);
810  }
811  }
812  linepoints.Fit(&startpt_, &endpt_);
813  if (startpt_.y() != endpt_.y()) {
814  vertical = endpt_;
815  vertical -= startpt_;
816  }
817  }
818  int start_y = startpt_.y();
819  int end_y = endpt_.y();
820  sort_key_ = IsLeftTab() ? INT32_MAX : -INT32_MAX;
821  BLOBNBOX_C_IT it(&boxes_);
822  // Choose a line parallel to the vertical such that all boxes are on the
823  // correct side of it.
824  mean_width_ = 0;
825  int width_count = 0;
826  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
827  BLOBNBOX* bbox = it.data();
828  const TBOX& box = bbox->bounding_box();
829  mean_width_ += box.width();
830  ++width_count;
831  int x1 = IsRightTab() ? box.right() : box.left();
832  // Test both the bottom and the top, as one will be more extreme, depending
833  // on the direction of skew.
834  int bottom_y = box.bottom();
835  int top_y = box.top();
836  int key = SortKey(vertical, x1, bottom_y);
837  if (IsLeftTab() == (key < sort_key_)) {
838  sort_key_ = key;
839  startpt_ = ICOORD(x1, bottom_y);
840  }
841  key = SortKey(vertical, x1, top_y);
842  if (IsLeftTab() == (key < sort_key_)) {
843  sort_key_ = key;
844  startpt_ = ICOORD(x1, top_y);
845  }
846  if (it.at_first())
847  start_y = bottom_y;
848  if (it.at_last())
849  end_y = top_y;
850  }
851  if (width_count > 0) {
852  mean_width_ = (mean_width_ + width_count - 1) / width_count;
853  }
854  endpt_ = startpt_ + vertical;
855  needs_evaluation_ = true;
856  if (start_y != end_y) {
857  // Set the ends of the vector to fully include the first and last blobs.
858  startpt_.set_x(XAtY(vertical, sort_key_, start_y));
859  startpt_.set_y(start_y);
860  endpt_.set_x(XAtY(vertical, sort_key_, end_y));
861  endpt_.set_y(end_y);
862  return true;
863  }
864  return false;
865 }
866 
867 // Returns the singleton partner if there is one, or nullptr otherwise.
869  if (!partners_.singleton())
870  return nullptr;
871  TabVector_C_IT partner_it(&partners_);
872  TabVector* partner = partner_it.data();
873  return partner;
874 }
875 
876 // Return the partner of this TabVector if the vector qualifies as
877 // being a vertical text line, otherwise nullptr.
879  if (!partners_.singleton())
880  return nullptr;
881  TabVector_C_IT partner_it(&partners_);
882  TabVector* partner = partner_it.data();
883  BLOBNBOX_C_IT box_it1(&boxes_);
884  BLOBNBOX_C_IT box_it2(&partner->boxes_);
885  // Count how many boxes are also in the other list.
886  // At the same time, gather the mean width and median vertical gap.
887  if (textord_debug_tabfind > 1) {
888  Print("Testing for vertical text");
889  partner->Print(" partner");
890  }
891  int num_matched = 0;
892  int num_unmatched = 0;
893  int total_widths = 0;
894  int width = startpt().x() - partner->startpt().x();
895  if (width < 0)
896  width = -width;
897  STATS gaps(0, width * 2);
898  BLOBNBOX* prev_bbox = nullptr;
899  box_it2.mark_cycle_pt();
900  for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) {
901  BLOBNBOX* bbox = box_it1.data();
902  TBOX box = bbox->bounding_box();
903  if (prev_bbox != nullptr) {
904  gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1);
905  }
906  while (!box_it2.cycled_list() && box_it2.data() != bbox &&
907  box_it2.data()->bounding_box().bottom() < box.bottom()) {
908  box_it2.forward();
909  }
910  if (!box_it2.cycled_list() && box_it2.data() == bbox &&
911  bbox->region_type() >= BRT_UNKNOWN &&
912  (prev_bbox == nullptr || prev_bbox->region_type() >= BRT_UNKNOWN))
913  ++num_matched;
914  else
915  ++num_unmatched;
916  total_widths += box.width();
917  prev_bbox = bbox;
918  }
919  if (num_unmatched + num_matched == 0) return nullptr;
920  double avg_width = total_widths * 1.0 / (num_unmatched + num_matched);
921  double max_gap = textord_tabvector_vertical_gap_fraction * avg_width;
922  int min_box_match = static_cast<int>((num_matched + num_unmatched) *
924  bool is_vertical = (gaps.get_total() > 0 &&
925  num_matched >= min_box_match &&
926  gaps.median() <= max_gap);
927  if (textord_debug_tabfind > 1) {
928  tprintf("gaps=%d, matched=%d, unmatched=%d, min_match=%d "
929  "median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\n",
930  gaps.get_total(), num_matched, num_unmatched, min_box_match,
931  gaps.median(), avg_width, max_gap, is_vertical?"Yes":"No");
932  }
933  return (is_vertical) ? partner : nullptr;
934 }
935 
936 // The constructor is private.
938  TabAlignment alignment, BLOBNBOX_CLIST* boxes)
939  : extended_ymin_(extended_ymin), extended_ymax_(extended_ymax),
940  sort_key_(0), percent_score_(0), mean_width_(0),
941  needs_refit_(true), needs_evaluation_(true), alignment_(alignment),
942  top_constraints_(nullptr), bottom_constraints_(nullptr) {
943  BLOBNBOX_C_IT it(&boxes_);
944  it.add_list_after(boxes);
945 }
946 
947 // Delete this, but first, repoint all the partners to point to
948 // replacement. If replacement is nullptr, then partner relationships
949 // are removed.
950 void TabVector::Delete(TabVector* replacement) {
951  TabVector_C_IT it(&partners_);
952  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
953  TabVector* partner = it.data();
954  TabVector_C_IT p_it(&partner->partners_);
955  // If partner already has replacement in its list, then make
956  // replacement null, and just remove this TabVector when we find it.
957  TabVector* partner_replacement = replacement;
958  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
959  TabVector* p_partner = p_it.data();
960  if (p_partner == partner_replacement) {
961  partner_replacement = nullptr;
962  break;
963  }
964  }
965  // Remove all references to this, and replace with replacement if not nullptr.
966  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
967  TabVector* p_partner = p_it.data();
968  if (p_partner == this) {
969  p_it.extract();
970  if (partner_replacement != nullptr)
971  p_it.add_before_stay_put(partner_replacement);
972  }
973  }
974  if (partner_replacement != nullptr) {
975  partner_replacement->AddPartner(partner);
976  }
977  }
978  delete this;
979 }
980 
981 
982 } // namespace tesseract.
int IntCastRounded(double x)
Definition: helpers.h:175
int32_t area() const
Definition: rect.h:122
int16_t top() const
Definition: rect.h:58
static void MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2)
Definition: tabvector.cpp:92
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:652
void rotate(const FCOORD &vec)
Definition: points.h:536
Definition: rect.h:34
void FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:567
void Display(ScrollView *tab_win)
Definition: tabvector.cpp:541
Definition: points.h:188
int GutterWidth(int bottom_y, int top_y, const TabVector &v, bool ignore_unmergeables, int max_gutter_width, int *required_shift)
Definition: tabfind.cpp:161
void set_x(int16_t xin)
rewrite function
Definition: points.h:61
void Evaluate(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:581
const int kSimilarVectorDist
Definition: tabvector.cpp:39
#define double_VAR(name, val, comment)
Definition: params.h:312
bool IsLeftTab() const
Definition: tabvector.h:213
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75
static void ApplyConstraints(TabConstraint_LIST *constraints)
Definition: tabvector.cpp:116
bool SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const
Definition: tabvector.cpp:388
const double kMinGutterFraction
Definition: tabvector.cpp:46
integer coordinate
Definition: points.h:31
void Print(const char *prefix)
Definition: tabvector.cpp:519
const TBOX & bounding_box() const
Definition: blobbox.h:230
const int kSimilarRaggedDist
Definition: tabvector.cpp:42
int16_t y() const
access_function
Definition: points.h:56
void set_y(int16_t yin)
rewrite function
Definition: points.h:65
bool IsAPartner(const TabVector *other)
Definition: tabvector.cpp:499
int extended_ymax() const
Definition: tabvector.h:152
void set_top_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:164
BlobTextFlowType flow() const
Definition: blobbox.h:295
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
void set_bottom_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:167
double textord_tabvector_vertical_box_ratio
Definition: tabvector.cpp:58
int extended_ymin() const
Definition: tabvector.h:155
const int kGutterMultiple
Definition: tabvector.cpp:35
const ICOORD & botleft() const
Definition: rect.h:92
ICOORD topleft() const
Definition: rect.h:100
const double kMinRaggedGutter
Definition: tabvector.cpp:52
static int SortKey(const ICOORD &vertical, int x, int y)
Definition: tabvector.h:280
int16_t height() const
Definition: rect.h:108
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:51
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:635
void SetYEnd(int end_y)
Definition: tabvector.cpp:269
const int kMaxFillinMultiple
Definition: tabvector.cpp:44
static void MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors, BlobGrid *grid)
Definition: tabvector.cpp:355
const double kLineCountReciprocal
Definition: tabvector.cpp:48
#define ELISTIZE(CLASSNAME)
Definition: elst.h:955
bool textord_debug_printable
Definition: alignedblob.cpp:33
TabVector * GetSinglePartner()
Definition: tabvector.cpp:868
ICOORD botright() const
Definition: rect.h:96
int ExtendedOverlap(int top_y, int bottom_y) const
Definition: tabvector.h:208
const ICOORD & topright() const
Definition: rect.h:104
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:804
double textord_tabvector_vertical_gap_fraction
Definition: tabvector.cpp:55
const ICOORD & startpt() const
Definition: tabvector.h:146
void Debug(const char *prefix)
Definition: tabvector.cpp:529
void MergeWith(const ICOORD &vertical, TabVector *other)
Definition: tabvector.cpp:452
void Pen(Color color)
Definition: scrollview.cpp:719
int16_t x() const
access function
Definition: points.h:52
void SetYStart(int start_y)
Definition: tabvector.cpp:264
void ExtendToBox(BLOBNBOX *blob)
Definition: tabvector.cpp:240
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
static bool CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2)
Definition: tabvector.cpp:75
static void CreateConstraint(TabVector *vector, bool is_top)
Definition: tabvector.cpp:63
int textord_debug_tabfind
Definition: alignedblob.cpp:27
int16_t width() const
Definition: rect.h:115
void Rotate(const FCOORD &rotation)
Definition: tabvector.cpp:275
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
const ICOORD & endpt() const
Definition: tabvector.h:149
void AddPartner(TabVector *partner)
Definition: tabvector.cpp:486
BlobRegionType region_type() const
Definition: blobbox.h:283
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:790
int XAtY(int y) const
Definition: tabvector.h:189
TabVector * ShallowCopy() const
Definition: tabvector.cpp:227
int32_t get_total() const
Definition: statistc.h:84
int16_t left() const
Definition: rect.h:72
void GutterWidthAndNeighbourGap(int tab_x, int mean_height, int max_gutter, bool left, BLOBNBOX *bbox, int *gutter_width, int *neighbour_gap)
Definition: tabfind.cpp:208
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:532
void SetupPartnerConstraints()
Definition: tabvector.cpp:296
bool IsRagged() const
Definition: tabvector.h:229
double median() const
Definition: statistc.cpp:237
const double kMinAlignedGutter
Definition: tabvector.cpp:50
const int kGutterToNeighbourRatio
Definition: tabvector.cpp:37
#define ELIST2IZE(CLASSNAME)
Definition: elst2.h:959
Definition: statistc.h:31
static bool WithinTestRegion(int detail_level, int x, int y)
bool Fit(ICOORD vertical, bool force_parallel)
Definition: tabvector.cpp:782
bool IsSeparator() const
Definition: tabvector.h:221
bool IsRightTab() const
Definition: tabvector.h:217
TabVector * VerticalTextlinePartner()
Definition: tabvector.cpp:878