tesseract  4.1.0
blobbox.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: blobbox.h (Formerly blobnbox.h)
3  * Description: Code for the textord blob class.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef BLOBBOX_H
20 #define BLOBBOX_H
21 
22 #include <cinttypes> // for PRId32
23 #include <cmath> // for std::sqrt
24 #include <cstdint> // for int16_t, int32_t
25 #include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
26 #include "elst2.h" // for ELIST2_ITERATOR, ELIST2IZEH, ELIST2_LINK
27 #include "errcode.h" // for ASSERT_HOST
28 #include "ocrblock.h" // for BLOCK
29 #include "params.h" // for DoubleParam, double_VAR_H
30 #include "pdblock.h" // for PDBLK
31 #include "points.h" // for FCOORD, ICOORD, ICOORDELT_LIST
32 #include "quspline.h" // for QSPLINE
33 #include "rect.h" // for TBOX
34 #include "scrollview.h" // for ScrollView, ScrollView::Color
35 #include "statistc.h" // for STATS
36 #include "stepblob.h" // for C_BLOB
37 #include "tprintf.h" // for tprintf
38 #include "werd.h" // for WERD_LIST
39 
40 class C_OUTLINE;
41 
42 struct Pix;
43 
45 {
46  PITCH_DUNNO, // insufficient data
47  PITCH_DEF_FIXED, // definitely fixed
48  PITCH_MAYBE_FIXED, // could be
53 };
54 
55 // The possible tab-stop types of each side of a BLOBNBOX.
56 // The ordering is important, as it is used for deleting dead-ends in the
57 // search. ALIGNED, CONFIRMED and VLINE should remain greater than the
58 // non-aligned, unset, or deleted members.
59 enum TabType {
60  TT_NONE, // Not a tab. First (lowest) value.
61  TT_DELETED, // Not a tab after detailed analysis.
62  TT_MAYBE_RAGGED, // Initial designation of a (ragged) tab-stop candidate.
63  TT_MAYBE_ALIGNED, // Initial designation of an (aligned) tab-stop candidate.
64  TT_CONFIRMED, // Aligned with neighbours.
65  TT_VLINE // Detected as a vertical line. Last (highest) value.
66 };
67 
68 // The possible region types of a BLOBNBOX.
69 // Note: keep all the text types > BRT_UNKNOWN and all the image types less.
70 // Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
71 // *Type static functions below.
73  BRT_NOISE, // Neither text nor image.
74  BRT_HLINE, // Horizontal separator line.
75  BRT_VLINE, // Vertical separator line.
76  BRT_RECTIMAGE, // Rectangular image.
77  BRT_POLYIMAGE, // Non-rectangular image.
78  BRT_UNKNOWN, // Not determined yet.
79  BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented.
80  BRT_TEXT, // Convincing text.
81 
82  BRT_COUNT // Number of possibilities.
83 };
84 
85 // enum for elements of arrays that refer to neighbours.
86 // NOTE: keep in this order, so ^2 can be used to flip direction.
93 };
94 
95 // enum for special type of text characters, such as math symbol or italic.
97  BSTT_NONE, // No special.
98  BSTT_ITALIC, // Italic style.
99  BSTT_DIGIT, // Digit symbols.
100  BSTT_MATH, // Mathematical symbols (not including digit).
101  BSTT_UNCLEAR, // Characters with low recognition rate.
102  BSTT_SKIP, // Characters that we skip labeling (usually too small).
104 };
105 
107  return static_cast<BlobNeighbourDir>(dir ^ 2);
108 }
109 
110 // BlobTextFlowType indicates the quality of neighbouring information
111 // related to a chain of connected components, either horizontally or
112 // vertically. Also used by ColPartition for the collection of blobs
113 // within, which should all have the same value in most cases.
115  BTFT_NONE, // No text flow set yet.
116  BTFT_NONTEXT, // Flow too poor to be likely text.
117  BTFT_NEIGHBOURS, // Neighbours support flow in this direction.
118  BTFT_CHAIN, // There is a weak chain of text in this direction.
119  BTFT_STRONG_CHAIN, // There is a strong chain of text in this direction.
120  BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image.
121  BTFT_LEADER, // Leader dots/dashes etc.
123 };
124 
125 // Returns true if type1 dominates type2 in a merge. Mostly determined by the
126 // ordering of the enum, LEADER is weak and dominates nothing.
127 // The function is anti-symmetric (t1 > t2) === !(t2 > t1), except when
128 // t1 == t2, where that cannot hold: two LEADERs give false, otherwise true.
130  // LEADER always loses.
131  if (type1 == BTFT_LEADER) return false;
132  if (type2 == BTFT_LEADER) return true;
133  // With those out of the way, the ordering of the enum determines the result.
134  return type1 >= type2;
135 }
136 
137 namespace tesseract {
138 class ColPartition;
139 }
140 
141 class BLOBNBOX;
143 class BLOBNBOX:public ELIST_LINK
144 {
145  public:
148  }
149  explicit BLOBNBOX(C_BLOB *srcblob) {
150  box = srcblob->bounding_box();
152  cblob_ptr = srcblob;
153  area = static_cast<int>(srcblob->area());
154  }
156  if (owns_cblob_) delete cblob_ptr;
157  }
158  static BLOBNBOX* RealBlob(C_OUTLINE* outline) {
159  auto* blob = new C_BLOB(outline);
160  return new BLOBNBOX(blob);
161  }
162 
163  // Rotates the box and the underlying blob.
164  void rotate(FCOORD rotation);
165 
166  // Methods that act on the box without touching the underlying blob.
167  // Reflect the box in the y-axis, leaving the underlying blob untouched.
168  void reflect_box_in_y_axis();
169  // Rotates the box by the angle given by rotation.
170  // If the blob is a diacritic, then only small rotations for skew
171  // correction can be applied.
172  void rotate_box(FCOORD rotation);
173  // Moves just the box by the given vector.
175  if (IsDiacritic()) {
176  box.move(v);
177  base_char_top_ += v.y();
178  base_char_bottom_ += v.y();
179  } else {
180  box.move(v);
181  set_diacritic_box(box);
182  }
183  }
184  void merge(BLOBNBOX *nextblob);
185  void really_merge(BLOBNBOX* other);
186  void chop( // fake chop blob
187  BLOBNBOX_IT *start_it, // location of this
188  BLOBNBOX_IT *blob_it, // iterator
189  FCOORD rotation, // for landscape
190  float xheight); // line height
191 
192  void NeighbourGaps(int gaps[BND_COUNT]) const;
193  void MinMaxGapsClipped(int* h_min, int* h_max,
194  int* v_min, int* v_max) const;
195  void CleanNeighbours();
196  // Returns positive if there is at least one side neighbour that has a
197  // similar stroke width and is not on the other side of a rule line.
198  int GoodTextBlob() const;
199  // Returns the number of side neighbours that are of type BRT_NOISE.
200  int NoisyNeighbours() const;
201 
202  // Returns true if the blob is noise and has no owner.
203  bool DeletableNoise() const {
204  return owner() == nullptr && region_type() == BRT_NOISE;
205  }
206 
207  // Returns true, and sets vert_possible/horz_possible if the blob has some
208  // feature that makes it individually appear to flow one way.
209  // eg if it has a high aspect ratio, yet has a complex shape, such as a
210  // joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
211  bool DefiniteIndividualFlow();
212 
213  // Returns true if there is no tabstop violation in merging this and other.
214  bool ConfirmNoTabViolation(const BLOBNBOX& other) const;
215 
216  // Returns true if other has a similar stroke width to this.
217  bool MatchingStrokeWidth(const BLOBNBOX& other,
218  double fractional_tolerance,
219  double constant_tolerance) const;
220 
221  // Returns a bounding box of the outline contained within the
222  // given horizontal range.
223  TBOX BoundsWithinLimits(int left, int right);
224 
225  // Estimates and stores the baseline position based on the shape of the
226  // outline.
228 
229  // Simple accessors.
230  const TBOX& bounding_box() const {
231  return box;
232  }
233  // Set the bounding box. Use with caution.
234  // Normally use compute_bounding_box instead.
235  void set_bounding_box(const TBOX& new_box) {
236  box = new_box;
237  base_char_top_ = box.top();
238  base_char_bottom_ = box.bottom();
239  }
241  box = cblob_ptr->bounding_box();
242  base_char_top_ = box.top();
243  base_char_bottom_ = box.bottom();
244  baseline_y_ = box.bottom();
245  }
246  const TBOX& reduced_box() const {
247  return red_box;
248  }
249  void set_reduced_box(TBOX new_box) {
250  red_box = new_box;
251  reduced = true;
252  }
253  int32_t enclosed_area() const {
254  return area;
255  }
256  bool joined_to_prev() const {
257  return joined != 0;
258  }
259  bool red_box_set() const {
260  return reduced != 0;
261  }
262  int repeated_set() const {
263  return repeated_set_;
264  }
265  void set_repeated_set(int set_id) {
266  repeated_set_ = set_id;
267  }
268  C_BLOB *cblob() const {
269  return cblob_ptr;
270  }
272  return left_tab_type_;
273  }
274  void set_left_tab_type(TabType new_type) {
275  left_tab_type_ = new_type;
276  }
278  return right_tab_type_;
279  }
280  void set_right_tab_type(TabType new_type) {
281  right_tab_type_ = new_type;
282  }
284  return region_type_;
285  }
287  region_type_ = new_type;
288  }
290  return spt_type_;
291  }
293  spt_type_ = new_type;
294  }
296  return flow_;
297  }
299  flow_ = value;
300  }
301  bool vert_possible() const {
302  return vert_possible_;
303  }
304  void set_vert_possible(bool value) {
305  vert_possible_ = value;
306  }
307  bool horz_possible() const {
308  return horz_possible_;
309  }
310  void set_horz_possible(bool value) {
311  horz_possible_ = value;
312  }
313  int left_rule() const {
314  return left_rule_;
315  }
316  void set_left_rule(int new_left) {
317  left_rule_ = new_left;
318  }
319  int right_rule() const {
320  return right_rule_;
321  }
322  void set_right_rule(int new_right) {
323  right_rule_ = new_right;
324  }
325  int left_crossing_rule() const {
326  return left_crossing_rule_;
327  }
328  void set_left_crossing_rule(int new_left) {
329  left_crossing_rule_ = new_left;
330  }
331  int right_crossing_rule() const {
332  return right_crossing_rule_;
333  }
334  void set_right_crossing_rule(int new_right) {
335  right_crossing_rule_ = new_right;
336  }
337  float horz_stroke_width() const {
338  return horz_stroke_width_;
339  }
340  void set_horz_stroke_width(float width) {
341  horz_stroke_width_ = width;
342  }
343  float vert_stroke_width() const {
344  return vert_stroke_width_;
345  }
346  void set_vert_stroke_width(float width) {
347  vert_stroke_width_ = width;
348  }
349  float area_stroke_width() const {
350  return area_stroke_width_;
351  }
353  return owner_;
354  }
356  owner_ = new_owner;
357  }
358  bool leader_on_left() const {
359  return leader_on_left_;
360  }
361  void set_leader_on_left(bool flag) {
362  leader_on_left_ = flag;
363  }
364  bool leader_on_right() const {
365  return leader_on_right_;
366  }
367  void set_leader_on_right(bool flag) {
368  leader_on_right_ = flag;
369  }
371  return neighbours_[n];
372  }
374  return good_stroke_neighbours_[n];
375  }
377  neighbours_[n] = neighbour;
378  good_stroke_neighbours_[n] = good;
379  }
380  bool IsDiacritic() const {
381  return base_char_top_ != box.top() || base_char_bottom_ != box.bottom();
382  }
383  int base_char_top() const {
384  return base_char_top_;
385  }
386  int base_char_bottom() const {
387  return base_char_bottom_;
388  }
389  int baseline_position() const {
390  return baseline_y_;
391  }
392  int line_crossings() const {
393  return line_crossings_;
394  }
395  void set_line_crossings(int value) {
396  line_crossings_ = value;
397  }
398  void set_diacritic_box(const TBOX& diacritic_box) {
399  base_char_top_ = diacritic_box.top();
400  base_char_bottom_ = diacritic_box.bottom();
401  }
403  return base_char_blob_;
404  }
406  base_char_blob_ = blob;
407  }
408  void set_owns_cblob(bool value) { owns_cblob_ = value; }
409 
410  bool UniquelyVertical() const {
411  return vert_possible_ && !horz_possible_;
412  }
413  bool UniquelyHorizontal() const {
414  return horz_possible_ && !vert_possible_;
415  }
416 
417  // Returns true if the region type is text.
419  return type == BRT_TEXT || type == BRT_VERT_TEXT;
420  }
421  // Returns true if the region type is image.
423  return type == BRT_RECTIMAGE || type == BRT_POLYIMAGE;
424  }
425  // Returns true if the region type is line.
427  return type == BRT_HLINE || type == BRT_VLINE;
428  }
429  // Returns true if the region type cannot be merged.
431  return IsLineType(type) || IsImageType(type);
432  }
433  // Helper to call CleanNeighbours on all blobs on the list.
434  static void CleanNeighbours(BLOBNBOX_LIST* blobs);
435  // Helper to delete all the deletable blobs on the list.
436  static void DeleteNoiseBlobs(BLOBNBOX_LIST* blobs);
437  // Helper to compute edge offsets for all the blobs on the list.
438  // See coutln.h for an explanation of edge offsets.
439  static void ComputeEdgeOffsets(Pix* thresholds, Pix* grey,
440  BLOBNBOX_LIST* blobs);
441 
442 #ifndef GRAPHICS_DISABLED
443  // Helper to draw all the blobs on the list in the given body_colour,
444  // with child outlines in the child_colour.
445  static void PlotBlobs(BLOBNBOX_LIST* list,
446  ScrollView::Color body_colour,
447  ScrollView::Color child_colour,
448  ScrollView* win);
449  // Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
450  // given list in the given body_colour, with child outlines in the
451  // child_colour.
452  static void PlotNoiseBlobs(BLOBNBOX_LIST* list,
453  ScrollView::Color body_colour,
454  ScrollView::Color child_colour,
455  ScrollView* win);
456 
458  BlobTextFlowType flow_type);
459 
460  // Keep in sync with BlobRegionType.
461  ScrollView::Color BoxColor() const;
462 
463  void plot(ScrollView* window, // window to draw in
464  ScrollView::Color blob_colour, // for outer bits
465  ScrollView::Color child_colour); // for holes
466 #endif
467 
468  // Initializes the bulk of the members to default values for use at
469  // construction time.
471  cblob_ptr = nullptr;
472  owns_cblob_ = false;
473  area = 0;
474  area_stroke_width_ = 0.0f;
475  horz_stroke_width_ = 0.0f;
476  vert_stroke_width_ = 0.0f;
477  ReInit();
478  }
479  // Initializes members set by StrokeWidth and beyond, without discarding
480  // stored area and strokewidth values, which are expensive to calculate.
481  void ReInit() {
482  joined = false;
483  reduced = false;
484  repeated_set_ = 0;
485  left_tab_type_ = TT_NONE;
486  right_tab_type_ = TT_NONE;
487  region_type_ = BRT_UNKNOWN;
488  flow_ = BTFT_NONE;
489  spt_type_ = BSTT_SKIP;
490  left_rule_ = 0;
491  right_rule_ = 0;
492  left_crossing_rule_ = 0;
493  right_crossing_rule_ = 0;
494  if (area_stroke_width_ == 0.0f && area > 0 && cblob() != nullptr
495  && cblob()->perimeter()!=0)
496  area_stroke_width_ = 2.0f * area / cblob()->perimeter();
497  owner_ = nullptr;
498  base_char_top_ = box.top();
499  base_char_bottom_ = box.bottom();
500  baseline_y_ = box.bottom();
501  line_crossings_ = 0;
502  base_char_blob_ = nullptr;
503  horz_possible_ = false;
504  vert_possible_ = false;
505  leader_on_left_ = false;
506  leader_on_right_ = false;
507  ClearNeighbours();
508  }
509 
511  for (int n = 0; n < BND_COUNT; ++n) {
512  neighbours_[n] = nullptr;
513  good_stroke_neighbours_[n] = false;
514  }
515  }
516 
517  private:
518  C_BLOB *cblob_ptr; // edgestep blob
519  TBOX box; // bounding box
520  TBOX red_box; // bounding box
521  signed int area:30; // enclosed area
522  unsigned joined : 1; // joined to prev
523  unsigned reduced : 1; // reduced box set
524  int repeated_set_; // id of the set of repeated blobs
525  TabType left_tab_type_; // Indicates tab-stop assessment
526  TabType right_tab_type_; // Indicates tab-stop assessment
527  BlobRegionType region_type_; // Type of region this blob belongs to
528  BlobTextFlowType flow_; // Quality of text flow.
529  int16_t left_rule_; // x-coord of nearest but not crossing rule line
530  int16_t right_rule_; // x-coord of nearest but not crossing rule line
531  int16_t left_crossing_rule_; // x-coord of nearest or crossing rule line
532  int16_t right_crossing_rule_; // x-coord of nearest or crossing rule line
533  int16_t base_char_top_; // y-coord of top/bottom of diacritic base,
534  int16_t base_char_bottom_; // if it exists else top/bottom of this blob.
535  int16_t baseline_y_; // Estimate of baseline position.
536  int line_crossings_; // Number of line intersections touched.
537  BLOBNBOX* base_char_blob_; // The blob that was the base char.
538  float horz_stroke_width_; // Median horizontal stroke width
539  float vert_stroke_width_; // Median vertical stroke width
540  float area_stroke_width_; // Stroke width from area/perimeter ratio.
541  tesseract::ColPartition* owner_; // Who will delete me when I am not needed
542  BlobSpecialTextType spt_type_; // Special text type.
543  BLOBNBOX* neighbours_[BND_COUNT];
544  bool good_stroke_neighbours_[BND_COUNT];
545  bool horz_possible_; // Could be part of horizontal flow.
546  bool vert_possible_; // Could be part of vertical flow.
547  bool leader_on_left_; // There is a leader to the left.
548  bool leader_on_right_; // There is a leader to the right.
549  // Iff true, then the destructor should delete the cblob_ptr.
550  // TODO(rays) migrate all uses to correctly setting this flag instead of
551  // deleting the C_BLOB before deleting the BLOBNBOX.
552  bool owns_cblob_;
553 };
554 
// TO_ROW is an ELIST2_LINK list element that holds a list of BLOBNBOXes
// (blobs) together with the line-fit, pitch and spacing values stored for
// that row.
555 class TO_ROW: public ELIST2_LINK
556 {
557  public:
 // Weight applied to the fit error when set_parallel_line computes the
 // credibility of the row.
558  static const int kErrorWeight = 3;
559 
560  TO_ROW() {
561  clear();
562  } //empty row; members are reset by clear()
563  TO_ROW( //constructor
564  BLOBNBOX *blob, //from first blob
565  float top, //of row //target height
566  float bottom,
567  float row_size);
568 
569  void print() const;
570  float max_y() const { //access function: stored y_max
571  return y_max;
572  }
573  float min_y() const { //stored y_min
574  return y_min;
575  }
576  float mean_y() const { //midpoint of y_min and y_max
577  return (y_min + y_max) / 2.0f;
578  }
579  float initial_min_y() const {
580  return initial_y_min;
581  }
582  float line_m() const { //access to line fit: m as given to set_line
583  return m;
584  }
585  float line_c() const { //c as given to set_line
586  return c;
587  }
588  float line_error() const { //error as given to set_line
589  return error;
590  }
591  float parallel_c() const { //c as given to set_parallel_line
592  return para_c;
593  }
594  float parallel_error() const { //error as given to set_parallel_line
595  return para_error;
596  }
597  float believability() const { //baseline goodness (credibility)
598  return credibility;
599  }
600  float intercept() const { //real parallel_c (y_origin)
601  return y_origin;
602  }
603  void add_blob( //put in row
604  BLOBNBOX *blob, //blob to add
605  float top, //of row //target height
606  float bottom,
607  float row_size);
608  void insert_blob( //put in row in order
609  BLOBNBOX *blob);
610 
611  BLOBNBOX_LIST *blob_list() { //get list (pointer to the member, not a copy)
612  return &blobs;
613  }
614 
615  void set_line( //set line spec
616  float new_m, //line to set
617  float new_c,
618  float new_error) {
619  m = new_m;
620  c = new_c;
621  error = new_error;
622  }
 // Stores the constrained (fixed gradient) fit and derives two values from
 // it: credibility, which is the blob count reduced by kErrorWeight times
 // the fit error, and y_origin, which is new_c divided by
 // sqrt(1 + gradient^2).
623  void set_parallel_line( //set fixed gradient line
624  float gradient, //page gradient
625  float new_c,
626  float new_error) {
627  para_c = new_c;
628  para_error = new_error;
629  credibility = blobs.length() - kErrorWeight * new_error;
630  y_origin = new_c / std::sqrt(1 + gradient * gradient);
631  //real intercept
632  }
633  void set_limits( //set min,max
634  float new_min, //bottom and
635  float new_max) { //top of row
636  y_min = new_min;
637  y_max = new_max;
638  }
639  void compute_vertical_projection();
640  //get projection
641 
 // num_repeated_sets_ uses -1 as the "not searched yet" marker, so any
 // other value means the repeated characters have been marked.
642  bool rep_chars_marked() const {
643  return num_repeated_sets_ != -1;
644  }
646  num_repeated_sets_ = -1;
647  }
648  int num_repeated_sets() const {
649  return num_repeated_sets_;
650  }
651  void set_num_repeated_sets(int num_sets) {
652  num_repeated_sets_ = num_sets;
653  }
654 
655  // true when dead
656  bool merged;
657  bool all_caps; // had no ascenders
658  bool used_dm_model; // in guessing pitch
659  int16_t projection_left; // start of projection
660  int16_t projection_right; // end of projection (NOTE(review): was "start"; confirm)
661  PITCH_TYPE pitch_decision; // how strong is decision
662  float fixed_pitch; // pitch or 0
663  float fp_space; // sp if fixed pitch
664  float fp_nonsp; // nonsp if fixed pitch
665  float pr_space; // sp if prop
666  float pr_nonsp; // non sp if prop
667  float spacing; // to "next" row
668  float xheight; // of line
669  int xheight_evidence; // number of blobs of height xheight
670  float ascrise; // ascenders
671  float descdrop; // descenders
672  float body_size; // of CJK characters. Assumed to be
673  // xheight+ascrise for non-CJK text.
674  int32_t min_space; // min size for real space
675  int32_t max_nonspace; // max size of non-space
676  int32_t space_threshold; // space vs nonspace
677  float kern_size; // average non-space
678  float space_size; // average space
679  WERD_LIST rep_words; // repeated chars
680  ICOORDELT_LIST char_cells; // fixed pitch cells
681  QSPLINE baseline; // curved baseline
682  STATS projection; // vertical projection
683 
684  private:
685  void clear(); // clear all values to reasonable defaults
686 
687  BLOBNBOX_LIST blobs; //blobs in row
688  float y_min; //coords: set by set_limits
689  float y_max;
690  float initial_y_min;
691  float m, c; //line spec: set by set_line
692  float error; //line error
693  float para_c; //constrained fit: set by set_parallel_line
694  float para_error;
695  float y_origin; //rotated para_c;
696  float credibility; //baseline believability
697  int num_repeated_sets_; // number of sets of repeated blobs
698  // set to -1 if we have not searched
699  // for repeated blobs in this row yet
700 };
701 
// TO_BLOCK is an ELIST_LINK list element that refers to a BLOCK and keeps
// that block's blobs in several size-graded BLOBNBOX lists, along with the
// block-level pitch and spacing values and a list of TO_ROWs.
703 class TO_BLOCK:public ELIST_LINK
704 {
705  public:
706  TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
707  clear();
708  } //empty block; scalar members are reset by clear()
709  TO_BLOCK( //constructor
710  BLOCK *src_block); //real block
711  ~TO_BLOCK();
712 
713  void clear(); // clear all scalar members.
714 
715  TO_ROW_LIST *get_rows() { //access function (pointer to the member list)
716  return &row_list;
717  }
718 
719  // Rotate all the blobnbox lists and the underlying block. Then update the
720  // median size statistic from the blobs list.
 // Asserts (ASSERT_HOST) that the block has a non-null poly_block before
 // rotating it.
721  void rotate(const FCOORD& rotation) {
 // nullptr-terminated array so the loop below can walk all five lists.
722  BLOBNBOX_LIST* blobnbox_list[] = {&blobs, &underlines, &noise_blobs,
723  &small_blobs, &large_blobs, nullptr};
724  for (BLOBNBOX_LIST** list = blobnbox_list; *list != nullptr; ++list) {
725  BLOBNBOX_IT it(*list);
726  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
727  it.data()->rotate(rotation);
728  }
729  }
730  // Rotate the block
731  ASSERT_HOST(block->pdblk.poly_block() != nullptr);
732  block->rotate(rotation);
733  // Update the median size statistic from the blobs list.
 // Only the medium-size list (blobs) contributes; the STATS ranges are
 // taken from the rotated block's bounding box.
734  STATS widths(0, block->pdblk.bounding_box().width());
735  STATS heights(0, block->pdblk.bounding_box().height());
736  BLOBNBOX_IT blob_it(&blobs);
737  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
738  widths.add(blob_it.data()->bounding_box().width(), 1);
739  heights.add(blob_it.data()->bounding_box().height(), 1);
740  }
 // Adding 0.5 before the int cast rounds a non-negative median to nearest.
741  block->set_median_size(static_cast<int>(widths.median() + 0.5),
742  static_cast<int>(heights.median() + 0.5));
743  }
744 
 // Prints, via tprintf, one line per row giving its min_y, max_y,
 // parallel_c and the length of its blob list.
745  void print_rows() { //debug info
746  TO_ROW_IT row_it = &row_list;
747  TO_ROW *row;
748 
749  for (row_it.mark_cycle_pt(); !row_it.cycled_list();
750  row_it.forward()) {
751  row = row_it.data();
752  tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n",
753  row->min_y(), row->max_y(), row->parallel_c(),
754  row->blob_list()->length());
755  }
756  }
757 
758  // Reorganizes the blob lists with a different definition of small, medium
759  // and large, compared to the original definition.
760  // Height is still the primary filter key, but medium width blobs of small
761  // height become medium, and very wide blobs of small height stay small.
762  void ReSetAndReFilterBlobs();
763 
764  // Deletes noise blobs from all lists where not owned by a ColPartition.
765  void DeleteUnownedNoise();
766 
767  // Computes and stores the edge offsets on each blob for use in feature
768  // extraction, using greyscale if the supplied grey and thresholds pixes
769  // are 8-bit or otherwise (if nullptr or not 8 bit) the original binary
770  // edge step outlines.
771  // Thresholds must either be the same size as grey or an integer down-scale
772  // of grey.
773  // See coutln.h for an explanation of edge offsets.
774  void ComputeEdgeOffsets(Pix* thresholds, Pix* grey);
775 
776 #ifndef GRAPHICS_DISABLED
777  // Draw the noise blobs from all lists in red.
778  void plot_noise_blobs(ScrollView* to_win);
779  // Draw the blobs on the various lists in the block in different colors.
780  void plot_graded_blobs(ScrollView* to_win);
781 #endif
782 
783  BLOBNBOX_LIST blobs; //medium size
784  BLOBNBOX_LIST underlines; //underline blobs
785  BLOBNBOX_LIST noise_blobs; //very small
786  BLOBNBOX_LIST small_blobs; //fairly small
787  BLOBNBOX_LIST large_blobs; //big blobs
788  BLOCK *block; //real block
789  PITCH_TYPE pitch_decision; //how strong is decision
790  float line_spacing; //estimate
791  // line_size is a lower-bound estimate of the font size in pixels of
792  // the text in the block (with ascenders and descenders), being a small
793  // (1.25) multiple of the median height of filtered blobs.
794  // In most cases the font size will be bigger, but it will be closer
795  // if the text is allcaps, or in a no-x-height script.
796  float line_size; //estimate
797  float max_blob_size; //line assignment limit
798  float baseline_offset; //phase shift
799  float xheight; //median blob size
800  float fixed_pitch; //pitch or 0
801  float kern_size; //average non-space
802  float space_size; //average space
803  int32_t min_space; //min definite space
804  int32_t max_nonspace; //max definite
805  float fp_space; //sp if fixed pitch
806  float fp_nonsp; //nonsp if fixed pitch
807  float pr_space; //sp if prop
808  float pr_nonsp; //non sp if prop
809  TO_ROW *key_row; //starting row
810 
811  private:
812  TO_ROW_LIST row_list; //temporary rows
813 };
814 
817 "Weighting for error in believability");
818 void find_cblob_limits( //get y limits
819  C_BLOB *blob, //blob to search
820  float leftx, //x limits (left)
821  float rightx, //x limits (right)
822  FCOORD rotation, //for landscape
823  float &ymin, //output y limits
824  float &ymax);
825 void find_cblob_vlimits( //get y limits
826  C_BLOB *blob, //blob to search
827  float leftx, //x limits (left)
828  float rightx, //x limits (right)
829  float &ymin, //output y limits
830  float &ymax);
831 void find_cblob_hlimits( //get x limits
832  C_BLOB *blob, //blob to search
833  float bottomy, //y limits (bottom)
834  float topy, //y limits (top)
835  float &xmin, //output x limits
836  float &xymax);
837 C_BLOB *crotate_cblob( //rotate it
838  C_BLOB *blob, //blob to search
839  FCOORD rotation //for landscape
840  );
841 TBOX box_next( //get bounding box
842  BLOBNBOX_IT *it //iterator to blobs
843  );
844 TBOX box_next_pre_chopped( //get bounding box
845  BLOBNBOX_IT *it //iterator to blobs
846  );
847 void vertical_cblob_projection( //project outlines
848  C_BLOB *blob, //blob to project
849  STATS *stats //output
850  );
851 void vertical_coutline_projection( //project outlines
852  C_OUTLINE *outline, //outline to project
853  STATS *stats //output
854  );
855 #ifndef GRAPHICS_DISABLED
856 void plot_blob_list(ScrollView* win, // window to draw in
857  BLOBNBOX_LIST *list, // blob list
858  ScrollView::Color body_colour, // colour to draw
859  ScrollView::Color child_colour); // colour of child
860 #endif // GRAPHICS_DISABLED
861 #endif
void set_owns_cblob(bool value)
Definition: blobbox.h:408
void set_parallel_line(float gradient, float new_c, float new_error)
Definition: blobbox.h:623
bool IsDiacritic() const
Definition: blobbox.h:380
void set_leader_on_right(bool flag)
Definition: blobbox.h:367
void reflect_box_in_y_axis()
Definition: blobbox.cpp:62
bool used_dm_model
Definition: blobbox.h:658
float min_y() const
Definition: blobbox.h:573
TabType right_tab_type() const
Definition: blobbox.h:277
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:665
int16_t top() const
Definition: rect.h:58
PITCH_TYPE pitch_decision
Definition: blobbox.h:661
float xheight
Definition: blobbox.h:668
void set_vert_possible(bool value)
Definition: blobbox.h:304
float pr_space
Definition: blobbox.h:665
TO_BLOCK()
Definition: blobbox.h:706
void set_right_tab_type(TabType new_type)
Definition: blobbox.h:280
void set_base_char_blob(BLOBNBOX *blob)
Definition: blobbox.h:405
int left_rule() const
Definition: blobbox.h:313
void find_cblob_vlimits(C_BLOB *blob, float leftx, float rightx, float &ymin, float &ymax)
Definition: blobbox.cpp:539
bool all_caps
Definition: blobbox.h:657
Definition: rect.h:34
int32_t perimeter()
Definition: stepblob.cpp:292
float descdrop
Definition: blobbox.h:671
int line_crossings() const
Definition: blobbox.h:392
void ConstructionInit()
Definition: blobbox.h:470
int16_t projection_right
Definition: blobbox.h:660
WERD_LIST rep_words
Definition: blobbox.h:679
void CleanNeighbours()
Definition: blobbox.cpp:214
void set_right_crossing_rule(int new_right)
Definition: blobbox.h:334
float fp_space
Definition: blobbox.h:663
int32_t max_nonspace
Definition: blobbox.h:804
Definition: points.h:188
QSPLINE baseline
Definition: blobbox.h:681
float fp_nonsp
Definition: blobbox.h:806
static BLOBNBOX * RealBlob(C_OUTLINE *outline)
Definition: blobbox.h:158
int num_repeated_sets() const
Definition: blobbox.h:648
BlobNeighbourDir
Definition: blobbox.h:87
C_BLOB * cblob() const
Definition: blobbox.h:268
void vertical_coutline_projection(C_OUTLINE *outline, STATS *stats)
Definition: blobbox.cpp:888
tesseract::ColPartition * owner() const
Definition: blobbox.h:352
static void PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour, ScrollView *win)
Definition: blobbox.cpp:419
float line_size
Definition: blobbox.h:796
PITCH_TYPE
Definition: blobbox.h:44
float baseline_offset
Definition: blobbox.h:798
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good)
Definition: blobbox.h:376
BLOBNBOX_LIST underlines
Definition: blobbox.h:784
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:444
void set_line(float new_m, float new_c, float new_error)
Definition: blobbox.h:615
int GoodTextBlob() const
Definition: blobbox.cpp:226
void set_diacritic_box(const TBOX &diacritic_box)
Definition: blobbox.h:398
TBOX BoundsWithinLimits(int left, int right)
Definition: blobbox.cpp:333
bool DeletableNoise() const
Definition: blobbox.h:203
void ReInit()
Definition: blobbox.h:481
void set_horz_stroke_width(float width)
Definition: blobbox.h:340
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:785
integer coordinate
Definition: points.h:31
const TBOX & bounding_box() const
Definition: blobbox.h:230
int16_t y() const
access_function
Definition: points.h:56
int base_char_bottom() const
Definition: blobbox.h:386
float max_y() const
Definition: blobbox.h:570
PITCH_TYPE pitch_decision
Definition: blobbox.h:789
float spacing
Definition: blobbox.h:667
float pr_nonsp
Definition: blobbox.h:808
TBOX bounding_box() const
Definition: stepblob.cpp:253
BlobTextFlowType flow() const
Definition: blobbox.h:295
#define double_VAR_H(name, val, comment)
Definition: params.h:301
void set_left_rule(int new_left)
Definition: blobbox.h:316
void find_cblob_limits(C_BLOB *blob, float leftx, float rightx, FCOORD rotation, float &ymin, float &ymax)
Definition: blobbox.cpp:499
void set_line_crossings(int value)
Definition: blobbox.h:395
int32_t min_space
Definition: blobbox.h:803
void set_horz_possible(bool value)
Definition: blobbox.h:310
int baseline_position() const
Definition: blobbox.h:389
void rotate(FCOORD rotation)
Definition: blobbox.cpp:55
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:786
static bool IsLineType(BlobRegionType type)
Definition: blobbox.h:426
BLOCK * block
Definition: blobbox.h:788
bool rep_chars_marked() const
Definition: blobbox.h:642
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const
Definition: blobbox.cpp:200
int NoisyNeighbours() const
Definition: blobbox.cpp:237
int16_t projection_left
Definition: blobbox.h:659
BLOBNBOX()
Definition: blobbox.h:146
float horz_stroke_width() const
Definition: blobbox.h:337
int32_t min_space
Definition: blobbox.h:674
ScrollView::Color BoxColor() const
Definition: blobbox.cpp:481
int32_t space_threshold
Definition: blobbox.h:676
void really_merge(BLOBNBOX *other)
Definition: blobbox.cpp:103
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:636
static void DeleteNoiseBlobs(BLOBNBOX_LIST *blobs)
Definition: blobbox.cpp:372
float line_m() const
Definition: blobbox.h:582
float fixed_pitch
Definition: blobbox.h:662
void compute_bounding_box()
Definition: blobbox.h:240
void set_left_crossing_rule(int new_left)
Definition: blobbox.h:328
bool merged
Definition: blobbox.h:656
int right_rule() const
Definition: blobbox.h:319
float line_spacing
Definition: blobbox.h:790
bool leader_on_right() const
Definition: blobbox.h:364
void merge(BLOBNBOX *nextblob)
Definition: blobbox.cpp:92
bool ConfirmNoTabViolation(const BLOBNBOX &other) const
Definition: blobbox.cpp:292
void vertical_cblob_projection(C_BLOB *blob, STATS *stats)
Definition: blobbox.cpp:868
void set_limits(float new_min, float new_max)
Definition: blobbox.h:633
bool red_box_set() const
Definition: blobbox.h:259
ScrollView * to_win
Definition: drawtord.cpp:35
void set_bounding_box(const TBOX &new_box)
Definition: blobbox.h:235
float max_blob_size
Definition: blobbox.h:797
float fp_space
Definition: blobbox.h:805
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance, double constant_tolerance) const
Definition: blobbox.cpp:305
float fixed_pitch
Definition: blobbox.h:800
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:355
int xheight_evidence
Definition: blobbox.h:669
float space_size
Definition: blobbox.h:678
bool vert_possible() const
Definition: blobbox.h:301
float intercept() const
Definition: blobbox.h:600
void rotate_box(FCOORD rotation)
Definition: blobbox.cpp:71
TabType left_tab_type() const
Definition: blobbox.h:271
~BLOBNBOX()
Definition: blobbox.h:155
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
float parallel_c() const
Definition: blobbox.h:591
double textord_error_weight
TO_ROW_LIST * get_rows()
Definition: blobbox.h:715
BlobRegionType
Definition: blobbox.h:72
void ClearNeighbours()
Definition: blobbox.h:510
BLOBNBOX * base_char_blob() const
Definition: blobbox.h:402
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:430
static bool IsImageType(BlobRegionType type)
Definition: blobbox.h:422
int32_t enclosed_area() const
Definition: blobbox.h:253
const TBOX & reduced_box() const
Definition: blobbox.h:246
void rotate(const FCOORD &rotation)
Definition: blobbox.h:721
float fp_nonsp
Definition: blobbox.h:664
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xymax)
Definition: blobbox.cpp:576
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:942
float pr_nonsp
Definition: blobbox.h:666
int16_t bottom() const
Definition: rect.h:65
bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2)
Definition: blobbox.h:129
BlobRegionType region_type() const
Definition: blobbox.h:283
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:370
static bool IsTextType(BlobRegionType type)
Definition: blobbox.h:418
BlobTextFlowType
Definition: blobbox.h:114
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:611
void plot_blob_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour)
Definition: blobbox.cpp:1086
void set_vert_stroke_width(float width)
Definition: blobbox.h:346
float pr_space
Definition: blobbox.h:807
TabType
Definition: blobbox.h:59
void set_reduced_box(TBOX new_box)
Definition: blobbox.h:249
float line_c() const
Definition: blobbox.h:585
#define ELIST2IZEH(CLASSNAME)
Definition: elst2.h:947
TO_ROW * key_row
Definition: blobbox.h:809
#define ASSERT_HOST(x)
Definition: errcode.h:88
bool DefiniteIndividualFlow()
Definition: blobbox.cpp:252
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:286
BLOBNBOX_LIST blobs
Definition: blobbox.h:783
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:289
bool horz_possible() const
Definition: blobbox.h:307
void EstimateBaselinePosition()
Definition: blobbox.cpp:357
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:298
float believability() const
Definition: blobbox.h:597
float parallel_error() const
Definition: blobbox.h:594
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:787
int right_crossing_rule() const
Definition: blobbox.h:331
STATS projection
Definition: blobbox.h:682
void clear_rep_chars_marked()
Definition: blobbox.h:645
void set_leader_on_left(bool flag)
Definition: blobbox.h:361
void set_right_rule(int new_right)
Definition: blobbox.h:322
float ascrise
Definition: blobbox.h:670
int32_t max_nonspace
Definition: blobbox.h:675
void chop(BLOBNBOX_IT *start_it, BLOBNBOX_IT *blob_it, FCOORD rotation, float xheight)
Definition: blobbox.cpp:120
TO_ROW()
Definition: blobbox.h:560
bool UniquelyHorizontal() const
Definition: blobbox.h:413
void set_left_tab_type(TabType new_type)
Definition: blobbox.h:274
BLOBNBOX(C_BLOB *srcblob)
Definition: blobbox.h:149
C_BLOB * crotate_cblob(C_BLOB *blob, FCOORD rotation)
Definition: blobbox.cpp:611
static void ComputeEdgeOffsets(Pix *thresholds, Pix *grey, BLOBNBOX_LIST *blobs)
Definition: blobbox.cpp:385
int repeated_set() const
Definition: blobbox.h:262
float mean_y() const
Definition: blobbox.h:576
void translate_box(ICOORD v)
Definition: blobbox.h:174
bool UniquelyVertical() const
Definition: blobbox.h:410
float area_stroke_width() const
Definition: blobbox.h:349
float initial_min_y() const
Definition: blobbox.h:579
void plot(ScrollView *window, ScrollView::Color blob_colour, ScrollView::Color child_colour)
Definition: blobbox.cpp:485
float kern_size
Definition: blobbox.h:677
void set_repeated_set(int set_id)
Definition: blobbox.h:265
Definition: statistc.h:31
void set_special_text_type(BlobSpecialTextType new_type)
Definition: blobbox.h:292
float body_size
Definition: blobbox.h:672
float kern_size
Definition: blobbox.h:801
float line_error() const
Definition: blobbox.h:588
static void PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour, ScrollView *win)
Definition: blobbox.cpp:432
ICOORDELT_LIST char_cells
Definition: blobbox.h:680
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:106
BlobSpecialTextType
Definition: blobbox.h:96
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:181
float space_size
Definition: blobbox.h:802
float vert_stroke_width() const
Definition: blobbox.h:343
int left_crossing_rule() const
Definition: blobbox.h:325
int base_char_top() const
Definition: blobbox.h:383
bool joined_to_prev() const
Definition: blobbox.h:256
Definition: ocrblock.h:29
bool leader_on_left() const
Definition: blobbox.h:358
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:373
void print_rows()
Definition: blobbox.h:745
float xheight
Definition: blobbox.h:799
void set_num_repeated_sets(int num_sets)
Definition: blobbox.h:651
int32_t area()
Definition: stepblob.cpp:273