tesseract  4.1.0
OL_BUCKETS Class Reference

#include <edgblob.h>

Public Member Functions

 ~OL_BUCKETS ()=default
 
C_OUTLINE_LIST * start_scan ()
 
C_OUTLINE_LIST * scan_next ()
 
OL_BUCKETS::OL_BUCKETS

Construct an array of buckets for associating outlines into blobs.

 OL_BUCKETS (ICOORD bleft, ICOORD tright)
 
OL_BUCKETS::operator(

Return a pointer to a list of C_OUTLINEs corresponding to the given pixel coordinates.

C_OUTLINE_LIST * operator() (int16_t x, int16_t y)
 
OL_BUCKETS::count_children

Find number of descendants of this outline.

int32_t count_children (C_OUTLINE *outline, int32_t max_count)
 
OL_BUCKETS::outline_complexity

This is the new version of count_child.

The goal of this function is to determine if an outline and its interiors could be part of a character blob. This is done by computing a "complexity" index for the outline, which is the return value of this function, and checking it against a threshold. The max_count is used for short-circuiting the recursion and forcing a rejection that guarantees to fail the threshold test. The complexity F for outline X with N children X[i] is F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild so each layer of nesting increases complexity exponentially. An outline can be rejected as a text blob candidate if its complexity is too high, has too many children(likely a container), or has too many layers of nested inner loops. This has the side-effect of flattening out boxed or reversed video text regions.

int32_t outline_complexity (C_OUTLINE *outline, int32_t max_count, int16_t depth)
 
OL_BUCKETS::extract_children

Find number of descendants of this outline.

void extract_children (C_OUTLINE *outline, C_OUTLINE_IT *it)
 

Detailed Description

Definition at line 32 of file edgblob.h.

Constructor & Destructor Documentation

OL_BUCKETS::OL_BUCKETS ( ICOORD  bleft,
ICOORD  tright 
)

Definition at line 64 of file edgblob.cpp.

66  : bl(bleft), tr(tright) {
67  bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1;
68  bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1;
69  // make array
70  buckets.reset(new C_OUTLINE_LIST[bxdim * bydim]);
71  index = 0;
72 }
int16_t y() const
access_function
Definition: points.h:56
int16_t x() const
access function
Definition: points.h:52
#define BUCKETSIZE
Definition: edgblob.h:30
OL_BUCKETS::~OL_BUCKETS ( )
default

Member Function Documentation

int32_t OL_BUCKETS::count_children ( C_OUTLINE outline,
int32_t  max_count 
)

Definition at line 179 of file edgblob.cpp.

182  {
183  bool parent_box; // could it be boxy
184  int16_t xmin, xmax; // coord limits
185  int16_t ymin, ymax;
186  int16_t xindex, yindex; // current bucket
187  C_OUTLINE *child; // current child
188  int32_t child_count; // no of children
189  int32_t grandchild_count; // no of grandchildren
190  int32_t parent_area; // potential box
191  float max_parent_area; // potential box
192  int32_t child_area; // current child
193  int32_t child_length; // current child
194  TBOX olbox;
195  C_OUTLINE_IT child_it; // search iterator
196 
197  olbox = outline->bounding_box();
198  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
199  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
200  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
201  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
202  child_count = 0;
203  grandchild_count = 0;
204  parent_area = 0;
205  max_parent_area = 0;
206  parent_box = true;
207  for (yindex = ymin; yindex <= ymax; yindex++) {
208  for (xindex = xmin; xindex <= xmax; xindex++) {
209  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
210  if (child_it.empty())
211  continue;
212  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
213  child_it.forward()) {
214  child = child_it.data();
215  if (child != outline && *child < *outline) {
216  child_count++;
217  if (child_count <= max_count) {
218  int max_grand =(max_count - child_count) /
219  edges_children_per_grandchild;
220  if (max_grand > 0)
221  grandchild_count += count_children(child, max_grand) *
222  edges_children_per_grandchild;
223  else
224  grandchild_count += count_children(child, 1);
225  }
226  if (child_count + grandchild_count > max_count) {
227  if (edges_debug)
228  tprintf("Discarding parent with child count=%d, gc=%d\n",
229  child_count,grandchild_count);
230  return child_count + grandchild_count;
231  }
232  if (parent_area == 0) {
233  parent_area = outline->outer_area();
234  if (parent_area < 0)
235  parent_area = -parent_area;
236  max_parent_area = outline->bounding_box().area() * edges_boxarea;
237  if (parent_area < max_parent_area)
238  parent_box = false;
239  }
240  if (parent_box &&
241  (!edges_children_fix ||
242  child->bounding_box().height() > edges_min_nonhole)) {
243  child_area = child->outer_area();
244  if (child_area < 0)
245  child_area = -child_area;
246  if (edges_children_fix) {
247  if (parent_area - child_area < max_parent_area) {
248  parent_box = false;
249  continue;
250  }
251  if (grandchild_count > 0) {
252  if (edges_debug)
253  tprintf("Discarding parent of area %d, child area=%d, max%g "
254  "with gc=%d\n",
255  parent_area, child_area, max_parent_area,
256  grandchild_count);
257  return max_count + 1;
258  }
259  child_length = child->pathlength();
260  if (child_length * child_length >
261  child_area * edges_patharea_ratio) {
262  if (edges_debug)
263  tprintf("Discarding parent of area %d, child area=%d, max%g "
264  "with child length=%d\n",
265  parent_area, child_area, max_parent_area,
266  child_length);
267  return max_count + 1;
268  }
269  }
270  if (child_area < child->bounding_box().area() * edges_childarea) {
271  if (edges_debug)
272  tprintf("Discarding parent of area %d, child area=%d, max%g "
273  "with child rect=%d\n",
274  parent_area, child_area, max_parent_area,
275  child->bounding_box().area());
276  return max_count + 1;
277  }
278  }
279  }
280  }
281  }
282  }
283  return child_count + grandchild_count;
284 }
int32_t area() const
Definition: rect.h:122
int16_t top() const
Definition: rect.h:58
Definition: rect.h:34
const TBOX & bounding_box() const
Definition: coutln.h:113
int32_t count_children(C_OUTLINE *outline, int32_t max_count)
Definition: edgblob.cpp:179
int16_t y() const
access_function
Definition: points.h:56
int16_t height() const
Definition: rect.h:108
int32_t pathlength() const
Definition: coutln.h:135
int32_t outer_area() const
Definition: coutln.cpp:308
int16_t x() const
access function
Definition: points.h:52
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
int16_t left() const
Definition: rect.h:72
#define BUCKETSIZE
Definition: edgblob.h:30
void OL_BUCKETS::extract_children ( C_OUTLINE outline,
C_OUTLINE_IT *  it 
)

Definition at line 295 of file edgblob.cpp.

298  {
299  int16_t xmin, xmax; // coord limits
300  int16_t ymin, ymax;
301  int16_t xindex, yindex; // current bucket
302  TBOX olbox;
303  C_OUTLINE_IT child_it; // search iterator
304 
305  olbox = outline->bounding_box();
306  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
307  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
308  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
309  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
310  for (yindex = ymin; yindex <= ymax; yindex++) {
311  for (xindex = xmin; xindex <= xmax; xindex++) {
312  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
313  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
314  child_it.forward()) {
315  if (*child_it.data() < *outline) {
316  it->add_after_then_move(child_it.extract());
317  }
318  }
319  }
320  }
321 }
int16_t top() const
Definition: rect.h:58
Definition: rect.h:34
const TBOX & bounding_box() const
Definition: coutln.h:113
int16_t y() const
access_function
Definition: points.h:56
int16_t x() const
access function
Definition: points.h:52
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
int16_t left() const
Definition: rect.h:72
#define BUCKETSIZE
Definition: edgblob.h:30
C_OUTLINE_LIST * OL_BUCKETS::operator() ( int16_t  x,
int16_t  y 
)

Definition at line 83 of file edgblob.cpp.

85  {
86  return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
87 }
int16_t y() const
access_function
Definition: points.h:56
int16_t x() const
access function
Definition: points.h:52
#define BUCKETSIZE
Definition: edgblob.h:30
int32_t OL_BUCKETS::outline_complexity ( C_OUTLINE outline,
int32_t  max_count,
int16_t  depth 
)

Definition at line 110 of file edgblob.cpp.

114  {
115  int16_t xmin, xmax; // coord limits
116  int16_t ymin, ymax;
117  int16_t xindex, yindex; // current bucket
118  C_OUTLINE *child; // current child
119  int32_t child_count; // no of children
120  int32_t grandchild_count; // no of grandchildren
121  C_OUTLINE_IT child_it; // search iterator
122 
123  TBOX olbox = outline->bounding_box();
124  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
125  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
126  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
127  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
128  child_count = 0;
129  grandchild_count = 0;
130  if (++depth > edges_max_children_layers) // nested loops are too deep
131  return max_count + depth;
132 
133  for (yindex = ymin; yindex <= ymax; yindex++) {
134  for (xindex = xmin; xindex <= xmax; xindex++) {
135  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
136  if (child_it.empty())
137  continue;
138  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
139  child_it.forward()) {
140  child = child_it.data();
141  if (child == outline || !(*child < *outline))
142  continue;
143  child_count++;
144 
145  if (child_count > edges_max_children_per_outline) { // too fragmented
146  if (edges_debug)
147  tprintf("Discard outline on child_count=%d > "
148  "max_children_per_outline=%d\n",
149  child_count,
150  static_cast<int32_t>(edges_max_children_per_outline));
151  return max_count + child_count;
152  }
153 
154  // Compute the "complexity" of each child recursively
155  int32_t remaining_count = max_count - child_count - grandchild_count;
156  if (remaining_count > 0)
157  grandchild_count += edges_children_per_grandchild *
158  outline_complexity(child, remaining_count, depth);
159  if (child_count + grandchild_count > max_count) { // too complex
160  if (edges_debug)
161  tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
162  "> max_count=%d\n",
163  child_count, grandchild_count, max_count);
164  return child_count + grandchild_count;
165  }
166  }
167  }
168  }
169  return child_count + grandchild_count;
170 }
int16_t top() const
Definition: rect.h:58
Definition: rect.h:34
const TBOX & bounding_box() const
Definition: coutln.h:113
int16_t y() const
access_function
Definition: points.h:56
int32_t outline_complexity(C_OUTLINE *outline, int32_t max_count, int16_t depth)
Definition: edgblob.cpp:110
int16_t x() const
access function
Definition: points.h:52
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
int16_t left() const
Definition: rect.h:72
#define BUCKETSIZE
Definition: edgblob.h:30
C_OUTLINE_LIST* OL_BUCKETS::scan_next ( )
inline

Definition at line 51 of file edgblob.h.

51  {
52  for (; buckets[index].empty () && index < bxdim * bydim - 1; index++);
53  return &buckets[index];
54  }
C_OUTLINE_LIST* OL_BUCKETS::start_scan ( )
inline

Definition at line 45 of file edgblob.h.

45  {
46  for (index = 0; buckets[index].empty () && index < bxdim * bydim - 1;
47  index++);
48  return &buckets[index];
49  }

The documentation for this class was generated from the following files: