tesseract  4.1.0
edgblob.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: edgblob.cpp (Formerly edgeloop.c)
3  * Description: Functions to clean up an outline before approximation.
4  * Author: Ray Smith
5  *
6  *(C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0(the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "scanedg.h"
20 #include "drawedg.h"
21 #include "edgloop.h"
22 #include "edgblob.h"
23 
24 // Include automatically generated configuration file if running autoconf.
25 #ifdef HAVE_CONFIG_H
26 #include "config_auto.h"
27 #endif
28 
29 // Control parameters used in outline_complexity(), which rejects an outline
30 // if any one of the 3 conditions is satisfied:
31 // - number of children exceeds edges_max_children_per_outline
32 // - number of nested layers exceeds edges_max_children_layers
33 // - joint complexity exceeds edges_children_count_limit(as in child_count())
34 static BOOL_VAR(edges_use_new_outline_complexity, false,
35  "Use the new outline complexity module");
36 static INT_VAR(edges_max_children_per_outline, 10,
37  "Max number of children inside a character outline");
38 static INT_VAR(edges_max_children_layers, 5,
39  "Max layers of nested children inside a character outline");
40 static BOOL_VAR(edges_debug, false,
41  "turn on debugging for this module");
42 
43 static INT_VAR(edges_children_per_grandchild, 10,
44  "Importance ratio for chucking outlines");
45 static INT_VAR(edges_children_count_limit, 45,
46  "Max holes allowed in blob");
47 static BOOL_VAR(edges_children_fix, false,
48  "Remove boxy parents of char-like children");
49 static INT_VAR(edges_min_nonhole, 12,
50  "Min pixels for potential char in box");
51 static INT_VAR(edges_patharea_ratio, 40,
52  "Max lensq/area for acceptable child outline");
53 static double_VAR(edges_childarea, 0.5,
54  "Min area fraction of child outline");
55 static double_VAR(edges_boxarea, 0.875,
56  "Min area fraction of grandchild for box");
57 
65 ICOORD bleft, // corners
66 ICOORD tright): bl(bleft), tr(tright) {
67  bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1;
68  bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1;
69  // make array
70  buckets.reset(new C_OUTLINE_LIST[bxdim * bydim]);
71  index = 0;
72 }
73 
74 
82 C_OUTLINE_LIST *
83 OL_BUCKETS::operator()( // array access
84 int16_t x, // image coords
85 int16_t y) {
86  return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
87 }
88 
89 
111  C_OUTLINE *outline, // parent outline
112  int32_t max_count, // max output
113  int16_t depth // recurion depth
114  ) {
115  int16_t xmin, xmax; // coord limits
116  int16_t ymin, ymax;
117  int16_t xindex, yindex; // current bucket
118  C_OUTLINE *child; // current child
119  int32_t child_count; // no of children
120  int32_t grandchild_count; // no of grandchildren
121  C_OUTLINE_IT child_it; // search iterator
122 
123  TBOX olbox = outline->bounding_box();
124  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
125  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
126  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
127  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
128  child_count = 0;
129  grandchild_count = 0;
130  if (++depth > edges_max_children_layers) // nested loops are too deep
131  return max_count + depth;
132 
133  for (yindex = ymin; yindex <= ymax; yindex++) {
134  for (xindex = xmin; xindex <= xmax; xindex++) {
135  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
136  if (child_it.empty())
137  continue;
138  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
139  child_it.forward()) {
140  child = child_it.data();
141  if (child == outline || !(*child < *outline))
142  continue;
143  child_count++;
144 
145  if (child_count > edges_max_children_per_outline) { // too fragmented
146  if (edges_debug)
147  tprintf("Discard outline on child_count=%d > "
148  "max_children_per_outline=%d\n",
149  child_count,
150  static_cast<int32_t>(edges_max_children_per_outline));
151  return max_count + child_count;
152  }
153 
154  // Compute the "complexity" of each child recursively
155  int32_t remaining_count = max_count - child_count - grandchild_count;
156  if (remaining_count > 0)
157  grandchild_count += edges_children_per_grandchild *
158  outline_complexity(child, remaining_count, depth);
159  if (child_count + grandchild_count > max_count) { // too complex
160  if (edges_debug)
161  tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
162  "> max_count=%d\n",
163  child_count, grandchild_count, max_count);
164  return child_count + grandchild_count;
165  }
166  }
167  }
168  }
169  return child_count + grandchild_count;
170 }
171 
172 
178 // TODO(rays) Merge with outline_complexity.
179 int32_t OL_BUCKETS::count_children( // recursive count
180  C_OUTLINE *outline, // parent outline
181  int32_t max_count // max output
182  ) {
183  bool parent_box; // could it be boxy
184  int16_t xmin, xmax; // coord limits
185  int16_t ymin, ymax;
186  int16_t xindex, yindex; // current bucket
187  C_OUTLINE *child; // current child
188  int32_t child_count; // no of children
189  int32_t grandchild_count; // no of grandchildren
190  int32_t parent_area; // potential box
191  float max_parent_area; // potential box
192  int32_t child_area; // current child
193  int32_t child_length; // current child
194  TBOX olbox;
195  C_OUTLINE_IT child_it; // search iterator
196 
197  olbox = outline->bounding_box();
198  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
199  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
200  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
201  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
202  child_count = 0;
203  grandchild_count = 0;
204  parent_area = 0;
205  max_parent_area = 0;
206  parent_box = true;
207  for (yindex = ymin; yindex <= ymax; yindex++) {
208  for (xindex = xmin; xindex <= xmax; xindex++) {
209  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
210  if (child_it.empty())
211  continue;
212  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
213  child_it.forward()) {
214  child = child_it.data();
215  if (child != outline && *child < *outline) {
216  child_count++;
217  if (child_count <= max_count) {
218  int max_grand =(max_count - child_count) /
219  edges_children_per_grandchild;
220  if (max_grand > 0)
221  grandchild_count += count_children(child, max_grand) *
222  edges_children_per_grandchild;
223  else
224  grandchild_count += count_children(child, 1);
225  }
226  if (child_count + grandchild_count > max_count) {
227  if (edges_debug)
228  tprintf("Discarding parent with child count=%d, gc=%d\n",
229  child_count,grandchild_count);
230  return child_count + grandchild_count;
231  }
232  if (parent_area == 0) {
233  parent_area = outline->outer_area();
234  if (parent_area < 0)
235  parent_area = -parent_area;
236  max_parent_area = outline->bounding_box().area() * edges_boxarea;
237  if (parent_area < max_parent_area)
238  parent_box = false;
239  }
240  if (parent_box &&
241  (!edges_children_fix ||
242  child->bounding_box().height() > edges_min_nonhole)) {
243  child_area = child->outer_area();
244  if (child_area < 0)
245  child_area = -child_area;
246  if (edges_children_fix) {
247  if (parent_area - child_area < max_parent_area) {
248  parent_box = false;
249  continue;
250  }
251  if (grandchild_count > 0) {
252  if (edges_debug)
253  tprintf("Discarding parent of area %d, child area=%d, max%g "
254  "with gc=%d\n",
255  parent_area, child_area, max_parent_area,
256  grandchild_count);
257  return max_count + 1;
258  }
259  child_length = child->pathlength();
260  if (child_length * child_length >
261  child_area * edges_patharea_ratio) {
262  if (edges_debug)
263  tprintf("Discarding parent of area %d, child area=%d, max%g "
264  "with child length=%d\n",
265  parent_area, child_area, max_parent_area,
266  child_length);
267  return max_count + 1;
268  }
269  }
270  if (child_area < child->bounding_box().area() * edges_childarea) {
271  if (edges_debug)
272  tprintf("Discarding parent of area %d, child area=%d, max%g "
273  "with child rect=%d\n",
274  parent_area, child_area, max_parent_area,
275  child->bounding_box().area());
276  return max_count + 1;
277  }
278  }
279  }
280  }
281  }
282  }
283  return child_count + grandchild_count;
284 }
285 
286 
287 
288 
295 void OL_BUCKETS::extract_children( // recursive count
296  C_OUTLINE *outline, // parent outline
297  C_OUTLINE_IT *it // destination iterator
298  ) {
299  int16_t xmin, xmax; // coord limits
300  int16_t ymin, ymax;
301  int16_t xindex, yindex; // current bucket
302  TBOX olbox;
303  C_OUTLINE_IT child_it; // search iterator
304 
305  olbox = outline->bounding_box();
306  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
307  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
308  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
309  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
310  for (yindex = ymin; yindex <= ymax; yindex++) {
311  for (xindex = xmin; xindex <= xmax; xindex++) {
312  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
313  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
314  child_it.forward()) {
315  if (*child_it.data() < *outline) {
316  it->add_after_then_move(child_it.extract());
317  }
318  }
319  }
320  }
321 }
322 
323 
330 void extract_edges(Pix* pix, // thresholded image
331  BLOCK *block) { // block to scan
332  C_OUTLINE_LIST outlines; // outlines in block
333  C_OUTLINE_IT out_it = &outlines;
334 
335  block_edges(pix, &(block->pdblk), &out_it);
336  ICOORD bleft; // block box
337  ICOORD tright;
338  block->pdblk.bounding_box(bleft, tright);
339  // make blobs
340  outlines_to_blobs(block, bleft, tright, &outlines);
341 }
342 
343 
350 void outlines_to_blobs( // find blobs
351  BLOCK *block, // block to scan
352  ICOORD bleft,
353  ICOORD tright,
354  C_OUTLINE_LIST *outlines) {
355  // make buckets
356  OL_BUCKETS buckets(bleft, tright);
357 
358  fill_buckets(outlines, &buckets);
359  empty_buckets(block, &buckets);
360 }
361 
362 
369 void fill_buckets( // find blobs
370  C_OUTLINE_LIST *outlines, // outlines in block
371  OL_BUCKETS *buckets // output buckets
372  ) {
373  TBOX ol_box; // outline box
374  C_OUTLINE_IT out_it = outlines; // iterator
375  C_OUTLINE_IT bucket_it; // iterator in bucket
376  C_OUTLINE *outline; // current outline
377 
378  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
379  outline = out_it.extract(); // take off list
380  // get box
381  ol_box = outline->bounding_box();
382  bucket_it.set_to_list((*buckets) (ol_box.left(), ol_box.bottom()));
383  bucket_it.add_to_end(outline);
384  }
385 }
386 
387 
394 void empty_buckets( // find blobs
395  BLOCK *block, // block to scan
396  OL_BUCKETS *buckets // output buckets
397  ) {
398  bool good_blob; // healthy blob
399  C_OUTLINE_LIST outlines; // outlines in block
400  // iterator
401  C_OUTLINE_IT out_it = &outlines;
402  C_OUTLINE_IT bucket_it = buckets->start_scan();
403  C_OUTLINE_IT parent_it; // parent outline
404  C_BLOB_IT good_blobs = block->blob_list();
405  C_BLOB_IT junk_blobs = block->reject_blobs();
406 
407  while (!bucket_it.empty()) {
408  out_it.set_to_list(&outlines);
409  do {
410  parent_it = bucket_it; // find outermost
411  do {
412  bucket_it.forward();
413  } while (!bucket_it.at_first() &&
414  !(*parent_it.data() < *bucket_it.data()));
415  } while (!bucket_it.at_first());
416 
417  // move to new list
418  out_it.add_after_then_move(parent_it.extract());
419  good_blob = capture_children(buckets, &junk_blobs, &out_it);
420  C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,
421  &junk_blobs);
422 
423  bucket_it.set_to_list(buckets->scan_next());
424  }
425 }
426 
427 
436 bool capture_children( // find children
437  OL_BUCKETS* buckets, // bucket sort clanss
438  C_BLOB_IT* reject_it, // dead grandchildren
439  C_OUTLINE_IT* blob_it // output outlines
440 ) {
441  C_OUTLINE *outline; // master outline
442  int32_t child_count; // no of children
443 
444  outline = blob_it->data();
445  if (edges_use_new_outline_complexity)
446  child_count = buckets->outline_complexity(outline,
447  edges_children_count_limit,
448  0);
449  else
450  child_count = buckets->count_children(outline,
451  edges_children_count_limit);
452  if (child_count > edges_children_count_limit)
453  return false;
454 
455  if (child_count > 0)
456  buckets->extract_children(outline, blob_it);
457  return true;
458 }
int32_t area() const
Definition: rect.h:122
int16_t top() const
Definition: rect.h:58
bool capture_children(OL_BUCKETS *buckets, C_BLOB_IT *reject_it, C_OUTLINE_IT *blob_it)
Definition: edgblob.cpp:436
Definition: rect.h:34
void empty_buckets(BLOCK *block, OL_BUCKETS *buckets)
Definition: edgblob.cpp:394
#define INT_VAR(name, val, comment)
Definition: params.h:303
const TBOX & bounding_box() const
Definition: coutln.h:113
#define double_VAR(name, val, comment)
Definition: params.h:312
int32_t count_children(C_OUTLINE *outline, int32_t max_count)
Definition: edgblob.cpp:179
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:330
integer coordinate
Definition: points.h:31
static void ConstructBlobsFromOutlines(bool good_blob, C_OUTLINE_LIST *outline_list, C_BLOB_IT *good_blobs_it, C_BLOB_IT *bad_blobs_it)
Definition: stepblob.cpp:189
int16_t y() const
access_function
Definition: points.h:56
int32_t outline_complexity(C_OUTLINE *outline, int32_t max_count, int16_t depth)
Definition: edgblob.cpp:110
void block_edges(Pix *t_pix, PDBLK *block, C_OUTLINE_IT *outline_it)
Definition: scanedg.cpp:37
int16_t height() const
Definition: rect.h:108
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
C_BLOB_LIST * reject_blobs()
Definition: ocrblock.h:132
int32_t pathlength() const
Definition: coutln.h:135
int32_t outer_area() const
Definition: coutln.cpp:308
int16_t x() const
access function
Definition: points.h:52
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void fill_buckets(C_OUTLINE_LIST *outlines, OL_BUCKETS *buckets)
Definition: edgblob.cpp:369
void outlines_to_blobs(BLOCK *block, ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines)
Definition: edgblob.cpp:350
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:191
C_OUTLINE_LIST * start_scan()
Definition: edgblob.h:45
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
int16_t left() const
Definition: rect.h:72
#define BOOL_VAR(name, val, comment)
Definition: params.h:306
C_OUTLINE_LIST * scan_next()
Definition: edgblob.h:51
void extract_children(C_OUTLINE *outline, C_OUTLINE_IT *it)
Definition: edgblob.cpp:295
#define BUCKETSIZE
Definition: edgblob.h:30
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:129
C_OUTLINE_LIST * operator()(int16_t x, int16_t y)
Definition: edgblob.cpp:83
Definition: ocrblock.h:29
OL_BUCKETS(ICOORD bleft, ICOORD tright)
Definition: edgblob.cpp:64