tesseract  4.1.3
pageiterator.h
Go to the documentation of this file.
1 // File: pageiterator.h
3 // Description: Iterator for tesseract page structure that avoids using
4 // tesseract internal data structures.
5 // Author: Ray Smith
6 // Created: Fri Feb 26 11:01:06 PST 2010
7 //
8 // (C) Copyright 2010, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
22 #define TESSERACT_CCMAIN_PAGEITERATOR_H_
23 
24 #include "platform.h"
25 #include "publictypes.h"
26 
27 struct BlamerBundle;
28 class C_BLOB_IT;
29 class PAGE_RES;
30 class PAGE_RES_IT;
31 class WERD;
32 struct Pix;
33 struct Pta;
34 
35 namespace tesseract {
36 
37 class Tesseract;
38 
53  public:
69  int scale, int scaled_yres,
70  int rect_left, int rect_top,
71  int rect_width, int rect_height);
72  virtual ~PageIterator();
73 
80  PageIterator(const PageIterator& src);
81  const PageIterator& operator=(const PageIterator& src);
82 
84  bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
85 
86  // ============= Moving around within the page ============.
87 
92  virtual void Begin();
93 
99  virtual void RestartParagraph();
100 
105  bool IsWithinFirstTextlineOfParagraph() const;
106 
112  virtual void RestartRow();
113 
125  virtual bool Next(PageIteratorLevel level);
126 
140  virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
141 
158  virtual bool IsAtFinalElement(PageIteratorLevel level,
159  PageIteratorLevel element) const;
160 
167  int Cmp(const PageIterator &other) const;
168 
169  // ============= Accessing data ==============.
170  // Coordinate system:
171  // Integer coordinates are at the cracks between the pixels.
172  // The top-left corner of the top-left pixel in the image is at (0,0).
173  // The bottom-right corner of the bottom-right pixel in the image is at
174  // (width, height).
175  // Every bounding box goes from the top-left of the top-left contained
176  // pixel to the bottom-right of the bottom-right contained pixel, so
177  // the bounding box of the single top-left pixel in the image is:
178  // (0,0)->(1,1).
179  // If an image rectangle has been set in the API, then returned coordinates
180  // relate to the original (full) image, rather than the rectangle.
181 
// Records the caller's two flags by copying them into the
// include_upper_dots_ and include_lower_dots_ members. Nothing else is
// touched here and no value is returned.
// NOTE(review): presumably these flags control whether dots above/below the
// text are included by the BoundingBox methods declared below -- confirm
// against the implementation before relying on it.
191  void SetBoundingBoxComponents(bool include_upper_dots,
192  bool include_lower_dots) {
193  include_upper_dots_ = include_upper_dots;
194  include_lower_dots_ = include_lower_dots;
195  }
196 
206  bool BoundingBox(PageIteratorLevel level,
207  int* left, int* top, int* right, int* bottom) const;
208  bool BoundingBox(PageIteratorLevel level, int padding,
209  int* left, int* top, int* right, int* bottom) const;
215  bool BoundingBoxInternal(PageIteratorLevel level,
216  int* left, int* top, int* right, int* bottom) const;
217 
219  bool Empty(PageIteratorLevel level) const;
220 
225  PolyBlockType BlockType() const;
226 
234  Pta* BlockPolygon() const;
235 
242  Pix* GetBinaryImage(PageIteratorLevel level) const;
243 
255  Pix* GetImage(PageIteratorLevel level, int padding, Pix* original_img,
256  int* left, int* top) const;
257 
264  bool Baseline(PageIteratorLevel level,
265  int* x1, int* y1, int* x2, int* y2) const;
266 
267  // Reports the attributes of the current row through the three pointer arguments.
268  void RowAttributes(float* row_height, float* descenders,
269  float* ascenders) const;
270 
279  void Orientation(tesseract::Orientation *orientation,
280  tesseract::WritingDirection *writing_direction,
281  tesseract::TextlineOrder *textline_order,
282  float *deskew_angle) const;
283 
312  void ParagraphInfo(tesseract::ParagraphJustification *justification,
313  bool *is_list_item,
314  bool *is_crown,
315  int *first_line_indent) const;
316 
317  // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
318  // of the current word to the given pointer (takes ownership of the pointer)
319  // and returns true.
320  // Can only be used when iterating on the word level.
321  bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
322 
323  protected:
328  TESS_LOCAL void BeginWord(int offset);
329 
353  C_BLOB_IT* cblob_it_;
358  int scale_;
364 };
365 
366 } // namespace tesseract.
367 
368 #endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
PolyBlockType
Definition: publictypes.h:53
ParagraphJustification
Definition: publictypes.h:251
#define TESS_API
Definition: platform.h:54
#define TESS_LOCAL
Definition: platform.h:55
void SetBoundingBoxComponents(bool include_upper_dots, bool include_lower_dots)
Definition: pageiterator.h:191
WERD
Definition: werd.h:56