tesseract 3.04.01

textord/colfind.h

Go to the documentation of this file.
00001 
00002 // File:        colfind.h
00003 // Description: Class to find columns in the grid of BLOBNBOXes.
00004 // Author:      Ray Smith
00005 // Created:     Thu Feb 21 14:04:01 PST 2008
00006 //
00007 // (C) Copyright 2008, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TESSERACT_TEXTORD_COLFIND_H__
00021 #define TESSERACT_TEXTORD_COLFIND_H__
00022 
00023 #include "tabfind.h"
00024 #include "imagefind.h"
00025 #include "colpartitiongrid.h"
00026 #include "colpartitionset.h"
00027 #include "ocrblock.h"
00028 #include "textlineprojection.h"
00029 
00030 class BLOCK_LIST;
00031 struct Boxa;
00032 struct Pixa;
00033 class DENORM;
00034 class ScrollView;
00035 class STATS;
00036 class TO_BLOCK;
00037 
00038 namespace tesseract {
00039 
00040 extern BOOL_VAR_H(textord_tabfind_find_tables, false, "run table detection");
00041 
00042 class ColPartitionSet;
00043 class ColPartitionSet_LIST;
00044 class ColSegment_LIST;
00045 class ColumnGroup_LIST;
00046 class LineSpacing;
00047 class StrokeWidth;
00048 class TempColumn_LIST;
00049 class EquationDetectBase;
00050 
00051 // The ColumnFinder class finds columns in the grid.
      // It derives from TabFind. Its public steps must be called in the fixed
      // order described in the calling-sequence comment below.
00052 class ColumnFinder : public TabFind {
00053  public:
00054   // Gridsize is an estimate of the text size in the image. A suitable value
00055   // is in TO_BLOCK::line_size after find_components has been used to make
00056   // the blobs.
00057   // bleft and tright are the bounds of the image (rectangle) being processed.
00058   // vlines is a (possibly empty) list of TabVector and vertical_x and y are
00059   // the sum logical vertical vector produced by LineFinder::FindVerticalLines.
00060   // If cjk_script is true, then broken CJK characters are fixed during
00061   // layout analysis to assist in detecting horizontal vs vertically written
00062   // textlines.
        // NOTE(review): resolution, aligned_gap_fraction and hlines are not
        // described above. aligned_gap_fraction is presumably the value saved in
        // tabfind_aligned_gap_fraction_ - confirm against colfind.cpp.
00063   ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright,
00064                int resolution, bool cjk_script, double aligned_gap_fraction,
00065                TabVector_LIST* vlines, TabVector_LIST* hlines,
00066                int vertical_x, int vertical_y);
00067   virtual ~ColumnFinder();
00068
00069   // Accessors for testing
00070   const DENORM* denorm() const {
00071     return denorm_;
00072   }
00073   const TextlineProjection* projection() const {
00074     return &projection_;
00075   }
00076   void set_cjk_script(bool is_cjk) {
00077     cjk_script_ = is_cjk;
00078   }
00079
00080   // ======================================================================
00081   // The main function of ColumnFinder is broken into pieces to facilitate
00082   // optional insertion of orientation and script detection in an efficient
00083   // way. The calling sequence IS MANDATORY however, whether or not
00084   // OSD is being used:
00085   // 1. Construction.
00086   // 2. SetupAndFilterNoise.
00087   // 3. IsVerticallyAlignedText.
00088   // 4. CorrectOrientation.
00089   // 5. FindBlocks.
00090   // 6. Destruction. Use of a single column finder for multiple images does not
00091   //    make sense.
00092   // Throughout these steps, the ColPartitions are owned by part_grid_, which
00093   // means that it must be kept correct. Exception: big_parts_ owns its
00094   // own ColPartitions.
00095   // The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except
00096   // for a phase in FindBlocks before TransformToBlocks, when they become
00097   // owned by the ColPartitions. The owner() ColPartition of a BLOBNBOX
00098   // indicates more of a betrothal for the majority of layout analysis, ie
00099   // which ColPartition will take ownership when the blobs are released from
00100   // the input TO_BLOCK. Exception: image_bblobs_ owns the fake blobs that
00101   // are part of the image regions, as they are not on any TO_BLOCK list.
00102   // TODO(rays) break up column finder further into smaller classes, as
00103   // there is a lot more to it than column finding now.
00104   // ======================================================================
00105
00106   // Performs initial processing on the blobs in the input_block:
00107   // Set up the part_grid_, stroke_width_, nontext_map_.
00108   // Obvious noise blobs are filtered out and used to mark the nontext_map_.
00109   // Initial stroke-width analysis is used to get local text alignment
00110   // direction, so the textline projection_ map can be set up.
00111   // On return, IsVerticallyAlignedText may be called (now optionally) to
00112   // determine the gross textline alignment of the page.
00113   void SetupAndFilterNoise(PageSegMode pageseg_mode, Pix* photo_mask_pix,
00114                            TO_BLOCK* input_block);
00115
00116   // Tests for vertical alignment of text (returning true if so), and generates
00117   // a list of blobs (in osd_blobs) for orientation and script detection.
00118   // block is the single block for the whole page or rectangle to be OCRed.
00119   // Note that the vertical alignment may be due to text whose writing direction
00120   // is vertical, like say Japanese, or due to text whose writing direction is
00121   // horizontal but whose text appears vertically aligned because the image is
00122   // not the right way up.
00123   // find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.
00124   bool IsVerticallyAlignedText(double find_vertical_text_ratio,
00125                                TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs);
00126
00127   // Rotates the blobs and the TabVectors so that the gross writing direction
00128   // (text lines) are horizontal and lines are read down the page.
00129   // Applied rotation stored in rotation_.
00130   // A second rotation is calculated for application during recognition to
00131   // make the rotated blobs upright for recognition.
00132   // Subsequent rotation stored in text_rotation_.
00133   //
00134   // Arguments:
00135   //   vertical_text_lines is true if the text lines are vertical.
00136   //   recognition_rotation [0..3] is the number of anti-clockwise 90 degree
00137   //   rotations from osd required for the text to be upright and readable.
00138   void CorrectOrientation(TO_BLOCK* block, bool vertical_text_lines,
00139                           int recognition_rotation);
00140
00141   // Finds blocks of text, image, rule line, table etc, returning them in the
00142   // blocks and to_blocks.
00143   // (Each TO_BLOCK points to the basic BLOCK and adds more information.)
00144   // Image blocks are generated by a combination of photo_mask_pix (which may
00145   // NOT be NULL) and the rejected text found during preliminary textline
00146   // finding.
00147   // The input_block is the result of a call to find_components, and contains
00148   // the blobs found in the image or rectangle to be OCRed. These blobs will be
00149   // removed and placed in the output blocks, while unused ones will be deleted.
00150   // If single_column is true, the input is treated as single column, but
00151   // it is still divided into blocks of equal line spacing/text size.
00152   // scaled_color is scaled down by scaled_factor from the input color image,
00153   // and may be NULL if the input was not color.
00154   // grey_pix is optional, but if present must match the photo_mask_pix in size,
00155   // and must be a *real* grey image instead of binary_pix * 255.
00156   // thresholds_pix is expected to be present iff grey_pix is present and
00157   // can be an integer factor reduction of the grey_pix. It represents the
00158   // thresholds that were used to create the binary_pix from the grey_pix.
00159   // Small blobs that confuse the segmentation into lines are placed into
00160   // diacritic_blobs, with the intention that they be put into the most
00161   // appropriate word after the rest of layout analysis.
00162   // Returns -1 if the user hits the 'd' key in the blocks window while running
00163   // in debug mode, which requests a retry with more debug info.
        // NOTE(review): "input_block" above presumably refers to the block
        // parameter, and "single_column" is not a parameter of this function.
        // The return value when no retry is requested is not stated here.
        // Confirm all three against colfind.cpp.
00164   int FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor,
00165                  TO_BLOCK* block, Pix* photo_mask_pix, Pix* thresholds_pix,
00166                  Pix* grey_pix, BLOCK_LIST* blocks,
00167                  BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks);
00168
00169   // Get the rotation required to deskew, and its inverse rotation.
00170   void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew);
00171
00172   // Set the equation detection pointer.
        // The detector is not owned by this class (see equation_detect_).
00173   void SetEquationDetect(EquationDetectBase* detect);
00174
00175  private:
00176   // Displays the blob and block bounding boxes in a window called Blocks.
00177   void DisplayBlocks(BLOCK_LIST* blocks);
00178   // Displays the column edges at each grid y coordinate defined by
00179   // best_columns_.
00180   void DisplayColumnBounds(PartSetVector* sets);
00181
00183
00184   // Sets up column_sets_ (the determined column layout at each horizontal
00185   // slice). Returns false if the page is empty.
00186   bool MakeColumns(bool single_column);
00187   // Attempt to improve the column_candidates by expanding the columns
00188   // and adding new partitions from the partition sets in src_sets.
00189   // Src_sets may be equal to column_candidates, in which case it will
00190   // use them as a source to improve themselves.
        // NOTE(review): "column_candidates" presumably means the column_sets
        // argument - confirm against colfind.cpp.
00191   void ImproveColumnCandidates(PartSetVector* src_sets,
00192                                PartSetVector* column_sets);
00193   // Prints debug information on the column candidates.
00194   void PrintColumnCandidates(const char* title);
00195   // Finds the optimal set of columns that cover the entire image with as
00196   // few changes in column partition as possible.
00197   // Returns true if any part of the page is multi-column.
00198   bool AssignColumns(const PartSetVector& part_sets);
00199   // Finds the biggest range in part_sets_ that has no assigned column, but
00200   // column assignment is possible.
00201   bool BiggestUnassignedRange(int set_count, const bool* any_columns_possible,
00202                               int* start, int* end);
00203   // Finds the modal compatible column_set_ index within the given range.
00204   int RangeModalColumnSet(int** column_set_costs, const int* assigned_costs,
00205                           int start, int end);
00206   // Given that there are many column_set_id compatible columns in the range,
00207   // shrinks the range to the longest contiguous run of compatibility, allowing
00208   // gaps where no columns are possible, but not where competing columns are
00209   // possible.
00210   void ShrinkRangeToLongestRun(int** column_set_costs,
00211                                const int* assigned_costs,
00212                                const bool* any_columns_possible,
00213                                int column_set_id,
00214                                int* best_start, int* best_end);
00215   // Moves start in the direction of step, up to, but not including end while
00216   // the only incompatible regions are no more than kMaxIncompatibleColumnCount
00217   // in size, and the compatible regions beyond are bigger.
00218   void ExtendRangePastSmallGaps(int** column_set_costs,
00219                                 const int* assigned_costs,
00220                                 const bool* any_columns_possible,
00221                                 int column_set_id,
00222                                 int step, int end, int* start);
00223   // Assigns the given column_set_id to the part_sets_ in the given range.
00224   void AssignColumnToRange(int column_set_id, int start, int end,
00225                            int** column_set_costs, int* assigned_costs);
00226
00227   // Computes the mean_column_gap_.
00228   void ComputeMeanColumnGap(bool any_multi_column);
00229
00232
00233   // Hoovers up all un-owned blobs and deletes them.
00234   // The rest get released from the block so the ColPartitions can pass
00235   // ownership to the output blocks.
00236   void ReleaseBlobsAndCleanupUnused(TO_BLOCK* block);
00237   // Splits partitions that cross columns where they have nothing in the gap.
00238   void GridSplitPartitions();
00239   // Merges partitions where there is vertical overlap, within a single column,
00240   // and the horizontal gap is small enough.
00241   void GridMergePartitions();
00242   // Inserts remaining noise blobs into the most applicable partition if any.
00243   // If there is no applicable partition, then the blobs are deleted.
00244   void InsertRemainingNoise(TO_BLOCK* block);
00245   // Remove partitions that come from horizontal lines that look like
00246   // underlines, but are not part of a table.
00247   void GridRemoveUnderlinePartitions();
00248   // Add horizontal line separators as partitions.
00249   void GridInsertHLinePartitions();
00250   // Add vertical line separators as partitions.
00251   void GridInsertVLinePartitions();
00252   // For every ColPartition in the grid, sets its type based on position
00253   // in the columns.
00254   void SetPartitionTypes();
00255   // Only images remain with multiple types in a run of partners.
00256   // Sets the type of all in the group to the maximum of the group.
00257   void SmoothPartnerRuns();
00258
00260
00261   // Helper functions for TransformToBlocks.
00262   // Add the part to the temp list in the correct order.
00263   void AddToTempPartList(ColPartition* part, ColPartition_CLIST* temp_list);
00264   // Add everything from the temp list to the work_set assuming correct order.
00265   void EmptyTempPartList(ColPartition_CLIST* temp_list,
00266                          WorkingPartSet_LIST* work_set);
00267
00268   // Transform the grid of partitions to the output blocks.
00269   void TransformToBlocks(BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
00270
00271   // Reflect the blob boxes (but not the outlines) in the y-axis so that
00272   // the blocks get created in the correct RTL order. Rotates the blobs
00273   // in the input_block and the bblobs list.
00274   // The reflection is undone in RotateAndReskewBlocks by
00275   // reflecting the blocks themselves, and then recomputing the blob bounding
00276   // boxes.
00277   void ReflectForRtl(TO_BLOCK* input_block, BLOBNBOX_LIST* bblobs);
00278
00279   // Undo the deskew that was done in FindTabVectors, as recognition is done
00280   // without correcting blobs or blob outlines for skew.
00281   // Reskew the completed blocks to put them back to the original rotated coords
00282   // that were created by CorrectOrientation.
00283   // If the input_is_rtl, then reflect the blocks in the y-axis to undo the
00284   // reflection that was done before FindTabVectors.
00285   // Blocks that were identified as vertical text (relative to the rotated
00286   // coordinates) are further rotated so the text lines are horizontal.
00287   // blob polygonal outlines are rotated to match the position of the blocks
00288   // that they are in, and their bounding boxes are recalculated to be accurate.
00289   // Record appropriate inverse transformations and required
00290   // classifier transformation in the blocks.
00291   void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST* to_blocks);
00292
00293   // Computes the rotations for the block (to make textlines horizontal) and
00294   // for the blobs (for classification) and sets the appropriate members
00295   // of the given block.
00296   // Returns the rotation that needs to be applied to the blobs to make
00297   // them sit in the rotated block.
00298   FCOORD ComputeBlockAndClassifyRotation(BLOCK* block);
00299
00300   // If true then the page language is cjk, so it is safe to perform
00301   // FixBrokenCJK.
00302   bool cjk_script_;
00303   // The minimum gutter width to apply for finding columns.
00304   // Modified when vertical text is detected to prevent detection of
00305   // vertical text lines as columns.
00306   int min_gutter_width_;
00307   // The mean gap between columns over the page.
00308   int mean_column_gap_;
00309   // Config param saved at construction time. Modifies min_gutter_width_ with
00310   // vertical text to prevent detection of vertical text as columns.
00311   double tabfind_aligned_gap_fraction_;
00312   // The rotation vector needed to convert original coords to deskewed.
00313   FCOORD deskew_;
00314   // The rotation vector needed to convert deskewed back to original coords.
00315   FCOORD reskew_;
00316   // The rotation vector used to rotate vertically oriented pages.
00317   FCOORD rotation_;
00318   // The rotation vector needed to convert the rotated back to original coords.
00319   FCOORD rerotate_;
00320   // The additional rotation vector needed to rotate text for recognition.
00321   FCOORD text_rotation_;
00322   // The column_sets_ contain the ordered candidate ColPartitionSets that
00323   // define the possible divisions of the page into columns.
00324   PartSetVector column_sets_;
00325   // A simple array of pointers to the best assigned column division at
00326   // each grid y coordinate.
00327   ColPartitionSet** best_columns_;
00328   // The grid used for creating initial partitions with strokewidth.
00329   StrokeWidth* stroke_width_;
00330   // The grid used to hold ColPartitions after the columns have been determined.
00331   ColPartitionGrid part_grid_;
00332   // List of ColPartitions that are no longer needed after they have been
00333   // turned into regions, but are kept around because they are referenced
00334   // by the part_grid_.
00335   ColPartition_LIST good_parts_;
00336   // List of ColPartitions that are big and might be dropcap or vertically
00337   // joined.
00338   ColPartition_LIST big_parts_;
00339   // List of ColPartitions that have been declared noise.
00340   ColPartition_LIST noise_parts_;
00341   // The fake blobs that are made from the images.
00342   BLOBNBOX_LIST image_bblobs_;
00343   // Horizontal line separators.
00344   TabVector_LIST horizontal_lines_;
00345   // Image map of photo/noise areas on the page.
00346   Pix* nontext_map_;
00347   // Textline projection map.
00348   TextlineProjection projection_;
00349   // Sequence of DENORMS that indicate how to get back to the original image
00350   // coordinate space. The destructor must delete all the DENORMs in the chain.
00351   DENORM* denorm_;
00352
00353   // Various debug windows that automatically go away on completion.
00354   ScrollView* input_blobs_win_;
00355
00356   // The equation region detector pointer. Note: This pointer is passed in by
00357   // member function SetEquationDetect, and this class does NOT own it, so
00358   // it must be released elsewhere.
00359   EquationDetectBase* equation_detect_;
00360
00361   // Allow a subsequent instance to reuse the blocks window.
00362   // Not thread-safe, but multiple threads shouldn't be using windows anyway.
00363   static ScrollView* blocks_win_;
00364 };
00365 
00366 }  // namespace tesseract.
00367 
00368 #endif  // TESSERACT_TEXTORD_COLFIND_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines