// tesseract 3.04.01
00001 00002 // File: colfind.h 00003 // Description: Class to find columns in the grid of BLOBNBOXes. 00004 // Author: Ray Smith 00005 // Created: Thu Feb 21 14:04:01 PST 2008 00006 // 00007 // (C) Copyright 2008, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TESSERACT_TEXTORD_COLFIND_H__ 00021 #define TESSERACT_TEXTORD_COLFIND_H__ 00022 00023 #include "tabfind.h" 00024 #include "imagefind.h" 00025 #include "colpartitiongrid.h" 00026 #include "colpartitionset.h" 00027 #include "ocrblock.h" 00028 #include "textlineprojection.h" 00029 00030 class BLOCK_LIST; 00031 struct Boxa; 00032 struct Pixa; 00033 class DENORM; 00034 class ScrollView; 00035 class STATS; 00036 class TO_BLOCK; 00037 00038 namespace tesseract { 00039 00040 extern BOOL_VAR_H(textord_tabfind_find_tables, false, "run table detection"); 00041 00042 class ColPartitionSet; 00043 class ColPartitionSet_LIST; 00044 class ColSegment_LIST; 00045 class ColumnGroup_LIST; 00046 class LineSpacing; 00047 class StrokeWidth; 00048 class TempColumn_LIST; 00049 class EquationDetectBase; 00050 00051 // The ColumnFinder class finds columns in the grid. 00052 class ColumnFinder : public TabFind { 00053 public: 00054 // Gridsize is an estimate of the text size in the image. A suitable value 00055 // is in TO_BLOCK::line_size after find_components has been used to make 00056 // the blobs. 
00057 // bleft and tright are the bounds of the image (rectangle) being processed. 00058 // vlines is a (possibly empty) list of TabVector and vertical_x and y are 00059 // the sum logical vertical vector produced by LineFinder::FindVerticalLines. 00060 // If cjk_script is true, then broken CJK characters are fixed during 00061 // layout analysis to assist in detecting horizontal vs vertically written 00062 // textlines. 00063 ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright, 00064 int resolution, bool cjk_script, double aligned_gap_fraction, 00065 TabVector_LIST* vlines, TabVector_LIST* hlines, 00066 int vertical_x, int vertical_y); 00067 virtual ~ColumnFinder(); 00068 00069 // Accessors for testing 00070 const DENORM* denorm() const { 00071 return denorm_; 00072 } 00073 const TextlineProjection* projection() const { 00074 return &projection_; 00075 } 00076 void set_cjk_script(bool is_cjk) { 00077 cjk_script_ = is_cjk; 00078 } 00079 00080 // ====================================================================== 00081 // The main function of ColumnFinder is broken into pieces to facilitate 00082 // optional insertion of orientation and script detection in an efficient 00083 // way. The calling sequence IS MANDATORY however, whether or not 00084 // OSD is being used: 00085 // 1. Construction. 00086 // 2. SetupAndFilterNoise. 00087 // 3. IsVerticallyAlignedText. 00088 // 4. CorrectOrientation. 00089 // 5. FindBlocks. 00090 // 6. Destruction. Use of a single column finder for multiple images does not 00091 // make sense. 00092 // Throughout these steps, the ColPartitions are owned by part_grid_, which 00093 // means that that it must be kept correct. Exception: big_parts_ owns its 00094 // own ColPartitions. 00095 // The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except 00096 // for a phase in FindBlocks before TransformToBlocks, when they become 00097 // owned by the ColPartitions. 
The owner() ColPartition of a BLOBNBOX 00098 // indicates more of a betrothal for the majority of layout analysis, ie 00099 // which ColPartition will take ownership when the blobs are release from 00100 // the input TO_BLOCK. Exception: image_bblobs_ owns the fake blobs that 00101 // are part of the image regions, as they are not on any TO_BLOCK list. 00102 // TODO(rays) break up column finder further into smaller classes, as 00103 // there is a lot more to it than column finding now. 00104 // ====================================================================== 00105 00106 // Performs initial processing on the blobs in the input_block: 00107 // Setup the part_grid, stroke_width_, nontext_map_. 00108 // Obvious noise blobs are filtered out and used to mark the nontext_map_. 00109 // Initial stroke-width analysis is used to get local text alignment 00110 // direction, so the textline projection_ map can be setup. 00111 // On return, IsVerticallyAlignedText may be called (now optionally) to 00112 // determine the gross textline alignment of the page. 00113 void SetupAndFilterNoise(PageSegMode pageseg_mode, Pix* photo_mask_pix, 00114 TO_BLOCK* input_block); 00115 00116 // Tests for vertical alignment of text (returning true if so), and generates 00117 // a list of blobs (in osd_blobs) for orientation and script detection. 00118 // block is the single block for the whole page or rectangle to be OCRed. 00119 // Note that the vertical alignment may be due to text whose writing direction 00120 // is vertical, like say Japanese, or due to text whose writing direction is 00121 // horizontal but whose text appears vertically aligned because the image is 00122 // not the right way up. 00123 // find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio. 
00124 bool IsVerticallyAlignedText(double find_vertical_text_ratio, 00125 TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs); 00126 00127 // Rotates the blobs and the TabVectors so that the gross writing direction 00128 // (text lines) are horizontal and lines are read down the page. 00129 // Applied rotation stored in rotation_. 00130 // A second rotation is calculated for application during recognition to 00131 // make the rotated blobs upright for recognition. 00132 // Subsequent rotation stored in text_rotation_. 00133 // 00134 // Arguments: 00135 // vertical_text_lines is true if the text lines are vertical. 00136 // recognition_rotation [0..3] is the number of anti-clockwise 90 degree 00137 // rotations from osd required for the text to be upright and readable. 00138 void CorrectOrientation(TO_BLOCK* block, bool vertical_text_lines, 00139 int recognition_rotation); 00140 00141 // Finds blocks of text, image, rule line, table etc, returning them in the 00142 // blocks and to_blocks 00143 // (Each TO_BLOCK points to the basic BLOCK and adds more information.) 00144 // Image blocks are generated by a combination of photo_mask_pix (which may 00145 // NOT be NULL) and the rejected text found during preliminary textline 00146 // finding. 00147 // The input_block is the result of a call to find_components, and contains 00148 // the blobs found in the image or rectangle to be OCRed. These blobs will be 00149 // removed and placed in the output blocks, while unused ones will be deleted. 00150 // If single_column is true, the input is treated as single column, but 00151 // it is still divided into blocks of equal line spacing/text size. 00152 // scaled_color is scaled down by scaled_factor from the input color image, 00153 // and may be NULL if the input was not color. 00154 // grey_pix is optional, but if present must match the photo_mask_pix in size, 00155 // and must be a *real* grey image instead of binary_pix * 255. 
00156 // thresholds_pix is expected to be present iff grey_pix is present and 00157 // can be an integer factor reduction of the grey_pix. It represents the 00158 // thresholds that were used to create the binary_pix from the grey_pix. 00159 // Small blobs that confuse the segmentation into lines are placed into 00160 // diacritic_blobs, with the intention that they be put into the most 00161 // appropriate word after the rest of layout analysis. 00162 // Returns -1 if the user hits the 'd' key in the blocks window while running 00163 // in debug mode, which requests a retry with more debug info. 00164 int FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor, 00165 TO_BLOCK* block, Pix* photo_mask_pix, Pix* thresholds_pix, 00166 Pix* grey_pix, BLOCK_LIST* blocks, 00167 BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks); 00168 00169 // Get the rotation required to deskew, and its inverse rotation. 00170 void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew); 00171 00172 // Set the equation detection pointer. 00173 void SetEquationDetect(EquationDetectBase* detect); 00174 00175 private: 00176 // Displays the blob and block bounding boxes in a window called Blocks. 00177 void DisplayBlocks(BLOCK_LIST* blocks); 00178 // Displays the column edges at each grid y coordinate defined by 00179 // best_columns_. 00180 void DisplayColumnBounds(PartSetVector* sets); 00181 00183 00184 // Sets up column_sets_ (the determined column layout at each horizontal 00185 // slice). Returns false if the page is empty. 00186 bool MakeColumns(bool single_column); 00187 // Attempt to improve the column_candidates by expanding the columns 00188 // and adding new partitions from the partition sets in src_sets. 00189 // Src_sets may be equal to column_candidates, in which case it will 00190 // use them as a source to improve themselves. 
00191 void ImproveColumnCandidates(PartSetVector* src_sets, 00192 PartSetVector* column_sets); 00193 // Prints debug information on the column candidates. 00194 void PrintColumnCandidates(const char* title); 00195 // Finds the optimal set of columns that cover the entire image with as 00196 // few changes in column partition as possible. 00197 // Returns true if any part of the page is multi-column. 00198 bool AssignColumns(const PartSetVector& part_sets); 00199 // Finds the biggest range in part_sets_ that has no assigned column, but 00200 // column assignment is possible. 00201 bool BiggestUnassignedRange(int set_count, const bool* any_columns_possible, 00202 int* start, int* end); 00203 // Finds the modal compatible column_set_ index within the given range. 00204 int RangeModalColumnSet(int** column_set_costs, const int* assigned_costs, 00205 int start, int end); 00206 // Given that there are many column_set_id compatible columns in the range, 00207 // shrinks the range to the longest contiguous run of compatibility, allowing 00208 // gaps where no columns are possible, but not where competing columns are 00209 // possible. 00210 void ShrinkRangeToLongestRun(int** column_set_costs, 00211 const int* assigned_costs, 00212 const bool* any_columns_possible, 00213 int column_set_id, 00214 int* best_start, int* best_end); 00215 // Moves start in the direction of step, up to, but not including end while 00216 // the only incompatible regions are no more than kMaxIncompatibleColumnCount 00217 // in size, and the compatible regions beyond are bigger. 00218 void ExtendRangePastSmallGaps(int** column_set_costs, 00219 const int* assigned_costs, 00220 const bool* any_columns_possible, 00221 int column_set_id, 00222 int step, int end, int* start); 00223 // Assigns the given column_set_id to the part_sets_ in the given range. 
00224 void AssignColumnToRange(int column_set_id, int start, int end, 00225 int** column_set_costs, int* assigned_costs); 00226 00227 // Computes the mean_column_gap_. 00228 void ComputeMeanColumnGap(bool any_multi_column); 00229 00232 00233 // Hoovers up all un-owned blobs and deletes them. 00234 // The rest get released from the block so the ColPartitions can pass 00235 // ownership to the output blocks. 00236 void ReleaseBlobsAndCleanupUnused(TO_BLOCK* block); 00237 // Splits partitions that cross columns where they have nothing in the gap. 00238 void GridSplitPartitions(); 00239 // Merges partitions where there is vertical overlap, within a single column, 00240 // and the horizontal gap is small enough. 00241 void GridMergePartitions(); 00242 // Inserts remaining noise blobs into the most applicable partition if any. 00243 // If there is no applicable partition, then the blobs are deleted. 00244 void InsertRemainingNoise(TO_BLOCK* block); 00245 // Remove partitions that come from horizontal lines that look like 00246 // underlines, but are not part of a table. 00247 void GridRemoveUnderlinePartitions(); 00248 // Add horizontal line separators as partitions. 00249 void GridInsertHLinePartitions(); 00250 // Add vertical line separators as partitions. 00251 void GridInsertVLinePartitions(); 00252 // For every ColPartition in the grid, sets its type based on position 00253 // in the columns. 00254 void SetPartitionTypes(); 00255 // Only images remain with multiple types in a run of partners. 00256 // Sets the type of all in the group to the maximum of the group. 00257 void SmoothPartnerRuns(); 00258 00260 00261 // Helper functions for TransformToBlocks. 00262 // Add the part to the temp list in the correct order. 00263 void AddToTempPartList(ColPartition* part, ColPartition_CLIST* temp_list); 00264 // Add everything from the temp list to the work_set assuming correct order. 
00265 void EmptyTempPartList(ColPartition_CLIST* temp_list, 00266 WorkingPartSet_LIST* work_set); 00267 00268 // Transform the grid of partitions to the output blocks. 00269 void TransformToBlocks(BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); 00270 00271 // Reflect the blob boxes (but not the outlines) in the y-axis so that 00272 // the blocks get created in the correct RTL order. Rotates the blobs 00273 // in the input_block and the bblobs list. 00274 // The reflection is undone in RotateAndReskewBlocks by 00275 // reflecting the blocks themselves, and then recomputing the blob bounding 00276 // boxes. 00277 void ReflectForRtl(TO_BLOCK* input_block, BLOBNBOX_LIST* bblobs); 00278 00279 // Undo the deskew that was done in FindTabVectors, as recognition is done 00280 // without correcting blobs or blob outlines for skew. 00281 // Reskew the completed blocks to put them back to the original rotated coords 00282 // that were created by CorrectOrientation. 00283 // If the input_is_rtl, then reflect the blocks in the y-axis to undo the 00284 // reflection that was done before FindTabVectors. 00285 // Blocks that were identified as vertical text (relative to the rotated 00286 // coordinates) are further rotated so the text lines are horizontal. 00287 // blob polygonal outlines are rotated to match the position of the blocks 00288 // that they are in, and their bounding boxes are recalculated to be accurate. 00289 // Record appropriate inverse transformations and required 00290 // classifier transformation in the blocks. 00291 void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST* to_blocks); 00292 00293 // Computes the rotations for the block (to make textlines horizontal) and 00294 // for the blobs (for classification) and sets the appropriate members 00295 // of the given block. 00296 // Returns the rotation that needs to be applied to the blobs to make 00297 // them sit in the rotated block. 
00298 FCOORD ComputeBlockAndClassifyRotation(BLOCK* block); 00299 00300 // If true then the page language is cjk, so it is safe to perform 00301 // FixBrokenCJK. 00302 bool cjk_script_; 00303 // The minimum gutter width to apply for finding columns. 00304 // Modified when vertical text is detected to prevent detection of 00305 // vertical text lines as columns. 00306 int min_gutter_width_; 00307 // The mean gap between columns over the page. 00308 int mean_column_gap_; 00309 // Config param saved at construction time. Modifies min_gutter_width_ with 00310 // vertical text to prevent detection of vertical text as columns. 00311 double tabfind_aligned_gap_fraction_; 00312 // The rotation vector needed to convert original coords to deskewed. 00313 FCOORD deskew_; 00314 // The rotation vector needed to convert deskewed back to original coords. 00315 FCOORD reskew_; 00316 // The rotation vector used to rotate vertically oriented pages. 00317 FCOORD rotation_; 00318 // The rotation vector needed to convert the rotated back to original coords. 00319 FCOORD rerotate_; 00320 // The additional rotation vector needed to rotate text for recognition. 00321 FCOORD text_rotation_; 00322 // The column_sets_ contain the ordered candidate ColPartitionSets that 00323 // define the possible divisions of the page into columns. 00324 PartSetVector column_sets_; 00325 // A simple array of pointers to the best assigned column division at 00326 // each grid y coordinate. 00327 ColPartitionSet** best_columns_; 00328 // The grid used for creating initial partitions with strokewidth. 00329 StrokeWidth* stroke_width_; 00330 // The grid used to hold ColPartitions after the columns have been determined. 00331 ColPartitionGrid part_grid_; 00332 // List of ColPartitions that are no longer needed after they have been 00333 // turned into regions, but are kept around because they are referenced 00334 // by the part_grid_. 
00335 ColPartition_LIST good_parts_; 00336 // List of ColPartitions that are big and might be dropcap or vertically 00337 // joined. 00338 ColPartition_LIST big_parts_; 00339 // List of ColPartitions that have been declared noise. 00340 ColPartition_LIST noise_parts_; 00341 // The fake blobs that are made from the images. 00342 BLOBNBOX_LIST image_bblobs_; 00343 // Horizontal line separators. 00344 TabVector_LIST horizontal_lines_; 00345 // Image map of photo/noise areas on the page. 00346 Pix* nontext_map_; 00347 // Textline projection map. 00348 TextlineProjection projection_; 00349 // Sequence of DENORMS that indicate how to get back to the original image 00350 // coordinate space. The destructor must delete all the DENORMs in the chain. 00351 DENORM* denorm_; 00352 00353 // Various debug windows that automatically go away on completion. 00354 ScrollView* input_blobs_win_; 00355 00356 // The equation region detector pointer. Note: This pointer is passed in by 00357 // member function SetEquationDetect, and releasing it is NOT owned by this 00358 // class. 00359 EquationDetectBase* equation_detect_; 00360 00361 // Allow a subsequent instance to reuse the blocks window. 00362 // Not thread-safe, but multiple threads shouldn't be using windows anyway. 00363 static ScrollView* blocks_win_; 00364 }; 00365 00366 } // namespace tesseract. 00367 00368 #endif // TESSERACT_TEXTORD_COLFIND_H__