tesseract 3.04.01

textord/strokewidth.h

Go to the documentation of this file.
00001 
00002 // File:        strokewidth.h
00003 // Description: Subclass of BBGrid to find uniformity of strokewidth.
00004 // Author:      Ray Smith
00005 // Created:     Mon Mar 31 16:17:01 PST 2008
00006 //
00007 // (C) Copyright 2008, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TESSERACT_TEXTORD_STROKEWIDTH_H__
00021 #define TESSERACT_TEXTORD_STROKEWIDTH_H__
00022 
00023 #include "blobbox.h"        // BlobNeighbourDir.
00024 #include "blobgrid.h"         // Base class.
00025 #include "colpartitiongrid.h"
00026 #include "textlineprojection.h"
00027 
// Forward declarations of global-namespace types that this header only uses
// through pointers, so their full definitions are not needed here.
class DENORM;
class ScrollView;
class TO_BLOCK;
00031 
00032 namespace tesseract {
00033 
// Forward declarations of types that this header only uses through pointers.
class ColPartition_LIST;
class TabFind;
class TextlineProjection;
00037 
// Misc enums to clarify bool arguments for direction-controlling args.
// LeftOrRight names a side explicitly instead of passing a bare bool.
enum LeftOrRight {
  LR_LEFT,   // The left side.
  LR_RIGHT   // The right side.
};
00043 
// Return value from FindInitialPartitions indicates detection of severe
// skew or noise.
enum PartitionFindResult {
  PFR_OK,    // Everything is OK.
  PFR_SKEW,  // Skew was detected and rotated.
  PFR_NOISE  // Noise was detected and removed.
};
00051 
00057 class StrokeWidth : public BlobGrid {
00058  public:
00059   StrokeWidth(int gridsize, const ICOORD& bleft, const ICOORD& tright);
00060   virtual ~StrokeWidth();
00061 
00062   // Sets the neighbours member of the medium-sized blobs in the block.
00063   // Searches on 4 sides of each blob for similar-sized, similar-strokewidth
00064   // blobs and sets pointers to the good neighbours.
00065   void SetNeighboursOnMediumBlobs(TO_BLOCK* block);
00066 
00067   // Sets the neighbour/textline writing direction members of the medium
00068   // and large blobs with optional repair of broken CJK characters first.
00069   // Repair of broken CJK is needed here because broken CJK characters
00070   // can fool the textline direction detection algorithm.
00071   void FindTextlineDirectionAndFixBrokenCJK(PageSegMode pageseg_mode,
00072                                             bool cjk_merge,
00073                                             TO_BLOCK* input_block);
00074 
00075   // To save computation, the process of generating partitions is broken
00076   // into the following 4 steps:
00077   // TestVerticalTextDirection
00078   // CorrectForRotation (used only if a rotation is to be applied)
00079   // FindLeaderPartitions
00080   // GradeBlobsIntoPartitions.
00081   // These functions are all required, in sequence, except for
00082   // CorrectForRotation, which is not needed if no rotation is applied.
00083 
00084   // Types all the blobs as vertical or horizontal text or unknown and
00085   // returns true if the majority are vertical.
00086   // If the blobs are rotated, it is necessary to call CorrectForRotation
00087   // after rotating everything, otherwise the work done here will be enough.
00088   // If osd_blobs is not null, a list of blobs from the dominant textline
00089   // direction are returned for use in orientation and script detection.
00090   // find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.
00091   bool TestVerticalTextDirection(double find_vertical_text_ratio,
00092                                  TO_BLOCK* block,
00093                                  BLOBNBOX_CLIST* osd_blobs);
00094 
00095   // Corrects the data structures for the given rotation.
00096   void CorrectForRotation(const FCOORD& rerotation,
00097                           ColPartitionGrid* part_grid);
00098 
00099   // Finds leader partitions and inserts them into the give grid.
00100   void FindLeaderPartitions(TO_BLOCK* block,
00101                             ColPartitionGrid* part_grid);
00102 
00103   // Finds and marks noise those blobs that look like bits of vertical lines
00104   // that would otherwise screw up layout analysis.
00105   void RemoveLineResidue(ColPartition_LIST* big_part_list);
00106 
00107   // Types all the blobs as vertical text or horizontal text or unknown and
00108   // puts them into initial ColPartitions in the supplied part_grid.
00109   // rerotation determines how to get back to the image coordinates from the
00110   // blob coordinates (since they may have been rotated for vertical text).
00111   // block is the single block for the whole page or rectangle to be OCRed.
00112   // nontext_pix (full-size), is a binary mask used to prevent merges across
00113   // photo/text boundaries. It is not kept beyond this function.
00114   // denorm provides a mapping back to the image from the current blob
00115   // coordinate space.
00116   // projection provides a measure of textline density over the image and
00117   // provides functions to assist with diacritic detection. It should be a
00118   // pointer to a new TextlineProjection, and will be setup here.
00119   // part_grid is the output grid of textline partitions.
00120   // Large blobs that cause overlap are put in separate partitions and added
00121   // to the big_parts list.
00122   void GradeBlobsIntoPartitions(PageSegMode pageseg_mode,
00123                                 const FCOORD& rerotation, TO_BLOCK* block,
00124                                 Pix* nontext_pix, const DENORM* denorm,
00125                                 bool cjk_script, TextlineProjection* projection,
00126                                 BLOBNBOX_LIST* diacritic_blobs,
00127                                 ColPartitionGrid* part_grid,
00128                                 ColPartition_LIST* big_parts);
00129 
00130   // Handles a click event in a display window.
00131   virtual void HandleClick(int x, int y);
00132 
00133  private:
00134   // Computes the noise_density_ by summing the number of elements in a
00135   // neighbourhood of each grid cell.
00136   void ComputeNoiseDensity(TO_BLOCK* block, TabFind* line_grid);
00137 
00138   // Detects and marks leader dots/dashes.
00139   //    Leaders are horizontal chains of small or noise blobs that look
00140   //    monospace according to ColPartition::MarkAsLeaderIfMonospaced().
00141   // Detected leaders become the only occupants of the block->small_blobs list.
00142   // Non-leader small blobs get moved to the blobs list.
00143   // Non-leader noise blobs remain singletons in the noise list.
00144   // All small and noise blobs in high density regions are marked BTFT_NONTEXT.
00145   // block is the single block for the whole page or rectangle to be OCRed.
00146   // leader_parts is the output.
00147   void FindLeadersAndMarkNoise(TO_BLOCK* block,
00148                                ColPartition_LIST* leader_parts);
00149 
00152   void InsertBlobs(TO_BLOCK* block);
00153 
00154   // Fix broken CJK characters, using the fake joined blobs mechanism.
00155   // Blobs are really merged, ie the master takes all the outlines and the
00156   // others are deleted.
00157   // Returns true if sufficient blobs are merged that it may be worth running
00158   // again, due to a better estimate of character size.
00159   bool FixBrokenCJK(TO_BLOCK* block);
00160 
00161   // Collect blobs that overlap or are within max_dist of the input bbox.
00162   // Return them in the list of blobs and expand the bbox to be the union
00163   // of all the boxes. not_this is excluded from the search, as are blobs
00164   // that cause the merged box to exceed max_size in either dimension.
00165   void AccumulateOverlaps(const BLOBNBOX* not_this, bool debug,
00166                           int max_size, int max_dist,
00167                           TBOX* bbox, BLOBNBOX_CLIST* blobs);
00168 
00169   // For each blob in this grid, Finds the textline direction to be horizontal
00170   // or vertical according to distance to neighbours and 1st and 2nd order
00171   // neighbours. Non-text tends to end up without a definite direction.
00172   // Result is setting of the neighbours and vert_possible/horz_possible
00173   // flags in the BLOBNBOXes currently in this grid.
00174   // This function is called more than once if page orientation is uncertain,
00175   // so display_if_debugging is true on the final call to display the results.
00176   void FindTextlineFlowDirection(PageSegMode pageseg_mode,
00177                                  bool display_if_debugging);
00178 
00179   // Sets the neighbours and good_stroke_neighbours members of the blob by
00180   // searching close on all 4 sides.
00181   // When finding leader dots/dashes, there is a slightly different rule for
00182   // what makes a good neighbour.
00183   // If activate_line_trap, then line-like objects are found and isolated.
00184   void SetNeighbours(bool leaders, bool activate_line_trap, BLOBNBOX* blob);
00185 
00186   // Sets the good_stroke_neighbours member of the blob if it has a
00187   // GoodNeighbour on the given side.
00188   // Also sets the neighbour in the blob, whether or not a good one is found.
00189   // Return value is the number of neighbours in the line trap size range.
00190   // Leaders get extra special lenient treatment.
00191   int FindGoodNeighbour(BlobNeighbourDir dir, bool leaders, BLOBNBOX* blob);
00192 
00193   // Makes the blob to be only horizontal or vertical where evidence
00194   // is clear based on gaps of 2nd order neighbours.
00195   void SetNeighbourFlows(BLOBNBOX* blob);
00196 
00197   // Nullify the neighbours in the wrong directions where the direction
00198   // is clear-cut based on a distance margin. Good for isolating vertical
00199   // text from neighbouring horizontal text.
00200   void SimplifyObviousNeighbours(BLOBNBOX* blob);
00201 
00202   // Smoothes the vertical/horizontal type of the blob based on the
00203   // 2nd-order neighbours. If reset_all is true, then all blobs are
00204   // changed. Otherwise, only ambiguous blobs are processed.
00205   void SmoothNeighbourTypes(PageSegMode pageseg_mode, bool desperate,
00206                             BLOBNBOX* blob);
00207 
00208   // Checks the left or right side of the given leader partition and sets the
00209   // (opposite) leader_on_right or leader_on_left flags for blobs
00210   // that are next to the given side of the given leader partition.
00211   void MarkLeaderNeighbours(const ColPartition* part, LeftOrRight side);
00212 
00213   // Partition creation. Accumulates vertical and horizontal text chains,
00214   // puts the remaining blobs in as unknowns, and then merges/splits to
00215   // minimize overlap and smoothes the types with neighbours and the color
00216   // image if provided. rerotation is used to rotate the coordinate space
00217   // back to the nontext_map_ image.
00218   // If find_problems is true, detects possible noise pollution by the amount
00219   // of partition overlap that is created by the diacritics. If excessive, the
00220   // noise is separated out into diacritic blobs, and PFR_NOISE is returned.
00221   // [TODO(rays): if the partition overlap is caused by heavy skew, deskews
00222   // the components, saves the skew_angle and returns PFR_SKEW.] If the return
00223   // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
00224   // called again after cleaning up the partly done work.
00225   PartitionFindResult FindInitialPartitions(PageSegMode pageseg_mode,
00226                                             const FCOORD& rerotation,
00227                                             bool find_problems, TO_BLOCK* block,
00228                                             BLOBNBOX_LIST* diacritic_blobs,
00229                                             ColPartitionGrid* part_grid,
00230                                             ColPartition_LIST* big_parts,
00231                                             FCOORD* skew_angle);
00232   // Detects noise by a significant increase in partition overlap from
00233   // pre_overlap to now, and removes noise from the union of all the overlapping
00234   // partitions, placing the blobs in diacritic_blobs. Returns true if any noise
00235   // was found and removed.
00236   bool DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box,
00237                             TO_BLOCK* block, ColPartitionGrid* part_grid,
00238                             BLOBNBOX_LIST* diacritic_blobs);
00239   // Finds vertical chains of text-like blobs and puts them in ColPartitions.
00240   void FindVerticalTextChains(ColPartitionGrid* part_grid);
00241   // Finds horizontal chains of text-like blobs and puts them in ColPartitions.
00242   void FindHorizontalTextChains(ColPartitionGrid* part_grid);
00243   // Finds diacritics and saves their base character in the blob.
00244   void TestDiacritics(ColPartitionGrid* part_grid, TO_BLOCK* block);
00245   // Searches this grid for an appropriately close and sized neighbour of the
00246   // given [small] blob. If such a blob is found, the diacritic base is saved
00247   // in the blob and true is returned.
00248   // The small_grid is a secondary grid that contains the small/noise objects
00249   // that are not in this grid, but may be useful for determining a connection
00250   // between blob and its potential base character. (See DiacriticXGapFilled.)
00251   bool DiacriticBlob(BlobGrid* small_grid, BLOBNBOX* blob);
00252   // Returns true if there is no gap between the base char and the diacritic
00253   // bigger than a fraction of the height of the base char:
00254   // Eg: line end.....'
00255   // The quote is a long way from the end of the line, yet it needs to be a
00256   // diacritic. To determine that the quote is not part of an image, or
00257   // a different text block, we check for other marks in the gap between
00258   // the base char and the diacritic.
00259   //                          '<--Diacritic
00260   // |---------|
00261   // |         |<-toobig-gap->
00262   // | Base    |<ok gap>
00263   // |---------|        x<-----Dot occupying gap
00264   // The grid is const really.
00265   bool DiacriticXGapFilled(BlobGrid* grid, const TBOX& diacritic_box,
00266                            const TBOX& base_box);
00267   // Merges diacritics with the ColPartition of the base character blob.
00268   void MergeDiacritics(TO_BLOCK* block, ColPartitionGrid* part_grid);
00269   // Any blobs on the large_blobs list of block that are still unowned by a
00270   // ColPartition, are probably drop-cap or vertically touching so the blobs
00271   // are removed to the big_parts list and treated separately.
00272   void RemoveLargeUnusedBlobs(TO_BLOCK* block,
00273                               ColPartitionGrid* part_grid,
00274                               ColPartition_LIST* big_parts);
00275 
00276     // All remaining unused blobs are put in individual ColPartitions.
00277   void PartitionRemainingBlobs(PageSegMode pageseg_mode,
00278                                ColPartitionGrid* part_grid);
00279 
00280   // If combine, put all blobs in the cell_list into a single partition,
00281   // otherwise put each one into its own partition.
00282   void MakePartitionsFromCellList(PageSegMode pageseg_mode, bool combine,
00283                                   ColPartitionGrid* part_grid,
00284                                   BLOBNBOX_CLIST* cell_list);
00285 
00286   // Helper function to finish setting up a ColPartition and insert into
00287   // part_grid.
00288   void CompletePartition(PageSegMode pageseg_mode, ColPartition* part,
00289                          ColPartitionGrid* part_grid);
00290 
00291   // Helper returns true if we are looking only for vertical textlines,
00292   // taking into account any rotation that has been done.
00293   bool FindingVerticalOnly(PageSegMode pageseg_mode) const {
00294     if (rerotation_.y() == 0.0f) {
00295       return pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
00296     }
00297     return !PSM_ORIENTATION_ENABLED(pageseg_mode) &&
00298            pageseg_mode != PSM_SINGLE_BLOCK_VERT_TEXT;
00299   }
00300   // Helper returns true if we are looking only for horizontal textlines,
00301   // taking into account any rotation that has been done.
00302   bool FindingHorizontalOnly(PageSegMode pageseg_mode) const {
00303     if (rerotation_.y() == 0.0f) {
00304       return !PSM_ORIENTATION_ENABLED(pageseg_mode) &&
00305              pageseg_mode != PSM_SINGLE_BLOCK_VERT_TEXT;
00306     }
00307     return pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
00308   }
00309 
00310   // Merge partitions where the merge appears harmless.
00311   void EasyMerges(ColPartitionGrid* part_grid);
00312 
00313   // Compute a search box based on the orientation of the partition.
00314   // Returns true if a suitable box can be calculated.
00315   // Callback for EasyMerges.
00316   bool OrientationSearchBox(ColPartition* part, TBOX* box);
00317 
00318   // Merge confirmation callback for EasyMerges.
00319   bool ConfirmEasyMerge(const ColPartition* p1, const ColPartition* p2);
00320 
00321   // Returns true if there is no significant noise in between the boxes.
00322   bool NoNoiseInBetween(const TBOX& box1, const TBOX& box2) const;
00323 
00324   // Displays the blobs colored according to the number of good neighbours
00325   // and the vertical/horizontal flow.
00326   ScrollView* DisplayGoodBlobs(const char* window_name, int x, int y);
00327 
00328   // Displays blobs colored according to whether or not they are diacritics.
00329   ScrollView* DisplayDiacritics(const char* window_name,
00330                                 int x, int y, TO_BLOCK* block);
00331 
00332  private:
00333   // Image map of photo/noise areas on the page. Borrowed pointer (not owned.)
00334   Pix* nontext_map_;
00335   // Textline projection map. Borrowed pointer.
00336   TextlineProjection* projection_;
00337   // DENORM used by projection_ to get back to image coords. Borrowed pointer.
00338   const DENORM* denorm_;
00339   // Bounding box of the grid.
00340   TBOX grid_box_;
00341   // Rerotation to get back to the original image.
00342   FCOORD rerotation_;
00343   // Windows for debug display.
00344   ScrollView* leaders_win_;
00345   ScrollView* initial_widths_win_;
00346   ScrollView* widths_win_;
00347   ScrollView* chains_win_;
00348   ScrollView* diacritics_win_;
00349   ScrollView* textlines_win_;
00350   ScrollView* smoothed_win_;
00351 };
00352 
00353 }  // namespace tesseract.
00354 
00355 #endif  // TESSERACT_TEXTORD_STROKEWIDTH_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines