tesseract 3.04.01

textord/textord.h

Go to the documentation of this file.
00001 
00002 // File:        textord.h
00003 // Description: The Textord class definition gathers text line and word
00004 //              finding functionality.
00005 // Author:      Ray Smith
00006 // Created:     Fri Mar 13 14:29:01 PDT 2009
00007 //
00008 // (C) Copyright 2009, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifndef TESSERACT_TEXTORD_TEXTORD_H__
00022 #define TESSERACT_TEXTORD_TEXTORD_H__
00023 
00024 #include "ccstruct.h"
00025 #include "bbgrid.h"
00026 #include "blobbox.h"
00027 #include "gap_map.h"
00028 #include "publictypes.h"  // For PageSegMode.
00029 
00030 class FCOORD;
00031 class BLOCK_LIST;
00032 class PAGE_RES;
00033 class TO_BLOCK;
00034 class TO_BLOCK_LIST;
00035 class ScrollView;
00036 
00037 namespace tesseract {
00038 
00039 // A simple class that can be used by BBGrid to hold a word and an expanded
00040 // bounding box that makes it easy to find words to put diacritics.
      // The word is borrowed, not owned; only the padded box is stored by value.
00041 class WordWithBox {
00042  public:
        // Default constructor: leaves word_ NULL. true_bounding_box() and
        // RejBlobs() dereference word_ without a check, so they must not be
        // called on an instance built this way.
00043   WordWithBox() : word_(NULL) {}
        // Wraps word, which is dereferenced here and so must be non-NULL, and
        // caches its bounding box after calling pad(height, height) on it, where
        // height is the height of the unpadded box.
00044   explicit WordWithBox(WERD *word)
00045       : word_(word), bounding_box_(word->bounding_box()) {
00046     int height = bounding_box_.height();
00047     bounding_box_.pad(height, height);
00048   }
00049 
        // Returns the cached, padded box (not the word's own bounding box).
00050   const TBOX &bounding_box() const { return bounding_box_; }
00051   // Returns the bounding box of only the good blobs.
00052   TBOX true_bounding_box() const { return word_->true_bounding_box(); }
        // Returns word_->rej_cblob_list(). NOTE(review): presumably the word's
        // list of rejected blobs - confirm against the WERD declaration.
00053   C_BLOB_LIST *RejBlobs() const { return word_->rej_cblob_list(); }
        // Returns the borrowed word as a pointer-to-const (NULL if default-built).
00054   const WERD *word() const { return word_; }
00055 
00056  private:
00057   // Borrowed pointer to a real word somewhere that must outlive this class.
00058   WERD *word_;
00059   // Cached expanded bounding box of the word, padded all round by its height.
00060   TBOX bounding_box_;
00061 };
00062 
00063 // Make it usable by BBGrid.
      // NOTE(review): CLISTIZEH is a macro defined elsewhere; it presumably
      // declares the WordWithBox_CLIST and WordWithBox_C_IT types used below -
      // confirm against its definition.
00064 CLISTIZEH(WordWithBox)
      // Grid of WordWithBox entries; this is the type that
      // Textord::TransferDiacriticsToWords takes as its word_grid argument.
00065 typedef BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> WordGrid;
      // GridSearch instantiated over the same element, list and iterator types.
00066 typedef GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> WordSearch;
00067 
00068 class Textord {
        // Gathers text line and word finding functionality (see the file header).
        // Only declarations appear here. NOTE(review): the "xxx.cpp ////" marker
        // comments presumably name the source file that defines the members which
        // follow each marker - confirm.
00069  public:
        // NOTE(review): ccstruct is presumably kept in ccstruct_ without taking
        // ownership - confirm in the constructor definition.
00070   explicit Textord(CCStruct* ccstruct);
00071   ~Textord();
00072 
00073   // Make the textlines and words inside each block.
00074   // binary_pix is mandatory and is the binarized input after line removal.
00075   // grey_pix is optional, but if present must match the binary_pix in size,
00076   // and must be a *real* grey image instead of binary_pix * 255.
00077   // thresholds_pix is expected to be present iff grey_pix is present and
00078   // can be an integer factor reduction of the grey_pix. It represents the
00079   // thresholds that were used to create the binary_pix from the grey_pix.
00080   // diacritic_blobs contain small confusing components that should be added
00081   // to the appropriate word(s) in case they are really diacritics.
        // NOTE(review): pageseg_mode, reskew, width, height, use_box_bottoms,
        // blocks and to_blocks are not described in this header; see the
        // definition for their exact roles.
00082   void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width,
00083                    int height, Pix *binary_pix, Pix *thresholds_pix,
00084                    Pix *grey_pix, bool use_box_bottoms,
00085                    BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks,
00086                    TO_BLOCK_LIST *to_blocks);
00087 
00088   // If we were supposed to return only a single textline, and there is more
00089   // than one, clean up and leave only the best.
00090   void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES* page_res);
00091 
        // Getter and setter for the use_cjk_fp_model_ flag.
00092   bool use_cjk_fp_model() const {
00093     return use_cjk_fp_model_;
00094   }
00095   void set_use_cjk_fp_model(bool flag) {
00096     use_cjk_fp_model_ = flag;
00097   }
00098 
00099   // tospace.cpp ///////////////////////////////////////////
00100   void to_spacing(
00101       ICOORD page_tr,        // top right of page
00102       TO_BLOCK_LIST *blocks  // blocks on page
00103                                          );
00104   ROW *make_prop_words(TO_ROW *row,     // row to make
00105                        FCOORD rotation  // for drawing
00106                        );
00107   ROW *make_blob_words(TO_ROW *row,     // row to make
00108                        FCOORD rotation  // for drawing
00109                        );
00110   // tordmain.cpp ///////////////////////////////////////////
00111   void find_components(Pix* pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);
00112   void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, BOOL8 testing_on);
00113 
00114  private:
00115   // For underlying memory management and other utilities.
00116   CCStruct* ccstruct_;
00117 
00118   // The size of the input image.
00119   ICOORD page_tr_;
00120 
        // Flag read and written through use_cjk_fp_model() and
        // set_use_cjk_fp_model() above.
00121   bool use_cjk_fp_model_;
00122 
00123   // makerow.cpp ///////////////////////////////////////////
00124   // Make the textlines inside each block.
00125   void MakeRows(PageSegMode pageseg_mode, const FCOORD& skew,
00126                 int width, int height, TO_BLOCK_LIST* to_blocks);
00127   // Make the textlines inside a single block.
00128   void MakeBlockRows(int min_spacing, int max_spacing,
00129                      const FCOORD& skew, TO_BLOCK* block,
00130                      ScrollView* win);
00131 
        // The next three declarations sit inside the makerow.cpp group but are
        // deliberately given public access; the group returns to private below.
00132  public:
00133   void compute_block_xheight(TO_BLOCK *block, float gradient);
00134   void compute_row_xheight(TO_ROW *row,          // row to do
00135                            const FCOORD& rotation,
00136                            float gradient,       // global skew
00137                            int block_line_size);
00138   void make_spline_rows(TO_BLOCK *block,   // block to do
00139                         float gradient,    // gradient to fit
00140                         BOOL8 testing_on);
00141  private:
00143   void make_old_baselines(TO_BLOCK *block,   // block to do
00144                           BOOL8 testing_on,  // correct orientation
00145                           float gradient);
00146   void correlate_lines(TO_BLOCK *block, float gradient);
00147   void correlate_neighbours(TO_BLOCK *block,  // block rows are in.
00148                             TO_ROW **rows,    // rows of block.
00149                             int rowcount);    // no of rows to do.
00150   int correlate_with_stats(TO_ROW **rows,  // rows of block.
00151                            int rowcount,   // no of rows to do.
00152                            TO_BLOCK* block);
00153   void find_textlines(TO_BLOCK *block,  // block row is in
00154                       TO_ROW *row,      // row to do
00155                       int degree,       // required approximation
00156                       QSPLINE *spline);  // starting spline
00157   // tospace.cpp ///////////////////////////////////////////
00158   // DEBUG USE ONLY
        // NOTE(review): it is not clear from this header which of the following
        // declarations the "DEBUG USE ONLY" remark is meant to cover.
00159   void block_spacing_stats(TO_BLOCK *block,
00160                            GAPMAP *gapmap,
00161                            BOOL8 &old_text_ord_proportional,
00162                            // resulting estimate
00163                            inT16 &block_space_gap_width,
00164                            // resulting estimate
00165                            inT16 &block_non_space_gap_width
00166                            );
00167   void row_spacing_stats(TO_ROW *row,
00168                          GAPMAP *gapmap,
00169                          inT16 block_idx,
00170                          inT16 row_idx,
00171                          // estimate for block
00172                          inT16 block_space_gap_width,
00173                          // estimate for block
00174                          inT16 block_non_space_gap_width
00175                          );
00176   void old_to_method(TO_ROW *row,
00177                      STATS *all_gap_stats,
00178                      STATS *space_gap_stats,
00179                      STATS *small_gap_stats,
00180                      inT16 block_space_gap_width,
00181                      // estimate for block
00182                      inT16 block_non_space_gap_width
00183                      );
00184   BOOL8 isolated_row_stats(TO_ROW *row,
00185                            GAPMAP *gapmap,
00186                            STATS *all_gap_stats,
00187                            BOOL8 suspected_table,
00188                            inT16 block_idx,
00189                            inT16 row_idx);
00190   inT16 stats_count_under(STATS *stats, inT16 threshold);
00191   void improve_row_threshold(TO_ROW *row, STATS *all_gap_stats);
        // NOTE(review): blanks, fuzzy_sp, fuzzy_non, prev_gap_was_a_space and
        // break_at_next_gap are non-const references, so presumably outputs or
        // in/out state - confirm in the definition.
00192   BOOL8 make_a_word_break(TO_ROW *row,   // row being made
00193                           TBOX blob_box, // for next_blob (NOTE(review): the original "how many blanks?" remark presumably refers to blanks below)
00194                           inT16 prev_gap,
00195                           TBOX prev_blob_box,
00196                           inT16 real_current_gap,
00197                           inT16 within_xht_current_gap,
00198                           TBOX next_blob_box,
00199                           inT16 next_gap,
00200                           uinT8 &blanks,
00201                           BOOL8 &fuzzy_sp,
00202                           BOOL8 &fuzzy_non,
00203                           BOOL8& prev_gap_was_a_space,
00204                           BOOL8& break_at_next_gap);
00205   BOOL8 narrow_blob(TO_ROW *row, TBOX blob_box);
00206   BOOL8 wide_blob(TO_ROW *row, TBOX blob_box);
00207   BOOL8 suspected_punct_blob(TO_ROW *row, TBOX box);
        // NOTE(review): box_it is taken by value, so the caller's iterator object
        // is not modified; the three reference parameters are presumably outputs.
00208   void peek_at_next_gap(TO_ROW *row,
00209                         BLOBNBOX_IT box_it,
00210                         TBOX &next_blob_box,
00211                         inT16 &next_gap,
00212                         inT16 &next_within_xht_gap);
00213   void mark_gap(TBOX blob,    // blob following gap
00214                 inT16 rule,  // heuristic id
00215                 inT16 prev_gap,
00216                 inT16 prev_blob_width,
00217                 inT16 current_gap,
00218                 inT16 next_blob_width,
00219                 inT16 next_gap);
00220   float find_mean_blob_spacing(WERD *word);
00221   BOOL8 ignore_big_gap(TO_ROW *row,
00222                        inT32 row_length,
00223                        GAPMAP *gapmap,
00224                        inT16 left,
00225                        inT16 right);
00226   // get bounding box
00227   TBOX reduced_box_next(TO_ROW *row,     // current row
00228                         BLOBNBOX_IT *it  // iterator to blobs
00229                         );
00230   TBOX reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, inT16 *left_above_xht);
00231   // tordmain.cpp ///////////////////////////////////////////
00232   float filter_noise_blobs(BLOBNBOX_LIST *src_list,
00233                            BLOBNBOX_LIST *noise_list,
00234                            BLOBNBOX_LIST *small_list,
00235                            BLOBNBOX_LIST *large_list);
00236   // Fixes the block so it obeys all the rules:
00237   // Must have at least one ROW.
00238   // Must have at least one WERD.
00239   // WERDs contain a fake blob.
00240   void cleanup_nontext_block(BLOCK* block);
00241   void cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks);
00242   BOOL8 clean_noise_from_row(ROW *row);
00243   void clean_noise_from_words(ROW *row);
00244   // Remove outlines that are a tiny fraction in either width or height
00245   // of the word height.
00246   void clean_small_noise_from_words(ROW *row);
00247   // Groups blocks by rotation, then, for each group, makes a WordGrid and calls
00248   // TransferDiacriticsToWords to copy the diacritic blobs to the most
00249   // appropriate words in the group of blocks. Source blobs are not touched.
00250   void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
00251                                        BLOCK_LIST* blocks);
00252   // Places a copy of blobs that are near a word (after applying rotation to the
00253   // blob) in the most appropriate word, unless there is doubt, in which case a
00254   // blob can end up in two words. Source blobs are not touched.
00255   void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs,
00256                                  const FCOORD &rotation, WordGrid *word_grid);
00257 
00258  public:
        // Parameter declarations. NOTE(review): BOOL_VAR_H, INT_VAR_H and
        // double_VAR_H are macros defined elsewhere; each use presumably declares
        // a member named by the first argument, with the second argument as its
        // default value and the third as its help text - confirm against the
        // macro definitions. The help strings are runtime text.
00259   // makerow.cpp ///////////////////////////////////////////
00260   BOOL_VAR_H(textord_single_height_mode, false,
00261              "Script has no xheight, so use a single mode for horizontal text");
00262   // tospace.cpp ///////////////////////////////////////////
        // NOTE(review): tosp_old_to_method carries the same help text as
        // tosp_use_pre_chopping below; possibly a copy/paste slip - verify.
00263   BOOL_VAR_H(tosp_old_to_method, false, "Space stats use prechopping?");
00264   BOOL_VAR_H(tosp_old_to_constrain_sp_kn, false,
00265              "Constrain relative values of inter and intra-word gaps for "
00266              "old_to_method.");
00267   BOOL_VAR_H(tosp_only_use_prop_rows, true,
00268              "Block stats to use fixed pitch rows?");
00269   BOOL_VAR_H(tosp_force_wordbreak_on_punct, false,
00270              "Force word breaks on punct to break long lines in non-space "
00271              "delimited langs");
00272   BOOL_VAR_H(tosp_use_pre_chopping, false,
00273              "Space stats use prechopping?");
00274   BOOL_VAR_H(tosp_old_to_bug_fix, false,
00275              "Fix suspected bug in old code");
        // NOTE(review): the next four parameters all share the help text
        // "Only stat OBVIOUS spaces", so the text does not distinguish them.
00276   BOOL_VAR_H(tosp_block_use_cert_spaces, true,
00277              "Only stat OBVIOUS spaces");
00278   BOOL_VAR_H(tosp_row_use_cert_spaces, true,
00279              "Only stat OBVIOUS spaces");
00280   BOOL_VAR_H(tosp_narrow_blobs_not_cert, true,
00281              "Only stat OBVIOUS spaces");
00282   BOOL_VAR_H(tosp_row_use_cert_spaces1, true,
00283              "Only stat OBVIOUS spaces");
00284   BOOL_VAR_H(tosp_recovery_isolated_row_stats, true,
00285              "Use row alone when inadequate cert spaces");
00286   BOOL_VAR_H(tosp_only_small_gaps_for_kern, false, "Better guess");
00287   BOOL_VAR_H(tosp_all_flips_fuzzy, false, "Pass ANY flip to context?");
00288   BOOL_VAR_H(tosp_fuzzy_limit_all, true,
00289              "Don't restrict kn->sp fuzzy limit to tables");
        // NOTE(review): tosp_stats_use_xht_gaps and tosp_use_xht_gaps share one
        // help text.
00290   BOOL_VAR_H(tosp_stats_use_xht_gaps, true,
00291              "Use within xht gap for wd breaks");
00292   BOOL_VAR_H(tosp_use_xht_gaps, true,
00293              "Use within xht gap for wd breaks");
00294   BOOL_VAR_H(tosp_only_use_xht_gaps, false,
00295              "Only use within xht gap for wd breaks");
00296   BOOL_VAR_H(tosp_rule_9_test_punct, false,
00297              "Don't chng kn to space next to punct");
00298   BOOL_VAR_H(tosp_flip_fuzz_kn_to_sp, true, "Default flip");
00299   BOOL_VAR_H(tosp_flip_fuzz_sp_to_kn, true, "Default flip");
00300   BOOL_VAR_H(tosp_improve_thresh, false,
00301              "Enable improvement heuristic");
00302   INT_VAR_H(tosp_debug_level, 0, "Debug data");
00303   INT_VAR_H(tosp_enough_space_samples_for_median, 3,
00304             "or should we use mean");
00305   INT_VAR_H(tosp_redo_kern_limit, 10,
00306             "No.samples reqd to reestimate for row");
00307   INT_VAR_H(tosp_few_samples, 40,
00308             "No.gaps reqd with 1 large gap to treat as a table");
00309   INT_VAR_H(tosp_short_row, 20,
00310             "No.gaps reqd with few cert spaces to use certs");
00311   INT_VAR_H(tosp_sanity_method, 1, "How to avoid being silly");
00312   double_VAR_H(tosp_old_sp_kn_th_factor, 2.0,
00313                "Factor for defining space threshold in terms of space and "
00314                "kern sizes");
00315   double_VAR_H(tosp_threshold_bias1, 0,
00316                "how far between kern and space?");
00317   double_VAR_H(tosp_threshold_bias2, 0,
00318                "how far between kern and space?");
00319   double_VAR_H(tosp_narrow_fraction, 0.3,
00320                "Fract of xheight for narrow");
00321   double_VAR_H(tosp_narrow_aspect_ratio, 0.48,
00322                "narrow if w/h less than this");
00323   double_VAR_H(tosp_wide_fraction, 0.52, "Fract of xheight for wide");
        // NOTE(review): the help text says "less than", mirroring the narrow
        // variant above; verify the intended comparison where this is used.
00324   double_VAR_H(tosp_wide_aspect_ratio, 0.0,
00325                "wide if w/h less than this");
00326   double_VAR_H(tosp_fuzzy_space_factor, 0.6,
00327                "Fract of xheight for fuzz sp");
00328   double_VAR_H(tosp_fuzzy_space_factor1, 0.5,
00329                "Fract of xheight for fuzz sp");
00330   double_VAR_H(tosp_fuzzy_space_factor2, 0.72,
00331                "Fract of xheight for fuzz sp");
00332   double_VAR_H(tosp_gap_factor, 0.83, "gap ratio to flip sp->kern");
00333   double_VAR_H(tosp_kern_gap_factor1, 2.0,
00334                "gap ratio to flip kern->sp");
00335   double_VAR_H(tosp_kern_gap_factor2, 1.3,
00336                "gap ratio to flip kern->sp");
00337   double_VAR_H(tosp_kern_gap_factor3, 2.5,
00338                "gap ratio to flip kern->sp");
00339   double_VAR_H(tosp_ignore_big_gaps, -1, "xht multiplier");
00340   double_VAR_H(tosp_ignore_very_big_gaps, 3.5, "xht multiplier");
00341   double_VAR_H(tosp_rep_space, 1.6, "rep gap multiplier for space");
00342   double_VAR_H(tosp_enough_small_gaps, 0.65,
00343                "Fract of kerns reqd for isolated row stats");
00344   double_VAR_H(tosp_table_kn_sp_ratio, 2.25,
00345                "Min difference of kn & sp in table");
00346   double_VAR_H(tosp_table_xht_sp_ratio, 0.33,
00347                "Expect spaces bigger than this");
00348   double_VAR_H(tosp_table_fuzzy_kn_sp_ratio, 3.0,
00349                "Fuzzy if less than this");
00350   double_VAR_H(tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg");
00351   double_VAR_H(tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg");
00352   double_VAR_H(tosp_min_sane_kn_sp, 1.5,
00353                "Don't trust spaces less than this time kn");
00354   double_VAR_H(tosp_init_guess_kn_mult, 2.2,
00355                "Thresh guess - mult kn by this");
00356   double_VAR_H(tosp_init_guess_xht_mult, 0.28,
00357                "Thresh guess - mult xht by this");
00358   double_VAR_H(tosp_max_sane_kn_thresh, 5.0,
00359                "Multiplier on kn to limit thresh");
00360   double_VAR_H(tosp_flip_caution, 0.0,
00361                "Don't autoflip kn to sp when large separation");
00362   double_VAR_H(tosp_large_kerning, 0.19,
00363                "Limit use of xht gap with large kns");
00364   double_VAR_H(tosp_dont_fool_with_small_kerns, -1,
00365                "Limit use of xht gap with odd small kns");
00366   double_VAR_H(tosp_near_lh_edge, 0,
00367                "Don't reduce box if the top left is non blank");
00368   double_VAR_H(tosp_silly_kn_sp_gap, 0.2,
00369                "Don't let sp minus kn get too small");
00370   double_VAR_H(tosp_pass_wide_fuzz_sp_to_context, 0.75,
00371                "How wide fuzzies need context");
00372   // tordmain.cpp ///////////////////////////////////////////
00373   BOOL_VAR_H(textord_no_rejects, false, "Don't remove noise blobs");
00374   BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
00375   BOOL_VAR_H(textord_show_boxes, false, "Display boxes");
00376   INT_VAR_H(textord_max_noise_size, 7, "Pixel size of noise");
00377   INT_VAR_H(textord_baseline_debug, 0, "Baseline debug level");
00378   double_VAR_H(textord_blob_size_bigile, 95, "Percentile for large blobs");
00379   double_VAR_H(textord_noise_area_ratio, 0.7,
00380                "Fraction of bounding box for noise");
00381   double_VAR_H(textord_blob_size_smallile, 20, "Percentile for small blobs");
        // NOTE(review): textord_initialx_ile and textord_initialasc_ile share the
        // help text "Ile of sizes for xheight guess" despite different names.
00382   double_VAR_H(textord_initialx_ile, 0.75, "Ile of sizes for xheight guess");
00383   double_VAR_H(textord_initialasc_ile, 0.90, "Ile of sizes for xheight guess");
00384   INT_VAR_H(textord_noise_sizefraction, 10, "Fraction of size for maxima");
00385   double_VAR_H(textord_noise_sizelimit, 0.5, "Fraction of x for big t count");
00386   INT_VAR_H(textord_noise_translimit, 16, "Transitions for normal blob");
        // NOTE(review): same help text as textord_noise_rowratio below.
00387   double_VAR_H(textord_noise_normratio, 2.0, "Dot to norm ratio for deletion");
00388   BOOL_VAR_H(textord_noise_rejwords, true, "Reject noise-like words");
00389   BOOL_VAR_H(textord_noise_rejrows, true, "Reject noise-like rows");
00390   double_VAR_H(textord_noise_syfract, 0.2, "xh fract error for norm blobs");
00391   double_VAR_H(textord_noise_sxfract, 0.4,
00392                "xh fract width error for norm blobs");
00393   double_VAR_H(textord_noise_hfract, 1.0/64,
00394                "Height fraction to discard outlines as speckle noise");
00395   INT_VAR_H(textord_noise_sncount, 1, "super norm blobs to save row");
00396   double_VAR_H(textord_noise_rowratio, 6.0, "Dot to norm ratio for deletion");
        // NOTE(review): this is the only BOOL_VAR_H here whose default is spelled
        // FALSE rather than false.
00397   BOOL_VAR_H(textord_noise_debug, FALSE, "Debug row garbage detector");
00398   double_VAR_H(textord_blshift_maxshift, 0.00, "Max baseline shift");
00399   double_VAR_H(textord_blshift_xfraction, 9.99, "Min size of baseline shift");
00400 };
00401 }  // namespace tesseract.
00402 
00403 #endif  // TESSERACT_TEXTORD_TEXTORD_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines