tesseract 3.04.01

ccmain/tesseractclass.h

Go to the documentation of this file.
00001 
00002 // File:        tesseractclass.h
00003 // Description: The Tesseract class. It holds/owns everything needed
00004 //              to run Tesseract on a single language, and also a set of
00005 //              sub-Tesseracts to run sub-languages. For thread safety, *every*
00006 //              global variable goes in here, directly, or indirectly.
00007 //              This makes it safe to run multiple Tesseracts in different
00008 //              threads in parallel, and keeps the different language
00009 //              instances separate.
00010 // Author:      Ray Smith
00011 // Created:     Fri Mar 07 08:17:01 PST 2008
00012 //
00013 // (C) Copyright 2008, Google Inc.
00014 // Licensed under the Apache License, Version 2.0 (the "License");
00015 // you may not use this file except in compliance with the License.
00016 // You may obtain a copy of the License at
00017 // http://www.apache.org/licenses/LICENSE-2.0
00018 // Unless required by applicable law or agreed to in writing, software
00019 // distributed under the License is distributed on an "AS IS" BASIS,
00020 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021 // See the License for the specific language governing permissions and
00022 // limitations under the License.
00023 //
00025 
00026 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__
00027 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__
00028 
00029 #include "allheaders.h"
00030 #include "control.h"
00031 #include "docqual.h"
00032 #include "devanagari_processing.h"
00033 #include "genericvector.h"
00034 #include "params.h"
00035 #include "ocrclass.h"
00036 #include "textord.h"
00037 #include "wordrec.h"
00038 
00039 class BLOB_CHOICE_LIST_CLIST;
00040 class BLOCK_LIST;
00041 class CharSamp;
00042 struct OSResults;
00043 class PAGE_RES;
00044 class PAGE_RES_IT;
00045 struct Pix;
00046 class ROW;
00047 class SVMenuNode;
00048 class TBOX;
00049 class TO_BLOCK_LIST;
00050 class WERD;
00051 class WERD_CHOICE;
00052 class WERD_RES;
00053 
00054 
00055 // Top-level class for all tesseract global instance data.
00056 // This class either holds or points to all data used by an instance
00057 // of Tesseract, including the memory allocator. When this is
00058 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
00059 //
00060 // NOTE to developers: Do not create cyclic dependencies through this class!
00061 // The directory dependency tree must remain a tree! To keep this clean,
00062 // lower-level code (eg in ccutil, the bottom level) must never need to
00063 // know about the content of a higher-level directory.
00064 // The following scheme will grant the easiest access to lower-level
00065 // global members without creating a cyclic dependency:
00066 //
00067 // Class Hierarchy (^ = inheritance):
00068 //
00069 //             CCUtil (ccutil/ccutil.h)
00070 //                         ^      Members include: UNICHARSET
00071 //            CUtil (cutil/cutil_class.h)
00072 //                         ^       Members include: TBLOB*, TEXTBLOCK*
00073 //           CCStruct (ccstruct/ccstruct.h)
00074 //                         ^       Members include: Image
00075 //           Classify (classify/classify.h)
00076 //                         ^       Members include: Dict
00077 //             WordRec (wordrec/wordrec.h)
00078 //                         ^       Members include: WERD*, DENORM*
00079 //        Tesseract (ccmain/tesseractclass.h)
00080 //                                 Members include: Pix*, CubeRecoContext*,
00081 //                                 TesseractCubeCombiner*
00082 //
00083 // Other important classes:
00084 //
00085 //  TessBaseAPI (api/baseapi.h)
00086 //                                 Members include: BLOCK_LIST*, PAGE_RES*,
00087 //                                 Tesseract*, ImageThresholder*
00088 //  Dict (dict/dict.h)
00089 //                                 Members include: Image* (private)
00090 //
00091 // NOTE: each level contains members that correspond to global
00092 // data that is defined (and used) at that level, not necessarily where
00093 // the type is defined, so for instance:
00094 // BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
00095 // goes inside the Textord class, not the CCUtil class.
00096 
00097 namespace tesseract {
00098 
00099 class ColumnFinder;
00100 #ifndef NO_CUBE_BUILD
00101 class CubeLineObject;
00102 class CubeObject;
00103 class CubeRecoContext;
00104 #endif
00105 class EquationDetect;
00106 class Tesseract;
00107 #ifndef NO_CUBE_BUILD
00108 class TesseractCubeCombiner;
00109 #endif
00110 
00111 // A collection of various variables for statistics and debugging.
00112 struct TesseractStats {
00113   TesseractStats()
00114     : adaption_word_number(0),
00115       doc_blob_quality(0),
00116       doc_outline_errs(0),
00117       doc_char_quality(0),
00118       good_char_count(0),
00119       doc_good_char_quality(0),
00120       word_count(0),
00121       dict_words(0),
00122       tilde_crunch_written(false),
00123       last_char_was_newline(true),
00124       last_char_was_tilde(false),
00125       write_results_empty_block(true) {}
00126 
00127   inT32 adaption_word_number;
00128   inT16 doc_blob_quality;
00129   inT16 doc_outline_errs;
00130   inT16 doc_char_quality;
00131   inT16 good_char_count;
00132   inT16 doc_good_char_quality;
00133   inT32 word_count;  // count of words in the document
00134   inT32 dict_words;  // number of dictionary words in the document
00135   STRING dump_words_str;  // accumulator used by dump_words()
00136   // Flags used by write_results()
00137   bool tilde_crunch_written;
00138   bool last_char_was_newline;
00139   bool last_char_was_tilde;
00140   bool write_results_empty_block;
00141 };
00142 
00143 // Struct to hold all the pointers to relevant data for processing a word.
00144 struct WordData {
00145   WordData() : word(NULL), row(NULL), block(NULL), prev_word(NULL) {}
00146   explicit WordData(const PAGE_RES_IT& page_res_it)
00147     : word(page_res_it.word()), row(page_res_it.row()->row),
00148       block(page_res_it.block()->block), prev_word(NULL) {}
00149   WordData(BLOCK* block_in, ROW* row_in, WERD_RES* word_res)
00150     : word(word_res), row(row_in), block(block_in), prev_word(NULL) {}
00151 
00152   WERD_RES* word;  // the word being processed
00153   ROW* row;  // row context of the word
00154   BLOCK* block;  // block context of the word
00155   WordData* prev_word;  // always NULL after construction
00156   PointerVector<WERD_RES> lang_words;  // left default-constructed by all ctors
00157 };
00158 
00159 // Definition of a Tesseract WordRecognizer. The WordData provides the context
00160 // of row/block, in_word holds an initialized, possibly pre-classified word,
00161 // that the recognizer may or may not consume (but if so it sets *in_word=NULL)
00162 // and produces one or more output words in out_words, which may be the
00163 // consumed in_word, or may be generated independently.
00164 // This api allows both a conventional tesseract classifier to work, or a
00165 // line-level classifier that generates multiple words from a merged input.
00166 typedef void (Tesseract::*WordRecognizer)(const WordData& word_data,
00167                                           WERD_RES** in_word,
00168                                           PointerVector<WERD_RES>* out_words);
00169 
00170 class Tesseract : public Wordrec {
00171  public:
00172   Tesseract();
00173   ~Tesseract();
00174 
00175   // Clear as much used memory as possible without resetting the adaptive
00176   // classifier or losing any other classifier data.
00177   void Clear();
00178   // Clear all memory of adaption for this and all subclassifiers.
00179   void ResetAdaptiveClassifier();
00180   // Clear the document dictionary for this and all subclassifiers.
00181   void ResetDocumentDictionary();
00182 
00183   // Set the equation detector.
00184   void SetEquationDetect(EquationDetect* detector);
00185 
00186   // Simple accessors.
00187   const FCOORD& reskew() const {
00188     return reskew_;
00189   }
00190   // Destroys any existing pix via Clear(); returns the address of pix_binary_.
00191   Pix** mutable_pix_binary() {
00192     Clear();
00193     return &pix_binary_;
00194   }
00195   Pix* pix_binary() const {
00196     return pix_binary_;
00197   }
00198   Pix* pix_grey() const {
00199     return pix_grey_;
00200   }
00201   void set_pix_grey(Pix* grey_pix) {
00202     pixDestroy(&pix_grey_);
00203     pix_grey_ = grey_pix;
00204   }
00205   // Returns a pointer to a Pix representing the best available image of the
00206   // page. The image will be 8-bit grey if the input was grey or color. Note
00207   // that in grey 0 is black and 255 is white. If the input was binary, then
00208   // the returned Pix will be binary. Note that here black is 1 and white is 0.
00209   // To tell the difference pixGetDepth() will return 8 or 1.
00210   // In either case, the return value is a borrowed Pix, and should not be
00211   // deleted or pixDestroyed.
00212   Pix* BestPix() const {
00213     return pix_grey_ != NULL ? pix_grey_ : pix_binary_;
00214   }
00215   void set_pix_thresholds(Pix* thresholds) {
00216     pixDestroy(&pix_thresholds_);
00217     pix_thresholds_ = thresholds;
00218   }
00219   int source_resolution() const {
00220     return source_resolution_;
00221   }
00222   void set_source_resolution(int ppi) {
00223     source_resolution_ = ppi;
00224   }
00225   int ImageWidth() const {
00226     return pixGetWidth(pix_binary_);
00227   }
00228   int ImageHeight() const {
00229     return pixGetHeight(pix_binary_);
00230   }
00231   Pix* scaled_color() const {
00232     return scaled_color_;
00233   }
00234   int scaled_factor() const {
00235     return scaled_factor_;
00236   }
00237   void SetScaledColor(int factor, Pix* color) {
00238     scaled_factor_ = factor;
00239     scaled_color_ = color;  // NOTE(review): old pix not destroyed here; confirm.
00240   }
00241   const Textord& textord() const {
00242     return textord_;
00243   }
00244   Textord* mutable_textord() {
00245     return &textord_;
00246   }
00247 
00248   bool right_to_left() const {
00249     return right_to_left_;
00250   }
00251   int num_sub_langs() const {
00252     return sub_langs_.size();
00253   }
00254   Tesseract* get_sub_lang(int index) const {
00255     return sub_langs_[index];
00256   }
00257   // Returns true if any language uses Tesseract (as opposed to cube).
00258   bool AnyTessLang() const {
00259     if (tessedit_ocr_engine_mode != OEM_CUBE_ONLY) return true;
00260     for (int i = 0; i < sub_langs_.size(); ++i) {
00261       if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_CUBE_ONLY)
00262         return true;
00263     }
00264     return false;
00265   }
00266 
00267   void SetBlackAndWhitelist();
00268 
00269   // Perform steps to prepare underlying binary image/other data structures for
00270   // page segmentation. Uses the strategy specified in the global variable
00271   // pageseg_devanagari_split_strategy for performing splitting while preparing
00272   // for page segmentation.
00273   void PrepareForPageseg();
00274 
00275   // Perform steps to prepare underlying binary image/other data structures for
00276   // Tesseract OCR. The current segmentation is required by this method.
00277   // Uses the strategy specified in the global variable
00278   // ocr_devanagari_split_strategy for performing splitting while preparing for
00279   // Tesseract ocr.
00280   void PrepareForTessOCR(BLOCK_LIST* block_list,
00281                          Tesseract* osd_tess, OSResults* osr);
00282 
00283   int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
00284                   Tesseract* osd_tess, OSResults* osr);
00285   void SetupWordScripts(BLOCK_LIST* blocks);
00286   int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
00287                   TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
00288                   Tesseract* osd_tess, OSResults* osr);
00289   ColumnFinder* SetupPageSegAndDetectOrientation(
00290       PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
00291       OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
00292       Pix** music_mask_pix);
00293   // par_control.cpp
00294   void PrerecAllWordsPar(const GenericVector<WordData>& words);
00295 
00297   bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box,
00298                          const char* word_config, int pass);
00299   // Sets up the words ready for whichever engine is to be run
00300   void SetupAllWordsPassN(int pass_n,
00301                           const TBOX* target_word_box,
00302                           const char* word_config,
00303                           PAGE_RES* page_res,
00304                           GenericVector<WordData>* words);
00305   // Sets up the single word ready for whichever engine is to be run.
00306   void SetupWordPassN(int pass_n, WordData* word);
00307   // Runs word recognition on all the words.
00308   bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
00309                           PAGE_RES_IT* pr_it,
00310                           GenericVector<WordData>* words);
00311   bool recog_all_words(PAGE_RES* page_res,
00312                        ETEXT_DESC* monitor,
00313                        const TBOX* target_word_box,
00314                        const char* word_config,
00315                        int dopasses);
00316   void rejection_passes(PAGE_RES* page_res,
00317                         ETEXT_DESC* monitor,
00318                         const TBOX* target_word_box,
00319                         const char* word_config);
00320   void bigram_correction_pass(PAGE_RES *page_res);
00321   void blamer_pass(PAGE_RES* page_res);
00322   // Sets script positions and detects smallcaps on all output words.
00323   void script_pos_pass(PAGE_RES* page_res);
00324   // Helper to recognize the word using the given (language-specific) tesseract.
00325   // Returns positive if this recognizer found more new best words than the
00326   // number kept from best_words.
00327   int RetryWithLanguage(const WordData& word_data,
00328                         WordRecognizer recognizer,
00329                         WERD_RES** in_word,
00330                         PointerVector<WERD_RES>* best_words);
00331   // Moves good-looking "noise"/diacritics from the reject list to the main
00332   // blob list on the current word. Returns true if anything was done, and
00333   // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
00334   bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
00335                           bool* make_next_word_fuzzy);
00336   // Attempts to put noise/diacritic outlines into the blobs that they overlap.
00337   // Input: a set of noisy outlines that probably belong to the real_word.
00338   // Output: outlines that overlapped blobs are set to NULL and put back into
00339   // the word, either in the blobs or in the reject list.
00340   void AssignDiacriticsToOverlappingBlobs(
00341       const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
00342       PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
00343       GenericVector<bool>* overlapped_any_blob,
00344       GenericVector<C_BLOB*>* target_blobs);
00345   // Attempts to assign non-overlapping outlines to their nearest blobs or
00346   // make new blobs out of them.
00347   void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE*>& outlines,
00348                                   int pass, WERD* real_word, PAGE_RES_IT* pr_it,
00349                                   GenericVector<bool>* word_wanted,
00350                                   GenericVector<C_BLOB*>* target_blobs);
00351   // Starting with ok_outlines set to indicate which outlines overlap the blob,
00352   // chooses the optimal set (approximately) and returns true if any outlines
00353   // are desired, in which case ok_outlines indicates which ones.
00354   bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
00355                                    PAGE_RES_IT* pr_it, C_BLOB* blob,
00356                                    const GenericVector<C_OUTLINE*>& outlines,
00357                                    int num_outlines,
00358                                    GenericVector<bool>* ok_outlines);
00359   // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
00360   // the inclusion of the outlines, and returns the certainty of the raw choice.
00361   float ClassifyBlobPlusOutlines(const GenericVector<bool>& ok_outlines,
00362                                  const GenericVector<C_OUTLINE*>& outlines,
00363                                  int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
00364                                  STRING* best_str);
00365   // Classifies the given blob (part of word_data->word->word) as an individual
00366   // word, using languages, chopper etc, returning only the certainty of the
00367   // best raw choice, and undoing all the work done to fake out the word.
00368   float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
00369                            STRING* best_str, float* c2);
00370   void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
00371                                   WordData* word_data);
00372   void classify_word_pass1(const WordData& word_data,
00373                            WERD_RES** in_word,
00374                            PointerVector<WERD_RES>* out_words);
00375   void recog_pseudo_word(PAGE_RES* page_res,  // blocks to check
00376                          TBOX &selection_box);
00377 
00378   void fix_rep_char(PAGE_RES_IT* page_res_it);
00379 
00380   ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set,
00381                                               const char *s,
00382                                               const char *lengths);
00383   void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block);
00384   void classify_word_pass2(const WordData& word_data,
00385                            WERD_RES** in_word,
00386                            PointerVector<WERD_RES>* out_words);
00387   void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
00388                           WERD_RES* word, WERD_RES* new_word);
00389   bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
00390   bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
00391   // Runs recognition with the test baseline shift and x-height and returns true
00392   // if there was an improvement in recognition result.
00393   bool TestNewNormalization(int original_misfits, float baseline_shift,
00394                             float new_x_ht, WERD_RES *word, BLOCK* block,
00395                             ROW *row);
00396   BOOL8 recog_interactive(PAGE_RES_IT* pr_it);
00397 
00398   // Set fonts of this word.
00399   void set_word_fonts(WERD_RES *word);
00400   void font_recognition_pass(PAGE_RES* page_res);
00401   void dictionary_correction_pass(PAGE_RES* page_res);
00402   BOOL8 check_debug_pt(WERD_RES *word, int location);
00403 
00405   bool SubAndSuperscriptFix(WERD_RES *word_res);
00406   void GetSubAndSuperscriptCandidates(const WERD_RES *word,
00407                                       int *num_rebuilt_leading,
00408                                       ScriptPos *leading_pos,
00409                                       float *leading_certainty,
00410                                       int *num_rebuilt_trailing,
00411                                       ScriptPos *trailing_pos,
00412                                       float *trailing_certainty,
00413                                       float *avg_certainty,
00414                                       float *unlikely_threshold);
00415   WERD_RES *TrySuperscriptSplits(int num_chopped_leading,
00416                                  float leading_certainty,
00417                                  ScriptPos leading_pos,
00418                                  int num_chopped_trailing,
00419                                  float trailing_certainty,
00420                                  ScriptPos trailing_pos,
00421                                  WERD_RES *word,
00422                                  bool *is_good,
00423                                  int *retry_leading,
00424                                  int *retry_trailing);
00425   bool BelievableSuperscript(bool debug,
00426                              const WERD_RES &word,
00427                              float certainty_threshold,
00428                              int *left_ok,
00429                              int *right_ok) const;
00430 
00432 #ifndef NO_CUBE_BUILD
00433   bool init_cube_objects(bool load_combiner,
00434                          TessdataManager *tessdata_manager);
00435   // Iterates through tesseract's results and calls cube on each word,
00436   // combining the results with the existing tesseract result.
00437   void run_cube_combiner(PAGE_RES *page_res);
00438   // Recognizes a single word using (only) cube. Compatible with
00439   // Tesseract's classify_word_pass1/classify_word_pass2.
00440   void cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word);
00441   // Cube recognizer to recognize a single word as with classify_word_pass1
00442   // but also returns the cube object in case the combiner is needed.
00443   CubeObject* cube_recognize_word(BLOCK* block, WERD_RES* word);
00444   // Combines the cube and tesseract results for a single word, leaving the
00445   // result in tess_word.
00446   void cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word,
00447                         WERD_RES* tess_word);
00448   // Call cube on the current word, and write the result to word.
00449   // Sets up a fake result and returns false if something goes wrong.
00450   bool cube_recognize(CubeObject *cube_obj, BLOCK* block, WERD_RES *word);
00451   void fill_werd_res(const BoxWord& cube_box_word,
00452                      const char* cube_best_str,
00453                      WERD_RES* tess_werd_res);
00454   bool extract_cube_state(CubeObject* cube_obj, int* num_chars,
00455                           Boxa** char_boxes, CharSamp*** char_samples);
00456   bool create_cube_box_word(Boxa *char_boxes, int num_chars,
00457                             TBOX word_box, BoxWord* box_word);
00458 #endif
00459 
00460 
00461   void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box);
00462   void write_results(PAGE_RES_IT &page_res_it,  // full info
00463                      char newline_type,         // type of newline
00464                      BOOL8 force_eol            // override tilde crunch?
00465                     );
00466   void set_unlv_suspects(WERD_RES *word);
00467   UNICHAR_ID get_rep_char(WERD_RES *word);  // what char is repeated?
00468   BOOL8 acceptable_number_string(const char *s,
00469                                  const char *lengths);
00470   inT16 count_alphanums(const WERD_CHOICE &word);
00471   inT16 count_alphas(const WERD_CHOICE &word);
00473   void read_config_file(const char *filename, SetParamConstraint constraint);
00474   // Initialize for potentially a set of languages defined by the language
00475   // string and recursively any additional languages required by any language
00476   // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
00477   // See init_tesseract_internal for args.
00478   int init_tesseract(const char *arg0,
00479                      const char *textbase,
00480                      const char *language,
00481                      OcrEngineMode oem,
00482                      char **configs,
00483                      int configs_size,
00484                      const GenericVector<STRING> *vars_vec,
00485                      const GenericVector<STRING> *vars_values,
00486                      bool set_only_init_params);
00487   int init_tesseract(const char *datapath,
00488                      const char *language,
00489                      OcrEngineMode oem) {
00490     return init_tesseract(datapath, NULL, language, oem,  // textbase = NULL
00491                           NULL, 0, NULL, NULL, false);  // no configs or vars
00492   }
00493   // Common initialization for a single language.
00494   // arg0 is the datapath for the tessdata directory, which could be the
00495   // path of the tessdata directory with no trailing /, or (if tessdata
00496   // lives in the same directory as the executable) the path of the executable,
00497   // hence the name arg0.
00498   // textbase is an optional output file basename (used only for training)
00499   // language is the language code to load.
00500   // oem controls which engine(s) will operate on the image
00501   // configs (argv) is an array of config filenames to load variables from.
00502   // May be NULL.
00503   // configs_size (argc) is the number of elements in configs.
00504   // vars_vec is an optional vector of variables to set.
00505   // vars_values is an optional corresponding vector of values for the variables
00506   // in vars_vec.
00507   // If set_only_init_params is true, then only the initialization variables
00508   // will be set.
00509   int init_tesseract_internal(const char *arg0,
00510                               const char *textbase,
00511                               const char *language,
00512                               OcrEngineMode oem,
00513                               char **configs,
00514                               int configs_size,
00515                               const GenericVector<STRING> *vars_vec,
00516                               const GenericVector<STRING> *vars_values,
00517                               bool set_only_init_params);
00518 
00519   // Set the universal_id member of each font to be unique among all
00520   // instances of the same font loaded.
00521   void SetupUniversalFontIds();
00522 
00523   int init_tesseract_lm(const char *arg0,
00524                         const char *textbase,
00525                         const char *language);
00526 
00527   void recognize_page(STRING& image_name);
00528   void end_tesseract();
00529 
00530   bool init_tesseract_lang_data(const char *arg0,
00531                                 const char *textbase,
00532                                 const char *language,
00533                                 OcrEngineMode oem,
00534                                 char **configs,
00535                                 int configs_size,
00536                                 const GenericVector<STRING> *vars_vec,
00537                                 const GenericVector<STRING> *vars_values,
00538                                 bool set_only_init_params);
00539 
00540   void ParseLanguageString(const char* lang_str,
00541                            GenericVector<STRING>* to_load,
00542                            GenericVector<STRING>* not_to_load);
00543 
00545   SVMenuNode *build_menu_new();
00546   #ifndef GRAPHICS_DISABLED
00547   void pgeditor_main(int width, int height, PAGE_RES* page_res);
00548   #endif  // GRAPHICS_DISABLED
00549   void process_image_event( // action in image win
00550                            const SVEvent &event);
00551   BOOL8 process_cmd_win_event(                 // UI command semantics
00552                               inT32 cmd_event,  // which menu item?
00553                               char *new_value   // any prompt data
00554                              );
00555   void debug_word(PAGE_RES* page_res, const TBOX &selection_box);
00556   void do_re_display(
00557       BOOL8 (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it));
00558   BOOL8 word_display(PAGE_RES_IT* pr_it);
00559   BOOL8 word_bln_display(PAGE_RES_IT* pr_it);
00560   BOOL8 word_blank_and_set_display(PAGE_RES_IT* pr_its);
00561   BOOL8 word_set_display(PAGE_RES_IT* pr_it);
00562   // #ifndef GRAPHICS_DISABLED
00563   BOOL8 word_dumper(PAGE_RES_IT* pr_it);
00564   // #endif  // GRAPHICS_DISABLED
00565   void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
00567   // make rej map for word
00568   void make_reject_map(WERD_RES *word, ROW *row, inT16 pass);
00569   BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map);
00570   inT16 first_alphanum_index(const char *word,
00571                              const char *word_lengths);
00572   inT16 first_alphanum_offset(const char *word,
00573                               const char *word_lengths);
00574   inT16 alpha_count(const char *word,
00575                     const char *word_lengths);
00576   BOOL8 word_contains_non_1_digit(const char *word,
00577                                   const char *word_lengths);
00578   void dont_allow_1Il(WERD_RES *word);
00579   inT16 count_alphanums(  //how many alphanums
00580                         WERD_RES *word);
00581   void flip_0O(WERD_RES *word);
00582   BOOL8 non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
00583   BOOL8 non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
00584   BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row);
00585   void nn_match_word(  //Match a word
00586                      WERD_RES *word,
00587                      ROW *row);
00588   void nn_recover_rejects(WERD_RES *word, ROW *row);
00589   void set_done(  //set done flag
00590                 WERD_RES *word,
00591                 inT16 pass);
00592   inT16 safe_dict_word(const WERD_RES *werd_res);  // is best_choice in dict?
00593   void flip_hyphens(WERD_RES *word);
00594   void reject_I_1_L(WERD_RES *word);
00595   void reject_edge_blobs(WERD_RES *word);
00596   void reject_mostly_rejects(WERD_RES *word);
00598   BOOL8 word_adaptable(  //should we adapt?
00599                        WERD_RES *word,
00600                        uinT16 mode);
00601 
00603   void recog_word_recursive(WERD_RES* word);
00604   void recog_word(WERD_RES *word);
00605   void split_and_recog_word(WERD_RES* word);
00606   void split_word(WERD_RES *word,
00607                   int split_pt,
00608                   WERD_RES **right_piece,
00609                   BlamerBundle **orig_blamer_bundle) const;
00610   void join_words(WERD_RES *word,
00611                   WERD_RES *word2,
00612                   BlamerBundle *orig_bb) const;
00614   BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position);
00615   inT16 eval_word_spacing(WERD_RES_LIST &word_res_list);
00616   void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
00617   inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
00618   void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
00619   void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
00620   void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
00621   void fix_fuzzy_spaces(                      //find fuzzy words
00622                         ETEXT_DESC *monitor,  //progress monitor
00623                         inT32 word_count,     //count of words in doc
00624                         PAGE_RES *page_res);
00625   void dump_words(WERD_RES_LIST &perm, inT16 score,
00626                   inT16 mode, BOOL8 improved);
00627   BOOL8 fixspace_thinks_word_done(WERD_RES *word);
00628   inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);
00629   float blob_noise_score(TBLOB *blob);
00630   void break_noisiest_blob_word(WERD_RES_LIST &words);
00632   GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word);
00633   BOOL8 potential_word_crunch(WERD_RES *word,
00634                               GARBAGE_LEVEL garbage_level,
00635                               BOOL8 ok_dict_word);
00636   void tilde_crunch(PAGE_RES_IT &page_res_it);
00637   void unrej_good_quality_words(  //unreject potential
00638                                 PAGE_RES_IT &page_res_it);
00639   void doc_and_block_rejection(  //reject big chunks
00640                                PAGE_RES_IT &page_res_it,
00641                                BOOL8 good_quality_doc);
00642   void quality_based_rejection(PAGE_RES_IT &page_res_it,
00643                                BOOL8 good_quality_doc);
00644   void convert_bad_unlv_chs(WERD_RES *word_res);
00645   void tilde_delete(PAGE_RES_IT &page_res_it);
00646   inT16 word_blob_quality(WERD_RES *word, ROW *row);
00647   void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count,
00648                          inT16 *accepted_match_count);
00649   void unrej_good_chs(WERD_RES *word, ROW *row);
00650   inT16 count_outline_errs(char c, inT16 outline_count);
00651   inT16 word_outline_errs(WERD_RES *word);
00652   BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);
00653   CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode);
00654   inT16 failure_count(WERD_RES *word);
00655   BOOL8 noise_outlines(TWERD *word);
00657   void
00658   process_selected_words (
00659       PAGE_RES* page_res, // blocks to check
00660       // selection box first, then the member function to call
00661       TBOX & selection_box,
00662       BOOL8 (tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it));
00664   void tess_add_doc_word(                          //test acceptability
00665                          WERD_CHOICE *word_choice  //after context
00666                         );
00667   void tess_segment_pass_n(int pass_n, WERD_RES *word);
00668   bool tess_acceptable_word(WERD_RES *word);
00669 
00671   // Applies the box file based on the image name fname, and resegments
00672   // the words in the block_list (page), with:
00673   // blob-mode: one blob per line in the box file, words as input.
00674   // word/line-mode: one blob per space-delimited unit after the #, and one word
00675   // per line in the box file. (See comment above for box file format.)
00676   // If find_segmentation is true, (word/line mode) then the classifier is used
00677   // to re-segment words/lines to match the space-delimited truth string for
00678   // each box. In this case, the input box may be for a word or even a whole
00679   // text line, and the output words will contain multiple blobs corresponding
00680   // to the space-delimited input string.
00681   // With find_segmentation false, no classifier is needed, but the chopper
00682   // can still be used to correctly segment touching characters with the help
00683   // of the input boxes.
00684   // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
00685   // from normal classification, ie. with a word, chopped_word, rebuild_word,
00686   // seam_array, denorm, box_word, and best_state, but NO best_choice or
00687   // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
00688   // Instead, the correct_text member of WERD_RES is set, and this may be later
00689   // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
00690   // is not required before calling ApplyBoxTraining.
00691   PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation,
00692                        BLOCK_LIST *block_list);
00693 
00694   // Any row xheight that is significantly different from the median is set
00695   // to the median.
00696   void PreenXHeights(BLOCK_LIST *block_list);
00697 
00698   // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
00699   // All fuzzy spaces are removed, and all the words are maximally chopped.
00700   PAGE_RES* SetupApplyBoxes(const GenericVector<TBOX>& boxes,
00701                             BLOCK_LIST *block_list);
00702   // Tests the chopper by exhaustively running chop_one_blob.
00703   // The word_res will contain filled chopped_word, seam_array, denorm,
00704   // box_word and best_state for the maximally chopped word.
00705   void MaximallyChopWord(const GenericVector<TBOX>& boxes,
00706                          BLOCK* block, ROW* row, WERD_RES* word_res);
00707   // Gather consecutive blobs that match the given box into the best_state
00708   // and corresponding correct_text.
00709   // Fights over which box owns which blobs are settled by pre-chopping and
00710   // applying the blobs to box or next_box with the least non-overlap.
00711   // Returns false if the box was in error, which can only be caused by
00712   // failing to find an appropriate blob for a box.
00713   // This means that occasionally, blobs may be incorrectly segmented if the
00714   // chopper fails to find a suitable chop point.
00715   bool ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
00716                         const TBOX& box, const TBOX& next_box,
00717                         const char* correct_text);
00718   // Consume all source blobs that strongly overlap the given box,
00719   // putting them into a new word, with the correct_text label.
00720   // Fights over which box owns which blobs are settled by
00721   // applying the blobs to box or next_box with the least non-overlap.
00722   // Returns false if the box was in error, which can only be caused by
00723   // failing to find an overlapping blob for a box.
00724   bool ResegmentWordBox(BLOCK_LIST *block_list,
00725                         const TBOX& box, const TBOX& next_box,
00726                         const char* correct_text);
00727   // Resegments the words by running the classifier in an attempt to find the
00728   // correct segmentation that produces the required string.
00729   void ReSegmentByClassification(PAGE_RES* page_res);
00730   // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
00731   // Returns false if an invalid UNICHAR_ID is encountered.
00732   bool ConvertStringToUnichars(const char* utf8,
00733                                GenericVector<UNICHAR_ID>* class_ids);
00734   // Resegments the word to achieve the target_text from the classifier.
00735   // Returns false if the re-segmentation fails.
00736   // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
00737   // applies a full search on the classifier results to find the best classified
00738   // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
00739   // substitutions ARE used.
00740   bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
00741                         WERD_RES* word_res);
00742   // Recursive helper to find a match to the target_text (from text_index
00743   // position) in the choices (from choices_pos position).
00744   // Choices is an array of GenericVectors, of length choices_length, with each
00745   // element representing a starting position in the word, and the
00746   // GenericVector holding classification results for a sequence of consecutive
00747   // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
00748   void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
00749                      int choices_pos, int choices_length,
00750                      const GenericVector<UNICHAR_ID>& target_text,
00751                      int text_index,
00752                      float rating, GenericVector<int>* segmentation,
00753                      float* best_rating, GenericVector<int>* best_segmentation);
00754   // Counts up the labelled words and the blobs within.
00755   // Deletes all unused or emptied words, counting the unused ones.
00756   // Resets W_BOL and W_EOL flags correctly.
00757   // Builds the rebuild_word and rebuilds the box_word.
00758   void TidyUp(PAGE_RES* page_res);
00759   // Logs a bad box by line in the box file and box coords.
00760   void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
00761                        const char *err_msg);
00762   // Creates a fake best_choice entry in each WERD_RES with the correct text.
00763   void CorrectClassifyWords(PAGE_RES* page_res);
00764   // Call LearnWord to extract features for labelled blobs within each word.
00765   // Features are stored in an internal buffer.
00766   void ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res);
00767 
00769   // Returns the number of misfit blob tops in this word.
00770   int CountMisfitTops(WERD_RES *word_res);
00771   // Returns a new x-height in pixels (original image coords) that is
00772   // maximally compatible with the result in word_res.
00773   // Returns 0.0f if no x-height is found that is better than the current
00774   // estimate.
00775   float ComputeCompatibleXheight(WERD_RES *word_res, float* baseline_shift);
00777   // TODO(ocr-team): Find and remove obsolete parameters.
00778   BOOL_VAR_H(tessedit_resegment_from_boxes, false,
00779              "Take segmentation and labeling from box file");
00780   BOOL_VAR_H(tessedit_resegment_from_line_boxes, false,
00781               "Conversion of word/line box file to char box file");
00782   BOOL_VAR_H(tessedit_train_from_boxes, false,
00783              "Generate training data from boxed chars");
00784   BOOL_VAR_H(tessedit_make_boxes_from_boxes, false,
00785              "Generate more boxes from boxed chars");
00786   BOOL_VAR_H(tessedit_dump_pageseg_images, false,
00787              "Dump intermediate images made during page segmentation");
00788   INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
00789             "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
00790             " 5=line, 6=word, 7=char"
00791             " (Values from PageSegMode enum in publictypes.h)");
00792   INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
00793             "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults"
00794             " to loading and running only Tesseract (no Cube, no combiner)."
00795             " (Values from OcrEngineMode enum in tesseractclass.h)");
00796   STRING_VAR_H(tessedit_char_blacklist, "",
00797                "Blacklist of chars not to recognize");
00798   STRING_VAR_H(tessedit_char_whitelist, "",
00799                "Whitelist of chars to recognize");
00800   STRING_VAR_H(tessedit_char_unblacklist, "",
00801                "List of chars to override tessedit_char_blacklist");
00802   BOOL_VAR_H(tessedit_ambigs_training, false,
00803              "Perform training for ambiguities");
00804   INT_VAR_H(pageseg_devanagari_split_strategy,
00805             tesseract::ShiroRekhaSplitter::NO_SPLIT,
00806             "Whether to use the top-line splitting process for Devanagari "
00807             "documents while performing page-segmentation.");
00808   INT_VAR_H(ocr_devanagari_split_strategy,
00809             tesseract::ShiroRekhaSplitter::NO_SPLIT,
00810             "Whether to use the top-line splitting process for Devanagari "
00811             "documents while performing ocr.");
00812   STRING_VAR_H(tessedit_write_params_to_file, "",
00813                "Write all parameters to the given file.");
00814   BOOL_VAR_H(tessedit_adaption_debug, false,
00815              "Generate and print debug information for adaption");
00816   INT_VAR_H(bidi_debug, 0, "Debug level for BiDi");
00817   INT_VAR_H(applybox_debug, 1, "Debug level");
00818   INT_VAR_H(applybox_page, 0, "Page number to apply boxes from");
00819   STRING_VAR_H(applybox_exposure_pattern, ".exp",
00820                "Exposure value follows this pattern in the image"
00821                " filename. The name of the image files are expected"
00822                " to be in the form [lang].[fontname].exp[num].tif");
00823   BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode, false,
00824              "Learn both character fragments (as is done in the"
00825              " special low exposure mode) as well as unfragmented"
00826              " characters.");
00827   BOOL_VAR_H(applybox_learn_ngrams_mode, false,
00828              "Each bounding box is assumed to contain ngrams. Only"
00829              " learn the ngrams whose outlines overlap horizontally.");
00830   BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
00831   BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
00832   BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
00833   BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
00834              "Try to improve fuzzy spaces");
00835   BOOL_VAR_H(tessedit_unrej_any_wd, false,
00836              "Don't bother with word plausibility");
00837   BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
00838   BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
00839   BOOL_VAR_H(tessedit_enable_doc_dict, true,
00840              "Add words to the document dictionary");
00841   BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
00842   BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
00843   BOOL_VAR_H(tessedit_enable_bigram_correction, true,
00844              "Enable correction based on the word bigram dictionary.");
00845   BOOL_VAR_H(tessedit_enable_dict_correction, false,
00846              "Enable single word correction based on the dictionary.");
00847   INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram "
00848             "correction.");
00849   BOOL_VAR_H(enable_noise_removal, true,
00850              "Remove and conditionally reassign small outlines when they"
00851              " confuse layout analysis, determining diacritics vs noise");
00852   INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines");
00853   // Worst (min) certainty, for which a diacritic is allowed to make the base
00854   // character worse and still be included.
00855   double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty");
00856   // Worst (min) certainty, for which a non-overlapping diacritic is allowed to
00857   // make the base character worse and still be included.
00858   double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty");
00859   // Worst (min) certainty, for which a diacritic is allowed to make a new
00860   // stand-alone blob.
00861   double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty");
00862   // Factor of certainty margin for adding diacritics to not count as worse.
00863   double_VAR_H(noise_cert_factor, 0.375,
00864                "Scaling on certainty diff from Hingepoint");
00865   INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob");
00866   INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word");
00867   INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
00868   BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
00869   STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
00870   STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
00871   STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation");
00872   double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit");
00873   double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit");
00874   double_VAR_H(quality_outline_pc, 1.0,
00875                "good_quality_doc lte outline error limit");
00876   double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
00877   INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
00878   INT_VAR_H(tessedit_tess_adaption_mode, 0x27,
00879             "Adaptation decision algorithm for tess");
00880   BOOL_VAR_H(tessedit_minimal_rej_pass1, false,
00881              "Do minimal rejection on pass 1 output");
00882   BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria");
00883   BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity");
00884   INT_VAR_H(tessedit_test_adaption_mode, 3,
00885             "Adaptation decision algorithm for tess");
00886   BOOL_VAR_H(test_pt, false, "Test for point");
00887   double_VAR_H(test_pt_x, 99999.99, "xcoord");
00888   double_VAR_H(test_pt_y, 99999.99, "ycoord");
00889   INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
00890   BOOL_VAR_H(paragraph_text_based, true,
00891              "Run paragraph detection on the post-text-recognition "
00892              "(more accurate)");
00893   INT_VAR_H(cube_debug_level, 1, "Print cube debug info.");
00894   STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");
00895   STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");
00896   BOOL_VAR_H(docqual_excuse_outline_errs, false,
00897              "Allow outline errs in unrejection?");
00898   BOOL_VAR_H(tessedit_good_quality_unrej, true,
00899              "Reduce rejection on good docs");
00900   BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?");
00901   double_VAR_H(tessedit_reject_doc_percent, 65.00,
00902                "%rej allowed before rej whole doc");
00903   double_VAR_H(tessedit_reject_block_percent, 45.00,
00904                "%rej allowed before rej whole block");
00905   double_VAR_H(tessedit_reject_row_percent, 40.00,
00906                "%rej allowed before rej whole row");
00907   double_VAR_H(tessedit_whole_wd_rej_row_percent, 70.00,
00908                "Number of row rejects in whole word rejects "
00909                "which prevents whole row rejection");
00910   BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds, true,
00911              "Only rej partially rejected words in block rejection");
00912   BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds, true,
00913              "Only rej partially rejected words in row rejection");
00914   BOOL_VAR_H(tessedit_dont_blkrej_good_wds, false,
00915              "Use word segmentation quality metric");
00916   BOOL_VAR_H(tessedit_dont_rowrej_good_wds, false,
00917              "Use word segmentation quality metric");
00918   INT_VAR_H(tessedit_preserve_min_wd_len, 2,
00919             "Only preserve wds longer than this");
00920   BOOL_VAR_H(tessedit_row_rej_good_docs, true,
00921              "Apply row rejection to good docs");
00922   double_VAR_H(tessedit_good_doc_still_rowrej_wd, 1.1,
00923                "rej good doc wd if more than this fraction rejected");
00924   BOOL_VAR_H(tessedit_reject_bad_qual_wds, true,
00925              "Reject all bad quality wds");
00926   BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats");
00927   BOOL_VAR_H(tessedit_debug_quality_metrics, false,
00928              "Output data to debug file");
00929   BOOL_VAR_H(bland_unrej, false, "unrej potential with no checks");
00930   double_VAR_H(quality_rowrej_pc, 1.1,
00931                "good_quality_doc gte good char limit");
00932   BOOL_VAR_H(unlv_tilde_crunching, true,
00933              "Mark v.bad words for tilde crunch");
00934   BOOL_VAR_H(hocr_font_info, false,
00935              "Add font info to hocr output");
00936   BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
00937   BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
00938   double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
00939   BOOL_VAR_H(crunch_terrible_garbage, true, "As it says");
00940   double_VAR_H(crunch_poor_garbage_cert, -9.0,
00941                "crunch garbage cert lt this");
00942   double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this");
00943   double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this");
00944   double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this");
00945   BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage");
00946   double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
00947   double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
00948   double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
00949   double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
00950   double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this");
00951   double_VAR_H(crunch_del_high_word, 1.5,
00952                "Del if word gt xht x this above bl");
00953   double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl");
00954   double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
00955   INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
00956   INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
00957   BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings");
00958   BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
00959   BOOL_VAR_H(crunch_leave_accept_strings, false,
00960              "Don't pot crunch sensible strings");
00961   BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
00962   INT_VAR_H(crunch_leave_lc_strings, 4,
00963             "Don't crunch words with long lower case strings");
00964   INT_VAR_H(crunch_leave_uc_strings, 4,
00965             "Don't crunch words with long upper case strings");
00966   INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
00967   INT_VAR_H(crunch_debug, 0, "As it says");
00968   INT_VAR_H(fixsp_non_noise_limit, 1,
00969             "How many non-noise blbs either side?");
00970   double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
00971   BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctuation joins");
00972   INT_VAR_H(fixsp_done_mode, 1, "What constitutes done for spacing");
00973   INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug");
00974   STRING_VAR_H(numeric_punctuation, ".,",
00975                "Punct. chs expected WITHIN numbers");
00976   INT_VAR_H(x_ht_acceptance_tolerance, 8,
00977             "Max allowed deviation of blob top outside of font data");
00978   INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
00979   INT_VAR_H(superscript_debug, 0, "Debug level for sub & superscript fixer");
00980   double_VAR_H(superscript_worse_certainty, 2.0, "How many times worse "
00981                "certainty does a superscript position glyph need to be for us "
00982                "to try classifying it as a char with a different baseline?");
00983   double_VAR_H(superscript_bettered_certainty, 0.97, "What reduction in "
00984                "badness do we think sufficient to choose a superscript over "
00985                "what we'd thought.  For example, a value of 0.6 means we want "
00986                "to reduce badness of certainty by 40%");
00987   double_VAR_H(superscript_scaledown_ratio, 0.4,
00988                "A superscript scaled down more than this is unbelievably "
00989                "small.  For example, 0.3 means we expect the font size to "
00990                "be no smaller than 30% of the text line font size.");
00991   double_VAR_H(subscript_max_y_top, 0.5,
00992                "Maximum top of a character measured as a multiple of x-height "
00993                "above the baseline for us to reconsider whether it's a "
00994                "subscript.");
00995   double_VAR_H(superscript_min_y_bottom, 0.3,
00996               "Minimum bottom of a character measured as a multiple of "
00997               "x-height above the baseline for us to reconsider whether it's "
00998               "a superscript.");
00999   BOOL_VAR_H(tessedit_write_block_separators, false,
01000              "Write block separators in output");
01001   BOOL_VAR_H(tessedit_write_rep_codes, false,
01002              "Write repetition char code");
01003   BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
01004   BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
01005   BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
01006   BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
01007   STRING_VAR_H(unrecognised_char, "|",
01008                "Output char for unidentified blobs");
01009   INT_VAR_H(suspect_level, 99, "Suspect marker level");
01010   INT_VAR_H(suspect_space_level, 100,
01011             "Min suspect level for rejecting spaces");
01012   INT_VAR_H(suspect_short_words, 2,
01013             "Don't Suspect dict wds longer than this");
01014   BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
01015   double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit");
01016   double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
01017   BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
01018   BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING");
01019   BOOL_VAR_H(tessedit_word_for_word, false,
01020              "Make output have exactly one word per WERD");
01021   BOOL_VAR_H(tessedit_zero_kelvin_rejection, false,
01022              "Don't reject ANYTHING AT ALL");
01023   BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
01024   INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
01025   BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
01026   BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
01027   double_VAR_H(tessedit_lower_flip_hyphen, 1.5,
01028                "Aspect ratio dot/hyphen test");
01029   double_VAR_H(tessedit_upper_flip_hyphen, 1.8,
01030                "Aspect ratio dot/hyphen test");
01031   BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
01032   BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
01033   BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check");
01034   BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
01035   BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
01036   BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
01037   BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check");
01038   BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check");
01039   double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract");
01040   INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit");
01041   STRING_VAR_H(ok_repeated_ch_non_alphanum_wds, "-?*\075",
01042                "Allow NN to unrej");
01043   STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
01044   INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
01045   BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes");
01046   INT_VAR_H(tessedit_page_number, -1,
01047             "-1 -> All pages, else specific page to process");
01048   BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE");
01049   BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
01050   STRING_VAR_H(file_type, ".tif", "Filename extension");
01051   BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
01052   INT_VAR_H(tessdata_manager_debug_level, 0,
01053             "Debug level for TessdataManager functions.");
01054   STRING_VAR_H(tessedit_load_sublangs, "",
01055                "List of languages to load with this one");
01056   BOOL_VAR_H(tessedit_use_primary_params_model, false,
01057              "In multilingual mode use params model of the primary language");
01058   // Min acceptable orientation margin (difference in scores between top and 2nd
01059   // choice in OSResults::orientations) to believe the page orientation.
01060   double_VAR_H(min_orientation_margin, 7.0,
01061                "Min acceptable orientation margin");
01062   BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding");
01063   BOOL_VAR_H(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model");
01064   BOOL_VAR_H(poly_allow_detailed_fx, false,
01065              "Allow feature extractors to see the original outline");
01066   BOOL_VAR_H(tessedit_init_config_only, false,
01067              "Only initialize with the config file. Useful if the instance is "
01068              "not going to be used for OCR but say only for layout analysis.");
01069   BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
01070   BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection");
01071   BOOL_VAR_H(textord_tabfind_force_vertical_text, false,
01072              "Force using vertical text page mode");
01073   double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5,
01074                "Fraction of textlines deemed vertical to use vertical page "
01075                "mode");
01076   double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
01077                "Fraction of height used as a minimum gap for aligned blobs.");
01078   INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
01079   BOOL_VAR_H(preserve_interword_spaces, false,
01080              "Preserve multiple interword spaces");
01081   BOOL_VAR_H(include_page_breaks, false,
01082              "Include page separator string in output text after each "
01083              "image/page.");
01084   STRING_VAR_H(page_separator, "\f",
01085                "Page separator (default is form feed control character)");
01086 
01087   // The following parameters were deprecated and removed from their original
01088   // locations. The parameters are temporarily kept here to give Tesseract
01089   // users a chance to update their [lang].traineddata and config files
01090   // without introducing failures during Tesseract initialization.
01091   // TODO(ocr-team): remove these parameters from the code once we are
01092   // reasonably sure that Tesseract users have updated their data files.
01093   //
01094   // BEGIN DEPRECATED PARAMETERS
01095   BOOL_VAR_H(textord_tabfind_vertical_horizontal_mix, true,
01096              "find horizontal lines such as headers in vertical page mode");
01097   INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
01098   BOOL_VAR_H(load_fixed_length_dawgs, true,  "Load fixed length"
01099              " dawgs (e.g. for non-space delimited languages)");
01100   INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
01101   BOOL_VAR_H(permute_debug, 0, "char permutation debug");
01102   double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of"
01103                " current best rate to prune other hypotheses");
01104   BOOL_VAR_H(permute_script_word, 0,
01105              "Turn on word script consistency permuter");
01106   BOOL_VAR_H(segment_segcost_rating, 0,
01107              "incorporate segmentation cost in word rating?");
01108   double_VAR_H(segment_reward_script, 0.95,
01109                "Score multiplier for script consistency within a word. "
01110                "Being a 'reward' factor, it should be <= 1. "
01111                "Smaller value implies bigger reward.");
01112   BOOL_VAR_H(permute_fixed_length_dawg, 0,
01113              "Turn on fixed-length phrasebook search permuter");
01114   BOOL_VAR_H(permute_chartype_word, 0,
01115              "Turn on character type (property) consistency permuter");
01116   double_VAR_H(segment_reward_chartype, 0.97,
01117                "Score multiplier for char type consistency within a word.");
01118   double_VAR_H(segment_reward_ngram_best_choice, 0.99,
01119                "Score multiplier for ngram permuter's best choice"
01120                " (only used in the Han script path).");
01121   BOOL_VAR_H(ngram_permuter_activated, false,
01122              "Activate character-level n-gram-based permuter");
01123   BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter");
01124   INT_VAR_H(language_model_fixed_length_choices_depth, 3,
01125             "Depth of blob choice lists to explore"
01126             " when fixed length dawgs are on");
01127   BOOL_VAR_H(use_new_state_cost, FALSE,
01128              "use new state cost heuristics for segmentation state evaluation");
01129   double_VAR_H(heuristic_segcost_rating_base, 1.25,
01130                "base factor for adding segmentation cost into word rating. "
01131                "It's a multiplying factor, the larger the value above 1, "
01132                "the bigger the effect of segmentation cost.");
01133   double_VAR_H(heuristic_weight_rating, 1,
01134                "weight associated with char rating in combined cost of state");
01135   double_VAR_H(heuristic_weight_width, 1000.0,
01136                "weight associated with width evidence in combined cost of"
01137                " state");
01138   double_VAR_H(heuristic_weight_seamcut, 0,
01139                "weight associated with seam cut in combined cost of state");
01140   double_VAR_H(heuristic_max_char_wh_ratio, 2.0,
01141                "max char width-to-height ratio allowed in segmentation");
01142   BOOL_VAR_H(enable_new_segsearch, false,
01143              "Enable new segmentation search path.");
01144   double_VAR_H(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
01145                "Maximum character width-to-height ratio for"
01146                "fixed pitch fonts");
01147   // END DEPRECATED PARAMETERS
01148 
        // Recognition-training entry points. Only the declarations are visible
        // in this header, so the notes below describe the signatures and are
        // hedged wherever behavior is implied by names alone.
        //
        // init_recog_training: takes a file name and returns a FILE*.
        // NOTE(review): presumably the returned stream is the output_file that
        // is later handed to recog_training_segmented() -- confirm against the
        // definition, including who is responsible for closing it.
01150   FILE *init_recog_training(const STRING &fname);
        // recog_training_segmented: takes the file name, the page results, a
        // volatile ETEXT_DESC monitor and a FILE* to write to; returns nothing.
01151   void recog_training_segmented(const STRING &fname,
01152                                 PAGE_RES *page_res,
01153                                 volatile ETEXT_DESC *monitor,
01154                                 FILE *output_file);
        // ambigs_classify_and_output: takes a C-string label, a page-result
        // iterator and a FILE* to write to; returns nothing.
        // NOTE(review): presumably classifies the word at pr_it and writes the
        // result next to label -- confirm against the definition.
01155   void ambigs_classify_and_output(const char *label,
01156                                   PAGE_RES_IT* pr_it,
01157                                   FILE *output_file);
01158 
01159 #ifndef NO_CUBE_BUILD
        // Accessor for the Cube recognition context: returns the cube_cntxt_
        // member pointer unchanged. Only compiled when NO_CUBE_BUILD is not
        // defined, matching the guard around the cube_cntxt_ member itself.
01160   inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; }
01161 #endif
01162 
01163  private:
01164   // The filename of a backup config file. If not null, then we currently
01165   // have a temporary debug config file loaded, and backup_config_file_
01166   // will be loaded, and set to null when debug is complete.
01167   const char* backup_config_file_;
01168   // The filename of a config file to read when processing a debug word.
01169   STRING word_config_;
01170   // Image used for input to layout analysis and tesseract recognition.
01171   // May be modified by the ShiroRekhaSplitter to eliminate the top-line.
01172   Pix* pix_binary_;
01173   // Unmodified image used for input to cube. Always valid.
        // Declared unconditionally, unlike the cube objects further down, which
        // sit behind the NO_CUBE_BUILD guard.
01174   Pix* cube_binary_;
01175   // Grey-level input image if the input was not binary, otherwise NULL.
01176   Pix* pix_grey_;
01177   // Thresholds that were used to generate the thresholded image from grey.
01178   Pix* pix_thresholds_;
01179   // Input image resolution after any scaling. The resolution is not well
01180   // transmitted by operations on Pix, so we keep an independent record here.
01181   int source_resolution_;
01182   // The shiro-rekha splitter object which is used to split top-lines in
01183   // Devanagari words to provide a better word and grapheme segmentation.
01184   ShiroRekhaSplitter splitter_;
01185   // Page segmentation/layout
01186   Textord textord_;
01187   // True if the primary language uses right_to_left reading order.
01188   bool right_to_left_;
        // NOTE(review): scaled_color_ and scaled_factor_ carry no description in
        // this header; presumably a scaled color copy of the input image and the
        // factor it was scaled by -- confirm against their uses.
01189   Pix* scaled_color_;
01190   int scaled_factor_;
        // NOTE(review): deskew_ and reskew_ are undocumented here; presumably the
        // rotations that remove and then restore page skew -- confirm.
01191   FCOORD deskew_;
01192   FCOORD reskew_;
        // NOTE(review): stats_ is undocumented here; its contents are defined by
        // TesseractStats, which is declared outside this section.
01193   TesseractStats stats_;
01194   // Sub-languages to be tried in addition to this.
01195   GenericVector<Tesseract*> sub_langs_;
01196   // Most recently used Tesseract out of this and sub_langs_. The default
01197   // language for the next word.
01198   Tesseract* most_recently_used_;
01199   // The size of the font table, ie max possible font id + 1.
01200   int font_table_size_;
01201 #ifndef NO_CUBE_BUILD
01202   // Cube objects.
        // Both members exist only when NO_CUBE_BUILD is not defined.
01203   CubeRecoContext* cube_cntxt_;
01204   TesseractCubeCombiner *tess_cube_combiner_;
01205 #endif
01206   // Equation detector. Note: this pointer is NOT owned by the class.
01207   EquationDetect* equ_detect_;
01208 };
01209 
01210 }  // namespace tesseract
01211 
01212 
01213 #endif  // TESSERACT_CCMAIN_TESSERACTCLASS_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines