|
tesseract 3.04.01
|
00001 00002 // File: tesseractclass.h 00003 // Description: The Tesseract class. It holds/owns everything needed 00004 // to run Tesseract on a single language, and also a set of 00005 // sub-Tesseracts to run sub-languages. For thread safety, *every* 00006 // global variable goes in here, directly, or indirectly. 00007 // This makes it safe to run multiple Tesseracts in different 00008 // threads in parallel, and keeps the different language 00009 // instances separate. 00010 // Author: Ray Smith 00011 // Created: Fri Mar 07 08:17:01 PST 2008 00012 // 00013 // (C) Copyright 2008, Google Inc. 00014 // Licensed under the Apache License, Version 2.0 (the "License"); 00015 // you may not use this file except in compliance with the License. 00016 // You may obtain a copy of the License at 00017 // http://www.apache.org/licenses/LICENSE-2.0 00018 // Unless required by applicable law or agreed to in writing, software 00019 // distributed under the License is distributed on an "AS IS" BASIS, 00020 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 // See the License for the specific language governing permissions and 00022 // limitations under the License. 00023 // 00025 00026 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__ 00027 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__ 00028 00029 #include "allheaders.h" 00030 #include "control.h" 00031 #include "docqual.h" 00032 #include "devanagari_processing.h" 00033 #include "genericvector.h" 00034 #include "params.h" 00035 #include "ocrclass.h" 00036 #include "textord.h" 00037 #include "wordrec.h" 00038 00039 class BLOB_CHOICE_LIST_CLIST; 00040 class BLOCK_LIST; 00041 class CharSamp; 00042 struct OSResults; 00043 class PAGE_RES; 00044 class PAGE_RES_IT; 00045 struct Pix; 00046 class ROW; 00047 class SVMenuNode; 00048 class TBOX; 00049 class TO_BLOCK_LIST; 00050 class WERD; 00051 class WERD_CHOICE; 00052 class WERD_RES; 00053 00054 00055 // Top-level class for all tesseract global instance data. 
// This class either holds or points to all data used by an instance
// of Tesseract, including the memory allocator. When this is
// complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
//
// NOTE to developers: Do not create cyclic dependencies through this class!
// The directory dependency tree must remain a tree! To keep this clean,
// lower-level code (eg in ccutil, the bottom level) must never need to
// know about the content of a higher-level directory.
// The following scheme will grant the easiest access to lower-level
// global members without creating a cyclic dependency:
//
// Class Hierarchy (^ = inheritance):
//
//   CCUtil (ccutil/ccutil.h)
//     ^ Members include: UNICHARSET
//   CUtil (cutil/cutil_class.h)
//     ^ Members include: TBLOB*, TEXTBLOCK*
//   CCStruct (ccstruct/ccstruct.h)
//     ^ Members include: Image
//   Classify (classify/classify.h)
//     ^ Members include: Dict
//   WordRec (wordrec/wordrec.h)
//     ^ Members include: WERD*, DENORM*
//   Tesseract (ccmain/tesseractclass.h)
//       Members include: Pix*, CubeRecoContext*,
//       TesseractCubeCombiner*
//
// Other important classes:
//
//   TessBaseAPI (api/baseapi.h)
//       Members include: BLOCK_LIST*, PAGE_RES*,
//       Tesseract*, ImageThresholder*
//   Dict (dict/dict.h)
//       Members include: Image* (private)
//
// NOTE: that each level contains members that correspond to global
// data that is defined (and used) at that level, not necessarily where
// the type is defined so for instance:
// BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
// goes inside the Textord class, not the cc_util class.
00096 00097 namespace tesseract { 00098 00099 class ColumnFinder; 00100 #ifndef NO_CUBE_BUILD 00101 class CubeLineObject; 00102 class CubeObject; 00103 class CubeRecoContext; 00104 #endif 00105 class EquationDetect; 00106 class Tesseract; 00107 #ifndef NO_CUBE_BUILD 00108 class TesseractCubeCombiner; 00109 #endif 00110 00111 // A collection of various variables for statistics and debugging. 00112 struct TesseractStats { 00113 TesseractStats() 00114 : adaption_word_number(0), 00115 doc_blob_quality(0), 00116 doc_outline_errs(0), 00117 doc_char_quality(0), 00118 good_char_count(0), 00119 doc_good_char_quality(0), 00120 word_count(0), 00121 dict_words(0), 00122 tilde_crunch_written(false), 00123 last_char_was_newline(true), 00124 last_char_was_tilde(false), 00125 write_results_empty_block(true) {} 00126 00127 inT32 adaption_word_number; 00128 inT16 doc_blob_quality; 00129 inT16 doc_outline_errs; 00130 inT16 doc_char_quality; 00131 inT16 good_char_count; 00132 inT16 doc_good_char_quality; 00133 inT32 word_count; // count of word in the document 00134 inT32 dict_words; // number of dicitionary words in the document 00135 STRING dump_words_str; // accumulator used by dump_words() 00136 // Flags used by write_results() 00137 bool tilde_crunch_written; 00138 bool last_char_was_newline; 00139 bool last_char_was_tilde; 00140 bool write_results_empty_block; 00141 }; 00142 00143 // Struct to hold all the pointers to relevant data for processing a word. 
00144 struct WordData { 00145 WordData() : word(NULL), row(NULL), block(NULL), prev_word(NULL) {} 00146 explicit WordData(const PAGE_RES_IT& page_res_it) 00147 : word(page_res_it.word()), row(page_res_it.row()->row), 00148 block(page_res_it.block()->block), prev_word(NULL) {} 00149 WordData(BLOCK* block_in, ROW* row_in, WERD_RES* word_res) 00150 : word(word_res), row(row_in), block(block_in), prev_word(NULL) {} 00151 00152 WERD_RES* word; 00153 ROW* row; 00154 BLOCK* block; 00155 WordData* prev_word; 00156 PointerVector<WERD_RES> lang_words; 00157 }; 00158 00159 // Definition of a Tesseract WordRecognizer. The WordData provides the context 00160 // of row/block, in_word holds an initialized, possibly pre-classified word, 00161 // that the recognizer may or may not consume (but if so it sets *in_word=NULL) 00162 // and produces one or more output words in out_words, which may be the 00163 // consumed in_word, or may be generated independently. 00164 // This api allows both a conventional tesseract classifier to work, or a 00165 // line-level classifier that generates multiple words from a merged input. 00166 typedef void (Tesseract::*WordRecognizer)(const WordData& word_data, 00167 WERD_RES** in_word, 00168 PointerVector<WERD_RES>* out_words); 00169 00170 class Tesseract : public Wordrec { 00171 public: 00172 Tesseract(); 00173 ~Tesseract(); 00174 00175 // Clear as much used memory as possible without resetting the adaptive 00176 // classifier or losing any other classifier data. 00177 void Clear(); 00178 // Clear all memory of adaption for this and all subclassifiers. 00179 void ResetAdaptiveClassifier(); 00180 // Clear the document dictionary for this and all subclassifiers. 00181 void ResetDocumentDictionary(); 00182 00183 // Set the equation detector. 00184 void SetEquationDetect(EquationDetect* detector); 00185 00186 // Simple accessors. 
00187 const FCOORD& reskew() const { 00188 return reskew_; 00189 } 00190 // Destroy any existing pix and return a pointer to the pointer. 00191 Pix** mutable_pix_binary() { 00192 Clear(); 00193 return &pix_binary_; 00194 } 00195 Pix* pix_binary() const { 00196 return pix_binary_; 00197 } 00198 Pix* pix_grey() const { 00199 return pix_grey_; 00200 } 00201 void set_pix_grey(Pix* grey_pix) { 00202 pixDestroy(&pix_grey_); 00203 pix_grey_ = grey_pix; 00204 } 00205 // Returns a pointer to a Pix representing the best available image of the 00206 // page. The image will be 8-bit grey if the input was grey or color. Note 00207 // that in grey 0 is black and 255 is white. If the input was binary, then 00208 // the returned Pix will be binary. Note that here black is 1 and white is 0. 00209 // To tell the difference pixGetDepth() will return 8 or 1. 00210 // In either case, the return value is a borrowed Pix, and should not be 00211 // deleted or pixDestroyed. 00212 Pix* BestPix() const { 00213 return pix_grey_ != NULL ? 
pix_grey_ : pix_binary_; 00214 } 00215 void set_pix_thresholds(Pix* thresholds) { 00216 pixDestroy(&pix_thresholds_); 00217 pix_thresholds_ = thresholds; 00218 } 00219 int source_resolution() const { 00220 return source_resolution_; 00221 } 00222 void set_source_resolution(int ppi) { 00223 source_resolution_ = ppi; 00224 } 00225 int ImageWidth() const { 00226 return pixGetWidth(pix_binary_); 00227 } 00228 int ImageHeight() const { 00229 return pixGetHeight(pix_binary_); 00230 } 00231 Pix* scaled_color() const { 00232 return scaled_color_; 00233 } 00234 int scaled_factor() const { 00235 return scaled_factor_; 00236 } 00237 void SetScaledColor(int factor, Pix* color) { 00238 scaled_factor_ = factor; 00239 scaled_color_ = color; 00240 } 00241 const Textord& textord() const { 00242 return textord_; 00243 } 00244 Textord* mutable_textord() { 00245 return &textord_; 00246 } 00247 00248 bool right_to_left() const { 00249 return right_to_left_; 00250 } 00251 int num_sub_langs() const { 00252 return sub_langs_.size(); 00253 } 00254 Tesseract* get_sub_lang(int index) const { 00255 return sub_langs_[index]; 00256 } 00257 // Returns true if any language uses Tesseract (as opposed to cube). 00258 bool AnyTessLang() const { 00259 if (tessedit_ocr_engine_mode != OEM_CUBE_ONLY) return true; 00260 for (int i = 0; i < sub_langs_.size(); ++i) { 00261 if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) 00262 return true; 00263 } 00264 return false; 00265 } 00266 00267 void SetBlackAndWhitelist(); 00268 00269 // Perform steps to prepare underlying binary image/other data structures for 00270 // page segmentation. Uses the strategy specified in the global variable 00271 // pageseg_devanagari_split_strategy for perform splitting while preparing for 00272 // page segmentation. 00273 void PrepareForPageseg(); 00274 00275 // Perform steps to prepare underlying binary image/other data structures for 00276 // Tesseract OCR. The current segmentation is required by this method. 
00277 // Uses the strategy specified in the global variable 00278 // ocr_devanagari_split_strategy for performing splitting while preparing for 00279 // Tesseract ocr. 00280 void PrepareForTessOCR(BLOCK_LIST* block_list, 00281 Tesseract* osd_tess, OSResults* osr); 00282 00283 int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, 00284 Tesseract* osd_tess, OSResults* osr); 00285 void SetupWordScripts(BLOCK_LIST* blocks); 00286 int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, 00287 TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs, 00288 Tesseract* osd_tess, OSResults* osr); 00289 ColumnFinder* SetupPageSegAndDetectOrientation( 00290 PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess, 00291 OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, 00292 Pix** music_mask_pix); 00293 // par_control.cpp 00294 void PrerecAllWordsPar(const GenericVector<WordData>& words); 00295 00297 bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box, 00298 const char* word_config, int pass); 00299 // Sets up the words ready for whichever engine is to be run 00300 void SetupAllWordsPassN(int pass_n, 00301 const TBOX* target_word_box, 00302 const char* word_config, 00303 PAGE_RES* page_res, 00304 GenericVector<WordData>* words); 00305 // Sets up the single word ready for whichever engine is to be run. 00306 void SetupWordPassN(int pass_n, WordData* word); 00307 // Runs word recognition on all the words. 
00308 bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, 00309 PAGE_RES_IT* pr_it, 00310 GenericVector<WordData>* words); 00311 bool recog_all_words(PAGE_RES* page_res, 00312 ETEXT_DESC* monitor, 00313 const TBOX* target_word_box, 00314 const char* word_config, 00315 int dopasses); 00316 void rejection_passes(PAGE_RES* page_res, 00317 ETEXT_DESC* monitor, 00318 const TBOX* target_word_box, 00319 const char* word_config); 00320 void bigram_correction_pass(PAGE_RES *page_res); 00321 void blamer_pass(PAGE_RES* page_res); 00322 // Sets script positions and detects smallcaps on all output words. 00323 void script_pos_pass(PAGE_RES* page_res); 00324 // Helper to recognize the word using the given (language-specific) tesseract. 00325 // Returns positive if this recognizer found more new best words than the 00326 // number kept from best_words. 00327 int RetryWithLanguage(const WordData& word_data, 00328 WordRecognizer recognizer, 00329 WERD_RES** in_word, 00330 PointerVector<WERD_RES>* best_words); 00331 // Moves good-looking "noise"/diacritics from the reject list to the main 00332 // blob list on the current word. Returns true if anything was done, and 00333 // sets make_next_word_fuzzy if blob(s) were added to the end of the word. 00334 bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it, 00335 bool* make_next_word_fuzzy); 00336 // Attempts to put noise/diacritic outlines into the blobs that they overlap. 00337 // Input: a set of noisy outlines that probably belong to the real_word. 00338 // Output: outlines that overlapped blobs are set to NULL and put back into 00339 // the word, either in the blobs or in the reject list. 
00340 void AssignDiacriticsToOverlappingBlobs( 00341 const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word, 00342 PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted, 00343 GenericVector<bool>* overlapped_any_blob, 00344 GenericVector<C_BLOB*>* target_blobs); 00345 // Attempts to assign non-overlapping outlines to their nearest blobs or 00346 // make new blobs out of them. 00347 void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE*>& outlines, 00348 int pass, WERD* real_word, PAGE_RES_IT* pr_it, 00349 GenericVector<bool>* word_wanted, 00350 GenericVector<C_BLOB*>* target_blobs); 00351 // Starting with ok_outlines set to indicate which outlines overlap the blob, 00352 // chooses the optimal set (approximately) and returns true if any outlines 00353 // are desired, in which case ok_outlines indicates which ones. 00354 bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, 00355 PAGE_RES_IT* pr_it, C_BLOB* blob, 00356 const GenericVector<C_OUTLINE*>& outlines, 00357 int num_outlines, 00358 GenericVector<bool>* ok_outlines); 00359 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes 00360 // the inclusion of the outlines, and returns the certainty of the raw choice. 00361 float ClassifyBlobPlusOutlines(const GenericVector<bool>& ok_outlines, 00362 const GenericVector<C_OUTLINE*>& outlines, 00363 int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob, 00364 STRING* best_str); 00365 // Classifies the given blob (part of word_data->word->word) as an individual 00366 // word, using languages, chopper etc, returning only the certainty of the 00367 // best raw choice, and undoing all the work done to fake out the word. 
00368 float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob, 00369 STRING* best_str, float* c2); 00370 void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, 00371 WordData* word_data); 00372 void classify_word_pass1(const WordData& word_data, 00373 WERD_RES** in_word, 00374 PointerVector<WERD_RES>* out_words); 00375 void recog_pseudo_word(PAGE_RES* page_res, // blocks to check 00376 TBOX &selection_box); 00377 00378 void fix_rep_char(PAGE_RES_IT* page_res_it); 00379 00380 ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set, 00381 const char *s, 00382 const char *lengths); 00383 void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block); 00384 void classify_word_pass2(const WordData& word_data, 00385 WERD_RES** in_word, 00386 PointerVector<WERD_RES>* out_words); 00387 void ReportXhtFixResult(bool accept_new_word, float new_x_ht, 00388 WERD_RES* word, WERD_RES* new_word); 00389 bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row); 00390 bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row); 00391 // Runs recognition with the test baseline shift and x-height and returns true 00392 // if there was an improvement in recognition result. 00393 bool TestNewNormalization(int original_misfits, float baseline_shift, 00394 float new_x_ht, WERD_RES *word, BLOCK* block, 00395 ROW *row); 00396 BOOL8 recog_interactive(PAGE_RES_IT* pr_it); 00397 00398 // Set fonts of this word. 
00399 void set_word_fonts(WERD_RES *word); 00400 void font_recognition_pass(PAGE_RES* page_res); 00401 void dictionary_correction_pass(PAGE_RES* page_res); 00402 BOOL8 check_debug_pt(WERD_RES *word, int location); 00403 00405 bool SubAndSuperscriptFix(WERD_RES *word_res); 00406 void GetSubAndSuperscriptCandidates(const WERD_RES *word, 00407 int *num_rebuilt_leading, 00408 ScriptPos *leading_pos, 00409 float *leading_certainty, 00410 int *num_rebuilt_trailing, 00411 ScriptPos *trailing_pos, 00412 float *trailing_certainty, 00413 float *avg_certainty, 00414 float *unlikely_threshold); 00415 WERD_RES *TrySuperscriptSplits(int num_chopped_leading, 00416 float leading_certainty, 00417 ScriptPos leading_pos, 00418 int num_chopped_trailing, 00419 float trailing_certainty, 00420 ScriptPos trailing_pos, 00421 WERD_RES *word, 00422 bool *is_good, 00423 int *retry_leading, 00424 int *retry_trailing); 00425 bool BelievableSuperscript(bool debug, 00426 const WERD_RES &word, 00427 float certainty_threshold, 00428 int *left_ok, 00429 int *right_ok) const; 00430 00432 #ifndef NO_CUBE_BUILD 00433 bool init_cube_objects(bool load_combiner, 00434 TessdataManager *tessdata_manager); 00435 // Iterates through tesseract's results and calls cube on each word, 00436 // combining the results with the existing tesseract result. 00437 void run_cube_combiner(PAGE_RES *page_res); 00438 // Recognizes a single word using (only) cube. Compatible with 00439 // Tesseract's classify_word_pass1/classify_word_pass2. 00440 void cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word); 00441 // Cube recognizer to recognize a single word as with classify_word_pass1 00442 // but also returns the cube object in case the combiner is needed. 00443 CubeObject* cube_recognize_word(BLOCK* block, WERD_RES* word); 00444 // Combines the cube and tesseract results for a single word, leaving the 00445 // result in tess_word. 
00446 void cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word, 00447 WERD_RES* tess_word); 00448 // Call cube on the current word, and write the result to word. 00449 // Sets up a fake result and returns false if something goes wrong. 00450 bool cube_recognize(CubeObject *cube_obj, BLOCK* block, WERD_RES *word); 00451 void fill_werd_res(const BoxWord& cube_box_word, 00452 const char* cube_best_str, 00453 WERD_RES* tess_werd_res); 00454 bool extract_cube_state(CubeObject* cube_obj, int* num_chars, 00455 Boxa** char_boxes, CharSamp*** char_samples); 00456 bool create_cube_box_word(Boxa *char_boxes, int num_chars, 00457 TBOX word_box, BoxWord* box_word); 00458 #endif 00459 00460 00461 void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box); 00462 void write_results(PAGE_RES_IT &page_res_it, // full info 00463 char newline_type, // type of newline 00464 BOOL8 force_eol // override tilde crunch? 00465 ); 00466 void set_unlv_suspects(WERD_RES *word); 00467 UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated? 00468 BOOL8 acceptable_number_string(const char *s, 00469 const char *lengths); 00470 inT16 count_alphanums(const WERD_CHOICE &word); 00471 inT16 count_alphas(const WERD_CHOICE &word); 00473 void read_config_file(const char *filename, SetParamConstraint constraint); 00474 // Initialize for potentially a set of languages defined by the language 00475 // string and recursively any additional languages required by any language 00476 // traineddata file (via tessedit_load_sublangs in its config) that is loaded. 00477 // See init_tesseract_internal for args. 
00478 int init_tesseract(const char *arg0, 00479 const char *textbase, 00480 const char *language, 00481 OcrEngineMode oem, 00482 char **configs, 00483 int configs_size, 00484 const GenericVector<STRING> *vars_vec, 00485 const GenericVector<STRING> *vars_values, 00486 bool set_only_init_params); 00487 int init_tesseract(const char *datapath, 00488 const char *language, 00489 OcrEngineMode oem) { 00490 return init_tesseract(datapath, NULL, language, oem, 00491 NULL, 0, NULL, NULL, false); 00492 } 00493 // Common initialization for a single language. 00494 // arg0 is the datapath for the tessdata directory, which could be the 00495 // path of the tessdata directory with no trailing /, or (if tessdata 00496 // lives in the same directory as the executable, the path of the executable, 00497 // hence the name arg0. 00498 // textbase is an optional output file basename (used only for training) 00499 // language is the language code to load. 00500 // oem controls which engine(s) will operate on the image 00501 // configs (argv) is an array of config filenames to load variables from. 00502 // May be NULL. 00503 // configs_size (argc) is the number of elements in configs. 00504 // vars_vec is an optional vector of variables to set. 00505 // vars_values is an optional corresponding vector of values for the variables 00506 // in vars_vec. 00507 // If set_only_init_params is true, then only the initialization variables 00508 // will be set. 00509 int init_tesseract_internal(const char *arg0, 00510 const char *textbase, 00511 const char *language, 00512 OcrEngineMode oem, 00513 char **configs, 00514 int configs_size, 00515 const GenericVector<STRING> *vars_vec, 00516 const GenericVector<STRING> *vars_values, 00517 bool set_only_init_params); 00518 00519 // Set the universal_id member of each font to be unique among all 00520 // instances of the same font loaded. 
00521 void SetupUniversalFontIds(); 00522 00523 int init_tesseract_lm(const char *arg0, 00524 const char *textbase, 00525 const char *language); 00526 00527 void recognize_page(STRING& image_name); 00528 void end_tesseract(); 00529 00530 bool init_tesseract_lang_data(const char *arg0, 00531 const char *textbase, 00532 const char *language, 00533 OcrEngineMode oem, 00534 char **configs, 00535 int configs_size, 00536 const GenericVector<STRING> *vars_vec, 00537 const GenericVector<STRING> *vars_values, 00538 bool set_only_init_params); 00539 00540 void ParseLanguageString(const char* lang_str, 00541 GenericVector<STRING>* to_load, 00542 GenericVector<STRING>* not_to_load); 00543 00545 SVMenuNode *build_menu_new(); 00546 #ifndef GRAPHICS_DISABLED 00547 void pgeditor_main(int width, int height, PAGE_RES* page_res); 00548 #endif // GRAPHICS_DISABLED 00549 void process_image_event( // action in image win 00550 const SVEvent &event); 00551 BOOL8 process_cmd_win_event( // UI command semantics 00552 inT32 cmd_event, // which menu item? 
00553 char *new_value // any prompt data 00554 ); 00555 void debug_word(PAGE_RES* page_res, const TBOX &selection_box); 00556 void do_re_display( 00557 BOOL8 (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it)); 00558 BOOL8 word_display(PAGE_RES_IT* pr_it); 00559 BOOL8 word_bln_display(PAGE_RES_IT* pr_it); 00560 BOOL8 word_blank_and_set_display(PAGE_RES_IT* pr_its); 00561 BOOL8 word_set_display(PAGE_RES_IT* pr_it); 00562 // #ifndef GRAPHICS_DISABLED 00563 BOOL8 word_dumper(PAGE_RES_IT* pr_it); 00564 // #endif // GRAPHICS_DISABLED 00565 void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box); 00567 // make rej map for word 00568 void make_reject_map(WERD_RES *word, ROW *row, inT16 pass); 00569 BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map); 00570 inT16 first_alphanum_index(const char *word, 00571 const char *word_lengths); 00572 inT16 first_alphanum_offset(const char *word, 00573 const char *word_lengths); 00574 inT16 alpha_count(const char *word, 00575 const char *word_lengths); 00576 BOOL8 word_contains_non_1_digit(const char *word, 00577 const char *word_lengths); 00578 void dont_allow_1Il(WERD_RES *word); 00579 inT16 count_alphanums( //how many alphanums 00580 WERD_RES *word); 00581 void flip_0O(WERD_RES *word); 00582 BOOL8 non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id); 00583 BOOL8 non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id); 00584 BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row); 00585 void nn_match_word( //Match a word 00586 WERD_RES *word, 00587 ROW *row); 00588 void nn_recover_rejects(WERD_RES *word, ROW *row); 00589 void set_done( //set done flag 00590 WERD_RES *word, 00591 inT16 pass); 00592 inT16 safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict? 
00593 void flip_hyphens(WERD_RES *word); 00594 void reject_I_1_L(WERD_RES *word); 00595 void reject_edge_blobs(WERD_RES *word); 00596 void reject_mostly_rejects(WERD_RES *word); 00598 BOOL8 word_adaptable( //should we adapt? 00599 WERD_RES *word, 00600 uinT16 mode); 00601 00603 void recog_word_recursive(WERD_RES* word); 00604 void recog_word(WERD_RES *word); 00605 void split_and_recog_word(WERD_RES* word); 00606 void split_word(WERD_RES *word, 00607 int split_pt, 00608 WERD_RES **right_piece, 00609 BlamerBundle **orig_blamer_bundle) const; 00610 void join_words(WERD_RES *word, 00611 WERD_RES *word2, 00612 BlamerBundle *orig_bb) const; 00614 BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position); 00615 inT16 eval_word_spacing(WERD_RES_LIST &word_res_list); 00616 void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block); 00617 inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list); 00618 void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block); 00619 void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block); 00620 void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block); 00621 void fix_fuzzy_spaces( //find fuzzy words 00622 ETEXT_DESC *monitor, //progress monitor 00623 inT32 word_count, //count of words in doc 00624 PAGE_RES *page_res); 00625 void dump_words(WERD_RES_LIST &perm, inT16 score, 00626 inT16 mode, BOOL8 improved); 00627 BOOL8 fixspace_thinks_word_done(WERD_RES *word); 00628 inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score); 00629 float blob_noise_score(TBLOB *blob); 00630 void break_noisiest_blob_word(WERD_RES_LIST &words); 00632 GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word); 00633 BOOL8 potential_word_crunch(WERD_RES *word, 00634 GARBAGE_LEVEL garbage_level, 00635 BOOL8 ok_dict_word); 00636 void tilde_crunch(PAGE_RES_IT &page_res_it); 00637 void unrej_good_quality_words( //unreject potential 00638 PAGE_RES_IT &page_res_it); 00639 void 
doc_and_block_rejection( //reject big chunks 00640 PAGE_RES_IT &page_res_it, 00641 BOOL8 good_quality_doc); 00642 void quality_based_rejection(PAGE_RES_IT &page_res_it, 00643 BOOL8 good_quality_doc); 00644 void convert_bad_unlv_chs(WERD_RES *word_res); 00645 void tilde_delete(PAGE_RES_IT &page_res_it); 00646 inT16 word_blob_quality(WERD_RES *word, ROW *row); 00647 void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, 00648 inT16 *accepted_match_count); 00649 void unrej_good_chs(WERD_RES *word, ROW *row); 00650 inT16 count_outline_errs(char c, inT16 outline_count); 00651 inT16 word_outline_errs(WERD_RES *word); 00652 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level); 00653 CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode); 00654 inT16 failure_count(WERD_RES *word); 00655 BOOL8 noise_outlines(TWERD *word); 00657 void 00658 process_selected_words ( 00659 PAGE_RES* page_res, // blocks to check 00660 //function to call 00661 TBOX & selection_box, 00662 BOOL8 (tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it)); 00664 void tess_add_doc_word( //test acceptability 00665 WERD_CHOICE *word_choice //after context 00666 ); 00667 void tess_segment_pass_n(int pass_n, WERD_RES *word); 00668 bool tess_acceptable_word(WERD_RES *word); 00669 00671 // Applies the box file based on the image name fname, and resegments 00672 // the words in the block_list (page), with: 00673 // blob-mode: one blob per line in the box file, words as input. 00674 // word/line-mode: one blob per space-delimited unit after the #, and one word 00675 // per line in the box file. (See comment above for box file format.) 00676 // If find_segmentation is true, (word/line mode) then the classifier is used 00677 // to re-segment words/lines to match the space-delimited truth string for 00678 // each box. 
In this case, the input box may be for a word or even a whole 00679 // text line, and the output words will contain multiple blobs corresponding 00680 // to the space-delimited input string. 00681 // With find_segmentation false, no classifier is needed, but the chopper 00682 // can still be used to correctly segment touching characters with the help 00683 // of the input boxes. 00684 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned 00685 // from normal classification, ie. with a word, chopped_word, rebuild_word, 00686 // seam_array, denorm, box_word, and best_state, but NO best_choice or 00687 // raw_choice, as they would require a UNICHARSET, which we aim to avoid. 00688 // Instead, the correct_text member of WERD_RES is set, and this may be later 00689 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords 00690 // is not required before calling ApplyBoxTraining. 00691 PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation, 00692 BLOCK_LIST *block_list); 00693 00694 // Any row xheight that is significantly different from the median is set 00695 // to the median. 00696 void PreenXHeights(BLOCK_LIST *block_list); 00697 00698 // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: 00699 // All fuzzy spaces are removed, and all the words are maximally chopped. 00700 PAGE_RES* SetupApplyBoxes(const GenericVector<TBOX>& boxes, 00701 BLOCK_LIST *block_list); 00702 // Tests the chopper by exhaustively running chop_one_blob. 00703 // The word_res will contain filled chopped_word, seam_array, denorm, 00704 // box_word and best_state for the maximally chopped word. 00705 void MaximallyChopWord(const GenericVector<TBOX>& boxes, 00706 BLOCK* block, ROW* row, WERD_RES* word_res); 00707 // Gather consecutive blobs that match the given box into the best_state 00708 // and corresponding correct_text. 
00709 // Fights over which box owns which blobs are settled by pre-chopping and 00710 // applying the blobs to box or next_box with the least non-overlap. 00711 // Returns false if the box was in error, which can only be caused by 00712 // failing to find an appropriate blob for a box. 00713 // This means that occasionally, blobs may be incorrectly segmented if the 00714 // chopper fails to find a suitable chop point. 00715 bool ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, 00716 const TBOX& box, const TBOX& next_box, 00717 const char* correct_text); 00718 // Consume all source blobs that strongly overlap the given box, 00719 // putting them into a new word, with the correct_text label. 00720 // Fights over which box owns which blobs are settled by 00721 // applying the blobs to box or next_box with the least non-overlap. 00722 // Returns false if the box was in error, which can only be caused by 00723 // failing to find an overlapping blob for a box. 00724 bool ResegmentWordBox(BLOCK_LIST *block_list, 00725 const TBOX& box, const TBOX& next_box, 00726 const char* correct_text); 00727 // Resegments the words by running the classifier in an attempt to find the 00728 // correct segmentation that produces the required string. 00729 void ReSegmentByClassification(PAGE_RES* page_res); 00730 // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. 00731 // Returns false if an invalid UNICHAR_ID is encountered. 00732 bool ConvertStringToUnichars(const char* utf8, 00733 GenericVector<UNICHAR_ID>* class_ids); 00734 // Resegments the word to achieve the target_text from the classifier. 00735 // Returns false if the re-segmentation fails. 00736 // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and 00737 // applies a full search on the classifier results to find the best classified 00738 // segmentation. As a compromise to obtain better recall, 1-1 ambiguity 00739 // substitutions ARE used. 
00740 bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, 00741 WERD_RES* word_res); 00742 // Recursive helper to find a match to the target_text (from text_index 00743 // position) in the choices (from choices_pos position). 00744 // Choices is an array of GenericVectors, of length choices_length, with each 00745 // element representing a starting position in the word, and the 00746 // GenericVector holding classification results for a sequence of consecutive 00747 // blobs, with index 0 being a single blob, index 1 being 2 blobs etc. 00748 void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, 00749 int choices_pos, int choices_length, 00750 const GenericVector<UNICHAR_ID>& target_text, 00751 int text_index, 00752 float rating, GenericVector<int>* segmentation, 00753 float* best_rating, GenericVector<int>* best_segmentation); 00754 // Counts up the labelled words and the blobs within. 00755 // Deletes all unused or emptied words, counting the unused ones. 00756 // Resets W_BOL and W_EOL flags correctly. 00757 // Builds the rebuild_word and rebuilds the box_word. 00758 void TidyUp(PAGE_RES* page_res); 00759 // Logs a bad box by line in the box file and box coords. 00760 void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, 00761 const char *err_msg); 00762 // Creates a fake best_choice entry in each WERD_RES with the correct text. 00763 void CorrectClassifyWords(PAGE_RES* page_res); 00764 // Call LearnWord to extract features for labelled blobs within each word. 00765 // Features are stored in an internal buffer. 00766 void ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res); 00767 00769 // Returns the number of misfit blob tops in this word. 00770 int CountMisfitTops(WERD_RES *word_res); 00771 // Returns a new x-height in pixels (original image coords) that is 00772 // maximally compatible with the result in word_res. 
00773 // Returns 0.0f if no x-height is found that is better than the current 00774 // estimate. 00775 float ComputeCompatibleXheight(WERD_RES *word_res, float* baseline_shift); 00777 // TODO(ocr-team): Find and remove obsolete parameters. 00778 BOOL_VAR_H(tessedit_resegment_from_boxes, false, 00779 "Take segmentation and labeling from box file"); 00780 BOOL_VAR_H(tessedit_resegment_from_line_boxes, false, 00781 "Conversion of word/line box file to char box file"); 00782 BOOL_VAR_H(tessedit_train_from_boxes, false, 00783 "Generate training data from boxed chars"); 00784 BOOL_VAR_H(tessedit_make_boxes_from_boxes, false, 00785 "Generate more boxes from boxed chars"); 00786 BOOL_VAR_H(tessedit_dump_pageseg_images, false, 00787 "Dump intermediate images made during page segmentation"); 00788 INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK, 00789 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," 00790 " 5=line, 6=word, 7=char" 00791 " (Values from PageSegMode enum in publictypes.h)"); 00792 INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, 00793 "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults" 00794 " to loading and running only Tesseract (no Cube, no combiner)." 
00795 " (Values from OcrEngineMode enum in tesseractclass.h)"); 00796 STRING_VAR_H(tessedit_char_blacklist, "", 00797 "Blacklist of chars not to recognize"); 00798 STRING_VAR_H(tessedit_char_whitelist, "", 00799 "Whitelist of chars to recognize"); 00800 STRING_VAR_H(tessedit_char_unblacklist, "", 00801 "List of chars to override tessedit_char_blacklist"); 00802 BOOL_VAR_H(tessedit_ambigs_training, false, 00803 "Perform training for ambiguities"); 00804 INT_VAR_H(pageseg_devanagari_split_strategy, 00805 tesseract::ShiroRekhaSplitter::NO_SPLIT, 00806 "Whether to use the top-line splitting process for Devanagari " 00807 "documents while performing page-segmentation."); 00808 INT_VAR_H(ocr_devanagari_split_strategy, 00809 tesseract::ShiroRekhaSplitter::NO_SPLIT, 00810 "Whether to use the top-line splitting process for Devanagari " 00811 "documents while performing ocr."); 00812 STRING_VAR_H(tessedit_write_params_to_file, "", 00813 "Write all parameters to the given file."); 00814 BOOL_VAR_H(tessedit_adaption_debug, false, 00815 "Generate and print debug information for adaption"); 00816 INT_VAR_H(bidi_debug, 0, "Debug level for BiDi"); 00817 INT_VAR_H(applybox_debug, 1, "Debug level"); 00818 INT_VAR_H(applybox_page, 0, "Page number to apply boxes from"); 00819 STRING_VAR_H(applybox_exposure_pattern, ".exp", 00820 "Exposure value follows this pattern in the image" 00821 " filename. The name of the image files are expected" 00822 " to be in the form [lang].[fontname].exp[num].tif"); 00823 BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode, false, 00824 "Learn both character fragments (as is done in the" 00825 " special low exposure mode) as well as unfragmented" 00826 " characters."); 00827 BOOL_VAR_H(applybox_learn_ngrams_mode, false, 00828 "Each bounding box is assumed to contain ngrams. 
Only" 00829 " learn the ngrams whose outlines overlap horizontally."); 00830 BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words"); 00831 BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices"); 00832 BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats"); 00833 BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, 00834 "Try to improve fuzzy spaces"); 00835 BOOL_VAR_H(tessedit_unrej_any_wd, false, 00836 "Don't bother with word plausibility"); 00837 BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?"); 00838 BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height"); 00839 BOOL_VAR_H(tessedit_enable_doc_dict, true, 00840 "Add words to the document dictionary"); 00841 BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char"); 00842 BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats"); 00843 BOOL_VAR_H(tessedit_enable_bigram_correction, true, 00844 "Enable correction based on the word bigram dictionary."); 00845 BOOL_VAR_H(tessedit_enable_dict_correction, false, 00846 "Enable single word correction based on the dictionary."); 00847 INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram " 00848 "correction."); 00849 BOOL_VAR_H(enable_noise_removal, true, 00850 "Remove and conditionally reassign small outlines when they" 00851 " confuse layout analysis, determining diacritics vs noise"); 00852 INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines"); 00853 // Worst (min) certainty, for which a diacritic is allowed to make the base 00854 // character worse and still be included. 00855 double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty"); 00856 // Worst (min) certainty, for which a non-overlapping diacritic is allowed to 00857 // make the base character worse and still be included. 
00858 double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty"); 00859 // Worst (min) certainty, for which a diacritic is allowed to make a new 00860 // stand-alone blob. 00861 double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty"); 00862 // Factor of certainty margin for adding diacritics to not count as worse. 00863 double_VAR_H(noise_cert_factor, 0.375, 00864 "Scaling on certainty diff from Hingepoint"); 00865 INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob"); 00866 INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word"); 00867 INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug"); 00868 BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk"); 00869 STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation"); 00870 STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation"); 00871 STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation"); 00872 double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit"); 00873 double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit"); 00874 double_VAR_H(quality_outline_pc, 1.0, 00875 "good_quality_doc lte outline error limit"); 00876 double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit"); 00877 INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word"); 00878 INT_VAR_H(tessedit_tess_adaption_mode, 0x27, 00879 "Adaptation decision algorithm for tess"); 00880 BOOL_VAR_H(tessedit_minimal_rej_pass1, false, 00881 "Do minimal rejection on pass 1 output"); 00882 BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria"); 00883 BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity"); 00884 INT_VAR_H(tessedit_test_adaption_mode, 3, 00885 "Adaptation decision algorithm for tess"); 00886 BOOL_VAR_H(test_pt, false, "Test for point"); 00887 double_VAR_H(test_pt_x, 99999.99, "xcoord"); 00888 double_VAR_H(test_pt_y, 99999.99, "ycoord"); 00889 
// Parameter declarations: paragraph detection, cube debug output, the
// outline-count character sets, and document/block/row rejection controls.
// The macro arguments appear to be (name, default value, help text).
// NOTE(review): the help text of tessedit_whole_wd_rej_row_percent is built
// from two string pieces with no space between them ("...rejects" "which...").
// The literal is left untouched here; presumably the same text exists in the
// matching parameter definition, so fix both together.
INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info."); 00890 BOOL_VAR_H(paragraph_text_based, true, 00891 "Run paragraph detection on the post-text-recognition " 00892 "(more accurate)"); 00893 INT_VAR_H(cube_debug_level, 1, "Print cube debug info."); 00894 STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines"); 00895 STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines"); 00896 BOOL_VAR_H(docqual_excuse_outline_errs, false, 00897 "Allow outline errs in unrejection?"); 00898 BOOL_VAR_H(tessedit_good_quality_unrej, true, 00899 "Reduce rejection on good docs"); 00900 BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?"); 00901 double_VAR_H(tessedit_reject_doc_percent, 65.00, 00902 "%rej allowed before rej whole doc"); 00903 double_VAR_H(tessedit_reject_block_percent, 45.00, 00904 "%rej allowed before rej whole block"); 00905 double_VAR_H(tessedit_reject_row_percent, 40.00, 00906 "%rej allowed before rej whole row"); 00907 double_VAR_H(tessedit_whole_wd_rej_row_percent, 70.00, 00908 "Number of row rejects in whole word rejects" 00909 "which prevents whole row rejection"); 00910 BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds, true, 00911 "Only rej partially rejected words in block rejection"); 00912 BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds, true, 00913 "Only rej partially rejected words in row rejection"); 00914 BOOL_VAR_H(tessedit_dont_blkrej_good_wds, false, 00915 "Use word segmentation quality metric"); 00916 BOOL_VAR_H(tessedit_dont_rowrej_good_wds, false, 00917 "Use word segmentation quality metric"); 00918 INT_VAR_H(tessedit_preserve_min_wd_len, 2, 00919 "Only preserve wds longer than this"); 00920 BOOL_VAR_H(tessedit_row_rej_good_docs, true, 00921 "Apply row rejection to good docs"); 00922 double_VAR_H(tessedit_good_doc_still_rowrej_wd, 1.1, 00923 "rej good doc wd if more than this fraction rejected"); 00924 BOOL_VAR_H(tessedit_reject_bad_qual_wds, true, 00925 "Reject all bad quality wds"); 00926
// Parameter declarations: rejection/quality debug flags, UNLV tilde crunching,
// hOCR font info and the crunch_* rating, certainty and size thresholds.
// NOTE(review): crunch_del_rating and crunch_del_cert repeat the help text of
// crunch_pot_poor_rate and crunch_pot_poor_cert word for word, possibly a
// copy-paste leftover; confirm the intended wording.
BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats"); 00927 BOOL_VAR_H(tessedit_debug_quality_metrics, false, 00928 "Output data to debug file"); 00929 BOOL_VAR_H(bland_unrej, false, "unrej potential with no chekcs"); 00930 double_VAR_H(quality_rowrej_pc, 1.1, 00931 "good_quality_doc gte good char limit"); 00932 BOOL_VAR_H(unlv_tilde_crunching, true, 00933 "Mark v.bad words for tilde crunch"); 00934 BOOL_VAR_H(hocr_font_info, false, 00935 "Add font info to hocr output"); 00936 BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?"); 00937 BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?"); 00938 double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this"); 00939 BOOL_VAR_H(crunch_terrible_garbage, true, "As it says"); 00940 double_VAR_H(crunch_poor_garbage_cert, -9.0, 00941 "crunch garbage cert lt this"); 00942 double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this"); 00943 double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this"); 00944 double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this"); 00945 BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage"); 00946 double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this"); 00947 double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this"); 00948 double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this"); 00949 double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this"); 00950 double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this"); 00951 double_VAR_H(crunch_del_high_word, 1.5, 00952 "Del if word gt xht x this above bl"); 00953 double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl"); 00954 double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this"); 00955 INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch"); 00956 INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed"); 00957
// Parameter declarations: remaining crunch_* settings, fixspace (fixsp_*),
// x-height tolerances, sub/superscript detection, output-file switches and
// suspect/rejection levels.
// NOTE(review): crunch_leave_uc_strings carries the same help text as
// crunch_leave_lc_strings ("...long lower case strings"); going by the name it
// presumably should say "upper case". Confirm before changing the literal.
BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings"); 00958 BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring"); 00959 BOOL_VAR_H(crunch_leave_accept_strings, false, 00960 "Don't pot crunch sensible strings"); 00961 BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures"); 00962 INT_VAR_H(crunch_leave_lc_strings, 4, 00963 "Don't crunch words with long lower case strings"); 00964 INT_VAR_H(crunch_leave_uc_strings, 4, 00965 "Don't crunch words with long lower case strings"); 00966 INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions"); 00967 INT_VAR_H(crunch_debug, 0, "As it says"); 00968 INT_VAR_H(fixsp_non_noise_limit, 1, 00969 "How many non-noise blbs either side?"); 00970 double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this"); 00971 BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctation joins"); 00972 INT_VAR_H(fixsp_done_mode, 1, "What constitues done for spacing"); 00973 INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug"); 00974 STRING_VAR_H(numeric_punctuation, ".,", 00975 "Punct. chs expected WITHIN numbers"); 00976 INT_VAR_H(x_ht_acceptance_tolerance, 8, 00977 "Max allowed deviation of blob top outside of font data"); 00978 INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it"); 00979 INT_VAR_H(superscript_debug, 0, "Debug level for sub & superscript fixer"); 00980 double_VAR_H(superscript_worse_certainty, 2.0, "How many times worse " 00981 "certainty does a superscript position glyph need to be for us " 00982 "to try classifying it as a char with a different baseline?"); 00983 double_VAR_H(superscript_bettered_certainty, 0.97, "What reduction in " 00984 "badness do we think sufficient to choose a superscript over " 00985 "what we'd thought. 
For example, a value of 0.6 means we want " 00986 "to reduce badness of certainty by 40%"); 00987 double_VAR_H(superscript_scaledown_ratio, 0.4, 00988 "A superscript scaled down more than this is unbelievably " 00989 "small. For example, 0.3 means we expect the font size to " 00990 "be no smaller than 30% of the text line font size."); 00991 double_VAR_H(subscript_max_y_top, 0.5, 00992 "Maximum top of a character measured as a multiple of x-height " 00993 "above the baseline for us to reconsider whether it's a " 00994 "subscript."); 00995 double_VAR_H(superscript_min_y_bottom, 0.3, 00996 "Minimum bottom of a character measured as a multiple of " 00997 "x-height above the baseline for us to reconsider whether it's " 00998 "a superscript."); 00999 BOOL_VAR_H(tessedit_write_block_separators, false, 01000 "Write block separators in output"); 01001 BOOL_VAR_H(tessedit_write_rep_codes, false, 01002 "Write repetition char code"); 01003 BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file"); 01004 BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file"); 01005 BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); 01006 BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file"); 01007 STRING_VAR_H(unrecognised_char, "|", 01008 "Output char for unidentified blobs"); 01009 INT_VAR_H(suspect_level, 99, "Suspect marker level"); 01010 INT_VAR_H(suspect_space_level, 100, 01011 "Min suspect level for rejecting spaces"); 01012 INT_VAR_H(suspect_short_words, 2, 01013 "Don't Suspect dict wds longer than this"); 01014 BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected"); 01015 double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit"); 01016 double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit"); 01017 BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures"); 01018 BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING"); 01019
// Parameter declarations: rejection controls (tessedit_* and rej_*), 0/O and
// dot/hyphen flips, the 1Il conflict set, image border, box-file output and
// page selection.
// The "\075" in ok_repeated_ch_non_alphanum_wds is the octal escape for '='.
BOOL_VAR_H(tessedit_word_for_word, false, 01020 "Make output have exactly one word per WERD"); 01021 BOOL_VAR_H(tessedit_zero_kelvin_rejection, false, 01022 "Don't reject ANYTHING AT ALL"); 01023 BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same"); 01024 INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm"); 01025 BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug"); 01026 BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips"); 01027 double_VAR_H(tessedit_lower_flip_hyphen, 1.5, 01028 "Aspect ratio dot/hyphen test"); 01029 double_VAR_H(tessedit_upper_flip_hyphen, 1.8, 01030 "Aspect ratio dot/hyphen test"); 01031 BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector"); 01032 BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test"); 01033 BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check"); 01034 BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control"); 01035 BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control"); 01036 BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control"); 01037 BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check"); 01038 BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check"); 01039 double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract"); 01040 INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit"); 01041 STRING_VAR_H(ok_repeated_ch_non_alphanum_wds, "-?*\075", 01042 "Allow NN to unrej"); 01043 STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set"); 01044 INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this"); 01045 BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes"); 01046 INT_VAR_H(tessedit_page_number, -1, 01047 "-1 -> All pages, else specifc page to process"); 01048 BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE"); 01049 BOOL_VAR_H(interactive_display_mode, false, "Run interactively?"); 01050
// Parameter declarations (file type, sub-languages, orientation margin,
// textord_* layout switches, page separator), then the deprecated parameters,
// the recognition-training entry points, the cube context accessor and the
// first private members.
// NOTE(review): two deprecated help texts are assembled from pieces with no
// space between them: heuristic_segcost_rating_base ("...word rating." "It's...")
// and segsearch_max_fixed_pitch_char_wh_ratio ("...ratio for" "fixed...").
// The literals are left untouched here.
STRING_VAR_H(file_type, ".tif", "Filename extension"); 01051 BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word"); 01052 INT_VAR_H(tessdata_manager_debug_level, 0, 01053 "Debug level for TessdataManager functions."); 01054 STRING_VAR_H(tessedit_load_sublangs, "", 01055 "List of languages to load with this one"); 01056 BOOL_VAR_H(tessedit_use_primary_params_model, false, 01057 "In multilingual mode use params model of the primary language"); 01058 // Min acceptable orientation margin (difference in scores between top and 2nd 01059 // choice in OSResults::orientations) to believe the page orientation. 01060 double_VAR_H(min_orientation_margin, 7.0, 01061 "Min acceptable orientation margin"); 01062 BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding"); 01063 BOOL_VAR_H(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model"); 01064 BOOL_VAR_H(poly_allow_detailed_fx, false, 01065 "Allow feature extractors to see the original outline"); 01066 BOOL_VAR_H(tessedit_init_config_only, false, 01067 "Only initialize with the config file. 
Useful if the instance is " 01068 "not going to be used for OCR but say only for layout analysis."); 01069 BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector"); 01070 BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection"); 01071 BOOL_VAR_H(textord_tabfind_force_vertical_text, false, 01072 "Force using vertical text page mode"); 01073 double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5, 01074 "Fraction of textlines deemed vertical to use vertical page " 01075 "mode"); 01076 double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75, 01077 "Fraction of height used as a minimum gap for aligned blobs."); 01078 INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible"); 01079 BOOL_VAR_H(preserve_interword_spaces, false, 01080 "Preserve multiple interword spaces"); 01081 BOOL_VAR_H(include_page_breaks, false, 01082 "Include page separator string in output text after each " 01083 "image/page."); 01084 STRING_VAR_H(page_separator, "\f", 01085 "Page separator (default is form feed control character)"); 01086 01087 // The following parameters were deprecated and removed from their original 01088 // locations. The parameters are temporarily kept here to give Tesseract 01089 // users a chance to update their [lang].traineddata and config files 01090 // without introducing failures during Tesseract initialization. 01091 // TODO(ocr-team): remove these parameters from the code once we are 01092 // reasonably sure that Tesseract users have updated their data files. 01093 // 01094 // BEGIN DEPRECATED PARAMETERS 01095 BOOL_VAR_H(textord_tabfind_vertical_horizontal_mix, true, 01096 "find horizontal lines such as headers in vertical page mode"); 01097 INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm"); 01098 BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length" 01099 " dawgs (e.g. 
for non-space delimited languages)"); 01100 INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process"); 01101 BOOL_VAR_H(permute_debug, 0, "char permutation debug"); 01102 double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of" 01103 " current best rate to prune other hypotheses"); 01104 BOOL_VAR_H(permute_script_word, 0, 01105 "Turn on word script consistency permuter"); 01106 BOOL_VAR_H(segment_segcost_rating, 0, 01107 "incorporate segmentation cost in word rating?"); 01108 double_VAR_H(segment_reward_script, 0.95, 01109 "Score multipler for script consistency within a word. " 01110 "Being a 'reward' factor, it should be <= 1. " 01111 "Smaller value implies bigger reward."); 01112 BOOL_VAR_H(permute_fixed_length_dawg, 0, 01113 "Turn on fixed-length phrasebook search permuter"); 01114 BOOL_VAR_H(permute_chartype_word, 0, 01115 "Turn on character type (property) consistency permuter"); 01116 double_VAR_H(segment_reward_chartype, 0.97, 01117 "Score multipler for char type consistency within a word. "); 01118 double_VAR_H(segment_reward_ngram_best_choice, 0.99, 01119 "Score multipler for ngram permuter's best choice" 01120 " (only used in the Han script path)."); 01121 BOOL_VAR_H(ngram_permuter_activated, false, 01122 "Activate character-level n-gram-based permuter"); 01123 BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter"); 01124 INT_VAR_H(language_model_fixed_length_choices_depth, 3, 01125 "Depth of blob choice lists to explore" 01126 " when fixed length dawgs are on"); 01127 BOOL_VAR_H(use_new_state_cost, FALSE, 01128 "use new state cost heuristics for segmentation state evaluation"); 01129 double_VAR_H(heuristic_segcost_rating_base, 1.25, 01130 "base factor for adding segmentation cost into word rating." 
01131 "It's a multiplying factor, the larger the value above 1, " 01132 "the bigger the effect of segmentation cost."); 01133 double_VAR_H(heuristic_weight_rating, 1, 01134 "weight associated with char rating in combined cost of state"); 01135 double_VAR_H(heuristic_weight_width, 1000.0, 01136 "weight associated with width evidence in combined cost of" 01137 " state"); 01138 double_VAR_H(heuristic_weight_seamcut, 0, 01139 "weight associated with seam cut in combined cost of state"); 01140 double_VAR_H(heuristic_max_char_wh_ratio, 2.0, 01141 "max char width-to-height ratio allowed in segmentation"); 01142 BOOL_VAR_H(enable_new_segsearch, false, 01143 "Enable new segmentation search path."); 01144 double_VAR_H(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, 01145 "Maximum character width-to-height ratio for" 01146 "fixed pitch fonts"); 01147 // END DEPRECATED PARAMETERS 01148 01150 FILE *init_recog_training(const STRING &fname); 01151 void recog_training_segmented(const STRING &fname, 01152 PAGE_RES *page_res, 01153 volatile ETEXT_DESC *monitor, 01154 FILE *output_file); 01155 void ambigs_classify_and_output(const char *label, 01156 PAGE_RES_IT* pr_it, 01157 FILE *output_file); 01158 01159 #ifndef NO_CUBE_BUILD 01160 inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; } 01161 #endif 01162 01163 private: 01164 // The filename of a backup config file. If not null, then we currently 01165 // have a temporary debug config file loaded, and backup_config_file_ 01166 // will be loaded, and set to null when debug is complete. 01167 const char* backup_config_file_; 01168 // The filename of a config file to read when processing a debug word. 01169 STRING word_config_; 01170 // Image used for input to layout analysis and tesseract recognition. 01171 // May be modified by the ShiroRekhaSplitter to eliminate the top-line. 01172 Pix* pix_binary_; 01173 // Unmodified image used for input to cube. Always valid. 
01174 Pix* cube_binary_; 01175 // Grey-level input image if the input was not binary, otherwise NULL. 01176 Pix* pix_grey_; 01177 // Thresholds that were used to generate the thresholded image from grey. 01178 Pix* pix_thresholds_; 01179 // Input image resolution after any scaling. The resolution is not well 01180 // transmitted by operations on Pix, so we keep an independent record here. 01181 int source_resolution_; 01182 // The shiro-rekha splitter object which is used to split top-lines in 01183 // Devanagari words to provide a better word and grapheme segmentation. 01184 ShiroRekhaSplitter splitter_; 01185 // Page segmentation/layout 01186 Textord textord_; 01187 // True if the primary language uses right_to_left reading order. 01188 bool right_to_left_; 01189 Pix* scaled_color_; 01190 int scaled_factor_; 01191 FCOORD deskew_; 01192 FCOORD reskew_; 01193 TesseractStats stats_; 01194 // Sub-languages to be tried in addition to this. 01195 GenericVector<Tesseract*> sub_langs_; 01196 // Most recently used Tesseract out of this and sub_langs_. The default 01197 // language for the next word. 01198 Tesseract* most_recently_used_; 01199 // The size of the font table, ie max possible font id + 1. 01200 int font_table_size_; 01201 #ifndef NO_CUBE_BUILD 01202 // Cube objects. 01203 CubeRecoContext* cube_cntxt_; 01204 TesseractCubeCombiner *tess_cube_combiner_; 01205 #endif 01206 // Equation detector. Note: this pointer is NOT owned by the class. 01207 EquationDetect* equ_detect_; 01208 }; 01209 01210 } // namespace tesseract 01211 01212 01213 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__