|
tesseract 3.04.01
|
00001 00002 // File: dict.h 00003 // Description: dict class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #ifndef TESSERACT_DICT_DICT_H_ 00020 #define TESSERACT_DICT_DICT_H_ 00021 00022 #include "ambigs.h" 00023 #include "dawg.h" 00024 #include "dawg_cache.h" 00025 #include "host.h" 00026 #include "oldlist.h" 00027 #include "ratngs.h" 00028 #include "stopper.h" 00029 #include "trie.h" 00030 #include "unicharset.h" 00031 #include "params_training_featdef.h" 00032 00033 class MATRIX; 00034 class WERD_RES; 00035 00036 #define MAX_WERD_LENGTH (inT64) 128 00037 #define NO_RATING -1 00038 00040 struct CHAR_FRAGMENT_INFO { 00041 UNICHAR_ID unichar_id; 00042 const CHAR_FRAGMENT *fragment; 00043 int num_fragments; 00044 float rating; 00045 float certainty; 00046 }; 00047 00048 namespace tesseract { 00049 00050 typedef GenericVector<Dawg *> DawgVector; 00051 00052 // 00053 // Constants 00054 // 00055 static const int kRatingPad = 4; 00056 static const char kDictWildcard[] = "\u2606"; // WHITE STAR 00057 static const int kDictMaxWildcards = 2; // max wildcards for a word 00058 // TODO(daria): If hyphens are different in different languages and can be 00059 // inferred from training data we should load their values dynamically. 00060 static const char kHyphenSymbol[] = "-"; 00061 static const char kSlashSymbol[] = "/"; 00062 static const char kQuestionSymbol[] = "?"; 00063 static const char kApostropheSymbol[] = "'"; 00064 static const float kSimCertaintyScale = -10.0; // similarity matcher scaling 00065 static const float kSimCertaintyOffset = -10.0; // similarity matcher offset 00066 static const float kSimilarityFloor = 100.0; // worst E*L product to stop on 00067 static const int kDocDictMaxRepChars = 4; 00068 00069 // Enum for describing whether the x-height for the word is consistent: 00070 // 0 - everything is good. 00071 // 1 - there are one or two secondary (but consistent) baselines 00072 // [think subscript and superscript], or there is an oversized 00073 // first character. 00074 // 2 - the word is inconsistent. 00075 enum XHeightConsistencyEnum {XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT}; 00076 00077 struct DawgArgs { 00078 DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p) 00079 : active_dawgs(d), updated_dawgs(up), permuter(p) {} 00080 00081 DawgPositionVector *active_dawgs; 00082 DawgPositionVector *updated_dawgs; 00083 PermuterType permuter; 00084 }; 00085 00086 class Dict { 00087 public: 00088 Dict(CCUtil* image_ptr); 00089 ~Dict(); 00090 const CCUtil* getCCUtil() const { 00091 return ccutil_; 00092 } 00093 CCUtil* getCCUtil() { 00094 return ccutil_; 00095 } 00096 const UNICHARSET& getUnicharset() const { 00097 return getCCUtil()->unicharset; 00098 } 00099 UNICHARSET& getUnicharset() { 00100 return getCCUtil()->unicharset; 00101 } 00102 const UnicharAmbigs &getUnicharAmbigs() const { 00103 return getCCUtil()->unichar_ambigs; 00104 } 00105 00106 // Returns true if unichar_id is a word compounding character like - or /. 00107 inline bool compound_marker(UNICHAR_ID unichar_id) { 00108 const GenericVector<UNICHAR_ID>& normed_ids = 00109 getUnicharset().normed_ids(unichar_id); 00110 return normed_ids.size() == 1 && 00111 (normed_ids[0] == hyphen_unichar_id_ || 00112 normed_ids[0] == slash_unichar_id_); 00113 } 00114 // Returns true if unichar_id is an apostrophe-like character that may 00115 // separate prefix/suffix words from a main body word. 00116 inline bool is_apostrophe(UNICHAR_ID unichar_id) { 00117 const GenericVector<UNICHAR_ID>& normed_ids = 00118 getUnicharset().normed_ids(unichar_id); 00119 return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_; 00120 } 00121 00122 /* hyphen.cpp ************************************************************/ 00123 00125 inline bool hyphenated() const { return 00126 !last_word_on_line_ && hyphen_word_; 00127 } 00129 inline int hyphen_base_size() const { 00130 return this->hyphenated() ? hyphen_word_->length() : 0; 00131 } 00135 inline void copy_hyphen_info(WERD_CHOICE *word) const { 00136 if (this->hyphenated()) { 00137 *word = *hyphen_word_; 00138 if (hyphen_debug_level) word->print("copy_hyphen_info: "); 00139 } 00140 } 00142 inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const { 00143 if (!last_word_on_line_ || first_pos) 00144 return false; 00145 const GenericVector<UNICHAR_ID>& normed_ids = 00146 getUnicharset().normed_ids(unichar_id); 00147 return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_; 00148 } 00150 inline bool has_hyphen_end(const WERD_CHOICE &word) const { 00151 int word_index = word.length() - 1; 00152 return has_hyphen_end(word.unichar_id(word_index), word_index == 0); 00153 } 00157 void reset_hyphen_vars(bool last_word_on_line); 00160 void set_hyphen_word(const WERD_CHOICE &word, 00161 const DawgPositionVector &active_dawgs); 00162 00163 /* permdawg.cpp ************************************************************/ 00164 // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig(). 00165 // When this function is refactored, permdawg.cpp can be removed. 00166 00169 inline void update_best_choice(const WERD_CHOICE &word, 00170 WERD_CHOICE *best_choice) { 00171 if (word.rating() < best_choice->rating()) { 00172 *best_choice = word; 00173 } 00174 } 00178 void init_active_dawgs(DawgPositionVector *active_dawgs, 00179 bool ambigs_mode) const; 00180 // Fill the given vector with the default collection of any-length dawgs 00181 void default_dawgs(DawgPositionVector *anylength_dawgs, 00182 bool suppress_patterns) const; 00183 00184 00190 WERD_CHOICE *dawg_permute_and_select( 00191 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit); 00195 void go_deeper_dawg_fxn( 00196 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, 00197 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00198 bool word_ending, WERD_CHOICE *word, float certainties[], 00199 float *limit, WERD_CHOICE *best_choice, int *attempts_left, 00200 void *void_more_args); 00201 00203 void (Dict::*go_deeper_fxn_)(const char *debug, 00204 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00205 int char_choice_index, 00206 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00207 bool word_ending, WERD_CHOICE *word, 00208 float certainties[], float *limit, 00209 WERD_CHOICE *best_choice, int *attempts_left, 00210 void *void_more_args); 00211 // 00212 // Helper functions for dawg_permute_and_select(). 00213 // 00214 void permute_choices( 00215 const char *debug, 00216 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00217 int char_choice_index, 00218 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00219 WERD_CHOICE *word, 00220 float certainties[], 00221 float *limit, 00222 WERD_CHOICE *best_choice, 00223 int *attempts_left, 00224 void *more_args); 00225 00226 void append_choices( 00227 const char *debug, 00228 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00229 const BLOB_CHOICE &blob_choice, 00230 int char_choice_index, 00231 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00232 WERD_CHOICE *word, 00233 float certainties[], 00234 float *limit, 00235 WERD_CHOICE *best_choice, 00236 int *attempts_left, 00237 void *more_args); 00238 00239 bool fragment_state_okay(UNICHAR_ID curr_unichar_id, 00240 float curr_rating, float curr_certainty, 00241 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00242 const char *debug, int word_ending, 00243 CHAR_FRAGMENT_INFO *char_frag_info); 00244 00245 /* stopper.cpp *************************************************************/ 00246 bool NoDangerousAmbig(WERD_CHOICE *BestChoice, 00247 DANGERR *fixpt, 00248 bool fix_replaceable, 00249 MATRIX* ratings); 00250 // Replaces the corresponding wrong ngram in werd_choice with the correct 00251 // one. The whole correct n-gram is inserted into the ratings matrix and 00252 // the werd_choice: no more fragments!. Rating and certainty of new entries 00253 // in matrix and werd_choice are the sum and mean of the wrong ngram 00254 // respectively. 00255 // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes 00256 // mystring", with a new entry in the ratings matrix for ". 00257 void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, 00258 UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, 00259 MATRIX *ratings); 00260 00262 int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice); 00270 int UniformCertainties(const WERD_CHOICE& word); 00272 bool AcceptableChoice(const WERD_CHOICE& best_choice, 00273 XHeightConsistencyEnum xheight_consistency); 00277 bool AcceptableResult(WERD_RES* word); 00278 void EndDangerousAmbigs(); 00280 void DebugWordChoices(); 00282 void SettupStopperPass1(); 00284 void SettupStopperPass2(); 00285 /* context.cpp *************************************************************/ 00287 int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset); 00290 bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset); 00291 00292 /* dict.cpp ****************************************************************/ 00293 00296 static DawgCache *GlobalDawgCache(); 00297 void Load(DawgCache *dawg_cache); 00298 void End(); 00299 00300 // Resets the document dictionary analogous to ResetAdaptiveClassifier. 00301 void ResetDocumentDictionary() { 00302 if (pending_words_ != NULL) 00303 pending_words_->clear(); 00304 if (document_words_ != NULL) 00305 document_words_->clear(); 00306 } 00307 00343 // 00344 int def_letter_is_okay(void* void_dawg_args, 00345 UNICHAR_ID unichar_id, bool word_end) const; 00346 00347 int (Dict::*letter_is_okay_)(void* void_dawg_args, 00348 UNICHAR_ID unichar_id, bool word_end) const; 00350 int LetterIsOkay(void* void_dawg_args, 00351 UNICHAR_ID unichar_id, bool word_end) const { 00352 return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end); 00353 } 00354 00355 00357 double (Dict::*probability_in_context_)(const char* lang, 00358 const char* context, 00359 int context_bytes, 00360 const char* character, 00361 int character_bytes); 00363 double ProbabilityInContext(const char* context, 00364 int context_bytes, 00365 const char* character, 00366 int character_bytes) { 00367 return (this->*probability_in_context_)( 00368 getCCUtil()->lang.string(), 00369 context, context_bytes, 00370 character, character_bytes); 00371 } 00372 00374 double def_probability_in_context( 00375 const char* lang, const char* context, int context_bytes, 00376 const char* character, int character_bytes) { 00377 (void) context; 00378 (void) context_bytes; 00379 (void) character; 00380 (void) character_bytes; 00381 return 0.0; 00382 } 00383 double ngram_probability_in_context(const char* lang, 00384 const char* context, 00385 int context_bytes, 00386 const char* character, 00387 int character_bytes); 00388 00389 // Interface with params model. 00390 float (Dict::*params_model_classify_)(const char *lang, void *path); 00391 float ParamsModelClassify(const char *lang, void *path); 00392 // Call params_model_classify_ member function. 00393 float CallParamsModelClassify(void *path) { 00394 ASSERT_HOST(params_model_classify_ != NULL); // ASSERT_HOST -> assert 00395 return (this->*params_model_classify_)( 00396 getCCUtil()->lang.string(), path); 00397 } 00398 00399 inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; } 00400 inline UNICHAR_ID WildcardID() const { 00401 return wildcard_unichar_id_; 00402 } 00404 inline int NumDawgs() const { return dawgs_.size(); } 00406 inline const Dawg *GetDawg(int index) const { return dawgs_[index]; } 00408 inline const Dawg *GetPuncDawg() const { return punc_dawg_; } 00410 inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; } 00412 static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) { 00413 if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg 00414 NODE_REF node = dawg->next_node(edge_ref); 00415 if (node == 0) node = NO_EDGE; // end of word 00416 return node; 00417 } 00418 00419 // Given a unichar from a string and a given dawg, return the unichar 00420 // we should use to match in that dawg type. (for example, in the number 00421 // dawg, all numbers are transformed to kPatternUnicharId). 00422 inline UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const { 00423 if (!dawg) return ch; 00424 switch (dawg->type()) { 00425 case DAWG_TYPE_NUMBER: 00426 return getUnicharset().get_isdigit(ch) ? Dawg::kPatternUnicharID : ch; 00427 default: 00428 return ch; 00429 } 00430 } 00431 00437 void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, 00438 UNICHAR_ID unichar_id, bool word_end, 00439 DawgPositionVector *updated_dawgs, 00440 PermuterType *current_permuter) const; 00441 00445 00447 inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) { 00448 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || 00449 perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM || 00450 perm == USER_PATTERN_PERM || perm == COMPOUND_PERM || 00451 (numbers_ok && perm == NUMBER_PERM)); 00452 } 00453 int valid_word(const WERD_CHOICE &word, bool numbers_ok) const; 00454 int valid_word(const WERD_CHOICE &word) const { 00455 return valid_word(word, false); // return NO_PERM for words with digits 00456 } 00457 int valid_word_or_number(const WERD_CHOICE &word) const { 00458 return valid_word(word, true); // return NUMBER_PERM for valid numbers 00459 } 00461 int valid_word(const char *string) const { 00462 WERD_CHOICE word(string, getUnicharset()); 00463 return valid_word(word); 00464 } 00465 // Do the two WERD_CHOICEs form a meaningful bigram? 00466 bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const; 00471 bool valid_punctuation(const WERD_CHOICE &word); 00473 int good_choice(const WERD_CHOICE &choice); 00475 void add_document_word(const WERD_CHOICE &best_choice); 00477 void adjust_word(WERD_CHOICE *word, 00478 bool nonword, XHeightConsistencyEnum xheight_consistency, 00479 float additional_adjust, 00480 bool modify_rating, 00481 bool debug); 00483 inline void SetWordsegRatingAdjustFactor(float f) { 00484 wordseg_rating_adjust_factor_ = f; 00485 } 00486 00487 private: 00489 CCUtil* ccutil_; 00496 UnicharAmbigs *dang_ambigs_table_; 00498 UnicharAmbigs *replace_ambigs_table_; 00500 FLOAT32 reject_offset_; 00501 // Cached UNICHAR_IDs: 00502 UNICHAR_ID wildcard_unichar_id_; // kDictWildcard. 00503 UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol. 00504 UNICHAR_ID question_unichar_id_; // kQuestionSymbol. 00505 UNICHAR_ID slash_unichar_id_; // kSlashSymbol. 00506 UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol. 00507 // Hyphen-related variables. 00508 WERD_CHOICE *hyphen_word_; 00509 DawgPositionVector hyphen_active_dawgs_; 00510 bool last_word_on_line_; 00511 // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary 00512 // matching. The first member of each list is taken as canonical. For 00513 // example, the first list contains hyphens and dashes with the first symbol 00514 // being the ASCII hyphen minus. 00515 GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_; 00516 // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs. 00517 DawgCache *dawg_cache_; 00518 bool dawg_cache_is_ours_; // we should delete our own dawg_cache_ 00519 // Dawgs. 00520 DawgVector dawgs_; 00521 SuccessorListsVector successors_; 00522 Trie *pending_words_; 00523 // bigram_dawg_ points to a dawg of two-word bigrams which always supercede if 00524 // any of them are present on the best choices list for a word pair. 00525 // the bigrams are stored as space-separated words where: 00526 // (1) leading and trailing punctuation has been removed from each word and 00527 // (2) any digits have been replaced with '?' marks. 00528 Dawg *bigram_dawg_; 00531 // TODO(daria): need to support multiple languages in the future, 00532 // so maybe will need to maintain a list of dawgs of each kind. 00533 Dawg *freq_dawg_; 00534 Dawg *unambig_dawg_; 00535 Dawg *punc_dawg_; 00536 Trie *document_words_; 00539 float wordseg_rating_adjust_factor_; 00540 // File for recording ambiguities discovered during dictionary search. 00541 FILE *output_ambig_words_file_; 00542 00543 public: 00547 STRING_VAR_H(user_words_file, "", "A filename of user-provided words."); 00548 STRING_VAR_H(user_words_suffix, "", 00549 "A suffix of user-provided words located in tessdata."); 00550 STRING_VAR_H(user_patterns_file, "", 00551 "A filename of user-provided patterns."); 00552 STRING_VAR_H(user_patterns_suffix, "", 00553 "A suffix of user-provided patterns located in tessdata."); 00554 BOOL_VAR_H(load_system_dawg, true, "Load system word dawg."); 00555 BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg."); 00556 BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg."); 00557 BOOL_VAR_H(load_punc_dawg, true, 00558 "Load dawg with punctuation patterns."); 00559 BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns."); 00560 BOOL_VAR_H(load_bigram_dawg, true, 00561 "Load dawg with special word bigrams."); 00562 double_VAR_H(xheight_penalty_subscripts, 0.125, 00563 "Score penalty (0.1 = 10%) added if there are subscripts " 00564 "or superscripts in a word, but it is otherwise OK."); 00565 double_VAR_H(xheight_penalty_inconsistent, 0.25, 00566 "Score penalty (0.1 = 10%) added if an xheight is " 00567 "inconsistent."); 00568 double_VAR_H(segment_penalty_dict_frequent_word, 1.0, 00569 "Score multiplier for word matches which have good case and" 00570 "are frequent in the given language (lower is better)."); 00571 00572 double_VAR_H(segment_penalty_dict_case_ok, 1.1, 00573 "Score multiplier for word matches that have good case " 00574 "(lower is better)."); 00575 00576 double_VAR_H(segment_penalty_dict_case_bad, 1.3125, 00577 "Default score multiplier for word matches, which may have " 00578 "case issues (lower is better)."); 00579 00580 // TODO(daria): remove this param when ngram permuter is deprecated. 00581 double_VAR_H(segment_penalty_ngram_best_choice, 1.24, 00582 "Multipler to for the best choice from the ngram model."); 00583 00584 double_VAR_H(segment_penalty_dict_nonword, 1.25, 00585 "Score multiplier for glyph fragment segmentations which " 00586 "do not match a dictionary word (lower is better)."); 00587 00588 double_VAR_H(segment_penalty_garbage, 1.50, 00589 "Score multiplier for poorly cased strings that are not in" 00590 " the dictionary and generally look like garbage (lower is" 00591 " better)."); 00592 STRING_VAR_H(output_ambig_words_file, "", 00593 "Output file for ambiguities found in the dictionary"); 00594 INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info" 00595 ", to 2 for more details, to 3 to see all the debug messages"); 00596 INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words."); 00597 INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list."); 00598 BOOL_VAR_H(use_only_first_uft8_step, false, 00599 "Use only the first UTF8 step of the given string" 00600 " when computing log probabilities."); 00601 double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor"); 00602 double_VAR_H(stopper_nondict_certainty_base, -2.50, 00603 "Certainty threshold for non-dict words"); 00604 double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0, 00605 "Reject certainty offset"); 00606 INT_VAR_H(stopper_smallword_size, 2, 00607 "Size of dict word to be treated as non-dict word"); 00608 double_VAR_H(stopper_certainty_per_char, -0.50, 00609 "Certainty to add for each dict char above small word size."); 00610 double_VAR_H(stopper_allowable_character_badness, 3.0, 00611 "Max certaintly variation allowed in a word (in sigma)"); 00612 INT_VAR_H(stopper_debug_level, 0, "Stopper debug level"); 00613 BOOL_VAR_H(stopper_no_acceptable_choices, false, 00614 "Make AcceptableChoice() always return false. Useful" 00615 " when there is a need to explore all segmentations"); 00616 BOOL_VAR_H(save_raw_choices, false, 00617 "Deprecated- backward compatibility only"); 00618 INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list"); 00619 STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information" 00620 " should be printed to stdout"); 00621 STRING_VAR_H(word_to_debug_lengths, "", 00622 "Lengths of unichars in word_to_debug"); 00623 INT_VAR_H(fragments_debug, 0, "Debug character fragments"); 00624 BOOL_VAR_H(segment_nonalphabetic_script, false, 00625 "Don't use any alphabetic-specific tricks." 00626 "Set to true in the traineddata config file for" 00627 " scripts that are cursive or inherently fixed-pitch"); 00628 BOOL_VAR_H(save_doc_words, 0, "Save Document Words"); 00629 double_VAR_H(doc_dict_pending_threshold, 0.0, 00630 "Worst certainty for using pending dictionary"); 00631 double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty" 00632 " for words that can be inserted into the document dictionary"); 00633 INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different" 00634 " character choices to consider during permutation." 00635 " This limit is especially useful when user patterns" 00636 " are specified, since overly generic patterns can result in" 00637 " dawg search exploring an overly large number of options."); 00638 }; 00639 } // namespace tesseract 00640 00641 #endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_