19 #ifndef TESSERACT_DICT_DICT_H_ 20 #define TESSERACT_DICT_DICT_H_ 34 #define CHARS_PER_LINE 500 35 #define MAX_WERD_LENGTH (int64_t) 128 54 static const int kRatingPad = 4;
55 static const char kDictWildcard[] =
"\u2606";
56 static const int kDictMaxWildcards = 2;
59 static const char kHyphenSymbol[] =
"-";
60 static const char kSlashSymbol[] =
"/";
61 static const char kQuestionSymbol[] =
"?";
62 static const char kApostropheSymbol[] =
"'";
63 static const float kSimCertaintyScale = -10.0;
64 static const float kSimCertaintyOffset = -10.0;
65 static const float kSimilarityFloor = 100.0;
66 static const int kDocDictMaxRepChars = 4;
78 : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}
98 return getCCUtil()->unicharset;
101 return getCCUtil()->unicharset;
104 return getCCUtil()->unichar_ambigs;
109 const UNICHARSET& unicharset = getUnicharset();
113 return normed_ids.
size() == 1 &&
114 (normed_ids[0] == hyphen_unichar_id_ ||
115 normed_ids[0] == slash_unichar_id_);
120 const UNICHARSET& unicharset = getUnicharset();
124 return normed_ids.
size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
131 !last_word_on_line_ && hyphen_word_;
135 return this->hyphenated() ? hyphen_word_->length() : 0;
141 if (this->hyphenated()) {
142 *word = *hyphen_word_;
143 if (hyphen_debug_level) word->
print(
"copy_hyphen_info: ");
149 if (!last_word_on_line_ || first_pos)
154 return normed_ids.
size() == 1 && normed_ids[0] == hyphen_unichar_id_;
158 int word_index = word.
length() - 1;
165 void reset_hyphen_vars(
bool last_word_on_line);
187 bool ambigs_mode)
const;
190 bool suppress_patterns)
const;
203 void go_deeper_dawg_fxn(
206 bool word_ending,
WERD_CHOICE *word,
float certainties[],
207 float *limit,
WERD_CHOICE *best_choice,
int *attempts_left,
208 void *void_more_args);
211 void (
Dict::*go_deeper_fxn_)(
const char *debug,
213 int char_choice_index,
216 float certainties[],
float *limit,
218 void *void_more_args);
222 void permute_choices(
225 int char_choice_index,
238 int char_choice_index,
247 bool fragment_state_okay(
UNICHAR_ID curr_unichar_id,
248 float curr_rating,
float curr_certainty,
250 const char *debug,
int word_ending,
256 bool fix_replaceable,
265 void ReplaceAmbig(
int wrong_ngram_begin_index,
int wrong_ngram_size,
270 int LengthOfShortestAlphaRun(
const WERD_CHOICE &WordChoice)
const;
280 bool AcceptableChoice(
const WERD_CHOICE& best_choice,
285 bool AcceptableResult(
WERD_RES *word)
const;
286 void EndDangerousAmbigs();
288 void DebugWordChoices();
290 void SettupStopperPass1();
292 void SettupStopperPass2();
306 void SetupForLoad(
DawgCache *dawg_cache);
318 if (pending_words_ !=
nullptr)
319 pending_words_->clear();
320 if (document_words_ !=
nullptr)
321 document_words_->clear();
360 int def_letter_is_okay(
void* void_dawg_args,
const UNICHARSET& unicharset,
363 int (
Dict::*letter_is_okay_)(
void* void_dawg_args,
369 return (this->*letter_is_okay_)(void_dawg_args,
375 double (
Dict::*probability_in_context_)(
const char* lang,
379 int character_bytes);
383 const char* character,
384 int character_bytes) {
385 return (this->*probability_in_context_)(
386 getCCUtil()->lang.string(),
387 context, context_bytes,
393 const char* lang,
const char* context,
int context_bytes,
394 const char* character,
int character_bytes) {
399 (void)character_bytes;
402 double ngram_probability_in_context(
const char* lang,
405 const char* character,
406 int character_bytes);
409 float (
Dict::*params_model_classify_)(
const char *lang,
void *path);
410 float ParamsModelClassify(
const char *lang,
void *path);
414 return (this->*params_model_classify_)(
415 getCCUtil()->lang.string(), path);
421 inline int NumDawgs()
const {
return dawgs_.size(); }
423 inline const Dawg *
GetDawg(
int index)
const {
return dawgs_[index]; }
430 if (edge_ref == NO_EDGE)
return 0;
432 if (node == 0) node = NO_EDGE;
440 const Dawg *dawg)
const {
441 if (!dawg)
return ch;
442 switch (dawg->
type()) {
444 return unicharset.
get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
471 int valid_word(
const WERD_CHOICE &word,
bool numbers_ok)
const;
473 return valid_word(word,
false);
476 return valid_word(word,
true);
481 return valid_word(word);
493 void add_document_word(
const WERD_CHOICE &best_choice);
497 float additional_adjust,
502 wordseg_rating_adjust_factor_ = f;
505 bool IsSpaceDelimitedLang()
const;
520 float reject_offset_;
530 bool last_word_on_line_;
538 bool dawg_cache_is_ours_;
542 Trie *pending_words_;
556 Trie *document_words_;
559 float wordseg_rating_adjust_factor_;
561 FILE *output_ambig_words_file_;
567 STRING_VAR_H(user_words_file,
"",
"A filename of user-provided words.");
569 "A suffix of user-provided words located in tessdata.");
571 "A filename of user-provided patterns.");
573 "A suffix of user-provided patterns located in tessdata.");
574 BOOL_VAR_H(load_system_dawg,
true,
"Load system word dawg.");
575 BOOL_VAR_H(load_freq_dawg,
true,
"Load frequent word dawg.");
576 BOOL_VAR_H(load_unambig_dawg,
true,
"Load unambiguous word dawg.");
578 "Load dawg with punctuation patterns.");
579 BOOL_VAR_H(load_number_dawg,
true,
"Load dawg with number patterns.");
581 "Load dawg with special word bigrams.");
583 "Score penalty (0.1 = 10%) added if there are subscripts " 584 "or superscripts in a word, but it is otherwise OK.");
586 "Score penalty (0.1 = 10%) added if an xheight is " 589 "Score multiplier for word matches which have good case and" 590 "are frequent in the given language (lower is better).");
593 "Score multiplier for word matches that have good case " 594 "(lower is better).");
597 "Default score multiplier for word matches, which may have " 598 "case issues (lower is better).");
601 "Score multiplier for glyph fragment segmentations which " 602 "do not match a dictionary word (lower is better).");
605 "Score multiplier for poorly cased strings that are not in" 606 " the dictionary and generally look like garbage (lower is" 609 "Output file for ambiguities found in the dictionary");
610 INT_VAR_H(dawg_debug_level, 0,
"Set to 1 for general debug info" 611 ", to 2 for more details, to 3 to see all the debug messages");
612 INT_VAR_H(hyphen_debug_level, 0,
"Debug level for hyphenated words.");
613 INT_VAR_H(max_viterbi_list_size, 10,
"Maximum size of viterbi list.");
615 "Use only the first UTF8 step of the given string" 616 " when computing log probabilities.");
619 "Certainty threshold for non-dict words");
620 double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
621 "Reject certainty offset");
623 "Size of dict word to be treated as non-dict word");
625 "Certainty to add for each dict char above small word size.");
627 "Max certaintly variation allowed in a word (in sigma)");
628 INT_VAR_H(stopper_debug_level, 0,
"Stopper debug level");
629 BOOL_VAR_H(stopper_no_acceptable_choices,
false,
630 "Make AcceptableChoice() always return false. Useful" 631 " when there is a need to explore all segmentations");
632 INT_VAR_H(tessedit_truncate_wordchoice_log, 10,
"Max words to keep in list");
633 STRING_VAR_H(word_to_debug,
"",
"Word for which stopper debug information" 634 " should be printed to stdout");
636 "Lengths of unichars in word_to_debug");
637 INT_VAR_H(fragments_debug, 0,
"Debug character fragments");
638 BOOL_VAR_H(segment_nonalphabetic_script,
false,
639 "Don't use any alphabetic-specific tricks." 640 "Set to true in the traineddata config file for" 641 " scripts that are cursive or inherently fixed-pitch");
644 "Worst certainty for using pending dictionary");
645 double_VAR_H(doc_dict_certainty_threshold, -2.25,
"Worst certainty" 646 " for words that can be inserted into the document dictionary");
647 INT_VAR_H(max_permuter_attempts, 10000,
"Maximum number of different" 648 " character choices to consider during permutation." 649 " This limit is especially useful when user patterns" 650 " are specified, since overly generic patterns can result in" 651 " dawg search exploring an overly large number of options.");
655 #endif // TESSERACT_DICT_DICT_H_ #define BOOL_VAR_H(name, val, comment)
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as the has_hyphen_end overload that takes a single unichar id, but checks the unichar at the end of the given word.
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
#define STRING_VAR_H(name, val, comment)
bool compound_marker(UNICHAR_ID unichar_id)
UNICHAR_ID WildcardID() const
void SetWildcardID(UNICHAR_ID id)
DawgPositionVector * updated_dawgs
void ResetDocumentDictionary()
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
#define INT_VAR_H(name, val, comment)
DawgPositionVector * active_dawgs
const UnicharAmbigs & getUnicharAmbigs() const
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
const CCUtil * getCCUtil() const
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
#define double_VAR_H(name, val, comment)
bool is_apostrophe(UNICHAR_ID unichar_id)
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
const Dawg * GetDawg(int index) const
Return the i-th dawg pointer recorded in the dawgs_ vector.
float CallParamsModelClassify(void *path)
bool get_isdigit(UNICHAR_ID unichar_id) const
int valid_word(const WERD_CHOICE &word) const
void copy_hyphen_info(WERD_CHOICE *word) const
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
UNICHAR_ID unichar_id(int index) const
const UNICHARSET & getUnicharset() const
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
int valid_word_or_number(const WERD_CHOICE &word) const
const CHAR_FRAGMENT * fragment
bool contains_unichar_id(UNICHAR_ID unichar_id) const
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
UNICHARSET & getUnicharset()
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
const UNICHARSET * unicharset() const
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.