32 probability_in_context_(&
tesseract::
Dict::def_probability_in_context),
33 params_model_classify_(nullptr),
35 wildcard_unichar_id_(INVALID_UNICHAR_ID),
36 apostrophe_unichar_id_(INVALID_UNICHAR_ID),
37 question_unichar_id_(INVALID_UNICHAR_ID),
38 slash_unichar_id_(INVALID_UNICHAR_ID),
39 hyphen_unichar_id_(INVALID_UNICHAR_ID),
40 STRING_MEMBER(user_words_file,
"",
"A filename of user-provided words.",
41 getCCUtil()->params()),
43 "A suffix of user-provided words located in tessdata.",
44 getCCUtil()->params()),
46 "A filename of user-provided patterns.",
47 getCCUtil()->params()),
49 "A suffix of user-provided patterns located in " 51 getCCUtil()->params()),
53 getCCUtil()->params()),
55 getCCUtil()->params()),
57 getCCUtil()->params()),
59 "Load dawg with punctuation" 61 getCCUtil()->params()),
63 "Load dawg with number" 65 getCCUtil()->params()),
67 "Load dawg with special word " 69 getCCUtil()->params()),
71 "Score penalty (0.1 = 10%) added if there are subscripts " 72 "or superscripts in a word, but it is otherwise OK.",
73 getCCUtil()->params()),
75 "Score penalty (0.1 = 10%) added if an xheight is " 77 getCCUtil()->params()),
79 "Score multiplier for word matches which have good case and" 80 " are frequent in the given language (lower is better).",
81 getCCUtil()->params()),
83 "Score multiplier for word matches that have good case " 85 getCCUtil()->params()),
87 "Default score multiplier for word matches, which may have " 88 "case issues (lower is better).",
89 getCCUtil()->params()),
91 "Score multiplier for glyph fragment segmentations which " 92 "do not match a dictionary word (lower is better).",
93 getCCUtil()->params()),
95 "Score multiplier for poorly cased strings that are not in" 96 " the dictionary and generally look like garbage (lower is" 98 getCCUtil()->params()),
100 "Output file for ambiguities found in the dictionary",
101 getCCUtil()->params()),
103 "Set to 1 for general debug info" 104 ", to 2 for more details, to 3 to see all the debug messages",
105 getCCUtil()->params()),
106 INT_MEMBER(hyphen_debug_level, 0,
"Debug level for hyphenated words.",
107 getCCUtil()->params()),
108 INT_MEMBER(max_viterbi_list_size, 10,
"Maximum size of viterbi list.",
109 getCCUtil()->params()),
111 "Use only the first UTF8 step of the given string" 112 " when computing log probabilities.",
113 getCCUtil()->params()),
114 double_MEMBER(certainty_scale, 20.0,
"Certainty scaling factor",
115 getCCUtil()->params()),
117 "Certainty threshold for non-dict words",
118 getCCUtil()->params()),
119 double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
120 "Reject certainty offset", getCCUtil()->params()),
122 "Size of dict word to be treated as non-dict word",
123 getCCUtil()->params()),
126 " for each dict char above small word size.",
127 getCCUtil()->params()),
129 "Max certaintly variation allowed in a word (in sigma)",
130 getCCUtil()->params()),
131 INT_MEMBER(stopper_debug_level, 0,
"Stopper debug level",
132 getCCUtil()->params()),
134 "Make AcceptableChoice() always return false. Useful" 135 " when there is a need to explore all segmentations",
136 getCCUtil()->params()),
137 INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
138 "Max words to keep in list", getCCUtil()->params()),
140 "Word for which stopper debug" 141 " information should be printed to stdout",
142 getCCUtil()->params()),
144 "Lengths of unichars in word_to_debug",
145 getCCUtil()->params()),
146 INT_MEMBER(fragments_debug, 0,
"Debug character fragments",
147 getCCUtil()->params()),
149 "Don't use any alphabetic-specific tricks." 150 " Set to true in the traineddata config file for" 151 " scripts that are cursive or inherently fixed-pitch",
152 getCCUtil()->params()),
153 BOOL_MEMBER(save_doc_words, 0,
"Save Document Words",
154 getCCUtil()->params()),
156 "Worst certainty for using pending dictionary",
157 getCCUtil()->params()),
159 "Worst certainty for words that can be inserted into the" 160 " document dictionary",
161 getCCUtil()->params()),
163 "Maximum number of different" 164 " character choices to consider during permutation." 165 " This limit is especially useful when user patterns" 166 " are specified, since overly generic patterns can result in" 167 " dawg search exploring an overly large number of options.",
168 getCCUtil()->params()) {
169 dang_ambigs_table_ =
nullptr;
170 replace_ambigs_table_ =
nullptr;
171 reject_offset_ = 0.0;
173 hyphen_word_ =
nullptr;
174 last_word_on_line_ =
false;
175 document_words_ =
nullptr;
176 dawg_cache_ =
nullptr;
177 dawg_cache_is_ours_ =
false;
178 pending_words_ =
nullptr;
179 bigram_dawg_ =
nullptr;
180 freq_dawg_ =
nullptr;
181 punc_dawg_ =
nullptr;
182 unambig_dawg_ =
nullptr;
183 wordseg_rating_adjust_factor_ = -1.0f;
184 output_ambig_words_file_ =
nullptr;
190 if (output_ambig_words_file_ !=
nullptr) fclose(output_ambig_words_file_);
209 if (dawg_cache !=
nullptr) {
210 dawg_cache_ = dawg_cache;
211 dawg_cache_is_ours_ =
false;
214 dawg_cache_is_ours_ =
true;
224 if (punc_dawg_) dawgs_ += punc_dawg_;
229 if (system_dawg) dawgs_ += system_dawg;
234 if (number_dawg) dawgs_ += number_dawg;
245 if (freq_dawg_) dawgs_ += freq_dawg_;
250 if (unambig_dawg_) dawgs_ += unambig_dawg_;
292 dawgs_ += document_words_;
305 if (punc_dawg_) dawgs_ += punc_dawg_;
310 if (system_dawg) dawgs_ += system_dawg;
315 if (number_dawg) dawgs_ += number_dawg;
361 if (dawgs_.
empty())
return false;
366 for (
int i = 0; i < dawgs_.
length(); ++i) {
367 const Dawg* dawg = dawgs_[i];
369 for (
int j = 0; j < dawgs_.
length(); ++j) {
370 const Dawg* other = dawgs_[j];
371 if (dawg !=
nullptr && other !=
nullptr &&
373 kDawgSuccessors[dawg->
type()][other->
type()])
382 if (dawgs_.
length() == 0)
return;
383 for (
int i = 0; i < dawgs_.
size(); i++) {
384 if (!dawg_cache_->
FreeDawg(dawgs_[i])) {
388 dawg_cache_->
FreeDawg(bigram_dawg_);
389 if (dawg_cache_is_ours_) {
391 dawg_cache_ =
nullptr;
396 document_words_ =
nullptr;
397 delete pending_words_;
398 pending_words_ =
nullptr;
406 auto* dawg_args =
static_cast<DawgArgs*
>(void_dawg_args);
412 "def_letter_is_okay: current unichar=%s word_end=%d" 413 " num active dawgs=%d\n",
415 dawg_args->active_dawgs->length());
422 unichar_id == INVALID_UNICHAR_ID) {
429 dawg_args->updated_dawgs->clear();
430 dawg_args->valid_end =
false;
435 for (
int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
436 const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
437 const Dawg* punc_dawg =
441 if (!dawg && !punc_dawg) {
443 tprintf(
"Received DawgPosition with no dawg or punc_dawg. wth?\n");
451 if (punc_transition_edge != NO_EDGE) {
454 for (
int s = 0; s < slist.
length(); ++s) {
455 int sdawg_index = slist[s];
456 const Dawg* sdawg = dawgs_[sdawg_index];
459 if (dawg_edge != NO_EDGE) {
461 tprintf(
"Letter found in dawg %d\n", sdawg_index);
463 dawg_args->updated_dawgs->add_unique(
465 punc_transition_edge,
false),
467 "Append transition from punc dawg to current dawgs: ");
471 dawg_args->valid_end =
true;
476 punc_dawg->
edge_char_of(punc_node, unichar_id, word_end);
477 if (punc_edge != NO_EDGE) {
479 tprintf(
"Letter found in punctuation dawg\n");
481 dawg_args->updated_dawgs->add_unique(
485 if (punc_dawg->
end_of_word(punc_edge)) dawg_args->valid_end =
true;
497 : punc_dawg->
edge_char_of(punc_node, unichar_id, word_end);
498 if (punc_edge != NO_EDGE) {
499 dawg_args->updated_dawgs->add_unique(
504 if (punc_dawg->
end_of_word(punc_edge)) dawg_args->valid_end =
true;
527 node,
char_for_dawg(unicharset, unichar_id, dawg), word_end);
534 if (edge != NO_EDGE) {
540 tprintf(
"Punctuation constraint not satisfied at end of word.\n");
547 dawg_args->valid_end =
true;
548 dawg_args->updated_dawgs->add_unique(
552 "Append current dawg to updated active dawgs: ");
561 dawg_args->permuter = curr_perm;
564 tprintf(
"Returning %d for permuter code for this character.\n",
565 dawg_args->permuter);
567 return dawg_args->permuter;
578 unichar_id_patterns.
push_back(unichar_id);
580 &unichar_id_patterns);
581 for (
int i = 0; i < unichar_id_patterns.
size(); ++i) {
584 for (
int k = 0; k < 2; ++k) {
586 (k == 0) ? dawg->
edge_char_of(node, unichar_id_patterns[i], word_end)
588 unichar_id_patterns[i], word_end);
589 if (edge == NO_EDGE)
continue;
601 "Append current dawg to updated active dawgs: ");
610 bool ambigs_mode)
const {
613 *active_dawgs = hyphen_active_dawgs_;
615 for (i = 0; i < hyphen_active_dawgs_.
size(); ++i) {
617 hyphen_active_dawgs_[i].dawg_index,
618 hyphen_active_dawgs_[i].dawg_ref);
627 bool suppress_patterns)
const {
628 bool punc_dawg_available =
629 (punc_dawg_ !=
nullptr) &&
632 for (
int i = 0; i < dawgs_.
length(); i++) {
633 if (dawgs_[i] !=
nullptr &&
635 int dawg_ty = dawgs_[i]->type();
638 *dawg_pos_vec +=
DawgPosition(-1, NO_EDGE, i, NO_EDGE,
false);
643 }
else if (!punc_dawg_available || !subsumed_by_punc) {
644 *dawg_pos_vec +=
DawgPosition(i, NO_EDGE, -1, NO_EDGE,
false);
660 if (hyphen_word_)
return;
662 int stringlen = best_choice.
length();
664 if (
valid_word(best_choice) || stringlen < 2)
return;
667 if (best_choice.
length() >= kDocDictMaxRepChars) {
668 int num_rep_chars = 1;
670 for (
int i = 1; i < best_choice.
length(); ++i) {
676 if (num_rep_chars == kDocDictMaxRepChars)
return;
699 FILE* doc_word_file = fopen(filename.
string(),
"a");
700 if (doc_word_file ==
nullptr) {
701 tprintf(
"Error: Could not open file %s\n", filename.
string());
705 fclose(doc_word_file);
712 float additional_adjust,
bool modify_rating,
716 bool case_is_ok = (is_han ||
case_ok(*word));
719 float adjust_factor = additional_adjust;
720 float new_rating = word->
rating();
721 new_rating += kRatingPad;
722 const char* xheight_triggered =
"";
725 switch (xheight_consistency) {
728 xheight_triggered =
", xhtBAD";
732 xheight_triggered =
", xhtSUB";
742 tprintf(
"Consistency could not be calculated.\n");
746 tprintf(
"%sWord: %s %4.2f%s", nonword ?
"Non-" :
"",
751 if (case_is_ok && punc_is_ok) {
753 new_rating *= adjust_factor;
757 new_rating *= adjust_factor;
759 if (!case_is_ok)
tprintf(
", C");
760 if (!punc_is_ok)
tprintf(
", P");
765 if (!is_han && freq_dawg_ !=
nullptr && freq_dawg_->
word_in_dawg(*word)) {
768 new_rating *= adjust_factor;
772 new_rating *= adjust_factor;
777 new_rating *= adjust_factor;
781 new_rating -= kRatingPad;
782 if (modify_rating) word->
set_rating(new_rating);
783 if (debug)
tprintf(
" %4.2f --> %4.2f\n", adjust_factor, new_rating);
793 word_ptr = &temp_word;
801 int last_index = word_ptr->
length() - 1;
816 delete[] active_dawgs;
824 if (bigram_dawg_ ==
nullptr)
return false;
828 int w1start, w1end, w2start, w2end;
834 if (w1start >= w1end)
return word1.
length() < 3;
835 if (w2start >= w2end)
return word2.
length() < 3;
839 bigram_string.
reserve(w1end + w2end + 1);
840 for (
int i = w1start; i < w1end; i++) {
844 bigram_string.
push_back(question_unichar_id_);
846 bigram_string += normed_ids;
849 for (
int i = w2start; i < w2end; i++) {
853 bigram_string.
push_back(question_unichar_id_);
855 bigram_string += normed_ids;
858 for (
int i = 0; i < bigram_string.
size(); ++i) {
869 int last_index = word.
length() - 1;
871 for (i = 0; i <= last_index; ++i) {
874 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
878 }
else if ((new_len = new_word.length()) == 0 ||
883 for (i = 0; i < dawgs_.
size(); ++i) {
885 dawgs_[i]->word_in_dawg(new_word))
894 if (u_set.
han_sid() > 0)
return false;
896 if (u_set.
thai_sid() > 0)
return false;
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
const STRING & lang() const
void delete_data_pointers()
bool get_ispunctuation(UNICHAR_ID unichar_id) const
double xheight_penalty_subscripts
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
#define STRING_INIT_MEMBER(name, val, comment, vec)
static const UNICHAR_ID kPatternUnicharID
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
char * user_patterns_suffix
DawgPositionVector * updated_dawgs
#define double_MEMBER(name, val, comment, vec)
double doc_dict_certainty_threshold
virtual bool end_of_word(EDGE_REF edge_ref) const =0
#define BOOL_MEMBER(name, val, comment, vec)
void set_rating(float new_val)
const STRING & unichar_string() const
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
DawgPositionVector * active_dawgs
bool valid_punctuation(const WERD_CHOICE &word)
#define INT_MEMBER(name, val, comment, vec)
const CCUtil * getCCUtil() const
void LoadLSTM(const STRING &lang, TessdataManager *data_file)
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
void SetupForLoad(DawgCache *dawg_cache)
void punct_stripped(int *start_core, int *end_core) const
PermuterType permuter() const
int GetTopScriptID() const
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
void set_permuter(uint8_t perm)
bool get_isdigit(UNICHAR_ID unichar_id) const
double segment_penalty_garbage
void copy_hyphen_info(WERD_CHOICE *word) const
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
void Load(const STRING &lang, TessdataManager *data_file)
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
bool FreeDawg(Dawg *dawg)
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
char * user_patterns_file
const char * string() const
DLLSYM void tprintf(const char *format,...)
double xheight_penalty_inconsistent
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
double segment_penalty_dict_case_bad
UNICHAR_ID unichar_id(int index) const
bool IsSpaceDelimitedLang() const
Returns true if the language is space-delimited (not CJ, or T).
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
GenericVector< int > SuccessorList
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
const UNICHARSET & getUnicharset() const
#define STRING_MEMBER(name, val, comment, vec)
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
double segment_penalty_dict_frequent_word
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
void initialize_patterns(UNICHARSET *unicharset)
static TESS_API DawgCache * GlobalDawgCache()
double doc_dict_pending_threshold
double segment_penalty_dict_case_ok
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
bool get_isupper(UNICHAR_ID unichar_id) const
const STRING debug_string() const
bool contains_unichar_id(UNICHAR_ID unichar_id) const
void set_adjust_factor(float factor)
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
#define BOOL_INIT_MEMBER(name, val, comment, vec)
double segment_penalty_dict_nonword
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
const UNICHARSET * unicharset() const
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
STRING language_data_path_prefix
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const