tesseract  4.1.0
tesseract::Dict Class Reference

#include <dict.h>

Public Member Functions

 Dict (CCUtil *image_ptr)
 
 ~Dict ()
 
const CCUtil * getCCUtil () const
 
CCUtil * getCCUtil ()
 
const UNICHARSET & getUnicharset () const
 
UNICHARSET & getUnicharset ()
 
const UnicharAmbigs & getUnicharAmbigs () const
 
bool compound_marker (UNICHAR_ID unichar_id)
 
bool is_apostrophe (UNICHAR_ID unichar_id)
 
bool hyphenated () const
 Returns true if we've recorded the beginning of a hyphenated word. More...
 
int hyphen_base_size () const
 Size of the base word (the part on the line before) of a hyphenated word. More...
 
void copy_hyphen_info (WERD_CHOICE *word) const
 
bool has_hyphen_end (const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
 Check whether the word has a hyphen at the end. More...
 
bool has_hyphen_end (const WERD_CHOICE &word) const
 Same as above, but check the unichar at the end of the word. More...
 
void reset_hyphen_vars (bool last_word_on_line)
 
void set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
 
void update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)
 
void init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const
 
void default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
 
bool NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
 
void ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
 
int LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice) const
 Returns the length of the shortest alpha run in WordChoice. More...
 
int UniformCertainties (const WERD_CHOICE &word)
 
bool AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
 Returns true if the given best_choice is good enough to stop. More...
 
bool AcceptableResult (WERD_RES *word) const
 
void EndDangerousAmbigs ()
 
void DebugWordChoices ()
 Prints the current choices for this word to stdout. More...
 
void SettupStopperPass1 ()
 Sets up stopper variables in preparation for the first pass. More...
 
void SettupStopperPass2 ()
 Sets up stopper variables in preparation for the second pass. More...
 
int case_ok (const WERD_CHOICE &word) const
 Check a string to see if it matches a set of lexical rules. More...
 
bool absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 
void SetupForLoad (DawgCache *dawg_cache)
 
void Load (const STRING &lang, TessdataManager *data_file)
 
void LoadLSTM (const STRING &lang, TessdataManager *data_file)
 
bool FinishLoad ()
 
void End ()
 
void ResetDocumentDictionary ()
 
int def_letter_is_okay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
int LetterIsOkay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 Calls letter_is_okay_ member function. More...
 
double ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
 Calls probability_in_context_ member function. More...
 
double def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Default (no-op) implementation of probability in context function. More...
 
double ngram_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 
float ParamsModelClassify (const char *lang, void *path)
 
float CallParamsModelClassify (void *path)
 
void SetWildcardID (UNICHAR_ID id)
 
UNICHAR_ID WildcardID () const
 
int NumDawgs () const
 Return the number of dawgs in the dawgs_ vector. More...
 
const Dawg * GetDawg (int index) const
 Return i-th dawg pointer recorded in the dawgs_ vector. More...
 
const Dawg * GetPuncDawg () const
 Return the pointer to the punctuation dawg. More...
 
const Dawg * GetUnambigDawg () const
 Return the pointer to the unambiguous words dawg. More...
 
UNICHAR_ID char_for_dawg (const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
 
void ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
 
int valid_word (const WERD_CHOICE &word, bool numbers_ok) const
 
int valid_word (const WERD_CHOICE &word) const
 
int valid_word_or_number (const WERD_CHOICE &word) const
 
int valid_word (const char *string) const
 This function is used by api/tesseract_cube_combiner.cpp. More...
 
bool valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
 
bool valid_punctuation (const WERD_CHOICE &word)
 
int good_choice (const WERD_CHOICE &choice)
 Returns true if a good answer is found for the unknown blob rating. More...
 
void add_document_word (const WERD_CHOICE &best_choice)
 Adds a word found on this document to the document specific dictionary. More...
 
void adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
 Adjusts the rating of the given word. More...
 
void SetWordsegRatingAdjustFactor (float f)
 Set wordseg_rating_adjust_factor_ to the given value. More...
 
bool IsSpaceDelimitedLang () const
 Returns true if the language is space-delimited (i.e. not Chinese/Japanese or Thai). More...
 
go_deeper_dawg_fxn

If the choice being composed so far could be a dictionary word, keep exploring choices.

WERD_CHOICE * dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
void go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 
void permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
void append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
fragment_state

Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated.

The given prev_char_frag_info contains:

  • fragment: if not nullptr contains information about immediately preceding fragmented character choice
  • num_fragments: number of fragments that have been used so far to construct a character
  • certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far
  • rating: rating of the current choice or sum of fragment ratings concatenated so far

The output char_frag_info is filled in as follows:

  • character: is set to be nullptr if the choice is a non-matching or non-ending fragment piece; is set to unichar of the given choice if it represents a regular character or a matching ending fragment
  • fragment,num_fragments,certainty,rating are set as described above
Returns
false if a non-matching fragment is discovered, true otherwise.
bool fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
 

Static Public Member Functions

static TESS_API DawgCache * GlobalDawgCache ()
 
static NODE_REF GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
 Returns the appropriate next node given the EDGE_REF. More...
 
static bool valid_word_permuter (uint8_t perm, bool numbers_ok)
 Check all the DAWGs to see if this word is in any of them. More...
 

Public Attributes

void(Dict::* go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 Pointer to go_deeper function. More...
 
int(Dict::* letter_is_okay_ )(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
double(Dict::* probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Probability in context function used by the ngram permuter. More...
 
float(Dict::* params_model_classify_ )(const char *lang, void *path)
 
char * user_words_file = ""
 
char * user_words_suffix = ""
 
char * user_patterns_file = ""
 
char * user_patterns_suffix = ""
 
bool load_system_dawg = true
 
bool load_freq_dawg = true
 
bool load_unambig_dawg = true
 
bool load_punc_dawg = true
 
bool load_number_dawg = true
 
bool load_bigram_dawg = true
 
double xheight_penalty_subscripts = 0.125
 
double xheight_penalty_inconsistent = 0.25
 
double segment_penalty_dict_frequent_word = 1.0
 
double segment_penalty_dict_case_ok = 1.1
 
double segment_penalty_dict_case_bad = 1.3125
 
double segment_penalty_dict_nonword = 1.25
 
double segment_penalty_garbage = 1.50
 
char * output_ambig_words_file = ""
 
int dawg_debug_level = 0
 
int hyphen_debug_level = 0
 
int max_viterbi_list_size = 10
 
bool use_only_first_uft8_step = false
 
double certainty_scale = 20.0
 
double stopper_nondict_certainty_base = -2.50
 
double stopper_phase2_certainty_rejection_offset = 1.0
 
int stopper_smallword_size = 2
 
double stopper_certainty_per_char = -0.50
 
double stopper_allowable_character_badness = 3.0
 
int stopper_debug_level = 0
 
bool stopper_no_acceptable_choices = false
 
int tessedit_truncate_wordchoice_log = 10
 
char * word_to_debug = ""
 
char * word_to_debug_lengths = ""
 
int fragments_debug = 0
 
bool segment_nonalphabetic_script = false
 
bool save_doc_words = 0
 
double doc_dict_pending_threshold = 0.0
 
double doc_dict_certainty_threshold = -2.25
 
int max_permuter_attempts = 10000
 

Detailed Description

Definition at line 87 of file dict.h.

Constructor & Destructor Documentation

tesseract::Dict::Dict ( CCUtil * image_ptr)

Definition at line 30 of file dict.cpp.

33  params_model_classify_(nullptr),
34  ccutil_(ccutil),
35  wildcard_unichar_id_(INVALID_UNICHAR_ID),
36  apostrophe_unichar_id_(INVALID_UNICHAR_ID),
37  question_unichar_id_(INVALID_UNICHAR_ID),
38  slash_unichar_id_(INVALID_UNICHAR_ID),
39  hyphen_unichar_id_(INVALID_UNICHAR_ID),
40  STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
41  getCCUtil()->params()),
43  "A suffix of user-provided words located in tessdata.",
44  getCCUtil()->params()),
46  "A filename of user-provided patterns.",
47  getCCUtil()->params()),
49  "A suffix of user-provided patterns located in "
50  "tessdata.",
51  getCCUtil()->params()),
52  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
53  getCCUtil()->params()),
54  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
55  getCCUtil()->params()),
56  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
57  getCCUtil()->params()),
59  "Load dawg with punctuation"
60  " patterns.",
61  getCCUtil()->params()),
63  "Load dawg with number"
64  " patterns.",
65  getCCUtil()->params()),
67  "Load dawg with special word "
68  "bigrams.",
69  getCCUtil()->params()),
71  "Score penalty (0.1 = 10%) added if there are subscripts "
72  "or superscripts in a word, but it is otherwise OK.",
73  getCCUtil()->params()),
75  "Score penalty (0.1 = 10%) added if an xheight is "
76  "inconsistent.",
77  getCCUtil()->params()),
79  "Score multiplier for word matches which have good case and"
80  " are frequent in the given language (lower is better).",
81  getCCUtil()->params()),
83  "Score multiplier for word matches that have good case "
84  "(lower is better).",
85  getCCUtil()->params()),
87  "Default score multiplier for word matches, which may have "
88  "case issues (lower is better).",
89  getCCUtil()->params()),
91  "Score multiplier for glyph fragment segmentations which "
92  "do not match a dictionary word (lower is better).",
93  getCCUtil()->params()),
95  "Score multiplier for poorly cased strings that are not in"
96  " the dictionary and generally look like garbage (lower is"
97  " better).",
98  getCCUtil()->params()),
100  "Output file for ambiguities found in the dictionary",
101  getCCUtil()->params()),
103  "Set to 1 for general debug info"
104  ", to 2 for more details, to 3 to see all the debug messages",
105  getCCUtil()->params()),
106  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
107  getCCUtil()->params()),
108  INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
109  getCCUtil()->params()),
111  "Use only the first UTF8 step of the given string"
112  " when computing log probabilities.",
113  getCCUtil()->params()),
114  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
115  getCCUtil()->params()),
117  "Certainty threshold for non-dict words",
118  getCCUtil()->params()),
120  "Reject certainty offset", getCCUtil()->params()),
122  "Size of dict word to be treated as non-dict word",
123  getCCUtil()->params()),
125  "Certainty to add"
126  " for each dict char above small word size.",
127  getCCUtil()->params()),
129  "Max certaintly variation allowed in a word (in sigma)",
130  getCCUtil()->params()),
131  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
132  getCCUtil()->params()),
134  "Make AcceptableChoice() always return false. Useful"
135  " when there is a need to explore all segmentations",
136  getCCUtil()->params()),
138  "Max words to keep in list", getCCUtil()->params()),
140  "Word for which stopper debug"
141  " information should be printed to stdout",
142  getCCUtil()->params()),
144  "Lengths of unichars in word_to_debug",
145  getCCUtil()->params()),
146  INT_MEMBER(fragments_debug, 0, "Debug character fragments",
147  getCCUtil()->params()),
149  "Don't use any alphabetic-specific tricks."
150  " Set to true in the traineddata config file for"
151  " scripts that are cursive or inherently fixed-pitch",
152  getCCUtil()->params()),
153  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
154  getCCUtil()->params()),
156  "Worst certainty for using pending dictionary",
157  getCCUtil()->params()),
159  "Worst certainty for words that can be inserted into the"
160  " document dictionary",
161  getCCUtil()->params()),
163  "Maximum number of different"
164  " character choices to consider during permutation."
165  " This limit is especially useful when user patterns"
166  " are specified, since overly generic patterns can result in"
167  " dawg search exploring an overly large number of options.",
168  getCCUtil()->params()) {
169  dang_ambigs_table_ = nullptr;
170  replace_ambigs_table_ = nullptr;
171  reject_offset_ = 0.0;
172  go_deeper_fxn_ = nullptr;
173  hyphen_word_ = nullptr;
174  last_word_on_line_ = false;
175  document_words_ = nullptr;
176  dawg_cache_ = nullptr;
177  dawg_cache_is_ours_ = false;
178  pending_words_ = nullptr;
179  bigram_dawg_ = nullptr;
180  freq_dawg_ = nullptr;
181  punc_dawg_ = nullptr;
182  unambig_dawg_ = nullptr;
183  wordseg_rating_adjust_factor_ = -1.0f;
184  output_ambig_words_file_ = nullptr;
185 }
bool load_number_dawg
Definition: dict.h:579
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:375
int fragments_debug
Definition: dict.h:637
int hyphen_debug_level
Definition: dict.h:612
double xheight_penalty_subscripts
Definition: dict.h:584
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:409
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:333
char * word_to_debug
Definition: dict.h:634
char * user_patterns_suffix
Definition: dict.h:573
bool load_unambig_dawg
Definition: dict.h:576
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:324
bool segment_nonalphabetic_script
Definition: dict.h:641
bool load_punc_dawg
Definition: dict.h:578
double doc_dict_certainty_threshold
Definition: dict.h:646
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:318
char * output_ambig_words_file
Definition: dict.h:609
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:404
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:315
bool save_doc_words
Definition: dict.h:642
const CCUtil * getCCUtil() const
Definition: dict.h:91
bool stopper_no_acceptable_choices
Definition: dict.h:631
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:392
int tessedit_truncate_wordchoice_log
Definition: dict.h:632
double stopper_allowable_character_badness
Definition: dict.h:627
int stopper_debug_level
Definition: dict.h:628
double segment_penalty_garbage
Definition: dict.h:607
int max_permuter_attempts
Definition: dict.h:651
int stopper_smallword_size
Definition: dict.h:623
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:211
double stopper_nondict_certainty_base
Definition: dict.h:619
char * user_patterns_file
Definition: dict.h:571
double xheight_penalty_inconsistent
Definition: dict.h:587
char * user_words_suffix
Definition: dict.h:569
bool load_freq_dawg
Definition: dict.h:575
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:363
int max_viterbi_list_size
Definition: dict.h:613
char * word_to_debug_lengths
Definition: dict.h:636
char * user_words_file
Definition: dict.h:567
double segment_penalty_dict_case_bad
Definition: dict.h:598
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:321
double segment_penalty_dict_frequent_word
Definition: dict.h:590
bool load_system_dawg
Definition: dict.h:574
double doc_dict_pending_threshold
Definition: dict.h:644
double segment_penalty_dict_case_ok
Definition: dict.h:594
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:621
bool use_only_first_uft8_step
Definition: dict.h:616
double stopper_certainty_per_char
Definition: dict.h:625
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:330
double segment_penalty_dict_nonword
Definition: dict.h:602
int dawg_debug_level
Definition: dict.h:611
double certainty_scale
Definition: dict.h:617
bool load_bigram_dawg
Definition: dict.h:581
tesseract::Dict::~Dict ( )

Definition at line 187 of file dict.cpp.

187  {
188  End();
189  delete hyphen_word_;
190  if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_);
191 }
void End()
Definition: dict.cpp:381

Member Function Documentation

bool tesseract::Dict::absolute_garbage ( const WERD_CHOICE & word,
const UNICHARSET & unicharset 
)

Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).

Definition at line 65 of file context.cpp.

66  {
67  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
68  int num_alphanum = 0;
69  for (int x = 0; x < word.length(); ++x) {
70  num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
71  unicharset.get_isdigit(word.unichar_id(x)));
72  }
73  return (static_cast<float>(num_alphanum) /
74  static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
75 }
int length() const
Definition: ratngs.h:303
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
bool tesseract::Dict::AcceptableChoice ( const WERD_CHOICE & best_choice,
XHeightConsistencyEnum  xheight_consistency 
)

Returns true if the given best_choice is good enough to stop.

Definition at line 40 of file stopper.cpp.

41  {
42  float CertaintyThreshold = stopper_nondict_certainty_base;
43  int WordSize;
44 
45  if (stopper_no_acceptable_choices) return false;
46 
47  if (best_choice.length() == 0) return false;
48 
49  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
50  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
51  bool is_case_ok = case_ok(best_choice);
52 
53  if (stopper_debug_level >= 1) {
54  const char *xht = "UNKNOWN";
55  switch (xheight_consistency) {
56  case XH_GOOD: xht = "NORMAL"; break;
57  case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
58  case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
59  default: xht = "UNKNOWN";
60  }
61  tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
62  best_choice.unichar_string().string(),
63  (is_valid_word ? 'y' : 'n'),
64  (is_case_ok ? 'y' : 'n'),
65  xht,
66  best_choice.min_x_height(),
67  best_choice.max_x_height());
68  }
69  // Do not accept invalid words in PASS1.
70  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
71  if (is_valid_word && is_case_ok) {
72  WordSize = LengthOfShortestAlphaRun(best_choice);
73  WordSize -= stopper_smallword_size;
74  if (WordSize < 0)
75  WordSize = 0;
76  CertaintyThreshold += WordSize * stopper_certainty_per_char;
77  }
78 
79  if (stopper_debug_level >= 1)
80  tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
81  best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
82 
83  if (no_dang_ambigs &&
84  best_choice.certainty() > CertaintyThreshold &&
85  xheight_consistency < XH_INCONSISTENT &&
86  UniformCertainties(best_choice)) {
87  return true;
88  } else {
89  if (stopper_debug_level >= 1) {
90  tprintf("AcceptableChoice() returned false"
91  " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
92  no_dang_ambigs, best_choice.certainty(),
93  CertaintyThreshold,
94  UniformCertainties(best_choice));
95  }
96  return false;
97  }
98 }
const STRING & unichar_string() const
Definition: ratngs.h:541
float min_x_height() const
Definition: ratngs.h:336
int length() const
Definition: ratngs.h:303
bool stopper_no_acceptable_choices
Definition: dict.h:631
int stopper_debug_level
Definition: dict.h:628
int stopper_smallword_size
Definition: dict.h:623
double stopper_nondict_certainty_base
Definition: dict.h:619
float rating() const
Definition: ratngs.h:327
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool dangerous_ambig_found() const
Definition: ratngs.h:363
float certainty() const
Definition: ratngs.h:330
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:46
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:459
float max_x_height() const
Definition: ratngs.h:339
double stopper_certainty_per_char
Definition: dict.h:625
uint8_t permuter() const
Definition: ratngs.h:346
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:465
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:440
bool tesseract::Dict::AcceptableResult ( WERD_RES * word) const

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

Definition at line 100 of file stopper.cpp.

100  {
101  if (word->best_choice == nullptr) return false;
102  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
103  int WordSize;
104 
105  if (stopper_debug_level >= 1) {
106  tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
107  word->best_choice->debug_string().string(),
108  (valid_word(*word->best_choice) ? 'y' : 'n'),
109  (case_ok(*word->best_choice) ? 'y' : 'n'),
110  word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
111  word->best_choices.singleton() ? 'n' : 'y');
112  }
113 
114  if (word->best_choice->length() == 0 || !word->best_choices.singleton())
115  return false;
116  if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) {
117  WordSize = LengthOfShortestAlphaRun(*word->best_choice);
118  WordSize -= stopper_smallword_size;
119  if (WordSize < 0)
120  WordSize = 0;
121  CertaintyThreshold += WordSize * stopper_certainty_per_char;
122  }
123 
124  if (stopper_debug_level >= 1)
125  tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
126  word->best_choice->certainty(), CertaintyThreshold);
127 
128  if (word->best_choice->certainty() > CertaintyThreshold &&
130  if (stopper_debug_level >= 1)
131  tprintf("ACCEPTED\n");
132  return true;
133  } else {
134  if (stopper_debug_level >= 1)
135  tprintf("REJECTED\n");
136  return false;
137  }
138 }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:787
int length() const
Definition: ratngs.h:303
WERD_CHOICE_LIST best_choices
Definition: pageres.h:242
bool stopper_no_acceptable_choices
Definition: dict.h:631
int stopper_debug_level
Definition: dict.h:628
int stopper_smallword_size
Definition: dict.h:623
double stopper_nondict_certainty_base
Definition: dict.h:619
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool dangerous_ambig_found() const
Definition: ratngs.h:363
float certainty() const
Definition: ratngs.h:330
WERD_CHOICE * best_choice
Definition: pageres.h:234
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:46
const STRING debug_string() const
Definition: ratngs.h:505
double stopper_certainty_per_char
Definition: dict.h:625
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:440
void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice)

Adds a word found on this document to the document specific dictionary.

Definition at line 653 of file dict.cpp.

653  {
654  // Do not add hyphenated word parts to the document dawg.
655  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
656  // called when the first part of the hyphenated word is
657  // discovered and while the second part of the word is recognized.
658  // hyphen_word_ is cleared in cc_recg() before the next word on
659  // the line is recognized.
660  if (hyphen_word_) return;
661 
662  int stringlen = best_choice.length();
663 
664  if (valid_word(best_choice) || stringlen < 2) return;
665 
666  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
667  if (best_choice.length() >= kDocDictMaxRepChars) {
668  int num_rep_chars = 1;
669  UNICHAR_ID uch_id = best_choice.unichar_id(0);
670  for (int i = 1; i < best_choice.length(); ++i) {
671  if (best_choice.unichar_id(i) != uch_id) {
672  num_rep_chars = 1;
673  uch_id = best_choice.unichar_id(i);
674  } else {
675  ++num_rep_chars;
676  if (num_rep_chars == kDocDictMaxRepChars) return;
677  }
678  }
679  }
680 
681  if (best_choice.certainty() < doc_dict_certainty_threshold ||
682  stringlen == 2) {
683  if (best_choice.certainty() < doc_dict_pending_threshold) return;
684 
685  if (!pending_words_->word_in_dawg(best_choice)) {
686  if (stringlen > 2 ||
687  (stringlen == 2 &&
688  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
689  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
690  pending_words_->add_word_to_dawg(best_choice);
691  }
692  return;
693  }
694  }
695 
696  if (save_doc_words) {
697  STRING filename(getCCUtil()->imagefile);
698  filename += ".doc";
699  FILE* doc_word_file = fopen(filename.string(), "a");
700  if (doc_word_file == nullptr) {
701  tprintf("Error: Could not open file %s\n", filename.string());
702  ASSERT_HOST(doc_word_file);
703  }
704  fprintf(doc_word_file, "%s\n", best_choice.debug_string().string());
705  fclose(doc_word_file);
706  }
707  document_words_->add_word_to_dawg(best_choice);
708 }
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:169
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:787
Definition: strngs.h:45
double doc_dict_certainty_threshold
Definition: dict.h:646
int length() const
Definition: ratngs.h:303
bool save_doc_words
Definition: dict.h:642
const CCUtil * getCCUtil() const
Definition: dict.h:91
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
float certainty() const
Definition: ratngs.h:330
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
#define ASSERT_HOST(x)
Definition: errcode.h:88
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
double doc_dict_pending_threshold
Definition: dict.h:644
int UNICHAR_ID
Definition: unichar.h:34
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
const STRING debug_string() const
Definition: ratngs.h:505
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:65
void tesseract::Dict::adjust_word ( WERD_CHOICE * word,
bool  nonword,
XHeightConsistencyEnum  xheight_consistency,
float  additional_adjust,
bool  modify_rating,
bool  debug 
)

Adjusts the rating of the given word.

Definition at line 710 of file dict.cpp.

713  {
714  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
715  word->GetTopScriptID() == getUnicharset().han_sid());
716  bool case_is_ok = (is_han || case_ok(*word));
717  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
718 
719  float adjust_factor = additional_adjust;
720  float new_rating = word->rating();
721  new_rating += kRatingPad;
722  const char* xheight_triggered = "";
723  if (word->length() > 1) {
724  // Calculate x-height and y-offset consistency penalties.
725  switch (xheight_consistency) {
726  case XH_INCONSISTENT:
727  adjust_factor += xheight_penalty_inconsistent;
728  xheight_triggered = ", xhtBAD";
729  break;
730  case XH_SUBNORMAL:
731  adjust_factor += xheight_penalty_subscripts;
732  xheight_triggered = ", xhtSUB";
733  break;
734  case XH_GOOD:
735  // leave the factor alone - all good!
736  break;
737  }
738  // TODO(eger): if nonword is true, but there is a "core" that is a dict
739  // word, negate nonword status.
740  } else {
741  if (debug) {
742  tprintf("Consistency could not be calculated.\n");
743  }
744  }
745  if (debug) {
746  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
747  word->unichar_string().string(), word->rating(), xheight_triggered);
748  }
749 
750  if (nonword) { // non-dictionary word
751  if (case_is_ok && punc_is_ok) {
752  adjust_factor += segment_penalty_dict_nonword;
753  new_rating *= adjust_factor;
754  if (debug) tprintf(", W");
755  } else {
756  adjust_factor += segment_penalty_garbage;
757  new_rating *= adjust_factor;
758  if (debug) {
759  if (!case_is_ok) tprintf(", C");
760  if (!punc_is_ok) tprintf(", P");
761  }
762  }
763  } else { // dictionary word
764  if (case_is_ok) {
765  if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
767  adjust_factor += segment_penalty_dict_frequent_word;
768  new_rating *= adjust_factor;
769  if (debug) tprintf(", F");
770  } else {
771  adjust_factor += segment_penalty_dict_case_ok;
772  new_rating *= adjust_factor;
773  if (debug) tprintf(", ");
774  }
775  } else {
776  adjust_factor += segment_penalty_dict_case_bad;
777  new_rating *= adjust_factor;
778  if (debug) tprintf(", C");
779  }
780  }
781  new_rating -= kRatingPad;
782  if (modify_rating) word->set_rating(new_rating);
783  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
784  word->set_adjust_factor(adjust_factor);
785 }
double xheight_penalty_subscripts
Definition: dict.h:584
void set_rating(float new_val)
Definition: ratngs.h:369
const STRING & unichar_string() const
Definition: ratngs.h:541
int length() const
Definition: ratngs.h:303
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:865
int null_sid() const
Definition: unicharset.h:884
int GetTopScriptID() const
Definition: ratngs.cpp:667
void set_permuter(uint8_t perm)
Definition: ratngs.h:375
double segment_penalty_garbage
Definition: dict.h:607
float rating() const
Definition: ratngs.h:327
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
double xheight_penalty_inconsistent
Definition: dict.h:587
double segment_penalty_dict_case_bad
Definition: dict.h:598
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
double segment_penalty_dict_frequent_word
Definition: dict.h:590
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:46
double segment_penalty_dict_case_ok
Definition: dict.h:594
void set_adjust_factor(float factor)
Definition: ratngs.h:309
int han_sid() const
Definition: unicharset.h:889
double segment_penalty_dict_nonword
Definition: dict.h:602
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:65
void tesseract::Dict::append_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
const BLOB_CHOICE blob_choice,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

append_choices

Checks whether the next choice is worth appending to the word being generated. If so, it keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

Definition at line 239 of file permdawg.cpp.

250  {
251  int word_ending = (char_choice_index == char_choices.length() - 1);
252 
253  // Deal with fragments.
254  CHAR_FRAGMENT_INFO char_frag_info;
255  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
256  blob_choice.certainty(), prev_char_frag_info, debug,
257  word_ending, &char_frag_info)) {
258  return; // blob_choice must be an invalid fragment
259  }
260  // Search the next letter if this character is a fragment.
261  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
262  permute_choices(debug, char_choices, char_choice_index + 1,
263  &char_frag_info, word, certainties, limit,
264  best_choice, attempts_left, more_args);
265  return;
266  }
267 
268  // Add the next unichar.
269  float old_rating = word->rating();
270  float old_certainty = word->certainty();
271  uint8_t old_permuter = word->permuter();
272  certainties[word->length()] = char_frag_info.certainty;
274  char_frag_info.unichar_id, char_frag_info.num_fragments,
275  char_frag_info.rating, char_frag_info.certainty);
276 
277  // Explore the next unichar.
278  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
279  &char_frag_info, word_ending, word, certainties,
280  limit, best_choice, attempts_left, more_args);
281 
282  // Remove the unichar we added to explore other choices in it's place.
283  word->remove_last_unichar_id();
284  word->set_rating(old_rating);
285  word->set_certainty(old_certainty);
286  word->set_permuter(old_permuter);
287 }
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
int num_fragments
Definition: dict.h:42
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
int length() const
Definition: genericvector.h:84
float rating() const
Definition: ratngs.h:80
void set_rating(float new_val)
Definition: ratngs.h:369
int length() const
Definition: ratngs.h:303
void set_permuter(uint8_t perm)
Definition: ratngs.h:375
float certainty() const
Definition: ratngs.h:83
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:211
float rating() const
Definition: ratngs.h:327
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:314
void remove_last_unichar_id()
Definition: ratngs.h:483
UNICHAR_ID unichar_id
Definition: dict.h:40
float certainty
Definition: dict.h:44
float certainty() const
Definition: ratngs.h:330
float rating
Definition: dict.h:43
void set_certainty(float new_val)
Definition: ratngs.h:372
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:197
uint8_t permuter() const
Definition: ratngs.h:346
float tesseract::Dict::CallParamsModelClassify ( void *  path)
inline

Definition at line 412 of file dict.h.

412  {
413  ASSERT_HOST(params_model_classify_ != nullptr); // ASSERT_HOST -> assert
414  return (this->*params_model_classify_)(
415  getCCUtil()->lang.string(), path);
416  }
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:409
const CCUtil * getCCUtil() const
Definition: dict.h:91
const char * string() const
Definition: strngs.cpp:194
STRING lang
Definition: ccutil.h:69
#define ASSERT_HOST(x)
Definition: errcode.h:88
int tesseract::Dict::case_ok ( const WERD_CHOICE & word) const

Check a string to see if it matches a set of lexical rules.

Definition at line 46 of file context.cpp.

46  {
47  int state = 0;
48  int x;
49  const UNICHARSET* unicharset = word.unicharset();
50  for (x = 0; x < word.length(); ++x) {
51  UNICHAR_ID ch_id = word.unichar_id(x);
52  if (unicharset->get_isupper(ch_id))
53  state = case_state_table[state][1];
54  else if (unicharset->get_islower(ch_id))
55  state = case_state_table[state][2];
56  else if (unicharset->get_isdigit(ch_id))
57  state = case_state_table[state][3];
58  else
59  state = case_state_table[state][0];
60  if (state == -1) return false;
61  }
62  return state != 5; // single lower is bad
63 }
int length() const
Definition: ratngs.h:303
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
const int case_state_table[6][4]
Definition: context.cpp:29
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int UNICHAR_ID
Definition: unichar.h:34
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
UNICHAR_ID tesseract::Dict::char_for_dawg ( const UNICHARSET &  unicharset,
UNICHAR_ID  ch,
const Dawg *  dawg 
) const
inline

Definition at line 439 of file dict.h.

440  {
441  if (!dawg) return ch;
442  switch (dawg->type()) {
443  case DAWG_TYPE_NUMBER:
444  return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
445  default:
446  return ch;
447  }
448  }
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:122
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool tesseract::Dict::compound_marker ( UNICHAR_ID  unichar_id)
inline

Definition at line 108 of file dict.h.

108  {
109  const UNICHARSET& unicharset = getUnicharset();
110  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
111  const GenericVector<UNICHAR_ID>& normed_ids =
112  unicharset.normed_ids(unichar_id);
113  return normed_ids.size() == 1 &&
114  (normed_ids[0] == hyphen_unichar_id_ ||
115  normed_ids[0] == slash_unichar_id_);
116  }
#define ASSERT_HOST(x)
Definition: errcode.h:88
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:835
int size() const
Definition: genericvector.h:70
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE * word) const
inline

If this word is hyphenated, copy its base word (the part on the preceding line) into the given word. This function assumes that word is not nullptr.

Definition at line 140 of file dict.h.

140  {
141  if (this->hyphenated()) {
142  *word = *hyphen_word_;
143  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
144  }
145  }
int hyphen_debug_level
Definition: dict.h:612
void print() const
Definition: ratngs.h:580
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:130
WERD_CHOICE * tesseract::Dict::dawg_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR char_choices,
float  rating_limit 
)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

Definition at line 168 of file permdawg.cpp.

169  {
170  auto *best_choice = new WERD_CHOICE(&getUnicharset());
171  best_choice->make_bad();
172  best_choice->set_rating(rating_limit);
173  if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
174  return best_choice;
175  auto *active_dawgs =
176  new DawgPositionVector[char_choices.length() + 1];
177  init_active_dawgs(&(active_dawgs[0]), true);
178  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
180 
181  float certainties[MAX_WERD_LENGTH];
183  int attempts_left = max_permuter_attempts;
184  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
185  char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
186  &attempts_left, &dawg_args);
187  delete[] active_dawgs;
188  return best_choice;
189 }
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:44
int length() const
Definition: genericvector.h:84
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:609
int max_permuter_attempts
Definition: dict.h:651
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:211
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
#define MAX_WERD_LENGTH
Definition: dict.h:35
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:197
int dawg_debug_level
Definition: dict.h:611
void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word to stdout.

int tesseract::Dict::def_letter_is_okay ( void *  void_dawg_args,
const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const

Returns the maximal permuter code (from ccstruct/ratngs.h) if in light of the current state the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_, otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).

Input: At word_index 0 dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable therefrom – that includes all number and word dawgs. The only dawg non-reachable from the punctuation_dawg is the pattern dawg. If hyphen state needs to be applied, initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0 the corresponding state (active_dawgs and punc position) can be obtained from dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that active_dawgs, and updated_dawgs member variables of dawg_args are not nullptr.

Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.

Definition at line 404 of file dict.cpp.

405  {
406  auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);
407 
408  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
409 
410  if (dawg_debug_level >= 3) {
411  tprintf(
412  "def_letter_is_okay: current unichar=%s word_end=%d"
413  " num active dawgs=%d\n",
414  getUnicharset().debug_str(unichar_id).string(), word_end,
415  dawg_args->active_dawgs->length());
416  }
417 
418  // Do not accept words that contain kPatternUnicharID.
419  // (otherwise pattern dawgs would not function correctly).
420  // Do not accept words containing INVALID_UNICHAR_IDs.
421  if (unichar_id == Dawg::kPatternUnicharID ||
422  unichar_id == INVALID_UNICHAR_ID) {
423  dawg_args->permuter = NO_PERM;
424  return NO_PERM;
425  }
426 
427  // Initialization.
428  PermuterType curr_perm = NO_PERM;
429  dawg_args->updated_dawgs->clear();
430  dawg_args->valid_end = false;
431 
432  // Go over the active_dawgs vector and insert DawgPosition records
433  // with the updated ref (an edge with the corresponding unichar id) into
434  // dawg_args->updated_pos.
435  for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
436  const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
437  const Dawg* punc_dawg =
438  pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
439  const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
440 
441  if (!dawg && !punc_dawg) {
442  // shouldn't happen.
443  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
444  continue;
445  }
446  if (!dawg) {
447  // We're in the punctuation dawg. A core dawg has not been chosen.
448  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
449  EDGE_REF punc_transition_edge =
450  punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
451  if (punc_transition_edge != NO_EDGE) {
452  // Find all successors, and see which can transition.
453  const SuccessorList& slist = *(successors_[pos.punc_index]);
454  for (int s = 0; s < slist.length(); ++s) {
455  int sdawg_index = slist[s];
456  const Dawg* sdawg = dawgs_[sdawg_index];
457  UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
458  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
459  if (dawg_edge != NO_EDGE) {
460  if (dawg_debug_level >= 3) {
461  tprintf("Letter found in dawg %d\n", sdawg_index);
462  }
463  dawg_args->updated_dawgs->add_unique(
464  DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
465  punc_transition_edge, false),
466  dawg_debug_level > 0,
467  "Append transition from punc dawg to current dawgs: ");
468  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
469  if (sdawg->end_of_word(dawg_edge) &&
470  punc_dawg->end_of_word(punc_transition_edge))
471  dawg_args->valid_end = true;
472  }
473  }
474  }
475  EDGE_REF punc_edge =
476  punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
477  if (punc_edge != NO_EDGE) {
478  if (dawg_debug_level >= 3) {
479  tprintf("Letter found in punctuation dawg\n");
480  }
481  dawg_args->updated_dawgs->add_unique(
482  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
483  dawg_debug_level > 0, "Extend punctuation dawg: ");
484  if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
485  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
486  }
487  continue;
488  }
489 
490  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
491  // We can end the main word here.
492  // If we can continue on the punc ref, add that possibility.
493  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
494  EDGE_REF punc_edge =
495  punc_node == NO_EDGE
496  ? NO_EDGE
497  : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
498  if (punc_edge != NO_EDGE) {
499  dawg_args->updated_dawgs->add_unique(
500  DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index,
501  punc_edge, true),
502  dawg_debug_level > 0, "Return to punctuation dawg: ");
503  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
504  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
505  }
506  }
507 
508  if (pos.back_to_punc) continue;
509 
510  // If we are dealing with the pattern dawg, look up all the
511  // possible edges, not only for the exact unichar_id, but also
512  // for all its character classes (alpha, digit, etc).
513  if (dawg->type() == DAWG_TYPE_PATTERN) {
514  ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
515  &curr_perm);
516  // There can't be any successors to dawg that is of type
517  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
518  continue;
519  }
520 
521  // Find the edge out of the node for the unichar_id.
522  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
523  EDGE_REF edge =
524  (node == NO_EDGE)
525  ? NO_EDGE
526  : dawg->edge_char_of(
527  node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
528 
529  if (dawg_debug_level >= 3) {
530  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
531  pos.dawg_index, node, edge);
532  }
533 
534  if (edge != NO_EDGE) { // the unichar was found in the current dawg
535  if (dawg_debug_level >= 3) {
536  tprintf("Letter found in dawg %d\n", pos.dawg_index);
537  }
538  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
539  if (dawg_debug_level >= 3) {
540  tprintf("Punctuation constraint not satisfied at end of word.\n");
541  }
542  continue;
543  }
544  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
545  if (dawg->end_of_word(edge) &&
546  (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
547  dawg_args->valid_end = true;
548  dawg_args->updated_dawgs->add_unique(
549  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
550  false),
551  dawg_debug_level > 0,
552  "Append current dawg to updated active dawgs: ");
553  }
554  } // end for
555  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
556  // or if we found the current letter in a non-punctuation dawg. This
557  // allows preserving information on which dawg the "core" word came from.
558  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
559  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
560  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
561  dawg_args->permuter = curr_perm;
562  }
563  if (dawg_debug_level >= 2) {
564  tprintf("Returning %d for permuter code for this character.\n",
565  dawg_args->permuter);
566  }
567  return dawg_args->permuter;
568 }
PermuterType
Definition: ratngs.h:242
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:122
#define REFFORMAT
Definition: dawg.h:89
int64_t EDGE_REF
Definition: dawg.h:51
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:429
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
GenericVector< int > SuccessorList
Definition: dawg.h:65
#define ASSERT_HOST(x)
Definition: errcode.h:88
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int UNICHAR_ID
Definition: unichar.h:34
int64_t NODE_REF
Definition: dawg.h:52
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:439
int dawg_debug_level
Definition: dict.h:611
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:570
double tesseract::Dict::def_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Default (no-op) implementation of probability in context function.

Definition at line 392 of file dict.h.

394  {
395  (void)lang;
396  (void)context;
397  (void)context_bytes;
398  (void)character;
399  (void)character_bytes;
400  return 0.0;
401  }
void tesseract::Dict::default_dawgs ( DawgPositionVector *  anylength_dawgs,
bool  suppress_patterns 
) const

Definition at line 626 of file dict.cpp.

627  {
628  bool punc_dawg_available =
629  (punc_dawg_ != nullptr) &&
630  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
631 
632  for (int i = 0; i < dawgs_.length(); i++) {
633  if (dawgs_[i] != nullptr &&
634  !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
635  int dawg_ty = dawgs_[i]->type();
636  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
637  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
638  *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
639  if (dawg_debug_level >= 3) {
640  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
641  NO_EDGE);
642  }
643  } else if (!punc_dawg_available || !subsumed_by_punc) {
644  *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
645  if (dawg_debug_level >= 3) {
646  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
647  }
648  }
649  }
650  }
651 }
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:122
#define REFFORMAT
Definition: dawg.h:89
int length() const
Definition: genericvector.h:84
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
int dawg_debug_level
Definition: dict.h:611
void tesseract::Dict::End ( )

Definition at line 381 of file dict.cpp.

381  {
382  if (dawgs_.length() == 0) return; // Not safe to call twice.
383  for (int i = 0; i < dawgs_.size(); i++) {
384  if (!dawg_cache_->FreeDawg(dawgs_[i])) {
385  delete dawgs_[i];
386  }
387  }
388  dawg_cache_->FreeDawg(bigram_dawg_);
389  if (dawg_cache_is_ours_) {
390  delete dawg_cache_;
391  dawg_cache_ = nullptr;
392  }
393  successors_.delete_data_pointers();
394  dawgs_.clear();
395  successors_.clear();
396  document_words_ = nullptr;
397  delete pending_words_;
398  pending_words_ = nullptr;
399 }
void delete_data_pointers()
int length() const
Definition: genericvector.h:84
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:38
int size() const
Definition: genericvector.h:70
void tesseract::Dict::EndDangerousAmbigs ( )

Definition at line 356 of file stopper.cpp.

356 {}
bool tesseract::Dict::FinishLoad ( )

Definition at line 360 of file dict.cpp.

360  {
361  if (dawgs_.empty()) return false;
362  // Construct a list of corresponding successors for each dawg. Each entry, i,
363  // in the successors_ vector is a vector of integers that represent the
364  // indices into the dawgs_ vector of the successors for dawg i.
365  successors_.reserve(dawgs_.length());
366  for (int i = 0; i < dawgs_.length(); ++i) {
367  const Dawg* dawg = dawgs_[i];
368  auto* lst = new SuccessorList();
369  for (int j = 0; j < dawgs_.length(); ++j) {
370  const Dawg* other = dawgs_[j];
371  if (dawg != nullptr && other != nullptr &&
372  (dawg->lang() == other->lang()) &&
373  kDawgSuccessors[dawg->type()][other->type()])
374  *lst += j;
375  }
376  successors_ += lst;
377  }
378  return true;
379 }
int length() const
Definition: genericvector.h:84
void reserve(int size)
bool empty() const
Definition: genericvector.h:89
GenericVector< int > SuccessorList
Definition: dawg.h:65
bool tesseract::Dict::fragment_state_okay ( UNICHAR_ID  curr_unichar_id,
float  curr_rating,
float  curr_certainty,
const CHAR_FRAGMENT_INFO *  prev_char_frag_info,
const char *  debug,
int  word_ending,
CHAR_FRAGMENT_INFO *  char_frag_info 
)

Definition at line 314 of file permdawg.cpp.

318  {
319  const CHAR_FRAGMENT *this_fragment =
320  getUnicharset().get_fragment(curr_unichar_id);
321  const CHAR_FRAGMENT *prev_fragment =
322  prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
323 
324  // Print debug info for fragments.
325  if (debug && (prev_fragment || this_fragment)) {
326  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
327  getUnicharset().debug_str(curr_unichar_id).string(),
328  word_ending);
329  if (prev_fragment) {
330  tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
331  }
332  if (this_fragment) {
333  tprintf("this_fragment %s\n", this_fragment->to_string().string());
334  }
335  }
336 
337  char_frag_info->unichar_id = curr_unichar_id;
338  char_frag_info->fragment = this_fragment;
339  char_frag_info->rating = curr_rating;
340  char_frag_info->certainty = curr_certainty;
341  char_frag_info->num_fragments = 1;
342  if (prev_fragment && !this_fragment) {
343  if (debug) tprintf("Skip choice with incomplete fragment\n");
344  return false;
345  }
346  if (this_fragment) {
347  // We are dealing with a fragment.
348  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
349  if (prev_fragment) {
350  if (!this_fragment->is_continuation_of(prev_fragment)) {
351  if (debug) tprintf("Non-matching fragment piece\n");
352  return false;
353  }
354  if (this_fragment->is_ending()) {
355  char_frag_info->unichar_id =
356  getUnicharset().unichar_to_id(this_fragment->get_unichar());
357  char_frag_info->fragment = nullptr;
358  if (debug) {
359  tprintf("Built character %s from fragments\n",
360  getUnicharset().debug_str(
361  char_frag_info->unichar_id).string());
362  }
363  } else {
364  if (debug) tprintf("Record fragment continuation\n");
365  char_frag_info->fragment = this_fragment;
366  }
367  // Update certainty and rating.
368  char_frag_info->rating =
369  prev_char_frag_info->rating + curr_rating;
370  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
371  char_frag_info->certainty =
372  std::min(curr_certainty, prev_char_frag_info->certainty);
373  } else {
374  if (this_fragment->is_beginning()) {
375  if (debug) tprintf("Record fragment beginning\n");
376  } else {
377  if (debug) {
378  tprintf("Non-starting fragment piece with no prev_fragment\n");
379  }
380  return false;
381  }
382  }
383  }
384  if (word_ending && char_frag_info->fragment) {
385  if (debug) tprintf("Word can not end with a fragment\n");
386  return false;
387  }
388  return true;
389 }
int num_fragments
Definition: dict.h:42
static STRING to_string(const char *unichar, int pos, int total, bool natural)
bool is_ending() const
Definition: unicharset.h:108
const char * get_unichar() const
Definition: unicharset.h:70
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:98
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
UNICHAR_ID unichar_id
Definition: dict.h:40
float certainty
Definition: dict.h:44
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
float rating
Definition: dict.h:43
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
const CHAR_FRAGMENT * fragment
Definition: dict.h:41
bool is_beginning() const
Definition: unicharset.h:105
const CCUtil* tesseract::Dict::getCCUtil ( ) const
inline

Definition at line 91 of file dict.h.

91  {
92  return ccutil_;
93  }
CCUtil* tesseract::Dict::getCCUtil ( )
inline

Definition at line 94 of file dict.h.

94  {
95  return ccutil_;
96  }
const Dawg* tesseract::Dict::GetDawg ( int  index) const
inline

Return the i-th dawg pointer recorded in the dawgs_ vector.

Definition at line 423 of file dict.h.

423 { return dawgs_[index]; }
const Dawg* tesseract::Dict::GetPuncDawg ( ) const
inline

Return the pointer to the punctuation dawg.

Definition at line 425 of file dict.h.

425 { return punc_dawg_; }
static NODE_REF tesseract::Dict::GetStartingNode ( const Dawg *  dawg,
EDGE_REF  edge_ref 
)
inline static

Returns the appropriate next node given the EDGE_REF.

Definition at line 429 of file dict.h.

429  {
430  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
431  NODE_REF node = dawg->next_node(edge_ref);
432  if (node == 0) node = NO_EDGE; // end of word
433  return node;
434  }
int64_t NODE_REF
Definition: dawg.h:52
const Dawg* tesseract::Dict::GetUnambigDawg ( ) const
inline

Return the pointer to the unambiguous words dawg.

Definition at line 427 of file dict.h.

427 { return unambig_dawg_; }
const UnicharAmbigs& tesseract::Dict::getUnicharAmbigs ( ) const
inline

Definition at line 103 of file dict.h.

103  {
104  return getCCUtil()->unichar_ambigs;
105  }
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:72
const CCUtil * getCCUtil() const
Definition: dict.h:91
const UNICHARSET& tesseract::Dict::getUnicharset ( ) const
inline

Definition at line 97 of file dict.h.

97  {
98  return getCCUtil()->unicharset;
99  }
UNICHARSET unicharset
Definition: ccutil.h:71
const CCUtil * getCCUtil() const
Definition: dict.h:91
UNICHARSET& tesseract::Dict::getUnicharset ( )
inline

Definition at line 100 of file dict.h.

100  {
101  return getCCUtil()->unicharset;
102  }
UNICHARSET unicharset
Definition: ccutil.h:71
const CCUtil * getCCUtil() const
Definition: dict.h:91
DawgCache * tesseract::Dict::GlobalDawgCache ( )
static

Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.

Definition at line 193 of file dict.cpp.

193  {
194  // This global cache (a singleton) will outlive every Tesseract instance
195  // (even those that someone else might declare as global statics).
196  static DawgCache cache;
197  return &cache;
198 }
void tesseract::Dict::go_deeper_dawg_fxn ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR &  char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO *  prev_char_frag_info,
bool  word_ending,
WERD_CHOICE *  word,
float  certainties[],
float *  limit,
WERD_CHOICE *  best_choice,
int *  attempts_left,
void *  void_more_args 
)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word, keep exploring the char_choices further.

Definition at line 44 of file permdawg.cpp.

48  {
49  auto *more_args = static_cast<DawgArgs *>(void_more_args);
50  word_ending = (char_choice_index == char_choices.size()-1);
51  int word_index = word->length() - 1;
52  if (best_choice->rating() < *limit) return;
53  // Look up char in DAWG
54 
55  // If the current unichar is an ngram first try calling
56  // letter_is_okay() for each unigram it contains separately.
57  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
58  bool checked_unigrams = false;
59  if (getUnicharset().get_isngram(orig_uch_id)) {
60  if (dawg_debug_level) {
61  tprintf("checking unigrams in an ngram %s\n",
62  getUnicharset().debug_str(orig_uch_id).string());
63  }
64  int num_unigrams = 0;
65  word->remove_last_unichar_id();
67  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68  // Since the string came out of the unicharset, failure is impossible.
69  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
70  nullptr));
71  bool unigrams_ok = true;
72  // Construct DawgArgs that reflect the current state.
73  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
74  DawgPositionVector unigram_updated_dawgs;
75  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
76  &unigram_updated_dawgs,
77  more_args->permuter);
78  // Check unigrams in the ngram with letter_is_okay().
79  for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
80  UNICHAR_ID uch_id = encoding[i];
81  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
82  ++num_unigrams;
83  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
84  unigrams_ok = (this->*letter_is_okay_)(
85  &unigram_dawg_args, *word->unicharset(),
86  word->unichar_id(word_index+num_unigrams-1),
87  word_ending && i == encoding.size() - 1);
88  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
89  if (dawg_debug_level) {
90  tprintf("unigram %s is %s\n",
91  getUnicharset().debug_str(uch_id).string(),
92  unigrams_ok ? "OK" : "not OK");
93  }
94  }
95  // Restore the word and copy the updated dawg state if needed.
96  while (num_unigrams-- > 0) word->remove_last_unichar_id();
97  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
98  if (unigrams_ok) {
99  checked_unigrams = true;
100  more_args->permuter = unigram_dawg_args.permuter;
101  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
102  }
103  }
104 
105  // Check which dawgs from the dawgs_ vector contain the word
106  // up to and including the current unichar.
107  if (checked_unigrams || (this->*letter_is_okay_)(
108  more_args, *word->unicharset(), word->unichar_id(word_index),
109  word_ending)) {
110  // Add a new word choice
111  if (word_ending) {
112  if (dawg_debug_level) {
113  tprintf("found word = %s\n", word->debug_string().string());
114  }
115  if (strcmp(output_ambig_words_file.string(), "") != 0) {
116  if (output_ambig_words_file_ == nullptr) {
117  output_ambig_words_file_ =
118  fopen(output_ambig_words_file.string(), "wb+");
119  if (output_ambig_words_file_ == nullptr) {
120  tprintf("Failed to open output_ambig_words_file %s\n",
121  output_ambig_words_file.string());
122  exit(1);
123  }
124  STRING word_str;
125  word->string_and_lengths(&word_str, nullptr);
126  word_str += " ";
127  fprintf(output_ambig_words_file_, "%s", word_str.string());
128  }
129  STRING word_str;
130  word->string_and_lengths(&word_str, nullptr);
131  word_str += " ";
132  fprintf(output_ambig_words_file_, "%s", word_str.string());
133  }
134  WERD_CHOICE *adjusted_word = word;
135  adjusted_word->set_permuter(more_args->permuter);
136  update_best_choice(*adjusted_word, best_choice);
137  } else { // search the next letter
138  // Make updated_* point to the next entries in the DawgPositionVector
139  // arrays (that were originally created in dawg_permute_and_select)
140  ++(more_args->updated_dawgs);
141  // Make active_dawgs and constraints point to the updated ones.
142  ++(more_args->active_dawgs);
143  permute_choices(debug, char_choices, char_choice_index + 1,
144  prev_char_frag_info, word, certainties, limit,
145  best_choice, attempts_left, more_args);
146  // Restore previous state to explore another letter in this position.
147  --(more_args->updated_dawgs);
148  --(more_args->active_dawgs);
149  }
150  } else {
151  if (dawg_debug_level) {
152  tprintf("last unichar not OK at index %d in %s\n",
153  word_index, word->debug_string().string());
154  }
155  }
156 }
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:468
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
Definition: strngs.h:45
char * output_ambig_words_file
Definition: dict.h:609
int length() const
Definition: ratngs.h:303
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:177
void set_permuter(uint8_t perm)
Definition: ratngs.h:375
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449
float rating() const
Definition: ratngs.h:327
void remove_last_unichar_id()
Definition: ratngs.h:483
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:363
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
#define ASSERT_HOST(x)
Definition: errcode.h:88
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int UNICHAR_ID
Definition: unichar.h:34
const STRING debug_string() const
Definition: ratngs.h:505
int size() const
Definition: genericvector.h:70
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:197
int dawg_debug_level
Definition: dict.h:611
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
int tesseract::Dict::good_choice ( const WERD_CHOICE choice)

Returns true if a good answer is found for the unknown blob rating.

bool tesseract::Dict::has_hyphen_end ( const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  first_pos 
) const
inline

Check whether the word has a hyphen at the end.

Definition at line 147 of file dict.h.

148  {
149  if (!last_word_on_line_ || first_pos)
150  return false;
151  ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
152  const GenericVector<UNICHAR_ID>& normed_ids =
153  unicharset->normed_ids(unichar_id);
154  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
155  }
#define ASSERT_HOST(x)
Definition: errcode.h:88
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:835
int size() const
Definition: genericvector.h:70
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE word) const
inline

Same as above, but check the unichar at the end of the word.

Definition at line 157 of file dict.h.

157  {
158  int word_index = word.length() - 1;
159  return has_hyphen_end(word.unicharset(), word.unichar_id(word_index),
160  word_index == 0);
161  }
int length() const
Definition: ratngs.h:303
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:147
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
int tesseract::Dict::hyphen_base_size ( ) const
inline

Size of the base word (the part on the line before) of a hyphenated word.

Definition at line 134 of file dict.h.

134  {
135  return this->hyphenated() ? hyphen_word_->length() : 0;
136  }
int length() const
Definition: ratngs.h:303
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:130
bool tesseract::Dict::hyphenated ( ) const
inline

Returns true if we've recorded the beginning of a hyphenated word.

Definition at line 130 of file dict.h.

130  { return
131  !last_word_on_line_ && hyphen_word_;
132  }
void tesseract::Dict::init_active_dawgs ( DawgPositionVector active_dawgs,
bool  ambigs_mode 
) const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

Definition at line 609 of file dict.cpp.

610  {
611  int i;
612  if (hyphenated()) {
613  *active_dawgs = hyphen_active_dawgs_;
614  if (dawg_debug_level >= 3) {
615  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
616  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
617  hyphen_active_dawgs_[i].dawg_index,
618  hyphen_active_dawgs_[i].dawg_ref);
619  }
620  }
621  } else {
622  default_dawgs(active_dawgs, ambigs_mode);
623  }
624 }
#define REFFORMAT
Definition: dawg.h:89
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:626
int size() const
Definition: genericvector.h:70
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:130
int dawg_debug_level
Definition: dict.h:611
bool tesseract::Dict::is_apostrophe ( UNICHAR_ID  unichar_id)
inline

Definition at line 119 of file dict.h.

119  {
120  const UNICHARSET& unicharset = getUnicharset();
121  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
122  const GenericVector<UNICHAR_ID>& normed_ids =
123  unicharset.normed_ids(unichar_id);
124  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
125  }
#define ASSERT_HOST(x)
Definition: errcode.h:88
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:835
int size() const
Definition: genericvector.h:70
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
bool tesseract::Dict::IsSpaceDelimitedLang ( ) const

Returns true if the language is space-delimited (i.e. not Chinese, Japanese, or Thai).

Definition at line 892 of file dict.cpp.

892  {
893  const UNICHARSET& u_set = getUnicharset();
894  if (u_set.han_sid() > 0) return false;
895  if (u_set.katakana_sid() > 0) return false;
896  if (u_set.thai_sid() > 0) return false;
897  return true;
898 }
int thai_sid() const
Definition: unicharset.h:892
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int han_sid() const
Definition: unicharset.h:889
int katakana_sid() const
Definition: unicharset.h:891
int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE WordChoice) const

Returns the length of the shortest alpha run in WordChoice.

Definition at line 440 of file stopper.cpp.

440  {
441  int shortest = INT32_MAX;
442  int curr_len = 0;
443  for (int w = 0; w < WordChoice.length(); ++w) {
444  if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
445  curr_len++;
446  } else if (curr_len > 0) {
447  if (curr_len < shortest) shortest = curr_len;
448  curr_len = 0;
449  }
450  }
451  if (curr_len > 0 && curr_len < shortest) {
452  shortest = curr_len;
453  } else if (shortest == INT32_MAX) {
454  shortest = 0;
455  }
456  return shortest;
457 }
int length() const
Definition: ratngs.h:303
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
int tesseract::Dict::LetterIsOkay ( void *  void_dawg_args,
const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline

Calls letter_is_okay_ member function.

Definition at line 367 of file dict.h.

368  {
369  return (this->*letter_is_okay_)(void_dawg_args,
370  unicharset, unichar_id, word_end);
371  }
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:363
void tesseract::Dict::Load ( const STRING lang,
TessdataManager data_file 
)

Definition at line 219 of file dict.cpp.

219  {
220  // Load dawgs_.
221  if (load_punc_dawg) {
222  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
223  dawg_debug_level, data_file);
224  if (punc_dawg_) dawgs_ += punc_dawg_;
225  }
226  if (load_system_dawg) {
227  Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
228  lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
229  if (system_dawg) dawgs_ += system_dawg;
230  }
231  if (load_number_dawg) {
232  Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
233  lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
234  if (number_dawg) dawgs_ += number_dawg;
235  }
236  if (load_bigram_dawg) {
237  bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
238  dawg_debug_level, data_file);
239  // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
240  // dawgs_!!
241  }
242  if (load_freq_dawg) {
243  freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
244  dawg_debug_level, data_file);
245  if (freq_dawg_) dawgs_ += freq_dawg_;
246  }
247  if (load_unambig_dawg) {
248  unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
249  dawg_debug_level, data_file);
250  if (unambig_dawg_) dawgs_ += unambig_dawg_;
251  }
252 
253  STRING name;
254  if (!user_words_suffix.empty() || !user_words_file.empty()) {
255  Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
256  getUnicharset().size(), dawg_debug_level);
257  if (!user_words_file.empty()) {
258  name = user_words_file;
259  } else {
261  name += user_words_suffix;
262  }
263  if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
265  tprintf("Error: failed to load %s\n", name.string());
266  delete trie_ptr;
267  } else {
268  dawgs_ += trie_ptr;
269  }
270  }
271 
272  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
273  Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
274  getUnicharset().size(), dawg_debug_level);
275  trie_ptr->initialize_patterns(&(getUnicharset()));
276  if (!user_patterns_file.empty()) {
277  name = user_patterns_file;
278  } else {
280  name += user_patterns_suffix;
281  }
282  if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
283  tprintf("Error: failed to load %s\n", name.string());
284  delete trie_ptr;
285  } else {
286  dawgs_ += trie_ptr;
287  }
288  }
289 
290  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
291  getUnicharset().size(), dawg_debug_level);
292  dawgs_ += document_words_;
293 
294  // This dawg is temporary and should not be searched by letter_is_ok.
295  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
296  getUnicharset().size(), dawg_debug_level);
297 }
bool load_number_dawg
Definition: dict.h:579
char * user_patterns_suffix
Definition: dict.h:573
bool load_unambig_dawg
Definition: dict.h:576
Definition: strngs.h:45
bool load_punc_dawg
Definition: dict.h:578
const CCUtil * getCCUtil() const
Definition: dict.h:91
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:45
char * user_patterns_file
Definition: dict.h:571
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
char * user_words_suffix
Definition: dict.h:569
bool load_freq_dawg
Definition: dict.h:575
char * user_words_file
Definition: dict.h:567
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
bool load_system_dawg
Definition: dict.h:574
int dawg_debug_level
Definition: dict.h:611
bool load_bigram_dawg
Definition: dict.h:581
STRING language_data_path_prefix
Definition: ccutil.h:70
void tesseract::Dict::LoadLSTM ( const STRING lang,
TessdataManager data_file 
)

Definition at line 300 of file dict.cpp.

300  {
301  // Load dawgs_.
302  if (load_punc_dawg) {
303  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
304  dawg_debug_level, data_file);
305  if (punc_dawg_) dawgs_ += punc_dawg_;
306  }
307  if (load_system_dawg) {
308  Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
309  lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
310  if (system_dawg) dawgs_ += system_dawg;
311  }
312  if (load_number_dawg) {
313  Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
314  lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
315  if (number_dawg) dawgs_ += number_dawg;
316  }
317 
318  // stolen from Dict::Load (but needs params_ from Tesseract
319  // langdata/config/api):
320  STRING name;
321  if (!user_words_suffix.empty() || !user_words_file.empty()) {
322  Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
323  getUnicharset().size(), dawg_debug_level);
324  if (!user_words_file.empty()) {
325  name = user_words_file;
326  } else {
328  name += user_words_suffix;
329  }
330  if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
332  tprintf("Error: failed to load %s\n", name.string());
333  delete trie_ptr;
334  } else {
335  dawgs_ += trie_ptr;
336  }
337  }
338 
339  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
340  Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
341  getUnicharset().size(), dawg_debug_level);
342  trie_ptr->initialize_patterns(&(getUnicharset()));
343  if (!user_patterns_file.empty()) {
344  name = user_patterns_file;
345  } else {
347  name += user_patterns_suffix;
348  }
349  if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
350  tprintf("Error: failed to load %s\n", name.string());
351  delete trie_ptr;
352  } else {
353  dawgs_ += trie_ptr;
354  }
355  }
356 }
bool load_number_dawg
Definition: dict.h:579
char * user_patterns_suffix
Definition: dict.h:573
Definition: strngs.h:45
bool load_punc_dawg
Definition: dict.h:578
const CCUtil * getCCUtil() const
Definition: dict.h:91
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:45
char * user_patterns_file
Definition: dict.h:571
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
char * user_words_suffix
Definition: dict.h:569
char * user_words_file
Definition: dict.h:567
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
bool load_system_dawg
Definition: dict.h:574
int dawg_debug_level
Definition: dict.h:611
STRING language_data_path_prefix
Definition: ccutil.h:70
double tesseract::Dict::ngram_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
bool tesseract::Dict::NoDangerousAmbig ( WERD_CHOICE BestChoice,
DANGERR fixpt,
bool  fix_replaceable,
MATRIX ratings 
)

Definition at line 140 of file stopper.cpp.

143  {
144  if (stopper_debug_level > 2) {
145  tprintf("\nRunning NoDangerousAmbig() for %s\n",
146  best_choice->debug_string().string());
147  }
148 
149  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
150  // for each unichar id in BestChoice.
151  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
152  int i;
153  bool ambigs_found = false;
154  // For each position in best_choice:
155  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
156  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
157  // -- look for ambiguities corresponding to wrong_ngram in the list while
158  // adding the following unichar_ids from best_choice to wrong_ngram
159  //
160  // Repeat the above procedure twice: first time look through
161  // ambigs to be replaced and replace all the ambiguities found;
162  // second time look through dangerous ambiguities and construct
163  // ambig_blob_choices with fake a blob choice for each ambiguity
164  // and pass them to dawg_permute_and_select() to search for
165  // ambiguous words in the dictionaries.
166  //
167  // Note that during the execution of the for loop (on the first pass)
168  // if replacements are made the length of best_choice might change.
169  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
170  bool replace = (fix_replaceable && pass == 0);
171  const UnicharAmbigsVector &table = replace ?
173  if (!replace) {
174  // Initialize ambig_blob_choices with lists containing a single
175  // unichar id for the corresponding position in best_choice.
176  // best_choice consisting from only the original letters will
177  // have a rating of 0.0.
178  for (i = 0; i < best_choice->length(); ++i) {
179  auto *lst = new BLOB_CHOICE_LIST();
180  BLOB_CHOICE_IT lst_it(lst);
181  // TODO(rays/antonova) Put real xheights and y shifts here.
182  lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
183  0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
184  ambig_blob_choices.push_back(lst);
185  }
186  }
187  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
188  int wrong_ngram_index;
189  int next_index;
190  int blob_index = 0;
191  for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
192  ++i) {
193  UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
194  if (stopper_debug_level > 2) {
195  tprintf("Looking for %s ngrams starting with %s:\n",
196  replace ? "replaceable" : "ambiguous",
197  getUnicharset().debug_str(curr_unichar_id).string());
198  }
199  int num_wrong_blobs = best_choice->state(i);
200  wrong_ngram_index = 0;
201  wrong_ngram[wrong_ngram_index] = curr_unichar_id;
202  if (curr_unichar_id == INVALID_UNICHAR_ID ||
203  curr_unichar_id >= table.size() ||
204  table[curr_unichar_id] == nullptr) {
205  continue; // there is no ambig spec for this unichar id
206  }
207  AmbigSpec_IT spec_it(table[curr_unichar_id]);
208  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
209  const AmbigSpec *ambig_spec = spec_it.data();
210  wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
211  int compare = UnicharIdArrayUtils::compare(wrong_ngram,
212  ambig_spec->wrong_ngram);
213  if (stopper_debug_level > 2) {
214  tprintf("candidate ngram: ");
216  tprintf("current ngram from spec: ");
217  UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
218  tprintf("comparison result: %d\n", compare);
219  }
220  if (compare == 0) {
221  // Record the place where we found an ambiguity.
222  if (fixpt != nullptr) {
223  UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
224  fixpt->push_back(DANGERR_INFO(
225  blob_index, blob_index + num_wrong_blobs, replace,
226  getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
227  leftmost_id));
228  if (stopper_debug_level > 1) {
229  tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
230  blob_index + num_wrong_blobs, false,
231  getUnicharset().get_isngram(
232  ambig_spec->correct_ngram_id),
233  getUnicharset().id_to_unichar(leftmost_id));
234  }
235  }
236 
237  if (replace) {
238  if (stopper_debug_level > 2) {
239  tprintf("replace ambiguity with %s : ",
240  getUnicharset().id_to_unichar(
241  ambig_spec->correct_ngram_id));
243  ambig_spec->correct_fragments, getUnicharset());
244  }
245  ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
246  ambig_spec->correct_ngram_id,
247  best_choice, ratings);
248  } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
249  // We found dang ambig - update ambig_blob_choices.
250  if (stopper_debug_level > 2) {
251  tprintf("found ambiguity: ");
253  ambig_spec->correct_fragments, getUnicharset());
254  }
255  ambigs_found = true;
256  for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
257  ++tmp_index) {
258  // Add a blob choice for the corresponding fragment of the
259  // ambiguity. These fake blob choices are initialized with
260  // negative ratings (which are not possible for real blob
261  // choices), so that dawg_permute_and_select() considers any
262  // word not consisting of only the original letters a better
263  // choice and stops searching for alternatives once such a
264  // choice is found.
265  BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
266  bc_it.add_to_end(new BLOB_CHOICE(
267  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
268  -1, 0, 1, 0, BCC_AMBIG));
269  }
270  }
271  spec_it.forward();
272  } else if (compare == -1) {
273  if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
274  ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
275  // Add the next unichar id to wrong_ngram and keep looking for
276  // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
277  wrong_ngram[++wrong_ngram_index] =
278  best_choice->unichar_id(next_index);
279  num_wrong_blobs += best_choice->state(next_index);
280  } else {
281  break; // no more matching ambigs in this AMBIG_SPEC_LIST
282  }
283  } else {
284  spec_it.forward();
285  }
286  } // end searching AmbigSpec_LIST
287  } // end searching best_choice
288  } // end searching replace and dangerous ambigs
289 
290  // If any ambiguities were found permute the constructed ambig_blob_choices
291  // to see if an alternative dictionary word can be found.
292  if (ambigs_found) {
293  if (stopper_debug_level > 2) {
294  tprintf("\nResulting ambig_blob_choices:\n");
295  for (i = 0; i < ambig_blob_choices.length(); ++i) {
296  print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
297  tprintf("\n");
298  }
299  }
300  WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
301  ambigs_found = (alt_word->rating() < 0.0);
302  if (ambigs_found) {
303  if (stopper_debug_level >= 1) {
304  tprintf ("Stopper: Possible ambiguous word = %s\n",
305  alt_word->debug_string().string());
306  }
307  if (fixpt != nullptr) {
308  // Note: Currently character choices combined from fragments can only
309  // be generated by NoDangrousAmbigs(). This code should be updated if
310  // the capability to produce classifications combined from character
311  // fragments is added to other functions.
312  int orig_i = 0;
313  for (i = 0; i < alt_word->length(); ++i) {
314  const UNICHARSET &uchset = getUnicharset();
315  bool replacement_is_ngram =
316  uchset.get_isngram(alt_word->unichar_id(i));
317  UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
318  if (replacement_is_ngram) {
319  // we have to extract the leftmost unichar from the ngram.
320  const char *str = uchset.id_to_unichar(leftmost_id);
321  int step = uchset.step(str);
322  if (step) leftmost_id = uchset.unichar_to_id(str, step);
323  }
324  int end_i = orig_i + alt_word->state(i);
325  if (alt_word->state(i) > 1 ||
326  (orig_i + 1 == end_i && replacement_is_ngram)) {
327  // Compute proper blob indices.
328  int blob_start = 0;
329  for (int j = 0; j < orig_i; ++j)
330  blob_start += best_choice->state(j);
331  int blob_end = blob_start;
332  for (int j = orig_i; j < end_i; ++j)
333  blob_end += best_choice->state(j);
334  fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
335  replacement_is_ngram, leftmost_id));
336  if (stopper_debug_level > 1) {
337  tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
338  true, replacement_is_ngram,
339  uchset.id_to_unichar(leftmost_id));
340  }
341  }
342  orig_i += alt_word->state(i);
343  }
344  }
345  }
346  delete alt_word;
347  }
348  if (output_ambig_words_file_ != nullptr) {
349  fprintf(output_ambig_words_file_, "\n");
350  }
351 
352  ambig_blob_choices.delete_data_pointers();
353  return !ambigs_found;
354 }
#define MAX_AMBIG_SIZE
Definition: ambigs.h:30
int step(const char *str) const
Definition: unicharset.cpp:233
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:168
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:141
int length() const
Definition: ratngs.h:303
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:103
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:153
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:62
int stopper_debug_level
Definition: dict.h:628
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:98
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:833
float rating() const
Definition: ratngs.h:327
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int push_back(T object)
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:366
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int UNICHAR_ID
Definition: unichar.h:34
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:152
const STRING debug_string() const
Definition: ratngs.h:505
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:526
int state(int index) const
Definition: ratngs.h:319
int tesseract::Dict::NumDawgs ( ) const
inline

Return the number of dawgs in the dawgs_ vector.

Definition at line 421 of file dict.h.

421 { return dawgs_.size(); }
int size() const
Definition: genericvector.h:70
float tesseract::Dict::ParamsModelClassify ( const char *  lang,
void *  path 
)
void tesseract::Dict::permute_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

Definition at line 197 of file permdawg.cpp.

207  {
208  if (debug) {
209  tprintf("%s permute_choices: char_choice_index=%d"
210  " limit=%g rating=%g, certainty=%g word=%s\n",
211  debug, char_choice_index, *limit, word->rating(),
212  word->certainty(), word->debug_string().string());
213  }
214  if (char_choice_index < char_choices.length()) {
215  BLOB_CHOICE_IT blob_choice_it;
216  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
217  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
218  blob_choice_it.forward()) {
219  (*attempts_left)--;
220  append_choices(debug, char_choices, *(blob_choice_it.data()),
221  char_choice_index, prev_char_frag_info, word,
222  certainties, limit, best_choice, attempts_left, more_args);
223  if (*attempts_left <= 0) {
224  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
225  break;
226  }
227  }
228  }
229 }
int length() const
Definition: genericvector.h:84
T & get(int index) const
float rating() const
Definition: ratngs.h:327
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
float certainty() const
Definition: ratngs.h:330
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:239
const STRING debug_string() const
Definition: ratngs.h:505
double tesseract::Dict::ProbabilityInContext ( const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Calls probability_in_context_ member function.

Definition at line 381 of file dict.h.

384  {
385  return (this->*probability_in_context_)(
386  getCCUtil()->lang.string(),
387  context, context_bytes,
388  character, character_bytes);
389  }
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:375
const CCUtil * getCCUtil() const
Definition: dict.h:91
const char * string() const
Definition: strngs.cpp:194
STRING lang
Definition: ccutil.h:69
void tesseract::Dict::ProcessPatternEdges ( const Dawg *  dawg,
const DawgPosition &  info,
UNICHAR_ID  unichar_id,
bool  word_end,
DawgArgs *  dawg_args,
PermuterType *  current_permuter 
) const

For each of the character classes of the given unichar_id (and the unichar_id itself) finds the corresponding outgoing node or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_dawgs. Updates current_permuter if any valid edges were found.

Definition at line 570 of file dict.cpp.

573  {
574  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
575  // Try to find the edge corresponding to the exact unichar_id and to all the
576  // edges corresponding to the character class of unichar_id.
577  GenericVector<UNICHAR_ID> unichar_id_patterns;
578  unichar_id_patterns.push_back(unichar_id);
579  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
580  &unichar_id_patterns);
581  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
582  // On the first iteration check all the outgoing edges.
583  // On the second iteration check all self-loops.
584  for (int k = 0; k < 2; ++k) {
585  EDGE_REF edge =
586  (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
587  : dawg->pattern_loop_edge(pos.dawg_ref,
588  unichar_id_patterns[i], word_end);
589  if (edge == NO_EDGE) continue;
590  if (dawg_debug_level >= 3) {
591  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
592  pos.dawg_index, node, edge);
593  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
594  }
595  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
596  if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
597  dawg_args->updated_dawgs->add_unique(
598  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
599  pos.back_to_punc),
600  dawg_debug_level > 0,
601  "Append current dawg to updated active dawgs: ");
602  }
603  }
604 }
#define REFFORMAT
Definition: dawg.h:89
int64_t EDGE_REF
Definition: dawg.h:51
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:429
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int push_back(T object)
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int64_t NODE_REF
Definition: dawg.h:52
int size() const
Definition: genericvector.h:70
int dawg_debug_level
Definition: dict.h:611
void tesseract::Dict::ReplaceAmbig ( int  wrong_ngram_begin_index,
int  wrong_ngram_size,
UNICHAR_ID  correct_ngram_id,
WERD_CHOICE *  werd_choice,
MATRIX *  ratings 
)

Definition at line 366 of file stopper.cpp.

368  {
369  int num_blobs_to_replace = 0;
370  int begin_blob_index = 0;
371  int i;
372  // Rating and certainty for the new BLOB_CHOICE are derived from the
373  // replaced choices.
374  float new_rating = 0.0f;
375  float new_certainty = 0.0f;
376  BLOB_CHOICE* old_choice = nullptr;
377  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
378  if (i >= wrong_ngram_begin_index) {
379  int num_blobs = werd_choice->state(i);
380  int col = begin_blob_index + num_blobs_to_replace;
381  int row = col + num_blobs - 1;
382  BLOB_CHOICE_LIST* choices = ratings->get(col, row);
383  ASSERT_HOST(choices != nullptr);
384  old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
385  ASSERT_HOST(old_choice != nullptr);
386  new_rating += old_choice->rating();
387  new_certainty += old_choice->certainty();
388  num_blobs_to_replace += num_blobs;
389  } else {
390  begin_blob_index += werd_choice->state(i);
391  }
392  }
393  new_certainty /= wrong_ngram_size;
394  // If there is no entry in the ratings matrix, add it.
395  MATRIX_COORD coord(begin_blob_index,
396  begin_blob_index + num_blobs_to_replace - 1);
397  if (!coord.Valid(*ratings)) {
398  ratings->IncreaseBandSize(coord.row - coord.col + 1);
399  }
400  if (ratings->get(coord.col, coord.row) == nullptr)
401  ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
402  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
403  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
404  if (choice != nullptr) {
405  // Already there. Upgrade if new rating better.
406  if (new_rating < choice->rating())
407  choice->set_rating(new_rating);
408  if (new_certainty < choice->certainty())
409  choice->set_certainty(new_certainty);
410  // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
411  } else {
412  // Need a new choice with the correct_ngram_id.
413  choice = new BLOB_CHOICE(*old_choice);
414  choice->set_unichar_id(correct_ngram_id);
415  choice->set_rating(new_rating);
416  choice->set_certainty(new_certainty);
417  choice->set_classifier(BCC_AMBIG);
418  choice->set_matrix_cell(coord.col, coord.row);
419  BLOB_CHOICE_IT it (new_choices);
420  it.add_to_end(choice);
421  }
422  // Remove current unichar from werd_choice. On the last iteration
423  // set the correct replacement unichar instead of removing a unichar.
424  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
425  ++replaced_count) {
426  if (replaced_count + 1 == wrong_ngram_size) {
427  werd_choice->set_blob_choice(wrong_ngram_begin_index,
428  num_blobs_to_replace, choice);
429  } else {
430  werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
431  }
432  }
433  if (stopper_debug_level >= 1) {
434  werd_choice->print("ReplaceAmbig() ");
435  tprintf("Modified blob_choices: ");
436  print_ratings_list("\n", new_choices, getUnicharset());
437  }
438 }
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:312
float rating() const
Definition: ratngs.h:80
void set_rating(float newrat)
Definition: ratngs.h:148
void set_matrix_cell(int col, int row)
Definition: ratngs.h:157
T get(ICOORD pos) const
Definition: matrix.h:231
int stopper_debug_level
Definition: dict.h:628
void print() const
Definition: ratngs.h:580
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:180
float certainty() const
Definition: ratngs.h:83
void set_classifier(BlobChoiceClassifier classifier)
Definition: ratngs.h:167
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:833
void remove_unichar_id(int index)
Definition: ratngs.h:484
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
#define ASSERT_HOST(x)
Definition: errcode.h:88
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int state(int index) const
Definition: ratngs.h:319
void set_certainty(float newrat)
Definition: ratngs.h:151
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:145
void tesseract::Dict::reset_hyphen_vars ( bool  last_word_on_line)

Unless the previous word was the last one on the line, and the current one is not (thus it is the first one on the line), erase hyphen_word_, clear hyphen_active_dawgs_, update last_word_on_line_.

Definition at line 28 of file hyphen.cpp.

28  {
29  if (!(last_word_on_line_ == true && last_word_on_line == false)) {
30  if (hyphen_word_ != nullptr) {
31  delete hyphen_word_;
32  hyphen_word_ = nullptr;
33  hyphen_active_dawgs_.clear();
34  }
35  }
36  if (hyphen_debug_level) {
37  tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n",
38  last_word_on_line_, last_word_on_line);
39  }
40  last_word_on_line_ = last_word_on_line;
41 }
int hyphen_debug_level
Definition: dict.h:612
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void tesseract::Dict::ResetDocumentDictionary ( )
inline

Definition at line 317 of file dict.h.

317  {
318  if (pending_words_ != nullptr)
319  pending_words_->clear();
320  if (document_words_ != nullptr)
321  document_words_->clear();
322  }
void clear()
Definition: trie.cpp:57
void tesseract::Dict::set_hyphen_word ( const WERD_CHOICE &  word,
const DawgPositionVector &  active_dawgs 
)

Update hyphen_word_, and copy the given DawgPositionVector into hyphen_active_dawgs_.

Definition at line 45 of file hyphen.cpp.

46  {
47  if (hyphen_word_ == nullptr) {
48  hyphen_word_ = new WERD_CHOICE(word.unicharset());
49  hyphen_word_->make_bad();
50  }
51  if (hyphen_word_->rating() > word.rating()) {
52  *hyphen_word_ = word;
53  // Remove the last unichar id as it is a hyphen, and remove
54  // any unichar_string/lengths that are present.
55  hyphen_word_->remove_last_unichar_id();
56  hyphen_active_dawgs_ = active_dawgs;
57  }
58  if (hyphen_debug_level) {
59  hyphen_word_->print("set_hyphen_word: ");
60  }
61 }
int hyphen_debug_level
Definition: dict.h:612
void print() const
Definition: ratngs.h:580
float rating() const
Definition: ratngs.h:327
void remove_last_unichar_id()
Definition: ratngs.h:483
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:443
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

Definition at line 358 of file stopper.cpp.

358  {
359  reject_offset_ = 0.0;
360 }
void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

Definition at line 362 of file stopper.cpp.

362  {
363  reject_offset_ = stopper_phase2_certainty_rejection_offset;
364 }
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:621
void tesseract::Dict::SetupForLoad ( DawgCache *  dawg_cache)

Definition at line 201 of file dict.cpp.

201  {
202  if (dawgs_.length() != 0) this->End();
203 
204  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
205  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
206  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
207  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
208 
209  if (dawg_cache != nullptr) {
210  dawg_cache_ = dawg_cache;
211  dawg_cache_is_ours_ = false;
212  } else {
213  dawg_cache_ = new DawgCache();
214  dawg_cache_is_ours_ = true;
215  }
216 }
int length() const
Definition: genericvector.h:84
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
void End()
Definition: dict.cpp:381
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
void tesseract::Dict::SetWildcardID ( UNICHAR_ID  id)
inline

Definition at line 418 of file dict.h.

418 { wildcard_unichar_id_ = id; }
void tesseract::Dict::SetWordsegRatingAdjustFactor ( float  f)
inline

Set wordseg_rating_adjust_factor_ to the given value.

Definition at line 501 of file dict.h.

501  {
502  wordseg_rating_adjust_factor_ = f;
503  }
int tesseract::Dict::UniformCertainties ( const WERD_CHOICE &  word)

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

Definition at line 459 of file stopper.cpp.

459  {
460  float Certainty;
461  float WorstCertainty = FLT_MAX;
462  float CertaintyThreshold;
463  double TotalCertainty;
464  double TotalCertaintySquared;
465  double Variance;
466  float Mean, StdDev;
467  int word_length = word.length();
468 
469  if (word_length < 3)
470  return true;
471 
472  TotalCertainty = TotalCertaintySquared = 0.0;
473  for (int i = 0; i < word_length; ++i) {
474  Certainty = word.certainty(i);
475  TotalCertainty += Certainty;
476  TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
477  if (Certainty < WorstCertainty)
478  WorstCertainty = Certainty;
479  }
480 
481  // Subtract off worst certainty from statistics.
482  word_length--;
483  TotalCertainty -= WorstCertainty;
484  TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
485 
486  Mean = TotalCertainty / word_length;
487  Variance = ((word_length * TotalCertaintySquared -
488  TotalCertainty * TotalCertainty) /
489  (word_length * (word_length - 1)));
490  if (Variance < 0.0)
491  Variance = 0.0;
492  StdDev = sqrt(Variance);
493 
494  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
495  if (CertaintyThreshold > stopper_nondict_certainty_base)
496  CertaintyThreshold = stopper_nondict_certainty_base;
497 
498  if (word.certainty() < CertaintyThreshold) {
499  if (stopper_debug_level >= 1)
500  tprintf("Stopper: Non-uniform certainty = %4.1f"
501  " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
502  word.certainty(), Mean, StdDev, CertaintyThreshold);
503  return false;
504  } else {
505  return true;
506  }
507 }
int length() const
Definition: ratngs.h:303
double stopper_allowable_character_badness
Definition: dict.h:627
int stopper_debug_level
Definition: dict.h:628
double stopper_nondict_certainty_base
Definition: dict.h:619
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
float certainty() const
Definition: ratngs.h:330
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:602
void tesseract::Dict::update_best_choice ( const WERD_CHOICE &  word,
WERD_CHOICE *  best_choice 
)
inline

Copies word into best_choice if its rating is smaller than that of best_choice.

Definition at line 177 of file dict.h.

178  {
179  if (word.rating() < best_choice->rating()) {
180  *best_choice = word;
181  }
182  }
float rating() const
Definition: ratngs.h:327
bool tesseract::Dict::valid_bigram ( const WERD_CHOICE &  word1,
const WERD_CHOICE &  word2 
) const

Definition at line 822 of file dict.cpp.

823  {
824  if (bigram_dawg_ == nullptr) return false;
825 
826  // Extract the core word from the middle of each word with any digits
827  // replaced with question marks.
828  int w1start, w1end, w2start, w2end;
829  word1.punct_stripped(&w1start, &w1end);
830  word2.punct_stripped(&w2start, &w2end);
831 
832  // We don't want to penalize a single guillemet, hyphen, etc.
833  // But our bigram list doesn't have any information about punctuation.
834  if (w1start >= w1end) return word1.length() < 3;
835  if (w2start >= w2end) return word2.length() < 3;
836 
837  const UNICHARSET& uchset = getUnicharset();
838  GenericVector<UNICHAR_ID> bigram_string;
839  bigram_string.reserve(w1end + w2end + 1);
840  for (int i = w1start; i < w1end; i++) {
841  const GenericVector<UNICHAR_ID>& normed_ids =
842  getUnicharset().normed_ids(word1.unichar_id(i));
843  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
844  bigram_string.push_back(question_unichar_id_);
845  else
846  bigram_string += normed_ids;
847  }
848  bigram_string.push_back(UNICHAR_SPACE);
849  for (int i = w2start; i < w2end; i++) {
850  const GenericVector<UNICHAR_ID>& normed_ids =
851  getUnicharset().normed_ids(word2.unichar_id(i));
852  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
853  bigram_string.push_back(question_unichar_id_);
854  else
855  bigram_string += normed_ids;
856  }
857  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
858  for (int i = 0; i < bigram_string.size(); ++i) {
859  normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f,
860  0.0f);
861  }
862  return bigram_dawg_->word_in_dawg(normalized_word);
863 }
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
int length() const
Definition: ratngs.h:303
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:383
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
void reserve(int size)
int push_back(T object)
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:835
int size() const
Definition: genericvector.h:70
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:65
bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE &  word)

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

Definition at line 865 of file dict.cpp.

865  {
866  if (word.length() == 0) return NO_PERM;
867  int i;
868  WERD_CHOICE new_word(word.unicharset());
869  int last_index = word.length() - 1;
870  int new_len = 0;
871  for (i = 0; i <= last_index; ++i) {
872  UNICHAR_ID unichar_id = (word.unichar_id(i));
873  if (getUnicharset().get_ispunctuation(unichar_id)) {
874  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
875  } else if (!getUnicharset().get_isalpha(unichar_id) &&
876  !getUnicharset().get_isdigit(unichar_id)) {
877  return false; // neither punc, nor alpha, nor digit
878  } else if ((new_len = new_word.length()) == 0 ||
879  new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
880  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
881  }
882  }
883  for (i = 0; i < dawgs_.size(); ++i) {
884  if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
885  dawgs_[i]->word_in_dawg(new_word))
886  return true;
887  }
888  return false;
889 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:122
int length() const
Definition: ratngs.h:303
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int UNICHAR_ID
Definition: unichar.h:34
int size() const
Definition: genericvector.h:70
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
int tesseract::Dict::valid_word ( const WERD_CHOICE &  word,
bool  numbers_ok 
) const

Definition at line 787 of file dict.cpp.

787  {
788  const WERD_CHOICE* word_ptr = &word;
789  WERD_CHOICE temp_word(word.unicharset());
790  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
791  copy_hyphen_info(&temp_word);
792  temp_word += word;
793  word_ptr = &temp_word;
794  }
795  if (word_ptr->length() == 0) return NO_PERM;
796  // Allocate vectors for holding current and updated
797  // active_dawgs and initialize them.
798  auto* active_dawgs = new DawgPositionVector[2];
799  init_active_dawgs(&(active_dawgs[0]), false);
800  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
801  int last_index = word_ptr->length() - 1;
802  // Call letter_is_okay for each letter in the word.
803  for (int i = hyphen_base_size(); i <= last_index; ++i) {
804  if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
805  word_ptr->unichar_id(i), i == last_index)))
806  break;
807  // Swap active_dawgs, constraints with the corresponding updated vector.
808  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
809  dawg_args.updated_dawgs = &(active_dawgs[0]);
810  ++(dawg_args.active_dawgs);
811  } else {
812  ++(dawg_args.updated_dawgs);
813  dawg_args.active_dawgs = &(active_dawgs[0]);
814  }
815  }
816  delete[] active_dawgs;
817  return valid_word_permuter(dawg_args.permuter, numbers_ok)
818  ? dawg_args.permuter
819  : NO_PERM;
820 }
int length() const
Definition: ratngs.h:303
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:609
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:140
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:363
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:130
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:134
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:465
int tesseract::Dict::valid_word ( const WERD_CHOICE &  word) const
inline

Definition at line 472 of file dict.h.

472  {
473  return valid_word(word, false); // return NO_PERM for words with digits
474  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:787
int tesseract::Dict::valid_word ( const char *  string) const
inline

This function is used by api/tesseract_cube_combiner.cpp.

Definition at line 479 of file dict.h.

479  {
480  WERD_CHOICE word(string, getUnicharset());
481  return valid_word(word);
482  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:787
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE &  word) const
inline

Definition at line 475 of file dict.h.

475  {
476  return valid_word(word, true); // return NUMBER_PERM for valid numbers
477  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:787
static bool tesseract::Dict::valid_word_permuter ( uint8_t  perm,
bool  numbers_ok 
)
inlinestatic

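Check all the DAWGs to see if this word is in any of them. As the definition below shows, this helper itself only tests perm: it returns true when perm is one of SYSTEM_DAWG_PERM, FREQ_DAWG_PERM, DOC_DAWG_PERM, USER_DAWG_PERM, USER_PATTERN_PERM or COMPOUND_PERM, or when numbers_ok is true and perm is NUMBER_PERM.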

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).

Definition at line 465 of file dict.h.

465  {
466  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
467  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
468  perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
469  (numbers_ok && perm == NUMBER_PERM));
470  }
UNICHAR_ID tesseract::Dict::WildcardID ( ) const
inline

Definition at line 419 of file dict.h.

419 { return wildcard_unichar_id_; }

Member Data Documentation

double tesseract::Dict::certainty_scale = 20.0

"Certainty scaling factor"

Definition at line 617 of file dict.h.

int tesseract::Dict::dawg_debug_level = 0

"Set to 1 for general debug info, to 2 for more details, to 3 to see all the debug messages"

Definition at line 611 of file dict.h.

double tesseract::Dict::doc_dict_certainty_threshold = -2.25

"Worst certainty" " for words that can be inserted into the document dictionary"

Definition at line 646 of file dict.h.

double tesseract::Dict::doc_dict_pending_threshold = 0.0

"Worst certainty for using pending dictionary"

Definition at line 644 of file dict.h.

int tesseract::Dict::fragments_debug = 0

"Debug character fragments"

Definition at line 637 of file dict.h.

void(Dict::* tesseract::Dict::go_deeper_fxn_) (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function.

Definition at line 211 of file dict.h.

int tesseract::Dict::hyphen_debug_level = 0

"Debug level for hyphenated words."

Definition at line 612 of file dict.h.

int(Dict::* tesseract::Dict::letter_is_okay_) (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 363 of file dict.h.

bool tesseract::Dict::load_bigram_dawg = true

"Load dawg with special word bigrams."

Definition at line 581 of file dict.h.

bool tesseract::Dict::load_freq_dawg = true

"Load frequent word dawg."

Definition at line 575 of file dict.h.

bool tesseract::Dict::load_number_dawg = true

"Load dawg with number patterns."

Definition at line 579 of file dict.h.

bool tesseract::Dict::load_punc_dawg = true

"Load dawg with punctuation patterns."

Definition at line 578 of file dict.h.

bool tesseract::Dict::load_system_dawg = true

"Load system word dawg."

Definition at line 574 of file dict.h.

bool tesseract::Dict::load_unambig_dawg = true

"Load unambiguous word dawg."

Definition at line 576 of file dict.h.

int tesseract::Dict::max_permuter_attempts = 10000

"Maximum number of different" " character choices to consider during permutation." " This limit is especially useful when user patterns" " are specified, since overly generic patterns can result in" " dawg search exploring an overly large number of options."

Definition at line 651 of file dict.h.

int tesseract::Dict::max_viterbi_list_size = 10

"Maximum size of viterbi list."

Definition at line 613 of file dict.h.

char* tesseract::Dict::output_ambig_words_file = ""

"Output file for ambiguities found in the dictionary"

Definition at line 609 of file dict.h.

float(Dict::* tesseract::Dict::params_model_classify_) (const char *lang, void *path)

Definition at line 409 of file dict.h.

double(Dict::* tesseract::Dict::probability_in_context_) (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

Definition at line 375 of file dict.h.

bool tesseract::Dict::save_doc_words = 0

"Save Document Words"

Definition at line 642 of file dict.h.

bool tesseract::Dict::segment_nonalphabetic_script = false

"Don't use any alphabetic-specific tricks. Set to true in the traineddata config file for scripts that are cursive or inherently fixed-pitch"

Definition at line 641 of file dict.h.

double tesseract::Dict::segment_penalty_dict_case_bad = 1.3125

"Default score multiplier for word matches, which may have " "case issues (lower is better)."

Definition at line 598 of file dict.h.

double tesseract::Dict::segment_penalty_dict_case_ok = 1.1

"Score multiplier for word matches that have good case " "(lower is better)."

Definition at line 594 of file dict.h.

double tesseract::Dict::segment_penalty_dict_frequent_word = 1.0

"Score multiplier for word matches which have good case and are frequent in the given language (lower is better)."

Definition at line 590 of file dict.h.

double tesseract::Dict::segment_penalty_dict_nonword = 1.25

"Score multiplier for glyph fragment segmentations which " "do not match a dictionary word (lower is better)."

Definition at line 602 of file dict.h.

double tesseract::Dict::segment_penalty_garbage = 1.50

"Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better)."

Definition at line 607 of file dict.h.

double tesseract::Dict::stopper_allowable_character_badness = 3.0

"Max certainty variation allowed in a word (in sigma)"

Definition at line 627 of file dict.h.

double tesseract::Dict::stopper_certainty_per_char = -0.50

"Certainty to add for each dict char above small word size."

Definition at line 625 of file dict.h.

int tesseract::Dict::stopper_debug_level = 0

"Stopper debug level"

Definition at line 628 of file dict.h.

bool tesseract::Dict::stopper_no_acceptable_choices = false

"Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations"

Definition at line 631 of file dict.h.

double tesseract::Dict::stopper_nondict_certainty_base = -2.50

"Certainty threshold for non-dict words"

Definition at line 619 of file dict.h.

double tesseract::Dict::stopper_phase2_certainty_rejection_offset = 1.0

"Reject certainty offset"

Definition at line 621 of file dict.h.

int tesseract::Dict::stopper_smallword_size = 2

"Size of dict word to be treated as non-dict word"

Definition at line 623 of file dict.h.

int tesseract::Dict::tessedit_truncate_wordchoice_log = 10

"Max words to keep in list"

Definition at line 632 of file dict.h.

bool tesseract::Dict::use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities."

Definition at line 616 of file dict.h.

char* tesseract::Dict::user_patterns_file = ""

"A filename of user-provided patterns."

Definition at line 571 of file dict.h.

char* tesseract::Dict::user_patterns_suffix = ""

"A suffix of user-provided patterns located in tessdata."

Definition at line 573 of file dict.h.

char* tesseract::Dict::user_words_file = ""

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class. "A filename of user-provided words."

Definition at line 567 of file dict.h.

char* tesseract::Dict::user_words_suffix = ""

"A suffix of user-provided words located in tessdata."

Definition at line 569 of file dict.h.

char* tesseract::Dict::word_to_debug = ""

"Word for which stopper debug information" " should be printed to stdout"

Definition at line 634 of file dict.h.

char* tesseract::Dict::word_to_debug_lengths = ""

"Lengths of unichars in word_to_debug"

Definition at line 636 of file dict.h.

double tesseract::Dict::xheight_penalty_inconsistent = 0.25

"Score penalty (0.1 = 10%) added if an xheight is " "inconsistent."

Definition at line 587 of file dict.h.

double tesseract::Dict::xheight_penalty_subscripts = 0.125

"Score penalty (0.1 = 10%) added if there are subscripts " "or superscripts in a word, but it is otherwise OK."

Definition at line 584 of file dict.h.


The documentation for this class was generated from the following files: