tesseract  4.1.0
dict.h
Go to the documentation of this file.
1 // File: dict.h
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
21 
22 #include "ambigs.h"
23 #include "dawg.h"
24 #include "dawg_cache.h"
25 #include "ratngs.h"
26 #include "stopper.h"
27 #include "trie.h"
28 #include "unicharset.h"
30 
31 class MATRIX;
32 class WERD_RES;
33 
34 #define CHARS_PER_LINE 500
35 #define MAX_WERD_LENGTH (int64_t) 128
36 #define NO_RATING -1
37 
43  float rating;
44  float certainty;
45 };
46 
47 namespace tesseract {
48 
50 
51 //
52 // Constants
53 //
54 static const int kRatingPad = 4;
55 static const char kDictWildcard[] = "\u2606"; // WHITE STAR
56 static const int kDictMaxWildcards = 2; // max wildcards for a word
57 // TODO(daria): If hyphens are different in different languages and can be
58 // inferred from training data we should load their values dynamically.
59 static const char kHyphenSymbol[] = "-";
60 static const char kSlashSymbol[] = "/";
61 static const char kQuestionSymbol[] = "?";
62 static const char kApostropheSymbol[] = "'";
63 static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
64 static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
65 static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
66 static const int kDocDictMaxRepChars = 4;
67 
68 // Enum for describing whether the x-height for the word is consistent:
69 // 0 - everything is good.
70 // 1 - there are one or two secondary (but consistent) baselines
71 // [think subscript and superscript], or there is an oversized
72 // first character.
73 // 2 - the word is inconsistent.
75 
76 struct DawgArgs {
78  : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}
79 
83  // True if the current position is a valid word end.
84  bool valid_end;
85 };
86 
87 class Dict {
88  public:
89  Dict(CCUtil* image_ptr);
90  ~Dict();
// Read-only accessor: returns the CCUtil pointer held in ccutil_.
91  const CCUtil* getCCUtil() const {
92  return ccutil_;
93  }
95  return ccutil_;
96  }
// Read-only accessor: returns the unicharset member of the CCUtil object
// obtained from getCCUtil().
97  const UNICHARSET& getUnicharset() const {
98  return getCCUtil()->unicharset;
99  }
101  return getCCUtil()->unicharset;
102  }
104  return getCCUtil()->unichar_ambigs;
105  }
106 
107  // Returns true if unichar_id is a word compounding character like - or /.
109  const UNICHARSET& unicharset = getUnicharset();
110  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
111  const GenericVector<UNICHAR_ID>& normed_ids =
112  unicharset.normed_ids(unichar_id);
113  return normed_ids.size() == 1 &&
114  (normed_ids[0] == hyphen_unichar_id_ ||
115  normed_ids[0] == slash_unichar_id_);
116  }
117  // Returns true if unichar_id is an apostrophe-like character that may
118  // separate prefix/suffix words from a main body word.
120  const UNICHARSET& unicharset = getUnicharset();
121  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
122  const GenericVector<UNICHAR_ID>& normed_ids =
123  unicharset.normed_ids(unichar_id);
124  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
125  }
126 
127  /* hyphen.cpp ************************************************************/
128 
// Returns true if we've recorded the beginning of a hyphenated word, i.e.
// hyphen_word_ is non-null and last_word_on_line_ is false.
130  inline bool hyphenated() const { return
131  !last_word_on_line_ && hyphen_word_;
132  }
// Size of the base word (the part on the line before) of a hyphenated word:
// hyphen_word_->length() when hyphenated() is true, otherwise 0.
134  inline int hyphen_base_size() const {
135  return this->hyphenated() ? hyphen_word_->length() : 0;
136  }
// If hyphenated() is true, overwrites *word with a copy of *hyphen_word_;
// otherwise *word is left untouched. With a non-zero hyphen_debug_level the
// copied word is also printed under the label "copy_hyphen_info: ".
140  inline void copy_hyphen_info(WERD_CHOICE *word) const {
141  if (this->hyphenated()) {
142  *word = *hyphen_word_;
143  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
144  }
145  }
// Checks whether unichar_id is a hyphen ending the last word on a line.
// Returns false immediately unless last_word_on_line_ is set and first_pos is
// false. Otherwise unichar_id must be contained in *unicharset (enforced by
// ASSERT_HOST), and the result is true only when its normed_ids vector holds
// exactly one id and that id equals hyphen_unichar_id_.
147  inline bool has_hyphen_end(const UNICHARSET* unicharset,
148  UNICHAR_ID unichar_id, bool first_pos) const {
149  if (!last_word_on_line_ || first_pos)
150  return false;
151  ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
152  const GenericVector<UNICHAR_ID>& normed_ids =
153  unicharset->normed_ids(unichar_id);
154  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
155  }
// Same check as the three-argument overload, applied to the unichar at the
// end of the given word; first_pos is true when the word has one unichar.
// An empty word has no last unichar, so it is reported as not ending in a
// hyphen; the && short-circuit keeps unichar_id() from being called with -1.
157  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
158  int word_index = word.length() - 1;
159  return word_index >= 0 &&
160  has_hyphen_end(word.unicharset(), word.unichar_id(word_index), word_index == 0);
161  }
165  void reset_hyphen_vars(bool last_word_on_line);
168  void set_hyphen_word(const WERD_CHOICE &word,
169  const DawgPositionVector &active_dawgs);
170 
171  /* permdawg.cpp ************************************************************/
172  // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
173  // When this function is refactored, permdawg.cpp can be removed.
174 
// Copies word into *best_choice when word's rating is strictly lower than
// the rating currently held by *best_choice; otherwise does nothing.
177  inline void update_best_choice(const WERD_CHOICE &word,
178  WERD_CHOICE *best_choice) {
179  if (word.rating() < best_choice->rating()) {
180  *best_choice = word;
181  }
182  }
186  void init_active_dawgs(DawgPositionVector *active_dawgs,
187  bool ambigs_mode) const;
188  // Fill the given vector with the default collection of any-length dawgs
189  void default_dawgs(DawgPositionVector *anylength_dawgs,
190  bool suppress_patterns) const;
191 
192 
198  WERD_CHOICE *dawg_permute_and_select(
199  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
203  void go_deeper_dawg_fxn(
204  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
205  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
206  bool word_ending, WERD_CHOICE *word, float certainties[],
207  float *limit, WERD_CHOICE *best_choice, int *attempts_left,
208  void *void_more_args);
209 
211  void (Dict::*go_deeper_fxn_)(const char *debug,
212  const BLOB_CHOICE_LIST_VECTOR &char_choices,
213  int char_choice_index,
214  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
215  bool word_ending, WERD_CHOICE *word,
216  float certainties[], float *limit,
217  WERD_CHOICE *best_choice, int *attempts_left,
218  void *void_more_args);
219  //
220  // Helper functions for dawg_permute_and_select().
221  //
222  void permute_choices(
223  const char *debug,
224  const BLOB_CHOICE_LIST_VECTOR &char_choices,
225  int char_choice_index,
226  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
227  WERD_CHOICE *word,
228  float certainties[],
229  float *limit,
230  WERD_CHOICE *best_choice,
231  int *attempts_left,
232  void *more_args);
233 
234  void append_choices(
235  const char *debug,
236  const BLOB_CHOICE_LIST_VECTOR &char_choices,
237  const BLOB_CHOICE &blob_choice,
238  int char_choice_index,
239  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
240  WERD_CHOICE *word,
241  float certainties[],
242  float *limit,
243  WERD_CHOICE *best_choice,
244  int *attempts_left,
245  void *more_args);
246 
247  bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
248  float curr_rating, float curr_certainty,
249  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
250  const char *debug, int word_ending,
251  CHAR_FRAGMENT_INFO *char_frag_info);
252 
253  /* stopper.cpp *************************************************************/
254  bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
255  DANGERR *fixpt,
256  bool fix_replaceable,
257  MATRIX* ratings);
258  // Replaces the corresponding wrong ngram in werd_choice with the correct
259  // one. The whole correct n-gram is inserted into the ratings matrix and
260  // the werd_choice: no more fragments!. Rating and certainty of new entries
261  // in matrix and werd_choice are the sum and mean of the wrong ngram
262  // respectively.
263  // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
264  // mystring", with a new entry in the ratings matrix for ".
265  void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
266  UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
267  MATRIX *ratings);
268 
270  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
278  int UniformCertainties(const WERD_CHOICE& word);
280  bool AcceptableChoice(const WERD_CHOICE& best_choice,
281  XHeightConsistencyEnum xheight_consistency);
285  bool AcceptableResult(WERD_RES *word) const;
286  void EndDangerousAmbigs();
288  void DebugWordChoices();
290  void SettupStopperPass1();
292  void SettupStopperPass2();
293  /* context.cpp *************************************************************/
295  int case_ok(const WERD_CHOICE& word) const;
298  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
299 
300  /* dict.cpp ****************************************************************/
301 
304  static TESS_API DawgCache *GlobalDawgCache();
305  // Sets up ready for a Load or LoadLSTM.
306  void SetupForLoad(DawgCache *dawg_cache);
307  // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
308  void Load(const STRING &lang, TessdataManager *data_file);
309  // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
310  void LoadLSTM(const STRING &lang, TessdataManager *data_file);
311  // Completes the loading process after Load() and/or LoadLSTM().
312  // Returns false if no dictionaries were loaded.
313  bool FinishLoad();
314  void End();
315 
316  // Resets the document dictionary analogous to ResetAdaptiveClassifier.
318  if (pending_words_ != nullptr)
319  pending_words_->clear();
320  if (document_words_ != nullptr)
321  document_words_->clear();
322  }
323 
359  //
360  int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
361  UNICHAR_ID unichar_id, bool word_end) const;
362 
363  int (Dict::*letter_is_okay_)(void* void_dawg_args,
364  const UNICHARSET& unicharset,
365  UNICHAR_ID unichar_id, bool word_end) const;
// Calls the letter_is_okay_ member function pointer on this object,
// forwarding all four arguments unchanged, and returns its result.
367  int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
368  UNICHAR_ID unichar_id, bool word_end) const {
369  return (this->*letter_is_okay_)(void_dawg_args,
370  unicharset, unichar_id, word_end);
371  }
372 
373 
375  double (Dict::*probability_in_context_)(const char* lang,
376  const char* context,
377  int context_bytes,
378  const char* character,
379  int character_bytes);
// Calls the probability_in_context_ member function pointer on this object,
// passing getCCUtil()->lang.string() as the language, followed by the given
// context and character buffers with their byte counts.
// NOTE(review): the pointer is not checked before the call here; confirm it
// is always set before this method is used.
381  double ProbabilityInContext(const char* context,
382  int context_bytes,
383  const char* character,
384  int character_bytes) {
385  return (this->*probability_in_context_)(
386  getCCUtil()->lang.string(),
387  context, context_bytes,
388  character, character_bytes);
389  }
390 
393  const char* lang, const char* context, int context_bytes,
394  const char* character, int character_bytes) {
395  (void)lang;
396  (void)context;
397  (void)context_bytes;
398  (void)character;
399  (void)character_bytes;
400  return 0.0;
401  }
402  double ngram_probability_in_context(const char* lang,
403  const char* context,
404  int context_bytes,
405  const char* character,
406  int character_bytes);
407 
408  // Interface with params model.
409  float (Dict::*params_model_classify_)(const char *lang, void *path);
410  float ParamsModelClassify(const char *lang, void *path);
411  // Call params_model_classify_ member function.
// The pointer must be non-null (enforced by ASSERT_HOST). The language passed
// to it is getCCUtil()->lang.string(); path is forwarded unchanged.
412  float CallParamsModelClassify(void *path) {
413  ASSERT_HOST(params_model_classify_ != nullptr); // ASSERT_HOST -> assert
414  return (this->*params_model_classify_)(
415  getCCUtil()->lang.string(), path);
416  }
417 
// Stores id in wildcard_unichar_id_.
418  inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }
// Returns the value currently stored in wildcard_unichar_id_.
419  inline UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; }
// Return the number of dawgs in the dawgs_ vector.
421  inline int NumDawgs() const { return dawgs_.size(); }
// Return i-th dawg pointer recorded in the dawgs_ vector.
// NOTE(review): index is used as-is; presumably callers keep it below
// NumDawgs() -- confirm.
423  inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
// Return the pointer to the punctuation dawg (punc_dawg_).
425  inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
// Return the pointer to the unambiguous words dawg (unambig_dawg_).
427  inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
// Returns the appropriate next node given the EDGE_REF.
// NO_EDGE as edge_ref means the search is just starting, so node 0 is
// returned. Otherwise the result is dawg->next_node(edge_ref), except that a
// next node of 0 is reported as NO_EDGE (end of word).
429  static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
430  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
431  NODE_REF node = dawg->next_node(edge_ref);
432  if (node == 0) node = NO_EDGE; // end of word
433  return node;
434  }
435 
436  // Given a unichar from a string and a given dawg, return the unichar
437  // we should use to match in that dawg type. (for example, in the number
438  // dawg, all numbers are transformed to kPatternUnicharId).
440  const Dawg *dawg) const {
441  if (!dawg) return ch;
442  switch (dawg->type()) {
443  case DAWG_TYPE_NUMBER:
444  return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
445  default:
446  return ch;
447  }
448  }
449 
455  void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
456  UNICHAR_ID unichar_id, bool word_end,
457  DawgArgs *dawg_args,
458  PermuterType *current_permuter) const;
459 
463 
// Returns true if perm is one of SYSTEM_DAWG_PERM, FREQ_DAWG_PERM,
// DOC_DAWG_PERM, USER_DAWG_PERM, USER_PATTERN_PERM or COMPOUND_PERM.
// NUMBER_PERM is additionally accepted when numbers_ok is true.
465  inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
466  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
467  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
468  perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
469  (numbers_ok && perm == NUMBER_PERM));
470  }
471  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
// Convenience overload: forwards to valid_word(word, numbers_ok) with
// numbers_ok set to false.
472  int valid_word(const WERD_CHOICE &word) const {
473  return valid_word(word, false); // return NO_PERM for words with digits
474  }
// Convenience wrapper: forwards to valid_word(word, numbers_ok) with
// numbers_ok set to true.
475  int valid_word_or_number(const WERD_CHOICE &word) const {
476  return valid_word(word, true); // return NUMBER_PERM for valid numbers
477  }
// Builds a WERD_CHOICE from the given C string using getUnicharset() and
// returns the result of the single-argument valid_word() overload on it.
479  int valid_word(const char *string) const {
480  WERD_CHOICE word(string, getUnicharset());
481  return valid_word(word);
482  }
483  // Do the two WERD_CHOICEs form a meaningful bigram?
484  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
489  bool valid_punctuation(const WERD_CHOICE &word);
491  int good_choice(const WERD_CHOICE &choice);
493  void add_document_word(const WERD_CHOICE &best_choice);
495  void adjust_word(WERD_CHOICE *word,
496  bool nonword, XHeightConsistencyEnum xheight_consistency,
497  float additional_adjust,
498  bool modify_rating,
499  bool debug);
// Set wordseg_rating_adjust_factor_ to the given value.
501  inline void SetWordsegRatingAdjustFactor(float f) {
502  wordseg_rating_adjust_factor_ = f;
503  }
505  bool IsSpaceDelimitedLang() const;
506 
507  private:
509  CCUtil* ccutil_;
516  UnicharAmbigs *dang_ambigs_table_;
518  UnicharAmbigs *replace_ambigs_table_;
520  float reject_offset_;
521  // Cached UNICHAR_IDs:
522  UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
523  UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
524  UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
525  UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
526  UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
527  // Hyphen-related variables.
528  WERD_CHOICE *hyphen_word_;
529  DawgPositionVector hyphen_active_dawgs_;
530  bool last_word_on_line_;
531  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
532  // matching. The first member of each list is taken as canonical. For
533  // example, the first list contains hyphens and dashes with the first symbol
534  // being the ASCII hyphen minus.
535  GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
536  // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
537  DawgCache *dawg_cache_;
538  bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
539  // Dawgs.
540  DawgVector dawgs_;
541  SuccessorListsVector successors_;
542  Trie *pending_words_;
545  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
546  // any of them are present on the best choices list for a word pair.
547  // the bigrams are stored as space-separated words where:
548  // (1) leading and trailing punctuation has been removed from each word and
549  // (2) any digits have been replaced with '?' marks.
550  Dawg *bigram_dawg_;
551  // TODO(daria): need to support multiple languages in the future,
552  // so maybe will need to maintain a list of dawgs of each kind.
553  Dawg *freq_dawg_;
554  Dawg *unambig_dawg_;
555  Dawg *punc_dawg_;
556  Trie *document_words_;
559  float wordseg_rating_adjust_factor_;
560  // File for recording ambiguities discovered during dictionary search.
561  FILE *output_ambig_words_file_;
562 
563  public:
567  STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
568  STRING_VAR_H(user_words_suffix, "",
569  "A suffix of user-provided words located in tessdata.");
570  STRING_VAR_H(user_patterns_file, "",
571  "A filename of user-provided patterns.");
572  STRING_VAR_H(user_patterns_suffix, "",
573  "A suffix of user-provided patterns located in tessdata.");
574  BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
575  BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
576  BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
577  BOOL_VAR_H(load_punc_dawg, true,
578  "Load dawg with punctuation patterns.");
579  BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
580  BOOL_VAR_H(load_bigram_dawg, true,
581  "Load dawg with special word bigrams.");
582  double_VAR_H(xheight_penalty_subscripts, 0.125,
583  "Score penalty (0.1 = 10%) added if there are subscripts "
584  "or superscripts in a word, but it is otherwise OK.");
585  double_VAR_H(xheight_penalty_inconsistent, 0.25,
586  "Score penalty (0.1 = 10%) added if an xheight is "
587  "inconsistent.");
// Adjacent string literals are concatenated, so each fragment must carry its
// own separating space (previously the text read "...good case andare...").
588  double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
589  "Score multiplier for word matches which have good case and "
590  "are frequent in the given language (lower is better).");
591 
592  double_VAR_H(segment_penalty_dict_case_ok, 1.1,
593  "Score multiplier for word matches that have good case "
594  "(lower is better).");
595 
596  double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
597  "Default score multiplier for word matches, which may have "
598  "case issues (lower is better).");
599 
600  double_VAR_H(segment_penalty_dict_nonword, 1.25,
601  "Score multiplier for glyph fragment segmentations which "
602  "do not match a dictionary word (lower is better).");
603 
604  double_VAR_H(segment_penalty_garbage, 1.50,
605  "Score multiplier for poorly cased strings that are not in"
606  " the dictionary and generally look like garbage (lower is"
607  " better).");
608  STRING_VAR_H(output_ambig_words_file, "",
609  "Output file for ambiguities found in the dictionary");
610  INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
611  ", to 2 for more details, to 3 to see all the debug messages");
612  INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
613  INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
614  BOOL_VAR_H(use_only_first_uft8_step, false,
615  "Use only the first UTF8 step of the given string"
616  " when computing log probabilities.");
617  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
618  double_VAR_H(stopper_nondict_certainty_base, -2.50,
619  "Certainty threshold for non-dict words");
620  double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
621  "Reject certainty offset");
622  INT_VAR_H(stopper_smallword_size, 2,
623  "Size of dict word to be treated as non-dict word");
624  double_VAR_H(stopper_certainty_per_char, -0.50,
625  "Certainty to add for each dict char above small word size.");
// Help text corrected: "certaintly" -> "certainty".
626  double_VAR_H(stopper_allowable_character_badness, 3.0,
627  "Max certainty variation allowed in a word (in sigma)");
628  INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
629  BOOL_VAR_H(stopper_no_acceptable_choices, false,
630  "Make AcceptableChoice() always return false. Useful"
631  " when there is a need to explore all segmentations");
632  INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
633  STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
634  " should be printed to stdout");
635  STRING_VAR_H(word_to_debug_lengths, "",
636  "Lengths of unichars in word_to_debug");
637  INT_VAR_H(fragments_debug, 0, "Debug character fragments");
// Adjacent string literals are concatenated, so the first fragment needs a
// trailing space (previously the text read "...tricks.Set to true...").
638  BOOL_VAR_H(segment_nonalphabetic_script, false,
639  "Don't use any alphabetic-specific tricks. "
640  "Set to true in the traineddata config file for"
641  " scripts that are cursive or inherently fixed-pitch");
642  BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
643  double_VAR_H(doc_dict_pending_threshold, 0.0,
644  "Worst certainty for using pending dictionary");
645  double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
646  " for words that can be inserted into the document dictionary");
647  INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
648  " character choices to consider during permutation."
649  " This limit is especially useful when user patterns"
650  " are specified, since overly generic patterns can result in"
651  " dawg search exploring an overly large number of options.");
652 };
653 } // namespace tesseract
654 
655 #endif // TESSERACT_DICT_DICT_H_
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:297
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:157
int num_fragments
Definition: dict.h:42
PermuterType
Definition: ratngs.h:242
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:501
#define STRING_VAR_H(name, val, comment)
Definition: params.h:299
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:108
bool valid_end
Definition: dict.h:84
UNICHAR_ID WildcardID() const
Definition: dict.h:419
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:418
int64_t EDGE_REF
Definition: dawg.h:51
Definition: strngs.h:45
DawgPositionVector * updated_dawgs
Definition: dict.h:81
PermuterType permuter
Definition: dict.h:82
void ResetDocumentDictionary()
Definition: dict.h:317
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:77
#define INT_VAR_H(name, val, comment)
Definition: params.h:295
DawgPositionVector * active_dawgs
Definition: dict.h:80
int length() const
Definition: ratngs.h:303
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:103
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:425
const CCUtil * getCCUtil() const
Definition: dict.h:91
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:177
#define double_VAR_H(name, val, comment)
Definition: params.h:301
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:119
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:392
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:423
float CallParamsModelClassify(void *path)
Definition: dict.h:412
#define TESS_API
Definition: platform.h:54
CCUtil * getCCUtil()
Definition: dict.h:94
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:472
XHeightConsistencyEnum
Definition: dict.h:74
void print() const
Definition: ratngs.h:580
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:140
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:429
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:381
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
Definition: dict.h:427
float rating() const
Definition: ratngs.h:327
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:421
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:147
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:479
UNICHAR_ID unichar_id
Definition: dict.h:40
float certainty
Definition: dict.h:44
Definition: matrix.h:578
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
float rating
Definition: dict.h:43
#define ASSERT_HOST(x)
Definition: errcode.h:88
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:835
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:367
int UNICHAR_ID
Definition: unichar.h:34
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:475
int64_t NODE_REF
Definition: dawg.h:52
const CHAR_FRAGMENT * fragment
Definition: dict.h:41
int size() const
Definition: genericvector.h:70
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
DawgType type() const
Definition: dawg.h:124
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:130
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:439
UNICHARSET & getUnicharset()
Definition: dict.h:100
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:134
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:465