46 static inline double log2(
double n) {
55 :
INT_MEMBER(language_model_debug_level, 0,
"Language model debug level",
56 dict->getCCUtil()->params()),
58 "Turn on/off the use of character ngram model",
59 dict->getCCUtil()->params()),
61 "Maximum order of the character ngram model",
62 dict->getCCUtil()->params()),
63 INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
64 "Maximum number of prunable (those for which" 65 " PrunablePath() is true) entries in each viterbi list" 66 " recorded in BLOB_CHOICEs",
67 dict->getCCUtil()->params()),
68 INT_MEMBER(language_model_viterbi_list_max_size, 500,
69 "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
70 dict->getCCUtil()->params()),
72 "To avoid overly small denominators use this as the " 73 "floor of the probability returned by the ngram model.",
74 dict->getCCUtil()->params()),
76 "Average classifier score of a non-matching unichar.",
77 dict->getCCUtil()->params()),
78 BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
79 "Use only the first UTF8 step of the given string" 80 " when computing log probabilities.",
81 dict->getCCUtil()->params()),
83 "Strength of the character ngram model relative to the" 84 " character classifier ",
85 dict->getCCUtil()->params()),
87 "Factor to bring log-probs into the same range as ratings" 88 " when multiplied by outline length ",
89 dict->getCCUtil()->params()),
90 BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
91 "Words are delimited by space", dict->getCCUtil()->params()),
92 INT_MEMBER(language_model_min_compound_length, 3,
93 "Minimum length of compound words",
94 dict->getCCUtil()->params()),
96 "Penalty for words not in the frequent word dictionary",
97 dict->getCCUtil()->params()),
99 "Penalty for non-dictionary words",
100 dict->getCCUtil()->params()),
102 "Penalty for inconsistent punctuation",
103 dict->getCCUtil()->params()),
105 "Penalty for inconsistent case",
106 dict->getCCUtil()->params()),
108 "Penalty for inconsistent script",
109 dict->getCCUtil()->params()),
111 "Penalty for inconsistent character type",
112 dict->getCCUtil()->params()),
116 "Penalty for inconsistent font",
117 dict->getCCUtil()->params()),
119 "Penalty for inconsistent spacing",
120 dict->getCCUtil()->params()),
121 double_MEMBER(language_model_penalty_increment, 0.01,
"Penalty increment",
122 dict->getCCUtil()->params()),
123 INT_MEMBER(wordrec_display_segmentations, 0,
"Display Segmentations",
124 dict->getCCUtil()->params()),
126 "Use sigmoidal score for certainty",
127 dict->getCCUtil()->params()),
129 fontinfo_table_(fontinfo_table),
132 max_char_wh_ratio_(0.0),
133 acceptable_choice_found_(false) {
140 bool fixed_pitch,
float max_char_wh_ratio,
141 float rating_cert_scale) {
157 if (prev_word !=
nullptr && prev_word->
unichar_string() !=
nullptr) {
179 static void ScanParentsForCaseMix(
const UNICHARSET& unicharset,
181 if (parent_node ==
nullptr)
return;
183 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
190 if (other_case == unichar_id)
continue;
196 for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&
197 vit2.data()->curr_b->unichar_id() != other_case;
199 if (!vit2.cycled_list()) {
210 static bool HasBetterCaseVariant(
const UNICHARSET& unicharset,
212 BLOB_CHOICE_LIST* choices) {
215 if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)
219 BLOB_CHOICE_IT bc_it(choices);
220 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
222 if (better_choice->
unichar_id() == other_case)
224 else if (better_choice == choice)
257 bool just_classified,
258 int curr_col,
int curr_row,
259 BLOB_CHOICE_LIST *curr_list,
266 tprintf(
"\nUpdateState: col=%d row=%d %s",
267 curr_col, curr_row, just_classified ?
"just_classified" :
"");
269 tprintf(
"(parent=%p)\n", parent_node);
275 bool new_changed =
false;
281 bool has_alnum_mix =
false;
282 if (parent_node !=
nullptr) {
286 tprintf(
"No parents found to process\n");
290 has_alnum_mix =
true;
294 has_alnum_mix =
false;;
295 ScanParentsForCaseMix(unicharset, parent_node);
297 parent_node->
Print(
"Parent viterbi list");
302 ViterbiStateEntry_IT vit;
303 BLOB_CHOICE_IT c_it(curr_list);
304 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
316 if (c_it.at_first() || !new_changed)
320 if (first_digit == choice) blob_choice_flags |=
kDigitFlag;
322 if (parent_node ==
nullptr) {
334 if (HasBetterCaseVariant(unicharset, choice, curr_list))
340 blob_choice_flags, denom, word_end, curr_col, curr_row,
341 choice, curr_state,
nullptr, pain_points,
342 word_res, best_choice_bundle, blamer_bundle);
351 c_it.data(), blob_choice_flags,
352 unicharset, word_res, &vit,
353 &top_choice_flags)) !=
nullptr) {
366 HasBetterCaseVariant(unicharset, choice, curr_list))
371 top_choice_flags, denom, word_end, curr_col, curr_row,
372 c_it.data(), curr_state, parent_vse, pain_points,
373 word_res, best_choice_bundle, blamer_bundle);
390 BLOB_CHOICE_IT c_it(curr_list);
393 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
396 if (first_unichar ==
nullptr) first_unichar = c_it.data();
397 if (*first_lower ==
nullptr && unicharset.
get_islower(unichar_id)) {
398 *first_lower = c_it.data();
400 if (*first_upper ==
nullptr && unicharset.
get_isalpha(unichar_id) &&
402 *first_upper = c_it.data();
404 if (*first_digit ==
nullptr && unicharset.
get_isdigit(unichar_id)) {
405 *first_digit = c_it.data();
409 bool mixed = (*first_lower !=
nullptr || *first_upper !=
nullptr) &&
410 *first_digit !=
nullptr;
411 if (*first_lower ==
nullptr) *first_lower = first_unichar;
412 if (*first_upper ==
nullptr) *first_upper = first_unichar;
413 if (*first_digit ==
nullptr) *first_digit = first_unichar;
428 if (parent_node ==
nullptr)
return -1;
434 float lower_rating = 0.0f;
435 float upper_rating = 0.0f;
436 float digit_rating = 0.0f;
437 float top_rating = 0.0f;
440 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
447 while (unichar_id == INVALID_UNICHAR_ID &&
453 if (unichar_id != INVALID_UNICHAR_ID) {
455 if (top_lower ==
nullptr || lower_rating > rating) {
457 lower_rating = rating;
460 if (top_upper ==
nullptr || upper_rating > rating) {
462 upper_rating = rating;
465 if (top_digit ==
nullptr || digit_rating > rating) {
467 digit_rating = rating;
471 if (top_choice ==
nullptr || top_rating > rating) {
477 if (top_choice ==
nullptr)
return -1;
478 bool mixed = (top_lower !=
nullptr || top_upper !=
nullptr) &&
479 top_digit !=
nullptr;
480 if (top_lower ==
nullptr) top_lower = top_choice;
482 if (top_upper ==
nullptr) top_upper = top_choice;
484 if (top_digit ==
nullptr) top_digit = top_choice;
495 return mixed ? 1 : 0;
504 bool just_classified,
bool mixed_alnum,
const BLOB_CHOICE* bc,
506 WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
508 for (; !vse_it->cycled_list(); vse_it->forward()) {
512 if (!just_classified && !parent_vse->
updated)
continue;
514 parent_vse->
Print(
"Considering");
516 *top_choice_flags = blob_choice_flags;
529 (mixed_alnum || *top_choice_flags == 0))
535 (mixed_alnum || *top_choice_flags == 0))
544 tprintf(
"Parent %s has competition %s\n",
568 int curr_col,
int curr_row,
576 ViterbiStateEntry_IT vit;
578 tprintf(
"AddViterbiStateEntry for unichar %s rating=%.4f" 579 " certainty=%.4f top_choice_flags=0x%x",
583 tprintf(
" parent_vse=%p\n", parent_vse);
592 tprintf(
"AddViterbiStateEntry: viterbi list is full!\n");
601 float outline_length =
608 denom, curr_col, curr_row, outline_length, parent_vse);
611 bool liked_by_language_model = dawg_info !=
nullptr ||
612 (ngram_info !=
nullptr && !ngram_info->
pruned);
615 if (!liked_by_language_model && top_choice_flags == 0) {
617 tprintf(
"Language model components very early pruned this entry\n");
638 if (!liked_by_language_model && top_choice_flags == 0) {
640 tprintf(
"Language model components early pruned this entry\n");
649 word_res, &consistency_info);
650 if (dawg_info !=
nullptr && consistency_info.
invalid_punc) {
657 parent_vse, word_res, &associate_stats);
658 if (parent_vse !=
nullptr) {
665 parent_vse, b, 0.0, outline_length,
666 consistency_info, associate_stats, top_choice_flags, dawg_info,
671 tprintf(
"Adjusted cost = %g\n", new_vse->cost);
680 bool keep = new_vse->top_choice_flags || liked_by_language_model;
687 tprintf(
"Language model components did not like this entry\n");
701 tprintf(
"Discarded ViterbiEntry with high cost %g max cost %g\n",
712 best_choice_bundle, blamer_bundle);
715 new_vse != best_choice_bundle->
best_vse) {
717 tprintf(
"Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
736 new_vse->top_choice_flags) {
740 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
746 curr_vse->
cost > new_vse->cost) {
749 if (prunable_counter > 0 &&
PrunablePath(*curr_vse)) --prunable_counter;
751 if (prunable_counter == 0) {
754 tprintf(
"Set viterbi_state_entries_prunable_max_cost to %g\n",
757 prunable_counter = -1;
764 new_vse->Print(
"New");
766 curr_state->
Print(
"Updated viterbi list");
776 for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->
top_choice_flags &&
777 new_vse->
cost >= vit.data()->cost; vit.forward()) {
783 tprintf(
"GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
790 int curr_col,
int curr_row,
795 if (parent_vse ==
nullptr) {
799 if (parent_vse->
dawg_info ==
nullptr)
return nullptr;
819 if (parent_vse ==
nullptr || word_end ||
826 bool has_word_ending =
false;
834 has_word_ending =
true;
838 if (!has_word_ending)
return nullptr;
852 for (
int i = 0; i < normed_ids.
size(); ++i) {
854 tprintf(
"Test Letter OK for unichar %d, normed %d\n",
857 word_end && i == normed_ids.
size() - 1);
860 }
else if (i < normed_ids.
size() - 1) {
865 tprintf(
"Letter was OK for unichar %d, normed %d\n",
881 const char *unichar,
float certainty,
float denom,
882 int curr_col,
int curr_row,
float outline_length,
885 const char *pcontext_ptr =
"";
886 int pcontext_unichar_step_len = 0;
887 if (parent_vse ==
nullptr) {
892 pcontext_unichar_step_len =
896 int unichar_step_len = 0;
899 float ngram_and_classifier_cost =
901 pcontext_ptr, &unichar_step_len,
902 &pruned, &ngram_cost);
906 ngram_and_classifier_cost *=
909 if (parent_vse !=
nullptr) {
910 ngram_and_classifier_cost +=
916 int num_remove = (unichar_step_len + pcontext_unichar_step_len -
918 if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
919 while (num_remove > 0 && *pcontext_ptr !=
'\0') {
929 pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
930 ngram_and_classifier_cost);
931 ngram_info->context += unichar;
932 ngram_info->context_unichar_step_len += unichar_step_len;
941 int *unichar_step_len,
942 bool *found_small_prob,
944 const char *context_ptr = context;
945 char *modified_context =
nullptr;
946 char *modified_context_end =
nullptr;
947 const char *unichar_ptr = unichar;
948 const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
951 while (unichar_ptr < unichar_end &&
954 tprintf(
"prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
958 ++(*unichar_step_len);
964 if (unichar_ptr < unichar_end) {
965 if (modified_context ==
nullptr) {
966 size_t context_len = strlen(context);
968 new char[context_len + strlen(unichar_ptr) + step + 1];
969 memcpy(modified_context, context, context_len);
970 modified_context_end = modified_context + context_len;
971 context_ptr = modified_context;
973 strncpy(modified_context_end, unichar_ptr - step, step);
974 modified_context_end += step;
975 *modified_context_end =
'\0';
978 prob /=
static_cast<float>(*unichar_step_len);
981 *found_small_prob =
true;
984 *ngram_cost = -1.0*log2(prob);
985 float ngram_and_classifier_cost =
989 tprintf(
"-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
991 ngram_and_classifier_cost);
993 delete[] modified_context;
994 return ngram_and_classifier_cost;
998 if (curr_list->empty())
return 1.0f;
1001 BLOB_CHOICE_IT c_it(curr_list);
1002 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
1036 consistency_info->
punc_ref = NO_EDGE;
1039 bool prev_is_numalpha = (parent_b !=
nullptr &&
1045 (is_apos && prev_is_numalpha)) ?
1047 if (consistency_info->
punc_ref == NO_EDGE ||
1055 node, pattern_unichar_id, word_end) : NO_EDGE;
1056 if (consistency_info->
punc_ref == NO_EDGE) {
1071 }
else if ((parent_b !=
nullptr) && unicharset.
get_isupper(unichar_id)) {
1094 if (parent_vse !=
nullptr &&
1100 consistency_info->
script_id = parent_script_id;
1102 if (consistency_info->
script_id != parent_script_id) {
1118 int fontinfo_id = -1;
1127 tprintf(
"pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1139 bool expected_gap_found =
false;
1140 float expected_gap = 0.0f;
1142 if (fontinfo_id >= 0) {
1143 ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1145 parent_b->
unichar_id(), unichar_id, &temp_gap)) {
1146 expected_gap = temp_gap;
1147 expected_gap_found =
true;
1152 int num_addends = 0;
1154 for (
int i = 0; i < 4; ++i) {
1157 }
else if (i == 1) {
1159 }
else if (i == 2) {
1164 ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1166 parent_b->
unichar_id(), unichar_id, &temp_gap)) {
1167 expected_gap += temp_gap;
1171 if (num_addends > 0) {
1172 expected_gap /=
static_cast<float>(num_addends);
1173 expected_gap_found =
true;
1176 if (expected_gap_found) {
1177 int actual_gap = word_res->
GetBlobsGap(curr_col-1);
1178 if (actual_gap == 0) {
1181 float gap_ratio = expected_gap / actual_gap;
1187 if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1192 tprintf(
"spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
1195 unichar_id, curr_col, expected_gap, actual_gap);
1209 tprintf(
"ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1212 tprintf(
"%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1218 float adjustment = 1.0f;
1231 static_cast<float>(vse->
length);
1252 blamer_bundle, &truth_path);
1260 word->
print(
"UpdateBestChoice() constructed word");
1264 if (blamer_bundle !=
nullptr) {
1271 tprintf(
"Raw features extracted from %s (cost=%g) [ ",
1273 for (
float feature : curr_hyp.
features) {
1295 tprintf(
"Updated raw choice\n");
1319 best_choice_bundle->
updated =
true;
1320 best_choice_bundle->
best_vse = vse;
1322 tprintf(
"Updated best choice\n");
1334 if (blamer_bundle !=
nullptr) {
1348 int len = vse.
length <= kMaxSmallWordUnichars ? 0 :
1349 vse.
length <= kMaxMediumWordUnichars ? 1 : 2;
1399 if (truth_path !=
nullptr) {
1401 (blamer_bundle !=
nullptr &&
1412 float full_wh_ratio_mean = 0.0f;
1416 static_cast<float>(vse->
length));
1422 word->set_length(vse->
length);
1423 int total_blobs = 0;
1424 for (i = (vse->
length-1); i >= 0; --i) {
1425 if (blamer_bundle !=
nullptr && truth_path !=
nullptr && *truth_path &&
1427 *truth_path =
false;
1431 total_blobs += num_blobs;
1432 word->set_blob_choice(i, num_blobs, curr_b);
1436 if ((full_wh_ratio_mean != 0.0f &&
1437 ((curr_vse != vse && curr_vse->
parent_vse !=
nullptr) ||
1442 tprintf(
"full_wh_ratio_var += (%g-%g)^2\n",
1455 if (curr_vse ==
nullptr)
break;
1456 curr_b = curr_vse->
curr_b;
1461 if (full_wh_ratio_mean != 0.0f) {
UNICHAR_ID unichar_id() const
DawgPositionVector very_beginning_active_dawgs_
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
bool PrunablePath(const ViterbiStateEntry &vse)
double language_model_penalty_non_freq_dict_word
void Print(const char *msg) const
static const LanguageModelFlagsType kLowerCaseFlag
DawgPositionVector beginning_active_dawgs_
int GetBlobsGap(int blob_index)
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
DawgPositionVector active_dawgs
Bundle together all the things pertaining to the best choice/state.
bool get_ispunctuation(UNICHAR_ID unichar_id) const
ParamsModel params_model_
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
static const UNICHAR_ID kPatternUnicharID
int language_model_viterbi_list_max_size
float BodyMaxXHeight() const
bool compound_marker(UNICHAR_ID unichar_id)
float ratings_sum
sum of ratings of character on the path
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
int wordrec_display_segmentations
bool updated
Flag to indicate whether anything was changed.
float features[PTRAIN_NUM_FEATURE_TYPES]
GenericVector< int > blob_widths
void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc)
GenericVector< TBLOB * > blobs
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
DawgPositionVector * updated_dawgs
#define double_MEMBER(name, val, comment, vec)
virtual bool end_of_word(EDGE_REF edge_ref) const =0
AssociateStats associate_stats
character widths/gaps/seams
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
#define BOOL_MEMBER(name, val, comment, vec)
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
void set_rating(float new_val)
double language_model_penalty_non_dict_word
const STRING & unichar_string() const
int language_model_min_compound_length
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
float viterbi_state_entries_prunable_max_cost
ViterbiStateEntry_LIST viterbi_state_entries
Storage for the Viterbi state.
double language_model_ngram_nonmatch_score
DawgPositionVector * active_dawgs
bool language_model_ngram_space_delimited_language
double language_model_ngram_rating_factor
int context_unichar_step_len
static int utf8_step(const char *utf8_str)
#define INT_MEMBER(name, val, comment, vec)
static const LanguageModelFlagsType kDigitFlag
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
float ComputeCost(const float features[]) const
const char * id_to_unichar(UNICHAR_ID id) const
int viterbi_state_entries_length
Total number of entries in viterbi_state_entries.
int num_inconsistent_spaces
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
bool AcceptablePath(const ViterbiStateEntry &vse)
bool HasAlnumChoice(const UNICHARSET &unicharset)
int InconsistentXHeight() const
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
XHeightConsistencyEnum xht_decision
bool is_apostrophe(UNICHAR_ID unichar_id)
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
int tessedit_truncate_wordchoice_log
static const LanguageModelFlagsType kUpperCaseFlag
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
LanguageModelDawgInfo * dawg_info
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Struct to store information maintained by various language model components.
bool get_isdigit(UNICHAR_ID unichar_id) const
int NumInconsistentChartype() const
float CertaintyScore(float cert)
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
static int Compare(const void *e1, const void *e2)
const UNICHARSET * uch_set
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
bool get_isalpha(UNICHAR_ID unichar_id) const
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
void DisplaySegmentation(TWERD *word)
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
float ngram_cost
-ln(P_ngram_model(path))
int length
number of characters on the path
bool correct_segmentation_explored_
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
int viterbi_state_entries_prunable_length
Number and max cost of prunable paths in viterbi_state_entries.
LMConsistencyInfo consistency_info
path consistency info
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
bool LogNewRawChoice(WERD_CHOICE *word_choice)
int16_t fontinfo_id() const
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
ViterbiStateEntry * parent_vse
int correct_segmentation_length() const
int prev_word_unichar_step_len_
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
void UpdateBestRating(float rating)
const char * string() const
DLLSYM void tprintf(const char *format,...)
float min_certainty
minimum certainty on the path
static const LanguageModelFlagsType kSmallestRatingFlag
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
void set_best_choice_is_dict_and_top_choice(bool value)
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
static const float kMaxAvgNgramCost
double language_model_ngram_scale_factor
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
int language_model_ngram_order
BLOB_CHOICE * curr_b
Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
const MATRIX_COORD & matrix_cell()
LanguageModelFlagsType top_choice_flags
float ngram_and_classifier_cost
-[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
bool acceptable_choice_found_
ViterbiStateEntry * competing_vse
void reset_hyphen_vars(bool last_word_on_line)
static const LanguageModelFlagsType kXhtConsistentFlag
bool language_model_ngram_on
WERD_CHOICE * best_choice
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
void Print(const char *msg)
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
const UNICHARSET & getUnicharset() const
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
float BodyMinXHeight() const
int16_t fontinfo_id2() const
float full_wh_ratio_total
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
float outline_length
length of the outline so far
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
bool GuidedSegsearchStillGoing() const
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
int language_model_debug_level
bool get_isupper(UNICHAR_ID unichar_id) const
int NumInconsistentSpaces() const
int get_script(UNICHAR_ID unichar_id) const
static const float kBadRating
int NumInconsistentCase() const
PointerVector< LanguageModelState > beam
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
int language_model_viterbi_list_max_num_prunable
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
void print_state(const char *msg) const
#define BOOL_INIT_MEMBER(name, val, comment, vec)
double language_model_penalty_increment
bool language_model_ngram_use_only_first_uft8_step
double language_model_ngram_small_prob
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
LanguageModelNgramInfo * ngram_info
bool get_islower(UNICHAR_ID unichar_id) const
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
const UnicityTable< FontInfo > * fontinfo_table_