|
tesseract 3.04.01
|
00001 00002 // File: language_model.h 00003 // Description: Functions that utilize the knowledge about the properties, 00004 // structure and statistics of the language to help segmentation 00005 // search. 00006 // Author: Daria Antonova 00007 // Created: Mon Nov 11 11:26:43 PST 2009 00008 // 00009 // (C) Copyright 2009, Google Inc. 00010 // Licensed under the Apache License, Version 2.0 (the "License"); 00011 // you may not use this file except in compliance with the License. 00012 // You may obtain a copy of the License at 00013 // http://www.apache.org/licenses/LICENSE-2.0 00014 // Unless required by applicable law or agreed to in writing, software 00015 // distributed under the License is distributed on an "AS IS" BASIS, 00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00017 // See the License for the specific language governing permissions and 00018 // limitations under the License. 00019 // 00021 00022 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_ 00023 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_ 00024 00025 #include "associate.h" 00026 #include "dawg.h" 00027 #include "dict.h" 00028 #include "fontinfo.h" 00029 #include "intproto.h" 00030 #include "lm_consistency.h" 00031 #include "lm_pain_points.h" 00032 #include "lm_state.h" 00033 #include "matrix.h" 00034 #include "params.h" 00035 #include "pageres.h" 00036 #include "params_model.h" 00037 00038 namespace tesseract { 00039 00040 // This class that contains the data structures and functions necessary 00041 // to represent and use the knowledge about the language. 00042 class LanguageModel { 00043 public: 00044 // Masks for keeping track of top choices that should not be pruned out. 00045 static const LanguageModelFlagsType kSmallestRatingFlag = 0x1; 00046 static const LanguageModelFlagsType kLowerCaseFlag = 0x2; 00047 static const LanguageModelFlagsType kUpperCaseFlag = 0x4; 00048 static const LanguageModelFlagsType kDigitFlag = 0x8; 00049 static const LanguageModelFlagsType kXhtConsistentFlag = 0x10; 00050 00051 // Denominator for normalizing per-letter ngram cost when deriving 00052 // penalty adjustments. 00053 static const float kMaxAvgNgramCost; 00054 00055 LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict); 00056 ~LanguageModel(); 00057 00058 // Fills the given floats array with features extracted from path represented 00059 // by the given ViterbiStateEntry. See ccstruct/params_training_featdef.h 00060 // for feature information. 00061 // Note: the function assumes that features points to an array of size 00062 // PTRAIN_NUM_FEATURE_TYPES. 00063 static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, 00064 float features[]); 00065 00066 // Updates data structures that are used for the duration of the segmentation 00067 // search on the current word; 00068 void InitForWord(const WERD_CHOICE *prev_word, 00069 bool fixed_pitch, float max_char_wh_ratio, 00070 float rating_cert_scale); 00071 00072 // Updates language model state of the given BLOB_CHOICE_LIST (from 00073 // the ratings matrix) a its parent. Updates pain_points if new 00074 // problematic points are found in the segmentation graph. 00075 // 00076 // At most language_model_viterbi_list_size are kept in each 00077 // LanguageModelState.viterbi_state_entries list. 00078 // At most language_model_viterbi_list_max_num_prunable of those are prunable 00079 // (non-dictionary) paths. 00080 // The entries that represent dictionary word paths are kept at the front 00081 // of the list. 00082 // The list ordered by cost that is computed collectively by several 00083 // language model components (currently dawg and ngram components). 00084 bool UpdateState( 00085 bool just_classified, 00086 int curr_col, int curr_row, 00087 BLOB_CHOICE_LIST *curr_list, 00088 LanguageModelState *parent_node, 00089 LMPainPoints *pain_points, 00090 WERD_RES *word_res, 00091 BestChoiceBundle *best_choice_bundle, 00092 BlamerBundle *blamer_bundle); 00093 00094 // Returns true if an acceptable best choice was discovered. 00095 inline bool AcceptableChoiceFound() { return acceptable_choice_found_; } 00096 inline void SetAcceptableChoiceFound(bool val) { 00097 acceptable_choice_found_ = val; 00098 } 00099 // Returns the reference to ParamsModel. 00100 inline ParamsModel &getParamsModel() { return params_model_; } 00101 00102 protected: 00103 00104 inline float CertaintyScore(float cert) { 00105 if (language_model_use_sigmoidal_certainty) { 00106 // cert is assumed to be between 0 and -dict_->certainty_scale. 00107 // If you enable language_model_use_sigmoidal_certainty, you 00108 // need to adjust language_model_ngram_nonmatch_score as well. 00109 cert = -cert / dict_->certainty_scale; 00110 return 1.0f / (1.0f + exp(10.0f * cert)); 00111 } else { 00112 return (-1.0f / cert); 00113 } 00114 } 00115 00116 inline float ComputeAdjustment(int num_problems, float penalty) { 00117 if (num_problems == 0) return 0.0f; 00118 if (num_problems == 1) return penalty; 00119 return (penalty + (language_model_penalty_increment * 00120 static_cast<float>(num_problems-1))); 00121 } 00122 00123 // Computes the adjustment to the ratings sum based on the given 00124 // consistency_info. The paths with invalid punctuation, inconsistent 00125 // case and character type are penalized proportionally to the number 00126 // of inconsistencies on the path. 00127 inline float ComputeConsistencyAdjustment( 00128 const LanguageModelDawgInfo *dawg_info, 00129 const LMConsistencyInfo &consistency_info) { 00130 if (dawg_info != NULL) { 00131 return ComputeAdjustment(consistency_info.NumInconsistentCase(), 00132 language_model_penalty_case) + 00133 (consistency_info.inconsistent_script ? 00134 language_model_penalty_script : 0.0f); 00135 } 00136 return (ComputeAdjustment(consistency_info.NumInconsistentPunc(), 00137 language_model_penalty_punc) + 00138 ComputeAdjustment(consistency_info.NumInconsistentCase(), 00139 language_model_penalty_case) + 00140 ComputeAdjustment(consistency_info.NumInconsistentChartype(), 00141 language_model_penalty_chartype) + 00142 ComputeAdjustment(consistency_info.NumInconsistentSpaces(), 00143 language_model_penalty_spacing) + 00144 (consistency_info.inconsistent_script ? 00145 language_model_penalty_script : 0.0f) + 00146 (consistency_info.inconsistent_font ? 00147 language_model_penalty_font : 0.0f)); 00148 } 00149 00150 // Returns an adjusted ratings sum that includes inconsistency penalties, 00151 // penalties for non-dictionary paths and paths with dips in ngram 00152 // probability. 00153 float ComputeAdjustedPathCost(ViterbiStateEntry *vse); 00154 00155 // Finds the first lower and upper case letter and first digit in curr_list. 00156 // Uses the first character in the list in place of empty results. 00157 // Returns true if both alpha and digits are found. 00158 bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, 00159 BLOB_CHOICE **first_lower, 00160 BLOB_CHOICE **first_upper, 00161 BLOB_CHOICE **first_digit) const; 00162 // Forces there to be at least one entry in the overall set of the 00163 // viterbi_state_entries of each element of parent_node that has the 00164 // top_choice_flag set for lower, upper and digit using the same rules as 00165 // GetTopLowerUpperDigit, setting the flag on the first found suitable 00166 // candidate, whether or not the flag is set on some other parent. 00167 // Returns 1 if both alpha and digits are found among the parents, -1 if no 00168 // parents are found at all (a legitimate case), and 0 otherwise. 00169 int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const; 00170 00171 // Finds the next ViterbiStateEntry with which the given unichar_id can 00172 // combine sensibly, taking into account any mixed alnum/mixed case 00173 // situation, and whether this combination has been inspected before. 00174 ViterbiStateEntry* GetNextParentVSE( 00175 bool just_classified, bool mixed_alnum, 00176 const BLOB_CHOICE* bc, LanguageModelFlagsType blob_choice_flags, 00177 const UNICHARSET& unicharset, WERD_RES* word_res, 00178 ViterbiStateEntry_IT* vse_it, 00179 LanguageModelFlagsType* top_choice_flags) const; 00180 // Helper function that computes the cost of the path composed of the 00181 // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE. 00182 // If the new path looks good enough, adds a new ViterbiStateEntry to the 00183 // list of viterbi entries in the given BLOB_CHOICE and returns true. 00184 bool AddViterbiStateEntry( 00185 LanguageModelFlagsType top_choice_flags, float denom, bool word_end, 00186 int curr_col, int curr_row, BLOB_CHOICE *b, 00187 LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, 00188 LMPainPoints *pain_points, WERD_RES *word_res, 00189 BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle); 00190 00191 // Determines whether a potential entry is a true top choice and 00192 // updates changed accordingly. 00193 // 00194 // Note: The function assumes that b, top_choice_flags and changed 00195 // are not NULL. 00196 void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, 00197 const ViterbiStateEntry *parent_vse, 00198 LanguageModelState *lms); 00199 00200 // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and 00201 // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo 00202 // with updated active dawgs, constraints and permuter. 00203 // 00204 // Note: the caller is responsible for deleting the returned pointer. 00205 LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, 00206 int curr_col, int curr_row, 00207 const BLOB_CHOICE &b, 00208 const ViterbiStateEntry *parent_vse); 00209 00210 // Computes p(unichar | parent context) and records it in ngram_cost. 00211 // If b.unichar_id() is an unlikely continuation of the parent context 00212 // sets found_small_prob to true and returns NULL. 00213 // Otherwise creates a new LanguageModelNgramInfo entry containing the 00214 // updated context (that includes b.unichar_id() at the end) and returns it. 00215 // 00216 // Note: the caller is responsible for deleting the returned pointer. 00217 LanguageModelNgramInfo *GenerateNgramInfo( 00218 const char *unichar, float certainty, float denom, 00219 int curr_col, int curr_row, float outline_length, 00220 const ViterbiStateEntry *parent_vse); 00221 00222 // Computes -(log(prob(classifier)) + log(prob(ngram model))) 00223 // for the given unichar in the given context. If there are multiple 00224 // unichars at one position - takes the average of their probabilities. 00225 // UNICHAR::utf8_step() is used to separate out individual UTF8 characters, 00226 // since probability_in_context() can only handle one at a time (while 00227 // unicharset might contain ngrams and glyphs composed from multiple UTF8 00228 // characters). 00229 float ComputeNgramCost(const char *unichar, float certainty, float denom, 00230 const char *context, int *unichar_step_len, 00231 bool *found_small_prob, float *ngram_prob); 00232 00233 // Computes the normalization factors for the classifier confidences 00234 // (used by ComputeNgramCost()). 00235 float ComputeDenom(BLOB_CHOICE_LIST *curr_list); 00236 00237 // Fills the given consistenty_info based on parent_vse.consistency_info 00238 // and on the consistency of the given unichar_id with parent_vse. 00239 void FillConsistencyInfo( 00240 int curr_col, bool word_end, BLOB_CHOICE *b, 00241 ViterbiStateEntry *parent_vse, 00242 WERD_RES *word_res, 00243 LMConsistencyInfo *consistency_info); 00244 00245 // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs 00246 // on the path represented by the given BLOB_CHOICE and language model 00247 // state entries (lmse, dse). The path is re-constructed by following 00248 // the parent pointers in the the lang model state entries). If the 00249 // constructed WERD_CHOICE is better than the best/raw choice recorded 00250 // in the best_choice_bundle, this function updates the corresponding 00251 // fields and sets best_choice_bunldle->updated to true. 00252 void UpdateBestChoice(ViterbiStateEntry *vse, 00253 LMPainPoints *pain_points, 00254 WERD_RES *word_res, 00255 BestChoiceBundle *best_choice_bundle, 00256 BlamerBundle *blamer_bundle); 00257 00258 // Constructs a WERD_CHOICE by tracing parent pointers starting with 00259 // the given LanguageModelStateEntry. Returns the constructed word. 00260 // Updates best_char_choices, certainties and state if they are not 00261 // NULL (best_char_choices and certainties are assumed to have the 00262 // length equal to lmse->length). 00263 // The caller is responsible for freeing memory associated with the 00264 // returned WERD_CHOICE. 00265 WERD_CHOICE *ConstructWord(ViterbiStateEntry *vse, 00266 WERD_RES *word_res, 00267 DANGERR *fixpt, 00268 BlamerBundle *blamer_bundle, 00269 bool *truth_path); 00270 00271 // Wrapper around AssociateUtils::ComputeStats(). 00272 inline void ComputeAssociateStats(int col, int row, 00273 float max_char_wh_ratio, 00274 ViterbiStateEntry *parent_vse, 00275 WERD_RES *word_res, 00276 AssociateStats *associate_stats) { 00277 AssociateUtils::ComputeStats( 00278 col, row, 00279 (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL, 00280 (parent_vse != NULL) ? parent_vse->length : 0, 00281 fixed_pitch_, max_char_wh_ratio, 00282 word_res, language_model_debug_level > 2, associate_stats); 00283 } 00284 00285 // Returns true if the path with such top_choice_flags and dawg_info 00286 // could be pruned out (i.e. is neither a system/user/frequent dictionary 00287 // nor a top choice path). 00288 // In non-space delimited languages all paths can be "somewhat" dictionary 00289 // words. In such languages we can not do dictionary-driven path pruning, 00290 // so paths with non-empty dawg_info are considered prunable. 00291 inline bool PrunablePath(const ViterbiStateEntry &vse) { 00292 if (vse.top_choice_flags) return false; 00293 if (vse.dawg_info != NULL && 00294 (vse.dawg_info->permuter == SYSTEM_DAWG_PERM || 00295 vse.dawg_info->permuter == USER_DAWG_PERM || 00296 vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false; 00297 return true; 00298 } 00299 00300 // Returns true if the given ViterbiStateEntry represents an acceptable path. 00301 inline bool AcceptablePath(const ViterbiStateEntry &vse) { 00302 return (vse.dawg_info != NULL || vse.Consistent() || 00303 (vse.ngram_info != NULL && !vse.ngram_info->pruned)); 00304 } 00305 00306 public: 00307 // Parameters. 00308 INT_VAR_H(language_model_debug_level, 0, "Language model debug level"); 00309 BOOL_VAR_H(language_model_ngram_on, false, 00310 "Turn on/off the use of character ngram model"); 00311 INT_VAR_H(language_model_ngram_order, 8, 00312 "Maximum order of the character ngram model"); 00313 INT_VAR_H(language_model_viterbi_list_max_num_prunable, 10, 00314 "Maximum number of prunable (those for which PrunablePath() is" 00315 " true) entries in each viterbi list recorded in BLOB_CHOICEs"); 00316 INT_VAR_H(language_model_viterbi_list_max_size, 500, 00317 "Maximum size of viterbi lists recorded in BLOB_CHOICEs"); 00318 double_VAR_H(language_model_ngram_small_prob, 0.000001, 00319 "To avoid overly small denominators use this as the floor" 00320 " of the probability returned by the ngram model"); 00321 double_VAR_H(language_model_ngram_nonmatch_score, -40.0, 00322 "Average classifier score of a non-matching unichar"); 00323 BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step, false, 00324 "Use only the first UTF8 step of the given string" 00325 " when computing log probabilities"); 00326 double_VAR_H(language_model_ngram_scale_factor, 0.03, 00327 "Strength of the character ngram model relative to the" 00328 " character classifier "); 00329 double_VAR_H(language_model_ngram_rating_factor, 16.0, 00330 "Factor to bring log-probs into the same range as ratings" 00331 " when multiplied by outline length "); 00332 BOOL_VAR_H(language_model_ngram_space_delimited_language, true, 00333 "Words are delimited by space"); 00334 INT_VAR_H(language_model_min_compound_length, 3, 00335 "Minimum length of compound words"); 00336 // Penalties used for adjusting path costs and final word rating. 00337 double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1, 00338 "Penalty for words not in the frequent word dictionary"); 00339 double_VAR_H(language_model_penalty_non_dict_word, 0.15, 00340 "Penalty for non-dictionary words"); 00341 double_VAR_H(language_model_penalty_punc, 0.2, 00342 "Penalty for inconsistent punctuation"); 00343 double_VAR_H(language_model_penalty_case, 0.1, 00344 "Penalty for inconsistent case"); 00345 double_VAR_H(language_model_penalty_script, 0.5, 00346 "Penalty for inconsistent script"); 00347 double_VAR_H(language_model_penalty_chartype, 0.3, 00348 "Penalty for inconsistent character type"); 00349 double_VAR_H(language_model_penalty_font, 0.00, 00350 "Penalty for inconsistent font"); 00351 double_VAR_H(language_model_penalty_spacing, 0.05, 00352 "Penalty for inconsistent spacing"); 00353 double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment"); 00354 INT_VAR_H(wordrec_display_segmentations, 0, "Display Segmentations"); 00355 BOOL_VAR_H(language_model_use_sigmoidal_certainty, false, 00356 "Use sigmoidal score for certainty"); 00357 00358 00359 protected: 00360 // Member Variables. 00361 00362 // Temporary DawgArgs struct that is re-used across different words to 00363 // avoid dynamic memory re-allocation (should be cleared before each use). 00364 DawgArgs *dawg_args_; 00365 // Scaling for recovering blob outline length from rating and certainty. 00366 float rating_cert_scale_; 00367 00368 // The following variables are set at construction time. 00369 00370 // Pointer to fontinfo table (not owned by LanguageModel). 00371 const UnicityTable<FontInfo> *fontinfo_table_; 00372 00373 // Pointer to Dict class, that is used for querying the dictionaries 00374 // (the pointer is not owned by LanguageModel). 00375 Dict *dict_; 00376 00377 // TODO(daria): the following variables should become LanguageModel params 00378 // when the old code in bestfirst.cpp and heuristic.cpp is deprecated. 00379 // 00380 // Set to true if we are dealing with fixed pitch text 00381 // (set to assume_fixed_pitch_char_segment). 00382 bool fixed_pitch_; 00383 // Max char width-to-height ratio allowed 00384 // (set to segsearch_max_char_wh_ratio). 00385 float max_char_wh_ratio_; 00386 00387 // The following variables are initialized with InitForWord(). 00388 00389 // String representation of the classification of the previous word 00390 // (since this is only used by the character ngram model component, 00391 // only the last language_model_ngram_order of the word are stored). 00392 STRING prev_word_str_; 00393 int prev_word_unichar_step_len_; 00394 // Active dawg vector. 00395 DawgPositionVector *very_beginning_active_dawgs_; // includes continuation 00396 DawgPositionVector *beginning_active_dawgs_; 00397 // Set to true if acceptable choice was discovered. 00398 // Note: it would be nice to use this to terminate the search once an 00399 // acceptable choices is found. However we do not do that and once an 00400 // acceptable choice is found we finish looking for alternative choices 00401 // in the current segmentation graph and then exit the search (no more 00402 // classifications are done after an acceptable choice is found). 00403 // This is needed in order to let the search find the words very close to 00404 // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these 00405 // choices. This way the stopper will know that the best choice is not 00406 // ambiguous (i.e. there are best choices in the best choice list that have 00407 // ratings close to the very best one) and will be less likely to mis-adapt. 00408 bool acceptable_choice_found_; 00409 // Set to true if a choice representing correct segmentation was explored. 00410 bool correct_segmentation_explored_; 00411 00412 // Params models containing weights for for computing ViterbiStateEntry costs. 00413 ParamsModel params_model_; 00414 }; 00415 00416 } // namespace tesseract 00417 00418 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_