tesseract 3.04.01

wordrec/language_model.h

Go to the documentation of this file.
00001 
00002 // File:        language_model.h
00003 // Description: Functions that utilize the knowledge about the properties,
00004 //              structure and statistics of the language to help segmentation
00005 //              search.
00006 // Author:      Daria Antonova
00007 // Created:     Mon Nov 11 11:26:43 PST 2009
00008 //
00009 // (C) Copyright 2009, Google Inc.
00010 // Licensed under the Apache License, Version 2.0 (the "License");
00011 // you may not use this file except in compliance with the License.
00012 // You may obtain a copy of the License at
00013 // http://www.apache.org/licenses/LICENSE-2.0
00014 // Unless required by applicable law or agreed to in writing, software
00015 // distributed under the License is distributed on an "AS IS" BASIS,
00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017 // See the License for the specific language governing permissions and
00018 // limitations under the License.
00019 //
00021 
00022 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
00023 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
00024 
00025 #include "associate.h"
00026 #include "dawg.h"
00027 #include "dict.h"
00028 #include "fontinfo.h"
00029 #include "intproto.h"
00030 #include "lm_consistency.h"
00031 #include "lm_pain_points.h"
00032 #include "lm_state.h"
00033 #include "matrix.h"
00034 #include "params.h"
00035 #include "pageres.h"
00036 #include "params_model.h"
00037 
00038 namespace tesseract {
00039 
00040 // This class that contains the data structures and functions necessary
00041 // to represent and use the knowledge about the language.
00042 class LanguageModel {
00043  public:
00044   // Masks for keeping track of top choices that should not be pruned out.
00045   static const LanguageModelFlagsType kSmallestRatingFlag = 0x1;
00046   static const LanguageModelFlagsType kLowerCaseFlag = 0x2;
00047   static const LanguageModelFlagsType kUpperCaseFlag = 0x4;
00048   static const LanguageModelFlagsType kDigitFlag = 0x8;
00049   static const LanguageModelFlagsType kXhtConsistentFlag = 0x10;
00050 
00051   // Denominator for normalizing per-letter ngram cost when deriving
00052   // penalty adjustments.
00053   static const float kMaxAvgNgramCost;
00054 
00055   LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);
00056   ~LanguageModel();
00057 
00058   // Fills the given floats array with features extracted from path represented
00059   // by the given ViterbiStateEntry. See ccstruct/params_training_featdef.h
00060   // for feature information.
00061   // Note: the function assumes that features points to an array of size
00062   // PTRAIN_NUM_FEATURE_TYPES.
00063   static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse,
00064                                       float features[]);
00065 
00066   // Updates data structures that are used for the duration of the segmentation
00067   // search on the current word;
00068   void InitForWord(const WERD_CHOICE *prev_word,
00069                    bool fixed_pitch, float max_char_wh_ratio,
00070                    float rating_cert_scale);
00071 
00072   // Updates language model state of the given BLOB_CHOICE_LIST (from
00073   // the ratings matrix) a its parent. Updates pain_points if new
00074   // problematic points are found in the segmentation graph.
00075   //
00076   // At most language_model_viterbi_list_size are kept in each
00077   // LanguageModelState.viterbi_state_entries list.
00078   // At most language_model_viterbi_list_max_num_prunable of those are prunable
00079   // (non-dictionary) paths.
00080   // The entries that represent dictionary word paths are kept at the front
00081   // of the list.
00082   // The list ordered by cost that is computed collectively by several
00083   // language model components (currently dawg and ngram components).
00084   bool UpdateState(
00085       bool just_classified,
00086       int curr_col, int curr_row,
00087       BLOB_CHOICE_LIST *curr_list,
00088       LanguageModelState *parent_node,
00089       LMPainPoints *pain_points,
00090       WERD_RES *word_res,
00091       BestChoiceBundle *best_choice_bundle,
00092       BlamerBundle *blamer_bundle);
00093 
00094   // Returns true if an acceptable best choice was discovered.
00095   inline bool AcceptableChoiceFound() { return acceptable_choice_found_; }
00096   inline void SetAcceptableChoiceFound(bool val) {
00097     acceptable_choice_found_ = val;
00098   }
00099   // Returns the reference to ParamsModel.
00100   inline ParamsModel &getParamsModel() { return params_model_; }
00101 
00102  protected:
00103 
00104   inline float CertaintyScore(float cert) {
00105     if (language_model_use_sigmoidal_certainty) {
00106       // cert is assumed to be between 0 and -dict_->certainty_scale.
00107       // If you enable language_model_use_sigmoidal_certainty, you
00108       // need to adjust language_model_ngram_nonmatch_score as well.
00109       cert = -cert / dict_->certainty_scale;
00110       return 1.0f / (1.0f + exp(10.0f * cert));
00111     } else {
00112       return (-1.0f / cert);
00113     }
00114   }
00115 
00116   inline float ComputeAdjustment(int num_problems, float penalty) {
00117     if (num_problems == 0) return 0.0f;
00118     if (num_problems == 1) return penalty;
00119     return (penalty + (language_model_penalty_increment *
00120                        static_cast<float>(num_problems-1)));
00121   }
00122 
00123   // Computes the adjustment to the ratings sum based on the given
00124   // consistency_info. The paths with invalid punctuation, inconsistent
00125   // case and character type are penalized proportionally to the number
00126   // of inconsistencies on the path.
00127   inline float ComputeConsistencyAdjustment(
00128       const LanguageModelDawgInfo *dawg_info,
00129       const LMConsistencyInfo &consistency_info) {
00130     if (dawg_info != NULL) {
00131       return ComputeAdjustment(consistency_info.NumInconsistentCase(),
00132                                language_model_penalty_case) +
00133           (consistency_info.inconsistent_script ?
00134              language_model_penalty_script : 0.0f);
00135     }
00136     return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
00137                               language_model_penalty_punc) +
00138             ComputeAdjustment(consistency_info.NumInconsistentCase(),
00139                               language_model_penalty_case) +
00140             ComputeAdjustment(consistency_info.NumInconsistentChartype(),
00141                               language_model_penalty_chartype) +
00142             ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
00143                               language_model_penalty_spacing) +
00144             (consistency_info.inconsistent_script ?
00145              language_model_penalty_script : 0.0f) +
00146             (consistency_info.inconsistent_font ?
00147              language_model_penalty_font : 0.0f));
00148   }
00149 
00150   // Returns an adjusted ratings sum that includes inconsistency penalties,
00151   // penalties for non-dictionary paths and paths with dips in ngram
00152   // probability.
00153   float ComputeAdjustedPathCost(ViterbiStateEntry *vse);
00154 
00155   // Finds the first lower and upper case letter and first digit in curr_list.
00156   // Uses the first character in the list in place of empty results.
00157   // Returns true if both alpha and digits are found.
00158   bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
00159                              BLOB_CHOICE **first_lower,
00160                              BLOB_CHOICE **first_upper,
00161                              BLOB_CHOICE **first_digit) const;
00162   // Forces there to be at least one entry in the overall set of the
00163   // viterbi_state_entries of each element of parent_node that has the
00164   // top_choice_flag set for lower, upper and digit using the same rules as
00165   // GetTopLowerUpperDigit, setting the flag on the first found suitable
00166   // candidate, whether or not the flag is set on some other parent.
00167   // Returns 1 if both alpha and digits are found among the parents, -1 if no
00168   // parents are found at all (a legitimate case), and 0 otherwise.
00169   int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const;
00170 
00171   // Finds the next ViterbiStateEntry with which the given unichar_id can
00172   // combine sensibly, taking into account any mixed alnum/mixed case
00173   // situation, and whether this combination has been inspected before.
00174   ViterbiStateEntry* GetNextParentVSE(
00175       bool just_classified, bool mixed_alnum,
00176       const BLOB_CHOICE* bc, LanguageModelFlagsType blob_choice_flags,
00177       const UNICHARSET& unicharset, WERD_RES* word_res,
00178       ViterbiStateEntry_IT* vse_it,
00179       LanguageModelFlagsType* top_choice_flags) const;
00180   // Helper function that computes the cost of the path composed of the
00181   // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
00182   // If the new path looks good enough, adds a new ViterbiStateEntry to the
00183   // list of viterbi entries in the given BLOB_CHOICE and returns true.
00184   bool AddViterbiStateEntry(
00185       LanguageModelFlagsType top_choice_flags, float denom, bool word_end,
00186       int curr_col, int curr_row, BLOB_CHOICE *b,
00187       LanguageModelState *curr_state, ViterbiStateEntry *parent_vse,
00188       LMPainPoints *pain_points, WERD_RES *word_res,
00189       BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
00190 
00191   // Determines whether a potential entry is a true top choice and
00192   // updates changed accordingly.
00193   //
00194   // Note: The function assumes that b, top_choice_flags and changed
00195   // are not NULL.
00196   void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse,
00197                              const ViterbiStateEntry *parent_vse,
00198                              LanguageModelState *lms);
00199 
00200   // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and
00201   // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo
00202   // with updated active dawgs, constraints and permuter.
00203   //
00204   // Note: the caller is responsible for deleting the returned pointer.
00205   LanguageModelDawgInfo *GenerateDawgInfo(bool word_end,
00206                                           int curr_col, int curr_row,
00207                                           const BLOB_CHOICE &b,
00208                                           const ViterbiStateEntry *parent_vse);
00209 
00210   // Computes p(unichar | parent context) and records it in ngram_cost.
00211   // If b.unichar_id() is an unlikely continuation of the parent context
00212   // sets found_small_prob to true and returns NULL.
00213   // Otherwise creates a new LanguageModelNgramInfo entry containing the
00214   // updated context (that includes b.unichar_id() at the end) and returns it.
00215   //
00216   // Note: the caller is responsible for deleting the returned pointer.
00217   LanguageModelNgramInfo *GenerateNgramInfo(
00218       const char *unichar, float certainty, float denom,
00219       int curr_col, int curr_row, float outline_length,
00220       const ViterbiStateEntry *parent_vse);
00221 
00222   // Computes -(log(prob(classifier)) + log(prob(ngram model)))
00223   // for the given unichar in the given context. If there are multiple
00224   // unichars at one position - takes the average of their probabilities.
00225   // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
00226   // since probability_in_context() can only handle one at a time (while
00227   // unicharset might contain ngrams and glyphs composed from multiple UTF8
00228   // characters).
00229   float ComputeNgramCost(const char *unichar, float certainty, float denom,
00230                          const char *context, int *unichar_step_len,
00231                          bool *found_small_prob, float *ngram_prob);
00232 
00233   // Computes the normalization factors for the classifier confidences
00234   // (used by ComputeNgramCost()).
00235   float ComputeDenom(BLOB_CHOICE_LIST *curr_list);
00236 
00237   // Fills the given consistenty_info based on parent_vse.consistency_info
00238   // and on the consistency of the given unichar_id with parent_vse.
00239   void FillConsistencyInfo(
00240       int curr_col, bool word_end, BLOB_CHOICE *b,
00241       ViterbiStateEntry *parent_vse,
00242       WERD_RES *word_res,
00243       LMConsistencyInfo *consistency_info);
00244 
00245   // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs
00246   // on the path represented by the given BLOB_CHOICE and language model
00247   // state entries (lmse, dse). The path is re-constructed by following
00248   // the parent pointers in the the lang model state entries). If the
00249   // constructed WERD_CHOICE is better than the best/raw choice recorded
00250   // in the best_choice_bundle, this function updates the corresponding
00251   // fields and sets best_choice_bunldle->updated to true.
00252   void UpdateBestChoice(ViterbiStateEntry *vse,
00253                         LMPainPoints *pain_points,
00254                         WERD_RES *word_res,
00255                         BestChoiceBundle *best_choice_bundle,
00256                         BlamerBundle *blamer_bundle);
00257 
00258   // Constructs a WERD_CHOICE by tracing parent pointers starting with
00259   // the given LanguageModelStateEntry. Returns the constructed word.
00260   // Updates best_char_choices, certainties and state if they are not
00261   // NULL (best_char_choices and certainties are assumed to have the
00262   // length equal to lmse->length).
00263   // The caller is responsible for freeing memory associated with the
00264   // returned WERD_CHOICE.
00265   WERD_CHOICE *ConstructWord(ViterbiStateEntry *vse,
00266                              WERD_RES *word_res,
00267                              DANGERR *fixpt,
00268                              BlamerBundle *blamer_bundle,
00269                              bool *truth_path);
00270 
00271   // Wrapper around AssociateUtils::ComputeStats().
00272   inline void ComputeAssociateStats(int col, int row,
00273                                     float max_char_wh_ratio,
00274                                     ViterbiStateEntry *parent_vse,
00275                                     WERD_RES *word_res,
00276                                     AssociateStats *associate_stats) {
00277     AssociateUtils::ComputeStats(
00278         col, row,
00279         (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
00280         (parent_vse != NULL) ? parent_vse->length : 0,
00281         fixed_pitch_, max_char_wh_ratio,
00282         word_res, language_model_debug_level > 2, associate_stats);
00283   }
00284 
00285   // Returns true if the path with such top_choice_flags and dawg_info
00286   // could be pruned out (i.e. is neither a system/user/frequent dictionary
00287   // nor a top choice path).
00288   // In non-space delimited languages all paths can be "somewhat" dictionary
00289   // words. In such languages we can not do dictionary-driven path pruning,
00290   // so paths with non-empty dawg_info are considered prunable.
00291   inline bool PrunablePath(const ViterbiStateEntry &vse) {
00292     if (vse.top_choice_flags) return false;
00293     if (vse.dawg_info != NULL &&
00294         (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
00295          vse.dawg_info->permuter == USER_DAWG_PERM ||
00296          vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
00297     return true;
00298   }
00299 
00300   // Returns true if the given ViterbiStateEntry represents an acceptable path.
00301   inline bool AcceptablePath(const ViterbiStateEntry &vse) {
00302     return (vse.dawg_info != NULL || vse.Consistent() ||
00303             (vse.ngram_info != NULL && !vse.ngram_info->pruned));
00304   }
00305 
00306  public:
00307   // Parameters.
00308   INT_VAR_H(language_model_debug_level, 0, "Language model debug level");
00309   BOOL_VAR_H(language_model_ngram_on, false,
00310              "Turn on/off the use of character ngram model");
00311   INT_VAR_H(language_model_ngram_order, 8,
00312             "Maximum order of the character ngram model");
00313   INT_VAR_H(language_model_viterbi_list_max_num_prunable, 10,
00314             "Maximum number of prunable (those for which PrunablePath() is"
00315             " true) entries in each viterbi list recorded in BLOB_CHOICEs");
00316   INT_VAR_H(language_model_viterbi_list_max_size, 500,
00317             "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
00318   double_VAR_H(language_model_ngram_small_prob, 0.000001,
00319                "To avoid overly small denominators use this as the floor"
00320                " of the probability returned by the ngram model");
00321   double_VAR_H(language_model_ngram_nonmatch_score, -40.0,
00322                "Average classifier score of a non-matching unichar");
00323   BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step, false,
00324              "Use only the first UTF8 step of the given string"
00325              " when computing log probabilities");
00326   double_VAR_H(language_model_ngram_scale_factor, 0.03,
00327                "Strength of the character ngram model relative to the"
00328                " character classifier ");
00329   double_VAR_H(language_model_ngram_rating_factor, 16.0,
00330                "Factor to bring log-probs into the same range as ratings"
00331                " when multiplied by outline length ");
00332   BOOL_VAR_H(language_model_ngram_space_delimited_language, true,
00333              "Words are delimited by space");
00334   INT_VAR_H(language_model_min_compound_length, 3,
00335             "Minimum length of compound words");
00336   // Penalties used for adjusting path costs and final word rating.
00337   double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1,
00338                "Penalty for words not in the frequent word dictionary");
00339   double_VAR_H(language_model_penalty_non_dict_word, 0.15,
00340                "Penalty for non-dictionary words");
00341   double_VAR_H(language_model_penalty_punc, 0.2,
00342                "Penalty for inconsistent punctuation");
00343   double_VAR_H(language_model_penalty_case, 0.1,
00344                "Penalty for inconsistent case");
00345   double_VAR_H(language_model_penalty_script, 0.5,
00346                "Penalty for inconsistent script");
00347   double_VAR_H(language_model_penalty_chartype, 0.3,
00348                "Penalty for inconsistent character type");
00349   double_VAR_H(language_model_penalty_font, 0.00,
00350                "Penalty for inconsistent font");
00351   double_VAR_H(language_model_penalty_spacing, 0.05,
00352                "Penalty for inconsistent spacing");
00353   double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
00354   INT_VAR_H(wordrec_display_segmentations, 0, "Display Segmentations");
00355   BOOL_VAR_H(language_model_use_sigmoidal_certainty, false,
00356              "Use sigmoidal score for certainty");
00357 
00358 
00359  protected:
00360   // Member Variables.
00361 
00362   // Temporary DawgArgs struct that is re-used across different words to
00363   // avoid dynamic memory re-allocation (should be cleared before each use).
00364   DawgArgs *dawg_args_;
00365   // Scaling for recovering blob outline length from rating and certainty.
00366   float rating_cert_scale_;
00367 
00368   // The following variables are set at construction time.
00369 
00370   // Pointer to fontinfo table (not owned by LanguageModel).
00371   const UnicityTable<FontInfo> *fontinfo_table_;
00372 
00373   // Pointer to Dict class, that is used for querying the dictionaries
00374   // (the pointer is not owned by LanguageModel).
00375   Dict *dict_;
00376 
00377   // TODO(daria): the following variables should become LanguageModel params
00378   // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
00379   //
00380   // Set to true if we are dealing with fixed pitch text
00381   // (set to assume_fixed_pitch_char_segment).
00382   bool fixed_pitch_;
00383   // Max char width-to-height ratio allowed
00384   // (set to segsearch_max_char_wh_ratio).
00385   float max_char_wh_ratio_;
00386 
00387   // The following variables are initialized with InitForWord().
00388 
00389   // String representation of the classification of the previous word
00390   // (since this is only used by the character ngram model component,
00391   // only the last language_model_ngram_order of the word are stored).
00392   STRING prev_word_str_;
00393   int prev_word_unichar_step_len_;
00394   // Active dawg vector.
00395   DawgPositionVector *very_beginning_active_dawgs_;  // includes continuation
00396   DawgPositionVector *beginning_active_dawgs_;
00397   // Set to true if acceptable choice was discovered.
00398   // Note: it would be nice to use this to terminate the search once an
00399   // acceptable choices is found. However we do not do that and once an
00400   // acceptable choice is found we finish looking for alternative choices
00401   // in the current segmentation graph and then exit the search (no more
00402   // classifications are done after an acceptable choice is found).
00403   // This is needed in order to let the search find the words very close to
00404   // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
00405   // choices. This way the stopper will know that the best choice is not
00406   // ambiguous (i.e. there are best choices in the best choice list that have
00407   // ratings close to the very best one) and will be less likely to mis-adapt.
00408   bool acceptable_choice_found_;
00409   // Set to true if a choice representing correct segmentation was explored.
00410   bool correct_segmentation_explored_;
00411 
00412   // Params models containing weights for for computing ViterbiStateEntry costs.
00413   ParamsModel params_model_;
00414 };
00415 
00416 }  // namespace tesseract
00417 
00418 #endif  // TESSERACT_WORDREC_LANGUAGE_MODEL_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines