tesseract 3.04.01

wordrec/language_model.cpp

Go to the documentation of this file.
00001 
00002 // File:        language_model.cpp
00003 // Description: Functions that utilize the knowledge about the properties,
00004 //              structure and statistics of the language to help recognition.
00005 // Author:      Daria Antonova
00006 // Created:     Mon Nov 11 11:26:43 PST 2009
00007 //
00008 // (C) Copyright 2009, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #include <math.h>
00022 
00023 #include "language_model.h"
00024 
00025 #include "dawg.h"
00026 #include "freelist.h"
00027 #include "intproto.h"
00028 #include "helpers.h"
00029 #include "lm_state.h"
00030 #include "lm_pain_points.h"
00031 #include "matrix.h"
00032 #include "params.h"
00033 #include "params_training_featdef.h"
00034 
00035 #if defined(_MSC_VER) || defined(ANDROID)
00036 double log2(double n) {
00037   return log(n) / log(2.0);
00038 }
00039 #endif  // _MSC_VER
00040 
00041 namespace tesseract {
00042 
00043 const float LanguageModel::kMaxAvgNgramCost = 25.0f;
00044 
00045 LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table,
00046                              Dict *dict)
00047   : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
00048                dict->getCCUtil()->params()),
00049     BOOL_INIT_MEMBER(language_model_ngram_on, false,
00050                      "Turn on/off the use of character ngram model",
00051                      dict->getCCUtil()->params()),
00052     INT_MEMBER(language_model_ngram_order, 8,
00053                "Maximum order of the character ngram model",
00054                dict->getCCUtil()->params()),
00055     INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
00056                "Maximum number of prunable (those for which"
00057                " PrunablePath() is true) entries in each viterbi list"
00058                " recorded in BLOB_CHOICEs",
00059                dict->getCCUtil()->params()),
00060     INT_MEMBER(language_model_viterbi_list_max_size, 500,
00061                "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
00062                dict->getCCUtil()->params()),
00063     double_MEMBER(language_model_ngram_small_prob, 0.000001,
00064                   "To avoid overly small denominators use this as the "
00065                   "floor of the probability returned by the ngram model.",
00066                   dict->getCCUtil()->params()),
00067     double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
00068                   "Average classifier score of a non-matching unichar.",
00069                   dict->getCCUtil()->params()),
00070     BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
00071                 "Use only the first UTF8 step of the given string"
00072                 " when computing log probabilities.",
00073                 dict->getCCUtil()->params()),
00074     double_MEMBER(language_model_ngram_scale_factor, 0.03,
00075                   "Strength of the character ngram model relative to the"
00076                   " character classifier ",
00077                   dict->getCCUtil()->params()),
00078     double_MEMBER(language_model_ngram_rating_factor, 16.0,
00079                   "Factor to bring log-probs into the same range as ratings"
00080                   " when multiplied by outline length ",
00081                   dict->getCCUtil()->params()),
00082     BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
00083                 "Words are delimited by space",
00084                 dict->getCCUtil()->params()),
00085     INT_MEMBER(language_model_min_compound_length, 3,
00086                "Minimum length of compound words",
00087                dict->getCCUtil()->params()),
00088     double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
00089                   "Penalty for words not in the frequent word dictionary",
00090                   dict->getCCUtil()->params()),
00091     double_MEMBER(language_model_penalty_non_dict_word, 0.15,
00092                   "Penalty for non-dictionary words",
00093                   dict->getCCUtil()->params()),
00094     double_MEMBER(language_model_penalty_punc, 0.2,
00095                   "Penalty for inconsistent punctuation",
00096                   dict->getCCUtil()->params()),
00097     double_MEMBER(language_model_penalty_case, 0.1,
00098                   "Penalty for inconsistent case",
00099                   dict->getCCUtil()->params()),
00100     double_MEMBER(language_model_penalty_script, 0.5,
00101                   "Penalty for inconsistent script",
00102                   dict->getCCUtil()->params()),
00103     double_MEMBER(language_model_penalty_chartype, 0.3,
00104                   "Penalty for inconsistent character type",
00105                   dict->getCCUtil()->params()),
00106     // TODO(daria, rays): enable font consistency checking
00107     // after improving font analysis.
00108     double_MEMBER(language_model_penalty_font, 0.00,
00109                   "Penalty for inconsistent font",
00110                   dict->getCCUtil()->params()),
00111     double_MEMBER(language_model_penalty_spacing, 0.05,
00112                   "Penalty for inconsistent spacing",
00113                   dict->getCCUtil()->params()),
00114     double_MEMBER(language_model_penalty_increment, 0.01,
00115                   "Penalty increment",
00116                   dict->getCCUtil()->params()),
00117     INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
00118                dict->getCCUtil()->params()),
00119     BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
00120                      "Use sigmoidal score for certainty",
00121                      dict->getCCUtil()->params()),
00122   fontinfo_table_(fontinfo_table), dict_(dict),
00123   fixed_pitch_(false), max_char_wh_ratio_(0.0),
00124   acceptable_choice_found_(false) {
00125   ASSERT_HOST(dict_ != NULL);
00126   dawg_args_ = new DawgArgs(NULL, new DawgPositionVector(), NO_PERM);
00127   very_beginning_active_dawgs_ = new DawgPositionVector();
00128   beginning_active_dawgs_ = new DawgPositionVector();
00129 }
00130 
00131 LanguageModel::~LanguageModel() {
00132   delete very_beginning_active_dawgs_;
00133   delete beginning_active_dawgs_;
00134   delete dawg_args_->updated_dawgs;
00135   delete dawg_args_;
00136 }
00137 
00138 void LanguageModel::InitForWord(const WERD_CHOICE *prev_word,
00139                                 bool fixed_pitch, float max_char_wh_ratio,
00140                                 float rating_cert_scale) {
00141   fixed_pitch_ = fixed_pitch;
00142   max_char_wh_ratio_ = max_char_wh_ratio;
00143   rating_cert_scale_ = rating_cert_scale;
00144   acceptable_choice_found_ = false;
00145   correct_segmentation_explored_ = false;
00146 
00147   // Initialize vectors with beginning DawgInfos.
00148   very_beginning_active_dawgs_->clear();
00149   dict_->init_active_dawgs(very_beginning_active_dawgs_, false);
00150   beginning_active_dawgs_->clear();
00151   dict_->default_dawgs(beginning_active_dawgs_, false);
00152 
00153   // Fill prev_word_str_ with the last language_model_ngram_order
00154   // unichars from prev_word.
00155   if (language_model_ngram_on) {
00156     if (prev_word != NULL && prev_word->unichar_string() != NULL) {
00157       prev_word_str_ = prev_word->unichar_string();
00158       if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
00159     } else {
00160       prev_word_str_ = " ";
00161     }
00162     const char *str_ptr = prev_word_str_.string();
00163     const char *str_end = str_ptr + prev_word_str_.length();
00164     int step;
00165     prev_word_unichar_step_len_ = 0;
00166     while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
00167       str_ptr += step;
00168       ++prev_word_unichar_step_len_;
00169     }
00170     ASSERT_HOST(str_ptr == str_end);
00171   }
00172 }
00173 
00178 static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
00179                                   LanguageModelState* parent_node) {
00180   if (parent_node == NULL) return;
00181   ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
00182   for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
00183     ViterbiStateEntry* vse = vit.data();
00184     vse->competing_vse = NULL;
00185     UNICHAR_ID unichar_id = vse->curr_b->unichar_id();
00186     if (unicharset.get_isupper(unichar_id) ||
00187         unicharset.get_islower(unichar_id)) {
00188       UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);
00189       if (other_case == unichar_id) continue;  // Not in unicharset.
00190       // Find other case in same list. There could be multiple entries with
00191       // the same unichar_id, but in theory, they should all point to the
00192       // same BLOB_CHOICE, and that is what we will be using to decide
00193       // which to keep.
00194       ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);
00195       for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&
00196            vit2.data()->curr_b->unichar_id() != other_case;
00197            vit2.forward()) {}
00198       if (!vit2.cycled_list()) {
00199         vse->competing_vse = vit2.data();
00200       }
00201     }
00202   }
00203 }
00204 
00209 static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
00210                                  const BLOB_CHOICE* choice,
00211                                  BLOB_CHOICE_LIST* choices) {
00212   UNICHAR_ID choice_id = choice->unichar_id();
00213   UNICHAR_ID other_case = unicharset.get_other_case(choice_id);
00214   if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)
00215     return false;  // Not upper or lower or not in unicharset.
00216   if (unicharset.SizesDistinct(choice_id, other_case))
00217     return false;  // Can be separated by size.
00218   BLOB_CHOICE_IT bc_it(choices);
00219   for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
00220     BLOB_CHOICE* better_choice = bc_it.data();
00221     if (better_choice->unichar_id() == other_case)
00222       return true;  // Found an earlier instance of other_case.
00223     else if (better_choice == choice)
00224       return false;  // Reached the original choice.
00225   }
00226   return false;  // Should never happen, but just in case.
00227 }
00228 
00255 bool LanguageModel::UpdateState(
00256     bool just_classified,
00257     int curr_col, int curr_row,
00258     BLOB_CHOICE_LIST *curr_list,
00259     LanguageModelState *parent_node,
00260     LMPainPoints *pain_points,
00261     WERD_RES *word_res,
00262     BestChoiceBundle *best_choice_bundle,
00263     BlamerBundle *blamer_bundle) {
00264   if (language_model_debug_level > 0) {
00265     tprintf("\nUpdateState: col=%d row=%d %s",
00266             curr_col, curr_row, just_classified ? "just_classified" : "");
00267     if (language_model_debug_level > 5)
00268       tprintf("(parent=%p)\n", parent_node);
00269     else
00270       tprintf("\n");
00271   }
00272   // Initialize helper variables.
00273   bool word_end = (curr_row+1 >= word_res->ratings->dimension());
00274   bool new_changed = false;
00275   float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
00276   const UNICHARSET& unicharset = dict_->getUnicharset();
00277   BLOB_CHOICE *first_lower = NULL;
00278   BLOB_CHOICE *first_upper = NULL;
00279   BLOB_CHOICE *first_digit = NULL;
00280   bool has_alnum_mix = false;
00281   if (parent_node != NULL) {
00282     int result = SetTopParentLowerUpperDigit(parent_node);
00283     if (result < 0) {
00284       if (language_model_debug_level > 0)
00285         tprintf("No parents found to process\n");
00286       return false;
00287     }
00288     if (result > 0)
00289       has_alnum_mix = true;
00290   }
00291   if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
00292                              &first_digit))
00293     has_alnum_mix = false;;
00294   ScanParentsForCaseMix(unicharset, parent_node);
00295   if (language_model_debug_level > 3 && parent_node != NULL) {
00296     parent_node->Print("Parent viterbi list");
00297   }
00298   LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
00299 
00300   // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
00301   ViterbiStateEntry_IT vit;
00302   BLOB_CHOICE_IT c_it(curr_list);
00303   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
00304     BLOB_CHOICE* choice = c_it.data();
00305     // TODO(antonova): make sure commenting this out if ok for ngram
00306     // model scoring (I think this was introduced to fix ngram model quirks).
00307     // Skip NULL unichars unless it is the only choice.
00308     //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
00309     UNICHAR_ID unichar_id = choice->unichar_id();
00310     if (unicharset.get_fragment(unichar_id)) {
00311       continue;  // Skip fragments.
00312     }
00313     // Set top choice flags.
00314     LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
00315     if (c_it.at_first() || !new_changed)
00316       blob_choice_flags |= kSmallestRatingFlag;
00317     if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
00318     if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
00319     if (first_digit == choice) blob_choice_flags |= kDigitFlag;
00320 
00321     if (parent_node == NULL) {
00322       // Process the beginning of a word.
00323       // If there is a better case variant that is not distinguished by size,
00324       // skip this blob choice, as we have no choice but to accept the result
00325       // of the character classifier to distinguish between them, even if
00326       // followed by an upper case.
00327       // With words like iPoc, and other CamelBackWords, the lower-upper
00328       // transition can only be achieved if the classifier has the correct case
00329       // as the top choice, and leaving an initial I lower down the list
00330       // increases the chances of choosing IPoc simply because it doesn't
00331       // include such a transition. iPoc will beat iPOC and ipoc because
00332       // the other words are baseline/x-height inconsistent.
00333       if (HasBetterCaseVariant(unicharset, choice, curr_list))
00334         continue;
00335       // Upper counts as lower at the beginning of a word.
00336       if (blob_choice_flags & kUpperCaseFlag)
00337         blob_choice_flags |= kLowerCaseFlag;
00338       new_changed |= AddViterbiStateEntry(
00339           blob_choice_flags, denom, word_end, curr_col, curr_row,
00340           choice, curr_state, NULL, pain_points,
00341           word_res, best_choice_bundle, blamer_bundle);
00342     } else {
00343       // Get viterbi entries from each parent ViterbiStateEntry.
00344       vit.set_to_list(&parent_node->viterbi_state_entries);
00345       int vit_counter = 0;
00346       vit.mark_cycle_pt();
00347       ViterbiStateEntry* parent_vse = NULL;
00348       LanguageModelFlagsType top_choice_flags;
00349       while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
00350                                             c_it.data(), blob_choice_flags,
00351                                             unicharset, word_res, &vit,
00352                                             &top_choice_flags)) != NULL) {
00353         // Skip pruned entries and do not look at prunable entries if already
00354         // examined language_model_viterbi_list_max_num_prunable of those.
00355         if (PrunablePath(*parent_vse) &&
00356             (++vit_counter > language_model_viterbi_list_max_num_prunable ||
00357              (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
00358           continue;
00359         }
00360         // If the parent has no alnum choice, (ie choice is the first in a
00361         // string of alnum), and there is a better case variant that is not
00362         // distinguished by size, skip this blob choice/parent, as with the
00363         // initial blob treatment above.
00364         if (!parent_vse->HasAlnumChoice(unicharset) &&
00365             HasBetterCaseVariant(unicharset, choice, curr_list))
00366           continue;
00367         // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
00368         // looks good according to the Dawgs or character ngram model.
00369         new_changed |= AddViterbiStateEntry(
00370             top_choice_flags, denom, word_end, curr_col, curr_row,
00371             c_it.data(), curr_state, parent_vse, pain_points,
00372             word_res, best_choice_bundle, blamer_bundle);
00373       }
00374     }
00375   }
00376   return new_changed;
00377 }
00378 
00385 bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
00386                                           BLOB_CHOICE **first_lower,
00387                                           BLOB_CHOICE **first_upper,
00388                                           BLOB_CHOICE **first_digit) const {
00389   BLOB_CHOICE_IT c_it(curr_list);
00390   const UNICHARSET &unicharset = dict_->getUnicharset();
00391   BLOB_CHOICE *first_unichar = NULL;
00392   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
00393     UNICHAR_ID unichar_id = c_it.data()->unichar_id();
00394     if (unicharset.get_fragment(unichar_id)) continue;  // skip fragments
00395     if (first_unichar == NULL) first_unichar = c_it.data();
00396     if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
00397       *first_lower = c_it.data();
00398     }
00399     if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) &&
00400         !unicharset.get_islower(unichar_id)) {
00401       *first_upper = c_it.data();
00402     }
00403     if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) {
00404       *first_digit = c_it.data();
00405     }
00406   }
00407   ASSERT_HOST(first_unichar != NULL);
00408   bool mixed = (*first_lower != NULL || *first_upper != NULL) &&
00409       *first_digit != NULL;
00410   if (*first_lower == NULL) *first_lower = first_unichar;
00411   if (*first_upper == NULL) *first_upper = first_unichar;
00412   if (*first_digit == NULL) *first_digit = first_unichar;
00413   return mixed;
00414 }
00415 
00425 int LanguageModel::SetTopParentLowerUpperDigit(
00426     LanguageModelState *parent_node) const {
00427   if (parent_node == NULL) return -1;
00428   UNICHAR_ID top_id = INVALID_UNICHAR_ID;
00429   ViterbiStateEntry* top_lower = NULL;
00430   ViterbiStateEntry* top_upper = NULL;
00431   ViterbiStateEntry* top_digit = NULL;
00432   ViterbiStateEntry* top_choice = NULL;
00433   float lower_rating = 0.0f;
00434   float upper_rating = 0.0f;
00435   float digit_rating = 0.0f;
00436   float top_rating = 0.0f;
00437   const UNICHARSET &unicharset = dict_->getUnicharset();
00438   ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
00439   for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
00440     ViterbiStateEntry* vse = vit.data();
00441     // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
00442     // back to the real character if needed.
00443     ViterbiStateEntry* unichar_vse = vse;
00444     UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
00445     float rating = unichar_vse->curr_b->rating();
00446     while (unichar_id == INVALID_UNICHAR_ID &&
00447            unichar_vse->parent_vse != NULL) {
00448       unichar_vse = unichar_vse->parent_vse;
00449       unichar_id = unichar_vse->curr_b->unichar_id();
00450       rating = unichar_vse->curr_b->rating();
00451     }
00452     if (unichar_id != INVALID_UNICHAR_ID) {
00453       if (unicharset.get_islower(unichar_id)) {
00454         if (top_lower == NULL || lower_rating > rating) {
00455           top_lower = vse;
00456           lower_rating = rating;
00457         }
00458       } else if (unicharset.get_isalpha(unichar_id)) {
00459         if (top_upper == NULL || upper_rating > rating) {
00460           top_upper = vse;
00461           upper_rating = rating;
00462         }
00463       } else if (unicharset.get_isdigit(unichar_id)) {
00464         if (top_digit == NULL || digit_rating > rating) {
00465           top_digit = vse;
00466           digit_rating = rating;
00467         }
00468       }
00469     }
00470     if (top_choice == NULL || top_rating > rating) {
00471       top_choice = vse;
00472       top_rating = rating;
00473       top_id = unichar_id;
00474     }
00475   }
00476   if (top_choice == NULL) return -1;
00477   bool mixed = (top_lower != NULL || top_upper != NULL) &&
00478       top_digit != NULL;
00479   if (top_lower == NULL) top_lower = top_choice;
00480   top_lower->top_choice_flags |= kLowerCaseFlag;
00481   if (top_upper == NULL) top_upper = top_choice;
00482   top_upper->top_choice_flags |= kUpperCaseFlag;
00483   if (top_digit == NULL) top_digit = top_choice;
00484   top_digit->top_choice_flags |= kDigitFlag;
00485   top_choice->top_choice_flags |= kSmallestRatingFlag;
00486   if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
00487       (top_choice->top_choice_flags &
00488           (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {
00489     // If the compound marker top choice carries any of the top alnum flags,
00490     // then give it all of them, allowing words like I-295 to be chosen.
00491     top_choice->top_choice_flags |=
00492         kLowerCaseFlag | kUpperCaseFlag | kDigitFlag;
00493   }
00494   return mixed ? 1 : 0;
00495 }
00496 
00502 ViterbiStateEntry* LanguageModel::GetNextParentVSE(
00503     bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
00504     LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,
00505     WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
00506     LanguageModelFlagsType* top_choice_flags) const {
00507   for (; !vse_it->cycled_list(); vse_it->forward()) {
00508     ViterbiStateEntry* parent_vse = vse_it->data();
00509     // Only consider the parent if it has been updated or
00510     // if the current ratings cell has just been classified.
00511     if (!just_classified && !parent_vse->updated) continue;
00512     if (language_model_debug_level > 2)
00513       parent_vse->Print("Considering");
00514     // If the parent is non-alnum, then upper counts as lower.
00515     *top_choice_flags = blob_choice_flags;
00516     if ((blob_choice_flags & kUpperCaseFlag) &&
00517         !parent_vse->HasAlnumChoice(unicharset)) {
00518       *top_choice_flags |= kLowerCaseFlag;
00519     }
00520     *top_choice_flags &= parent_vse->top_choice_flags;
00521     UNICHAR_ID unichar_id = bc->unichar_id();
00522     const BLOB_CHOICE* parent_b = parent_vse->curr_b;
00523     UNICHAR_ID parent_id = parent_b->unichar_id();
00524     // Digits do not bind to alphas if there is a mix in both parent and current
00525     // or if the alpha is not the top choice.
00526     if (unicharset.get_isdigit(unichar_id) &&
00527         unicharset.get_isalpha(parent_id) &&
00528         (mixed_alnum || *top_choice_flags == 0))
00529       continue;  // Digits don't bind to alphas.
00530     // Likewise alphas do not bind to digits if there is a mix in both or if
00531     // the digit is not the top choice.
00532     if (unicharset.get_isalpha(unichar_id) &&
00533         unicharset.get_isdigit(parent_id) &&
00534         (mixed_alnum || *top_choice_flags == 0))
00535       continue;  // Alphas don't bind to digits.
00536     // If there is a case mix of the same alpha in the parent list, then
00537     // competing_vse is non-null and will be used to determine whether
00538     // or not to bind the current blob choice.
00539     if (parent_vse->competing_vse != NULL) {
00540       const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
00541       UNICHAR_ID other_id = competing_b->unichar_id();
00542       if (language_model_debug_level >= 5) {
00543         tprintf("Parent %s has competition %s\n",
00544                 unicharset.id_to_unichar(parent_id),
00545                 unicharset.id_to_unichar(other_id));
00546       }
00547       if (unicharset.SizesDistinct(parent_id, other_id)) {
00548         // If other_id matches bc wrt position and size, and parent_id, doesn't,
00549         // don't bind to the current parent.
00550         if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
00551                                 language_model_debug_level >= 5) &&
00552             !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
00553                                 language_model_debug_level >= 5))
00554           continue;  // Competing blobchoice has a better vertical match.
00555       }
00556     }
00557     vse_it->forward();
00558     return parent_vse;  // This one is good!
00559   }
00560   return NULL;  // Ran out of possibilities.
00561 }
00562 
00563 bool LanguageModel::AddViterbiStateEntry(
00564     LanguageModelFlagsType top_choice_flags,
00565     float denom,
00566     bool word_end,
00567     int curr_col, int curr_row,
00568     BLOB_CHOICE *b,
00569     LanguageModelState *curr_state,
00570     ViterbiStateEntry *parent_vse,
00571     LMPainPoints *pain_points,
00572     WERD_RES *word_res,
00573     BestChoiceBundle *best_choice_bundle,
00574     BlamerBundle *blamer_bundle) {
00575   ViterbiStateEntry_IT vit;
00576   if (language_model_debug_level > 1) {
00577     tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
00578             " certainty=%.4f top_choice_flags=0x%x",
00579             dict_->getUnicharset().id_to_unichar(b->unichar_id()),
00580             b->rating(), b->certainty(), top_choice_flags);
00581     if (language_model_debug_level > 5)
00582       tprintf(" parent_vse=%p\n", parent_vse);
00583     else
00584       tprintf("\n");
00585   }
00586   // Check whether the list is full.
00587   if (curr_state != NULL &&
00588       curr_state->viterbi_state_entries_length >=
00589           language_model_viterbi_list_max_size) {
00590     if (language_model_debug_level > 1) {
00591       tprintf("AddViterbiStateEntry: viterbi list is full!\n");
00592     }
00593     return false;
00594   }
00595 
00596   // Invoke Dawg language model component.
00597   LanguageModelDawgInfo *dawg_info =
00598     GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
00599 
00600   float outline_length =
00601       AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);
00602   // Invoke Ngram language model component.
00603   LanguageModelNgramInfo *ngram_info = NULL;
00604   if (language_model_ngram_on) {
00605     ngram_info = GenerateNgramInfo(
00606         dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
00607         denom, curr_col, curr_row, outline_length, parent_vse);
00608     ASSERT_HOST(ngram_info != NULL);
00609   }
00610   bool liked_by_language_model = dawg_info != NULL ||
00611       (ngram_info != NULL && !ngram_info->pruned);
00612   // Quick escape if not liked by the language model, can't be consistent
00613   // xheight, and not top choice.
00614   if (!liked_by_language_model && top_choice_flags == 0) {
00615     if (language_model_debug_level > 1) {
00616       tprintf("Language model components very early pruned this entry\n");
00617     }
00618     delete ngram_info;
00619     delete dawg_info;
00620     return false;
00621   }
00622 
00623   // Check consistency of the path and set the relevant consistency_info.
00624   LMConsistencyInfo consistency_info(
00625     parent_vse != NULL ? &parent_vse->consistency_info : NULL);
00626   // Start with just the x-height consistency, as it provides significant
00627   // pruning opportunity.
00628   consistency_info.ComputeXheightConsistency(
00629       b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));
00630   // Turn off xheight consistent flag if not consistent.
00631   if (consistency_info.InconsistentXHeight()) {
00632     top_choice_flags &= ~kXhtConsistentFlag;
00633   }
00634 
00635   // Quick escape if not liked by the language model, not consistent xheight,
00636   // and not top choice.
00637   if (!liked_by_language_model && top_choice_flags == 0) {
00638     if (language_model_debug_level > 1) {
00639       tprintf("Language model components early pruned this entry\n");
00640     }
00641     delete ngram_info;
00642     delete dawg_info;
00643     return false;
00644   }
00645 
00646   // Compute the rest of the consistency info.
00647   FillConsistencyInfo(curr_col, word_end, b, parent_vse,
00648                       word_res, &consistency_info);
00649   if (dawg_info != NULL && consistency_info.invalid_punc) {
00650     consistency_info.invalid_punc = false;  // do not penalize dict words
00651   }
00652 
00653   // Compute cost of associating the blobs that represent the current unichar.
00654   AssociateStats associate_stats;
00655   ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
00656                         parent_vse, word_res, &associate_stats);
00657   if (parent_vse != NULL) {
00658     associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
00659     associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
00660   }
00661 
00662   // Create the new ViterbiStateEntry compute the adjusted cost of the path.
00663   ViterbiStateEntry *new_vse = new ViterbiStateEntry(
00664       parent_vse, b, 0.0, outline_length,
00665       consistency_info, associate_stats, top_choice_flags, dawg_info,
00666       ngram_info, (language_model_debug_level > 0) ?
00667           dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
00668   new_vse->cost = ComputeAdjustedPathCost(new_vse);
00669   if (language_model_debug_level >= 3)
00670     tprintf("Adjusted cost = %g\n", new_vse->cost);
00671 
00672   // Invoke Top Choice language model component to make the final adjustments
00673   // to new_vse->top_choice_flags.
00674   if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
00675     GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
00676   }
00677 
00678   // If language model components did not like this unichar - return.
00679   bool keep = new_vse->top_choice_flags || liked_by_language_model;
00680   if (!(top_choice_flags & kSmallestRatingFlag) &&  // no non-top choice paths
00681       consistency_info.inconsistent_script) {       // with inconsistent script
00682     keep = false;
00683   }
00684   if (!keep) {
00685     if (language_model_debug_level > 1) {
00686       tprintf("Language model components did not like this entry\n");
00687     }
00688     delete new_vse;
00689     return false;
00690   }
00691 
00692   // Discard this entry if it represents a prunable path and
00693   // language_model_viterbi_list_max_num_prunable such entries with a lower
00694   // cost have already been recorded.
00695   if (PrunablePath(*new_vse) &&
00696       (curr_state->viterbi_state_entries_prunable_length >=
00697        language_model_viterbi_list_max_num_prunable) &&
00698       new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
00699     if (language_model_debug_level > 1) {
00700       tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
00701               new_vse->cost,
00702               curr_state->viterbi_state_entries_prunable_max_cost);
00703     }
00704     delete new_vse;
00705     return false;
00706   }
00707 
00708   // Update best choice if needed.
00709   if (word_end) {
00710     UpdateBestChoice(new_vse, pain_points, word_res,
00711                      best_choice_bundle, blamer_bundle);
00712     // Discard the entry if UpdateBestChoice() found flaws in it.
00713     if (new_vse->cost >= WERD_CHOICE::kBadRating &&
00714         new_vse != best_choice_bundle->best_vse) {
00715       if (language_model_debug_level > 1) {
00716         tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
00717       }
00718       delete new_vse;
00719       return false;
00720     }
00721   }
00722 
00723   // Add the new ViterbiStateEntry and to curr_state->viterbi_state_entries.
00724   curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
00725                                                false, new_vse);
00726   curr_state->viterbi_state_entries_length++;
00727   if (PrunablePath(*new_vse)) {
00728     curr_state->viterbi_state_entries_prunable_length++;
00729   }
00730 
00731   // Update lms->viterbi_state_entries_prunable_max_cost and clear
00732   // top_choice_flags of entries with ratings_sum than new_vse->ratings_sum.
00733   if ((curr_state->viterbi_state_entries_prunable_length >=
00734        language_model_viterbi_list_max_num_prunable) ||
00735       new_vse->top_choice_flags) {
00736     ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
00737     int prunable_counter = language_model_viterbi_list_max_num_prunable;
00738     vit.set_to_list(&(curr_state->viterbi_state_entries));
00739     for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
00740       ViterbiStateEntry *curr_vse = vit.data();
00741       // Clear the appropriate top choice flags of the entries in the
00742       // list that have cost higher thank new_entry->cost
00743       // (since they will not be top choices any more).
00744       if (curr_vse->top_choice_flags && curr_vse != new_vse &&
00745           curr_vse->cost > new_vse->cost) {
00746         curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
00747       }
00748       if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
00749       // Update curr_state->viterbi_state_entries_prunable_max_cost.
00750       if (prunable_counter == 0) {
00751         curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
00752         if (language_model_debug_level > 1) {
00753           tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
00754                   curr_state->viterbi_state_entries_prunable_max_cost);
00755         }
00756         prunable_counter = -1;  // stop counting
00757       }
00758     }
00759   }
00760 
00761   // Print the newly created ViterbiStateEntry.
00762   if (language_model_debug_level > 2) {
00763     new_vse->Print("New");
00764     if (language_model_debug_level > 5)
00765       curr_state->Print("Updated viterbi list");
00766   }
00767 
00768   return true;
00769 }
00770 
00771 void LanguageModel::GenerateTopChoiceInfo(ViterbiStateEntry *new_vse,
00772                                           const ViterbiStateEntry *parent_vse,
00773                                           LanguageModelState *lms) {
00774   ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
00775   for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
00776        new_vse->cost >= vit.data()->cost; vit.forward()) {
00777     // Clear the appropriate flags if the list already contains
00778     // a top choice entry with a lower cost.
00779     new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
00780   }
00781   if (language_model_debug_level > 2) {
00782     tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
00783             new_vse->top_choice_flags);
00784   }
00785 }
00786 
00787 LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
00788     bool word_end,
00789     int curr_col, int curr_row,
00790     const BLOB_CHOICE &b,
00791     const ViterbiStateEntry *parent_vse) {
00792   // Initialize active_dawgs from parent_vse if it is not NULL.
00793   // Otherwise use very_beginning_active_dawgs_.
00794   if (parent_vse == NULL) {
00795     dawg_args_->active_dawgs = very_beginning_active_dawgs_;
00796     dawg_args_->permuter = NO_PERM;
00797   } else {
00798     if (parent_vse->dawg_info == NULL) return NULL;  // not a dict word path
00799     dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
00800     dawg_args_->permuter = parent_vse->dawg_info->permuter;
00801   }
00802 
00803   // Deal with hyphenated words.
00804   if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
00805     if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
00806     return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
00807                                      COMPOUND_PERM);
00808   }
00809 
00810   // Deal with compound words.
00811   if (dict_->compound_marker(b.unichar_id()) &&
00812       (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
00813     if (language_model_debug_level > 0) tprintf("Found compound marker\n");
00814     // Do not allow compound operators at the beginning and end of the word.
00815     // Do not allow more than one compound operator per word.
00816     // Do not allow compounding of words with lengths shorter than
00817     // language_model_min_compound_length
00818     if (parent_vse == NULL || word_end ||
00819         dawg_args_->permuter == COMPOUND_PERM ||
00820         parent_vse->length < language_model_min_compound_length) return NULL;
00821 
00822     int i;
00823     // Check a that the path terminated before the current character is a word.
00824     bool has_word_ending = false;
00825     for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
00826       const DawgPosition &pos = (*parent_vse->dawg_info->active_dawgs)[i];
00827       const Dawg *pdawg = pos.dawg_index < 0
00828           ? NULL : dict_->GetDawg(pos.dawg_index);
00829       if (pdawg == NULL || pos.back_to_punc) continue;;
00830       if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
00831           pdawg->end_of_word(pos.dawg_ref)) {
00832         has_word_ending = true;
00833         break;
00834       }
00835     }
00836     if (!has_word_ending) return NULL;
00837 
00838     if (language_model_debug_level > 0) tprintf("Compound word found\n");
00839     return new LanguageModelDawgInfo(beginning_active_dawgs_, COMPOUND_PERM);
00840   }  // done dealing with compound words
00841 
00842   LanguageModelDawgInfo *dawg_info = NULL;
00843 
00844   // Call LetterIsOkay().
00845   // Use the normalized IDs so that all shapes of ' can be allowed in words
00846   // like don't.
00847   const GenericVector<UNICHAR_ID>& normed_ids =
00848       dict_->getUnicharset().normed_ids(b.unichar_id());
00849   DawgPositionVector tmp_active_dawgs;
00850   for (int i = 0; i < normed_ids.size(); ++i) {
00851     if (language_model_debug_level > 2)
00852       tprintf("Test Letter OK for unichar %d, normed %d\n",
00853               b.unichar_id(), normed_ids[i]);
00854     dict_->LetterIsOkay(dawg_args_, normed_ids[i],
00855                         word_end && i == normed_ids.size() - 1);
00856     if (dawg_args_->permuter == NO_PERM) {
00857       break;
00858     } else if (i < normed_ids.size() - 1) {
00859       tmp_active_dawgs = *dawg_args_->updated_dawgs;
00860       dawg_args_->active_dawgs = &tmp_active_dawgs;
00861     }
00862     if (language_model_debug_level > 2)
00863       tprintf("Letter was OK for unichar %d, normed %d\n",
00864               b.unichar_id(), normed_ids[i]);
00865   }
00866   dawg_args_->active_dawgs = NULL;
00867   if (dawg_args_->permuter != NO_PERM) {
00868     dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_dawgs,
00869                                           dawg_args_->permuter);
00870   } else if (language_model_debug_level > 3) {
00871     tprintf("Letter %s not OK!\n",
00872             dict_->getUnicharset().id_to_unichar(b.unichar_id()));
00873   }
00874 
00875   return dawg_info;
00876 }
00877 
00878 LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(
00879     const char *unichar, float certainty, float denom,
00880     int curr_col, int curr_row, float outline_length,
00881     const ViterbiStateEntry *parent_vse) {
00882   // Initialize parent context.
00883   const char *pcontext_ptr = "";
00884   int pcontext_unichar_step_len = 0;
00885   if (parent_vse == NULL) {
00886     pcontext_ptr = prev_word_str_.string();
00887     pcontext_unichar_step_len = prev_word_unichar_step_len_;
00888   } else {
00889     pcontext_ptr = parent_vse->ngram_info->context.string();
00890     pcontext_unichar_step_len =
00891       parent_vse->ngram_info->context_unichar_step_len;
00892   }
00893   // Compute p(unichar | parent context).
00894   int unichar_step_len = 0;
00895   bool pruned = false;
00896   float ngram_cost;
00897   float ngram_and_classifier_cost =
00898       ComputeNgramCost(unichar, certainty, denom,
00899                        pcontext_ptr, &unichar_step_len,
00900                        &pruned, &ngram_cost);
00901   // Normalize just the ngram_and_classifier_cost by outline_length.
00902   // The ngram_cost is used by the params_model, so it needs to be left as-is,
00903   // and the params model cost will be normalized by outline_length.
00904   ngram_and_classifier_cost *=
00905       outline_length / language_model_ngram_rating_factor;
00906   // Add the ngram_cost of the parent.
00907   if (parent_vse != NULL) {
00908     ngram_and_classifier_cost +=
00909         parent_vse->ngram_info->ngram_and_classifier_cost;
00910     ngram_cost += parent_vse->ngram_info->ngram_cost;
00911   }
00912 
00913   // Shorten parent context string by unichar_step_len unichars.
00914   int num_remove = (unichar_step_len + pcontext_unichar_step_len -
00915                     language_model_ngram_order);
00916   if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
00917   while (num_remove > 0 && *pcontext_ptr != '\0') {
00918     pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
00919     --num_remove;
00920   }
00921 
00922   // Decide whether to prune this ngram path and update changed accordingly.
00923   if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
00924 
00925   // Construct and return the new LanguageModelNgramInfo.
00926   LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
00927       pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
00928       ngram_and_classifier_cost);
00929   ngram_info->context += unichar;
00930   ngram_info->context_unichar_step_len += unichar_step_len;
00931   assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
00932   return ngram_info;
00933 }
00934 
00935 float LanguageModel::ComputeNgramCost(const char *unichar,
00936                                       float certainty,
00937                                       float denom,
00938                                       const char *context,
00939                                       int *unichar_step_len,
00940                                       bool *found_small_prob,
00941                                       float *ngram_cost) {
00942   const char *context_ptr = context;
00943   char *modified_context = NULL;
00944   char *modified_context_end = NULL;
00945   const char *unichar_ptr = unichar;
00946   const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
00947   float prob = 0.0f;
00948   int step = 0;
00949   while (unichar_ptr < unichar_end &&
00950          (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
00951     if (language_model_debug_level > 1) {
00952       tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
00953               dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
00954     }
00955     prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
00956     ++(*unichar_step_len);
00957     if (language_model_ngram_use_only_first_uft8_step) break;
00958     unichar_ptr += step;
00959     // If there are multiple UTF8 characters present in unichar, context is
00960     // updated to include the previously examined characters from str,
00961     // unless use_only_first_uft8_step is true.
00962     if (unichar_ptr < unichar_end) {
00963       if (modified_context == NULL) {
00964         int context_len = strlen(context);
00965         modified_context =
00966           new char[context_len + strlen(unichar_ptr) + step + 1];
00967         strncpy(modified_context, context, context_len);
00968         modified_context_end = modified_context + context_len;
00969         context_ptr = modified_context;
00970       }
00971       strncpy(modified_context_end, unichar_ptr - step, step);
00972       modified_context_end += step;
00973       *modified_context_end = '\0';
00974     }
00975   }
00976   prob /= static_cast<float>(*unichar_step_len);  // normalize
00977   if (prob < language_model_ngram_small_prob) {
00978     if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
00979     *found_small_prob = true;
00980     prob = language_model_ngram_small_prob;
00981   }
00982   *ngram_cost = -1.0*log2(prob);
00983   float ngram_and_classifier_cost =
00984       -1.0*log2(CertaintyScore(certainty)/denom) +
00985       *ngram_cost * language_model_ngram_scale_factor;
00986   if (language_model_debug_level > 1) {
00987     tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
00988             unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
00989             ngram_and_classifier_cost);
00990   }
00991   if (modified_context != NULL) delete[] modified_context;
00992   return ngram_and_classifier_cost;
00993 }
00994 
00995 float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
00996   if (curr_list->empty()) return 1.0f;
00997   float denom = 0.0f;
00998   int len = 0;
00999   BLOB_CHOICE_IT c_it(curr_list);
01000   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
01001     ASSERT_HOST(c_it.data() != NULL);
01002     ++len;
01003     denom += CertaintyScore(c_it.data()->certainty());
01004   }
01005   assert(len != 0);
01006   // The ideal situation would be to have the classifier scores for
01007   // classifying each position as each of the characters in the unicharset.
01008   // Since we can not do this because of speed, we add a very crude estimate
01009   // of what these scores for the "missing" classifications would sum up to.
01010   denom += (dict_->getUnicharset().size() - len) *
01011     CertaintyScore(language_model_ngram_nonmatch_score);
01012 
01013   return denom;
01014 }
01015 
01016 void LanguageModel::FillConsistencyInfo(
01017     int curr_col,
01018     bool word_end,
01019     BLOB_CHOICE *b,
01020     ViterbiStateEntry *parent_vse,
01021     WERD_RES *word_res,
01022     LMConsistencyInfo *consistency_info) {
01023   const UNICHARSET &unicharset = dict_->getUnicharset();
01024   UNICHAR_ID unichar_id = b->unichar_id();
01025   BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;
01026 
01027   // Check punctuation validity.
01028   if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
01029   if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
01030     if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
01031         (unicharset.get_isalpha(parent_b->unichar_id()) ||
01032          unicharset.get_isdigit(parent_b->unichar_id()))) {
01033       // reset punc_ref for compound words
01034       consistency_info->punc_ref = NO_EDGE;
01035     } else {
01036       bool is_apos = dict_->is_apostrophe(unichar_id);
01037       bool prev_is_numalpha = (parent_b != NULL &&
01038           (unicharset.get_isalpha(parent_b->unichar_id()) ||
01039            unicharset.get_isdigit(parent_b->unichar_id())));
01040       UNICHAR_ID pattern_unichar_id =
01041         (unicharset.get_isalpha(unichar_id) ||
01042          unicharset.get_isdigit(unichar_id) ||
01043          (is_apos && prev_is_numalpha)) ?
01044         Dawg::kPatternUnicharID : unichar_id;
01045       if (consistency_info->punc_ref == NO_EDGE ||
01046           pattern_unichar_id != Dawg::kPatternUnicharID ||
01047           dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
01048           Dawg::kPatternUnicharID) {
01049         NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),
01050                                               consistency_info->punc_ref);
01051         consistency_info->punc_ref =
01052           (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
01053               node, pattern_unichar_id, word_end) : NO_EDGE;
01054         if (consistency_info->punc_ref == NO_EDGE) {
01055           consistency_info->invalid_punc = true;
01056         }
01057       }
01058     }
01059   }
01060 
01061   // Update case related counters.
01062   if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
01063     // Reset counters if we are dealing with a compound word.
01064     consistency_info->num_lower = 0;
01065     consistency_info->num_non_first_upper = 0;
01066   }
01067   else if (unicharset.get_islower(unichar_id)) {
01068     consistency_info->num_lower++;
01069   } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
01070     if (unicharset.get_isupper(parent_b->unichar_id()) ||
01071         consistency_info->num_lower > 0 ||
01072         consistency_info->num_non_first_upper > 0) {
01073       consistency_info->num_non_first_upper++;
01074     }
01075   }
01076 
01077   // Initialize consistency_info->script_id (use script of unichar_id
01078   // if it is not Common, use script id recorded by the parent otherwise).
01079   // Set inconsistent_script to true if the script of the current unichar
01080   // is not consistent with that of the parent.
01081   consistency_info->script_id = unicharset.get_script(unichar_id);
01082   // Hiragana and Katakana can mix with Han.
01083   if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
01084     if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
01085          consistency_info->script_id == unicharset.hiragana_sid()) ||
01086         (unicharset.katakana_sid() != unicharset.null_sid() &&
01087          consistency_info->script_id == unicharset.katakana_sid())) {
01088       consistency_info->script_id = dict_->getUnicharset().han_sid();
01089     }
01090   }
01091 
01092   if (parent_vse != NULL &&
01093       (parent_vse->consistency_info.script_id !=
01094        dict_->getUnicharset().common_sid())) {
01095     int parent_script_id = parent_vse->consistency_info.script_id;
01096     // If script_id is Common, use script id of the parent instead.
01097     if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
01098       consistency_info->script_id = parent_script_id;
01099     }
01100     if (consistency_info->script_id != parent_script_id) {
01101       consistency_info->inconsistent_script = true;
01102     }
01103   }
01104 
01105   // Update chartype related counters.
01106   if (unicharset.get_isalpha(unichar_id)) {
01107     consistency_info->num_alphas++;
01108   } else if (unicharset.get_isdigit(unichar_id)) {
01109     consistency_info->num_digits++;
01110   } else if (!unicharset.get_ispunctuation(unichar_id)) {
01111     consistency_info->num_other++;
01112   }
01113 
01114   // Check font and spacing consistency.
01115   if (fontinfo_table_->size() > 0 && parent_b != NULL) {
01116     int fontinfo_id = -1;
01117     if (parent_b->fontinfo_id() == b->fontinfo_id() ||
01118         parent_b->fontinfo_id2() == b->fontinfo_id()) {
01119       fontinfo_id = b->fontinfo_id();
01120     } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
01121                 parent_b->fontinfo_id2() == b->fontinfo_id2()) {
01122       fontinfo_id = b->fontinfo_id2();
01123     }
01124     if(language_model_debug_level > 1) {
01125       tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
01126               (parent_b->fontinfo_id() >= 0) ?
01127                   fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
01128               (parent_b->fontinfo_id2() >= 0) ?
01129                   fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
01130               (b->fontinfo_id() >= 0) ?
01131                   fontinfo_table_->get(b->fontinfo_id()).name : "",
01132               (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
01133               (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
01134               fontinfo_id);
01135     }
01136     if (!word_res->blob_widths.empty()) {  // if we have widths/gaps info
01137       bool expected_gap_found = false;
01138       float expected_gap;
01139       int temp_gap;
01140       if (fontinfo_id >= 0) {  // found a common font
01141         ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
01142         if (fontinfo_table_->get(fontinfo_id).get_spacing(
01143             parent_b->unichar_id(), unichar_id, &temp_gap)) {
01144           expected_gap = temp_gap;
01145           expected_gap_found = true;
01146         }
01147       } else {
01148         consistency_info->inconsistent_font = true;
01149         // Get an average of the expected gaps in each font
01150         int num_addends = 0;
01151         expected_gap = 0;
01152         int temp_fid;
01153         for (int i = 0; i < 4; ++i) {
01154           if (i == 0) {
01155             temp_fid = parent_b->fontinfo_id();
01156           } else if (i == 1) {
01157             temp_fid = parent_b->fontinfo_id2();
01158           } else if (i == 2) {
01159             temp_fid = b->fontinfo_id();
01160           } else {
01161             temp_fid = b->fontinfo_id2();
01162           }
01163           ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
01164           if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
01165               parent_b->unichar_id(), unichar_id, &temp_gap)) {
01166             expected_gap += temp_gap;
01167             num_addends++;
01168           }
01169         }
01170         expected_gap_found = (num_addends > 0);
01171         if (num_addends > 0) {
01172           expected_gap /= static_cast<float>(num_addends);
01173         }
01174       }
01175       if (expected_gap_found) {
01176         float actual_gap =
01177             static_cast<float>(word_res->GetBlobsGap(curr_col-1));
01178         float gap_ratio = expected_gap / actual_gap;
01179         // TODO(rays) The gaps seem to be way off most of the time, saved by
01180         // the error here that the ratio was compared to 1/2, when it should
01181         // have been 0.5f. Find the source of the gaps discrepancy and put
01182         // the 0.5f here in place of 0.0f.
01183         // Test on 2476595.sj, pages 0 to 6. (In French.)
01184         if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
01185           consistency_info->num_inconsistent_spaces++;
01186         }
01187         if (language_model_debug_level > 1) {
01188           tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
01189                   unicharset.id_to_unichar(parent_b->unichar_id()),
01190                   parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
01191                   unichar_id, curr_col, expected_gap, actual_gap);
01192         }
01193       }
01194     }
01195   }
01196 }
01197 
01198 float LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse) {
01199   ASSERT_HOST(vse != NULL);
01200   if (params_model_.Initialized()) {
01201     float features[PTRAIN_NUM_FEATURE_TYPES];
01202     ExtractFeaturesFromPath(*vse, features);
01203     float cost = params_model_.ComputeCost(features);
01204     if (language_model_debug_level > 3) {
01205       tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
01206       if (language_model_debug_level >= 5) {
01207         for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
01208           tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
01209         }
01210       }
01211     }
01212     return cost * vse->outline_length;
01213   } else {
01214     float adjustment = 1.0f;
01215     if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
01216       adjustment += language_model_penalty_non_freq_dict_word;
01217     }
01218     if (vse->dawg_info == NULL) {
01219       adjustment += language_model_penalty_non_dict_word;
01220       if (vse->length > language_model_min_compound_length) {
01221         adjustment += ((vse->length - language_model_min_compound_length) *
01222             language_model_penalty_increment);
01223       }
01224     }
01225     if (vse->associate_stats.shape_cost > 0) {
01226       adjustment += vse->associate_stats.shape_cost /
01227           static_cast<float>(vse->length);
01228     }
01229     if (language_model_ngram_on) {
01230       ASSERT_HOST(vse->ngram_info != NULL);
01231       return vse->ngram_info->ngram_and_classifier_cost * adjustment;
01232     } else {
01233       adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
01234                                                  vse->consistency_info);
01235       return vse->ratings_sum * adjustment;
01236     }
01237   }
01238 }
01239 
01240 void LanguageModel::UpdateBestChoice(
01241     ViterbiStateEntry *vse,
01242     LMPainPoints *pain_points,
01243     WERD_RES *word_res,
01244     BestChoiceBundle *best_choice_bundle,
01245     BlamerBundle *blamer_bundle) {
01246   bool truth_path;
01247   WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
01248                                     blamer_bundle, &truth_path);
01249   ASSERT_HOST(word != NULL);
01250   if (dict_->stopper_debug_level >= 1) {
01251     STRING word_str;
01252     word->string_and_lengths(&word_str, NULL);
01253     vse->Print(word_str.string());
01254   }
01255   if (language_model_debug_level > 0) {
01256     word->print("UpdateBestChoice() constructed word");
01257   }
01258   // Record features from the current path if necessary.
01259   ParamsTrainingHypothesis curr_hyp;
01260   if (blamer_bundle != NULL) {
01261     if (vse->dawg_info != NULL) vse->dawg_info->permuter =
01262         static_cast<PermuterType>(word->permuter());
01263     ExtractFeaturesFromPath(*vse, curr_hyp.features);
01264     word->string_and_lengths(&(curr_hyp.str), NULL);
01265     curr_hyp.cost = vse->cost;  // record cost for error rate computations
01266     if (language_model_debug_level > 0) {
01267       tprintf("Raw features extracted from %s (cost=%g) [ ",
01268               curr_hyp.str.string(), curr_hyp.cost);
01269       for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
01270         tprintf("%g ", curr_hyp.features[deb_i]);
01271       }
01272       tprintf("]\n");
01273     }
01274     // Record the current hypothesis in params_training_bundle.
01275     blamer_bundle->AddHypothesis(curr_hyp);
01276     if (truth_path)
01277       blamer_bundle->UpdateBestRating(word->rating());
01278   }
01279   if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) {
01280     // The word was constructed solely for blamer_bundle->AddHypothesis, so
01281     // we no longer need it.
01282     delete word;
01283     return;
01284   }
01285   if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty())
01286     word->SetScriptPositions(false, word_res->chopped_word);
01287   // Update and log new raw_choice if needed.
01288   if (word_res->raw_choice == NULL ||
01289       word->rating() < word_res->raw_choice->rating()) {
01290     if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
01291       tprintf("Updated raw choice\n");
01292   }
01293   // Set the modified rating for best choice to vse->cost and log best choice.
01294   word->set_rating(vse->cost);
01295   // Call LogNewChoice() for best choice from Dict::adjust_word() since it
01296   // computes adjust_factor that is used by the adaption code (e.g. by
01297   // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
01298   // Note: the rating of the word is not adjusted.
01299   dict_->adjust_word(word, vse->dawg_info == NULL,
01300                      vse->consistency_info.xht_decision, 0.0,
01301                      false, language_model_debug_level > 0);
01302   // Hand ownership of the word over to the word_res.
01303   if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
01304                                     dict_->stopper_debug_level >= 1, word)) {
01305     // The word was so bad that it was deleted.
01306     return;
01307   }
01308   if (word_res->best_choice == word) {
01309     // Word was the new best.
01310     if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
01311         AcceptablePath(*vse)) {
01312       acceptable_choice_found_ = true;
01313     }
01314     // Update best_choice_bundle.
01315     best_choice_bundle->updated = true;
01316     best_choice_bundle->best_vse = vse;
01317     if (language_model_debug_level > 0) {
01318       tprintf("Updated best choice\n");
01319       word->print_state("New state ");
01320     }
01321     // Update hyphen state if we are dealing with a dictionary word.
01322     if (vse->dawg_info != NULL) {
01323       if (dict_->has_hyphen_end(*word)) {
01324         dict_->set_hyphen_word(*word, *(dawg_args_->active_dawgs));
01325       } else {
01326         dict_->reset_hyphen_vars(true);
01327       }
01328     }
01329 
01330     if (blamer_bundle != NULL) {
01331       blamer_bundle->set_best_choice_is_dict_and_top_choice(
01332           vse->dawg_info != NULL && vse->top_choice_flags);
01333     }
01334   }
01335   if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
01336     word->DisplaySegmentation(word_res->chopped_word);
01337   }
01338 }
01339 
01340 void LanguageModel::ExtractFeaturesFromPath(
01341     const ViterbiStateEntry &vse, float features[]) {
01342   memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
01343   // Record dictionary match info.
01344   int len = vse.length <= kMaxSmallWordUnichars ? 0 :
01345       vse.length <= kMaxMediumWordUnichars ? 1 : 2;
01346   if (vse.dawg_info != NULL) {
01347     int permuter = vse.dawg_info->permuter;
01348     if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
01349       if (vse.consistency_info.num_digits == vse.length) {
01350         features[PTRAIN_DIGITS_SHORT+len] = 1.0;
01351       } else {
01352         features[PTRAIN_NUM_SHORT+len] = 1.0;
01353       }
01354     } else if (permuter == DOC_DAWG_PERM) {
01355       features[PTRAIN_DOC_SHORT+len] = 1.0;
01356     } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
01357         permuter == COMPOUND_PERM) {
01358       features[PTRAIN_DICT_SHORT+len] = 1.0;
01359     } else if (permuter == FREQ_DAWG_PERM) {
01360       features[PTRAIN_FREQ_SHORT+len] = 1.0;
01361     }
01362   }
01363   // Record shape cost feature (normalized by path length).
01364   features[PTRAIN_SHAPE_COST_PER_CHAR] =
01365       vse.associate_stats.shape_cost / static_cast<float>(vse.length);
01366   // Record ngram cost. (normalized by the path length).
01367   features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
01368   if (vse.ngram_info != NULL) {
01369     features[PTRAIN_NGRAM_COST_PER_CHAR] =
01370         vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
01371   }
01372   // Record consistency-related features.
01373   // Disabled this feature for due to its poor performance.
01374   // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
01375   features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
01376   features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
01377   features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?
01378       vse.consistency_info.NumInconsistentChartype() : 0.0;
01379   features[PTRAIN_NUM_BAD_SPACING] =
01380       vse.consistency_info.NumInconsistentSpaces();
01381   // Disabled this feature for now due to its poor performance.
01382   // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
01383 
01384   // Classifier-related features.
01385   features[PTRAIN_RATING_PER_CHAR] =
01386       vse.ratings_sum / static_cast<float>(vse.outline_length);
01387 }
01388 
01389 WERD_CHOICE *LanguageModel::ConstructWord(
01390     ViterbiStateEntry *vse,
01391     WERD_RES *word_res,
01392     DANGERR *fixpt,
01393     BlamerBundle *blamer_bundle,
01394     bool *truth_path) {
01395   if (truth_path != NULL) {
01396     *truth_path =
01397         (blamer_bundle != NULL &&
01398          vse->length == blamer_bundle->correct_segmentation_length());
01399   }
01400   BLOB_CHOICE *curr_b = vse->curr_b;
01401   ViterbiStateEntry *curr_vse = vse;
01402 
01403   int i;
01404   bool compound = dict_->hyphenated();  // treat hyphenated words as compound
01405 
01406   // Re-compute the variance of the width-to-height ratios (since we now
01407   // can compute the mean over the whole word).
01408   float full_wh_ratio_mean = 0.0f;
01409   if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
01410     vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
01411     full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
01412                           static_cast<float>(vse->length));
01413     vse->associate_stats.full_wh_ratio_var = 0.0f;
01414   }
01415 
01416   // Construct a WERD_CHOICE by tracing parent pointers.
01417   WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
01418   word->set_length(vse->length);
01419   int total_blobs = 0;
01420   for (i = (vse->length-1); i >= 0; --i) {
01421     if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
01422         !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
01423         *truth_path = false;
01424     }
01425     // The number of blobs used for this choice is row - col + 1.
01426     int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
01427     total_blobs += num_blobs;
01428     word->set_blob_choice(i, num_blobs, curr_b);
01429     // Update the width-to-height ratio variance. Useful non-space delimited
01430     // languages to ensure that the blobs are of uniform width.
01431     // Skip leading and trailing punctuation when computing the variance.
01432     if ((full_wh_ratio_mean != 0.0f &&
01433          ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
01434           !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
01435       vse->associate_stats.full_wh_ratio_var +=
01436         pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
01437       if (language_model_debug_level > 2) {
01438         tprintf("full_wh_ratio_var += (%g-%g)^2\n",
01439                 full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
01440       }
01441     }
01442 
01443     // Mark the word as compound if compound permuter was set for any of
01444     // the unichars on the path (usually this will happen for unichars
01445     // that are compounding operators, like "-" and "/").
01446     if (!compound && curr_vse->dawg_info &&
01447         curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
01448 
01449     // Update curr_* pointers.
01450     curr_vse = curr_vse->parent_vse;
01451     if (curr_vse == NULL) break;
01452     curr_b = curr_vse->curr_b;
01453   }
01454   ASSERT_HOST(i == 0);  // check that we recorded all the unichar ids.
01455   ASSERT_HOST(total_blobs == word_res->ratings->dimension());
01456   // Re-adjust shape cost to include the updated width-to-height variance.
01457   if (full_wh_ratio_mean != 0.0f) {
01458     vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
01459   }
01460 
01461   word->set_rating(vse->ratings_sum);
01462   word->set_certainty(vse->min_certainty);
01463   word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
01464                       vse->consistency_info.BodyMaxXHeight());
01465   if (vse->dawg_info != NULL) {
01466     word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
01467   } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
01468     word->set_permuter(NGRAM_PERM);
01469   } else if (vse->top_choice_flags) {
01470     word->set_permuter(TOP_CHOICE_PERM);
01471   } else {
01472     word->set_permuter(NO_PERM);
01473   }
01474   word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
01475                                                             word_res->ratings));
01476   return word;
01477 }
01478 
01479 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines